Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	adds formatting fix
Browse files
This view is limited to 50 files because it contains too many changes.  
							See raw diff
- .gitignore +14 -11
 - FORMATTING_FIX_SUMMARY.md +15 -8
 - H100_LIGHTWEIGHT_GUIDE.md +276 -0
 - INTERACTIVE_PIPELINE_IMPROVEMENTS.md +330 -0
 - PIPELINE_SUMMARY.md +330 -0
 - README.md +1 -1
 - README_END_TO_END.md +304 -0
 - cloud_deployment.sh +0 -279
 - config/train_smollm3.py +4 -0
 - config/train_smollm3_h100_lightweight.py +112 -0
 - config/train_smollm3_openhermes_fr.py +4 -0
 - config/train_smollm3_openhermes_fr_a100_balanced.py +4 -0
 - config/train_smollm3_openhermes_fr_a100_large.py +4 -0
 - config/train_smollm3_openhermes_fr_a100_max_performance.py +4 -0
 - config/train_smollm3_openhermes_fr_a100_multiple_passes.py +4 -0
 - A100_LARGE_SCALE_GUIDE.md → docs/A100_LARGE_SCALE_GUIDE.md +0 -0
 - docs/APP_CONFIGURATION_GUIDE.md +234 -0
 - CLOUD_DEPLOYMENT_GUIDE.md → docs/CLOUD_DEPLOYMENT_GUIDE.md +0 -0
 - CLOUD_TRAINING_GUIDE.md → docs/CLOUD_TRAINING_GUIDE.md +0 -0
 - DEPLOYMENT_GUIDE.md → docs/DEPLOYMENT_GUIDE.md +0 -0
 - docs/ENVIRONMENT_VARIABLES.md +113 -0
 - docs/HF_DATASETS_GUIDE.md +269 -0
 - docs/HF_SPACES_GUIDE.md +163 -0
 - docs/MONITORING_IMPROVEMENTS_SUMMARY.md +191 -0
 - docs/MONITORING_INTEGRATION_GUIDE.md +245 -0
 - NO_THINK_TAG_GUIDE.md → docs/NO_THINK_TAG_GUIDE.md +0 -0
 - PUSH_GUIDE.md → docs/PUSH_GUIDE.md +0 -0
 - docs/PUSH_SCRIPT_GUIDE.md +267 -0
 - TRACKIO_INTEGRATION.md → docs/TRACKIO_INTEGRATION.md +0 -0
 - TRACKIO_INTEGRATION_VERIFICATION.md → docs/TRACKIO_INTEGRATION_VERIFICATION.md +0 -0
 - TRACKIO_INTERFACE_GUIDE.md → docs/TRACKIO_INTERFACE_GUIDE.md +0 -0
 - launch.sh +690 -0
 - requirements.txt → requirements/requirements.txt +0 -0
 - requirements_core.txt → requirements/requirements_core.txt +7 -1
 - requirements_minimal.txt → requirements/requirements_minimal.txt +0 -0
 - add_demo_data.py → scripts/dataset_tonic/add_demo_data.py +0 -0
 - scripts/dataset_tonic/setup_hf_dataset.py +275 -0
 - push_to_huggingface.py → scripts/model_tonic/push_to_huggingface.py +56 -13
 - scripts/trackio_tonic/configure_trackio.py +145 -0
 - deploy_trackio_space.py → scripts/trackio_tonic/deploy_trackio_space.py +1 -1
 - scripts/trackio_tonic/trackio_api_client.py +286 -0
 - run_a100_large_experiment.py → scripts/training/train.py +0 -0
 - setup_launch.py +283 -0
 - config.py → src/config.py +0 -0
 - data.py → src/data.py +0 -0
 - model.py → src/model.py +0 -0
 - monitoring.py → src/monitoring.py +157 -58
 - train.py → src/train.py +76 -0
 - trainer.py → src/trainer.py +0 -0
 - templates/datasets/readme.md +0 -0
 
    	
        .gitignore
    CHANGED
    
    | 
         @@ -1,3 +1,6 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 1 | 
         
             
            # Python
         
     | 
| 2 | 
         
             
            __pycache__/
         
     | 
| 3 | 
         
             
            *.py[cod]
         
     | 
| 
         @@ -59,17 +62,17 @@ Thumbs.db 
     | 
|
| 59 | 
         
             
            logs/
         
     | 
| 60 | 
         
             
            tensorboard_logs/
         
     | 
| 61 | 
         | 
| 62 | 
         
            -
            # Model outputs
         
     | 
| 63 | 
         
            -
            output/
         
     | 
| 64 | 
         
            -
            checkpoints/
         
     | 
| 65 | 
         
            -
            models/
         
     | 
| 66 | 
         
            -
            wandb/
         
     | 
| 67 | 
         | 
| 68 | 
         
             
            # Datasets
         
     | 
| 69 | 
         
            -
            data/
         
     | 
| 70 | 
         
            -
            datasets/
         
     | 
| 71 | 
         
            -
            my_dataset/
         
     | 
| 72 | 
         
            -
            test_dataset/
         
     | 
| 73 | 
         | 
| 74 | 
         
             
            # Temporary files
         
     | 
| 75 | 
         
             
            tmp/
         
     | 
| 
         @@ -86,9 +89,9 @@ accelerate_config.yaml 
     | 
|
| 86 | 
         | 
| 87 | 
         
             
            # Training outputs
         
     | 
| 88 | 
         
             
            runs/
         
     | 
| 89 | 
         
            -
             
     | 
| 90 | 
         
             
            !config/*.json
         
     | 
| 91 | 
         
            -
             
     | 
| 92 | 
         | 
| 93 | 
         
             
            # Evaluation results
         
     | 
| 94 | 
         
             
            eval_results/
         
     | 
| 
         | 
|
| 1 | 
         
            +
            .cursorrules/
         
     | 
| 2 | 
         
            +
            *.mdc
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
             
            # Python
         
     | 
| 5 | 
         
             
            __pycache__/
         
     | 
| 6 | 
         
             
            *.py[cod]
         
     | 
| 
         | 
|
| 62 | 
         
             
            logs/
         
     | 
| 63 | 
         
             
            tensorboard_logs/
         
     | 
| 64 | 
         | 
| 65 | 
         
            +
            # # Model outputs
         
     | 
| 66 | 
         
            +
            # output/
         
     | 
| 67 | 
         
            +
            # checkpoints/
         
     | 
| 68 | 
         
            +
            # models/
         
     | 
| 69 | 
         
            +
            # wandb/
         
     | 
| 70 | 
         | 
| 71 | 
         
             
            # Datasets
         
     | 
| 72 | 
         
            +
            # data/
         
     | 
| 73 | 
         
            +
            # datasets/
         
     | 
| 74 | 
         
            +
            # my_dataset/
         
     | 
| 75 | 
         
            +
            # test_dataset/
         
     | 
| 76 | 
         | 
| 77 | 
         
             
            # Temporary files
         
     | 
| 78 | 
         
             
            tmp/
         
     | 
| 
         | 
|
| 89 | 
         | 
| 90 | 
         
             
            # Training outputs
         
     | 
| 91 | 
         
             
            runs/
         
     | 
| 92 | 
         
            +
            #*.json
         
     | 
| 93 | 
         
             
            !config/*.json
         
     | 
| 94 | 
         
            +
            #!*.json.example
         
     | 
| 95 | 
         | 
| 96 | 
         
             
            # Evaluation results
         
     | 
| 97 | 
         
             
            eval_results/
         
     | 
    	
        FORMATTING_FIX_SUMMARY.md
    CHANGED
    
    | 
         @@ -19,10 +19,10 @@ I fixed the issue by standardizing all logging statements to use traditional str 
     | 
|
| 19 | 
         | 
| 20 | 
         
             
            ### Files Fixed
         
     | 
| 21 | 
         | 
| 22 | 
         
            -
            1. **`monitoring.py`** - Fixed all logging statements
         
     | 
| 23 | 
         
            -
            2. **`trainer.py`** - Fixed all logging statements  
         
     | 
| 24 | 
         
            -
            3. **`model.py`** - Fixed all logging statements
         
     | 
| 25 | 
         
            -
            4. **`data.py`** - Fixed all logging statements
         
     | 
| 26 | 
         | 
| 27 | 
         
             
            ### Changes Made
         
     | 
| 28 | 
         | 
| 
         @@ -52,6 +52,7 @@ This script tests: 
     | 
|
| 52 | 
         
             
            - ✅ Logging functionality
         
     | 
| 53 | 
         
             
            - ✅ Module imports
         
     | 
| 54 | 
         
             
            - ✅ Configuration loading
         
     | 
| 
         | 
|
| 55 | 
         
             
            - ✅ Error handling
         
     | 
| 56 | 
         | 
| 57 | 
         
             
            ## 🚀 Usage
         
     | 
| 
         @@ -68,25 +69,29 @@ python run_a100_large_experiment.py \ 
     | 
|
| 68 | 
         | 
| 69 | 
         
             
            ## 📋 Key Changes
         
     | 
| 70 | 
         | 
| 71 | 
         
            -
            ### 1. Monitoring Module (`monitoring.py`)
         
     | 
| 72 | 
         
             
            - Fixed all `logger.info()`, `logger.error()`, `logger.warning()` calls
         
     | 
| 73 | 
         
             
            - Replaced f-strings with `%` formatting
         
     | 
| 74 | 
         
             
            - Fixed string concatenation in file paths
         
     | 
| 
         | 
|
| 75 | 
         | 
| 76 | 
         
            -
            ### 2. Trainer Module (`trainer.py`)
         
     | 
| 77 | 
         
             
            - Fixed logging in `SmolLM3Trainer` class
         
     | 
| 78 | 
         
             
            - Fixed console output formatting
         
     | 
| 79 | 
         
             
            - Fixed error message formatting
         
     | 
| 
         | 
|
| 80 | 
         | 
| 81 | 
         
            -
            ### 3. Model Module (`model.py`)
         
     | 
| 82 | 
         
             
            - Fixed model loading logging
         
     | 
| 83 | 
         
             
            - Fixed configuration logging
         
     | 
| 84 | 
         
             
            - Fixed error reporting
         
     | 
| 
         | 
|
| 85 | 
         | 
| 86 | 
         
            -
            ### 4. Data Module (`data.py`)
         
     | 
| 87 | 
         
             
            - Fixed dataset loading logging
         
     | 
| 88 | 
         
             
            - Fixed processing progress logging
         
     | 
| 89 | 
         
             
            - Fixed error handling
         
     | 
| 
         | 
|
| 90 | 
         | 
| 91 | 
         
             
            ## 🔧 Technical Details
         
     | 
| 92 | 
         | 
| 
         @@ -119,6 +124,7 @@ To verify the fix works: 
     | 
|
| 119 | 
         
             
               - ✅ Logging tests
         
     | 
| 120 | 
         
             
               - ✅ Import tests  
         
     | 
| 121 | 
         
             
               - ✅ Configuration tests
         
     | 
| 
         | 
|
| 122 | 
         | 
| 123 | 
         
             
            3. **Run your training command**:
         
     | 
| 124 | 
         
             
               ```bash
         
     | 
| 
         @@ -131,6 +137,7 @@ To verify the fix works: 
     | 
|
| 131 | 
         
             
            - No changes to the training logic or configuration
         
     | 
| 132 | 
         
             
            - All error messages and logging remain informative
         
     | 
| 133 | 
         
             
            - The fix is backward compatible
         
     | 
| 
         | 
|
| 134 | 
         | 
| 135 | 
         
             
            ## 🚨 Prevention
         
     | 
| 136 | 
         | 
| 
         | 
|
| 19 | 
         | 
| 20 | 
         
             
            ### Files Fixed
         
     | 
| 21 | 
         | 
| 22 | 
         
            +
            1. **`src/monitoring.py`** - Fixed all logging statements
         
     | 
| 23 | 
         
            +
            2. **`src/trainer.py`** - Fixed all logging statements  
         
     | 
| 24 | 
         
            +
            3. **`src/model.py`** - Fixed all logging statements
         
     | 
| 25 | 
         
            +
            4. **`src/data.py`** - Fixed all logging statements
         
     | 
| 26 | 
         | 
| 27 | 
         
             
            ### Changes Made
         
     | 
| 28 | 
         | 
| 
         | 
|
| 52 | 
         
             
            - ✅ Logging functionality
         
     | 
| 53 | 
         
             
            - ✅ Module imports
         
     | 
| 54 | 
         
             
            - ✅ Configuration loading
         
     | 
| 55 | 
         
            +
            - ✅ Monitoring creation
         
     | 
| 56 | 
         
             
            - ✅ Error handling
         
     | 
| 57 | 
         | 
| 58 | 
         
             
            ## 🚀 Usage
         
     | 
| 
         | 
|
| 69 | 
         | 
| 70 | 
         
             
            ## 📋 Key Changes
         
     | 
| 71 | 
         | 
| 72 | 
         
            +
            ### 1. Monitoring Module (`src/monitoring.py`)
         
     | 
| 73 | 
         
             
            - Fixed all `logger.info()`, `logger.error()`, `logger.warning()` calls
         
     | 
| 74 | 
         
             
            - Replaced f-strings with `%` formatting
         
     | 
| 75 | 
         
             
            - Fixed string concatenation in file paths
         
     | 
| 76 | 
         
            +
            - Fixed HF Datasets integration logging
         
     | 
| 77 | 
         | 
| 78 | 
         
            +
            ### 2. Trainer Module (`src/trainer.py`)
         
     | 
| 79 | 
         
             
            - Fixed logging in `SmolLM3Trainer` class
         
     | 
| 80 | 
         
             
            - Fixed console output formatting
         
     | 
| 81 | 
         
             
            - Fixed error message formatting
         
     | 
| 82 | 
         
            +
            - Fixed callback logging
         
     | 
| 83 | 
         | 
| 84 | 
         
            +
            ### 3. Model Module (`src/model.py`)
         
     | 
| 85 | 
         
             
            - Fixed model loading logging
         
     | 
| 86 | 
         
             
            - Fixed configuration logging
         
     | 
| 87 | 
         
             
            - Fixed error reporting
         
     | 
| 88 | 
         
            +
            - Fixed parameter logging
         
     | 
| 89 | 
         | 
| 90 | 
         
            +
            ### 4. Data Module (`src/data.py`)
         
     | 
| 91 | 
         
             
            - Fixed dataset loading logging
         
     | 
| 92 | 
         
             
            - Fixed processing progress logging
         
     | 
| 93 | 
         
             
            - Fixed error handling
         
     | 
| 94 | 
         
            +
            - Fixed split processing logging
         
     | 
| 95 | 
         | 
| 96 | 
         
             
            ## 🔧 Technical Details
         
     | 
| 97 | 
         | 
| 
         | 
|
| 124 | 
         
             
               - ✅ Logging tests
         
     | 
| 125 | 
         
             
               - ✅ Import tests  
         
     | 
| 126 | 
         
             
               - ✅ Configuration tests
         
     | 
| 127 | 
         
            +
               - ✅ Monitoring creation tests
         
     | 
| 128 | 
         | 
| 129 | 
         
             
            3. **Run your training command**:
         
     | 
| 130 | 
         
             
               ```bash
         
     | 
| 
         | 
|
| 137 | 
         
             
            - No changes to the training logic or configuration
         
     | 
| 138 | 
         
             
            - All error messages and logging remain informative
         
     | 
| 139 | 
         
             
            - The fix is backward compatible
         
     | 
| 140 | 
         
            +
            - HF Datasets integration is preserved
         
     | 
| 141 | 
         | 
| 142 | 
         
             
            ## 🚨 Prevention
         
     | 
| 143 | 
         | 
    	
        H100_LIGHTWEIGHT_GUIDE.md
    ADDED
    
    | 
         @@ -0,0 +1,276 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # H100 Lightweight Training Configuration Guide
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            This guide explains the new **H100 Lightweight (Rapid)** training configuration, optimized for rapid fine-tuning on H100 GPUs with a small, carefully selected dataset.
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            ## 🎯 Overview
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            The H100 Lightweight configuration is designed for:
         
     | 
| 8 | 
         
            +
            - **Rapid experimentation** on H100 GPUs
         
     | 
| 9 | 
         
            +
            - **Efficient training** with 80K carefully selected samples
         
     | 
| 10 | 
         
            +
            - **Quick iteration** for research and development
         
     | 
| 11 | 
         
            +
            - **Cost-effective** training sessions
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
            ## 🚀 Key Features
         
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
            ### **Optimized for H100**
         
     | 
| 16 | 
         
            +
            - **Batch Size**: 16 (larger than A100 configs)
         
     | 
| 17 | 
         
            +
            - **Gradient Accumulation**: 4 (reduced for faster updates)
         
     | 
| 18 | 
         
            +
            - **Learning Rate**: 8e-6 (slightly higher for rapid convergence)
         
     | 
| 19 | 
         
            +
            - **Sequence Length**: 8192 (full context window)
         
     | 
| 20 | 
         
            +
             
     | 
| 21 | 
         
            +
            ### **Dataset Sampling**
         
     | 
| 22 | 
         
            +
            - **Source**: OpenHermes-FR dataset
         
     | 
| 23 | 
         
            +
            - **Sample Size**: 80,000 random samples
         
     | 
| 24 | 
         
            +
            - **Validation**: 1,000 samples (if available)
         
     | 
| 25 | 
         
            +
            - **Reproducibility**: Fixed random seed (42)
         
     | 
| 26 | 
         
            +
             
     | 
| 27 | 
         
            +
            ### **Training Optimizations**
         
     | 
| 28 | 
         
            +
            - **Warmup Steps**: 50 (reduced for rapid training)
         
     | 
| 29 | 
         
            +
            - **Evaluation**: Every 50 steps
         
     | 
| 30 | 
         
            +
            - **Logging**: Every 5 steps
         
     | 
| 31 | 
         
            +
            - **Saving**: Every 200 steps
         
     | 
| 32 | 
         
            +
            - **Checkpoints**: Keep only 2 (save storage)
         
     | 
| 33 | 
         
            +
             
     | 
| 34 | 
         
            +
            ## 📊 Configuration Details
         
     | 
| 35 | 
         
            +
             
     | 
| 36 | 
         
            +
            ### **Model Configuration**
         
     | 
| 37 | 
         
            +
            ```python
         
     | 
| 38 | 
         
            +
            model_name="HuggingFaceTB/SmolLM3-3B"
         
     | 
| 39 | 
         
            +
            max_seq_length=8192
         
     | 
| 40 | 
         
            +
            use_flash_attention=True
         
     | 
| 41 | 
         
            +
            use_gradient_checkpointing=True
         
     | 
| 42 | 
         
            +
            ```
         
     | 
| 43 | 
         
            +
             
     | 
| 44 | 
         
            +
            ### **Training Parameters**
         
     | 
| 45 | 
         
            +
            ```python
         
     | 
| 46 | 
         
            +
            batch_size=16
         
     | 
| 47 | 
         
            +
            gradient_accumulation_steps=4
         
     | 
| 48 | 
         
            +
            learning_rate=8e-6
         
     | 
| 49 | 
         
            +
            warmup_steps=50
         
     | 
| 50 | 
         
            +
            max_epochs=1
         
     | 
| 51 | 
         
            +
            ```
         
     | 
| 52 | 
         
            +
             
     | 
| 53 | 
         
            +
            ### **H100-Specific Optimizations**
         
     | 
| 54 | 
         
            +
            ```python
         
     | 
| 55 | 
         
            +
            dataloader_num_workers=4
         
     | 
| 56 | 
         
            +
            dataloader_pin_memory=True
         
     | 
| 57 | 
         
            +
            gradient_clipping=1.0
         
     | 
| 58 | 
         
            +
            group_by_length=True
         
     | 
| 59 | 
         
            +
            pad_to_multiple_of=8
         
     | 
| 60 | 
         
            +
            ```
         
     | 
| 61 | 
         
            +
             
     | 
| 62 | 
         
            +
            ### **Memory Optimizations**
         
     | 
| 63 | 
         
            +
            ```python
         
     | 
| 64 | 
         
            +
            save_total_limit=2
         
     | 
| 65 | 
         
            +
            early_stopping_patience=3
         
     | 
| 66 | 
         
            +
            max_grad_norm=1.0
         
     | 
| 67 | 
         
            +
            warmup_ratio=0.1
         
     | 
| 68 | 
         
            +
            ```
         
     | 
| 69 | 
         
            +
             
     | 
| 70 | 
         
            +
            ## 🔧 Usage
         
     | 
| 71 | 
         
            +
             
     | 
| 72 | 
         
            +
            ### **Interactive Selection**
         
     | 
| 73 | 
         
            +
            ```bash
         
     | 
| 74 | 
         
            +
            ./launch.sh
         
     | 
| 75 | 
         
            +
            # Select "H100 Lightweight (Rapid)" when prompted
         
     | 
| 76 | 
         
            +
            ```
         
     | 
| 77 | 
         
            +
             
     | 
| 78 | 
         
            +
            ### **Expected Training Time**
         
     | 
| 79 | 
         
            +
            - **H100**: ~2-4 hours (depending on hardware)
         
     | 
| 80 | 
         
            +
            - **A100**: ~4-6 hours
         
     | 
| 81 | 
         
            +
            - **V100**: ~6-8 hours
         
     | 
| 82 | 
         
            +
             
     | 
| 83 | 
         
            +
            ### **Memory Requirements**
         
     | 
| 84 | 
         
            +
            - **GPU Memory**: 40GB+ (H100 recommended)
         
     | 
| 85 | 
         
            +
            - **System RAM**: 32GB+
         
     | 
| 86 | 
         
            +
            - **Storage**: 50GB+ for dataset and checkpoints
         
     | 
| 87 | 
         
            +
             
     | 
| 88 | 
         
            +
            ## 📈 Performance Characteristics
         
     | 
| 89 | 
         
            +
             
     | 
| 90 | 
         
            +
            ### **Training Speed**
         
     | 
| 91 | 
         
            +
            - **Steps per Second**: ~2-3 (on H100)
         
     | 
| 92 | 
         
            +
            - **Samples per Second**: ~32-48
         
     | 
| 93 | 
         
            +
            - **Effective Batch Size**: 64 (16 × 4)
         
     | 
| 94 | 
         
            +
             
     | 
| 95 | 
         
            +
            ### **Convergence**
         
     | 
| 96 | 
         
            +
            - **Expected Loss**: 1.2-1.8 (after 1 epoch)
         
     | 
| 97 | 
         
            +
            - **Evaluation Frequency**: Every 50 steps
         
     | 
| 98 | 
         
            +
            - **Early Stopping**: After 3 evaluations without improvement
         
     | 
| 99 | 
         
            +
             
     | 
| 100 | 
         
            +
            ### **Dataset Efficiency**
         
     | 
| 101 | 
         
            +
            - **80K samples**: ~1.3% of full OpenHermes-FR
         
     | 
| 102 | 
         
            +
            - **Random sampling**: Ensures diversity
         
     | 
| 103 | 
         
            +
            - **Fixed seed**: Reproducible results
         
     | 
| 104 | 
         
            +
             
     | 
| 105 | 
         
            +
            ## 🎯 Use Cases
         
     | 
| 106 | 
         
            +
             
     | 
| 107 | 
         
            +
            ### **Perfect For**
         
     | 
| 108 | 
         
            +
            - **Rapid prototyping** of new ideas
         
     | 
| 109 | 
         
            +
            - **Hyperparameter tuning** experiments
         
     | 
| 110 | 
         
            +
            - **Model comparison** studies
         
     | 
| 111 | 
         
            +
            - **Research validation** before full training
         
     | 
| 112 | 
         
            +
            - **Educational purposes** and learning
         
     | 
| 113 | 
         
            +
             
     | 
| 114 | 
         
            +
            ### **Not Recommended For**
         
     | 
| 115 | 
         
            +
            - **Production models** (use Multiple Passes instead)
         
     | 
| 116 | 
         
            +
            - **Competition submissions** (use full dataset)
         
     | 
| 117 | 
         
            +
            - **Research papers** (use complete training)
         
     | 
| 118 | 
         
            +
             
     | 
| 119 | 
         
            +
            ## 🔄 Comparison with Other Configurations
         
     | 
| 120 | 
         
            +
             
     | 
| 121 | 
         
            +
            | Configuration | Dataset Size | Batch Size | Epochs | Training Time | Use Case |
         
     | 
| 122 | 
         
            +
            |---------------|--------------|------------|--------|---------------|----------|
         
     | 
| 123 | 
         
            +
            | **Basic Training** | Full SmolTalk | 2 | 3 | 6-8 hours | Learning |
         
     | 
| 124 | 
         
            +
            | **H100 Lightweight** | 80K Hermes-FR | 16 | 1 | 2-4 hours | Rapid experiments |
         
     | 
| 125 | 
         
            +
            | **A100 Large Scale** | Full Hermes-FR | 8 | 1.3 | 8-12 hours | Serious research |
         
     | 
| 126 | 
         
            +
            | **Multiple Passes** | Full Hermes-FR | 6 | 4 | 24-36 hours | Production |
         
     | 
| 127 | 
         
            +
             
     | 
| 128 | 
         
            +
            ## 🛠️ Customization
         
     | 
| 129 | 
         
            +
             
     | 
| 130 | 
         
            +
            ### **Modifying Sample Size**
         
     | 
| 131 | 
         
            +
            ```bash
         
     | 
| 132 | 
         
            +
            # In the launch script, set DATASET_SAMPLE_SIZE to the desired value, e.g. one of:
         
     | 
| 133 | 
         
            +
            DATASET_SAMPLE_SIZE=50000  # For 50K samples
         
     | 
| 134 | 
         
            +
            DATASET_SAMPLE_SIZE=100000 # For 100K samples
         
     | 
| 135 | 
         
            +
            ```
         
     | 
| 136 | 
         
            +
             
     | 
| 137 | 
         
            +
            ### **Adjusting Training Parameters**
         
     | 
| 138 | 
         
            +
            ```python
         
     | 
| 139 | 
         
            +
            # Modify in config/train_smollm3_h100_lightweight.py:
         
     | 
| 140 | 
         
            +
            batch_size=12              # Smaller batch size
         
     | 
| 141 | 
         
            +
            learning_rate=6e-6         # Lower learning rate
         
     | 
| 142 | 
         
            +
            warmup_steps=100          # More warmup steps
         
     | 
| 143 | 
         
            +
            ```
         
     | 
| 144 | 
         
            +
             
     | 
| 145 | 
         
            +
            ### **Changing Dataset**
         
     | 
| 146 | 
         
            +
            ```bash
         
     | 
| 147 | 
         
            +
            # Modify the dataset name in the configuration:
         
     | 
| 148 | 
         
            +
            dataset_name="your-custom-dataset"
         
     | 
| 149 | 
         
            +
            ```
         
     | 
| 150 | 
         
            +
             
     | 
| 151 | 
         
            +
            ## 📊 Monitoring and Results
         
     | 
| 152 | 
         
            +
             
     | 
| 153 | 
         
            +
            ### **Trackio Integration**
         
     | 
| 154 | 
         
            +
            - **Real-time metrics**: Loss, learning rate, gradient norm
         
     | 
| 155 | 
         
            +
            - **Training curves**: Visual progress tracking
         
     | 
| 156 | 
         
            +
            - **Resource usage**: GPU utilization, memory consumption
         
     | 
| 157 | 
         
            +
            - **Artifacts**: Model checkpoints, logs
         
     | 
| 158 | 
         
            +
             
     | 
| 159 | 
         
            +
            ### **Expected Metrics**
         
     | 
| 160 | 
         
            +
            - **Training Loss**: Starts ~3.0, ends ~1.5
         
     | 
| 161 | 
         
            +
            - **Validation Loss**: Should be close to training loss
         
     | 
| 162 | 
         
            +
            - **Learning Rate**: Cosine decay from 8e-6 to 2e-6
         
     | 
| 163 | 
         
            +
            - **Gradient Norm**: Should stay below 1.0
         
     | 
| 164 | 
         
            +
             
     | 
| 165 | 
         
            +
            ### **Success Indicators**
         
     | 
| 166 | 
         
            +
            - **Converging loss**: Steady decrease over time
         
     | 
| 167 | 
         
            +
            - **Stable gradients**: Consistent gradient norms
         
     | 
| 168 | 
         
            +
            - **Good validation**: Validation loss follows training loss
         
     | 
| 169 | 
         
            +
            - **No overfitting**: Validation loss doesn't increase
         
     | 
| 170 | 
         
            +
             
     | 
| 171 | 
         
            +
            ## 🚨 Troubleshooting
         
     | 
| 172 | 
         
            +
             
     | 
| 173 | 
         
            +
            ### **Common Issues**
         
     | 
| 174 | 
         
            +
             
     | 
| 175 | 
         
            +
            #### **Out of Memory (OOM)**
         
     | 
| 176 | 
         
            +
            ```bash
         
     | 
| 177 | 
         
            +
            # Reduce the batch size in the config and raise gradient accumulation so the effective batch size stays similar:
         
     | 
| 178 | 
         
            +
            batch_size=12  # Instead of 16
         
     | 
| 179 | 
         
            +
            gradient_accumulation_steps=6  # Instead of 4
         
     | 
| 180 | 
         
            +
            ```
         
     | 
| 181 | 
         
            +
             
     | 
| 182 | 
         
            +
            #### **Slow Training**
         
     | 
| 183 | 
         
            +
            ```bash
         
     | 
| 184 | 
         
            +
            # Check GPU utilization:
         
     | 
| 185 | 
         
            +
            nvidia-smi
         
     | 
| 186 | 
         
            +
            # Verify that PyTorch can see the GPU (prints True when CUDA is available)
         
     | 
| 187 | 
         
            +
            python -c "import torch; print(torch.cuda.is_available())"
         
     | 
| 188 | 
         
            +
            ```
         
     | 
| 189 | 
         
            +
             
     | 
| 190 | 
         
            +
            #### **Poor Convergence**
         
     | 
| 191 | 
         
            +
            ```bash
         
     | 
| 192 | 
         
            +
            # Try different learning rate:
         
     | 
| 193 | 
         
            +
            learning_rate=6e-6  # Instead of 8e-6
         
     | 
| 194 | 
         
            +
            # Or increase warmup:
         
     | 
| 195 | 
         
            +
            warmup_steps=100   # Instead of 50
         
     | 
| 196 | 
         
            +
            ```
         
     | 
| 197 | 
         
            +
             
     | 
| 198 | 
         
            +
            #### **Dataset Issues**
         
     | 
| 199 | 
         
            +
            ```bash
         
     | 
| 200 | 
         
            +
            # Check dataset loading:
         
     | 
| 201 | 
         
            +
            python -c "from datasets import load_dataset; print(len(load_dataset('legmlai/openhermes-fr')['train']))"
         
     | 
| 202 | 
         
            +
            ```
         
     | 
| 203 | 
         
            +
             
     | 
| 204 | 
         
            +
            ### **Performance Tips**
         
     | 
| 205 | 
         
            +
             
     | 
| 206 | 
         
            +
            1. **Use H100 if available**: Significantly faster than A100
         
     | 
| 207 | 
         
            +
            2. **Monitor GPU memory**: Keep utilization below 90%
         
     | 
| 208 | 
         
            +
            3. **Check logs regularly**: Look for convergence issues
         
     | 
| 209 | 
         
            +
            4. **Save checkpoints**: Don't lose progress
         
     | 
| 210 | 
         
            +
            5. **Use early stopping**: Prevent overfitting
         
     | 
| 211 | 
         
            +
             
     | 
| 212 | 
         
            +
            ## 📋 Example Workflow
         
     | 
| 213 | 
         
            +
             
     | 
| 214 | 
         
            +
            ### **Complete H100 Lightweight Training**
         
     | 
| 215 | 
         
            +
            ```bash
         
     | 
| 216 | 
         
            +
            # 1. Setup
         
     | 
| 217 | 
         
            +
            python setup_launch.py
         
     | 
| 218 | 
         
            +
             
     | 
| 219 | 
         
            +
            # 2. Check requirements
         
     | 
| 220 | 
         
            +
            python check_requirements.py
         
     | 
| 221 | 
         
            +
             
     | 
| 222 | 
         
            +
            # 3. Run interactive pipeline
         
     | 
| 223 | 
         
            +
            ./launch.sh
         
     | 
| 224 | 
         
            +
             
     | 
| 225 | 
         
            +
            # 4. Select configuration
         
     | 
| 226 | 
         
            +
            # Choose: "H100 Lightweight (Rapid)"
         
     | 
| 227 | 
         
            +
             
     | 
| 228 | 
         
            +
            # 5. Monitor training
         
     | 
| 229 | 
         
            +
            # Watch Trackio Space for real-time progress
         
     | 
| 230 | 
         
            +
             
     | 
| 231 | 
         
            +
            # 6. Check results
         
     | 
| 232 | 
         
            +
            # Model will be pushed to HF Hub
         
     | 
| 233 | 
         
            +
            # Summary in training_summary.md
         
     | 
| 234 | 
         
            +
            ```
         
     | 
| 235 | 
         
            +
             
     | 
| 236 | 
         
            +
            ### **Expected Output**
         
     | 
| 237 | 
         
            +
            ```
         
     | 
| 238 | 
         
            +
            ✅ Dataset prepared: 80000 train samples, 1000 validation samples
         
     | 
| 239 | 
         
            +
            📈 Training started with 5000 total steps
         
     | 
| 240 | 
         
            +
            ⏱️ Estimated time: 2-4 hours
         
     | 
| 241 | 
         
            +
            📊 Monitor progress at: https://huggingface.co/spaces/...
         
     | 
| 242 | 
         
            +
            ```
         
     | 
| 243 | 
         
            +
             
     | 
| 244 | 
         
            +
            ## 🎉 Benefits
         
     | 
| 245 | 
         
            +
             
     | 
| 246 | 
         
            +
            ### **Speed**
         
     | 
| 247 | 
         
            +
            - **3-4x faster** than full dataset training
         
     | 
| 248 | 
         
            +
            - **Rapid iteration** for research
         
     | 
| 249 | 
         
            +
            - **Quick validation** of ideas
         
     | 
| 250 | 
         
            +
             
     | 
| 251 | 
         
            +
            ### **Efficiency**
         
     | 
| 252 | 
         
            +
            - **Reduced costs** (less GPU time)
         
     | 
| 253 | 
         
            +
            - **Lower storage** requirements
         
     | 
| 254 | 
         
            +
            - **Faster experimentation** cycle
         
     | 
| 255 | 
         
            +
             
     | 
| 256 | 
         
            +
            ### **Quality**
         
     | 
| 257 | 
         
            +
            - **Still high quality** results
         
     | 
| 258 | 
         
            +
            - **Good for prototyping**
         
     | 
| 259 | 
         
            +
            - **Suitable for many use cases**
         
     | 
| 260 | 
         
            +
             
     | 
| 261 | 
         
            +
            ## 🔮 Future Enhancements
         
     | 
| 262 | 
         
            +
             
     | 
| 263 | 
         
            +
            ### **Planned Improvements**
         
     | 
| 264 | 
         
            +
            - **Adaptive sampling**: Smart dataset selection
         
     | 
| 265 | 
         
            +
            - **Multi-GPU support**: Distributed training
         
     | 
| 266 | 
         
            +
            - **Advanced monitoring**: More detailed metrics
         
     | 
| 267 | 
         
            +
            - **Auto-tuning**: Automatic hyperparameter optimization
         
     | 
| 268 | 
         
            +
             
     | 
| 269 | 
         
            +
            ### **Extensibility**
         
     | 
| 270 | 
         
            +
            - **Custom datasets**: Easy integration
         
     | 
| 271 | 
         
            +
            - **Different models**: Support for other architectures
         
     | 
| 272 | 
         
            +
            - **Advanced sampling**: Stratified, balanced sampling
         
     | 
| 273 | 
         
            +
             
     | 
| 274 | 
         
            +
            ---
         
     | 
| 275 | 
         
            +
             
     | 
| 276 | 
         
            +
            **Happy Rapid Training on H100! 🚀** 
         
     | 
    	
        INTERACTIVE_PIPELINE_IMPROVEMENTS.md
    ADDED
    
    | 
         @@ -0,0 +1,330 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # Interactive Pipeline Improvements
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            This document explains the improvements made to the `launch.sh` script to make it interactive and configurable for different training scenarios.
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            ## 🎯 Key Improvements
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ### 1. **Interactive User Interface**
         
     | 
| 8 | 
         
            +
            - **Colored Output**: Added color-coded status messages for better UX
         
     | 
| 9 | 
         
            +
            - **Input Validation**: Real-time validation of user inputs
         
     | 
| 10 | 
         
            +
            - **Default Values**: Smart defaults for common configurations
         
     | 
| 11 | 
         
            +
            - **Error Handling**: Graceful error handling with helpful messages
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
            ### 2. **Training Configuration Selection**
         
     | 
| 14 | 
         
            +
            The script now offers 4 predefined training configurations, plus a custom option:
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            #### **Basic Training (Default)**
         
     | 
| 17 | 
         
            +
            ```bash
         
     | 
| 18 | 
         
            +
            Model: SmolLM3-3B
         
     | 
| 19 | 
         
            +
            Dataset: SmolTalk
         
     | 
| 20 | 
         
            +
            Epochs: 3
         
     | 
| 21 | 
         
            +
            Batch Size: 2
         
     | 
| 22 | 
         
            +
            Learning Rate: 5e-6
         
     | 
| 23 | 
         
            +
            Sequence Length: 4096
         
     | 
| 24 | 
         
            +
            Best for: Quick experiments, learning
         
     | 
| 25 | 
         
            +
            ```
         
     | 
| 26 | 
         
            +
             
     | 
| 27 | 
         
            +
            #### **H100 Lightweight (Rapid)**
         
     | 
| 28 | 
         
            +
            ```bash
         
     | 
| 29 | 
         
            +
            Model: SmolLM3-3B
         
     | 
| 30 | 
         
            +
            Dataset: OpenHermes-FR (80K samples)
         
     | 
| 31 | 
         
            +
            Epochs: 1
         
     | 
| 32 | 
         
            +
            Batch Size: 16
         
     | 
| 33 | 
         
            +
            Learning Rate: 8e-6
         
     | 
| 34 | 
         
            +
            Sequence Length: 8192
         
     | 
| 35 | 
         
            +
            Best for: Rapid training on H100
         
     | 
| 36 | 
         
            +
            ```
         
     | 
| 37 | 
         
            +
             
     | 
| 38 | 
         
            +
            #### **A100 Large Scale**
         
     | 
| 39 | 
         
            +
            ```bash
         
     | 
| 40 | 
         
            +
            Model: SmolLM3-3B
         
     | 
| 41 | 
         
            +
            Dataset: OpenHermes-FR
         
     | 
| 42 | 
         
            +
            Epochs: 1.3 passes
         
     | 
| 43 | 
         
            +
            Batch Size: 8
         
     | 
| 44 | 
         
            +
            Learning Rate: 5e-6
         
     | 
| 45 | 
         
            +
            Sequence Length: 8192
         
     | 
| 46 | 
         
            +
            Best for: High-performance training
         
     | 
| 47 | 
         
            +
            ```
         
     | 
| 48 | 
         
            +
             
     | 
| 49 | 
         
            +
            #### **Multiple Passes**
         
     | 
| 50 | 
         
            +
            ```bash
         
     | 
| 51 | 
         
            +
            Model: SmolLM3-3B
         
     | 
| 52 | 
         
            +
            Dataset: OpenHermes-FR
         
     | 
| 53 | 
         
            +
            Epochs: 4 passes
         
     | 
| 54 | 
         
            +
            Batch Size: 6
         
     | 
| 55 | 
         
            +
            Learning Rate: 3e-6
         
     | 
| 56 | 
         
            +
            Sequence Length: 8192
         
     | 
| 57 | 
         
            +
            Best for: Thorough training
         
     | 
| 58 | 
         
            +
            ```
         
     | 
| 59 | 
         
            +
             
     | 
| 60 | 
         
            +
            #### **Custom Configuration**
         
     | 
| 61 | 
         
            +
            - User-defined parameters
         
     | 
| 62 | 
         
            +
            - Flexible model and dataset selection
         
     | 
| 63 | 
         
            +
            - Custom training parameters
         
     | 
| 64 | 
         
            +
             
     | 
| 65 | 
         
            +
            ### 3. **Enhanced User Experience**
         
     | 
| 66 | 
         
            +
             
     | 
| 67 | 
         
            +
            #### **Step-by-Step Guidance**
         
     | 
| 68 | 
         
            +
            1. **Authentication** - HF username and token validation
         
     | 
| 69 | 
         
            +
            2. **Configuration Selection** - Choose from predefined configs
         
     | 
| 70 | 
         
            +
            3. **Experiment Setup** - Configure experiment details
         
     | 
| 71 | 
         
            +
            4. **Training Parameters** - Adjust hyperparameters
         
     | 
| 72 | 
         
            +
            5. **Deployment Setup** - Trackio Space configuration
         
     | 
| 73 | 
         
            +
            6. **Confirmation** - Review and confirm settings
         
     | 
| 74 | 
         
            +
             
     | 
| 75 | 
         
            +
            #### **Input Functions**
         
     | 
| 76 | 
         
            +
            ```bash
         
     | 
| 77 | 
         
            +
            # Get input with default value
         
     | 
| 78 | 
         
            +
            get_input "Prompt" "default_value" VARIABLE_NAME
         
     | 
| 79 | 
         
            +
             
     | 
| 80 | 
         
            +
            # Select from options
         
     | 
| 81 | 
         
            +
            select_option "Choose option:" "Option 1" "Option 2" "Option 3" VARIABLE_NAME
         
     | 
| 82 | 
         
            +
             
     | 
| 83 | 
         
            +
            # Validate HF token
         
     | 
| 84 | 
         
            +
            validate_hf_token "$HF_TOKEN"
         
     | 
| 85 | 
         
            +
            ```
         
     | 
| 86 | 
         
            +
             
     | 
| 87 | 
         
            +
            #### **Colored Output Functions**
         
     | 
| 88 | 
         
            +
            ```bash
         
     | 
| 89 | 
         
            +
            print_status "Success message"    # Green ✅
         
     | 
| 90 | 
         
            +
            print_warning "Warning message"   # Yellow ⚠️
         
     | 
| 91 | 
         
            +
            print_error "Error message"       # Red ❌
         
     | 
| 92 | 
         
            +
            print_info "Info message"         # Blue ℹ️
         
     | 
| 93 | 
         
            +
            print_header "Header message"     # Purple 🚀
         
     | 
| 94 | 
         
            +
            print_step "Step message"         # Cyan 📋
         
     | 
| 95 | 
         
            +
            ```
         
     | 
| 96 | 
         
            +
             
     | 
| 97 | 
         
            +
            ### 4. **Dynamic Configuration Generation**
         
     | 
| 98 | 
         
            +
             
     | 
| 99 | 
         
            +
            The script now generates training configurations based on user selection:
         
     | 
| 100 | 
         
            +
             
     | 
| 101 | 
         
            +
            ```python
         
     | 
| 102 | 
         
            +
            # Generated config file
         
     | 
| 103 | 
         
            +
            config = SmolLM3Config(
         
     | 
| 104 | 
         
            +
                model_name="$MODEL_NAME",
         
     | 
| 105 | 
         
            +
                max_seq_length=$MAX_SEQ_LENGTH,
         
     | 
| 106 | 
         
            +
                batch_size=$BATCH_SIZE,
         
     | 
| 107 | 
         
            +
                learning_rate=$LEARNING_RATE,
         
     | 
| 108 | 
         
            +
                # ... other parameters
         
     | 
| 109 | 
         
            +
            )
         
     | 
| 110 | 
         
            +
            ```
         
     | 
| 111 | 
         
            +
             
     | 
| 112 | 
         
            +
            ### 5. **Improved Error Handling**
         
     | 
| 113 | 
         
            +
             
     | 
| 114 | 
         
            +
            #### **Input Validation**
         
     | 
| 115 | 
         
            +
            - Required field validation
         
     | 
| 116 | 
         
            +
            - HF token validation
         
     | 
| 117 | 
         
            +
            - Numeric input validation
         
     | 
| 118 | 
         
            +
            - Choice validation
         
     | 
| 119 | 
         
            +
             
     | 
| 120 | 
         
            +
            #### **Graceful Degradation**
         
     | 
| 121 | 
         
            +
            - Clear error messages
         
     | 
| 122 | 
         
            +
            - Recovery suggestions
         
     | 
| 123 | 
         
            +
            - Exit on critical errors
         
     | 
| 124 | 
         
            +
             
     | 
| 125 | 
         
            +
            ### 6. **Configuration Management**
         
     | 
| 126 | 
         
            +
             
     | 
| 127 | 
         
            +
            #### **User Credentials**
         
     | 
| 128 | 
         
            +
            - Interactive username input
         
     | 
| 129 | 
         
            +
            - Secure token input
         
     | 
| 130 | 
         
            +
            - Real-time token validation
         
     | 
| 131 | 
         
            +
             
     | 
| 132 | 
         
            +
            #### **Experiment Details**
         
     | 
| 133 | 
         
            +
            - Dynamic experiment naming
         
     | 
| 134 | 
         
            +
            - Repository name generation
         
     | 
| 135 | 
         
            +
            - Dataset repository configuration
         
     | 
| 136 | 
         
            +
             
     | 
| 137 | 
         
            +
            #### **Training Parameters**
         
     | 
| 138 | 
         
            +
            - Batch size selection
         
     | 
| 139 | 
         
            +
            - Learning rate adjustment
         
     | 
| 140 | 
         
            +
            - Sequence length configuration
         
     | 
| 141 | 
         
            +
            - Save/eval/logging steps
         
     | 
| 142 | 
         
            +
             
     | 
| 143 | 
         
            +
            ### 7. **Enhanced Monitoring Integration**
         
     | 
| 144 | 
         
            +
             
     | 
| 145 | 
         
            +
            #### **Trackio Space**
         
     | 
| 146 | 
         
            +
            - Dynamic space naming
         
     | 
| 147 | 
         
            +
            - Automatic deployment
         
     | 
| 148 | 
         
            +
            - URL generation
         
     | 
| 149 | 
         
            +
             
     | 
| 150 | 
         
            +
            #### **HF Datasets**
         
     | 
| 151 | 
         
            +
            - Dataset repository setup
         
     | 
| 152 | 
         
            +
            - Experiment data storage
         
     | 
| 153 | 
         
            +
            - Access configuration
         
     | 
| 154 | 
         
            +
             
     | 
| 155 | 
         
            +
            ## 🔧 Technical Improvements
         
     | 
| 156 | 
         
            +
             
     | 
| 157 | 
         
            +
            ### 1. **Modular Functions**
         
     | 
| 158 | 
         
            +
            ```bash
         
     | 
| 159 | 
         
            +
            # Input handling
         
     | 
| 160 | 
         
            +
            get_input()          # Get user input with defaults
         
     | 
| 161 | 
         
            +
            select_option()      # Select from options
         
     | 
| 162 | 
         
            +
            validate_hf_token()  # Validate HF token
         
     | 
| 163 | 
         
            +
             
     | 
| 164 | 
         
            +
            # Configuration
         
     | 
| 165 | 
         
            +
            show_training_configs()    # Display available configs
         
     | 
| 166 | 
         
            +
            get_training_config()      # Get config based on selection
         
     | 
| 167 | 
         
            +
            create_training_config()   # Generate config file
         
     | 
| 168 | 
         
            +
             
     | 
| 169 | 
         
            +
            # Output formatting
         
     | 
| 170 | 
         
            +
            print_status()       # Success messages
         
     | 
| 171 | 
         
            +
            print_warning()      # Warning messages
         
     | 
| 172 | 
         
            +
            print_error()        # Error messages
         
     | 
| 173 | 
         
            +
            print_info()         # Info messages
         
     | 
| 174 | 
         
            +
            print_header()       # Header messages
         
     | 
| 175 | 
         
            +
            print_step()         # Step messages
         
     | 
| 176 | 
         
            +
            ```
         
     | 
| 177 | 
         
            +
             
     | 
| 178 | 
         
            +
            ### 2. **Configuration Selection Logic**
         
     | 
| 179 | 
         
            +
            ```bash
         
     | 
| 180 | 
         
            +
            case "$config_type" in
         
     | 
| 181 | 
         
            +
                "Basic Training")
         
     | 
| 182 | 
         
            +
                    MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
         
     | 
| 183 | 
         
            +
                    DATASET_NAME="HuggingFaceTB/smoltalk"
         
     | 
| 184 | 
         
            +
                    # ... other parameters
         
     | 
| 185 | 
         
            +
                    ;;
         
     | 
| 186 | 
         
            +
                "A100 Large Scale")
         
     | 
| 187 | 
         
            +
                    MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
         
     | 
| 188 | 
         
            +
                    DATASET_NAME="legmlai/openhermes-fr"
         
     | 
| 189 | 
         
            +
                    # ... other parameters
         
     | 
| 190 | 
         
            +
                    ;;
         
     | 
| 191 | 
         
            +
                # ... other configurations
         
     | 
| 192 | 
         
            +
            esac
         
     | 
| 193 | 
         
            +
            ```
         
     | 
| 194 | 
         
            +
             
     | 
| 195 | 
         
            +
            ### 3. **Dynamic File Generation**
         
     | 
| 196 | 
         
            +
            ```bash
         
     | 
| 197 | 
         
            +
            # Generate training config
         
     | 
| 198 | 
         
            +
            create_training_config "$CONFIG_FILE"
         
     | 
| 199 | 
         
            +
             
     | 
| 200 | 
         
            +
            # Generate deployment input
         
     | 
| 201 | 
         
            +
            cat > deploy_input.txt << EOF
         
     | 
| 202 | 
         
            +
            $HF_USERNAME
         
     | 
| 203 | 
         
            +
            $TRACKIO_SPACE_NAME
         
     | 
| 204 | 
         
            +
            $HF_TOKEN
         
     | 
| 205 | 
         
            +
            EOF
         
     | 
| 206 | 
         
            +
            ```
         
     | 
| 207 | 
         
            +
             
     | 
| 208 | 
         
            +
            ## 📊 User Workflow
         
     | 
| 209 | 
         
            +
             
     | 
| 210 | 
         
            +
            ### **Before (Static)**
         
     | 
| 211 | 
         
            +
            1. Edit `launch.sh` manually
         
     | 
| 212 | 
         
            +
            2. Update hardcoded variables
         
     | 
| 213 | 
         
            +
            3. Run script
         
     | 
| 214 | 
         
            +
            4. Hope the configuration is correct
         
     | 
| 215 | 
         
            +
             
     | 
| 216 | 
         
            +
            ### **After (Interactive)**
         
     | 
| 217 | 
         
            +
            1. Run `./launch.sh`
         
     | 
| 218 | 
         
            +
            2. Follow interactive prompts
         
     | 
| 219 | 
         
            +
            3. Select training configuration
         
     | 
| 220 | 
         
            +
            4. Confirm settings
         
     | 
| 221 | 
         
            +
            5. Watch automated pipeline
         
     | 
| 222 | 
         
            +
             
     | 
| 223 | 
         
            +
            ## 🎯 Benefits
         
     | 
| 224 | 
         
            +
             
     | 
| 225 | 
         
            +
            ### **For Users**
         
     | 
| 226 | 
         
            +
            - **No Manual Editing**: No need to edit script files
         
     | 
| 227 | 
         
            +
            - **Guided Experience**: Step-by-step prompts
         
     | 
| 228 | 
         
            +
            - **Validation**: Real-time input validation
         
     | 
| 229 | 
         
            +
            - **Flexibility**: Multiple configuration options
         
     | 
| 230 | 
         
            +
            - **Safety**: Confirmation before execution
         
     | 
| 231 | 
         
            +
             
     | 
| 232 | 
         
            +
            ### **For Developers**
         
     | 
| 233 | 
         
            +
            - **Maintainable**: Modular function structure
         
     | 
| 234 | 
         
            +
            - **Extensible**: Easy to add new configurations
         
     | 
| 235 | 
         
            +
            - **Robust**: Comprehensive error handling
         
     | 
| 236 | 
         
            +
            - **User-Friendly**: Clear feedback and guidance
         
     | 
| 237 | 
         
            +
             
     | 
| 238 | 
         
            +
            ### **For Different Use Cases**
         
     | 
| 239 | 
         
            +
            - **Beginners**: Basic Training configuration
         
     | 
| 240 | 
         
            +
            - **H100 Users**: H100 Lightweight for rapid experiments
         
     | 
| 241 | 
         
            +
            - **Researchers**: A100 Large Scale for serious experiments
         
     | 
| 242 | 
         
            +
            - **Production**: Multiple Passes for thorough training
         
     | 
| 243 | 
         
            +
            - **Custom**: User-defined parameters for specific needs
         
     | 
| 244 | 
         
            +
             
     | 
| 245 | 
         
            +
            ## 🔄 Configuration Examples
         
     | 
| 246 | 
         
            +
             
     | 
| 247 | 
         
            +
            ### **Quick Start (Basic Training)**
         
     | 
| 248 | 
         
            +
            ```bash
         
     | 
| 249 | 
         
            +
            ./launch.sh
         
     | 
| 250 | 
         
            +
            # Follow prompts:
         
     | 
| 251 | 
         
            +
            # 1. Enter HF username and token
         
     | 
| 252 | 
         
            +
            # 2. Select "Basic Training"
         
     | 
| 253 | 
         
            +
            # 3. Confirm settings
         
     | 
| 254 | 
         
            +
            # 4. Watch automated pipeline
         
     | 
| 255 | 
         
            +
            ```
         
     | 
| 256 | 
         
            +
             
     | 
| 257 | 
         
            +
            ### **High-Performance Training (A100)**
         
     | 
| 258 | 
         
            +
            ```bash
         
     | 
| 259 | 
         
            +
            ./launch.sh
         
     | 
| 260 | 
         
            +
            # Follow prompts:
         
     | 
| 261 | 
         
            +
            # 1. Enter HF username and token
         
     | 
| 262 | 
         
            +
            # 2. Select "A100 Large Scale"
         
     | 
| 263 | 
         
            +
            # 3. Adjust parameters if needed
         
     | 
| 264 | 
         
            +
            # 4. Confirm and run
         
     | 
| 265 | 
         
            +
            ```
         
     | 
| 266 | 
         
            +
             
     | 
| 267 | 
         
            +
            ### **Rapid Training (H100)**
         
     | 
| 268 | 
         
            +
            ```bash
         
     | 
| 269 | 
         
            +
            ./launch.sh
         
     | 
| 270 | 
         
            +
            # Follow prompts:
         
     | 
| 271 | 
         
            +
            # 1. Enter HF username and token
         
     | 
| 272 | 
         
            +
            # 2. Select "H100 Lightweight (Rapid)"
         
     | 
| 273 | 
         
            +
            # 3. Confirm settings
         
     | 
| 274 | 
         
            +
            # 4. Watch rapid training on H100
         
     | 
| 275 | 
         
            +
            ```
         
     | 
| 276 | 
         
            +
             
     | 
| 277 | 
         
            +
            ### **Custom Training**
         
     | 
| 278 | 
         
            +
            ```bash
         
     | 
| 279 | 
         
            +
            ./launch.sh
         
     | 
| 280 | 
         
            +
            # Follow prompts:
         
     | 
| 281 | 
         
            +
            # 1. Enter HF username and token
         
     | 
| 282 | 
         
            +
            # 2. Select "Custom Configuration"
         
     | 
| 283 | 
         
            +
            # 3. Enter custom parameters:
         
     | 
| 284 | 
         
            +
            #    - Model: microsoft/DialoGPT-medium
         
     | 
| 285 | 
         
            +
            #    - Dataset: your-custom-dataset
         
     | 
| 286 | 
         
            +
            #    - Epochs: 5
         
     | 
| 287 | 
         
            +
            #    - Batch Size: 4
         
     | 
| 288 | 
         
            +
            #    - Learning Rate: 1e-5
         
     | 
| 289 | 
         
            +
            # 4. Confirm and run
         
     | 
| 290 | 
         
            +
            ```
         
     | 
| 291 | 
         
            +
             
     | 
| 292 | 
         
            +
            ## 🚀 Future Enhancements
         
     | 
| 293 | 
         
            +
             
     | 
| 294 | 
         
            +
            ### **Planned Improvements**
         
     | 
| 295 | 
         
            +
            - **Web GUI**: Browser-based configuration interface
         
     | 
| 296 | 
         
            +
            - **Configuration Templates**: Save/load custom configurations
         
     | 
| 297 | 
         
            +
            - **Advanced Validation**: More sophisticated input validation
         
     | 
| 298 | 
         
            +
            - **Progress Tracking**: Real-time progress indicators
         
     | 
| 299 | 
         
            +
            - **Rollback Capability**: Undo changes if needed
         
     | 
| 300 | 
         
            +
             
     | 
| 301 | 
         
            +
            ### **Extensibility**
         
     | 
| 302 | 
         
            +
            - **Plugin System**: Add custom training configurations
         
     | 
| 303 | 
         
            +
            - **API Integration**: Connect to external services
         
     | 
| 304 | 
         
            +
            - **Multi-GPU Support**: Distributed training options
         
     | 
| 305 | 
         
            +
            - **Advanced Monitoring**: Enhanced tracking capabilities
         
     | 
| 306 | 
         
            +
             
     | 
| 307 | 
         
            +
            ## 📋 Migration Guide
         
     | 
| 308 | 
         
            +
             
     | 
| 309 | 
         
            +
            ### **For Existing Users**
         
     | 
| 310 | 
         
            +
            1. **Backup**: Save your current `launch.sh`
         
     | 
| 311 | 
         
            +
            2. **Update**: Replace with new interactive version
         
     | 
| 312 | 
         
            +
            3. **Test**: Run with basic configuration first
         
     | 
| 313 | 
         
            +
            4. **Migrate**: Use interactive prompts instead of manual editing
         
     | 
| 314 | 
         
            +
             
     | 
| 315 | 
         
            +
            ### **For New Users**
         
     | 
| 316 | 
         
            +
            1. **Setup**: Run `python setup_launch.py`
         
     | 
| 317 | 
         
            +
            2. **Check**: Run `python check_requirements.py`
         
     | 
| 318 | 
         
            +
            3. **Launch**: Run `./launch.sh`
         
     | 
| 319 | 
         
            +
            4. **Follow**: Use interactive prompts
         
     | 
| 320 | 
         
            +
             
     | 
| 321 | 
         
            +
            ## 🎉 Conclusion
         
     | 
| 322 | 
         
            +
             
     | 
| 323 | 
         
            +
            Compared with the previous static script, the interactive pipeline provides a much better user experience with:
         
     | 
| 324 | 
         
            +
            - **Guided Configuration**: No manual editing required
         
     | 
| 325 | 
         
            +
            - **Multiple Options**: Predefined configurations for different use cases
         
     | 
| 326 | 
         
            +
            - **Validation**: Real-time input validation and error handling
         
     | 
| 327 | 
         
            +
            - **Flexibility**: Custom configuration support
         
     | 
| 328 | 
         
            +
            - **Safety**: Confirmation steps and error recovery
         
     | 
| 329 | 
         
            +
             
     | 
| 330 | 
         
            +
            The script is now production-ready for users of all skill levels, from beginners to advanced researchers. 
         
     | 
    	
        PIPELINE_SUMMARY.md
    ADDED
    
    | 
         @@ -0,0 +1,330 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # SmolLM3 End-to-End Pipeline - Implementation Summary
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            This document summarizes the comprehensive refactoring and enhancement of the SmolLM3 fine-tuning codebase to create a complete end-to-end pipeline.
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            ## 🎯 Overview
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            The pipeline now provides a complete solution from Trackio Space deployment to model push, with integrated monitoring, dataset management, and automated deployment.
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            ## 📁 Files Created/Modified
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            ### **Core Pipeline Files**
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
            1. **`launch.sh`** - Complete end-to-end pipeline script
         
     | 
| 14 | 
         
            +
               - 16-step comprehensive pipeline
         
     | 
| 15 | 
         
            +
               - Automated environment setup
         
     | 
| 16 | 
         
            +
               - Integrated monitoring and deployment
         
     | 
| 17 | 
         
            +
               - Dynamic configuration generation
         
     | 
| 18 | 
         
            +
             
     | 
| 19 | 
         
            +
            2. **`setup_launch.py`** - User configuration helper
         
     | 
| 20 | 
         
            +
               - Interactive setup for user credentials
         
     | 
| 21 | 
         
            +
               - Automatic script configuration
         
     | 
| 22 | 
         
            +
               - Requirements checker generation
         
     | 
| 23 | 
         
            +
             
     | 
| 24 | 
         
            +
            3. **`test_pipeline.py`** - Comprehensive testing suite
         
     | 
| 25 | 
         
            +
               - Import testing
         
     | 
| 26 | 
         
            +
               - Component verification
         
     | 
| 27 | 
         
            +
               - CUDA and HF token validation
         
     | 
| 28 | 
         
            +
             
     | 
| 29 | 
         
            +
            4. **`README_END_TO_END.md`** - Complete documentation
         
     | 
| 30 | 
         
            +
               - Step-by-step usage guide
         
     | 
| 31 | 
         
            +
               - Troubleshooting section
         
     | 
| 32 | 
         
            +
               - Advanced configuration options
         
     | 
| 33 | 
         
            +
             
     | 
| 34 | 
         
            +
            ### **Scripts and Utilities**
         
     | 
| 35 | 
         
            +
             
     | 
| 36 | 
         
            +
            5. **`scripts/trackio_tonic/trackio_api_client.py`** - API client for Trackio
         
     | 
| 37 | 
         
            +
               - Complete API client implementation
         
     | 
| 38 | 
         
            +
               - Error handling and retry logic
         
     | 
| 39 | 
         
            +
               - Support for both JSON and SSE responses
         
     | 
| 40 | 
         
            +
             
     | 
| 41 | 
         
            +
            6. **`scripts/trackio_tonic/deploy_trackio_space.py`** - Space deployment
         
     | 
| 42 | 
         
            +
               - Automated HF Space creation
         
     | 
| 43 | 
         
            +
               - File upload and configuration
         
     | 
| 44 | 
         
            +
               - Space testing and validation
         
     | 
| 45 | 
         
            +
             
     | 
| 46 | 
         
            +
            7. **`scripts/trackio_tonic/configure_trackio.py`** - Configuration helper
         
     | 
| 47 | 
         
            +
               - Environment variable setup
         
     | 
| 48 | 
         
            +
               - Dataset repository configuration
         
     | 
| 49 | 
         
            +
               - Usage examples and validation
         
     | 
| 50 | 
         
            +
             
     | 
| 51 | 
         
            +
            8. **`scripts/model_tonic/push_to_huggingface.py`** - Model deployment
         
     | 
| 52 | 
         
            +
               - Complete model upload pipeline
         
     | 
| 53 | 
         
            +
               - Model card generation
         
     | 
| 54 | 
         
            +
               - Training results documentation
         
     | 
| 55 | 
         
            +
             
     | 
| 56 | 
         
            +
            9. **`scripts/dataset_tonic/setup_hf_dataset.py`** - Dataset setup
         
     | 
| 57 | 
         
            +
               - HF Dataset repository creation
         
     | 
| 58 | 
         
            +
               - Initial experiment data structure
         
     | 
| 59 | 
         
            +
               - Dataset access configuration
         
     | 
| 60 | 
         
            +
             
     | 
| 61 | 
         
            +
            ### **Source Code Updates**
         
     | 
| 62 | 
         
            +
             
     | 
| 63 | 
         
            +
            10. **`src/monitoring.py`** - Enhanced monitoring
         
     | 
| 64 | 
         
            +
                - HF Datasets integration
         
     | 
| 65 | 
         
            +
                - Trackio API client integration
         
     | 
| 66 | 
         
            +
                - Comprehensive metrics logging
         
     | 
| 67 | 
         
            +
             
     | 
| 68 | 
         
            +
            11. **`src/train.py`** - Updated training script
         
     | 
| 69 | 
         
            +
                - Monitoring integration
         
     | 
| 70 | 
         
            +
                - HF Datasets support
         
     | 
| 71 | 
         
            +
                - Enhanced error handling
         
     | 
| 72 | 
         
            +
             
     | 
| 73 | 
         
            +
            12. **`src/config.py`** - Configuration management
         
     | 
| 74 | 
         
            +
                - Dynamic config loading
         
     | 
| 75 | 
         
            +
                - Multiple config type support
         
     | 
| 76 | 
         
            +
                - Fallback mechanisms
         
     | 
| 77 | 
         
            +
             
     | 
| 78 | 
         
            +
            13. **`src/data.py`** - Enhanced dataset handling
         
     | 
| 79 | 
         
            +
                - Multiple format support
         
     | 
| 80 | 
         
            +
                - Automatic conversion
         
     | 
| 81 | 
         
            +
                - Bad entry filtering
         
     | 
| 82 | 
         
            +
             
     | 
| 83 | 
         
            +
            14. **`src/model.py`** - Model wrapper
         
     | 
| 84 | 
         
            +
                - SmolLM3-specific optimizations
         
     | 
| 85 | 
         
            +
                - Flash attention support
         
     | 
| 86 | 
         
            +
                - Long context handling
         
     | 
| 87 | 
         
            +
             
     | 
| 88 | 
         
            +
            15. **`src/trainer.py`** - Training orchestration
         
     | 
| 89 | 
         
            +
                - Monitoring callback integration
         
     | 
| 90 | 
         
            +
                - Enhanced logging
         
     | 
| 91 | 
         
            +
                - Checkpoint management
         
     | 
| 92 | 
         
            +
             
     | 
| 93 | 
         
            +
            ## 🔧 Key Improvements
         
     | 
| 94 | 
         
            +
             
     | 
| 95 | 
         
            +
            ### **1. Import Path Fixes**
         
     | 
| 96 | 
         
            +
            - Fixed all import paths to work with the refactored structure
         
     | 
| 97 | 
         
            +
            - Added proper `sys.path` handling for cross-module imports
         
     | 
| 98 | 
         
            +
            - Ensured compatibility between different script locations
         
     | 
| 99 | 
         
            +
             
     | 
| 100 | 
         
            +
            ### **2. Monitoring Integration**
         
     | 
| 101 | 
         
            +
            - **Trackio Space**: Real-time experiment tracking
         
     | 
| 102 | 
         
            +
            - **HF Datasets**: Persistent experiment storage
         
     | 
| 103 | 
         
            +
            - **System Metrics**: GPU, memory, and CPU monitoring
         
     | 
| 104 | 
         
            +
            - **Training Callbacks**: Automatic metric logging
         
     | 
| 105 | 
         
            +
             
     | 
| 106 | 
         
            +
            ### **3. Dataset Handling**
         
     | 
| 107 | 
         
            +
            - **Multi-format Support**: Prompt/completion, instruction/output, and chat formats
         
     | 
| 108 | 
         
            +
            - **Automatic Conversion**: Handles different dataset structures
         
     | 
| 109 | 
         
            +
            - **Validation**: Ensures data quality and completeness
         
     | 
| 110 | 
         
            +
            - **Splitting**: Automatic train/validation/test splits
         
     | 
| 111 | 
         
            +
             
     | 
| 112 | 
         
            +
            ### **4. Configuration Management**
         
     | 
| 113 | 
         
            +
            - **Dynamic Generation**: Creates configs based on user input
         
     | 
| 114 | 
         
            +
            - **Multiple Types**: Support for different training configurations
         
     | 
| 115 | 
         
            +
            - **Environment Variables**: Proper integration with environment
         
     | 
| 116 | 
         
            +
            - **Validation**: Ensures configuration correctness
         
     | 
| 117 | 
         
            +
             
     | 
| 118 | 
         
            +
            ### **5. Deployment Automation**
         
     | 
| 119 | 
         
            +
            - **Model Upload**: Complete model push to HF Hub
         
     | 
| 120 | 
         
            +
            - **Model Cards**: Comprehensive documentation generation
         
     | 
| 121 | 
         
            +
            - **Training Results**: Complete experiment documentation
         
     | 
| 122 | 
         
            +
            - **Testing**: Automated model validation
         
     | 
| 123 | 
         
            +
             
     | 
| 124 | 
         
            +
            ## 🚀 Pipeline Steps
         
     | 
| 125 | 
         
            +
             
     | 
| 126 | 
         
            +
            The end-to-end pipeline performs these 16 steps:
         
     | 
| 127 | 
         
            +
             
     | 
| 128 | 
         
            +
            1. **Environment Setup** - System dependencies and Python environment
         
     | 
| 129 | 
         
            +
            2. **PyTorch Installation** - CUDA-enabled PyTorch installation
         
     | 
| 130 | 
         
            +
            3. **Dependencies** - All required Python packages
         
     | 
| 131 | 
         
            +
            4. **Authentication** - HF token setup and validation
         
     | 
| 132 | 
         
            +
            5. **Trackio Deployment** - HF Space creation and configuration
         
     | 
| 133 | 
         
            +
            6. **Dataset Setup** - HF Dataset repository creation
         
     | 
| 134 | 
         
            +
            7. **Trackio Configuration** - Environment and dataset configuration
         
     | 
| 135 | 
         
            +
            8. **Training Config** - Dynamic configuration generation
         
     | 
| 136 | 
         
            +
            9. **Dataset Preparation** - Download and format conversion
         
     | 
| 137 | 
         
            +
            10. **Parameter Calculation** - Training steps and batch calculations
         
     | 
| 138 | 
         
            +
            11. **Training Execution** - Model fine-tuning with monitoring
         
     | 
| 139 | 
         
            +
            12. **Model Push** - Upload to HF Hub with documentation
         
     | 
| 140 | 
         
            +
            13. **Model Testing** - Validation of uploaded model
         
     | 
| 141 | 
         
            +
            14. **Summary Report** - Complete training documentation
         
     | 
| 142 | 
         
            +
            15. **Resource Links** - All online resource URLs
         
     | 
| 143 | 
         
            +
            16. **Next Steps** - Usage instructions and recommendations
         
     | 
| 144 | 
         
            +
             
     | 
| 145 | 
         
            +
            ## 📊 Monitoring Features
         
     | 
| 146 | 
         
            +
             
     | 
| 147 | 
         
            +
            ### **Trackio Space Interface**
         
     | 
| 148 | 
         
            +
            - Real-time training metrics
         
     | 
| 149 | 
         
            +
            - Experiment comparison
         
     | 
| 150 | 
         
            +
            - System resource monitoring
         
     | 
| 151 | 
         
            +
            - Training progress visualization
         
     | 
| 152 | 
         
            +
             
     | 
| 153 | 
         
            +
            ### **HF Dataset Storage**
         
     | 
| 154 | 
         
            +
            - Persistent experiment data
         
     | 
| 155 | 
         
            +
            - Version-controlled history
         
     | 
| 156 | 
         
            +
            - Collaborative sharing
         
     | 
| 157 | 
         
            +
            - Automated backup
         
     | 
| 158 | 
         
            +
             
     | 
| 159 | 
         
            +
            ### **Comprehensive Logging**
         
     | 
| 160 | 
         
            +
            - Training metrics (loss, accuracy, etc.)
         
     | 
| 161 | 
         
            +
            - System metrics (GPU, memory, CPU)
         
     | 
| 162 | 
         
            +
            - Configuration parameters
         
     | 
| 163 | 
         
            +
            - Training artifacts
         
     | 
| 164 | 
         
            +
             
     | 
| 165 | 
         
            +
            ## 🔧 Configuration Options
         
     | 
| 166 | 
         
            +
             
     | 
| 167 | 
         
            +
            ### **User Configuration**
         
     | 
| 168 | 
         
            +
            ```bash
         
     | 
| 169 | 
         
            +
            # Required
         
     | 
| 170 | 
         
            +
            HF_TOKEN="your_token"
         
     | 
| 171 | 
         
            +
            HF_USERNAME="your_username"
         
     | 
| 172 | 
         
            +
             
     | 
| 173 | 
         
            +
            # Optional
         
     | 
| 174 | 
         
            +
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
         
     | 
| 175 | 
         
            +
            DATASET_NAME="HuggingFaceTB/smoltalk"
         
     | 
| 176 | 
         
            +
            ```
         
     | 
| 177 | 
         
            +
             
     | 
| 178 | 
         
            +
            ### **Training Parameters**
         
     | 
| 179 | 
         
            +
            ```bash
         
     | 
| 180 | 
         
            +
            BATCH_SIZE=2
         
     | 
| 181 | 
         
            +
            GRADIENT_ACCUMULATION_STEPS=8
         
     | 
| 182 | 
         
            +
            LEARNING_RATE=5e-6
         
     | 
| 183 | 
         
            +
            MAX_EPOCHS=3
         
     | 
| 184 | 
         
            +
            MAX_SEQ_LENGTH=4096
         
     | 
| 185 | 
         
            +
            ```
         
     | 
| 186 | 
         
            +
             
     | 
| 187 | 
         
            +
            ### **Monitoring Configuration**
         
     | 
| 188 | 
         
            +
            ```bash
         
     | 
| 189 | 
         
            +
            TRACKIO_DATASET_REPO="username/trackio-experiments"
         
     | 
| 190 | 
         
            +
            EXPERIMENT_NAME="smollm3_finetune_YYYYMMDD_HHMMSS"
         
     | 
| 191 | 
         
            +
            ```
         
     | 
| 192 | 
         
            +
             
     | 
| 193 | 
         
            +
            ## 🛠️ Error Handling
         
     | 
| 194 | 
         
            +
             
     | 
| 195 | 
         
            +
            ### **Comprehensive Error Handling**
         
     | 
| 196 | 
         
            +
            - Import error detection and reporting
         
     | 
| 197 | 
         
            +
            - Configuration validation
         
     | 
| 198 | 
         
            +
            - Network timeout handling
         
     | 
| 199 | 
         
            +
            - Graceful degradation
         
     | 
| 200 | 
         
            +
             
     | 
| 201 | 
         
            +
            ### **Debugging Support**
         
     | 
| 202 | 
         
            +
            - Detailed logging at all levels
         
     | 
| 203 | 
         
            +
            - Component-specific error messages
         
     | 
| 204 | 
         
            +
            - Fallback mechanisms
         
     | 
| 205 | 
         
            +
            - Testing utilities
         
     | 
| 206 | 
         
            +
             
     | 
| 207 | 
         
            +
            ## 📈 Performance Optimizations
         
     | 
| 208 | 
         
            +
             
     | 
| 209 | 
         
            +
            ### **Training Optimizations**
         
     | 
| 210 | 
         
            +
            - Flash Attention for efficiency
         
     | 
| 211 | 
         
            +
            - Gradient checkpointing for memory
         
     | 
| 212 | 
         
            +
            - Mixed precision training
         
     | 
| 213 | 
         
            +
            - Optimized data loading
         
     | 
| 214 | 
         
            +
             
     | 
| 215 | 
         
            +
            ### **Monitoring Optimizations**
         
     | 
| 216 | 
         
            +
            - Asynchronous logging
         
     | 
| 217 | 
         
            +
            - Batch metric updates
         
     | 
| 218 | 
         
            +
            - Efficient data storage
         
     | 
| 219 | 
         
            +
            - Minimal overhead
         
     | 
| 220 | 
         
            +
             
     | 
| 221 | 
         
            +
            ## 🔄 Integration Points
         
     | 
| 222 | 
         
            +
             
     | 
| 223 | 
         
            +
            ### **Hugging Face Ecosystem**
         
     | 
| 224 | 
         
            +
            - **HF Hub**: Model and dataset storage
         
     | 
| 225 | 
         
            +
            - **HF Spaces**: Trackio monitoring interface
         
     | 
| 226 | 
         
            +
            - **HF Datasets**: Experiment data persistence
         
     | 
| 227 | 
         
            +
            - **HF CLI**: Authentication and deployment
         
     | 
| 228 | 
         
            +
             
     | 
| 229 | 
         
            +
            ### **External Services**
         
     | 
| 230 | 
         
            +
            - **Trackio**: Experiment tracking
         
     | 
| 231 | 
         
            +
            - **CUDA**: GPU acceleration
         
     | 
| 232 | 
         
            +
            - **PyTorch**: Deep learning framework
         
     | 
| 233 | 
         
            +
            - **Transformers**: Model library
         
     | 
| 234 | 
         
            +
             
     | 
| 235 | 
         
            +
            ## 🎯 Usage Workflow
         
     | 
| 236 | 
         
            +
             
     | 
| 237 | 
         
            +
            ### **1. Setup Phase**
         
     | 
| 238 | 
         
            +
            ```bash
         
     | 
| 239 | 
         
            +
            python setup_launch.py  # Configure with user info
         
     | 
| 240 | 
         
            +
            python test_pipeline.py # Verify all components
         
     | 
| 241 | 
         
            +
            ```
         
     | 
| 242 | 
         
            +
             
     | 
| 243 | 
         
            +
            ### **2. Execution Phase**
         
     | 
| 244 | 
         
            +
            ```bash
         
     | 
| 245 | 
         
            +
            chmod +x launch.sh      # Make executable
         
     | 
| 246 | 
         
            +
            ./launch.sh             # Run complete pipeline
         
     | 
| 247 | 
         
            +
            ```
         
     | 
| 248 | 
         
            +
             
     | 
| 249 | 
         
            +
            ### **3. Monitoring Phase**
         
     | 
| 250 | 
         
            +
            - Track progress in Trackio Space
         
     | 
| 251 | 
         
            +
            - Monitor metrics in real-time
         
     | 
| 252 | 
         
            +
            - Check logs for issues
         
     | 
| 253 | 
         
            +
            - Validate results
         
     | 
| 254 | 
         
            +
             
     | 
| 255 | 
         
            +
            ### **4. Results Phase**
         
     | 
| 256 | 
         
            +
            - Access model on HF Hub
         
     | 
| 257 | 
         
            +
            - Review training summary
         
     | 
| 258 | 
         
            +
            - Test model performance
         
     | 
| 259 | 
         
            +
            - Share results
         
     | 
| 260 | 
         
            +
             
     | 
| 261 | 
         
            +
            ## 📋 Quality Assurance
         
     | 
| 262 | 
         
            +
             
     | 
| 263 | 
         
            +
            ### **Testing Coverage**
         
     | 
| 264 | 
         
            +
            - Import testing for all modules
         
     | 
| 265 | 
         
            +
            - Script availability verification
         
     | 
| 266 | 
         
            +
            - Configuration validation
         
     | 
| 267 | 
         
            +
            - CUDA and token testing
         
     | 
| 268 | 
         
            +
            - Component integration testing
         
     | 
| 269 | 
         
            +
             
     | 
| 270 | 
         
            +
            ### **Documentation**
         
     | 
| 271 | 
         
            +
            - Comprehensive README
         
     | 
| 272 | 
         
            +
            - Step-by-step guides
         
     | 
| 273 | 
         
            +
            - Troubleshooting section
         
     | 
| 274 | 
         
            +
            - Advanced usage examples
         
     | 
| 275 | 
         
            +
             
     | 
| 276 | 
         
            +
            ### **Error Recovery**
         
     | 
| 277 | 
         
            +
            - Graceful error handling
         
     | 
| 278 | 
         
            +
            - Detailed error messages
         
     | 
| 279 | 
         
            +
            - Recovery mechanisms
         
     | 
| 280 | 
         
            +
            - Fallback options
         
     | 
| 281 | 
         
            +
             
     | 
| 282 | 
         
            +
            ## 🚀 Future Enhancements
         
     | 
| 283 | 
         
            +
             
     | 
| 284 | 
         
            +
            ### **Planned Improvements**
         
     | 
| 285 | 
         
            +
            - Multi-GPU training support
         
     | 
| 286 | 
         
            +
            - Distributed training
         
     | 
| 287 | 
         
            +
            - Advanced hyperparameter tuning
         
     | 
| 288 | 
         
            +
            - Custom dataset upload
         
     | 
| 289 | 
         
            +
            - Model evaluation metrics
         
     | 
| 290 | 
         
            +
            - Automated testing pipeline
         
     | 
| 291 | 
         
            +
             
     | 
| 292 | 
         
            +
            ### **Extensibility**
         
     | 
| 293 | 
         
            +
            - Plugin architecture for custom components
         
     | 
| 294 | 
         
            +
            - Configuration templates
         
     | 
| 295 | 
         
            +
            - Custom monitoring backends
         
     | 
| 296 | 
         
            +
            - Advanced deployment options
         
     | 
| 297 | 
         
            +
             
     | 
| 298 | 
         
            +
            ## 📊 Success Metrics
         
     | 
| 299 | 
         
            +
             
     | 
| 300 | 
         
            +
            ### **Pipeline Completeness**
         
     | 
| 301 | 
         
            +
            - ✅ All 16 steps implemented
         
     | 
| 302 | 
         
            +
            - ✅ Error handling at each step
         
     | 
| 303 | 
         
            +
            - ✅ Monitoring integration
         
     | 
| 304 | 
         
            +
            - ✅ Documentation complete
         
     | 
| 305 | 
         
            +
             
     | 
| 306 | 
         
            +
            ### **User Experience**
         
     | 
| 307 | 
         
            +
            - ✅ Simple setup process
         
     | 
| 308 | 
         
            +
            - ✅ Clear error messages
         
     | 
| 309 | 
         
            +
            - ✅ Comprehensive documentation
         
     | 
| 310 | 
         
            +
            - ✅ Testing utilities
         
     | 
| 311 | 
         
            +
             
     | 
| 312 | 
         
            +
            ### **Technical Quality**
         
     | 
| 313 | 
         
            +
            - ✅ Import path fixes
         
     | 
| 314 | 
         
            +
            - ✅ Configuration management
         
     | 
| 315 | 
         
            +
            - ✅ Monitoring integration
         
     | 
| 316 | 
         
            +
            - ✅ Deployment automation
         
     | 
| 317 | 
         
            +
             
     | 
| 318 | 
         
            +
            ## 🎉 Conclusion
         
     | 
| 319 | 
         
            +
             
     | 
| 320 | 
         
            +
            The SmolLM3 end-to-end pipeline provides a complete solution for fine-tuning with integrated monitoring, automated deployment, and comprehensive documentation. The refactored codebase is now production-ready, with proper error handling, testing, and attention to user experience.
         
     | 
| 321 | 
         
            +
             
     | 
| 322 | 
         
            +
            **Key Achievements:**
         
     | 
| 323 | 
         
            +
            - Complete end-to-end automation
         
     | 
| 324 | 
         
            +
            - Integrated monitoring and tracking
         
     | 
| 325 | 
         
            +
            - Comprehensive error handling
         
     | 
| 326 | 
         
            +
            - Production-ready deployment
         
     | 
| 327 | 
         
            +
            - Extensive documentation
         
     | 
| 328 | 
         
            +
            - Testing and validation suite
         
     | 
| 329 | 
         
            +
             
     | 
| 330 | 
         
            +
            The pipeline is now ready for users to easily fine-tune SmolLM3 models with full monitoring and deployment capabilities.
         
     | 
    	
        README.md
    CHANGED
    
    | 
         @@ -1,4 +1,4 @@ 
     | 
|
| 1 | 
         
            -
            # SmolLM3 Fine-tuning 
     | 
| 2 | 
         | 
| 3 | 
         
             
            This repository provides a complete setup for fine-tuning SmolLM3 models using the FlexAI console, following the nanoGPT structure but adapted for modern transformer models.
         
     | 
| 4 | 
         | 
| 
         | 
|
| 1 | 
         
            +
            # SmolLM3 Fine-tuning
         
     | 
| 2 | 
         | 
| 3 | 
         
             
            This repository provides a complete setup for fine-tuning SmolLM3 models using the FlexAI console, following the nanoGPT structure but adapted for modern transformer models.
         
     | 
| 4 | 
         | 
    	
        README_END_TO_END.md
    ADDED
    
    | 
         @@ -0,0 +1,304 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # SmolLM3 End-to-End Fine-tuning Pipeline
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            This repository provides a complete end-to-end pipeline for fine-tuning SmolLM3 models with integrated experiment tracking, monitoring, and model deployment.
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            ## 🚀 Quick Start
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ### 1. Setup Configuration
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            ```bash
         
     | 
| 10 | 
         
            +
            # Run the setup script to configure the pipeline with your information
         
     | 
| 11 | 
         
            +
            python setup_launch.py
         
     | 
| 12 | 
         
            +
            ```
         
     | 
| 13 | 
         
            +
             
     | 
| 14 | 
         
            +
            This will prompt you for:
         
     | 
| 15 | 
         
            +
            - Your Hugging Face username
         
     | 
| 16 | 
         
            +
            - Your Hugging Face token
         
     | 
| 17 | 
         
            +
            - Optional model and dataset customizations
         
     | 
| 18 | 
         
            +
             
     | 
| 19 | 
         
            +
            ### 2. Check Requirements
         
     | 
| 20 | 
         
            +
             
     | 
| 21 | 
         
            +
            ```bash
         
     | 
| 22 | 
         
            +
            # Verify all dependencies are installed
         
     | 
| 23 | 
         
            +
            python check_requirements.py
         
     | 
| 24 | 
         
            +
            ```
         
     | 
| 25 | 
         
            +
             
     | 
| 26 | 
         
            +
            ### 3. Run the Pipeline
         
     | 
| 27 | 
         
            +
             
     | 
| 28 | 
         
            +
            ```bash
         
     | 
| 29 | 
         
            +
            # Make the script executable and run
         
     | 
| 30 | 
         
            +
            chmod +x launch.sh
         
     | 
| 31 | 
         
            +
            ./launch.sh
         
     | 
| 32 | 
         
            +
            ```
         
     | 
| 33 | 
         
            +
             
     | 
| 34 | 
         
            +
            ## 📋 What the Pipeline Does
         
     | 
| 35 | 
         
            +
             
     | 
| 36 | 
         
            +
            The end-to-end pipeline performs the following steps:
         
     | 
| 37 | 
         
            +
             
     | 
| 38 | 
         
            +
            ### 1. **Environment Setup**
         
     | 
| 39 | 
         
            +
            - Installs system dependencies
         
     | 
| 40 | 
         
            +
            - Creates Python virtual environment
         
     | 
| 41 | 
         
            +
            - Installs PyTorch with CUDA support
         
     | 
| 42 | 
         
            +
            - Installs all required Python packages
         
     | 
| 43 | 
         
            +
             
     | 
| 44 | 
         
            +
            ### 2. **Trackio Space Deployment**
         
     | 
| 45 | 
         
            +
            - Creates a new Hugging Face Space for experiment tracking
         
     | 
| 46 | 
         
            +
            - Configures the Trackio monitoring interface
         
     | 
| 47 | 
         
            +
            - Sets up environment variables
         
     | 
| 48 | 
         
            +
             
     | 
| 49 | 
         
            +
            ### 3. **HF Dataset Setup**
         
     | 
| 50 | 
         
            +
            - Creates a Hugging Face Dataset repository for experiment storage
         
     | 
| 51 | 
         
            +
            - Configures dataset access and permissions
         
     | 
| 52 | 
         
            +
            - Sets up initial experiment data structure
         
     | 
| 53 | 
         
            +
             
     | 
| 54 | 
         
            +
            ### 4. **Dataset Preparation**
         
     | 
| 55 | 
         
            +
            - Downloads the specified dataset from Hugging Face Hub
         
     | 
| 56 | 
         
            +
            - Converts to training format (prompt/completion pairs)
         
     | 
| 57 | 
         
            +
            - Handles multiple dataset formats automatically
         
     | 
| 58 | 
         
            +
            - Creates train/validation splits
         
     | 
| 59 | 
         
            +
             
     | 
| 60 | 
         
            +
            ### 5. **Training Configuration**
         
     | 
| 61 | 
         
            +
            - Creates optimized training configuration
         
     | 
| 62 | 
         
            +
            - Sets up monitoring integration
         
     | 
| 63 | 
         
            +
            - Configures model parameters and hyperparameters
         
     | 
| 64 | 
         
            +
             
     | 
| 65 | 
         
            +
            ### 6. **Model Training**
         
     | 
| 66 | 
         
            +
            - Runs the SmolLM3 fine-tuning process
         
     | 
| 67 | 
         
            +
            - Logs metrics to Trackio Space in real-time
         
     | 
| 68 | 
         
            +
            - Saves experiment data to HF Dataset
         
     | 
| 69 | 
         
            +
            - Creates checkpoints during training
         
     | 
| 70 | 
         
            +
             
     | 
| 71 | 
         
            +
            ### 7. **Model Deployment**
         
     | 
| 72 | 
         
            +
            - Pushes trained model to Hugging Face Hub
         
     | 
| 73 | 
         
            +
            - Creates comprehensive model card
         
     | 
| 74 | 
         
            +
            - Uploads training results and logs
         
     | 
| 75 | 
         
            +
            - Tests the uploaded model
         
     | 
| 76 | 
         
            +
             
     | 
| 77 | 
         
            +
            ### 8. **Summary Report**
         
     | 
| 78 | 
         
            +
            - Generates detailed training summary
         
     | 
| 79 | 
         
            +
            - Provides links to all resources
         
     | 
| 80 | 
         
            +
            - Documents configuration and results
         
     | 
| 81 | 
         
            +
             
     | 
| 82 | 
         
            +
            ## 🎯 Features
         
     | 
| 83 | 
         
            +
             
     | 
| 84 | 
         
            +
            ### **Integrated Monitoring**
         
     | 
| 85 | 
         
            +
            - Real-time experiment tracking via Trackio Space
         
     | 
| 86 | 
         
            +
            - Persistent storage in Hugging Face Datasets
         
     | 
| 87 | 
         
            +
            - Comprehensive metrics logging
         
     | 
| 88 | 
         
            +
            - System resource monitoring
         
     | 
| 89 | 
         
            +
             
     | 
| 90 | 
         
            +
            ### **Flexible Dataset Support**
         
     | 
| 91 | 
         
            +
            - Automatic format detection and conversion
         
     | 
| 92 | 
         
            +
            - Support for multiple dataset types
         
     | 
| 93 | 
         
            +
            - Built-in data preprocessing
         
     | 
| 94 | 
         
            +
            - Train/validation split handling
         
     | 
| 95 | 
         
            +
             
     | 
| 96 | 
         
            +
            ### **Optimized Training**
         
     | 
| 97 | 
         
            +
            - Flash Attention support for efficiency
         
     | 
| 98 | 
         
            +
            - Gradient checkpointing for memory optimization
         
     | 
| 99 | 
         
            +
            - Mixed precision training
         
     | 
| 100 | 
         
            +
            - Automatic hyperparameter optimization
         
     | 
| 101 | 
         
            +
             
     | 
| 102 | 
         
            +
            ### **Complete Deployment**
         
     | 
| 103 | 
         
            +
            - Automated model upload to Hugging Face Hub
         
     | 
| 104 | 
         
            +
            - Comprehensive model cards
         
     | 
| 105 | 
         
            +
            - Training results documentation
         
     | 
| 106 | 
         
            +
            - Model testing and validation
         
     | 
| 107 | 
         
            +
             
     | 
| 108 | 
         
            +
            ## 📊 Monitoring & Tracking
         
     | 
| 109 | 
         
            +
             
     | 
| 110 | 
         
            +
            ### **Trackio Space Interface**
         
     | 
| 111 | 
         
            +
            - Real-time training metrics visualization
         
     | 
| 112 | 
         
            +
            - Experiment management and comparison
         
     | 
| 113 | 
         
            +
            - System resource monitoring
         
     | 
| 114 | 
         
            +
            - Training progress tracking
         
     | 
| 115 | 
         
            +
             
     | 
| 116 | 
         
            +
            ### **HF Dataset Storage**
         
     | 
| 117 | 
         
            +
            - Persistent experiment data storage
         
     | 
| 118 | 
         
            +
            - Version-controlled experiment history
         
     | 
| 119 | 
         
            +
            - Collaborative experiment sharing
         
     | 
| 120 | 
         
            +
            - Automated data backup
         
     | 
| 121 | 
         
            +
             
     | 
| 122 | 
         
            +
            ## 🔧 Configuration
         
     | 
| 123 | 
         
            +
             
     | 
| 124 | 
         
            +
            ### **Required Configuration**
         
     | 
| 125 | 
         
            +
            Update these variables in `launch.sh`:
         
     | 
| 126 | 
         
            +
             
     | 
| 127 | 
         
            +
            ```bash
         
     | 
| 128 | 
         
            +
            # Your Hugging Face credentials
         
     | 
| 129 | 
         
            +
            HF_TOKEN="your_hf_token_here"
         
     | 
| 130 | 
         
            +
            HF_USERNAME="your-username"
         
     | 
| 131 | 
         
            +
             
     | 
| 132 | 
         
            +
            # Model and dataset
         
     | 
| 133 | 
         
            +
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
         
     | 
| 134 | 
         
            +
            DATASET_NAME="HuggingFaceTB/smoltalk"
         
     | 
| 135 | 
         
            +
             
     | 
| 136 | 
         
            +
            # Output repositories
         
     | 
| 137 | 
         
            +
            REPO_NAME="your-username/smollm3-finetuned-$(date +%Y%m%d)"
         
     | 
| 138 | 
         
            +
            TRACKIO_DATASET_REPO="your-username/trackio-experiments"
         
     | 
| 139 | 
         
            +
            ```
         
     | 
| 140 | 
         
            +
             
     | 
| 141 | 
         
            +
            ### **Training Parameters**
         
     | 
| 142 | 
         
            +
            Customize training parameters:
         
     | 
| 143 | 
         
            +
             
     | 
| 144 | 
         
            +
            ```bash
         
     | 
| 145 | 
         
            +
            # Training configuration
         
     | 
| 146 | 
         
            +
            BATCH_SIZE=2
         
     | 
| 147 | 
         
            +
            GRADIENT_ACCUMULATION_STEPS=8
         
     | 
| 148 | 
         
            +
            LEARNING_RATE=5e-6
         
     | 
| 149 | 
         
            +
            MAX_EPOCHS=3
         
     | 
| 150 | 
         
            +
            MAX_SEQ_LENGTH=4096
         
     | 
| 151 | 
         
            +
            ```
         
     | 
| 152 | 
         
            +
             
     | 
| 153 | 
         
            +
            ## 📁 Output Structure
         
     | 
| 154 | 
         
            +
             
     | 
| 155 | 
         
            +
            After running the pipeline, you'll have:
         
     | 
| 156 | 
         
            +
             
     | 
| 157 | 
         
            +
            ```
         
     | 
| 158 | 
         
            +
            ├── training_dataset/           # Prepared dataset
         
     | 
| 159 | 
         
            +
            │   ├── train.json
         
     | 
| 160 | 
         
            +
            │   └── validation.json
         
     | 
| 161 | 
         
            +
            ├── /output-checkpoint/         # Model checkpoints
         
     | 
| 162 | 
         
            +
            │   ├── config.json
         
     | 
| 163 | 
         
            +
            │   ├── pytorch_model.bin
         
     | 
| 164 | 
         
            +
            │   └── training_results/
         
     | 
| 165 | 
         
            +
            ├── training.log               # Training logs
         
     | 
| 166 | 
         
            +
            ├── training_summary.md        # Summary report
         
     | 
| 167 | 
         
            +
            └── config/train_smollm3_end_to_end.py  # Training config
         
     | 
| 168 | 
         
            +
            ```
         
     | 
| 169 | 
         
            +
             
     | 
| 170 | 
         
            +
            ## 🌐 Online Resources
         
     | 
| 171 | 
         
            +
             
     | 
| 172 | 
         
            +
            The pipeline creates these online resources:
         
     | 
| 173 | 
         
            +
             
     | 
| 174 | 
         
            +
            - **Model Repository**: `https://huggingface.co/your-username/smollm3-finetuned-YYYYMMDD`
         
     | 
| 175 | 
         
            +
            - **Trackio Space**: `https://huggingface.co/spaces/your-username/trackio-monitoring-YYYYMMDD`
         
     | 
| 176 | 
         
            +
            - **Experiment Dataset**: `https://huggingface.co/datasets/your-username/trackio-experiments`
         
     | 
| 177 | 
         
            +
             
     | 
| 178 | 
         
            +
            ## 🛠️ Troubleshooting
         
     | 
| 179 | 
         
            +
             
     | 
| 180 | 
         
            +
            ### **Common Issues**
         
     | 
| 181 | 
         
            +
             
     | 
| 182 | 
         
            +
            1. **HF Token Issues**
         
     | 
| 183 | 
         
            +
               ```bash
         
     | 
| 184 | 
         
            +
               # Verify your token is correct
         
     | 
| 185 | 
         
            +
               huggingface-cli whoami
         
     | 
| 186 | 
         
            +
               ```
         
     | 
| 187 | 
         
            +
             
     | 
| 188 | 
         
            +
            2. **CUDA Issues**
         
     | 
| 189 | 
         
            +
               ```bash
         
     | 
| 190 | 
         
            +
               # Check CUDA availability
         
     | 
| 191 | 
         
            +
               python -c "import torch; print(torch.cuda.is_available())"
         
     | 
| 192 | 
         
            +
               ```
         
     | 
| 193 | 
         
            +
             
     | 
| 194 | 
         
            +
            3. **Memory Issues**
         
     | 
| 195 | 
         
            +
               ```bash
         
     | 
| 196 | 
         
            +
               # Reduce the per-device batch size and raise gradient accumulation to keep the same effective batch size
         
     | 
| 197 | 
         
            +
               BATCH_SIZE=1
         
     | 
| 198 | 
         
            +
               GRADIENT_ACCUMULATION_STEPS=16
         
     | 
| 199 | 
         
            +
               ```
         
     | 
| 200 | 
         
            +
             
     | 
| 201 | 
         
            +
            4. **Dataset Issues**
         
     | 
| 202 | 
         
            +
               ```bash
         
     | 
| 203 | 
         
            +
               # Test dataset access
         
     | 
| 204 | 
         
            +
               python -c "from datasets import load_dataset; print(load_dataset('your-dataset'))"
         
     | 
| 205 | 
         
            +
               ```
         
     | 
| 206 | 
         
            +
             
     | 
| 207 | 
         
            +
            ### **Debug Mode**
         
     | 
| 208 | 
         
            +
             
     | 
| 209 | 
         
            +
            Run individual components for debugging:
         
     | 
| 210 | 
         
            +
             
     | 
| 211 | 
         
            +
            ```bash
         
     | 
| 212 | 
         
            +
            # Test Trackio deployment
         
     | 
| 213 | 
         
            +
            cd scripts/trackio_tonic
         
     | 
| 214 | 
         
            +
            python deploy_trackio_space.py && cd ../..
         
     | 
| 215 | 
         
            +
             
     | 
| 216 | 
         
            +
            # Test dataset setup
         
     | 
| 217 | 
         
            +
            cd scripts/dataset_tonic
         
     | 
| 218 | 
         
            +
            python setup_hf_dataset.py && cd ../..
         
     | 
| 219 | 
         
            +
             
     | 
| 220 | 
         
            +
            # Test training
         
     | 
| 221 | 
         
            +
            python src/train.py config/train_smollm3_end_to_end.py --help
         
     | 
| 222 | 
         
            +
            ```
         
     | 
| 223 | 
         
            +
             
     | 
| 224 | 
         
            +
            ## 📚 Advanced Usage
         
     | 
| 225 | 
         
            +
             
     | 
| 226 | 
         
            +
            ### **Custom Datasets**
         
     | 
| 227 | 
         
            +
             
     | 
| 228 | 
         
            +
            For custom datasets, ensure they have one of these formats:
         
     | 
| 229 | 
         
            +
             
     | 
| 230 | 
         
            +
            ```json
         
     | 
| 231 | 
         
            +
            // Format 1: Prompt/Completion
         
     | 
| 232 | 
         
            +
            {
         
     | 
| 233 | 
         
            +
              "prompt": "What is machine learning?",
         
     | 
| 234 | 
         
            +
              "completion": "Machine learning is..."
         
     | 
| 235 | 
         
            +
            }
         
     | 
| 236 | 
         
            +
             
     | 
| 237 | 
         
            +
            // Format 2: Instruction/Output
         
     | 
| 238 | 
         
            +
            {
         
     | 
| 239 | 
         
            +
              "instruction": "Explain machine learning",
         
     | 
| 240 | 
         
            +
              "output": "Machine learning is..."
         
     | 
| 241 | 
         
            +
            }
         
     | 
| 242 | 
         
            +
             
     | 
| 243 | 
         
            +
            // Format 3: Chat format
         
     | 
| 244 | 
         
            +
            {
         
     | 
| 245 | 
         
            +
              "messages": [
         
     | 
| 246 | 
         
            +
                {"role": "user", "content": "What is ML?"},
         
     | 
| 247 | 
         
            +
                {"role": "assistant", "content": "ML is..."}
         
     | 
| 248 | 
         
            +
              ]
         
     | 
| 249 | 
         
            +
            }
         
     | 
| 250 | 
         
            +
            ```
         
     | 
| 251 | 
         
            +
             
     | 
| 252 | 
         
            +
            ### **Custom Models**
         
     | 
| 253 | 
         
            +
             
     | 
| 254 | 
         
            +
            To use different models, update the configuration:
         
     | 
| 255 | 
         
            +
             
     | 
| 256 | 
         
            +
            ```bash
         
     | 
| 257 | 
         
            +
            MODEL_NAME="microsoft/DialoGPT-medium"
         
     | 
| 258 | 
         
            +
            MAX_SEQ_LENGTH=1024
         
     | 
| 259 | 
         
            +
            ```
         
     | 
| 260 | 
         
            +
             
     | 
| 261 | 
         
            +
            ### **Custom Training**
         
     | 
| 262 | 
         
            +
             
     | 
| 263 | 
         
            +
            Modify training parameters in the generated config:
         
     | 
| 264 | 
         
            +
             
     | 
| 265 | 
         
            +
            ```python
         
     | 
| 266 | 
         
            +
            # In config/train_smollm3_end_to_end.py
         
     | 
| 267 | 
         
            +
            config = SmolLM3Config(
         
     | 
| 268 | 
         
            +
                learning_rate=1e-5,  # Custom learning rate
         
     | 
| 269 | 
         
            +
                max_iters=5000,      # Custom training steps
         
     | 
| 270 | 
         
            +
                # ... other parameters
         
     | 
| 271 | 
         
            +
            )
         
     | 
| 272 | 
         
            +
            ```
         
     | 
| 273 | 
         
            +
             
     | 
| 274 | 
         
            +
            ## 🤝 Contributing
         
     | 
| 275 | 
         
            +
             
     | 
| 276 | 
         
            +
            1. Fork the repository
         
     | 
| 277 | 
         
            +
            2. Create a feature branch
         
     | 
| 278 | 
         
            +
            3. Make your changes
         
     | 
| 279 | 
         
            +
            4. Test the pipeline
         
     | 
| 280 | 
         
            +
            5. Submit a pull request
         
     | 
| 281 | 
         
            +
             
     | 
| 282 | 
         
            +
            ## 📄 License
         
     | 
| 283 | 
         
            +
             
     | 
| 284 | 
         
            +
            This project is licensed under the MIT License - see the LICENSE file for details.
         
     | 
| 285 | 
         
            +
             
     | 
| 286 | 
         
            +
            ## 🙏 Acknowledgments
         
     | 
| 287 | 
         
            +
             
     | 
| 288 | 
         
            +
            - Hugging Face for the excellent transformers library
         
     | 
| 289 | 
         
            +
            - The SmolLM3 team for the base model
         
     | 
| 290 | 
         
            +
            - The Trackio team for experiment tracking
         
     | 
| 291 | 
         
            +
            - The open-source community for contributions
         
     | 
| 292 | 
         
            +
             
     | 
| 293 | 
         
            +
            ## 📞 Support
         
     | 
| 294 | 
         
            +
             
     | 
| 295 | 
         
            +
            For issues and questions:
         
     | 
| 296 | 
         
            +
             
     | 
| 297 | 
         
            +
            1. Check the troubleshooting section
         
     | 
| 298 | 
         
            +
            2. Review the logs in `training.log`
         
     | 
| 299 | 
         
            +
            3. Check the Trackio Space for monitoring data
         
     | 
| 300 | 
         
            +
            4. Open an issue on GitHub
         
     | 
| 301 | 
         
            +
             
     | 
| 302 | 
         
            +
            ---
         
     | 
| 303 | 
         
            +
             
     | 
| 304 | 
         
            +
            **Happy Fine-tuning! 🚀** 
         
     | 
    	
        cloud_deployment.sh
    DELETED
    
    | 
         @@ -1,279 +0,0 @@ 
     | 
|
| 1 | 
         
            -
            #!/bin/bash
         
     | 
| 2 | 
         
            -
            # Cloud Deployment Script for SmolLM3 DPO Training
         
     | 
| 3 | 
         
            -
            # This script sets up a cloud instance for training and uploading to Hugging Face
         
     | 
| 4 | 
         
            -
             
     | 
| 5 | 
         
            -
            set -e  # Exit on any error
         
     | 
| 6 | 
         
            -
             
     | 
| 7 | 
         
            -
            echo "🚀 Starting SmolLM3 DPO Cloud Deployment"
         
     | 
| 8 | 
         
            -
            echo "=========================================="
         
     | 
| 9 | 
         
            -
             
     | 
| 10 | 
         
            -
            # Configuration
         
     | 
| 11 | 
         
            -
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
         
     | 
| 12 | 
         
            -
            DATASET_NAME="HuggingFaceTB/smoltalk"
         
     | 
| 13 | 
         
            -
            EXPERIMENT_NAME="smollm3_dpo_6epochs"
         
     | 
| 14 | 
         
            -
            REPO_NAME="your-username/smollm3-dpo-6epochs"  # Change this to your username
         
     | 
| 15 | 
         
            -
            TRACKIO_URL="https://your-trackio-space.hf.space"  # Change this to your Trackio Space URL
         
     | 
| 16 | 
         
            -
            HF_TOKEN="your_hf_token_here"  # Change this to your HF token
         
     | 
| 17 | 
         
            -
             
     | 
| 18 | 
         
            -
            # Training Configuration
         
     | 
| 19 | 
         
            -
            BATCH_SIZE=2
         
     | 
| 20 | 
         
            -
            GRADIENT_ACCUMULATION_STEPS=8
         
     | 
| 21 | 
         
            -
            LEARNING_RATE=5e-6
         
     | 
| 22 | 
         
            -
            MAX_EPOCHS=6
         
     | 
| 23 | 
         
            -
            MAX_SEQ_LENGTH=4096
         
     | 
| 24 | 
         
            -
            SAVE_STEPS=500
         
     | 
| 25 | 
         
            -
            EVAL_STEPS=100
         
     | 
| 26 | 
         
            -
            LOGGING_STEPS=10
         
     | 
| 27 | 
         
            -
             
     | 
| 28 | 
         
            -
            echo "📋 Configuration:"
         
     | 
| 29 | 
         
            -
            echo "  Model: $MODEL_NAME"
         
     | 
| 30 | 
         
            -
            echo "  Dataset: $DATASET_NAME"
         
     | 
| 31 | 
         
            -
            echo "  Experiment: $EXPERIMENT_NAME"
         
     | 
| 32 | 
         
            -
            echo "  Repository: $REPO_NAME"
         
     | 
| 33 | 
         
            -
            echo "  Epochs: $MAX_EPOCHS"
         
     | 
| 34 | 
         
            -
            echo "  Batch Size: $BATCH_SIZE"
         
     | 
| 35 | 
         
            -
            echo "  Learning Rate: $LEARNING_RATE"
         
     | 
| 36 | 
         
            -
             
     | 
| 37 | 
         
            -
            # Step 1: Update system and install dependencies
         
     | 
| 38 | 
         
            -
            echo ""
         
     | 
| 39 | 
         
            -
            echo "🔧 Step 1: Installing system dependencies..."
         
     | 
| 40 | 
         
            -
            sudo apt-get update
         
     | 
| 41 | 
         
            -
            sudo apt-get install -y git curl wget unzip
         
     | 
| 42 | 
         
            -
             
     | 
| 43 | 
         
            -
            # Step 2: Install Python and pip
         
     | 
| 44 | 
         
            -
            echo ""
         
     | 
| 45 | 
         
            -
            echo "🐍 Step 2: Installing Python dependencies..."
         
     | 
| 46 | 
         
            -
            sudo apt-get install -y python3 python3-pip python3-venv
         
     | 
| 47 | 
         
            -
             
     | 
| 48 | 
         
            -
            # Step 3: Create virtual environment
         
     | 
| 49 | 
         
            -
            echo ""
         
     | 
| 50 | 
         
            -
            echo "📦 Step 3: Setting up Python virtual environment..."
         
     | 
| 51 | 
         
            -
            python3 -m venv smollm3_env
         
     | 
| 52 | 
         
            -
            source smollm3_env/bin/activate
         
     | 
| 53 | 
         
            -
             
     | 
| 54 | 
         
            -
            # Step 4: Install PyTorch and CUDA
         
     | 
| 55 | 
         
            -
            echo ""
         
     | 
| 56 | 
         
            -
            echo "🔥 Step 4: Installing PyTorch with CUDA support..."
         
     | 
| 57 | 
         
            -
            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
         
     | 
| 58 | 
         
            -
             
     | 
| 59 | 
         
            -
            # Step 5: Install project dependencies
         
     | 
| 60 | 
         
            -
            echo ""
         
     | 
| 61 | 
         
            -
            echo "📚 Step 5: Installing project dependencies..."
         
     | 
| 62 | 
         
            -
            pip install -r requirements.txt
         
     | 
| 63 | 
         
            -
             
     | 
| 64 | 
         
            -
            # Step 6: Install additional dependencies for DPO
         
     | 
| 65 | 
         
            -
            echo ""
         
     | 
| 66 | 
         
            -
            echo "🎯 Step 6: Installing DPO-specific dependencies..."
         
     | 
| 67 | 
         
            -
            pip install trl>=0.7.0
         
     | 
| 68 | 
         
            -
            pip install peft>=0.4.0
         
     | 
| 69 | 
         
            -
            pip install accelerate>=0.20.0
         
     | 
| 70 | 
         
            -
             
     | 
| 71 | 
         
            -
            # Step 7: Set up Hugging Face token
         
     | 
| 72 | 
         
            -
            echo ""
         
     | 
| 73 | 
         
            -
            echo "🔑 Step 7: Setting up Hugging Face authentication..."
         
     | 
| 74 | 
         
            -
            export HF_TOKEN="$HF_TOKEN"
         
     | 
| 75 | 
         
            -
            huggingface-cli login --token $HF_TOKEN
         
     | 
| 76 | 
         
            -
             
     | 
| 77 | 
         
            -
            # Step 8: Create DPO configuration
         
     | 
| 78 | 
         
            -
            echo ""
         
     | 
| 79 | 
         
            -
            echo "⚙️ Step 8: Creating DPO configuration..."
         
     | 
| 80 | 
         
            -
            cat > config/train_smollm3_dpo_6epochs.py << EOF
         
     | 
| 81 | 
         
            -
            """
         
     | 
| 82 | 
         
            -
            SmolLM3 DPO Training Configuration - 6 Epochs
         
     | 
| 83 | 
         
            -
            Optimized for cloud deployment
         
     | 
| 84 | 
         
            -
            """
         
     | 
| 85 | 
         
            -
             
     | 
| 86 | 
         
            -
            from config.train_smollm3_dpo import SmolLM3DPOConfig
         
     | 
| 87 | 
         
            -
             
     | 
| 88 | 
         
            -
            config = SmolLM3DPOConfig(
         
     | 
| 89 | 
         
            -
                # Model configuration
         
     | 
| 90 | 
         
            -
                model_name="$MODEL_NAME",
         
     | 
| 91 | 
         
            -
                max_seq_length=$MAX_SEQ_LENGTH,
         
     | 
| 92 | 
         
            -
                use_flash_attention=True,
         
     | 
| 93 | 
         
            -
                use_gradient_checkpointing=True,
         
     | 
| 94 | 
         
            -
                
         
     | 
| 95 | 
         
            -
                # Training configuration
         
     | 
| 96 | 
         
            -
                batch_size=$BATCH_SIZE,
         
     | 
| 97 | 
         
            -
                gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS,
         
     | 
| 98 | 
         
            -
                learning_rate=$LEARNING_RATE,
         
     | 
| 99 | 
         
            -
                weight_decay=0.01,
         
     | 
| 100 | 
         
            -
                warmup_steps=100,
         
     | 
| 101 | 
         
            -
                max_iters=None,  # Will be calculated based on epochs
         
     | 
| 102 | 
         
            -
                eval_interval=100,
         
     | 
| 103 | 
         
            -
                log_interval=10,
         
     | 
| 104 | 
         
            -
                save_interval=500,
         
     | 
| 105 | 
         
            -
                
         
     | 
| 106 | 
         
            -
                # DPO configuration
         
     | 
| 107 | 
         
            -
                beta=0.1,
         
     | 
| 108 | 
         
            -
                max_prompt_length=$((MAX_SEQ_LENGTH // 2)),
         
     | 
| 109 | 
         
            -
                
         
     | 
| 110 | 
         
            -
                # Optimizer configuration
         
     | 
| 111 | 
         
            -
                optimizer="adamw",
         
     | 
| 112 | 
         
            -
                beta1=0.9,
         
     | 
| 113 | 
         
            -
                beta2=0.95,
         
     | 
| 114 | 
         
            -
                eps=1e-8,
         
     | 
| 115 | 
         
            -
                
         
     | 
| 116 | 
         
            -
                # Scheduler configuration
         
     | 
| 117 | 
         
            -
                scheduler="cosine",
         
     | 
| 118 | 
         
            -
                min_lr=1e-6,
         
     | 
| 119 | 
         
            -
                
         
     | 
| 120 | 
         
            -
                # Mixed precision
         
     | 
| 121 | 
         
            -
                fp16=True,
         
     | 
| 122 | 
         
            -
                bf16=False,
         
     | 
| 123 | 
         
            -
                
         
     | 
| 124 | 
         
            -
                # Logging and saving
         
     | 
| 125 | 
         
            -
                save_steps=$SAVE_STEPS,
         
     | 
| 126 | 
         
            -
                eval_steps=$EVAL_STEPS,
         
     | 
| 127 | 
         
            -
                logging_steps=$LOGGING_STEPS,
         
     | 
| 128 | 
         
            -
                save_total_limit=3,
         
     | 
| 129 | 
         
            -
                
         
     | 
| 130 | 
         
            -
                # Evaluation
         
     | 
| 131 | 
         
            -
                eval_strategy="steps",
         
     | 
| 132 | 
         
            -
                metric_for_best_model="eval_loss",
         
     | 
| 133 | 
         
            -
                greater_is_better=False,
         
     | 
| 134 | 
         
            -
                load_best_model_at_end=True,
         
     | 
| 135 | 
         
            -
                
         
     | 
| 136 | 
         
            -
                # Data configuration
         
     | 
| 137 | 
         
            -
                data_dir="smoltalk_dataset",
         
     | 
| 138 | 
         
            -
                train_file="train.json",
         
     | 
| 139 | 
         
            -
                validation_file="validation.json",
         
     | 
| 140 | 
         
            -
                
         
     | 
| 141 | 
         
            -
                # Chat template configuration
         
     | 
| 142 | 
         
            -
                use_chat_template=True,
         
     | 
| 143 | 
         
            -
                chat_template_kwargs={
         
     | 
| 144 | 
         
            -
                    "enable_thinking": False,
         
     | 
| 145 | 
         
            -
                    "add_generation_prompt": True
         
     | 
| 146 | 
         
            -
                },
         
     | 
| 147 | 
         
            -
                
         
     | 
| 148 | 
         
            -
                # Trackio monitoring configuration
         
     | 
| 149 | 
         
            -
                enable_tracking=True,
         
     | 
| 150 | 
         
            -
                trackio_url="$TRACKIO_URL",
         
     | 
| 151 | 
         
            -
                trackio_token=None,
         
     | 
| 152 | 
         
            -
                log_artifacts=True,
         
     | 
| 153 | 
         
            -
                log_metrics=True,
         
     | 
| 154 | 
         
            -
                log_config=True,
         
     | 
| 155 | 
         
            -
                experiment_name="$EXPERIMENT_NAME"
         
     | 
| 156 | 
         
            -
            )
         
     | 
| 157 | 
         
            -
            EOF
         
     | 
| 158 | 
         
            -
             
     | 
| 159 | 
         
            -
            # Step 9: Download and prepare dataset
         
     | 
| 160 | 
         
            -
            echo ""
         
     | 
| 161 | 
         
            -
            echo "📊 Step 9: Downloading and preparing dataset..."
         
     | 
| 162 | 
         
            -
            python -c "
         
     | 
| 163 | 
         
            -
            from datasets import load_dataset
         
     | 
| 164 | 
         
            -
            import json
         
     | 
| 165 | 
         
            -
            import os
         
     | 
| 166 | 
         
            -
             
     | 
| 167 | 
         
            -
            # Load SmolTalk dataset
         
     | 
| 168 | 
         
            -
            print('Loading SmolTalk dataset...')
         
     | 
| 169 | 
         
            -
            dataset = load_dataset('$DATASET_NAME')
         
     | 
| 170 | 
         
            -
             
     | 
| 171 | 
         
            -
            # Create dataset directory
         
     | 
| 172 | 
         
            -
            os.makedirs('smoltalk_dataset', exist_ok=True)
         
     | 
| 173 | 
         
            -
             
     | 
| 174 | 
         
            -
            # Convert to DPO format (preference pairs)
         
     | 
| 175 | 
         
            -
            def convert_to_dpo_format(example):
         
     | 
| 176 | 
         
            -
                # For SmolTalk, we'll create preference pairs based on response quality
         
     | 
| 177 | 
         
            -
                # This is a simplified example - you may need to adjust based on your needs
         
     | 
| 178 | 
         
            -
                return {
         
     | 
| 179 | 
         
            -
                    'prompt': example.get('prompt', ''),
         
     | 
| 180 | 
         
            -
                    'chosen': example.get('chosen', ''),
         
     | 
| 181 | 
         
            -
                    'rejected': example.get('rejected', '')
         
     | 
| 182 | 
         
            -
                }
         
     | 
| 183 | 
         
            -
             
     | 
| 184 | 
         
            -
            # Process train split
         
     | 
| 185 | 
         
            -
            train_data = []
         
     | 
| 186 | 
         
            -
            for example in dataset['train']:
         
     | 
| 187 | 
         
            -
                dpo_example = convert_to_dpo_format(example)
         
     | 
| 188 | 
         
            -
                if dpo_example['prompt'] and dpo_example['chosen'] and dpo_example['rejected']:
         
     | 
| 189 | 
         
            -
                    train_data.append(dpo_example)
         
     | 
| 190 | 
         
            -
             
     | 
| 191 | 
         
            -
            # Process validation split
         
     | 
| 192 | 
         
            -
            val_data = []
         
     | 
| 193 | 
         
            -
            for example in dataset['validation']:
         
     | 
| 194 | 
         
            -
                dpo_example = convert_to_dpo_format(example)
         
     | 
| 195 | 
         
            -
                if dpo_example['prompt'] and dpo_example['chosen'] and dpo_example['rejected']:
         
     | 
| 196 | 
         
            -
                    val_data.append(dpo_example)
         
     | 
| 197 | 
         
            -
             
     | 
| 198 | 
         
            -
            # Save to files
         
     | 
| 199 | 
         
            -
            with open('smoltalk_dataset/train.json', 'w') as f:
         
     | 
| 200 | 
         
            -
                json.dump(train_data, f, indent=2)
         
     | 
| 201 | 
         
            -
             
     | 
| 202 | 
         
            -
            with open('smoltalk_dataset/validation.json', 'w') as f:
         
     | 
| 203 | 
         
            -
                json.dump(val_data, f, indent=2)
         
     | 
| 204 | 
         
            -
             
     | 
| 205 | 
         
            -
            print(f'Dataset prepared: {len(train_data)} train samples, {len(val_data)} validation samples')
         
     | 
| 206 | 
         
            -
            "
         
     | 
| 207 | 
         
            -
             
     | 
| 208 | 
         
            -
            # Step 10: Calculate training steps based on epochs
         
     | 
| 209 | 
         
            -
            echo ""
         
     | 
| 210 | 
         
            -
            echo "📈 Step 10: Calculating training parameters..."
         
     | 
| 211 | 
         
            -
            TOTAL_SAMPLES=$(python -c "import json; data=json.load(open('smoltalk_dataset/train.json')); print(len(data))")
         
     | 
| 212 | 
         
            -
            EFFECTIVE_BATCH_SIZE=$((BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))
         
     | 
| 213 | 
         
            -
            STEPS_PER_EPOCH=$((TOTAL_SAMPLES / EFFECTIVE_BATCH_SIZE))
         
     | 
| 214 | 
         
            -
            MAX_STEPS=$((STEPS_PER_EPOCH * MAX_EPOCHS))
         
     | 
| 215 | 
         
            -
             
     | 
| 216 | 
         
            -
            echo "  Total samples: $TOTAL_SAMPLES"
         
     | 
| 217 | 
         
            -
            echo "  Effective batch size: $EFFECTIVE_BATCH_SIZE"
         
     | 
| 218 | 
         
            -
            echo "  Steps per epoch: $STEPS_PER_EPOCH"
         
     | 
| 219 | 
         
            -
            echo "  Total training steps: $MAX_STEPS"
         
     | 
| 220 | 
         
            -
             
     | 
| 221 | 
         
            -
            # Step 11: Start DPO training
         
     | 
| 222 | 
         
            -
            echo ""
         
     | 
| 223 | 
         
            -
            echo "🎯 Step 11: Starting DPO training..."
         
     | 
| 224 | 
         
            -
            python train.py config/train_smollm3_dpo_6epochs.py \
         
     | 
| 225 | 
         
            -
                --dataset_dir smoltalk_dataset \
         
     | 
| 226 | 
         
            -
                --out_dir /output-checkpoint \
         
     | 
| 227 | 
         
            -
                --init_from scratch \
         
     | 
| 228 | 
         
            -
                --max_iters $MAX_STEPS \
         
     | 
| 229 | 
         
            -
                --batch_size $BATCH_SIZE \
         
     | 
| 230 | 
         
            -
                --learning_rate $LEARNING_RATE \
         
     | 
| 231 | 
         
            -
                --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
         
     | 
| 232 | 
         
            -
                --max_seq_length $MAX_SEQ_LENGTH \
         
     | 
| 233 | 
         
            -
                --save_steps $SAVE_STEPS \
         
     | 
| 234 | 
         
            -
                --eval_steps $EVAL_STEPS \
         
     | 
| 235 | 
         
            -
                --logging_steps $LOGGING_STEPS \
         
     | 
| 236 | 
         
            -
                --enable_tracking \
         
     | 
| 237 | 
         
            -
                --trackio_url "$TRACKIO_URL" \
         
     | 
| 238 | 
         
            -
                --experiment_name "$EXPERIMENT_NAME"
         
     | 
| 239 | 
         
            -
             
     | 
| 240 | 
         
            -
            # Step 12: Push model to Hugging Face Hub
         
     | 
| 241 | 
         
            -
            echo ""
         
     | 
| 242 | 
         
            -
            echo "📤 Step 12: Pushing model to Hugging Face Hub..."
         
     | 
| 243 | 
         
            -
            python push_to_huggingface.py /output-checkpoint "$REPO_NAME" \
         
     | 
| 244 | 
         
            -
                --token "$HF_TOKEN" \
         
     | 
| 245 | 
         
            -
                --trackio-url "$TRACKIO_URL" \
         
     | 
| 246 | 
         
            -
                --experiment-name "$EXPERIMENT_NAME"
         
     | 
| 247 | 
         
            -
             
     | 
| 248 | 
         
            -
            # Step 13: Test the uploaded model
         
     | 
| 249 | 
         
            -
            echo ""
         
     | 
| 250 | 
         
            -
            echo "🧪 Step 13: Testing uploaded model..."
         
     | 
| 251 | 
         
            -
            python -c "
         
     | 
| 252 | 
         
            -
            from transformers import AutoModelForCausalLM, AutoTokenizer
         
     | 
| 253 | 
         
            -
            import torch
         
     | 
| 254 | 
         
            -
             
     | 
| 255 | 
         
            -
            print('Loading uploaded model...')
         
     | 
| 256 | 
         
            -
            model = AutoModelForCausalLM.from_pretrained('$REPO_NAME', torch_dtype=torch.float16, device_map='auto')
         
     | 
| 257 | 
         
            -
            tokenizer = AutoTokenizer.from_pretrained('$REPO_NAME')
         
     | 
| 258 | 
         
            -
             
     | 
| 259 | 
         
            -
            print('Testing model generation...')
         
     | 
| 260 | 
         
            -
            prompt = 'Hello, how are you?'
         
     | 
| 261 | 
         
            -
            inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
         
     | 
| 262 | 
         
            -
            outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)
         
     | 
| 263 | 
         
            -
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         
     | 
| 264 | 
         
            -
            print(f'Prompt: {prompt}')
         
     | 
| 265 | 
         
            -
            print(f'Response: {response}')
         
     | 
| 266 | 
         
            -
            print('✅ Model test completed successfully!')
         
     | 
| 267 | 
         
            -
            "
         
     | 
| 268 | 
         
            -
             
     | 
| 269 | 
         
            -
            echo ""
         
     | 
| 270 | 
         
            -
            echo "🎉 Deployment completed successfully!"
         
     | 
| 271 | 
         
            -
            echo "====================================="
         
     | 
| 272 | 
         
            -
            echo "📊 Model: https://huggingface.co/$REPO_NAME"
         
     | 
| 273 | 
         
            -
            echo "📈 Trackio: $TRACKIO_URL"
         
     | 
| 274 | 
         
            -
            echo "📋 Experiment: $EXPERIMENT_NAME"
         
     | 
| 275 | 
         
            -
            echo ""
         
     | 
| 276 | 
         
            -
            echo "Next steps:"
         
     | 
| 277 | 
         
            -
            echo "1. Monitor training progress in your Trackio Space"
         
     | 
| 278 | 
         
            -
            echo "2. Check the model repository on Hugging Face Hub"
         
     | 
| 279 | 
         
            -
            echo "3. Use the model in your applications" 
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
    	
        config/train_smollm3.py
    CHANGED
    
    | 
         @@ -76,6 +76,10 @@ class SmolLM3Config: 
     | 
|
| 76 | 
         
             
                log_metrics: bool = True
         
     | 
| 77 | 
         
             
                log_config: bool = True
         
     | 
| 78 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 79 | 
         | 
| 80 | 
         
             
                def __post_init__(self):
         
     | 
| 81 | 
         
             
                    if self.chat_template_kwargs is None:
         
     | 
| 
         | 
|
| 76 | 
         
             
                log_metrics: bool = True
         
     | 
| 77 | 
         
             
                log_config: bool = True
         
     | 
| 78 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 79 | 
         
            +
                # HF Datasets configuration
         
     | 
| 80 | 
         
            +
                hf_token: Optional[str] = None
         
     | 
| 81 | 
         
            +
                dataset_repo: Optional[str] = None
         
     | 
| 82 | 
         
            +
             
     | 
| 83 | 
         | 
| 84 | 
         
             
                def __post_init__(self):
         
     | 
| 85 | 
         
             
                    if self.chat_template_kwargs is None:
         
     | 
    	
        config/train_smollm3_h100_lightweight.py
    ADDED
    
    | 
         @@ -0,0 +1,112 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            """
         
     | 
| 2 | 
         
            +
            SmolLM3 H100 Lightweight Training Configuration
         
     | 
| 3 | 
         
            +
            Optimized for rapid training on H100 with 80K Hermes-FR samples
         
     | 
| 4 | 
         
            +
            """
         
     | 
| 5 | 
         
            +
             
     | 
| 6 | 
         
            +
            from config.train_smollm3 import SmolLM3Config
         
     | 
| 7 | 
         
            +
             
     | 
| 8 | 
         
            +
            config = SmolLM3Config(
         
     | 
| 9 | 
         
            +
                # Model configuration
         
     | 
| 10 | 
         
            +
                model_name="HuggingFaceTB/SmolLM3-3B",
         
     | 
| 11 | 
         
            +
                max_seq_length=8192,
         
     | 
| 12 | 
         
            +
                use_flash_attention=True,
         
     | 
| 13 | 
         
            +
                use_gradient_checkpointing=True,
         
     | 
| 14 | 
         
            +
                
         
     | 
| 15 | 
         
            +
                # Training configuration - Optimized for H100
         
     | 
| 16 | 
         
            +
                batch_size=16,  # Larger batch size for H100
         
     | 
| 17 | 
         
            +
                gradient_accumulation_steps=4,  # Reduced for faster updates
         
     | 
| 18 | 
         
            +
                learning_rate=8e-6,  # Slightly higher for rapid convergence
         
     | 
| 19 | 
         
            +
                weight_decay=0.01,
         
     | 
| 20 | 
         
            +
                warmup_steps=50,  # Reduced warmup for rapid training
         
     | 
| 21 | 
         
            +
                max_iters=None,  # Will be calculated based on epochs
         
     | 
| 22 | 
         
            +
                eval_interval=50,  # More frequent evaluation
         
     | 
| 23 | 
         
            +
                log_interval=5,  # More frequent logging
         
     | 
| 24 | 
         
            +
                save_interval=200,  # More frequent saving
         
     | 
| 25 | 
         
            +
                
         
     | 
| 26 | 
         
            +
                # Optimizer configuration - Optimized for rapid training
         
     | 
| 27 | 
         
            +
                optimizer="adamw",
         
     | 
| 28 | 
         
            +
                beta1=0.9,
         
     | 
| 29 | 
         
            +
                beta2=0.95,
         
     | 
| 30 | 
         
            +
                eps=1e-8,
         
     | 
| 31 | 
         
            +
                
         
     | 
| 32 | 
         
            +
                # Scheduler configuration - Faster learning
         
     | 
| 33 | 
         
            +
                scheduler="cosine",
         
     | 
| 34 | 
         
            +
                min_lr=2e-6,  # Higher minimum LR
         
     | 
| 35 | 
         
            +
                
         
     | 
| 36 | 
         
            +
                # Mixed precision - FP16 enabled (BF16 disabled)
         
     | 
| 37 | 
         
            +
                fp16=True,
         
     | 
| 38 | 
         
            +
                bf16=False,
         
     | 
| 39 | 
         
            +
                
         
     | 
| 40 | 
         
            +
                # Logging and saving - More frequent for rapid training
         
     | 
| 41 | 
         
            +
                save_steps=200,
         
     | 
| 42 | 
         
            +
                eval_steps=50,
         
     | 
| 43 | 
         
            +
                logging_steps=5,
         
     | 
| 44 | 
         
            +
                save_total_limit=2,  # Keep fewer checkpoints
         
     | 
| 45 | 
         
            +
                
         
     | 
| 46 | 
         
            +
                # Evaluation
         
     | 
| 47 | 
         
            +
                eval_strategy="steps",
         
     | 
| 48 | 
         
            +
                metric_for_best_model="eval_loss",
         
     | 
| 49 | 
         
            +
                greater_is_better=False,
         
     | 
| 50 | 
         
            +
                load_best_model_at_end=True,
         
     | 
| 51 | 
         
            +
                
         
     | 
| 52 | 
         
            +
                # Data configuration - Hermes-FR with sampling
         
     | 
| 53 | 
         
            +
                dataset_name="legmlai/openhermes-fr",
         
     | 
| 54 | 
         
            +
                dataset_split="train",
         
     | 
| 55 | 
         
            +
                input_field="prompt",
         
     | 
| 56 | 
         
            +
                target_field="completion",
         
     | 
| 57 | 
         
            +
                filter_bad_entries=False,
         
     | 
| 58 | 
         
            +
                bad_entry_field="bad_entry",
         
     | 
| 59 | 
         
            +
                
         
     | 
| 60 | 
         
            +
                # Chat template configuration
         
     | 
| 61 | 
         
            +
                use_chat_template=True,
         
     | 
| 62 | 
         
            +
                chat_template_kwargs={
         
     | 
| 63 | 
         
            +
                    "enable_thinking": False,
         
     | 
| 64 | 
         
            +
                    "add_generation_prompt": True,
         
     | 
| 65 | 
         
            +
                    "no_think_system_message": True
         
     | 
| 66 | 
         
            +
                },
         
     | 
| 67 | 
         
            +
                
         
     | 
| 68 | 
         
            +
                # Trackio monitoring configuration
         
     | 
| 69 | 
         
            +
                enable_tracking=True,
         
     | 
| 70 | 
         
            +
                trackio_url=None,  # Will be set by launch script
         
     | 
| 71 | 
         
            +
                trackio_token=None,
         
     | 
| 72 | 
         
            +
                log_artifacts=True,
         
     | 
| 73 | 
         
            +
                log_metrics=True,
         
     | 
| 74 | 
         
            +
                log_config=True,
         
     | 
| 75 | 
         
            +
                experiment_name=None,  # Will be set by launch script
         
     | 
| 76 | 
         
            +
                
         
     | 
| 77 | 
         
            +
                # HF Datasets configuration
         
     | 
| 78 | 
         
            +
                dataset_repo=None,  # Will be set by launch script
         
     | 
| 79 | 
         
            +
                
         
     | 
| 80 | 
         
            +
                # H100-specific optimizations
         
     | 
| 81 | 
         
            +
                dataloader_num_workers=4,  # Optimized for H100
         
     | 
| 82 | 
         
            +
                dataloader_pin_memory=True,
         
     | 
| 83 | 
         
            +
                gradient_clipping=1.0,  # Prevent gradient explosion
         
     | 
| 84 | 
         
            +
                
         
     | 
| 85 | 
         
            +
                # Gradient clipping and learning-rate schedule for rapid training
         
     | 
| 86 | 
         
            +
                max_grad_norm=1.0,
         
     | 
| 87 | 
         
            +
                warmup_ratio=0.1,  # 10% warmup
         
     | 
| 88 | 
         
            +
                lr_scheduler_type="cosine",
         
     | 
| 89 | 
         
            +
                
         
     | 
| 90 | 
         
            +
                # Early stopping for rapid training
         
     | 
| 91 | 
         
            +
                early_stopping_patience=3,
         
     | 
| 92 | 
         
            +
                early_stopping_threshold=0.001,
         
     | 
| 93 | 
         
            +
                
         
     | 
| 94 | 
         
            +
                # H100-specific training optimizations
         
     | 
| 95 | 
         
            +
                remove_unused_columns=False,
         
     | 
| 96 | 
         
            +
                group_by_length=True,  # Group similar length sequences
         
     | 
| 97 | 
         
            +
                length_column_name="length",
         
     | 
| 98 | 
         
            +
                ignore_data_skip=False,
         
     | 
| 99 | 
         
            +
                
         
     | 
| 100 | 
         
            +
                # Reporting
         
     | 
| 101 | 
         
            +
                report_to=["tensorboard"],
         
     | 
| 102 | 
         
            +
                run_name="smollm3-h100-lightweight",
         
     | 
| 103 | 
         
            +
                
         
     | 
| 104 | 
         
            +
                # Seed for reproducibility
         
     | 
| 105 | 
         
            +
                seed=42,
         
     | 
| 106 | 
         
            +
                
         
     | 
| 107 | 
         
            +
                # Data collator settings
         
     | 
| 108 | 
         
            +
                data_collator_kwargs={
         
     | 
| 109 | 
         
            +
                    "pad_to_multiple_of": 8,  # Optimized for H100
         
     | 
| 110 | 
         
            +
                    "return_tensors": "pt"
         
     | 
| 111 | 
         
            +
                }
         
     | 
| 112 | 
         
            +
            ) 
         
     | 
    	
        config/train_smollm3_openhermes_fr.py
    CHANGED
    
    | 
         @@ -85,6 +85,10 @@ class SmolLM3ConfigOpenHermesFR(SmolLM3Config): 
     | 
|
| 85 | 
         
             
                log_metrics: bool = True
         
     | 
| 86 | 
         
             
                log_config: bool = True
         
     | 
| 87 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 88 | 
         | 
| 89 | 
         
             
                def __post_init__(self):
         
     | 
| 90 | 
         
             
                    if self.chat_template_kwargs is None:
         
     | 
| 
         | 
|
| 85 | 
         
             
                log_metrics: bool = True
         
     | 
| 86 | 
         
             
                log_config: bool = True
         
     | 
| 87 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 88 | 
         
            +
                # HF Datasets configuration
         
     | 
| 89 | 
         
            +
                hf_token: Optional[str] = None
         
     | 
| 90 | 
         
            +
                dataset_repo: Optional[str] = None
         
     | 
| 91 | 
         
            +
             
     | 
| 92 | 
         | 
| 93 | 
         
             
                def __post_init__(self):
         
     | 
| 94 | 
         
             
                    if self.chat_template_kwargs is None:
         
     | 
    	
        config/train_smollm3_openhermes_fr_a100_balanced.py
    CHANGED
    
    | 
         @@ -91,6 +91,10 @@ class SmolLM3ConfigOpenHermesFRBalanced(SmolLM3Config): 
     | 
|
| 91 | 
         
             
                log_metrics: bool = True
         
     | 
| 92 | 
         
             
                log_config: bool = True
         
     | 
| 93 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 94 | 
         | 
| 95 | 
         
             
                # Additional A100 optimizations for balanced performance
         
     | 
| 96 | 
         
             
                dataloader_num_workers: int = 10  # More workers for faster data loading
         
     | 
| 
         | 
|
| 91 | 
         
             
                log_metrics: bool = True
         
     | 
| 92 | 
         
             
                log_config: bool = True
         
     | 
| 93 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 94 | 
         
            +
                # HF Datasets configuration
         
     | 
| 95 | 
         
            +
                hf_token: Optional[str] = None
         
     | 
| 96 | 
         
            +
                dataset_repo: Optional[str] = None
         
     | 
| 97 | 
         
            +
             
     | 
| 98 | 
         | 
| 99 | 
         
             
                # Additional A100 optimizations for balanced performance
         
     | 
| 100 | 
         
             
                dataloader_num_workers: int = 10  # More workers for faster data loading
         
     | 
    	
        config/train_smollm3_openhermes_fr_a100_large.py
    CHANGED
    
    | 
         @@ -85,6 +85,10 @@ class SmolLM3ConfigOpenHermesFRA100Large(SmolLM3Config): 
     | 
|
| 85 | 
         
             
                log_metrics: bool = True
         
     | 
| 86 | 
         
             
                log_config: bool = True
         
     | 
| 87 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 88 | 
         | 
| 89 | 
         
             
                # Additional A100 optimizations
         
     | 
| 90 | 
         
             
                dataloader_num_workers: int = 8  # More workers for faster data loading
         
     | 
| 
         | 
|
| 85 | 
         
             
                log_metrics: bool = True
         
     | 
| 86 | 
         
             
                log_config: bool = True
         
     | 
| 87 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 88 | 
         
            +
                # HF Datasets configuration
         
     | 
| 89 | 
         
            +
                hf_token: Optional[str] = None
         
     | 
| 90 | 
         
            +
                dataset_repo: Optional[str] = None
         
     | 
| 91 | 
         
            +
             
     | 
| 92 | 
         | 
| 93 | 
         
             
                # Additional A100 optimizations
         
     | 
| 94 | 
         
             
                dataloader_num_workers: int = 8  # More workers for faster data loading
         
     | 
    	
        config/train_smollm3_openhermes_fr_a100_max_performance.py
    CHANGED
    
    | 
         @@ -85,6 +85,10 @@ class SmolLM3ConfigOpenHermesFRMaxPerformance(SmolLM3Config): 
     | 
|
| 85 | 
         
             
                log_metrics: bool = True
         
     | 
| 86 | 
         
             
                log_config: bool = True
         
     | 
| 87 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 88 | 
         | 
| 89 | 
         
             
                # Additional A100 optimizations for maximum performance
         
     | 
| 90 | 
         
             
                dataloader_num_workers: int = 12  # More workers for faster data loading
         
     | 
| 
         | 
|
| 85 | 
         
             
                log_metrics: bool = True
         
     | 
| 86 | 
         
             
                log_config: bool = True
         
     | 
| 87 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 88 | 
         
            +
                # HF Datasets configuration
         
     | 
| 89 | 
         
            +
                hf_token: Optional[str] = None
         
     | 
| 90 | 
         
            +
                dataset_repo: Optional[str] = None
         
     | 
| 91 | 
         
            +
             
     | 
| 92 | 
         | 
| 93 | 
         
             
                # Additional A100 optimizations for maximum performance
         
     | 
| 94 | 
         
             
                dataloader_num_workers: int = 12  # More workers for faster data loading
         
     | 
    	
        config/train_smollm3_openhermes_fr_a100_multiple_passes.py
    CHANGED
    
    | 
         @@ -85,6 +85,10 @@ class SmolLM3ConfigOpenHermesFRMultiplePasses(SmolLM3Config): 
     | 
|
| 85 | 
         
             
                log_metrics: bool = True
         
     | 
| 86 | 
         
             
                log_config: bool = True
         
     | 
| 87 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 88 | 
         | 
| 89 | 
         
             
                # Additional A100 optimizations
         
     | 
| 90 | 
         
             
                dataloader_num_workers: int = 8  # More workers for faster data loading
         
     | 
| 
         | 
|
| 85 | 
         
             
                log_metrics: bool = True
         
     | 
| 86 | 
         
             
                log_config: bool = True
         
     | 
| 87 | 
         
             
                experiment_name: Optional[str] = None
         
     | 
| 88 | 
         
            +
                # HF Datasets configuration
         
     | 
| 89 | 
         
            +
                hf_token: Optional[str] = None
         
     | 
| 90 | 
         
            +
                dataset_repo: Optional[str] = None
         
     | 
| 91 | 
         
            +
             
     | 
| 92 | 
         | 
| 93 | 
         
             
                # Additional A100 optimizations
         
     | 
| 94 | 
         
             
                dataloader_num_workers: int = 8  # More workers for faster data loading
         
     | 
    	
        A100_LARGE_SCALE_GUIDE.md → docs/A100_LARGE_SCALE_GUIDE.md
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        docs/APP_CONFIGURATION_GUIDE.md
    ADDED
    
    | 
         @@ -0,0 +1,234 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # ⚙️ App Configuration Guide
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            ## Overview
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            The Trackio app now includes a **Configuration tab** that allows you to set your Hugging Face token and dataset repository directly through the interface, providing an alternative to environment variables.
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ## 🚀 New Features
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            ### **Configuration Tab**
         
     | 
| 10 | 
         
            +
            - ✅ **HF Token Input**: Secure password field for your Hugging Face token
         
     | 
| 11 | 
         
            +
            - ✅ **Dataset Repository Input**: Text field for your dataset repository
         
     | 
| 12 | 
         
            +
            - ✅ **Update Configuration**: Apply new settings and reload experiments
         
     | 
| 13 | 
         
            +
            - ✅ **Test Connection**: Verify access to the dataset repository
         
     | 
| 14 | 
         
            +
            - ✅ **Create Dataset**: Create a new dataset repository if it doesn't exist
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            ### **Flexible Configuration**
         
     | 
| 17 | 
         
            +
            - ✅ **Environment Variables**: Still supported as fallback
         
     | 
| 18 | 
         
            +
            - ✅ **Interface Input**: New direct input method
         
     | 
| 19 | 
         
            +
            - ✅ **Dynamic Updates**: Change configuration without restarting
         
     | 
| 20 | 
         
            +
            - ✅ **Validation**: Input validation and error handling
         
     | 
| 21 | 
         
            +
             
     | 
| 22 | 
         
            +
            ## 📋 Configuration Tab Usage
         
     | 
| 23 | 
         
            +
             
     | 
| 24 | 
         
            +
            ### **1. Access the Configuration Tab**
         
     | 
| 25 | 
         
            +
            - Open the Trackio app
         
     | 
| 26 | 
         
            +
            - Click on the "⚙️ Configuration" tab
         
     | 
| 27 | 
         
            +
            - You'll see input fields for HF Token and Dataset Repository
         
     | 
| 28 | 
         
            +
             
     | 
| 29 | 
         
            +
            ### **2. Set Your HF Token**
         
     | 
| 30 | 
         
            +
            ```
         
     | 
| 31 | 
         
            +
            Hugging Face Token: hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
         
     | 
| 32 | 
         
            +
            ```
         
     | 
| 33 | 
         
            +
            - **Type**: Password field (hidden for security)
         
     | 
| 34 | 
         
            +
            - **Required**: Yes (for dataset access)
         
     | 
| 35 | 
         
            +
            - **Format**: Your HF token starting with `hf_`
         
     | 
| 36 | 
         
            +
            - **Help**: Click the help text for instructions on getting your token
         
     | 
| 37 | 
         
            +
             
     | 
| 38 | 
         
            +
            ### **3. Set Your Dataset Repository**
         
     | 
| 39 | 
         
            +
            ```
         
     | 
| 40 | 
         
            +
            Dataset Repository: your-username/your-dataset-name
         
     | 
| 41 | 
         
            +
            ```
         
     | 
| 42 | 
         
            +
            - **Type**: Text field
         
     | 
| 43 | 
         
            +
            - **Required**: No (defaults to `tonic/trackio-experiments`)
         
     | 
| 44 | 
         
            +
            - **Format**: `username/dataset-name`
         
     | 
| 45 | 
         
            +
            - **Examples**: 
         
     | 
| 46 | 
         
            +
              - `tonic/trackio-experiments`
         
     | 
| 47 | 
         
            +
              - `your-username/my-experiments`
         
     | 
| 48 | 
         
            +
              - `your-org/team-experiments`
         
     | 
| 49 | 
         
            +
             
     | 
| 50 | 
         
            +
            ### **4. Use the Action Buttons**
         
     | 
| 51 | 
         
            +
             
     | 
| 52 | 
         
            +
            #### **Update Configuration**
         
     | 
| 53 | 
         
            +
            - Applies new settings immediately
         
     | 
| 54 | 
         
            +
            - Reloads experiments with new configuration
         
     | 
| 55 | 
         
            +
            - Shows current status and experiment count
         
     | 
| 56 | 
         
            +
             
     | 
| 57 | 
         
            +
            #### **Test Connection**
         
     | 
| 58 | 
         
            +
            - Verifies access to the dataset repository
         
     | 
| 59 | 
         
            +
            - Tests HF token permissions
         
     | 
| 60 | 
         
            +
            - Shows dataset information and experiment count
         
     | 
| 61 | 
         
            +
             
     | 
| 62 | 
         
            +
            #### **Create Dataset**
         
     | 
| 63 | 
         
            +
            - Creates a new dataset repository if it doesn't exist
         
     | 
| 64 | 
         
            +
            - Sets up the correct schema for experiments
         
     | 
| 65 | 
         
            +
            - Makes the dataset private by default
         
     | 
| 66 | 
         
            +
             
     | 
| 67 | 
         
            +
            ## 🔧 Configuration Methods
         
     | 
| 68 | 
         
            +
             
     | 
| 69 | 
         
            +
            ### **Method 1: Interface Input (New)**
         
     | 
| 70 | 
         
            +
            1. Go to "⚙️ Configuration" tab
         
     | 
| 71 | 
         
            +
            2. Enter your HF token and dataset repository
         
     | 
| 72 | 
         
            +
            3. Click "Update Configuration"
         
     | 
| 73 | 
         
            +
            4. Verify with "Test Connection"
         
     | 
| 74 | 
         
            +
             
     | 
| 75 | 
         
            +
            ### **Method 2: Environment Variables (Existing)**
         
     | 
| 76 | 
         
            +
            ```bash
         
     | 
| 77 | 
         
            +
            # Set environment variables
         
     | 
| 78 | 
         
            +
            export HF_TOKEN=your_hf_token_here
         
     | 
| 79 | 
         
            +
            export TRACKIO_DATASET_REPO=your-username/your-dataset-name
         
     | 
| 80 | 
         
            +
             
     | 
| 81 | 
         
            +
            # Or for HF Spaces, add to Space settings
         
     | 
| 82 | 
         
            +
            HF_TOKEN=your_hf_token_here
         
     | 
| 83 | 
         
            +
            TRACKIO_DATASET_REPO=your-username/your-dataset-name
         
     | 
| 84 | 
         
            +
            ```
         
     | 
| 85 | 
         
            +
             
     | 
| 86 | 
         
            +
            ### **Method 3: Hybrid Approach**
         
     | 
| 87 | 
         
            +
            - Set environment variables as defaults
         
     | 
| 88 | 
         
            +
            - Override specific values through the interface
         
     | 
| 89 | 
         
            +
            - Interface values take precedence over environment variables
         
     | 
| 90 | 
         
            +
             
     | 
| 91 | 
         
            +
            ## 📊 Configuration Priority
         
     | 
| 92 | 
         
            +
             
     | 
| 93 | 
         
            +
            The app uses this priority order for configuration:
         
     | 
| 94 | 
         
            +
             
     | 
| 95 | 
         
            +
            1. **Interface Input** (highest priority)
         
     | 
| 96 | 
         
            +
            2. **Environment Variables** (fallback)
         
     | 
| 97 | 
         
            +
            3. **Default Values** (lowest priority)
         
     | 
| 98 | 
         
            +
             
     | 
| 99 | 
         
            +
            ## 🛠️ Getting Your HF Token
         
     | 
| 100 | 
         
            +
             
     | 
| 101 | 
         
            +
            ### **Step-by-Step Instructions**
         
     | 
| 102 | 
         
            +
            1. Go to [Hugging Face Settings](https://huggingface.co/settings/tokens)
         
     | 
| 103 | 
         
            +
            2. Click "New token"
         
     | 
| 104 | 
         
            +
            3. Give it a name (e.g., "Trackio Access")
         
     | 
| 105 | 
         
            +
            4. Select "Write" permissions
         
     | 
| 106 | 
         
            +
            5. Click "Generate token"
         
     | 
| 107 | 
         
            +
            6. Copy the token (starts with `hf_`)
         
     | 
| 108 | 
         
            +
            7. Paste it in the app's HF Token field
         
     | 
| 109 | 
         
            +
             
     | 
| 110 | 
         
            +
            ### **Token Permissions**
         
     | 
| 111 | 
         
            +
            - **Read**: Required for loading experiments
         
     | 
| 112 | 
         
            +
            - **Write**: Required for saving experiments
         
     | 
| 113 | 
         
            +
            - **Scope**: Should have access to your dataset repositories
         
     | 
| 114 | 
         
            +
             
     | 
| 115 | 
         
            +
            ## 📁 Dataset Repository Format
         
     | 
| 116 | 
         
            +
             
     | 
| 117 | 
         
            +
            ### **Correct Format**
         
     | 
| 118 | 
         
            +
            ```
         
     | 
| 119 | 
         
            +
            username/dataset-name
         
     | 
| 120 | 
         
            +
            ```
         
     | 
| 121 | 
         
            +
             
     | 
| 122 | 
         
            +
            ### **Examples**
         
     | 
| 123 | 
         
            +
            - `tonic/trackio-experiments` (default)
         
     | 
| 124 | 
         
            +
            - `your-username/my-experiments`
         
     | 
| 125 | 
         
            +
            - `your-org/team-experiments`
         
     | 
| 126 | 
         
            +
            - `your-username/smollm3-experiments`
         
     | 
| 127 | 
         
            +
             
     | 
| 128 | 
         
            +
            ### **Validation**
         
     | 
| 129 | 
         
            +
            - Must contain exactly one `/`
         
     | 
| 130 | 
         
            +
            - Username must be a valid HF username
         
     | 
| 131 | 
         
            +
            - Dataset name must be valid (alphanumeric + hyphens)
         
     | 
| 132 | 
         
            +
             
     | 
| 133 | 
         
            +
            ## 🔍 Testing Your Configuration
         
     | 
| 134 | 
         
            +
             
     | 
| 135 | 
         
            +
            ### **1. Test Connection**
         
     | 
| 136 | 
         
            +
            - Enter your HF token and dataset repository
         
     | 
| 137 | 
         
            +
            - Click "Test Connection"
         
     | 
| 138 | 
         
            +
            - Should show: "✅ Connection successful!"
         
     | 
| 139 | 
         
            +
             
     | 
| 140 | 
         
            +
            ### **2. Create Dataset (if needed)**
         
     | 
| 141 | 
         
            +
            - If the dataset doesn't exist, click "Create Dataset"
         
     | 
| 142 | 
         
            +
            - Should show: "✅ Dataset created successfully!"
         
     | 
| 143 | 
         
            +
             
     | 
| 144 | 
         
            +
            ### **3. Update Configuration**
         
     | 
| 145 | 
         
            +
            - Click "Update Configuration"
         
     | 
| 146 | 
         
            +
            - Should show: "✅ Configuration updated successfully!"
         
     | 
| 147 | 
         
            +
             
     | 
| 148 | 
         
            +
            ## 🚨 Troubleshooting
         
     | 
| 149 | 
         
            +
             
     | 
| 150 | 
         
            +
            ### **Issue: "Please provide a Hugging Face token"**
         
     | 
| 151 | 
         
            +
            **Solution**: 
         
     | 
| 152 | 
         
            +
            - Enter your HF token in the interface
         
     | 
| 153 | 
         
            +
            - Or set the `HF_TOKEN` environment variable
         
     | 
| 154 | 
         
            +
             
     | 
| 155 | 
         
            +
            ### **Issue: "Connection failed: 401 Unauthorized"**
         
     | 
| 156 | 
         
            +
            **Solutions**:
         
     | 
| 157 | 
         
            +
            1. Check that your HF token is correct
         
     | 
| 158 | 
         
            +
            2. Verify the token has read access to the dataset
         
     | 
| 159 | 
         
            +
            3. Ensure the dataset repository exists
         
     | 
| 160 | 
         
            +
             
     | 
| 161 | 
         
            +
            ### **Issue: "Failed to create dataset"**
         
     | 
| 162 | 
         
            +
            **Solutions**:
         
     | 
| 163 | 
         
            +
            1. Check that your HF token has write permissions
         
     | 
| 164 | 
         
            +
            2. Verify the username in the repository name
         
     | 
| 165 | 
         
            +
            3. Ensure the dataset name is valid
         
     | 
| 166 | 
         
            +
             
     | 
| 167 | 
         
            +
            ### **Issue: "Dataset repository must be in format: username/dataset-name"**
         
     | 
| 168 | 
         
            +
            **Solution**: 
         
     | 
| 169 | 
         
            +
            - Use the correct format: `username/dataset-name`
         
     | 
| 170 | 
         
            +
            - Example: `your-username/my-experiments`
         
     | 
| 171 | 
         
            +
             
     | 
| 172 | 
         
            +
            ## 📈 Benefits
         
     | 
| 173 | 
         
            +
             
     | 
| 174 | 
         
            +
            ### **For Users**
         
     | 
| 175 | 
         
            +
            - ✅ **Easy Setup**: No need to set environment variables
         
     | 
| 176 | 
         
            +
            - ✅ **Visual Interface**: Clear input fields and validation
         
     | 
| 177 | 
         
            +
            - ✅ **Immediate Feedback**: Test connection and see results
         
     | 
| 178 | 
         
            +
            - ✅ **Flexible**: Can change configuration anytime
         
     | 
| 179 | 
         
            +
             
     | 
| 180 | 
         
            +
            ### **For Development**
         
     | 
| 181 | 
         
            +
            - ✅ **Backward Compatible**: Environment variables still work
         
     | 
| 182 | 
         
            +
            - ✅ **Fallback Support**: Graceful degradation
         
     | 
| 183 | 
         
            +
            - ✅ **Error Handling**: Clear error messages
         
     | 
| 184 | 
         
            +
            - ✅ **Validation**: Input validation and testing
         
     | 
| 185 | 
         
            +
             
     | 
| 186 | 
         
            +
            ### **For Deployment**
         
     | 
| 187 | 
         
            +
            - ✅ **HF Spaces Ready**: Works on Hugging Face Spaces
         
     | 
| 188 | 
         
            +
            - ✅ **No Restart Required**: Dynamic configuration updates
         
     | 
| 189 | 
         
            +
            - ✅ **Secure**: Password field for token input
         
     | 
| 190 | 
         
            +
            - ✅ **User-Friendly**: Clear instructions and help text
         
     | 
| 191 | 
         
            +
             
     | 
| 192 | 
         
            +
            ## 🎯 Usage Examples
         
     | 
| 193 | 
         
            +
             
     | 
| 194 | 
         
            +
            ### **Basic Setup**
         
     | 
| 195 | 
         
            +
            1. Open the app
         
     | 
| 196 | 
         
            +
            2. Go to "⚙️ Configuration" tab
         
     | 
| 197 | 
         
            +
            3. Enter your HF token
         
     | 
| 198 | 
         
            +
            4. Enter your dataset repository
         
     | 
| 199 | 
         
            +
            5. Click "Update Configuration"
         
     | 
| 200 | 
         
            +
            6. Click "Test Connection" to verify
         
     | 
| 201 | 
         
            +
             
     | 
| 202 | 
         
            +
            ### **Advanced Setup**
         
     | 
| 203 | 
         
            +
            1. Set environment variables as defaults
         
     | 
| 204 | 
         
            +
            2. Use interface to override specific values
         
     | 
| 205 | 
         
            +
            3. Test connection to verify access
         
     | 
| 206 | 
         
            +
            4. Create the dataset if it doesn't exist
         
     | 
| 207 | 
         
            +
            5. Start using the app with persistent storage
         
     | 
| 208 | 
         
            +
             
     | 
| 209 | 
         
            +
            ### **Team Setup**
         
     | 
| 210 | 
         
            +
            1. Create a shared dataset repository
         
     | 
| 211 | 
         
            +
            2. Share the repository name with the team
         
     | 
| 212 | 
         
            +
            3. Each team member sets their own HF token
         
     | 
| 213 | 
         
            +
            4. All experiments are stored in the shared dataset
         
     | 
| 214 | 
         
            +
             
     | 
| 215 | 
         
            +
            ## 📋 Configuration Status
         
     | 
| 216 | 
         
            +
             
     | 
| 217 | 
         
            +
            The app shows current configuration status:
         
     | 
| 218 | 
         
            +
            ```
         
     | 
| 219 | 
         
            +
            📊 Dataset: your-username/your-dataset
         
     | 
| 220 | 
         
            +
            🔑 HF Token: Set
         
     | 
| 221 | 
         
            +
            📈 Experiments: 5
         
     | 
| 222 | 
         
            +
            ```
         
     | 
| 223 | 
         
            +
             
     | 
| 224 | 
         
            +
            ## 🔄 Updating Configuration
         
     | 
| 225 | 
         
            +
             
     | 
| 226 | 
         
            +
            You can update configuration at any time:
         
     | 
| 227 | 
         
            +
            1. Go to "⚙️ Configuration" tab
         
     | 
| 228 | 
         
            +
            2. Change HF token or dataset repository
         
     | 
| 229 | 
         
            +
            3. Click "Update Configuration"
         
     | 
| 230 | 
         
            +
            4. Experiments will reload with new settings
         
     | 
| 231 | 
         
            +
             
     | 
| 232 | 
         
            +
            ---
         
     | 
| 233 | 
         
            +
             
     | 
| 234 | 
         
            +
            **🎉 Your Trackio app is now more flexible and user-friendly with direct configuration input!** 
         
     | 
    	
        CLOUD_DEPLOYMENT_GUIDE.md → docs/CLOUD_DEPLOYMENT_GUIDE.md
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        CLOUD_TRAINING_GUIDE.md → docs/CLOUD_TRAINING_GUIDE.md
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        DEPLOYMENT_GUIDE.md → docs/DEPLOYMENT_GUIDE.md
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        docs/ENVIRONMENT_VARIABLES.md
    ADDED
    
    | 
         @@ -0,0 +1,113 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # 🔧 Trackio Environment Variables Reference
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            ## Quick Setup
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            Set these environment variables in your Hugging Face Space:
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ```bash
         
     | 
| 8 | 
         
            +
            # Required: Your HF token for dataset access
         
     | 
| 9 | 
         
            +
            HF_TOKEN=your_hf_token_here
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            # Optional: Dataset repository to use (defaults to tonic/trackio-experiments)
         
     | 
| 12 | 
         
            +
            TRACKIO_DATASET_REPO=your-username/your-dataset-name
         
     | 
| 13 | 
         
            +
            ```
         
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
            ## Environment Variables
         
     | 
| 16 | 
         
            +
             
     | 
| 17 | 
         
            +
            | Variable | Required | Default | Description |
         
     | 
| 18 | 
         
            +
            |----------|----------|---------|-------------|
         
     | 
| 19 | 
         
            +
            | `HF_TOKEN` | ✅ Yes | None | Your Hugging Face token for dataset access |
         
     | 
| 20 | 
         
            +
            | `TRACKIO_DATASET_REPO` | ❌ No | `tonic/trackio-experiments` | Dataset repository to load experiments from |
         
     | 
| 21 | 
         
            +
            | `SPACE_ID` | 🔄 Auto | None | HF Space ID (automatically detected) |
         
     | 
| 22 | 
         
            +
             
     | 
| 23 | 
         
            +
            ## Configuration Examples
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
            ### 1. Default Setup
         
     | 
| 26 | 
         
            +
            ```bash
         
     | 
| 27 | 
         
            +
            HF_TOKEN=your_token_here
         
     | 
| 28 | 
         
            +
            # Uses: tonic/trackio-experiments
         
     | 
| 29 | 
         
            +
            ```
         
     | 
| 30 | 
         
            +
             
     | 
| 31 | 
         
            +
            ### 2. Personal Dataset
         
     | 
| 32 | 
         
            +
            ```bash
         
     | 
| 33 | 
         
            +
            HF_TOKEN=your_token_here
         
     | 
| 34 | 
         
            +
            TRACKIO_DATASET_REPO=your-username/trackio-experiments
         
     | 
| 35 | 
         
            +
            ```
         
     | 
| 36 | 
         
            +
             
     | 
| 37 | 
         
            +
            ### 3. Team Dataset
         
     | 
| 38 | 
         
            +
            ```bash
         
     | 
| 39 | 
         
            +
            HF_TOKEN=your_token_here
         
     | 
| 40 | 
         
            +
            TRACKIO_DATASET_REPO=your-org/team-experiments
         
     | 
| 41 | 
         
            +
            ```
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
            ### 4. Project-Specific Dataset
         
     | 
| 44 | 
         
            +
            ```bash
         
     | 
| 45 | 
         
            +
            HF_TOKEN=your_token_here
         
     | 
| 46 | 
         
            +
            TRACKIO_DATASET_REPO=your-username/smollm3-experiments
         
     | 
| 47 | 
         
            +
            ```
         
     | 
| 48 | 
         
            +
             
     | 
| 49 | 
         
            +
            ## How to Set in HF Spaces
         
     | 
| 50 | 
         
            +
             
     | 
| 51 | 
         
            +
            1. Go to your Hugging Face Space settings
         
     | 
| 52 | 
         
            +
            2. In "Settings", open the "Variables and secrets" section
         
     | 
| 53 | 
         
            +
            3. Add the variables:
         
     | 
| 54 | 
         
            +
               - `HF_TOKEN`: Your HF token (add it as a secret so it is not publicly visible)
         
     | 
| 55 | 
         
            +
               - `TRACKIO_DATASET_REPO`: Your dataset repository (optional)
         
     | 
| 56 | 
         
            +
             
     | 
| 57 | 
         
            +
            ## Testing Configuration
         
     | 
| 58 | 
         
            +
             
     | 
| 59 | 
         
            +
            Run the configuration script to check your setup:
         
     | 
| 60 | 
         
            +
             
     | 
| 61 | 
         
            +
            ```bash
         
     | 
| 62 | 
         
            +
            python configure_trackio.py
         
     | 
| 63 | 
         
            +
            ```
         
     | 
| 64 | 
         
            +
             
     | 
| 65 | 
         
            +
            This will:
         
     | 
| 66 | 
         
            +
            - ✅ Show current environment variables
         
     | 
| 67 | 
         
            +
            - 🧪 Test dataset access
         
     | 
| 68 | 
         
            +
            - 📊 Display experiment count
         
     | 
| 69 | 
         
            +
            - 💾 Generate configuration file
         
     | 
| 70 | 
         
            +
             
     | 
| 71 | 
         
            +
            ## Getting Your HF Token
         
     | 
| 72 | 
         
            +
             
     | 
| 73 | 
         
            +
            1. Go to [Hugging Face Settings](https://huggingface.co/settings/tokens)
         
     | 
| 74 | 
         
            +
            2. Click "New token"
         
     | 
| 75 | 
         
            +
            3. Give it a name (e.g., "Trackio Access")
         
     | 
| 76 | 
         
            +
            4. Select "Write" permissions
         
     | 
| 77 | 
         
            +
            5. Copy the token and set it as `HF_TOKEN`
         
     | 
| 78 | 
         
            +
             
     | 
| 79 | 
         
            +
            ## Dataset Repository Format
         
     | 
| 80 | 
         
            +
             
     | 
| 81 | 
         
            +
            The `TRACKIO_DATASET_REPO` should follow this format:
         
     | 
| 82 | 
         
            +
            ```
         
     | 
| 83 | 
         
            +
            username/dataset-name
         
     | 
| 84 | 
         
            +
            ```
         
     | 
| 85 | 
         
            +
             
     | 
| 86 | 
         
            +
            Examples:
         
     | 
| 87 | 
         
            +
            - `tonic/trackio-experiments`
         
     | 
| 88 | 
         
            +
            - `your-username/my-experiments`
         
     | 
| 89 | 
         
            +
            - `your-org/team-experiments`
         
     | 
| 90 | 
         
            +
             
     | 
| 91 | 
         
            +
            ## Troubleshooting
         
     | 
| 92 | 
         
            +
             
     | 
| 93 | 
         
            +
            ### Issue: "HF_TOKEN not found"
         
     | 
| 94 | 
         
            +
            **Solution**: Set your HF token in the Space environment variables
         
     | 
| 95 | 
         
            +
             
     | 
| 96 | 
         
            +
            ### Issue: "Failed to load dataset"
         
     | 
| 97 | 
         
            +
            **Solutions**:
         
     | 
| 98 | 
         
            +
            1. Check your token has read access to the dataset
         
     | 
| 99 | 
         
            +
            2. Verify the dataset repository exists
         
     | 
| 100 | 
         
            +
            3. If loading still fails, the app automatically falls back to backup data
         
     | 
| 101 | 
         
            +
             
     | 
| 102 | 
         
            +
            ### Issue: "Failed to save experiments"
         
     | 
| 103 | 
         
            +
            **Solutions**:
         
     | 
| 104 | 
         
            +
            1. Check your token has write permissions
         
     | 
| 105 | 
         
            +
            2. Verify the dataset repository exists
         
     | 
| 106 | 
         
            +
            3. Check network connectivity
         
     | 
| 107 | 
         
            +
             
     | 
| 108 | 
         
            +
            ## Security Notes
         
     | 
| 109 | 
         
            +
             
     | 
| 110 | 
         
            +
            - 🔒 Dataset is private by default
         
     | 
| 111 | 
         
            +
            - 🔑 Only accessible with your HF_TOKEN
         
     | 
| 112 | 
         
            +
            - 🛡️ No sensitive data exposed publicly
         
     | 
| 113 | 
         
            +
            - 🔐 Secure storage on HF infrastructure 
         
     | 
    	
        docs/HF_DATASETS_GUIDE.md
    ADDED
    
    | 
         @@ -0,0 +1,269 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # 🚀 Trackio with Hugging Face Datasets - Complete Guide
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            ## Overview
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            This guide explains how to use Hugging Face Datasets for persistent storage of Trackio experiments, providing reliable data persistence across Hugging Face Spaces deployments.
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ## 🏗️ Architecture
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            ### Why HF Datasets?
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            1. **Persistent Storage**: Data survives Space restarts and redeployments
         
     | 
| 12 | 
         
            +
            2. **Version Control**: Automatic versioning of experiment data
         
     | 
| 13 | 
         
            +
            3. **Access Control**: Private datasets for security
         
     | 
| 14 | 
         
            +
            4. **Reliability**: HF's infrastructure ensures data availability
         
     | 
| 15 | 
         
            +
            5. **Scalability**: Handles large amounts of experiment data
         
     | 
| 16 | 
         
            +
             
     | 
| 17 | 
         
            +
            ### Data Flow
         
     | 
| 18 | 
         
            +
             
     | 
| 19 | 
         
            +
            ```
         
     | 
| 20 | 
         
            +
            Training Script → Trackio App → HF Dataset → Trackio App → Plots
         
     | 
| 21 | 
         
            +
            ```
         
     | 
| 22 | 
         
            +
             
     | 
| 23 | 
         
            +
            ## 🚀 Setup Instructions
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
            ### 1. Create HF Token
         
     | 
| 26 | 
         
            +
             
     | 
| 27 | 
         
            +
            1. Go to [Hugging Face Settings](https://huggingface.co/settings/tokens)
         
     | 
| 28 | 
         
            +
            2. Create a new token with `write` permissions
         
     | 
| 29 | 
         
            +
            3. Copy the token for use in your Space
         
     | 
| 30 | 
         
            +
             
     | 
| 31 | 
         
            +
            ### 2. Set Up Dataset Repository
         
     | 
| 32 | 
         
            +
             
     | 
| 33 | 
         
            +
            ```bash
         
     | 
| 34 | 
         
            +
            # Run the setup script
         
     | 
| 35 | 
         
            +
            python setup_hf_dataset.py
         
     | 
| 36 | 
         
            +
            ```
         
     | 
| 37 | 
         
            +
             
     | 
| 38 | 
         
            +
            This will:
         
     | 
| 39 | 
         
            +
            - Create a private dataset: `tonic/trackio-experiments`
         
     | 
| 40 | 
         
            +
            - Add your existing experiments
         
     | 
| 41 | 
         
            +
            - Configure the dataset for Trackio
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
            ### 3. Configure Hugging Face Space
         
     | 
| 44 | 
         
            +
             
     | 
| 45 | 
         
            +
            #### Environment Variables
         
     | 
| 46 | 
         
            +
            Set these in your HF Space settings:
         
     | 
| 47 | 
         
            +
            ```bash
         
     | 
| 48 | 
         
            +
            HF_TOKEN=your_hf_token_here
         
     | 
| 49 | 
         
            +
            TRACKIO_DATASET_REPO=your-username/your-dataset-name
         
     | 
| 50 | 
         
            +
            ```
         
     | 
| 51 | 
         
            +
             
     | 
| 52 | 
         
            +
            **Environment Variables Explained:**
         
     | 
| 53 | 
         
            +
            - `HF_TOKEN`: Your Hugging Face token (required for dataset access)
         
     | 
| 54 | 
         
            +
            - `TRACKIO_DATASET_REPO`: Dataset repository to use (optional, defaults to `tonic/trackio-experiments`)
         
     | 
| 55 | 
         
            +
             
     | 
| 56 | 
         
            +
            **Example Configurations:**
         
     | 
| 57 | 
         
            +
            ```bash
         
     | 
| 58 | 
         
            +
            # Use default dataset
         
     | 
| 59 | 
         
            +
            HF_TOKEN=your_token_here
         
     | 
| 60 | 
         
            +
             
     | 
| 61 | 
         
            +
            # Use personal dataset
         
     | 
| 62 | 
         
            +
            HF_TOKEN=your_token_here
         
     | 
| 63 | 
         
            +
            TRACKIO_DATASET_REPO=your-username/trackio-experiments
         
     | 
| 64 | 
         
            +
             
     | 
| 65 | 
         
            +
            # Use team dataset
         
     | 
| 66 | 
         
            +
            HF_TOKEN=your_token_here
         
     | 
| 67 | 
         
            +
            TRACKIO_DATASET_REPO=your-org/team-experiments
         
     | 
| 68 | 
         
            +
             
     | 
| 69 | 
         
            +
            # Use project-specific dataset
         
     | 
| 70 | 
         
            +
            HF_TOKEN=your_token_here
         
     | 
| 71 | 
         
            +
            TRACKIO_DATASET_REPO=your-username/smollm3-experiments
         
     | 
| 72 | 
         
            +
            ```
         
     | 
| 73 | 
         
            +
             
     | 
| 74 | 
         
            +
            #### Requirements
         
     | 
| 75 | 
         
            +
            Update your `requirements.txt`:
         
     | 
| 76 | 
         
            +
            ```txt
         
     | 
| 77 | 
         
            +
            gradio>=4.0.0
         
     | 
| 78 | 
         
            +
            plotly>=5.0.0
         
     | 
| 79 | 
         
            +
            pandas>=1.5.0
         
     | 
| 80 | 
         
            +
            numpy>=1.24.0
         
     | 
| 81 | 
         
            +
            datasets>=2.14.0
         
     | 
| 82 | 
         
            +
            huggingface-hub>=0.16.0
         
     | 
| 83 | 
         
            +
            requests>=2.31.0
         
     | 
| 84 | 
         
            +
            ```
         
     | 
| 85 | 
         
            +
             
     | 
| 86 | 
         
            +
            ### 4. Deploy Updated App
         
     | 
| 87 | 
         
            +
             
     | 
| 88 | 
         
            +
            The updated `app.py` now:
         
     | 
| 89 | 
         
            +
            - Loads experiments from HF Dataset
         
     | 
| 90 | 
         
            +
            - Saves new experiments to the dataset
         
     | 
| 91 | 
         
            +
            - Falls back to backup data if the dataset is unavailable
         
     | 
| 92 | 
         
            +
            - Provides better error handling
         
     | 
| 93 | 
         
            +
             
     | 
| 94 | 
         
            +
            ### 5. Configure Environment Variables
         
     | 
| 95 | 
         
            +
             
     | 
| 96 | 
         
            +
            Use the configuration script to check your setup:
         
     | 
| 97 | 
         
            +
             
     | 
| 98 | 
         
            +
            ```bash
         
     | 
| 99 | 
         
            +
            python configure_trackio.py
         
     | 
| 100 | 
         
            +
            ```
         
     | 
| 101 | 
         
            +
             
     | 
| 102 | 
         
            +
            This script will:
         
     | 
| 103 | 
         
            +
            - Show current environment variables
         
     | 
| 104 | 
         
            +
            - Test dataset access
         
     | 
| 105 | 
         
            +
            - Generate configuration file
         
     | 
| 106 | 
         
            +
            - Provide usage examples
         
     | 
| 107 | 
         
            +
             
     | 
| 108 | 
         
            +
            **Available Environment Variables:**
         
     | 
| 109 | 
         
            +
             
     | 
| 110 | 
         
            +
            | Variable | Required | Default | Description |
         
     | 
| 111 | 
         
            +
            |----------|----------|---------|-------------|
         
     | 
| 112 | 
         
            +
            | `HF_TOKEN` | Yes | None | Your Hugging Face token |
         
     | 
| 113 | 
         
            +
            | `TRACKIO_DATASET_REPO` | No | `tonic/trackio-experiments` | Dataset repository to use |
         
     | 
| 114 | 
         
            +
            | `SPACE_ID` | Auto | None | HF Space ID (auto-detected) |
         
     | 
| 115 | 
         
            +
             
     | 
| 116 | 
         
            +
            ## 📊 Dataset Schema
         
     | 
| 117 | 
         
            +
             
     | 
| 118 | 
         
            +
            The HF Dataset contains these columns:
         
     | 
| 119 | 
         
            +
             
     | 
| 120 | 
         
            +
            | Column | Type | Description |
         
     | 
| 121 | 
         
            +
            |--------|------|-------------|
         
     | 
| 122 | 
         
            +
            | `experiment_id` | string | Unique experiment identifier |
         
     | 
| 123 | 
         
            +
            | `name` | string | Experiment name |
         
     | 
| 124 | 
         
            +
            | `description` | string | Experiment description |
         
     | 
| 125 | 
         
            +
            | `created_at` | string | ISO timestamp |
         
     | 
| 126 | 
         
            +
            | `status` | string | running/completed/failed |
         
     | 
| 127 | 
         
            +
            | `metrics` | string | JSON array of metric entries |
         
     | 
| 128 | 
         
            +
            | `parameters` | string | JSON object of experiment parameters |
         
     | 
| 129 | 
         
            +
            | `artifacts` | string | JSON array of artifacts |
         
     | 
| 130 | 
         
            +
            | `logs` | string | JSON array of log entries |
         
     | 
| 131 | 
         
            +
            | `last_updated` | string | ISO timestamp of last update |
         
     | 
| 132 | 
         
            +
             
     | 
| 133 | 
         
            +
            ## 🔧 Technical Details
         
     | 
| 134 | 
         
            +
             
     | 
| 135 | 
         
            +
            ### Loading Experiments
         
     | 
| 136 | 
         
            +
             
     | 
| 137 | 
         
            +
            ```python
         
     | 
| 138 | 
         
            +
            from datasets import load_dataset
         
     | 
| 139 | 
         
            +
             
     | 
| 140 | 
         
            +
            # Load from HF Dataset
         
     | 
| 141 | 
         
            +
            dataset = load_dataset("tonic/trackio-experiments", token=HF_TOKEN)
         
     | 
| 142 | 
         
            +
             
     | 
| 143 | 
         
            +
            # Convert to experiments dict
         
     | 
| 144 | 
         
            +
            for row in dataset['train']:
         
     | 
| 145 | 
         
            +
                experiment = {
         
     | 
| 146 | 
         
            +
                    'id': row['experiment_id'],
         
     | 
| 147 | 
         
            +
                    'metrics': json.loads(row['metrics']),
         
     | 
| 148 | 
         
            +
                    'parameters': json.loads(row['parameters']),
         
     | 
| 149 | 
         
            +
                    # ... other fields
         
     | 
| 150 | 
         
            +
                }
         
     | 
| 151 | 
         
            +
            ```
         
     | 
| 152 | 
         
            +
             
     | 
| 153 | 
         
            +
            ### Saving Experiments
         
     | 
| 154 | 
         
            +
             
     | 
| 155 | 
         
            +
            ```python
         
     | 
| 156 | 
         
            +
            from datasets import Dataset
         
     | 
| 157 | 
         
            +
            from huggingface_hub import HfApi
         
     | 
| 158 | 
         
            +
             
     | 
| 159 | 
         
            +
            # Convert experiments to dataset format
         
     | 
| 160 | 
         
            +
            dataset_data = []
         
     | 
| 161 | 
         
            +
            for exp_id, exp_data in experiments.items():
         
     | 
| 162 | 
         
            +
                dataset_data.append({
         
     | 
| 163 | 
         
            +
                    'experiment_id': exp_id,
         
     | 
| 164 | 
         
            +
                    'metrics': json.dumps(exp_data['metrics']),
         
     | 
| 165 | 
         
            +
                    'parameters': json.dumps(exp_data['parameters']),
         
     | 
| 166 | 
         
            +
                    # ... other fields
         
     | 
| 167 | 
         
            +
                })
         
     | 
| 168 | 
         
            +
             
     | 
| 169 | 
         
            +
            # Push to HF Hub
         
     | 
| 170 | 
         
            +
            dataset = Dataset.from_list(dataset_data)
         
     | 
| 171 | 
         
            +
            dataset.push_to_hub("tonic/trackio-experiments", token=HF_TOKEN, private=True)
         
     | 
| 172 | 
         
            +
            ```
         
     | 
| 173 | 
         
            +
             
     | 
| 174 | 
         
            +
            ## 📈 Your Current Experiments
         
     | 
| 175 | 
         
            +
             
     | 
| 176 | 
         
            +
            ### Available Experiments
         
     | 
| 177 | 
         
            +
             
     | 
| 178 | 
         
            +
            1. **`exp_20250720_130853`** (petite-elle-l-aime-3)
         
     | 
| 179 | 
         
            +
               - 4 metric entries (steps 25, 50, 75, 100)
         
     | 
| 180 | 
         
            +
               - Loss decreasing: 1.1659 → 1.1528
         
     | 
| 181 | 
         
            +
               - Good convergence pattern
         
     | 
| 182 | 
         
            +
             
     | 
| 183 | 
         
            +
            2. **`exp_20250720_134319`** (petite-elle-l-aime-3-1)
         
     | 
| 184 | 
         
            +
               - 2 metric entries (step 25)
         
     | 
| 185 | 
         
            +
               - Loss: 1.166
         
     | 
| 186 | 
         
            +
               - GPU memory tracking
         
     | 
| 187 | 
         
            +
             
     | 
| 188 | 
         
            +
            ### Metrics Available for Plotting
         
     | 
| 189 | 
         
            +
             
     | 
| 190 | 
         
            +
            - `loss` - Training loss curve
         
     | 
| 191 | 
         
            +
            - `learning_rate` - Learning rate schedule
         
     | 
| 192 | 
         
            +
            - `mean_token_accuracy` - Token-level accuracy
         
     | 
| 193 | 
         
            +
            - `grad_norm` - Gradient norm
         
     | 
| 194 | 
         
            +
            - `num_tokens` - Tokens processed
         
     | 
| 195 | 
         
            +
            - `epoch` - Training epoch
         
     | 
| 196 | 
         
            +
            - `gpu_0_memory_allocated` - GPU memory usage
         
     | 
| 197 | 
         
            +
            - `cpu_percent` - CPU usage
         
     | 
| 198 | 
         
            +
            - `memory_percent` - System memory
         
     | 
| 199 | 
         
            +
             
     | 
| 200 | 
         
            +
            ## 🎯 Usage Instructions
         
     | 
| 201 | 
         
            +
             
     | 
| 202 | 
         
            +
            ### 1. View Experiments
         
     | 
| 203 | 
         
            +
            - Go to "View Experiments" tab
         
     | 
| 204 | 
         
            +
            - Enter experiment ID: `exp_20250720_130853` or `exp_20250720_134319`
         
     | 
| 205 | 
         
            +
            - Click "View Experiment"
         
     | 
| 206 | 
         
            +
             
     | 
| 207 | 
         
            +
            ### 2. Create Plots
         
     | 
| 208 | 
         
            +
            - Go to "Visualizations" tab
         
     | 
| 209 | 
         
            +
            - Enter experiment ID
         
     | 
| 210 | 
         
            +
            - Select metric to plot
         
     | 
| 211 | 
         
            +
            - Click "Create Plot"
         
     | 
| 212 | 
         
            +
             
     | 
| 213 | 
         
            +
            ### 3. Compare Experiments
         
     | 
| 214 | 
         
            +
            - Use "Experiment Comparison" feature
         
     | 
| 215 | 
         
            +
            - Enter: `exp_20250720_130853,exp_20250720_134319`
         
     | 
| 216 | 
         
            +
            - Compare loss curves
         
     | 
| 217 | 
         
            +
             
     | 
| 218 | 
         
            +
            ## 🔍 Troubleshooting
         
     | 
| 219 | 
         
            +
             
     | 
| 220 | 
         
            +
            ### Issue: "No metrics data available"
         
     | 
| 221 | 
         
            +
            **Solutions**:
         
     | 
| 222 | 
         
            +
            1. Check HF_TOKEN is set correctly
         
     | 
| 223 | 
         
            +
            2. Verify dataset repository exists
         
     | 
| 224 | 
         
            +
            3. Check network connectivity to HF Hub
         
     | 
| 225 | 
         
            +
             
     | 
| 226 | 
         
            +
            ### Issue: "Failed to load from dataset"
         
     | 
| 227 | 
         
            +
            **Solutions**:
         
     | 
| 228 | 
         
            +
            1. App falls back to backup data automatically
         
     | 
| 229 | 
         
            +
            2. Check dataset permissions
         
     | 
| 230 | 
         
            +
            3. Verify token has read access
         
     | 
| 231 | 
         
            +
             
     | 
| 232 | 
         
            +
            ### Issue: "Failed to save experiments"
         
     | 
| 233 | 
         
            +
            **Solutions**:
         
     | 
| 234 | 
         
            +
            1. Check token has write permissions
         
     | 
| 235 | 
         
            +
            2. Verify dataset repository exists
         
     | 
| 236 | 
         
            +
            3. Check network connectivity
         
     | 
| 237 | 
         
            +
             
     | 
| 238 | 
         
            +
            ## 🚀 Benefits of This Approach
         
     | 
| 239 | 
         
            +
             
     | 
| 240 | 
         
            +
            ### ✅ Advantages
         
     | 
| 241 | 
         
            +
            - **Persistent**: Data survives Space restarts
         
     | 
| 242 | 
         
            +
            - **Reliable**: HF's infrastructure ensures availability
         
     | 
| 243 | 
         
            +
            - **Secure**: Private datasets protect your data
         
     | 
| 244 | 
         
            +
            - **Scalable**: Handles large amounts of experiment data
         
     | 
| 245 | 
         
            +
            - **Versioned**: Automatic versioning of experiment data
         
     | 
| 246 | 
         
            +
             
     | 
| 247 | 
         
            +
            ### 🔄 Fallback Strategy
         
     | 
| 248 | 
         
            +
            1. **Primary**: Load from HF Dataset
         
     | 
| 249 | 
         
            +
            2. **Secondary**: Use backup data (your existing experiments)
         
     | 
| 250 | 
         
            +
            3. **Tertiary**: Create new experiments locally
         
     | 
| 251 | 
         
            +
             
     | 
| 252 | 
         
            +
            ## 📋 Next Steps
         
     | 
| 253 | 
         
            +
             
     | 
| 254 | 
         
            +
            1. **Set HF_TOKEN**: Add your token to Space environment
         
     | 
| 255 | 
         
            +
            2. **Run Setup**: Execute `setup_hf_dataset.py`
         
     | 
| 256 | 
         
            +
            3. **Deploy App**: Push updated `app.py` to your Space
         
     | 
| 257 | 
         
            +
            4. **Test Plots**: Verify experiments load and plots work
         
     | 
| 258 | 
         
            +
            5. **Monitor Training**: New experiments will be saved to dataset
         
     | 
| 259 | 
         
            +
             
     | 
| 260 | 
         
            +
            ## 🔐 Security Notes
         
     | 
| 261 | 
         
            +
             
     | 
| 262 | 
         
            +
            - Dataset is **private** by default
         
     | 
| 263 | 
         
            +
            - Only accessible with your HF_TOKEN
         
     | 
| 264 | 
         
            +
            - Experiment data is stored securely on HF infrastructure
         
     | 
| 265 | 
         
            +
            - No sensitive data is exposed publicly
         
     | 
| 266 | 
         
            +
             
     | 
| 267 | 
         
            +
            ---
         
     | 
| 268 | 
         
            +
             
     | 
| 269 | 
         
            +
            **Your experiments are now configured for reliable persistence using Hugging Face Datasets!** 🎉 
         
     | 
    	
        docs/HF_SPACES_GUIDE.md
    ADDED
    
    | 
         @@ -0,0 +1,163 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # 🚀 Trackio on Hugging Face Spaces - Complete Guide
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            ## Overview
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            This guide explains how to properly deploy and use Trackio on Hugging Face Spaces, addressing the unique challenges of ephemeral storage and data persistence.
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ## 🏗️ Hugging Face Spaces Architecture
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            ### Key Challenges
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            1. **Ephemeral Storage**: File system gets reset between deployments
         
     | 
| 12 | 
         
            +
            2. **No Persistent Storage**: Files written during runtime don't persist
         
     | 
| 13 | 
         
            +
            3. **Multiple Instances**: Training and monitoring might run in different environments
         
     | 
| 14 | 
         
            +
            4. **Limited File System**: Restricted write permissions in certain directories
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            ### How Trackio Handles HF Spaces
         
     | 
| 17 | 
         
            +
             
     | 
| 18 | 
         
            +
            The updated Trackio app now includes:
         
     | 
| 19 | 
         
            +
             
     | 
| 20 | 
         
            +
            - **Automatic HF Spaces Detection**: Detects when running on HF Spaces
         
     | 
| 21 | 
         
            +
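            - **Persistent Path Selection**: Uses `/tmp/` as a reliably writable location (note that `/tmp/` is still cleared when the Space restarts, so it is not truly persistent)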
         
     | 
| 22 | 
         
            +
            - **Backup Recovery**: Automatically recovers experiments from backup data
         
     | 
| 23 | 
         
            +
            - **Fallback Storage**: Multiple storage locations for redundancy
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
            ## 📊 Your Current Experiments
         
     | 
| 26 | 
         
            +
             
     | 
| 27 | 
         
            +
            Based on your logs, you have these experiments available:
         
     | 
| 28 | 
         
            +
             
     | 
| 29 | 
         
            +
            ### Experiment 1: `exp_20250720_130853`
         
     | 
| 30 | 
         
            +
            - **Name**: petite-elle-l-aime-3
         
     | 
| 31 | 
         
            +
            - **Status**: Running
         
     | 
| 32 | 
         
            +
            - **Metrics**: 4 entries (steps 25, 50, 75, 100)
         
     | 
| 33 | 
         
            +
            - **Key Metrics**: Loss decreasing from 1.1659 to 1.1528
         
     | 
| 34 | 
         
            +
             
     | 
| 35 | 
         
            +
            ### Experiment 2: `exp_20250720_134319`
         
     | 
| 36 | 
         
            +
            - **Name**: petite-elle-l-aime-3-1
         
     | 
| 37 | 
         
            +
            - **Status**: Running
         
     | 
| 38 | 
         
            +
            - **Metrics**: 2 entries (step 25)
         
     | 
| 39 | 
         
            +
            - **Key Metrics**: Loss 1.166, GPU memory usage
         
     | 
| 40 | 
         
            +
             
     | 
| 41 | 
         
            +
            ## 🎯 How to Use Your Experiments
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
            ### 1. View Experiments
         
     | 
| 44 | 
         
            +
            - Go to the "View Experiments" tab
         
     | 
| 45 | 
         
            +
            - Enter experiment ID: `exp_20250720_130853` or `exp_20250720_134319`
         
     | 
| 46 | 
         
            +
            - Click "View Experiment" to see details
         
     | 
| 47 | 
         
            +
             
     | 
| 48 | 
         
            +
            ### 2. Create Plots
         
     | 
| 49 | 
         
            +
            - Go to the "Visualizations" tab
         
     | 
| 50 | 
         
            +
            - Enter experiment ID
         
     | 
| 51 | 
         
            +
            - Select metric to plot:
         
     | 
| 52 | 
         
            +
              - `loss` - Training loss curve
         
     | 
| 53 | 
         
            +
              - `learning_rate` - Learning rate schedule
         
     | 
| 54 | 
         
            +
              - `mean_token_accuracy` - Token accuracy
         
     | 
| 55 | 
         
            +
              - `grad_norm` - Gradient norm
         
     | 
| 56 | 
         
            +
              - `gpu_0_memory_allocated` - GPU memory usage
         
     | 
| 57 | 
         
            +
             
     | 
| 58 | 
         
            +
            ### 3. Compare Experiments
         
     | 
| 59 | 
         
            +
            - Use the "Experiment Comparison" feature
         
     | 
| 60 | 
         
            +
            - Enter: `exp_20250720_130853,exp_20250720_134319`
         
     | 
| 61 | 
         
            +
            - Compare loss curves between experiments
         
     | 
| 62 | 
         
            +
             
     | 
| 63 | 
         
            +
            ## 🔧 Technical Details
         
     | 
| 64 | 
         
            +
             
     | 
| 65 | 
         
            +
            ### Data Persistence Strategy
         
     | 
| 66 | 
         
            +
             
     | 
| 67 | 
         
            +
            ```python
         
     | 
| 68 | 
         
            +
            # HF Spaces detection
         
     | 
| 69 | 
         
            +
            if os.environ.get('SPACE_ID'):
         
     | 
| 70 | 
         
            +
                data_file = "/tmp/trackio_experiments.json"
         
     | 
| 71 | 
         
            +
            else:
         
     | 
| 72 | 
         
            +
                data_file = "trackio_experiments.json"
         
     | 
| 73 | 
         
            +
            ```
         
     | 
| 74 | 
         
            +
             
     | 
| 75 | 
         
            +
            ### Backup Recovery
         
     | 
| 76 | 
         
            +
             
     | 
| 77 | 
         
            +
            The app automatically recovers your experiments from backup data when:
         
     | 
| 78 | 
         
            +
            - Running on HF Spaces
         
     | 
| 79 | 
         
            +
            - No existing experiments found
         
     | 
| 80 | 
         
            +
            - Data file is missing or empty
         
     | 
| 81 | 
         
            +
             
     | 
| 82 | 
         
            +
            ### Storage Locations
         
     | 
| 83 | 
         
            +
             
     | 
| 84 | 
         
            +
            1. **Primary**: `/tmp/trackio_experiments.json`
         
     | 
| 85 | 
         
            +
            2. **Backup**: `/tmp/trackio_backup.json`
         
     | 
| 86 | 
         
            +
            3. **Fallback**: Local directory (for development)
         
     | 
| 87 | 
         
            +
             
     | 
| 88 | 
         
            +
            ## 🚀 Deployment Best Practices
         
     | 
| 89 | 
         
            +
             
     | 
| 90 | 
         
            +
            ### 1. Environment Variables
         
     | 
| 91 | 
         
            +
            ```bash
         
     | 
| 92 | 
         
            +
            # Set in HF Spaces environment
         
     | 
| 93 | 
         
            +
            SPACE_ID=your-space-id
         
     | 
| 94 | 
         
            +
            TRACKIO_URL=https://your-space.hf.space
         
     | 
| 95 | 
         
            +
            ```
         
     | 
| 96 | 
         
            +
             
     | 
| 97 | 
         
            +
            ### 2. File Structure
         
     | 
| 98 | 
         
            +
            ```
         
     | 
| 99 | 
         
            +
            your-space/
         
     | 
| 100 | 
         
            +
            ├── app.py                 # Main Trackio app
         
     | 
| 101 | 
         
            +
            ├── requirements.txt       # Dependencies
         
     | 
| 102 | 
         
            +
            ├── README.md             # Space description
         
     | 
| 103 | 
         
            +
            └── .gitignore           # Ignore temporary files
         
     | 
| 104 | 
         
            +
            ```
         
     | 
| 105 | 
         
            +
             
     | 
| 106 | 
         
            +
            ### 3. Requirements
         
     | 
| 107 | 
         
            +
            ```txt
         
     | 
| 108 | 
         
            +
            gradio>=4.0.0
         
     | 
| 109 | 
         
            +
            plotly>=5.0.0
         
     | 
| 110 | 
         
            +
            pandas>=1.5.0
         
     | 
| 111 | 
         
            +
            numpy>=1.24.0
         
     | 
| 112 | 
         
            +
            ```
         
     | 
| 113 | 
         
            +
             
     | 
| 114 | 
         
            +
            ## 📈 Monitoring Your Training
         
     | 
| 115 | 
         
            +
             
     | 
| 116 | 
         
            +
            ### Real-time Metrics
         
     | 
| 117 | 
         
            +
            Your experiments show:
         
     | 
| 118 | 
         
            +
            - **Loss**: Decreasing from 1.1659 to 1.1528 (good convergence)
         
     | 
| 119 | 
         
            +
            - **Learning Rate**: Properly scheduled from 7e-08 to 2.8875e-07
         
     | 
| 120 | 
         
            +
            - **Token Accuracy**: Around 75-76% (reasonable for early training)
         
     | 
| 121 | 
         
            +
            - **GPU Memory**: ~17GB allocated, 75GB reserved
         
     | 
| 122 | 
         
            +
             
     | 
| 123 | 
         
            +
            ### Expected Behavior
         
     | 
| 124 | 
         
            +
            - Loss should continue decreasing
         
     | 
| 125 | 
         
            +
            - Learning rate will follow a cosine schedule
         
     | 
| 126 | 
         
            +
            - Token accuracy should improve over time
         
     | 
| 127 | 
         
            +
            - GPU memory usage should remain stable
         
     | 
| 128 | 
         
            +
             
     | 
| 129 | 
         
            +
            ## 🔍 Troubleshooting
         
     | 
| 130 | 
         
            +
             
     | 
| 131 | 
         
            +
            ### Issue: "No metrics data available"
         
     | 
| 132 | 
         
            +
            **Solution**: The app now automatically recovers experiments from backup
         
     | 
| 133 | 
         
            +
             
     | 
| 134 | 
         
            +
            ### Issue: Plots not showing
         
     | 
| 135 | 
         
            +
            **Solution**: 
         
     | 
| 136 | 
         
            +
            1. Check that the experiment ID is correct
         
     | 
| 137 | 
         
            +
            2. Try different metrics (loss, learning_rate, etc.)
         
     | 
| 138 | 
         
            +
            3. Refresh the page
         
     | 
| 139 | 
         
            +
             
     | 
| 140 | 
         
            +
            ### Issue: Data not persisting
         
     | 
| 141 | 
         
            +
            **Solution**: 
         
     | 
| 142 | 
         
            +
            1. App now writes to `/tmp/`, which is always writable on HF Spaces (data there survives within a running session but not across restarts)
         
     | 
| 143 | 
         
            +
            2. Backup recovery ensures data availability
         
     | 
| 144 | 
         
            +
            3. Multiple storage locations provide redundancy
         
     | 
| 145 | 
         
            +
             
     | 
| 146 | 
         
            +
            ## 🎯 Next Steps
         
     | 
| 147 | 
         
            +
             
     | 
| 148 | 
         
            +
            1. **Deploy Updated App**: Push the updated `app.py` to your HF Space
         
     | 
| 149 | 
         
            +
            2. **Test Plots**: Try plotting your experiments
         
     | 
| 150 | 
         
            +
            3. **Monitor Training**: Continue monitoring your training runs
         
     | 
| 151 | 
         
            +
            4. **Add New Experiments**: Create new experiments as needed
         
     | 
| 152 | 
         
            +
             
     | 
| 153 | 
         
            +
            ## 📞 Support
         
     | 
| 154 | 
         
            +
             
     | 
| 155 | 
         
            +
            If you encounter issues:
         
     | 
| 156 | 
         
            +
            1. Check the logs in your HF Space
         
     | 
| 157 | 
         
            +
            2. Verify experiment IDs are correct
         
     | 
| 158 | 
         
            +
            3. Try the backup recovery feature
         
     | 
| 159 | 
         
            +
            4. Contact the project maintainers for additional support
         
     | 
| 160 | 
         
            +
             
     | 
| 161 | 
         
            +
            ---
         
     | 
| 162 | 
         
            +
             
     | 
| 163 | 
         
            +
            **Your experiments are now properly configured and should display correctly in the Trackio interface!** 🎉 
         
     | 
    	
        docs/MONITORING_IMPROVEMENTS_SUMMARY.md
    ADDED
    
    | 
         @@ -0,0 +1,191 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # 🚀 Monitoring Improvements Summary
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            ## Overview
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            The monitoring system has been significantly enhanced to support **Hugging Face Datasets** for persistent experiment storage, making it ideal for deployment on Hugging Face Spaces and other cloud environments.
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ## ✅ Key Improvements Made
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            ### 1. **Enhanced `monitoring.py`**
         
     | 
| 10 | 
         
            +
            - ✅ **HF Datasets Integration**: Added support for saving experiments to HF Datasets repositories
         
     | 
| 11 | 
         
            +
            - ✅ **Environment Variables**: Automatic detection of `HF_TOKEN` and `TRACKIO_DATASET_REPO`
         
     | 
| 12 | 
         
            +
            - ✅ **Fallback Support**: Graceful degradation if HF Datasets unavailable
         
     | 
| 13 | 
         
            +
            - ✅ **Dual Storage**: Experiments saved to both Trackio and HF Datasets
         
     | 
| 14 | 
         
            +
            - ✅ **Periodic Saving**: Metrics saved to HF Dataset every 10 steps
         
     | 
| 15 | 
         
            +
            - ✅ **Error Handling**: Robust error logging and recovery
         
     | 
| 16 | 
         
            +
             
     | 
| 17 | 
         
            +
            ### 2. **Updated `train.py`**
         
     | 
| 18 | 
         
            +
            - ✅ **Monitoring Integration**: Automatic monitoring setup in training scripts
         
     | 
| 19 | 
         
            +
            - ✅ **Configuration Logging**: Experiment configuration logged at start
         
     | 
| 20 | 
         
            +
            - ✅ **Training Callbacks**: Monitoring callbacks added to trainer
         
     | 
| 21 | 
         
            +
            - ✅ **Summary Logging**: Training summaries logged at completion
         
     | 
| 22 | 
         
            +
            - ✅ **Error Logging**: Errors logged to monitoring system
         
     | 
| 23 | 
         
            +
            - ✅ **Cleanup**: Proper monitoring session cleanup
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
            ### 3. **Configuration Files Updated**
         
     | 
| 26 | 
         
            +
            - ✅ **HF Datasets Config**: Added `hf_token` and `dataset_repo` parameters
         
     | 
| 27 | 
         
            +
            - ✅ **Environment Support**: Environment variables automatically detected
         
     | 
| 28 | 
         
            +
            - ✅ **Backward Compatible**: Existing configurations still work
         
     | 
| 29 | 
         
            +
             
     | 
| 30 | 
         
            +
            ### 4. **New Utility Scripts**
         
     | 
| 31 | 
         
            +
            - ✅ **`configure_trackio.py`**: Configuration testing and setup
         
     | 
| 32 | 
         
            +
            - ✅ **`integrate_monitoring.py`**: Automated integration script
         
     | 
| 33 | 
         
            +
            - ✅ **`test_monitoring_integration.py`**: Comprehensive testing
         
     | 
| 34 | 
         
            +
            - ✅ **`setup_hf_dataset.py`**: Dataset repository setup
         
     | 
| 35 | 
         
            +
             
     | 
| 36 | 
         
            +
            ### 5. **Documentation**
         
     | 
| 37 | 
         
            +
            - ✅ **`MONITORING_INTEGRATION_GUIDE.md`**: Comprehensive usage guide
         
     | 
| 38 | 
         
            +
            - ✅ **`ENVIRONMENT_VARIABLES.md`**: Environment variable reference
         
     | 
| 39 | 
         
            +
            - ✅ **`HF_DATASETS_GUIDE.md`**: Detailed HF Datasets guide
         
     | 
| 40 | 
         
            +
             
     | 
| 41 | 
         
            +
            ## 🔧 Environment Variables
         
     | 
| 42 | 
         
            +
             
     | 
| 43 | 
         
            +
            | Variable | Required | Default | Description |
         
     | 
| 44 | 
         
            +
            |----------|----------|---------|-------------|
         
     | 
| 45 | 
         
            +
            | `HF_TOKEN` | ✅ Yes | None | Your Hugging Face token |
         
     | 
| 46 | 
         
            +
            | `TRACKIO_DATASET_REPO` | ❌ No | `tonic/trackio-experiments` | Dataset repository |
         
     | 
| 47 | 
         
            +
            | `TRACKIO_URL` | ❌ No | None | Trackio server URL |
         
     | 
| 48 | 
         
            +
            | `TRACKIO_TOKEN` | ❌ No | None | Trackio authentication token |
         
     | 
| 49 | 
         
            +
             
     | 
| 50 | 
         
            +
            ## 📊 What Gets Monitored
         
     | 
| 51 | 
         
            +
             
     | 
| 52 | 
         
            +
            ### **Training Metrics**
         
     | 
| 53 | 
         
            +
            - Loss values (training and validation)
         
     | 
| 54 | 
         
            +
            - Learning rate
         
     | 
| 55 | 
         
            +
            - Gradient norms
         
     | 
| 56 | 
         
            +
            - Training steps and epochs
         
     | 
| 57 | 
         
            +
             
     | 
| 58 | 
         
            +
            ### **System Metrics**
         
     | 
| 59 | 
         
            +
            - GPU memory usage
         
     | 
| 60 | 
         
            +
            - GPU utilization
         
     | 
| 61 | 
         
            +
            - CPU usage
         
     | 
| 62 | 
         
            +
            - Memory usage
         
     | 
| 63 | 
         
            +
             
     | 
| 64 | 
         
            +
            ### **Experiment Data**
         
     | 
| 65 | 
         
            +
            - Configuration parameters
         
     | 
| 66 | 
         
            +
            - Model checkpoints
         
     | 
| 67 | 
         
            +
            - Evaluation results
         
     | 
| 68 | 
         
            +
            - Training summaries
         
     | 
| 69 | 
         
            +
             
     | 
| 70 | 
         
            +
            ### **Artifacts**
         
     | 
| 71 | 
         
            +
            - Configuration files
         
     | 
| 72 | 
         
            +
            - Training logs
         
     | 
| 73 | 
         
            +
            - Evaluation results
         
     | 
| 74 | 
         
            +
            - Model checkpoints
         
     | 
| 75 | 
         
            +
             
     | 
| 76 | 
         
            +
            ## 🚀 Usage Examples
         
     | 
| 77 | 
         
            +
             
     | 
| 78 | 
         
            +
            ### **Basic Training**
         
     | 
| 79 | 
         
            +
            ```bash
         
     | 
| 80 | 
         
            +
            # Set environment variables
         
     | 
| 81 | 
         
            +
            export HF_TOKEN=your_token_here
         
     | 
| 82 | 
         
            +
            export TRACKIO_DATASET_REPO=your-username/experiments
         
     | 
| 83 | 
         
            +
             
     | 
| 84 | 
         
            +
            # Run training with monitoring
         
     | 
| 85 | 
         
            +
            python train.py config/train_smollm3_openhermes_fr.py
         
     | 
| 86 | 
         
            +
            ```
         
     | 
| 87 | 
         
            +
             
     | 
| 88 | 
         
            +
            ### **Advanced Configuration**
         
     | 
| 89 | 
         
            +
            ```bash
         
     | 
| 90 | 
         
            +
            # Train with custom settings
         
     | 
| 91 | 
         
            +
            python train.py config/train_smollm3_openhermes_fr.py \
         
     | 
| 92 | 
         
            +
              --experiment_name "smollm3_french_v2" \
         
     | 
| 93 | 
         
            +
              --hf_token your_token_here \
         
     | 
| 94 | 
         
            +
              --dataset_repo your-username/french-experiments
         
     | 
| 95 | 
         
            +
            ```
         
     | 
| 96 | 
         
            +
             
     | 
| 97 | 
         
            +
            ### **Testing Setup**
         
     | 
| 98 | 
         
            +
            ```bash
         
     | 
| 99 | 
         
            +
            # Test configuration
         
     | 
| 100 | 
         
            +
            python configure_trackio.py
         
     | 
| 101 | 
         
            +
             
     | 
| 102 | 
         
            +
            # Test monitoring integration
         
     | 
| 103 | 
         
            +
            python test_monitoring_integration.py
         
     | 
| 104 | 
         
            +
             
     | 
| 105 | 
         
            +
            # Test dataset access
         
     | 
| 106 | 
         
            +
            python test_hf_datasets.py
         
     | 
| 107 | 
         
            +
            ```
         
     | 
| 108 | 
         
            +
             
     | 
| 109 | 
         
            +
            ## 📈 Benefits
         
     | 
| 110 | 
         
            +
             
     | 
| 111 | 
         
            +
            ### **For HF Spaces Deployment**
         
     | 
| 112 | 
         
            +
            - ✅ **Persistent Storage**: Data survives Space restarts
         
     | 
| 113 | 
         
            +
            - ✅ **No Local Storage**: No dependency on ephemeral storage
         
     | 
| 114 | 
         
            +
            - ✅ **Scalable**: Works with any dataset size
         
     | 
| 115 | 
         
            +
            - ✅ **Secure**: Private dataset storage
         
     | 
| 116 | 
         
            +
             
     | 
| 117 | 
         
            +
            ### **For Experiment Management**
         
     | 
| 118 | 
         
            +
            - ✅ **Centralized**: All experiments in one place
         
     | 
| 119 | 
         
            +
            - ✅ **Searchable**: Easy to find specific experiments
         
     | 
| 120 | 
         
            +
            - ✅ **Versioned**: Dataset versioning for experiments
         
     | 
| 121 | 
         
            +
            - ✅ **Collaborative**: Share experiments with your team
         
     | 
| 122 | 
         
            +
             
     | 
| 123 | 
         
            +
            ### **For Development**
         
     | 
| 124 | 
         
            +
            - ✅ **Flexible**: Easy to switch between datasets
         
     | 
| 125 | 
         
            +
            - ✅ **Configurable**: Environment-based configuration
         
     | 
| 126 | 
         
            +
            - ✅ **Robust**: Fallback mechanisms
         
     | 
| 127 | 
         
            +
            - ✅ **Debuggable**: Comprehensive logging
         
     | 
| 128 | 
         
            +
             
     | 
| 129 | 
         
            +
            ## 🧪 Testing Results
         
     | 
| 130 | 
         
            +
             
     | 
| 131 | 
         
            +
            All monitoring integration tests passed:
         
     | 
| 132 | 
         
            +
            - ✅ Module Import
         
     | 
| 133 | 
         
            +
            - ✅ Monitor Creation
         
     | 
| 134 | 
         
            +
            - ✅ Config Creation
         
     | 
| 135 | 
         
            +
            - ✅ Metrics Logging
         
     | 
| 136 | 
         
            +
            - ✅ Configuration Logging
         
     | 
| 137 | 
         
            +
            - ✅ System Metrics
         
     | 
| 138 | 
         
            +
            - ✅ Training Summary
         
     | 
| 139 | 
         
            +
            - ✅ Callback Creation
         
     | 
| 140 | 
         
            +
             
     | 
| 141 | 
         
            +
            ## 📋 Files Modified/Created
         
     | 
| 142 | 
         
            +
             
     | 
| 143 | 
         
            +
            ### **Core Files**
         
     | 
| 144 | 
         
            +
            - `monitoring.py` - Enhanced with HF Datasets support
         
     | 
| 145 | 
         
            +
            - `train.py` - Updated with monitoring integration
         
     | 
| 146 | 
         
            +
            - `requirements/requirements_core.txt` - Added monitoring dependencies
         
     | 
| 147 | 
         
            +
            - `requirements_space.txt` - Updated for HF Spaces
         
     | 
| 148 | 
         
            +
             
     | 
| 149 | 
         
            +
            ### **Configuration Files**
         
     | 
| 150 | 
         
            +
            - `config/train_smollm3.py` - Added HF Datasets config
         
     | 
| 151 | 
         
            +
            - `config/train_smollm3_openhermes_fr.py` - Added HF Datasets config
         
     | 
| 152 | 
         
            +
            - `config/train_smollm3_openhermes_fr_a100_balanced.py` - Added HF Datasets config
         
     | 
| 153 | 
         
            +
            - `config/train_smollm3_openhermes_fr_a100_large.py` - Added HF Datasets config
         
     | 
| 154 | 
         
            +
            - `config/train_smollm3_openhermes_fr_a100_max_performance.py` - Added HF Datasets config
         
     | 
| 155 | 
         
            +
            - `config/train_smollm3_openhermes_fr_a100_multiple_passes.py` - Added HF Datasets config
         
     | 
| 156 | 
         
            +
             
     | 
| 157 | 
         
            +
            ### **New Utility Scripts**
         
     | 
| 158 | 
         
            +
            - `configure_trackio.py` - Configuration testing
         
     | 
| 159 | 
         
            +
            - `integrate_monitoring.py` - Automated integration
         
     | 
| 160 | 
         
            +
            - `test_monitoring_integration.py` - Comprehensive testing
         
     | 
| 161 | 
         
            +
            - `setup_hf_dataset.py` - Dataset setup
         
     | 
| 162 | 
         
            +
             
     | 
| 163 | 
         
            +
            ### **Documentation**
         
     | 
| 164 | 
         
            +
            - `MONITORING_INTEGRATION_GUIDE.md` - Usage guide
         
     | 
| 165 | 
         
            +
            - `ENVIRONMENT_VARIABLES.md` - Environment reference
         
     | 
| 166 | 
         
            +
            - `HF_DATASETS_GUIDE.md` - HF Datasets guide
         
     | 
| 167 | 
         
            +
            - `MONITORING_IMPROVEMENTS_SUMMARY.md` - This summary
         
     | 
| 168 | 
         
            +
             
     | 
| 169 | 
         
            +
            ## 🎯 Next Steps
         
     | 
| 170 | 
         
            +
             
     | 
| 171 | 
         
            +
            1. **Set up your HF token and dataset repository**
         
     | 
| 172 | 
         
            +
            2. **Test the configuration with `python configure_trackio.py`**
         
     | 
| 173 | 
         
            +
            3. **Run a training experiment to verify full functionality**
         
     | 
| 174 | 
         
            +
            4. **Check your HF Dataset repository for experiment data**
         
     | 
| 175 | 
         
            +
            5. **View results in your Trackio interface**
         
     | 
| 176 | 
         
            +
             
     | 
| 177 | 
         
            +
            ## 🔍 Troubleshooting
         
     | 
| 178 | 
         
            +
             
     | 
| 179 | 
         
            +
            ### **Common Issues**
         
     | 
| 180 | 
         
            +
            - **HF_TOKEN not set**: Set your Hugging Face token
         
     | 
| 181 | 
         
            +
            - **Dataset access failed**: Check that your token has the required permissions and that the dataset repository exists
         
     | 
| 182 | 
         
            +
            - **Monitoring not working**: Run `python test_monitoring_integration.py` to diagnose
         
     | 
| 183 | 
         
            +
             
     | 
| 184 | 
         
            +
            ### **Getting Help**
         
     | 
| 185 | 
         
            +
            - Check the comprehensive guides in the documentation files
         
     | 
| 186 | 
         
            +
            - Run the test scripts to verify your setup
         
     | 
| 187 | 
         
            +
            - Check logs for specific error messages
         
     | 
| 188 | 
         
            +
             
     | 
| 189 | 
         
            +
            ---
         
     | 
| 190 | 
         
            +
             
     | 
| 191 | 
         
            +
            **🎉 The monitoring system is now ready for production use with persistent HF Datasets storage!** 
         
     | 
    	
        docs/MONITORING_INTEGRATION_GUIDE.md
    ADDED
    
    | 
         @@ -0,0 +1,245 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # 🔧 Improved Monitoring Integration Guide
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            ## Overview
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            The monitoring system has been enhanced to support **Hugging Face Datasets** for persistent experiment storage, making it ideal for deployment on Hugging Face Spaces and other cloud environments.
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ## 🚀 Key Improvements
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            ### 1. **HF Datasets Integration**
         
     | 
| 10 | 
         
            +
            - ✅ **Persistent Storage**: Experiments are saved to HF Datasets repositories
         
     | 
| 11 | 
         
            +
            - ✅ **Environment Variables**: Configurable via `HF_TOKEN` and `TRACKIO_DATASET_REPO`
         
     | 
| 12 | 
         
            +
            - ✅ **Fallback Support**: Graceful degradation if HF Datasets is unavailable
         
     | 
| 13 | 
         
            +
            - ✅ **Automatic Backup**: Local files are kept as a backup
         
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
            ### 2. **Enhanced Monitoring Features**
         
     | 
| 16 | 
         
            +
            - 📊 **Real-time Metrics**: Training metrics logged to both Trackio and HF Datasets
         
     | 
| 17 | 
         
            +
            - 🔧 **System Metrics**: GPU memory, CPU usage, and system performance
         
     | 
| 18 | 
         
            +
            - 📈 **Training Summaries**: Comprehensive experiment summaries
         
     | 
| 19 | 
         
            +
            - 🛡️ **Error Handling**: Robust error logging and recovery
         
     | 
| 20 | 
         
            +
             
     | 
| 21 | 
         
            +
            ### 3. **Easy Integration**
         
     | 
| 22 | 
         
            +
            - 🔌 **Automatic Setup**: Environment variables automatically detected
         
     | 
| 23 | 
         
            +
            - 📝 **Configuration**: Simple setup with environment variables
         
     | 
| 24 | 
         
            +
            - 🔄 **Backward Compatible**: Works with existing Trackio setup
         
     | 
| 25 | 
         
            +
             
     | 
| 26 | 
         
            +
            ## 📋 Environment Variables
         
     | 
| 27 | 
         
            +
             
     | 
| 28 | 
         
            +
            | Variable | Required | Default | Description |
         
     | 
| 29 | 
         
            +
            |----------|----------|---------|-------------|
         
     | 
| 30 | 
         
            +
            | `HF_TOKEN` | ✅ Yes | None | Your Hugging Face token |
         
     | 
| 31 | 
         
            +
            | `TRACKIO_DATASET_REPO` | ❌ No | `tonic/trackio-experiments` | Dataset repository |
         
     | 
| 32 | 
         
            +
            | `TRACKIO_URL` | ❌ No | None | Trackio server URL |
         
     | 
| 33 | 
         
            +
            | `TRACKIO_TOKEN` | ❌ No | None | Trackio authentication token |
         
     | 
| 34 | 
         
            +
             
     | 
| 35 | 
         
            +
            ## 🛠️ Setup Instructions
         
     | 
| 36 | 
         
            +
             
     | 
| 37 | 
         
            +
            ### 1. **Get Your HF Token**
         
     | 
| 38 | 
         
            +
            ```bash
         
     | 
| 39 | 
         
            +
            # Go to https://huggingface.co/settings/tokens
         
     | 
| 40 | 
         
            +
            # Create a new token with "Write" permissions
         
     | 
| 41 | 
         
            +
            # Copy the token
         
     | 
| 42 | 
         
            +
            ```
         
     | 
| 43 | 
         
            +
             
     | 
| 44 | 
         
            +
            ### 2. **Set Environment Variables**
         
     | 
| 45 | 
         
            +
            ```bash
         
     | 
| 46 | 
         
            +
            # For HF Spaces, add these to your Space settings:
         
     | 
| 47 | 
         
            +
            HF_TOKEN=your_hf_token_here
         
     | 
| 48 | 
         
            +
            TRACKIO_DATASET_REPO=your-username/your-dataset-name
         
     | 
| 49 | 
         
            +
             
     | 
| 50 | 
         
            +
            # For local development:
         
     | 
| 51 | 
         
            +
            export HF_TOKEN=your_hf_token_here
         
     | 
| 52 | 
         
            +
            export TRACKIO_DATASET_REPO=your-username/your-dataset-name
         
     | 
| 53 | 
         
            +
            ```
         
     | 
| 54 | 
         
            +
             
     | 
| 55 | 
         
            +
            ### 3. **Create Dataset Repository**
         
     | 
| 56 | 
         
            +
            ```bash
         
     | 
| 57 | 
         
            +
            # Run the setup script
         
     | 
| 58 | 
         
            +
            python setup_hf_dataset.py
         
     | 
| 59 | 
         
            +
             
     | 
| 60 | 
         
            +
            # Or manually create a dataset on HF Hub
         
     | 
| 61 | 
         
            +
            # Go to https://huggingface.co/datasets
         
     | 
| 62 | 
         
            +
            # Create a new dataset repository
         
     | 
| 63 | 
         
            +
            ```
         
     | 
| 64 | 
         
            +
             
     | 
| 65 | 
         
            +
            ### 4. **Test Configuration**
         
     | 
| 66 | 
         
            +
            ```bash
         
     | 
| 67 | 
         
            +
            # Test your setup
         
     | 
| 68 | 
         
            +
            python configure_trackio.py
         
     | 
| 69 | 
         
            +
             
     | 
| 70 | 
         
            +
            # Test dataset access
         
     | 
| 71 | 
         
            +
            python test_hf_datasets.py
         
     | 
| 72 | 
         
            +
            ```
         
     | 
| 73 | 
         
            +
             
     | 
| 74 | 
         
            +
            ## 🚀 Usage Examples
         
     | 
| 75 | 
         
            +
             
     | 
| 76 | 
         
            +
            ### **Basic Training with Monitoring**
         
     | 
| 77 | 
         
            +
            ```bash
         
     | 
| 78 | 
         
            +
            # Train with default monitoring
         
     | 
| 79 | 
         
            +
            python train.py config/train_smollm3_openhermes_fr.py
         
     | 
| 80 | 
         
            +
             
     | 
| 81 | 
         
            +
            # Train with custom dataset repository
         
     | 
| 82 | 
         
            +
            TRACKIO_DATASET_REPO=your-username/smollm3-experiments python train.py config/train_smollm3_openhermes_fr.py
         
     | 
| 83 | 
         
            +
            ```
         
     | 
| 84 | 
         
            +
             
     | 
| 85 | 
         
            +
            ### **Advanced Training Configuration**
         
     | 
| 86 | 
         
            +
            ```bash
         
     | 
| 87 | 
         
            +
            # Train with custom experiment name
         
     | 
| 88 | 
         
            +
            python train.py config/train_smollm3_openhermes_fr.py \
         
     | 
| 89 | 
         
            +
              --experiment_name "smollm3_french_tuning_v2" \
         
     | 
| 90 | 
         
            +
              --hf_token your_token_here \
         
     | 
| 91 | 
         
            +
              --dataset_repo your-username/french-experiments
         
     | 
| 92 | 
         
            +
            ```
         
     | 
| 93 | 
         
            +
             
     | 
| 94 | 
         
            +
            ### **Training Scripts with Monitoring**
         
     | 
| 95 | 
         
            +
            ```bash
         
     | 
| 96 | 
         
            +
            # All training scripts now support monitoring:
         
     | 
| 97 | 
         
            +
            python train.py config/train_smollm3_openhermes_fr_a100_balanced.py
         
     | 
| 98 | 
         
            +
            python train.py config/train_smollm3_openhermes_fr_a100_large.py
         
     | 
| 99 | 
         
            +
            python train.py config/train_smollm3_openhermes_fr_a100_max_performance.py
         
     | 
| 100 | 
         
            +
            python train.py config/train_smollm3_openhermes_fr_a100_multiple_passes.py
         
     | 
| 101 | 
         
            +
            ```
         
     | 
| 102 | 
         
            +
             
     | 
| 103 | 
         
            +
            ## 📊 What Gets Monitored
         
     | 
| 104 | 
         
            +
             
     | 
| 105 | 
         
            +
            ### **Training Metrics**
         
     | 
| 106 | 
         
            +
            - Loss values (training and validation)
         
     | 
| 107 | 
         
            +
            - Learning rate
         
     | 
| 108 | 
         
            +
            - Gradient norms
         
     | 
| 109 | 
         
            +
            - Training steps and epochs
         
     | 
| 110 | 
         
            +
             
     | 
| 111 | 
         
            +
            ### **System Metrics**
         
     | 
| 112 | 
         
            +
            - GPU memory usage
         
     | 
| 113 | 
         
            +
            - GPU utilization
         
     | 
| 114 | 
         
            +
            - CPU usage
         
     | 
| 115 | 
         
            +
            - Memory usage
         
     | 
| 116 | 
         
            +
             
     | 
| 117 | 
         
            +
            ### **Experiment Data**
         
     | 
| 118 | 
         
            +
            - Configuration parameters
         
     | 
| 119 | 
         
            +
            - Model checkpoints
         
     | 
| 120 | 
         
            +
            - Evaluation results
         
     | 
| 121 | 
         
            +
            - Training summaries
         
     | 
| 122 | 
         
            +
             
     | 
| 123 | 
         
            +
            ### **Artifacts**
         
     | 
| 124 | 
         
            +
            - Configuration files
         
     | 
| 125 | 
         
            +
            - Training logs
         
     | 
| 126 | 
         
            +
            - Evaluation results
         
     | 
| 127 | 
         
            +
            - Model checkpoints
         
     | 
| 128 | 
         
            +
             
     | 
| 129 | 
         
            +
            ## 🔍 Viewing Results
         
     | 
| 130 | 
         
            +
             
     | 
| 131 | 
         
            +
            ### **1. Trackio Interface**
         
     | 
| 132 | 
         
            +
            - Visit your Trackio Space
         
     | 
| 133 | 
         
            +
            - Navigate to the "Experiments" tab
         
     | 
| 134 | 
         
            +
            - View real-time metrics and plots
         
     | 
| 135 | 
         
            +
             
     | 
| 136 | 
         
            +
            ### **2. HF Dataset Repository**
         
     | 
| 137 | 
         
            +
            - Go to your dataset repository on HF Hub
         
     | 
| 138 | 
         
            +
            - Browse experiment data
         
     | 
| 139 | 
         
            +
            - Download experiment files
         
     | 
| 140 | 
         
            +
             
     | 
| 141 | 
         
            +
            ### **3. Local Files**
         
     | 
| 142 | 
         
            +
            - Check local backup files
         
     | 
| 143 | 
         
            +
            - Review training logs
         
     | 
| 144 | 
         
            +
            - Examine configuration files
         
     | 
| 145 | 
         
            +
             
     | 
| 146 | 
         
            +
            ## 🛠️ Configuration Examples
         
     | 
| 147 | 
         
            +
             
     | 
| 148 | 
         
            +
            ### **Default Setup**
         
     | 
| 149 | 
         
            +
            ```python
         
     | 
| 150 | 
         
            +
            # Uses default dataset: tonic/trackio-experiments
         
     | 
| 151 | 
         
            +
            # Requires only HF_TOKEN
         
     | 
| 152 | 
         
            +
            ```
         
     | 
| 153 | 
         
            +
             
     | 
| 154 | 
         
            +
            ### **Personal Dataset**
         
     | 
| 155 | 
         
            +
            ```bash
         
     | 
| 156 | 
         
            +
            export HF_TOKEN=your_token_here
         
     | 
| 157 | 
         
            +
            export TRACKIO_DATASET_REPO=your-username/trackio-experiments
         
     | 
| 158 | 
         
            +
            ```
         
     | 
| 159 | 
         
            +
             
     | 
| 160 | 
         
            +
            ### **Team Dataset**
         
     | 
| 161 | 
         
            +
            ```bash
         
     | 
| 162 | 
         
            +
            export HF_TOKEN=your_token_here
         
     | 
| 163 | 
         
            +
            export TRACKIO_DATASET_REPO=your-org/team-experiments
         
     | 
| 164 | 
         
            +
            ```
         
     | 
| 165 | 
         
            +
             
     | 
| 166 | 
         
            +
            ### **Project-Specific Dataset**
         
     | 
| 167 | 
         
            +
            ```bash
         
     | 
| 168 | 
         
            +
            export HF_TOKEN=your_token_here
         
     | 
| 169 | 
         
            +
            export TRACKIO_DATASET_REPO=your-username/smollm3-experiments
         
     | 
| 170 | 
         
            +
            ```
         
     | 
| 171 | 
         
            +
             
     | 
| 172 | 
         
            +
            ## 🔧 Troubleshooting
         
     | 
| 173 | 
         
            +
             
     | 
| 174 | 
         
            +
            ### **Issue: "HF_TOKEN not found"**
         
     | 
| 175 | 
         
            +
            ```bash
         
     | 
| 176 | 
         
            +
            # Solution: Set your HF token
         
     | 
| 177 | 
         
            +
            export HF_TOKEN=your_token_here
         
     | 
| 178 | 
         
            +
            # Or add to HF Space environment variables
         
     | 
| 179 | 
         
            +
            ```
         
     | 
| 180 | 
         
            +
             
     | 
| 181 | 
         
            +
            ### **Issue: "Failed to load dataset"**
         
     | 
| 182 | 
         
            +
            ```bash
         
     | 
| 183 | 
         
            +
            # Solutions:
         
     | 
| 184 | 
         
            +
            # 1. Check token has read access
         
     | 
| 185 | 
         
            +
            # 2. Verify dataset repository exists
         
     | 
| 186 | 
         
            +
            # 3. Run setup script: python setup_hf_dataset.py
         
     | 
| 187 | 
         
            +
            ```
         
     | 
| 188 | 
         
            +
             
     | 
| 189 | 
         
            +
            ### **Issue: "Failed to save experiments"**
         
     | 
| 190 | 
         
            +
            ```bash
         
     | 
| 191 | 
         
            +
            # Solutions:
         
     | 
| 192 | 
         
            +
            # 1. Check token has write permissions
         
     | 
| 193 | 
         
            +
            # 2. Verify dataset repository exists
         
     | 
| 194 | 
         
            +
            # 3. Check network connectivity
         
     | 
| 195 | 
         
            +
            ```
         
     | 
| 196 | 
         
            +
             
     | 
| 197 | 
         
            +
            ### **Issue: "Monitoring not working"**
         
     | 
| 198 | 
         
            +
            ```bash
         
     | 
| 199 | 
         
            +
            # Solutions:
         
     | 
| 200 | 
         
            +
            # 1. Check environment variables
         
     | 
| 201 | 
         
            +
            # 2. Run configuration test: python configure_trackio.py
         
     | 
| 202 | 
         
            +
            # 3. Check logs for specific errors
         
     | 
| 203 | 
         
            +
            ```
         
     | 
| 204 | 
         
            +
             
     | 
| 205 | 
         
            +
            ## 📈 Benefits
         
     | 
| 206 | 
         
            +
             
     | 
| 207 | 
         
            +
            ### **For HF Spaces Deployment**
         
     | 
| 208 | 
         
            +
            - ✅ **Persistent Storage**: Data survives Space restarts
         
     | 
| 209 | 
         
            +
            - ✅ **No Local Storage**: No dependency on ephemeral storage
         
     | 
| 210 | 
         
            +
            - ✅ **Scalable**: Works with any dataset size
         
     | 
| 211 | 
         
            +
            - ✅ **Secure**: Private dataset storage
         
     | 
| 212 | 
         
            +
             
     | 
| 213 | 
         
            +
            ### **For Experiment Management**
         
     | 
| 214 | 
         
            +
            - ✅ **Centralized**: All experiments in one place
         
     | 
| 215 | 
         
            +
            - ✅ **Searchable**: Easy to find specific experiments
         
     | 
| 216 | 
         
            +
            - ✅ **Versioned**: Dataset versioning for experiments
         
     | 
| 217 | 
         
            +
            - ✅ **Collaborative**: Share experiments with team
         
     | 
| 218 | 
         
            +
             
     | 
| 219 | 
         
            +
            ### **For Development**
         
     | 
| 220 | 
         
            +
            - ✅ **Flexible**: Easy to switch between datasets
         
     | 
| 221 | 
         
            +
            - ✅ **Configurable**: Environment-based configuration
         
     | 
| 222 | 
         
            +
            - ✅ **Robust**: Fallback mechanisms
         
     | 
| 223 | 
         
            +
            - ✅ **Debuggable**: Comprehensive logging
         
     | 
| 224 | 
         
            +
             
     | 
| 225 | 
         
            +
            ## 🎯 Next Steps
         
     | 
| 226 | 
         
            +
             
     | 
| 227 | 
         
            +
            1. **Set up your HF token and dataset repository**
         
     | 
| 228 | 
         
            +
            2. **Test the configuration with `python configure_trackio.py`**
         
     | 
| 229 | 
         
            +
            3. **Run a training experiment to verify monitoring**
         
     | 
| 230 | 
         
            +
            4. **Check your HF Dataset repository for experiment data**
         
     | 
| 231 | 
         
            +
            5. **View results in your Trackio interface**
         
     | 
| 232 | 
         
            +
             
     | 
| 233 | 
         
            +
            ## 📚 Related Files
         
     | 
| 234 | 
         
            +
             
     | 
| 235 | 
         
            +
            - `monitoring.py` - Enhanced monitoring with HF Datasets support
         
     | 
| 236 | 
         
            +
            - `train.py` - Updated training script with monitoring integration
         
     | 
| 237 | 
         
            +
            - `configure_trackio.py` - Configuration and testing script
         
     | 
| 238 | 
         
            +
            - `setup_hf_dataset.py` - Dataset repository setup
         
     | 
| 239 | 
         
            +
            - `test_hf_datasets.py` - Dataset access testing
         
     | 
| 240 | 
         
            +
            - `ENVIRONMENT_VARIABLES.md` - Environment variable reference
         
     | 
| 241 | 
         
            +
            - `HF_DATASETS_GUIDE.md` - Detailed HF Datasets guide
         
     | 
| 242 | 
         
            +
             
     | 
| 243 | 
         
            +
            ---
         
     | 
| 244 | 
         
            +
             
     | 
| 245 | 
         
            +
            **🎉 Your experiments are now persistently stored and easily accessible!** 
         
     | 
    	
        NO_THINK_TAG_GUIDE.md → docs/NO_THINK_TAG_GUIDE.md
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        PUSH_GUIDE.md → docs/PUSH_GUIDE.md
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        docs/PUSH_SCRIPT_GUIDE.md
    ADDED
    
    | 
         @@ -0,0 +1,267 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            # 🚀 Push to Hugging Face Script Guide
         
     | 
| 2 | 
         
            +
             
     | 
| 3 | 
         
            +
            ## Overview
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            The `push_to_huggingface.py` script has been enhanced to integrate with **HF Datasets** for experiment tracking, and it now provides complete model deployment with persistent experiment storage.
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            ## 🚀 Key Improvements
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
            ### **1. HF Datasets Integration**
         
     | 
| 10 | 
         
            +
            - ✅ **Dataset Repository Support**: Configurable dataset repository for experiment storage
         
     | 
| 11 | 
         
            +
            - ✅ **Environment Variables**: Automatic detection of `HF_TOKEN` and `TRACKIO_DATASET_REPO`
         
     | 
| 12 | 
         
            +
            - ✅ **Enhanced Logging**: Logs push actions to both Trackio and HF Datasets
         
     | 
| 13 | 
         
            +
            - ✅ **Model Card Integration**: Includes dataset repository information in model cards
         
     | 
| 14 | 
         
            +
             
     | 
| 15 | 
         
            +
            ### **2. Enhanced Configuration**
         
     | 
| 16 | 
         
            +
            - ✅ **Flexible Token Input**: Multiple ways to provide HF token
         
     | 
| 17 | 
         
            +
            - ✅ **Dataset Repository Tracking**: Links models to their experiment datasets
         
     | 
| 18 | 
         
            +
            - ✅ **Environment Variable Support**: Fallback to environment variables
         
     | 
| 19 | 
         
            +
            - ✅ **Command Line Arguments**: New arguments for HF Datasets integration
         
     | 
| 20 | 
         
            +
             
     | 
| 21 | 
         
            +
            ### **3. Improved Model Cards**
         
     | 
| 22 | 
         
            +
            - ✅ **Dataset Repository Info**: Shows which dataset contains experiment data
         
     | 
| 23 | 
         
            +
            - ✅ **Experiment Tracking Section**: Explains how to access training data
         
     | 
| 24 | 
         
            +
            - ✅ **Enhanced Documentation**: Better model cards with experiment links
         
     | 
| 25 | 
         
            +
             
     | 
| 26 | 
         
            +
            ## 📋 Usage Examples
         
     | 
| 27 | 
         
            +
             
     | 
| 28 | 
         
            +
            ### **Basic Usage**
         
     | 
| 29 | 
         
            +
            ```bash
         
     | 
| 30 | 
         
            +
            # Push model with default settings
         
     | 
| 31 | 
         
            +
            python push_to_huggingface.py /path/to/model username/repo-name
         
     | 
| 32 | 
         
            +
            ```
         
     | 
| 33 | 
         
            +
             
     | 
| 34 | 
         
            +
            ### **With HF Datasets Integration**
         
     | 
| 35 | 
         
            +
            ```bash
         
     | 
| 36 | 
         
            +
            # Push model with custom dataset repository
         
     | 
| 37 | 
         
            +
            python push_to_huggingface.py /path/to/model username/repo-name \
         
     | 
| 38 | 
         
            +
              --dataset-repo username/experiments
         
     | 
| 39 | 
         
            +
            ```
         
     | 
| 40 | 
         
            +
             
     | 
| 41 | 
         
            +
            ### **With Custom Token**
         
     | 
| 42 | 
         
            +
            ```bash
         
     | 
| 43 | 
         
            +
            # Push model with custom HF token
         
     | 
| 44 | 
         
            +
            python push_to_huggingface.py /path/to/model username/repo-name \
         
     | 
| 45 | 
         
            +
              --hf-token your_token_here
         
     | 
| 46 | 
         
            +
            ```
         
     | 
| 47 | 
         
            +
             
     | 
| 48 | 
         
            +
            ### **Complete Example**
         
     | 
| 49 | 
         
            +
            ```bash
         
     | 
| 50 | 
         
            +
            # Push model with all options
         
     | 
| 51 | 
         
            +
            python push_to_huggingface.py /path/to/model username/repo-name \
         
     | 
| 52 | 
         
            +
              --dataset-repo username/experiments \
         
     | 
| 53 | 
         
            +
              --hf-token your_token_here \
         
     | 
| 54 | 
         
            +
              --private \
         
     | 
| 55 | 
         
            +
              --experiment-name "smollm3_finetune_v2"
         
     | 
| 56 | 
         
            +
            ```
         
     | 
| 57 | 
         
            +
             
     | 
| 58 | 
         
            +
            ## 🔧 Command Line Arguments
         
     | 
| 59 | 
         
            +
             
     | 
| 60 | 
         
            +
            | Argument | Required | Default | Description |
         
     | 
| 61 | 
         
            +
            |----------|----------|---------|-------------|
         
     | 
| 62 | 
         
            +
            | `model_path` | ✅ Yes | None | Path to trained model directory |
         
     | 
| 63 | 
         
            +
            | `repo_name` | ✅ Yes | None | HF repository name (username/repo-name) |
         
     | 
| 64 | 
         
            +
            | `--token` | ❌ No | `HF_TOKEN` env | Hugging Face token |
         
     | 
| 65 | 
         
            +
            | `--hf-token` | ❌ No | `HF_TOKEN` env | HF token (alternative to --token) |
         
     | 
| 66 | 
         
            +
            | `--private` | ❌ No | False | Make repository private |
         
     | 
| 67 | 
         
            +
            | `--trackio-url` | ❌ No | None | Trackio Space URL for logging |
         
     | 
| 68 | 
         
            +
            | `--experiment-name` | ❌ No | None | Experiment name for Trackio |
         
     | 
| 69 | 
         
            +
            | `--dataset-repo` | ❌ No | `TRACKIO_DATASET_REPO` env | HF Dataset repository |
         
     | 
| 70 | 
         
            +
             
     | 
| 71 | 
         
            +
            ## 🛠️ Configuration Methods
         
     | 
| 72 | 
         
            +
             
     | 
| 73 | 
         
            +
            ### **Method 1: Command Line Arguments**
         
     | 
| 74 | 
         
            +
            ```bash
         
     | 
| 75 | 
         
            +
            python push_to_huggingface.py model_path repo_name \
         
     | 
| 76 | 
         
            +
              --dataset-repo username/experiments \
         
     | 
| 77 | 
         
            +
              --hf-token your_token_here
         
     | 
| 78 | 
         
            +
            ```
         
     | 
| 79 | 
         
            +
             
     | 
| 80 | 
         
            +
            ### **Method 2: Environment Variables**
         
     | 
| 81 | 
         
            +
            ```bash
         
     | 
| 82 | 
         
            +
            export HF_TOKEN=your_token_here
         
     | 
| 83 | 
         
            +
            export TRACKIO_DATASET_REPO=username/experiments
         
     | 
| 84 | 
         
            +
            python push_to_huggingface.py model_path repo_name
         
     | 
| 85 | 
         
            +
            ```
         
     | 
| 86 | 
         
            +
             
     | 
| 87 | 
         
            +
            ### **Method 3: Hybrid Approach**
         
     | 
| 88 | 
         
            +
            ```bash
         
     | 
| 89 | 
         
            +
            # Set defaults via environment variables
         
     | 
| 90 | 
         
            +
            export HF_TOKEN=your_token_here
         
     | 
| 91 | 
         
            +
            export TRACKIO_DATASET_REPO=username/experiments
         
     | 
| 92 | 
         
            +
             
     | 
| 93 | 
         
            +
            # Override specific values via command line
         
     | 
| 94 | 
         
            +
            python push_to_huggingface.py model_path repo_name \
         
     | 
| 95 | 
         
            +
              --dataset-repo username/specific-experiments
         
     | 
| 96 | 
         
            +
            ```
         
     | 
| 97 | 
         
            +
             
     | 
| 98 | 
         
            +
            ## 📊 What Gets Pushed
         
     | 
| 99 | 
         
            +
             
     | 
| 100 | 
         
            +
            ### **Model Files**
         
     | 
| 101 | 
         
            +
            - ✅ **Model Weights**: `pytorch_model.bin`
         
     | 
| 102 | 
         
            +
            - ✅ **Configuration**: `config.json`
         
     | 
| 103 | 
         
            +
            - ✅ **Tokenizer**: `tokenizer.json`, `tokenizer_config.json`
         
     | 
| 104 | 
         
            +
            - ✅ **All Other Files**: Any additional files in model directory
         
     | 
| 105 | 
         
            +
             
     | 
| 106 | 
         
            +
            ### **Documentation**
         
     | 
| 107 | 
         
            +
            - ✅ **Model Card**: Comprehensive README.md with model information
         
     | 
| 108 | 
         
            +
            - ✅ **Training Configuration**: JSON configuration used for training
         
     | 
| 109 | 
         
            +
            - ✅ **Training Results**: JSON results and metrics
         
     | 
| 110 | 
         
            +
            - ✅ **Training Logs**: Text logs from training process
         
     | 
| 111 | 
         
            +
             
     | 
| 112 | 
         
            +
            ### **Experiment Data**
         
     | 
| 113 | 
         
            +
            - ✅ **Dataset Repository**: Links to HF Dataset containing experiment data
         
     | 
| 114 | 
         
            +
            - ✅ **Training Metrics**: All training metrics stored in dataset
         
     | 
| 115 | 
         
            +
            - ✅ **Configuration**: Training configuration stored in dataset
         
     | 
| 116 | 
         
            +
            - ✅ **Artifacts**: Training artifacts and logs
         
     | 
| 117 | 
         
            +
             
     | 
| 118 | 
         
            +
            ## 🔍 Enhanced Model Cards
         
     | 
| 119 | 
         
            +
             
     | 
| 120 | 
         
            +
            The improved script creates enhanced model cards that include:
         
     | 
| 121 | 
         
            +
             
     | 
| 122 | 
         
            +
            ### **Model Information**
         
     | 
| 123 | 
         
            +
            - Base model and architecture
         
     | 
| 124 | 
         
            +
            - Training date and model size
         
     | 
| 125 | 
         
            +
            - **Dataset repository** for experiment data
         
     | 
| 126 | 
         
            +
             
     | 
| 127 | 
         
            +
            ### **Training Configuration**
         
     | 
| 128 | 
         
            +
            - Complete training parameters
         
     | 
| 129 | 
         
            +
            - Hardware information
         
     | 
| 130 | 
         
            +
            - Training duration and steps
         
     | 
| 131 | 
         
            +
             
     | 
| 132 | 
         
            +
            ### **Experiment Tracking**
         
     | 
| 133 | 
         
            +
            - Links to HF Dataset repository
         
     | 
| 134 | 
         
            +
            - Instructions for accessing experiment data
         
     | 
| 135 | 
         
            +
            - Training metrics and results
         
     | 
| 136 | 
         
            +
             
     | 
| 137 | 
         
            +
            ### **Usage Examples**
         
     | 
| 138 | 
         
            +
            - Code examples for loading and using the model
         
     | 
| 139 | 
         
            +
            - Generation examples
         
     | 
| 140 | 
         
            +
            - Performance information
         
     | 
| 141 | 
         
            +
             
     | 
| 142 | 
         
            +
            ## 📈 Logging Integration
         
     | 
| 143 | 
         
            +
             
     | 
| 144 | 
         
            +
            ### **Trackio Logging**
         
     | 
| 145 | 
         
            +
            - ✅ **Push Actions**: Logs model push events
         
     | 
| 146 | 
         
            +
            - ✅ **Model Information**: Repository name, size, configuration
         
     | 
| 147 | 
         
            +
            - ✅ **Training Data**: Links to experiment dataset
         
     | 
| 148 | 
         
            +
             
     | 
| 149 | 
         
            +
            ### **HF Datasets Logging**
         
     | 
| 150 | 
         
            +
            - ✅ **Experiment Summary**: Final training summary
         
     | 
| 151 | 
         
            +
            - ✅ **Push Metadata**: Model repository and push date
         
     | 
| 152 | 
         
            +
            - ✅ **Configuration**: Complete training configuration
         
     | 
| 153 | 
         
            +
             
     | 
| 154 | 
         
            +
            ### **Dual Storage**
         
     | 
| 155 | 
         
            +
            - ✅ **Trackio**: Real-time monitoring and visualization
         
     | 
| 156 | 
         
            +
            - ✅ **HF Datasets**: Persistent experiment storage
         
     | 
| 157 | 
         
            +
            - ✅ **Synchronized**: Both systems updated together
         
     | 
| 158 | 
         
            +
             
     | 
| 159 | 
         
            +
            ## 🚨 Troubleshooting
         
     | 
| 160 | 
         
            +
             
     | 
| 161 | 
         
            +
            ### **Issue: "Missing required files"**
         
     | 
| 162 | 
         
            +
            **Solutions**:
         
     | 
| 163 | 
         
            +
            1. Check model directory contains required files
         
     | 
| 164 | 
         
            +
            2. Ensure model was saved correctly during training
         
     | 
| 165 | 
         
            +
            3. Verify file permissions
         
     | 
| 166 | 
         
            +
             
     | 
| 167 | 
         
            +
            ### **Issue: "Failed to create repository"**
         
     | 
| 168 | 
         
            +
            **Solutions**:
         
     | 
| 169 | 
         
            +
            1. Check HF token has write permissions
         
     | 
| 170 | 
         
            +
            2. Verify repository name format: `username/repo-name`
         
     | 
| 171 | 
         
            +
            3. Ensure the repository doesn't already exist under an account you cannot write to (`--private` only changes visibility; it does not resolve a name conflict)
         
     | 
| 172 | 
         
            +
             
     | 
| 173 | 
         
            +
            ### **Issue: "Failed to upload files"**
         
     | 
| 174 | 
         
            +
            **Solutions**:
         
     | 
| 175 | 
         
            +
            1. Check network connectivity
         
     | 
| 176 | 
         
            +
            2. Verify HF token is valid
         
     | 
| 177 | 
         
            +
            3. Ensure repository was created successfully
         
     | 
| 178 | 
         
            +
             
     | 
| 179 | 
         
            +
            ### **Issue: "Dataset repository not found"**
         
     | 
| 180 | 
         
            +
            **Solutions**:
         
     | 
| 181 | 
         
            +
            1. Check dataset repository exists
         
     | 
| 182 | 
         
            +
            2. Verify HF token has read access
         
     | 
| 183 | 
         
            +
            3. Use `--dataset-repo` to specify the correct repository
         
     | 
| 184 | 
         
            +
             
     | 
| 185 | 
         
            +
            ## 📋 Workflow Integration
         
     | 
| 186 | 
         
            +
             
     | 
| 187 | 
         
            +
            ### **Complete Training Workflow**
         
     | 
| 188 | 
         
            +
            1. **Train Model**: Use training scripts with monitoring
         
     | 
| 189 | 
         
            +
            2. **Monitor Progress**: View metrics in Trackio interface
         
     | 
| 190 | 
         
            +
            3. **Push Model**: Use improved push script
         
     | 
| 191 | 
         
            +
            4. **Access Data**: View experiments in HF Dataset repository
         
     | 
| 192 | 
         
            +
             
     | 
| 193 | 
         
            +
            ### **Example Workflow**
         
     | 
| 194 | 
         
            +
            ```bash
         
     | 
| 195 | 
         
            +
            # 1. Train model with monitoring
         
     | 
| 196 | 
         
            +
            python train.py config/train_smollm3_openhermes_fr.py \
         
     | 
| 197 | 
         
            +
              --experiment_name "smollm3_french_v2"
         
     | 
| 198 | 
         
            +
             
     | 
| 199 | 
         
            +
            # 2. Push model to HF Hub
         
     | 
| 200 | 
         
            +
            python push_to_huggingface.py outputs/model username/smollm3-french \
         
     | 
| 201 | 
         
            +
              --dataset-repo username/experiments \
         
     | 
| 202 | 
         
            +
              --experiment-name "smollm3_french_v2"
         
     | 
| 203 | 
         
            +
             
     | 
| 204 | 
         
            +
            # 3. View results
         
     | 
| 205 | 
         
            +
            # - Model: https://huggingface.co/username/smollm3-french
         
     | 
| 206 | 
         
            +
            # - Experiments: https://huggingface.co/datasets/username/experiments
         
     | 
| 207 | 
         
            +
            # - Trackio: Your Trackio Space interface
         
     | 
| 208 | 
         
            +
            ```
         
     | 
| 209 | 
         
            +
             
     | 
| 210 | 
         
            +
            ## 🎯 Benefits
         
     | 
| 211 | 
         
            +
             
     | 
| 212 | 
         
            +
            ### **For Model Deployment**
         
     | 
| 213 | 
         
            +
            - ✅ **Complete Documentation**: Enhanced model cards with experiment links
         
     | 
| 214 | 
         
            +
            - ✅ **Persistent Storage**: Experiment data stored in HF Datasets
         
     | 
| 215 | 
         
            +
            - ✅ **Easy Access**: Direct links to training data and metrics
         
     | 
| 216 | 
         
            +
            - ✅ **Reproducibility**: Complete training configuration included
         
     | 
| 217 | 
         
            +
             
     | 
| 218 | 
         
            +
            ### **For Experiment Management**
         
     | 
| 219 | 
         
            +
            - ✅ **Centralized Storage**: All experiments in HF Dataset repository
         
     | 
| 220 | 
         
            +
            - ✅ **Version Control**: Model versions linked to experiment data
         
     | 
| 221 | 
         
            +
            - ✅ **Collaboration**: Share experiments and models easily
         
     | 
| 222 | 
         
            +
            - ✅ **Searchability**: Easy to find specific experiments
         
     | 
| 223 | 
         
            +
             
     | 
| 224 | 
         
            +
            ### **For Development**
         
     | 
| 225 | 
         
            +
            - ✅ **Flexible Configuration**: Multiple ways to set parameters
         
     | 
| 226 | 
         
            +
            - ✅ **Backward Compatible**: Works with existing setups
         
     | 
| 227 | 
         
            +
            - ✅ **Error Handling**: Clear error messages and troubleshooting
         
     | 
| 228 | 
         
            +
            - ✅ **Integration**: Works with existing monitoring system
         
     | 
| 229 | 
         
            +
             
     | 
| 230 | 
         
            +
            ## 📊 Testing Results
         
     | 
| 231 | 
         
            +
             
     | 
| 232 | 
         
            +
            All push script tests passed:
         
     | 
| 233 | 
         
            +
            - ✅ **HuggingFacePusher Initialization**: Works with new parameters
         
     | 
| 234 | 
         
            +
            - ✅ **Model Card Creation**: Includes HF Datasets integration
         
     | 
| 235 | 
         
            +
            - ✅ **Logging Integration**: Logs to both Trackio and HF Datasets
         
     | 
| 236 | 
         
            +
            - ✅ **Argument Parsing**: Handles new command line arguments
         
     | 
| 237 | 
         
            +
            - ✅ **Environment Variables**: Proper fallback handling
         
     | 
| 238 | 
         
            +
             
     | 
| 239 | 
         
            +
            ## 🔄 Migration Guide
         
     | 
| 240 | 
         
            +
             
     | 
| 241 | 
         
            +
            ### **From Old Script**
         
     | 
| 242 | 
         
            +
            ```bash
         
     | 
| 243 | 
         
            +
            # Old way
         
     | 
| 244 | 
         
            +
            python push_to_huggingface.py model_path repo_name --token your_token
         
     | 
| 245 | 
         
            +
             
     | 
| 246 | 
         
            +
            # New way (same functionality)
         
     | 
| 247 | 
         
            +
            python push_to_huggingface.py model_path repo_name --hf-token your_token
         
     | 
| 248 | 
         
            +
             
     | 
| 249 | 
         
            +
            # New way with HF Datasets
         
     | 
| 250 | 
         
            +
            python push_to_huggingface.py model_path repo_name \
         
     | 
| 251 | 
         
            +
              --hf-token your_token \
         
     | 
| 252 | 
         
            +
              --dataset-repo username/experiments
         
     | 
| 253 | 
         
            +
            ```
         
     | 
| 254 | 
         
            +
             
     | 
| 255 | 
         
            +
            ### **Environment Variables**
         
     | 
| 256 | 
         
            +
            ```bash
         
     | 
| 257 | 
         
            +
            # Set environment variables for automatic detection
         
     | 
| 258 | 
         
            +
            export HF_TOKEN=your_token_here
         
     | 
| 259 | 
         
            +
            export TRACKIO_DATASET_REPO=username/experiments
         
     | 
| 260 | 
         
            +
             
     | 
| 261 | 
         
            +
            # Then use simple command
         
     | 
| 262 | 
         
            +
            python push_to_huggingface.py model_path repo_name
         
     | 
| 263 | 
         
            +
            ```
         
     | 
| 264 | 
         
            +
             
     | 
| 265 | 
         
            +
            ---
         
     | 
| 266 | 
         
            +
             
     | 
| 267 | 
         
            +
            **🎉 Your push script is now fully integrated with HF Datasets for complete experiment tracking and model deployment!** 
         
     | 
    	
        TRACKIO_INTEGRATION.md → docs/TRACKIO_INTEGRATION.md
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        TRACKIO_INTEGRATION_VERIFICATION.md → docs/TRACKIO_INTEGRATION_VERIFICATION.md
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        TRACKIO_INTERFACE_GUIDE.md → docs/TRACKIO_INTERFACE_GUIDE.md
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        launch.sh
    ADDED
    
    | 
         @@ -0,0 +1,690 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #!/bin/bash
         
     | 
| 2 | 
         
            +
            # Interactive SmolLM3 End-to-End Fine-tuning Pipeline
         
     | 
| 3 | 
         
            +
            # This script creates a complete finetuning pipeline with user configuration
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            set -e  # Exit on any error
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            # Colors for output
         
     | 
| 8 | 
         
            +
            # ANSI colour sequences for the print_* helpers. They are single-quoted,
            # so each variable holds the literal backslash text; it only turns into a
            # real escape when a helper prints it through `echo -e`.
            RED='\033[0;31m'
            GREEN='\033[0;32m'
            YELLOW='\033[1;33m'
            BLUE='\033[0;34m'
            PURPLE='\033[0;35m'
            CYAN='\033[0;36m'
            NC='\033[0m' # No Color
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            # Function to print colored output
         
     | 
| 17 | 
         
            +
            print_status() {
                # Success message: green, check-mark prefix, colour reset at the end.
                #   $1 - text to display
                local line="${GREEN}✅ $1${NC}"
                echo -e "$line"
            }
         
     | 
| 20 | 
         
            +
             
     | 
| 21 | 
         
            +
            print_warning() {
                # Warning message: yellow, warning-sign prefix, colour reset at the end.
                #   $1 - text to display
                local line="${YELLOW}⚠️  $1${NC}"
                echo -e "$line"
            }
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
            print_error() {
                # Error message: red, cross-mark prefix, colour reset at the end.
                # Written to stdout, like the other print_* helpers.
                #   $1 - text to display
                local line="${RED}❌ $1${NC}"
                echo -e "$line"
            }
         
     | 
| 28 | 
         
            +
             
     | 
| 29 | 
         
            +
            print_info() {
                # Informational message: blue, info-sign prefix, colour reset at the end.
                #   $1 - text to display
                local line="${BLUE}ℹ️  $1${NC}"
                echo -e "$line"
            }
         
     | 
| 32 | 
         
            +
             
     | 
| 33 | 
         
            +
            print_header() {
                # Section header: purple, rocket prefix, colour reset at the end.
                #   $1 - header text
                local line="${PURPLE}🚀 $1${NC}"
                echo -e "$line"
            }
         
     | 
| 36 | 
         
            +
             
     | 
| 37 | 
         
            +
            print_step() {
                # Pipeline step marker: cyan, clipboard prefix, colour reset at the end.
                #   $1 - step description
                local line="${CYAN}📋 $1${NC}"
                echo -e "$line"
            }
         
     | 
| 40 | 
         
            +
             
     | 
| 41 | 
         
            +
            # Function to get user input with default value
         
     | 
| 42 | 
         
            +
            get_input() {
                # Prompt for a value and store the answer in the caller's variable.
                #
                # Arguments:
                #   $1 - prompt text
                #   $2 - default value; when empty the field is required and the
                #        prompt repeats until something is entered
                #   $3 - name of the variable that receives the answer
                #
                # Returns:
                #   0 once a value has been stored; 1 if stdin hits end-of-file while
                #   a required value is still missing (avoids re-prompting forever).
                local prompt="$1"
                local default="$2"
                local var_name="$3"

                if [ -n "$default" ]; then
                    # -r keeps backslashes in the answer (e.g. Windows-style paths).
                    read -r -p "$prompt [$default]: " input
                    if [ -z "$input" ]; then
                        input="$default"
                    fi
                else
                    # A final line without a trailing newline still counts as an
                    # answer; only a truly empty EOF aborts.
                    read -r -p "$prompt: " input || [ -n "$input" ] || return 1
                    while [ -z "$input" ]; do
                        print_error "This field is required!"
                        read -r -p "$prompt: " input || [ -n "$input" ] || return 1
                    done
                fi

                # Assign without eval: the answer is user-typed text, and eval would
                # execute embedded quotes, $(...) or backticks as shell code.
                printf -v "$var_name" '%s' "$input"
            }
         
     | 
| 62 | 
         
            +
             
     | 
| 63 | 
         
            +
            # Function to select from options
         
     | 
| 64 | 
         
            +
            select_option() {
                # Show a numbered menu and store the chosen option text in the
                # caller's variable.
                #
                # Arguments:
                #   $1        - prompt / menu title
                #   $2..$N-1  - the selectable options
                #   $N (last) - name of the variable that receives the chosen option
                #
                # Returns:
                #   0 after a valid choice; 1 if no options were supplied or stdin
                #   reaches end-of-file before a valid choice is made.
                local prompt="$1"
                local var_name="${!#}"

                if [ "$#" -lt 3 ]; then
                    print_error "select_option needs a prompt, at least one option, and a variable name"
                    return 1
                fi

                # Drop the trailing variable name: "${@:2}" alone would list it as a
                # bogus final menu entry.
                local options=("${@:2:$#-2}")

                echo "$prompt"
                for i in "${!options[@]}"; do
                    echo "  $((i+1)). ${options[$i]}"
                done

                while true; do
                    # -r keeps backslashes literal; a truly empty EOF aborts instead
                    # of re-prompting forever.
                    read -r -p "Enter your choice (1-${#options[@]}): " choice || [ -n "$choice" ] || return 1
                    if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -ge 1 ] && [ "$choice" -le "${#options[@]}" ]; then
                        # 10# forces base 10 so zero-padded input such as "08" is not
                        # parsed as invalid octal. printf -v assigns without eval, so
                        # option text is never executed as shell code.
                        printf -v "$var_name" '%s' "${options[$((10#$choice - 1))]}"
                        break
                    else
                        print_error "Invalid choice. Please enter a number between 1 and ${#options[@]}"
                    fi
                done
            }
         
     | 
| 84 | 
         
            +
             
     | 
| 85 | 
         
            +
            # Function to validate HF token
         
     | 
| 86 | 
         
            +
            validate_hf_token() {
                # Check a Hugging Face token by running `huggingface-cli whoami`.
                #
                # Arguments:
                #   $1 - the token to test
                #
                # Returns 0 when the CLI call succeeds, 1 when the token is empty or
                # the call fails.
                # Side effect: a non-empty token is exported as HF_TOKEN, whether or
                # not it turns out to be valid.
                local candidate="$1"

                # An empty token can never be valid; bail out before touching HF_TOKEN.
                [ -n "$candidate" ] || return 1

                # Test the token
                export HF_TOKEN="$candidate"
                huggingface-cli whoami >/dev/null 2>&1 && return 0

                # Collapse any CLI failure code to a plain 1.
                return 1
            }
         
     | 
| 100 | 
         
            +
             
     | 
| 101 | 
         
            +
            # Function to show training configurations
         
     | 
| 102 | 
         
            +
            show_training_configs() {
                # Print the menu of predefined training configurations (1-5) with
                # their headline hyper-parameters. Display only; takes no arguments.
                echo ""
                print_header "Available Training Configurations"

                # One printf call emits each remaining argument on its own line;
                # empty strings produce the blank separator lines.
                printf '%s\n' \
                    "======================================" \
                    "" \
                    "1. Basic Training (Default)" \
                    "   - Model: SmolLM3-3B" \
                    "   - Dataset: SmolTalk" \
                    "   - Epochs: 3" \
                    "   - Batch Size: 2" \
                    "   - Learning Rate: 5e-6" \
                    "" \
                    "2. H100 Lightweight (Rapid)" \
                    "   - Model: SmolLM3-3B" \
                    "   - Dataset: OpenHermes-FR (80K samples)" \
                    "   - Epochs: 1" \
                    "   - Batch Size: 16" \
                    "   - Learning Rate: 8e-6" \
                    "   - Sequence Length: 8192" \
                    "   - Optimized for H100 rapid training" \
                    "" \
                    "3. A100 Large Scale" \
                    "   - Model: SmolLM3-3B" \
                    "   - Dataset: OpenHermes-FR" \
                    "   - Epochs: 1.3 passes" \
                    "   - Batch Size: 8" \
                    "   - Learning Rate: 5e-6" \
                    "   - Sequence Length: 8192" \
                    "" \
                    "4. Multiple Passes" \
                    "   - Model: SmolLM3-3B" \
                    "   - Dataset: OpenHermes-FR" \
                    "   - Epochs: 4 passes" \
                    "   - Batch Size: 6" \
                    "   - Learning Rate: 3e-6" \
                    "   - Sequence Length: 8192" \
                    "" \
                    "5. Custom Configuration" \
                    "   - User-defined parameters" \
                    ""
            }
         
     | 
| 143 | 
         
            +
             
     | 
| 144 | 
         
            +
            # Function to get training configuration
#
# Set the global training variables for the preset named by $1:
#   MODEL_NAME, DATASET_NAME, MAX_EPOCHS, BATCH_SIZE,
#   GRADIENT_ACCUMULATION_STEPS, LEARNING_RATE, MAX_SEQ_LENGTH, CONFIG_FILE
# The H100 preset additionally sets DATASET_SAMPLE_SIZE.
# "Custom Configuration" delegates to get_custom_config.
#
# Arguments:
#   $1 - preset label (must match one of the case arms below exactly)
# Returns:
#   0 for a known preset (or get_custom_config's status for the custom one),
#   1 with a message on stderr for an unknown label, so that callers never
#   continue with an empty configuration.
get_training_config() {
    local config_type="$1"

    case "$config_type" in
        "Basic Training")
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
            DATASET_NAME="HuggingFaceTB/smoltalk"
            MAX_EPOCHS=3
            BATCH_SIZE=2
            GRADIENT_ACCUMULATION_STEPS=8
            LEARNING_RATE=5e-6
            MAX_SEQ_LENGTH=4096
            CONFIG_FILE="config/train_smollm3.py"
            ;;
        "H100 Lightweight (Rapid)")
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
            DATASET_NAME="legmlai/openhermes-fr"
            MAX_EPOCHS=1
            BATCH_SIZE=16
            GRADIENT_ACCUMULATION_STEPS=4
            LEARNING_RATE=8e-6
            MAX_SEQ_LENGTH=8192
            DATASET_SAMPLE_SIZE=80000
            CONFIG_FILE="config/train_smollm3_h100_lightweight.py"
            ;;
        "A100 Large Scale")
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
            DATASET_NAME="legmlai/openhermes-fr"
            MAX_EPOCHS=1
            BATCH_SIZE=8
            GRADIENT_ACCUMULATION_STEPS=16
            LEARNING_RATE=5e-6
            MAX_SEQ_LENGTH=8192
            CONFIG_FILE="config/train_smollm3_openhermes_fr_a100_large.py"
            ;;
        "Multiple Passes")
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
            DATASET_NAME="legmlai/openhermes-fr"
            MAX_EPOCHS=4
            BATCH_SIZE=6
            GRADIENT_ACCUMULATION_STEPS=20
            LEARNING_RATE=3e-6
            MAX_SEQ_LENGTH=8192
            CONFIG_FILE="config/train_smollm3_openhermes_fr_a100_multiple_passes.py"
            ;;
        "Custom Configuration")
            get_custom_config
            ;;
        *)
            # Without this arm an unrecognised label silently left every
            # variable unset and still reported success.
            echo "Unknown training configuration: '$config_type'" >&2
            return 1
            ;;
    esac
}
         
     | 
| 195 | 
         
            +
             
     | 
| 196 | 
         
            +
            # Function to get custom configuration
#
# Ask the user for each training hyperparameter and store the answers in the
# global variables MODEL_NAME, DATASET_NAME, MAX_EPOCHS, BATCH_SIZE,
# GRADIENT_ACCUMULATION_STEPS, LEARNING_RATE and MAX_SEQ_LENGTH, then choose
# CONFIG_FILE from the dataset name.
#
# NOTE(review): get_input is defined elsewhere in this script; its arguments
# appear to be <prompt> <default> <variable name> - confirm against its
# definition.
get_custom_config() {
    print_step "Custom Configuration Setup"
    echo "============================="

    get_input "Model name"                  "HuggingFaceTB/SmolLM3-3B" MODEL_NAME
    get_input "Dataset name"                "HuggingFaceTB/smoltalk"   DATASET_NAME
    get_input "Number of epochs"            "3"                        MAX_EPOCHS
    get_input "Batch size"                  "2"                        BATCH_SIZE
    get_input "Gradient accumulation steps" "8"                        GRADIENT_ACCUMULATION_STEPS
    get_input "Learning rate"               "5e-6"                     LEARNING_RATE
    get_input "Max sequence length"         "4096"                     MAX_SEQ_LENGTH

    # Any dataset whose name contains "openhermes" (case-sensitive) uses the
    # OpenHermes-FR config file; everything else uses the base config file.
    case "$DATASET_NAME" in
        *openhermes*)
            CONFIG_FILE="config/train_smollm3_openhermes_fr.py"
            ;;
        *)
            CONFIG_FILE="config/train_smollm3.py"
            ;;
    esac
}
         
     | 
| 216 | 
         
            +
             
     | 
| 217 | 
         
            +
            # Function to create training configuration file
#
# Render a Python training-config module to the path given in $1 from the
# global variables collected earlier in the script.
#
# Globals read:
#   required: MODEL_NAME, DATASET_NAME, MAX_SEQ_LENGTH, BATCH_SIZE,
#             GRADIENT_ACCUMULATION_STEPS, LEARNING_RATE
#   optional: SAVE_STEPS (default 500), EVAL_STEPS (default 100),
#             LOGGING_STEPS (default 10) - the same defaults the interactive
#             prompts offer
#   copied verbatim: TRAINING_CONFIG_TYPE, TRACKIO_URL, EXPERIMENT_NAME,
#             TRACKIO_DATASET_REPO
# MAX_EPOCHS is not written to the generated file (max_iters is emitted as
# None).
#
# Arguments:
#   $1 - output path of the generated module
# Returns:
#   0 on success; 1 (message on stderr, nothing written) when the path is
#   empty, when it names config/train_smollm3.py, or when a required global
#   is unset.
create_training_config() {
    local config_file="$1"
    local required

    if [ -z "$config_file" ]; then
        echo "create_training_config: no output path given" >&2
        return 1
    fi

    # The generated module imports SmolLM3Config from config/train_smollm3.py;
    # writing over that file would replace the class definition with a module
    # that imports from itself.
    case "$config_file" in
        config/train_smollm3.py|./config/train_smollm3.py|*/config/train_smollm3.py)
            echo "create_training_config: refusing to overwrite $config_file (the generated file imports SmolLM3Config from it)" >&2
            return 1
            ;;
    esac

    # An unset numeric would render as e.g. "batch_size=," which is not valid
    # Python, so fail before touching the file.
    for required in MODEL_NAME DATASET_NAME MAX_SEQ_LENGTH BATCH_SIZE \
                    GRADIENT_ACCUMULATION_STEPS LEARNING_RATE; do
        if [ -z "${!required:-}" ]; then
            echo "create_training_config: $required is not set" >&2
            return 1
        fi
    done

    mkdir -p "$(dirname "$config_file")" || return 1

    cat > "$config_file" << EOF
"""
SmolLM3 Training Configuration - Generated by launch.sh
Optimized for: $TRAINING_CONFIG_TYPE
"""

from config.train_smollm3 import SmolLM3Config

config = SmolLM3Config(
    # Model configuration
    model_name="$MODEL_NAME",
    max_seq_length=$MAX_SEQ_LENGTH,
    use_flash_attention=True,
    use_gradient_checkpointing=True,

    # Training configuration
    batch_size=$BATCH_SIZE,
    gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS,
    learning_rate=$LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=100,
    max_iters=None,  # Will be calculated based on epochs
    eval_interval=100,
    log_interval=10,
    save_interval=500,

    # Optimizer configuration
    optimizer="adamw",
    beta1=0.9,
    beta2=0.95,
    eps=1e-8,

    # Scheduler configuration
    scheduler="cosine",
    min_lr=1e-6,

    # Mixed precision
    fp16=True,
    bf16=False,

    # Logging and saving
    save_steps=${SAVE_STEPS:-500},
    eval_steps=${EVAL_STEPS:-100},
    logging_steps=${LOGGING_STEPS:-10},
    save_total_limit=3,

    # Evaluation
    eval_strategy="steps",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # Data configuration
    dataset_name="$DATASET_NAME",
    dataset_split="train",
    input_field="prompt",
    target_field="completion",
    filter_bad_entries=False,
    bad_entry_field="bad_entry",

    # Chat template configuration
    use_chat_template=True,
    chat_template_kwargs={
        "enable_thinking": False,
        "add_generation_prompt": True,
        "no_think_system_message": True
    },

    # Trackio monitoring configuration
    enable_tracking=True,
    trackio_url="$TRACKIO_URL",
    trackio_token=None,
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
    experiment_name="$EXPERIMENT_NAME",

    # HF Datasets configuration
    dataset_repo="$TRACKIO_DATASET_REPO"
)
EOF
}
         
     | 
| 303 | 
         
            +
             
     | 
| 304 | 
         
            +
            # Main script starts here
print_header "SmolLM3 End-to-End Fine-tuning Pipeline"
echo "=============================================="
echo ""

# Step 1: Get user credentials
# Collect the Hugging Face username and token into HF_USERNAME / HF_TOKEN
# and stop the pipeline immediately if the token does not validate.
print_step "Step 1: User Authentication"
echo "================================"

get_input "Hugging Face username" "" HF_USERNAME
get_input "Hugging Face token (get from https://huggingface.co/settings/tokens)" "" HF_TOKEN

# Validate HF token
print_info "Validating Hugging Face token..."
if ! validate_hf_token "$HF_TOKEN"; then
    print_error "Invalid HF token. Please check your token and try again."
    exit 1
fi
print_status "HF token validated successfully"
         
     | 
| 324 | 
         
            +
             
     | 
| 325 | 
         
            +
            # Step 2: Select training configuration
# Show the preset descriptions, let the user choose one, then load the
# matching variables through get_training_config.
# NOTE(review): select_option is defined elsewhere; it appears to store the
# chosen label in the variable named by its last argument - confirm.
print_step "Step 2: Training Configuration"
echo "=================================="

show_training_configs
select_option "Select training configuration:" \
    "Basic Training" \
    "H100 Lightweight (Rapid)" \
    "A100 Large Scale" \
    "Multiple Passes" \
    "Custom Configuration" \
    TRAINING_CONFIG_TYPE

get_training_config "$TRAINING_CONFIG_TYPE"

# Step 3: Get experiment details
# Defaults embed the current date/time and the username entered in Step 1.
print_step "Step 3: Experiment Details"
echo "=============================="

get_input "Experiment name" \
    "smollm3_finetune_$(date +%Y%m%d_%H%M%S)" \
    EXPERIMENT_NAME
get_input "Model repository name" \
    "${HF_USERNAME}/smollm3-finetuned-$(date +%Y%m%d)" \
    REPO_NAME
get_input "Trackio dataset repository" \
    "${HF_USERNAME}/trackio-experiments" \
    TRACKIO_DATASET_REPO
         
     | 
| 341 | 
         
            +
             
     | 
| 342 | 
         
            +
            # Step 4: Training parameters
# Echo the values loaded from the chosen preset, then ask for the
# checkpoint / evaluation / logging intervals.
print_step "Step 4: Training Parameters"
echo "==============================="

echo "Current configuration:"
echo "  Model: $MODEL_NAME"
echo "  Dataset: $DATASET_NAME"
# The sample size line is shown for the H100 preset only.
case "$TRAINING_CONFIG_TYPE" in
    "H100 Lightweight (Rapid)")
        echo "  Dataset Sample Size: ${DATASET_SAMPLE_SIZE:-80000}"
        ;;
esac
echo "  Epochs: $MAX_EPOCHS"
echo "  Batch Size: $BATCH_SIZE"
echo "  Gradient Accumulation: $GRADIENT_ACCUMULATION_STEPS"
echo "  Learning Rate: $LEARNING_RATE"
echo "  Sequence Length: $MAX_SEQ_LENGTH"

get_input "Save steps"       "500" SAVE_STEPS
get_input "Evaluation steps" "100" EVAL_STEPS
get_input "Logging steps"    "10"  LOGGING_STEPS

# Step 5: Trackio Space configuration
# The Space URL is built from the username and the Space name entered here.
print_step "Step 5: Trackio Space Configuration"
echo "======================================"

get_input "Trackio Space name" "trackio-monitoring-$(date +%Y%m%d)" TRACKIO_SPACE_NAME
TRACKIO_URL="https://huggingface.co/spaces/${HF_USERNAME}/${TRACKIO_SPACE_NAME}"
         
     | 
| 368 | 
         
            +
             
     | 
| 369 | 
         
            +
            # Step 6: Confirm configuration
# Print everything gathered so far and continue only on an explicit
# single-letter "y" or "Y"; any other answer (including empty) exits 0.
print_step "Step 6: Configuration Summary"
echo "================================="

echo ""
echo "📋 Configuration Summary:"
echo "========================"
echo "  User: $HF_USERNAME"
echo "  Experiment: $EXPERIMENT_NAME"
echo "  Model: $MODEL_NAME"
echo "  Dataset: $DATASET_NAME"
echo "  Training Config: $TRAINING_CONFIG_TYPE"
case "$TRAINING_CONFIG_TYPE" in
    "H100 Lightweight (Rapid)")
        echo "  Dataset Sample Size: ${DATASET_SAMPLE_SIZE:-80000}"
        ;;
esac
echo "  Epochs: $MAX_EPOCHS"
echo "  Batch Size: $BATCH_SIZE"
echo "  Learning Rate: $LEARNING_RATE"
echo "  Model Repo: $REPO_NAME"
echo "  Trackio Space: $TRACKIO_URL"
echo "  HF Dataset: $TRACKIO_DATASET_REPO"
echo ""

read -p "Proceed with this configuration? (y/N): " confirm
case "$confirm" in
    [Yy])
        # Confirmed: fall through to the setup steps.
        ;;
    *)
        print_info "Configuration cancelled. Exiting."
        exit 0
        ;;
esac
         
     | 
| 397 | 
         
            +
             
     | 
| 398 | 
         
            +
            # Step 7: Environment setup
# Install system packages, create and activate the smollm3_env virtualenv,
# then install PyTorch (CUDA 11.8 wheels) and the Python dependencies.
print_step "Step 7: Environment Setup"
echo "============================"

print_info "Installing system dependencies..."
sudo apt-get update
sudo apt-get install -y git curl wget unzip python3-pip python3-venv

print_info "Creating Python virtual environment..."
python3 -m venv smollm3_env
source smollm3_env/bin/activate

print_info "Installing PyTorch with CUDA support..."
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

print_info "Installing project dependencies..."
pip install -r requirements/requirements_core.txt

print_info "Installing additional dependencies..."
# The specifiers must be quoted: an unquoted ">" is a shell redirection, so
# `pip install trl>=0.7.0` installs an unconstrained "trl" and writes pip's
# output to a file named "=0.7.0".
pip install "trl>=0.7.0"
pip install "peft>=0.4.0"
pip install "accelerate>=0.20.0"
pip install "huggingface-hub>=0.16.0"
pip install "datasets>=2.14.0"
pip install "requests>=2.31.0"
         
     | 
| 423 | 
         
            +
             
     | 
| 424 | 
         
            +
            # Step 8: Authentication setup
# Export the token and dataset repo so the Python helper scripts started
# later inherit them, then log the Hugging Face CLI in.
print_step "Step 8: Authentication Setup"
echo "================================"

export HF_TOKEN
export TRACKIO_DATASET_REPO
# Quoted so the token is always passed as exactly one argument (no word
# splitting or glob expansion, and an empty value stays an argument).
huggingface-cli login --token "$HF_TOKEN"
         
     | 
| 431 | 
         
            +
             
     | 
| 432 | 
         
            +
            # Step 9: Deploy Trackio Space
# Run the deployment helper from scripts/trackio_tonic, answering its three
# prompts (username, Space name, token) on stdin. The working directory is
# left at scripts/trackio_tonic for the following steps.
print_step "Step 9: Deploying Trackio Space"
echo "==================================="

cd scripts/trackio_tonic || {
    print_error "Directory scripts/trackio_tonic not found"
    exit 1
}

# Pipe the answers instead of writing them to deploy_input.txt, so the
# access token is never stored in a file inside the repository checkout.
# Only report success when the helper actually exits successfully.
if printf '%s\n' "$HF_USERNAME" "$TRACKIO_SPACE_NAME" "$HF_TOKEN" \
        | python deploy_trackio_space.py; then
    print_status "Trackio Space deployed: $TRACKIO_URL"
else
    print_error "Trackio Space deployment failed: $TRACKIO_URL"
    exit 1
fi
         
     | 
| 449 | 
         
            +
             
     | 
| 450 | 
         
            +
            # Step 10: Setup HF Dataset
# Runs from scripts/dataset_tonic (reached relative to the directory the
# previous step left us in).
print_step "Step 10: Setting up HF Dataset"
echo "=================================="

# Abort rather than run the helper from the wrong directory if cd fails.
cd ../dataset_tonic || {
    print_error "Directory ../dataset_tonic not found"
    exit 1
}
python setup_hf_dataset.py

# Step 11: Configure Trackio
# Returns to scripts/trackio_tonic; the next step relies on ending here.
print_step "Step 11: Configuring Trackio"
echo "================================="

cd ../trackio_tonic || {
    print_error "Directory ../trackio_tonic not found"
    exit 1
}
python configure_trackio.py
         
     | 
| 463 | 
         
            +
             
     | 
| 464 | 
         
            +
            # Step 12: Create training configuration
# Go back up two levels (to the directory the pipeline started in) and
# write the generated configuration module.
print_step "Step 12: Creating Training Configuration"
echo "==========================================="

cd ../.. || {
    print_error "Cannot return to the project root directory"
    exit 1
}

# The generated module starts with
#   from config.train_smollm3 import SmolLM3Config
# For the presets whose CONFIG_FILE is config/train_smollm3.py, writing the
# generated text there would replace the module that defines SmolLM3Config
# with one that imports the class from itself. Write to a sibling file
# instead and point CONFIG_FILE at it for the steps that follow.
# NOTE(review): assumes later steps locate the config via $CONFIG_FILE -
# confirm against the remainder of the script.
if [ "$CONFIG_FILE" = "config/train_smollm3.py" ]; then
    CONFIG_FILE="config/train_smollm3_generated.py"
    print_info "Writing generated configuration to $CONFIG_FILE"
fi

create_training_config "$CONFIG_FILE"
         
     | 
| 470 | 
         
            +
             
     | 
| 471 | 
         
            +
            # Step 13: Download and prepare dataset
         
     | 
| 472 | 
         
            +
            print_step "Step 13: Preparing Dataset"
         
     | 
| 473 | 
         
            +
            echo "==============================="
         
     | 
| 474 | 
         
            +
             
     | 
| 475 | 
         
            +
            python -c "
         
     | 
| 476 | 
         
            +
            from datasets import load_dataset
         
     | 
| 477 | 
         
            +
            import json
         
     | 
| 478 | 
         
            +
            import os
         
     | 
| 479 | 
         
            +
            import random
         
     | 
| 480 | 
         
            +
             
     | 
| 481 | 
         
            +
            # Load dataset
         
     | 
| 482 | 
         
            +
            print('Loading dataset: $DATASET_NAME')
         
     | 
| 483 | 
         
            +
            dataset = load_dataset('$DATASET_NAME')
         
     | 
| 484 | 
         
            +
             
     | 
| 485 | 
         
            +
            # Create dataset directory
         
     | 
| 486 | 
         
            +
            os.makedirs('training_dataset', exist_ok=True)
         
     | 
| 487 | 
         
            +
             
     | 
| 488 | 
         
            +
            # Convert to training format
         
     | 
| 489 | 
         
            +
            def convert_to_training_format(example):
                '''Convert one dataset record into a prompt/completion dict.

                Recognised layouts, checked in this order:
                  1. prompt + completion keys
                  2. instruction + output keys
                  3. messages list (first entry -> prompt, second -> completion)
                  4. anything else: str() of the input and output keys,
                     defaulting to empty strings

                Always returns a dict with prompt and completion keys. A messages
                record with fewer than two entries yields empty strings, so the
                truthiness filter in the calling loop drops it instead of failing
                on a None result.
                '''
                if 'prompt' in example and 'completion' in example:
                    return {
                        'prompt': example['prompt'],
                        'completion': example['completion'],
                    }

                if 'instruction' in example and 'output' in example:
                    return {
                        'prompt': example['instruction'],
                        'completion': example['output'],
                    }

                if 'messages' in example:
                    # Chat format. NOTE(review): roles are not inspected; the first two
                    # entries are taken as prompt and completion - confirm this matches
                    # the datasets in use (a leading system message would be misread).
                    messages = example['messages'] or []
                    if len(messages) >= 2:
                        return {
                            'prompt': messages[0]['content'],
                            'completion': messages[1]['content'],
                        }
                    # Too short to form a pair: return empty strings so the caller skips it.
                    return {'prompt': '', 'completion': ''}

                # Fallback for unrecognised layouts.
                return {
                    'prompt': str(example.get('input', '')),
                    'completion': str(example.get('output', '')),
                }
         
     | 
| 515 | 
         
            +
             
     | 
| 516 | 
         
            +
            # Process train split
         
     | 
| 517 | 
         
            +
            train_data = []
         
     | 
| 518 | 
         
            +
            for example in dataset['train']:
         
     | 
| 519 | 
         
            +
                training_example = convert_to_training_format(example)
         
     | 
| 520 | 
         
            +
                if training_example['prompt'] and training_example['completion']:
         
     | 
| 521 | 
         
            +
                    train_data.append(training_example)
         
     | 
| 522 | 
         
            +
             
     | 
| 523 | 
         
            +
            # Apply dataset sampling for lightweight configuration
         
     | 
| 524 | 
         
            +
            if '$TRAINING_CONFIG_TYPE' == 'H100 Lightweight (Rapid)' and len(train_data) > ${DATASET_SAMPLE_SIZE:-0}:
         
     | 
| 525 | 
         
            +
                print(f'Sampling {${DATASET_SAMPLE_SIZE:-80000}} random samples from {len(train_data)} total samples')
         
     | 
| 526 | 
         
            +
                random.seed(42)  # For reproducibility
         
     | 
| 527 | 
         
            +
                train_data = random.sample(train_data, ${DATASET_SAMPLE_SIZE:-80000})
         
     | 
| 528 | 
         
            +
                print(f'Selected {len(train_data)} samples for lightweight training')
         
     | 
| 529 | 
         
            +
             
     | 
| 530 | 
         
            +
            # Process validation split if available
         
     | 
| 531 | 
         
            +
            val_data = []
         
     | 
| 532 | 
         
            +
            if 'validation' in dataset:
         
     | 
| 533 | 
         
            +
                for example in dataset['validation']:
         
     | 
| 534 | 
         
            +
                    training_example = convert_to_training_format(example)
         
     | 
| 535 | 
         
            +
                    if training_example['prompt'] and training_example['completion']:
         
     | 
| 536 | 
         
            +
                        val_data.append(training_example)
         
     | 
| 537 | 
         
            +
             
     | 
| 538 | 
         
            +
            # For lightweight config, also sample validation if it's large
         
     | 
| 539 | 
         
            +
            if '$TRAINING_CONFIG_TYPE' == 'H100 Lightweight (Rapid)' and len(val_data) > 1000:
         
     | 
| 540 | 
         
            +
                print(f'Sampling 1000 random validation samples from {len(val_data)} total')
         
     | 
| 541 | 
         
            +
                random.seed(42)  # For reproducibility
         
     | 
| 542 | 
         
            +
                val_data = random.sample(val_data, 1000)
         
     | 
| 543 | 
         
            +
             
     | 
| 544 | 
         
            +
            # Save to files
         
     | 
| 545 | 
         
            +
            with open('training_dataset/train.json', 'w') as f:
         
     | 
| 546 | 
         
            +
                json.dump(train_data, f, indent=2)
         
     | 
| 547 | 
         
            +
             
     | 
| 548 | 
         
            +
            if val_data:
         
     | 
| 549 | 
         
            +
                with open('training_dataset/validation.json', 'w') as f:
         
     | 
| 550 | 
         
            +
                    json.dump(val_data, f, indent=2)
         
     | 
| 551 | 
         
            +
             
     | 
| 552 | 
         
            +
            print(f'Dataset prepared: {len(train_data)} train samples, {len(val_data)} validation samples')
         
     | 
| 553 | 
         
            +
            "
         
     | 
| 554 | 
         
            +
             
     | 
| 555 | 
         
            +
            # Step 14: Calculate training parameters
         
     | 
| 556 | 
         
            +
            print_step "Step 14: Calculating Training Parameters"
         
     | 
| 557 | 
         
            +
            echo "============================================"
         
     | 
| 558 | 
         
            +
             
     | 
| 559 | 
         
            +
            TOTAL_SAMPLES=$(python -c "import json; data=json.load(open('training_dataset/train.json')); print(len(data))")
         
     | 
| 560 | 
         
            +
            EFFECTIVE_BATCH_SIZE=$((BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))
         
     | 
| 561 | 
         
            +
            STEPS_PER_EPOCH=$((TOTAL_SAMPLES / EFFECTIVE_BATCH_SIZE))
         
     | 
| 562 | 
         
            +
            MAX_STEPS=$((STEPS_PER_EPOCH * MAX_EPOCHS))
         
     | 
| 563 | 
         
            +
             
     | 
| 564 | 
         
            +
            echo "  Total samples: $TOTAL_SAMPLES"
         
     | 
| 565 | 
         
            +
            echo "  Effective batch size: $EFFECTIVE_BATCH_SIZE"
         
     | 
| 566 | 
         
            +
            echo "  Steps per epoch: $STEPS_PER_EPOCH"
         
     | 
| 567 | 
         
            +
            echo "  Total training steps: $MAX_STEPS"
         
     | 
| 568 | 
         
            +
             
     | 
| 569 | 
         
            +
            # Step 15: Start training
         
     | 
| 570 | 
         
            +
            print_step "Step 15: Starting Training"
         
     | 
| 571 | 
         
            +
            echo "=============================="
         
     | 
| 572 | 
         
            +
             
     | 
| 573 | 
         
            +
            python src/train.py "$CONFIG_FILE" \
         
     | 
| 574 | 
         
            +
                --dataset_dir training_dataset \
         
     | 
| 575 | 
         
            +
                --out_dir /output-checkpoint \
         
     | 
| 576 | 
         
            +
                --init_from scratch \
         
     | 
| 577 | 
         
            +
                --max_iters $MAX_STEPS \
         
     | 
| 578 | 
         
            +
                --batch_size $BATCH_SIZE \
         
     | 
| 579 | 
         
            +
                --learning_rate $LEARNING_RATE \
         
     | 
| 580 | 
         
            +
                --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
         
     | 
| 581 | 
         
            +
                --max_seq_length $MAX_SEQ_LENGTH \
         
     | 
| 582 | 
         
            +
                --save_steps $SAVE_STEPS \
         
     | 
| 583 | 
         
            +
                --eval_steps $EVAL_STEPS \
         
     | 
| 584 | 
         
            +
                --logging_steps $LOGGING_STEPS \
         
     | 
| 585 | 
         
            +
                --enable_tracking \
         
     | 
| 586 | 
         
            +
                --trackio_url "$TRACKIO_URL" \
         
     | 
| 587 | 
         
            +
                --experiment_name "$EXPERIMENT_NAME" \
         
     | 
| 588 | 
         
            +
                --hf_token "$HF_TOKEN" \
         
     | 
| 589 | 
         
            +
                --dataset_repo "$TRACKIO_DATASET_REPO"
         
     | 
| 590 | 
         
            +
             
     | 
| 591 | 
         
            +
            # Step 16: Push model to Hugging Face Hub
         
     | 
| 592 | 
         
            +
            print_step "Step 16: Pushing Model to HF Hub"
         
     | 
| 593 | 
         
            +
            echo "====================================="
         
     | 
| 594 | 
         
            +
             
     | 
| 595 | 
         
            +
            python scripts/model_tonic/push_to_huggingface.py /output-checkpoint "$REPO_NAME" \
         
     | 
| 596 | 
         
            +
                --token "$HF_TOKEN" \
         
     | 
| 597 | 
         
            +
                --trackio-url "$TRACKIO_URL" \
         
     | 
| 598 | 
         
            +
                --experiment-name "$EXPERIMENT_NAME" \
         
     | 
| 599 | 
         
            +
                --dataset-repo "$TRACKIO_DATASET_REPO"
         
     | 
| 600 | 
         
            +
             
     | 
| 601 | 
         
            +
            # Step 17: Test the uploaded model
         
     | 
| 602 | 
         
            +
            print_step "Step 17: Testing Uploaded Model"
         
     | 
| 603 | 
         
            +
            echo "==================================="
         
     | 
| 604 | 
         
            +
             
     | 
| 605 | 
         
            +
            python -c "
         
     | 
| 606 | 
         
            +
            from transformers import AutoModelForCausalLM, AutoTokenizer
         
     | 
| 607 | 
         
            +
            import torch
         
     | 
| 608 | 
         
            +
             
     | 
| 609 | 
         
            +
            print('Loading uploaded model...')
         
     | 
| 610 | 
         
            +
            model = AutoModelForCausalLM.from_pretrained('$REPO_NAME', torch_dtype=torch.float16, device_map='auto')
         
     | 
| 611 | 
         
            +
            tokenizer = AutoTokenizer.from_pretrained('$REPO_NAME')
         
     | 
| 612 | 
         
            +
             
     | 
| 613 | 
         
            +
            print('Testing model generation...')
         
     | 
| 614 | 
         
            +
            prompt = 'Hello, how are you?'
         
     | 
| 615 | 
         
            +
            inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
         
     | 
| 616 | 
         
            +
            outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)
         
     | 
| 617 | 
         
            +
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         
     | 
| 618 | 
         
            +
            print(f'Prompt: {prompt}')
         
     | 
| 619 | 
         
            +
            print(f'Response: {response}')
         
     | 
| 620 | 
         
            +
            print('✅ Model test completed successfully!')
         
     | 
| 621 | 
         
            +
            "
         
     | 
| 622 | 
         
            +
             
     | 
| 623 | 
         
            +
            # Step 18: Create summary report
         
     | 
| 624 | 
         
            +
            print_step "Step 18: Creating Summary Report"
         
     | 
| 625 | 
         
            +
            echo "===================================="
         
     | 
| 626 | 
         
            +
             
     | 
| 627 | 
         
            +
            cat > training_summary.md << EOF
         
     | 
| 628 | 
         
            +
            # SmolLM3 Fine-tuning Summary
         
     | 
| 629 | 
         
            +
             
     | 
| 630 | 
         
            +
            ## Configuration
         
     | 
| 631 | 
         
            +
            - **Model**: $MODEL_NAME
         
     | 
| 632 | 
         
            +
            - **Dataset**: $DATASET_NAME
         
     | 
| 633 | 
         
            +
            - **Experiment**: $EXPERIMENT_NAME
         
     | 
| 634 | 
         
            +
            - **Repository**: $REPO_NAME
         
     | 
| 635 | 
         
            +
            - **Trackio Space**: $TRACKIO_URL
         
     | 
| 636 | 
         
            +
            - **HF Dataset**: $TRACKIO_DATASET_REPO
         
     | 
| 637 | 
         
            +
            - **Training Config**: $TRAINING_CONFIG_TYPE
         
     | 
| 638 | 
         
            +
            $(if [ "$TRAINING_CONFIG_TYPE" = "H100 Lightweight (Rapid)" ]; then
         
     | 
| 639 | 
         
            +
            echo "- **Dataset Sample Size**: ${DATASET_SAMPLE_SIZE:-80000}"
         
     | 
| 640 | 
         
            +
            fi)
         
     | 
| 641 | 
         
            +
             
     | 
| 642 | 
         
            +
            ## Training Parameters
         
     | 
| 643 | 
         
            +
            - **Batch Size**: $BATCH_SIZE
         
     | 
| 644 | 
         
            +
            - **Gradient Accumulation**: $GRADIENT_ACCUMULATION_STEPS
         
     | 
| 645 | 
         
            +
            - **Learning Rate**: $LEARNING_RATE
         
     | 
| 646 | 
         
            +
            - **Max Epochs**: $MAX_EPOCHS
         
     | 
| 647 | 
         
            +
            - **Max Steps**: $MAX_STEPS
         
     | 
| 648 | 
         
            +
            - **Total Samples**: $TOTAL_SAMPLES
         
     | 
| 649 | 
         
            +
            - **Sequence Length**: $MAX_SEQ_LENGTH
         
     | 
| 650 | 
         
            +
             
     | 
| 651 | 
         
            +
            ## Results
         
     | 
| 652 | 
         
            +
            - **Model Repository**: https://huggingface.co/$REPO_NAME
         
     | 
| 653 | 
         
            +
            - **Trackio Monitoring**: $TRACKIO_URL
         
     | 
| 654 | 
         
            +
            - **Experiment Data**: https://huggingface.co/datasets/$TRACKIO_DATASET_REPO
         
     | 
| 655 | 
         
            +
             
     | 
| 656 | 
         
            +
            ## Next Steps
         
     | 
| 657 | 
         
            +
            1. Monitor training progress in your Trackio Space
         
     | 
| 658 | 
         
            +
            2. Check the model repository on Hugging Face Hub
         
     | 
| 659 | 
         
            +
            3. Use the model in your applications
         
     | 
| 660 | 
         
            +
            4. Share your results with the community
         
     | 
| 661 | 
         
            +
             
     | 
| 662 | 
         
            +
            ## Files Created
         
     | 
| 663 | 
         
            +
            - Training configuration: \`$CONFIG_FILE\`
         
     | 
| 664 | 
         
            +
            - Dataset: \`training_dataset/\`
         
     | 
| 665 | 
         
            +
            - Model checkpoint: \`/output-checkpoint/\`
         
     | 
| 666 | 
         
            +
            - Training logs: \`training.log\`
         
     | 
| 667 | 
         
            +
            - Summary report: \`training_summary.md\`
         
     | 
| 668 | 
         
            +
            EOF
         
     | 
| 669 | 
         
            +
             
     | 
| 670 | 
         
            +
            print_status "Summary report saved to: training_summary.md"
         
     | 
| 671 | 
         
            +
             
     | 
| 672 | 
         
            +
            # Final summary
         
     | 
| 673 | 
         
            +
            echo ""
         
     | 
| 674 | 
         
            +
            print_header "🎉 End-to-End Pipeline Completed Successfully!"
         
     | 
| 675 | 
         
            +
            echo "=================================================="
         
     | 
| 676 | 
         
            +
            echo ""
         
     | 
| 677 | 
         
            +
            echo "📊 Model: https://huggingface.co/$REPO_NAME"
         
     | 
| 678 | 
         
            +
            echo "📈 Trackio: $TRACKIO_URL"
         
     | 
| 679 | 
         
            +
            echo "📋 Experiment: $EXPERIMENT_NAME"
         
     | 
| 680 | 
         
            +
            echo "📊 Dataset: https://huggingface.co/datasets/$TRACKIO_DATASET_REPO"
         
     | 
| 681 | 
         
            +
            echo ""
         
     | 
| 682 | 
         
            +
            echo "📋 Summary report saved to: training_summary.md"
         
     | 
| 683 | 
         
            +
            echo ""
         
     | 
| 684 | 
         
            +
            echo "🚀 Next steps:"
         
     | 
| 685 | 
         
            +
            echo "1. Monitor training progress in your Trackio Space"
         
     | 
| 686 | 
         
            +
            echo "2. Check the model repository on Hugging Face Hub"
         
     | 
| 687 | 
         
            +
            echo "3. Use the model in your applications"
         
     | 
| 688 | 
         
            +
            echo "4. Share your results with the community"
         
     | 
| 689 | 
         
            +
            echo ""
         
     | 
| 690 | 
         
            +
            print_status "Pipeline completed successfully!" 
         
     | 
    	
        requirements.txt → requirements/requirements.txt
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        requirements_core.txt → requirements/requirements_core.txt
    RENAMED
    
    | 
         @@ -9,6 +9,12 @@ tokenizers>=0.13.0 
     | 
|
| 9 | 
         
             
            bitsandbytes>=0.41.0
         
     | 
| 10 | 
         
             
            numpy>=1.24.0
         
     | 
| 11 | 
         
             
            tqdm>=4.65.0
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 12 | 
         
             
            trackio>=0.1.0
         
     | 
| 13 | 
         
             
            psutil>=5.9.0 
         
     | 
| 14 | 
         
            -
            pynvml>=12.0.0
         
     | 
| 
         | 
|
| 9 | 
         
             
            bitsandbytes>=0.41.0
         
     | 
| 10 | 
         
             
            numpy>=1.24.0
         
     | 
| 11 | 
         
             
            tqdm>=4.65.0
         
     | 
| 12 | 
         
            +
             
     | 
| 13 | 
         
            +
             
     | 
| 14 | 
         
            +
            # Monitoring dependencies
         
     | 
| 15 | 
         
            +
            requests>=2.31.0
         
     | 
| 16 | 
         
            +
            pandas>=2.0.0
         
     | 
| 17 | 
         
            +
            plotly>=5.0.0
         
     | 
| 18 | 
         
             
            trackio>=0.1.0
         
     | 
| 19 | 
         
             
            psutil>=5.9.0 
         
     | 
| 20 | 
         
            +
            pynvml>=12.0.0
         
     | 
    	
        requirements_minimal.txt → requirements/requirements_minimal.txt
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        add_demo_data.py → scripts/dataset_tonic/add_demo_data.py
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        scripts/dataset_tonic/setup_hf_dataset.py
    ADDED
    
    | 
         @@ -0,0 +1,275 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #!/usr/bin/env python3
         
     | 
| 2 | 
         
            +
            """
         
     | 
| 3 | 
         
            +
            Setup script for Hugging Face Dataset repository for Trackio experiments
         
     | 
| 4 | 
         
            +
            """
         
     | 
| 5 | 
         
            +
             
     | 
| 6 | 
         
            +
            import os
         
     | 
| 7 | 
         
            +
            import json
         
     | 
| 8 | 
         
            +
            from datetime import datetime
         
     | 
| 9 | 
         
            +
            from datasets import Dataset
         
     | 
| 10 | 
         
            +
            from huggingface_hub import HfApi
         
     | 
| 11 | 
         
            +
             
     | 
| 12 | 
         
            +
            def setup_trackio_dataset():
         
     | 
| 13 | 
         
            +
                """Set up the Trackio experiments dataset on Hugging Face Hub"""
         
     | 
| 14 | 
         
            +
                
         
     | 
| 15 | 
         
            +
                # Configuration - get from environment variables with fallbacks
         
     | 
| 16 | 
         
            +
                dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
         
     | 
| 17 | 
         
            +
                hf_token = os.environ.get('HF_TOKEN')
         
     | 
| 18 | 
         
            +
                
         
     | 
| 19 | 
         
            +
                if not hf_token:
         
     | 
| 20 | 
         
            +
                    print("❌ HF_TOKEN not found. Please set the HF_TOKEN environment variable.")
         
     | 
| 21 | 
         
            +
                    print("You can get your token from: https://huggingface.co/settings/tokens")
         
     | 
| 22 | 
         
            +
                    return False
         
     | 
| 23 | 
         
            +
                
         
     | 
| 24 | 
         
            +
                print(f"🚀 Setting up Trackio dataset: {dataset_repo}")
         
     | 
| 25 | 
         
            +
                print(f"🔧 Using dataset repository: {dataset_repo}")
         
     | 
| 26 | 
         
            +
                
         
     | 
| 27 | 
         
            +
                # Initial experiment data
         
     | 
| 28 | 
         
            +
                initial_experiments = [
         
     | 
| 29 | 
         
            +
                    {
         
     | 
| 30 | 
         
            +
                        'experiment_id': 'exp_20250720_130853',
         
     | 
| 31 | 
         
            +
                        'name': 'petite-elle-l-aime-3',
         
     | 
| 32 | 
         
            +
                        'description': 'SmolLM3 fine-tuning experiment',
         
     | 
| 33 | 
         
            +
                        'created_at': '2025-07-20T11:20:01.780908',
         
     | 
| 34 | 
         
            +
                        'status': 'running',
         
     | 
| 35 | 
         
            +
                        'metrics': json.dumps([
         
     | 
| 36 | 
         
            +
                            {
         
     | 
| 37 | 
         
            +
                                'timestamp': '2025-07-20T11:20:01.780908',
         
     | 
| 38 | 
         
            +
                                'step': 25,
         
     | 
| 39 | 
         
            +
                                'metrics': {
         
     | 
| 40 | 
         
            +
                                    'loss': 1.1659,
         
     | 
| 41 | 
         
            +
                                    'grad_norm': 10.3125,
         
     | 
| 42 | 
         
            +
                                    'learning_rate': 7e-08,
         
     | 
| 43 | 
         
            +
                                    'num_tokens': 1642080.0,
         
     | 
| 44 | 
         
            +
                                    'mean_token_accuracy': 0.75923578992486,
         
     | 
| 45 | 
         
            +
                                    'epoch': 0.004851130919895701
         
     | 
| 46 | 
         
            +
                                }
         
     | 
| 47 | 
         
            +
                            },
         
     | 
| 48 | 
         
            +
                            {
         
     | 
| 49 | 
         
            +
                                'timestamp': '2025-07-20T11:26:39.042155',
         
     | 
| 50 | 
         
            +
                                'step': 50,
         
     | 
| 51 | 
         
            +
                                'metrics': {
         
     | 
| 52 | 
         
            +
                                    'loss': 1.165,
         
     | 
| 53 | 
         
            +
                                    'grad_norm': 10.75,
         
     | 
| 54 | 
         
            +
                                    'learning_rate': 1.4291666666666667e-07,
         
     | 
| 55 | 
         
            +
                                    'num_tokens': 3324682.0,
         
     | 
| 56 | 
         
            +
                                    'mean_token_accuracy': 0.7577659255266189,
         
     | 
| 57 | 
         
            +
                                    'epoch': 0.009702261839791402
         
     | 
| 58 | 
         
            +
                                }
         
     | 
| 59 | 
         
            +
                            },
         
     | 
| 60 | 
         
            +
                            {
         
     | 
| 61 | 
         
            +
                                'timestamp': '2025-07-20T11:33:16.203045',
         
     | 
| 62 | 
         
            +
                                'step': 75,
         
     | 
| 63 | 
         
            +
                                'metrics': {
         
     | 
| 64 | 
         
            +
                                    'loss': 1.1639,
         
     | 
| 65 | 
         
            +
                                    'grad_norm': 10.6875,
         
     | 
| 66 | 
         
            +
                                    'learning_rate': 2.1583333333333334e-07,
         
     | 
| 67 | 
         
            +
                                    'num_tokens': 4987941.0,
         
     | 
| 68 | 
         
            +
                                    'mean_token_accuracy': 0.7581205774843692,
         
     | 
| 69 | 
         
            +
                                    'epoch': 0.014553392759687101
         
     | 
| 70 | 
         
            +
                                }
         
     | 
| 71 | 
         
            +
                            },
         
     | 
| 72 | 
         
            +
                            {
         
     | 
| 73 | 
         
            +
                                'timestamp': '2025-07-20T11:39:53.453917',
         
     | 
| 74 | 
         
            +
                                'step': 100,
         
     | 
| 75 | 
         
            +
                                'metrics': {
         
     | 
| 76 | 
         
            +
                                    'loss': 1.1528,
         
     | 
| 77 | 
         
            +
                                    'grad_norm': 10.75,
         
     | 
| 78 | 
         
            +
                                    'learning_rate': 2.8875e-07,
         
     | 
| 79 | 
         
            +
                                    'num_tokens': 6630190.0,
         
     | 
| 80 | 
         
            +
                                    'mean_token_accuracy': 0.7614579878747463,
         
     | 
| 81 | 
         
            +
                                    'epoch': 0.019404523679582803
         
     | 
| 82 | 
         
            +
                                }
         
     | 
| 83 | 
         
            +
                            }
         
     | 
| 84 | 
         
            +
                        ]),
         
     | 
| 85 | 
         
            +
                        'parameters': json.dumps({
         
     | 
| 86 | 
         
            +
                            'model_name': 'HuggingFaceTB/SmolLM3-3B',
         
     | 
| 87 | 
         
            +
                            'max_seq_length': 12288,
         
     | 
| 88 | 
         
            +
                            'use_flash_attention': True,
         
     | 
| 89 | 
         
            +
                            'use_gradient_checkpointing': False,
         
     | 
| 90 | 
         
            +
                            'batch_size': 8,
         
     | 
| 91 | 
         
            +
                            'gradient_accumulation_steps': 16,
         
     | 
| 92 | 
         
            +
                            'learning_rate': 3.5e-06,
         
     | 
| 93 | 
         
            +
                            'weight_decay': 0.01,
         
     | 
| 94 | 
         
            +
                            'warmup_steps': 1200,
         
     | 
| 95 | 
         
            +
                            'max_iters': 18000,
         
     | 
| 96 | 
         
            +
                            'eval_interval': 1000,
         
     | 
| 97 | 
         
            +
                            'log_interval': 25,
         
     | 
| 98 | 
         
            +
                            'save_interval': 2000,
         
     | 
| 99 | 
         
            +
                            'optimizer': 'adamw_torch',
         
     | 
| 100 | 
         
            +
                            'beta1': 0.9,
         
     | 
| 101 | 
         
            +
                            'beta2': 0.999,
         
     | 
| 102 | 
         
            +
                            'eps': 1e-08,
         
     | 
| 103 | 
         
            +
                            'scheduler': 'cosine',
         
     | 
| 104 | 
         
            +
                            'min_lr': 3.5e-07,
         
     | 
| 105 | 
         
            +
                            'fp16': False,
         
     | 
| 106 | 
         
            +
                            'bf16': True,
         
     | 
| 107 | 
         
            +
                            'ddp_backend': 'nccl',
         
     | 
| 108 | 
         
            +
                            'ddp_find_unused_parameters': False,
         
     | 
| 109 | 
         
            +
                            'save_steps': 2000,
         
     | 
| 110 | 
         
            +
                            'eval_steps': 1000,
         
     | 
| 111 | 
         
            +
                            'logging_steps': 25,
         
     | 
| 112 | 
         
            +
                            'save_total_limit': 5,
         
     | 
| 113 | 
         
            +
                            'eval_strategy': 'steps',
         
     | 
| 114 | 
         
            +
                            'metric_for_best_model': 'eval_loss',
         
     | 
| 115 | 
         
            +
                            'greater_is_better': False,
         
     | 
| 116 | 
         
            +
                            'load_best_model_at_end': True,
         
     | 
| 117 | 
         
            +
                            'data_dir': None,
         
     | 
| 118 | 
         
            +
                            'train_file': None,
         
     | 
| 119 | 
         
            +
                            'validation_file': None,
         
     | 
| 120 | 
         
            +
                            'test_file': None,
         
     | 
| 121 | 
         
            +
                            'use_chat_template': True,
         
     | 
| 122 | 
         
            +
                            'chat_template_kwargs': {'add_generation_prompt': True, 'no_think_system_message': True},
         
     | 
| 123 | 
         
            +
                            'enable_tracking': True,
         
     | 
| 124 | 
         
            +
                            'trackio_url': 'https://tonic-test-trackio-test.hf.space',
         
     | 
| 125 | 
         
            +
                            'trackio_token': None,
         
     | 
| 126 | 
         
            +
                            'log_artifacts': True,
         
     | 
| 127 | 
         
            +
                            'log_metrics': True,
         
     | 
| 128 | 
         
            +
                            'log_config': True,
         
     | 
| 129 | 
         
            +
                            'experiment_name': 'petite-elle-l-aime-3',
         
     | 
| 130 | 
         
            +
                            'dataset_name': 'legmlai/openhermes-fr',
         
     | 
| 131 | 
         
            +
                            'dataset_split': 'train',
         
     | 
| 132 | 
         
            +
                            'input_field': 'prompt',
         
     | 
| 133 | 
         
            +
                            'target_field': 'accepted_completion',
         
     | 
| 134 | 
         
            +
                            'filter_bad_entries': True,
         
     | 
| 135 | 
         
            +
                            'bad_entry_field': 'bad_entry',
         
     | 
| 136 | 
         
            +
                            'packing': False,
         
     | 
| 137 | 
         
            +
                            'max_prompt_length': 12288,
         
     | 
| 138 | 
         
            +
                            'max_completion_length': 8192,
         
     | 
| 139 | 
         
            +
                            'truncation': True,
         
     | 
| 140 | 
         
            +
                            'dataloader_num_workers': 10,
         
     | 
| 141 | 
         
            +
                            'dataloader_pin_memory': True,
         
     | 
| 142 | 
         
            +
                            'dataloader_prefetch_factor': 3,
         
     | 
| 143 | 
         
            +
                            'max_grad_norm': 1.0,
         
     | 
| 144 | 
         
            +
                            'group_by_length': True
         
     | 
| 145 | 
         
            +
                        }),
         
     | 
| 146 | 
         
            +
                        'artifacts': json.dumps([]),
         
     | 
| 147 | 
         
            +
                        'logs': json.dumps([]),
         
     | 
| 148 | 
         
            +
                        'last_updated': datetime.now().isoformat()
         
     | 
| 149 | 
         
            +
                    },
         
     | 
| 150 | 
         
            +
                    {
         
     | 
| 151 | 
         
            +
                        'experiment_id': 'exp_20250720_134319',
         
     | 
| 152 | 
         
            +
                        'name': 'petite-elle-l-aime-3-1',
         
     | 
| 153 | 
         
            +
                        'description': 'SmolLM3 fine-tuning experiment',
         
     | 
| 154 | 
         
            +
                        'created_at': '2025-07-20T11:54:31.993219',
         
     | 
| 155 | 
         
            +
                        'status': 'running',
         
     | 
| 156 | 
         
            +
                        'metrics': json.dumps([
         
     | 
| 157 | 
         
            +
                            {
         
     | 
| 158 | 
         
            +
                                'timestamp': '2025-07-20T11:54:31.993219',
         
     | 
| 159 | 
         
            +
                                'step': 25,
         
     | 
| 160 | 
         
            +
                                'metrics': {
         
     | 
| 161 | 
         
            +
                                    'loss': 1.166,
         
     | 
| 162 | 
         
            +
                                    'grad_norm': 10.375,
         
     | 
| 163 | 
         
            +
                                    'learning_rate': 7e-08,
         
     | 
| 164 | 
         
            +
                                    'num_tokens': 1642080.0,
         
     | 
| 165 | 
         
            +
                                    'mean_token_accuracy': 0.7590958896279335,
         
     | 
| 166 | 
         
            +
                                    'epoch': 0.004851130919895701
         
     | 
| 167 | 
         
            +
                                }
         
     | 
| 168 | 
         
            +
                            },
         
     | 
| 169 | 
         
            +
                            {
         
     | 
| 170 | 
         
            +
                                'timestamp': '2025-07-20T11:54:33.589487',
         
     | 
| 171 | 
         
            +
                                'step': 25,
         
     | 
| 172 | 
         
            +
                                'metrics': {
         
     | 
| 173 | 
         
            +
                                    'gpu_0_memory_allocated': 17.202261447906494,
         
     | 
| 174 | 
         
            +
                                    'gpu_0_memory_reserved': 75.474609375,
         
     | 
| 175 | 
         
            +
                                    'gpu_0_utilization': 0,
         
     | 
| 176 | 
         
            +
                                    'cpu_percent': 2.7,
         
     | 
| 177 | 
         
            +
                                    'memory_percent': 10.1
         
     | 
| 178 | 
         
            +
                                }
         
     | 
| 179 | 
         
            +
                            }
         
     | 
| 180 | 
         
            +
                        ]),
         
     | 
| 181 | 
         
            +
                        'parameters': json.dumps({
         
     | 
| 182 | 
         
            +
                            'model_name': 'HuggingFaceTB/SmolLM3-3B',
         
     | 
| 183 | 
         
            +
                            'max_seq_length': 12288,
         
     | 
| 184 | 
         
            +
                            'use_flash_attention': True,
         
     | 
| 185 | 
         
            +
                            'use_gradient_checkpointing': False,
         
     | 
| 186 | 
         
            +
                            'batch_size': 8,
         
     | 
| 187 | 
         
            +
                            'gradient_accumulation_steps': 16,
         
     | 
| 188 | 
         
            +
                            'learning_rate': 3.5e-06,
         
     | 
| 189 | 
         
            +
                            'weight_decay': 0.01,
         
     | 
| 190 | 
         
            +
                            'warmup_steps': 1200,
         
     | 
| 191 | 
         
            +
                            'max_iters': 18000,
         
     | 
| 192 | 
         
            +
                            'eval_interval': 1000,
         
     | 
| 193 | 
         
            +
                            'log_interval': 25,
         
     | 
| 194 | 
         
            +
                            'save_interval': 2000,
         
     | 
| 195 | 
         
            +
                            'optimizer': 'adamw_torch',
         
     | 
| 196 | 
         
            +
                            'beta1': 0.9,
         
     | 
| 197 | 
         
            +
                            'beta2': 0.999,
         
     | 
| 198 | 
         
            +
                            'eps': 1e-08,
         
     | 
| 199 | 
         
            +
                            'scheduler': 'cosine',
         
     | 
| 200 | 
         
            +
                            'min_lr': 3.5e-07,
         
     | 
| 201 | 
         
            +
                            'fp16': False,
         
     | 
| 202 | 
         
            +
                            'bf16': True,
         
     | 
| 203 | 
         
            +
                            'ddp_backend': 'nccl',
         
     | 
| 204 | 
         
            +
                            'ddp_find_unused_parameters': False,
         
     | 
| 205 | 
         
            +
                            'save_steps': 2000,
         
     | 
| 206 | 
         
            +
                            'eval_steps': 1000,
         
     | 
| 207 | 
         
            +
                            'logging_steps': 25,
         
     | 
| 208 | 
         
            +
                            'save_total_limit': 5,
         
     | 
| 209 | 
         
            +
                            'eval_strategy': 'steps',
         
     | 
| 210 | 
         
            +
                            'metric_for_best_model': 'eval_loss',
         
     | 
| 211 | 
         
            +
                            'greater_is_better': False,
         
     | 
| 212 | 
         
            +
                            'load_best_model_at_end': True,
         
     | 
| 213 | 
         
            +
                            'data_dir': None,
         
     | 
| 214 | 
         
            +
                            'train_file': None,
         
     | 
| 215 | 
         
            +
                            'validation_file': None,
         
     | 
| 216 | 
         
            +
                            'test_file': None,
         
     | 
| 217 | 
         
            +
                            'use_chat_template': True,
         
     | 
| 218 | 
         
            +
                            'chat_template_kwargs': {'add_generation_prompt': True, 'no_think_system_message': True},
         
     | 
| 219 | 
         
            +
                            'enable_tracking': True,
         
     | 
| 220 | 
         
            +
                            'trackio_url': 'https://tonic-test-trackio-test.hf.space',
         
     | 
| 221 | 
         
            +
                            'trackio_token': None,
         
     | 
| 222 | 
         
            +
                            'log_artifacts': True,
         
     | 
| 223 | 
         
            +
                            'log_metrics': True,
         
     | 
| 224 | 
         
            +
                            'log_config': True,
         
     | 
| 225 | 
         
            +
                            'experiment_name': 'petite-elle-l-aime-3-1',
         
     | 
| 226 | 
         
            +
                            'dataset_name': 'legmlai/openhermes-fr',
         
     | 
| 227 | 
         
            +
                            'dataset_split': 'train',
         
     | 
| 228 | 
         
            +
                            'input_field': 'prompt',
         
     | 
| 229 | 
         
            +
                            'target_field': 'accepted_completion',
         
     | 
| 230 | 
         
            +
                            'filter_bad_entries': True,
         
     | 
| 231 | 
         
            +
                            'bad_entry_field': 'bad_entry',
         
     | 
| 232 | 
         
            +
                            'packing': False,
         
     | 
| 233 | 
         
            +
                            'max_prompt_length': 12288,
         
     | 
| 234 | 
         
            +
                            'max_completion_length': 8192,
         
     | 
| 235 | 
         
            +
                            'truncation': True,
         
     | 
| 236 | 
         
            +
                            'dataloader_num_workers': 10,
         
     | 
| 237 | 
         
            +
                            'dataloader_pin_memory': True,
         
     | 
| 238 | 
         
            +
                            'dataloader_prefetch_factor': 3,
         
     | 
| 239 | 
         
            +
                            'max_grad_norm': 1.0,
         
     | 
| 240 | 
         
            +
                            'group_by_length': True
         
     | 
| 241 | 
         
            +
                        }),
         
     | 
| 242 | 
         
            +
                        'artifacts': json.dumps([]),
         
     | 
| 243 | 
         
            +
                        'logs': json.dumps([]),
         
     | 
| 244 | 
         
            +
                        'last_updated': datetime.now().isoformat()
         
     | 
| 245 | 
         
            +
                    }
         
     | 
| 246 | 
         
            +
                ]
         
     | 
| 247 | 
         
            +
                
         
     | 
| 248 | 
         
            +
                try:
         
     | 
| 249 | 
         
            +
                    # Create dataset
         
     | 
| 250 | 
         
            +
                    dataset = Dataset.from_list(initial_experiments)
         
     | 
| 251 | 
         
            +
                    
         
     | 
| 252 | 
         
            +
                    # Push to HF Hub
         
     | 
| 253 | 
         
            +
                    api = HfApi(token=hf_token)
         
     | 
| 254 | 
         
            +
                    dataset.push_to_hub(
         
     | 
| 255 | 
         
            +
                        dataset_repo,
         
     | 
| 256 | 
         
            +
                        token=hf_token,
         
     | 
| 257 | 
         
            +
                        private=True  # Make it private for security
         
     | 
| 258 | 
         
            +
                    )
         
     | 
| 259 | 
         
            +
                    
         
     | 
| 260 | 
         
            +
                    print(f"✅ Successfully created dataset: {dataset_repo}")
         
     | 
| 261 | 
         
            +
                    print(f"📊 Added {len(initial_experiments)} experiments")
         
     | 
| 262 | 
         
            +
                    print("🔒 Dataset is private (only accessible with your token)")
         
     | 
| 263 | 
         
            +
                    print("\n🎯 Next steps:")
         
     | 
| 264 | 
         
            +
                    print("1. Set HF_TOKEN in your Hugging Face Space environment")
         
     | 
| 265 | 
         
            +
                    print("2. Deploy the updated app.py to your Space")
         
     | 
| 266 | 
         
            +
                    print("3. The app will now load experiments from the dataset")
         
     | 
| 267 | 
         
            +
                    
         
     | 
| 268 | 
         
            +
                    return True
         
     | 
| 269 | 
         
            +
                    
         
     | 
| 270 | 
         
            +
                except Exception as e:
         
     | 
| 271 | 
         
            +
                    print(f"❌ Failed to create dataset: {e}")
         
     | 
| 272 | 
         
            +
                    return False
         
     | 
| 273 | 
         
            +
             
     | 
| 274 | 
         
            +
            if __name__ == "__main__":
         
     | 
| 275 | 
         
            +
                setup_trackio_dataset() 
         
     | 
    	
        push_to_huggingface.py → scripts/model_tonic/push_to_huggingface.py
    RENAMED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 1 | 
         
             
            #!/usr/bin/env python3
         
     | 
| 2 | 
         
             
            """
         
     | 
| 3 | 
         
             
            Push Trained Model and Results to Hugging Face Hub
         
     | 
| 4 | 
         
            -
            Integrates with Trackio monitoring and  
     | 
| 5 | 
         
             
            """
         
     | 
| 6 | 
         | 
| 7 | 
         
             
            import os
         
     | 
| 
         @@ -23,6 +23,9 @@ except ImportError: 
     | 
|
| 23 | 
         
             
                print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")
         
     | 
| 24 | 
         | 
| 25 | 
         
             
            try:
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 26 | 
         
             
                from monitoring import SmolLM3Monitor
         
     | 
| 27 | 
         
             
                MONITORING_AVAILABLE = True
         
     | 
| 28 | 
         
             
            except ImportError:
         
     | 
| 
         @@ -32,7 +35,7 @@ except ImportError: 
     | 
|
| 32 | 
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 33 | 
         | 
| 34 | 
         
             
            class HuggingFacePusher:
         
     | 
| 35 | 
         
            -
                """Push trained models and results to Hugging Face Hub"""
         
     | 
| 36 | 
         | 
| 37 | 
         
             
                def __init__(
         
     | 
| 38 | 
         
             
                    self,
         
     | 
| 
         @@ -41,15 +44,21 @@ class HuggingFacePusher: 
     | 
|
| 41 | 
         
             
                    token: Optional[str] = None,
         
     | 
| 42 | 
         
             
                    private: bool = False,
         
     | 
| 43 | 
         
             
                    trackio_url: Optional[str] = None,
         
     | 
| 44 | 
         
            -
                    experiment_name: Optional[str] = None
         
     | 
| 
         | 
|
| 
         | 
|
| 45 | 
         
             
                ):
         
     | 
| 46 | 
         
             
                    self.model_path = Path(model_path)
         
     | 
| 47 | 
         
             
                    self.repo_name = repo_name
         
     | 
| 48 | 
         
            -
                    self.token = token or os.getenv('HF_TOKEN')
         
     | 
| 49 | 
         
             
                    self.private = private
         
     | 
| 50 | 
         
             
                    self.trackio_url = trackio_url
         
     | 
| 51 | 
         
             
                    self.experiment_name = experiment_name
         
     | 
| 52 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 53 | 
         
             
                    # Initialize HF API
         
     | 
| 54 | 
         
             
                    if HF_AVAILABLE:
         
     | 
| 55 | 
         
             
                        self.api = HfApi(token=self.token)
         
     | 
| 
         @@ -58,14 +67,17 @@ class HuggingFacePusher: 
     | 
|
| 58 | 
         | 
| 59 | 
         
             
                    # Initialize monitoring if available
         
     | 
| 60 | 
         
             
                    self.monitor = None
         
     | 
| 61 | 
         
            -
                    if MONITORING_AVAILABLE 
     | 
| 62 | 
         
             
                        self.monitor = SmolLM3Monitor(
         
     | 
| 63 | 
         
             
                            experiment_name=experiment_name or "model_push",
         
     | 
| 64 | 
         
             
                            trackio_url=trackio_url,
         
     | 
| 65 | 
         
            -
                            enable_tracking= 
     | 
| 
         | 
|
| 
         | 
|
| 66 | 
         
             
                        )
         
     | 
| 67 | 
         | 
| 68 | 
         
             
                    logger.info(f"Initialized HuggingFacePusher for {repo_name}")
         
     | 
| 
         | 
|
| 69 | 
         | 
| 70 | 
         
             
                def create_repository(self) -> bool:
         
     | 
| 71 | 
         
             
                    """Create the Hugging Face repository"""
         
     | 
| 
         @@ -131,6 +143,7 @@ This is a fine-tuned SmolLM3 model based on the HuggingFaceTB/SmolLM3-3B archite 
     | 
|
| 131 | 
         
             
            - **Fine-tuning Method**: Supervised Fine-tuning
         
     | 
| 132 | 
         
             
            - **Training Date**: {datetime.now().strftime('%Y-%m-%d')}
         
     | 
| 133 | 
         
             
            - **Model Size**: {self._get_model_size():.1f} GB
         
     | 
| 
         | 
|
| 134 | 
         | 
| 135 | 
         
             
            ## Training Configuration
         
     | 
| 136 | 
         | 
| 
         @@ -166,6 +179,7 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 
     | 
|
| 166 | 
         
             
            - **Training Time**: {results.get('training_time_hours', 'Unknown')} hours
         
     | 
| 167 | 
         
             
            - **Final Loss**: {results.get('final_loss', 'Unknown')}
         
     | 
| 168 | 
         
             
            - **Final Accuracy**: {results.get('final_accuracy', 'Unknown')}
         
     | 
| 
         | 
|
| 169 | 
         | 
| 170 | 
         
             
            ## Model Performance
         
     | 
| 171 | 
         | 
| 
         @@ -173,6 +187,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 
     | 
|
| 173 | 
         
             
            - **Validation Loss**: {results.get('eval_loss', 'Unknown')}
         
     | 
| 174 | 
         
             
            - **Training Steps**: {results.get('total_steps', 'Unknown')}
         
     | 
| 175 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 176 | 
         
             
            ## Limitations and Biases
         
     | 
| 177 | 
         | 
| 178 | 
         
             
            This model is fine-tuned for specific tasks and may not generalize well to all use cases. Please evaluate the model's performance on your specific task before deployment.
         
     | 
| 
         @@ -293,6 +311,7 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 
     | 
|
| 293 | 
         
             
            - **Model Size**: {self._get_model_size():.1f} GB
         
     | 
| 294 | 
         
             
            - **Training Steps**: {results.get('total_steps', 'Unknown')}
         
     | 
| 295 | 
         
             
            - **Final Loss**: {results.get('final_loss', 'Unknown')}
         
     | 
| 
         | 
|
| 296 | 
         | 
| 297 | 
         
             
            ## Training Configuration
         
     | 
| 298 | 
         | 
| 
         @@ -306,6 +325,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 
     | 
|
| 306 | 
         
             
            {json.dumps(results, indent=2)}
         
     | 
| 307 | 
         
             
            ```
         
     | 
| 308 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 309 | 
         
             
            ## Files
         
     | 
| 310 | 
         | 
| 311 | 
         
             
            - `pytorch_model.bin`: Model weights
         
     | 
| 
         @@ -327,8 +350,8 @@ MIT License 
     | 
|
| 327 | 
         
             
                        upload_file(
         
     | 
| 328 | 
         
             
                            path_or_fileobj=str(readme_path),
         
     | 
| 329 | 
         
             
                            path_in_repo="README.md",
         
     | 
| 330 | 
         
            -
                             
     | 
| 331 | 
         
            -
                             
     | 
| 332 | 
         
             
                        )
         
     | 
| 333 | 
         | 
| 334 | 
         
             
                        # Clean up
         
     | 
| 
         @@ -342,23 +365,36 @@ MIT License 
     | 
|
| 342 | 
         
             
                        return False
         
     | 
| 343 | 
         | 
| 344 | 
         
             
                def log_to_trackio(self, action: str, details: Dict[str, Any]):
         
     | 
| 345 | 
         
            -
                    """Log push action to Trackio"""
         
     | 
| 346 | 
         
             
                    if self.monitor:
         
     | 
| 347 | 
         
             
                        try:
         
     | 
| 
         | 
|
| 348 | 
         
             
                            self.monitor.log_metrics({
         
     | 
| 349 | 
         
             
                                "push_action": action,
         
     | 
| 350 | 
         
             
                                "repo_name": self.repo_name,
         
     | 
| 351 | 
         
             
                                "model_size_gb": self._get_model_size(),
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 352 | 
         
             
                                **details
         
     | 
| 353 | 
         
             
                            })
         
     | 
| 354 | 
         
            -
                             
     | 
| 
         | 
|
| 355 | 
         
             
                        except Exception as e:
         
     | 
| 356 | 
         
             
                            logger.error(f"❌ Failed to log to Trackio: {e}")
         
     | 
| 357 | 
         | 
| 358 | 
         
             
                def push_model(self, training_config: Optional[Dict[str, Any]] = None, 
         
     | 
| 359 | 
         
             
                               results: Optional[Dict[str, Any]] = None) -> bool:
         
     | 
| 360 | 
         
            -
                    """Complete model push process"""
         
     | 
| 361 | 
         
             
                    logger.info(f"🚀 Starting model push to {self.repo_name}")
         
     | 
| 
         | 
|
| 362 | 
         | 
| 363 | 
         
             
                    # Validate model path
         
     | 
| 364 | 
         
             
                    if not self.validate_model_path():
         
     | 
| 
         @@ -399,7 +435,7 @@ MIT License 
     | 
|
| 399 | 
         
             
                    if results:
         
     | 
| 400 | 
         
             
                        self.upload_training_results(str(self.model_path))
         
     | 
| 401 | 
         | 
| 402 | 
         
            -
                    # Log to Trackio
         
     | 
| 403 | 
         
             
                    self.log_to_trackio("model_push", {
         
     | 
| 404 | 
         
             
                        "model_path": str(self.model_path),
         
     | 
| 405 | 
         
             
                        "repo_name": self.repo_name,
         
     | 
| 
         @@ -409,6 +445,7 @@ MIT License 
     | 
|
| 409 | 
         
             
                    })
         
     | 
| 410 | 
         | 
| 411 | 
         
             
                    logger.info(f"🎉 Model successfully pushed to: https://huggingface.co/{self.repo_name}")
         
     | 
| 
         | 
|
| 412 | 
         
             
                    return True
         
     | 
| 413 | 
         | 
| 414 | 
         
             
                def _load_training_config(self) -> Dict[str, Any]:
         
     | 
| 
         @@ -437,9 +474,11 @@ def parse_args(): 
     | 
|
| 437 | 
         | 
| 438 | 
         
             
                # Optional arguments
         
     | 
| 439 | 
         
             
                parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
         
     | 
| 
         | 
|
| 440 | 
         
             
                parser.add_argument('--private', action='store_true', help='Make repository private')
         
     | 
| 441 | 
         
             
                parser.add_argument('--trackio-url', type=str, default=None, help='Trackio Space URL for logging')
         
     | 
| 442 | 
         
             
                parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for Trackio')
         
     | 
| 
         | 
|
| 443 | 
         | 
| 444 | 
         
             
                return parser.parse_args()
         
     | 
| 445 | 
         | 
| 
         @@ -463,7 +502,9 @@ def main(): 
     | 
|
| 463 | 
         
             
                        token=args.token,
         
     | 
| 464 | 
         
             
                        private=args.private,
         
     | 
| 465 | 
         
             
                        trackio_url=args.trackio_url,
         
     | 
| 466 | 
         
            -
                        experiment_name=args.experiment_name
         
     | 
| 
         | 
|
| 
         | 
|
| 467 | 
         
             
                    )
         
     | 
| 468 | 
         | 
| 469 | 
         
             
                    # Push model
         
     | 
| 
         @@ -472,6 +513,8 @@ def main(): 
     | 
|
| 472 | 
         
             
                    if success:
         
     | 
| 473 | 
         
             
                        logger.info("✅ Model push completed successfully!")
         
     | 
| 474 | 
         
             
                        logger.info(f"🌐 View your model at: https://huggingface.co/{args.repo_name}")
         
     | 
| 
         | 
|
| 
         | 
|
| 475 | 
         
             
                    else:
         
     | 
| 476 | 
         
             
                        logger.error("❌ Model push failed!")
         
     | 
| 477 | 
         
             
                        return 1
         
     | 
| 
         | 
|
| 1 | 
         
             
            #!/usr/bin/env python3
         
     | 
| 2 | 
         
             
            """
         
     | 
| 3 | 
         
             
            Push Trained Model and Results to Hugging Face Hub
         
     | 
| 4 | 
         
            +
            Integrates with Trackio monitoring and HF Datasets for complete model deployment
         
     | 
| 5 | 
         
             
            """
         
     | 
| 6 | 
         | 
| 7 | 
         
             
            import os
         
     | 
| 
         | 
|
| 23 | 
         
             
                print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")
         
     | 
| 24 | 
         | 
| 25 | 
         
             
            try:
         
     | 
| 26 | 
         
            +
                import sys
         
     | 
| 27 | 
         
            +
                import os
         
     | 
| 28 | 
         
            +
                sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
         
     | 
| 29 | 
         
             
                from monitoring import SmolLM3Monitor
         
     | 
| 30 | 
         
             
                MONITORING_AVAILABLE = True
         
     | 
| 31 | 
         
             
            except ImportError:
         
     | 
| 
         | 
|
| 35 | 
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 36 | 
         | 
| 37 | 
         
             
            class HuggingFacePusher:
         
     | 
| 38 | 
         
            +
                """Push trained models and results to Hugging Face Hub with HF Datasets integration"""
         
     | 
| 39 | 
         | 
| 40 | 
         
             
                def __init__(
         
     | 
| 41 | 
         
             
                    self,
         
     | 
| 
         | 
|
| 44 | 
         
             
                    token: Optional[str] = None,
         
     | 
| 45 | 
         
             
                    private: bool = False,
         
     | 
| 46 | 
         
             
                    trackio_url: Optional[str] = None,
         
     | 
| 47 | 
         
            +
                    experiment_name: Optional[str] = None,
         
     | 
| 48 | 
         
            +
                    dataset_repo: Optional[str] = None,
         
     | 
| 49 | 
         
            +
                    hf_token: Optional[str] = None
         
     | 
| 50 | 
         
             
                ):
         
     | 
| 51 | 
         
             
                    self.model_path = Path(model_path)
         
     | 
| 52 | 
         
             
                    self.repo_name = repo_name
         
     | 
| 53 | 
         
            +
                    self.token = token or hf_token or os.getenv('HF_TOKEN')
         
     | 
| 54 | 
         
             
                    self.private = private
         
     | 
| 55 | 
         
             
                    self.trackio_url = trackio_url
         
     | 
| 56 | 
         
             
                    self.experiment_name = experiment_name
         
     | 
| 57 | 
         | 
| 58 | 
         
            +
                    # HF Datasets configuration
         
     | 
| 59 | 
         
            +
                    self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
         
     | 
| 60 | 
         
            +
                    self.hf_token = hf_token or os.getenv('HF_TOKEN')
         
     | 
| 61 | 
         
            +
                    
         
     | 
| 62 | 
         
             
                    # Initialize HF API
         
     | 
| 63 | 
         
             
                    if HF_AVAILABLE:
         
     | 
| 64 | 
         
             
                        self.api = HfApi(token=self.token)
         
     | 
| 
         | 
|
| 67 | 
         | 
| 68 | 
         
             
                    # Initialize monitoring if available
         
     | 
| 69 | 
         
             
                    self.monitor = None
         
     | 
| 70 | 
         
            +
                    if MONITORING_AVAILABLE:
         
     | 
| 71 | 
         
             
                        self.monitor = SmolLM3Monitor(
         
     | 
| 72 | 
         
             
                            experiment_name=experiment_name or "model_push",
         
     | 
| 73 | 
         
             
                            trackio_url=trackio_url,
         
     | 
| 74 | 
         
            +
                            enable_tracking=bool(trackio_url),
         
     | 
| 75 | 
         
            +
                            hf_token=self.hf_token,
         
     | 
| 76 | 
         
            +
                            dataset_repo=self.dataset_repo
         
     | 
| 77 | 
         
             
                        )
         
     | 
| 78 | 
         | 
| 79 | 
         
             
                    logger.info(f"Initialized HuggingFacePusher for {repo_name}")
         
     | 
| 80 | 
         
            +
                    logger.info(f"Dataset repository: {self.dataset_repo}")
         
     | 
| 81 | 
         | 
| 82 | 
         
             
                def create_repository(self) -> bool:
         
     | 
| 83 | 
         
             
                    """Create the Hugging Face repository"""
         
     | 
| 
         | 
|
| 143 | 
         
             
            - **Fine-tuning Method**: Supervised Fine-tuning
         
     | 
| 144 | 
         
             
            - **Training Date**: {datetime.now().strftime('%Y-%m-%d')}
         
     | 
| 145 | 
         
             
            - **Model Size**: {self._get_model_size():.1f} GB
         
     | 
| 146 | 
         
            +
            - **Dataset Repository**: {self.dataset_repo}
         
     | 
| 147 | 
         | 
| 148 | 
         
             
            ## Training Configuration
         
     | 
| 149 | 
         | 
| 
         | 
|
| 179 | 
         
             
            - **Training Time**: {results.get('training_time_hours', 'Unknown')} hours
         
     | 
| 180 | 
         
             
            - **Final Loss**: {results.get('final_loss', 'Unknown')}
         
     | 
| 181 | 
         
             
            - **Final Accuracy**: {results.get('final_accuracy', 'Unknown')}
         
     | 
| 182 | 
         
            +
            - **Dataset Repository**: {self.dataset_repo}
         
     | 
| 183 | 
         | 
| 184 | 
         
             
            ## Model Performance
         
     | 
| 185 | 
         | 
| 
         | 
|
| 187 | 
         
             
            - **Validation Loss**: {results.get('eval_loss', 'Unknown')}
         
     | 
| 188 | 
         
             
            - **Training Steps**: {results.get('total_steps', 'Unknown')}
         
     | 
| 189 | 
         | 
| 190 | 
         
            +
            ## Experiment Tracking
         
     | 
| 191 | 
         
            +
             
     | 
| 192 | 
         
            +
            This model was trained with experiment tracking enabled. Training metrics and configuration are stored in the HF Dataset repository: `{self.dataset_repo}`
         
     | 
| 193 | 
         
            +
             
     | 
| 194 | 
         
             
            ## Limitations and Biases
         
     | 
| 195 | 
         | 
| 196 | 
         
             
            This model is fine-tuned for specific tasks and may not generalize well to all use cases. Please evaluate the model's performance on your specific task before deployment.
         
     | 
| 
         | 
|
| 311 | 
         
             
            - **Model Size**: {self._get_model_size():.1f} GB
         
     | 
| 312 | 
         
             
            - **Training Steps**: {results.get('total_steps', 'Unknown')}
         
     | 
| 313 | 
         
             
            - **Final Loss**: {results.get('final_loss', 'Unknown')}
         
     | 
| 314 | 
         
            +
            - **Dataset Repository**: {self.dataset_repo}
         
     | 
| 315 | 
         | 
| 316 | 
         
             
            ## Training Configuration
         
     | 
| 317 | 
         | 
| 
         | 
|
| 325 | 
         
             
            {json.dumps(results, indent=2)}
         
     | 
| 326 | 
         
             
            ```
         
     | 
| 327 | 
         | 
| 328 | 
         
            +
            ## Experiment Tracking
         
     | 
| 329 | 
         
            +
             
     | 
| 330 | 
         
            +
            Training metrics and configuration are stored in the HF Dataset repository: `{self.dataset_repo}`
         
     | 
| 331 | 
         
            +
             
     | 
| 332 | 
         
             
            ## Files
         
     | 
| 333 | 
         | 
| 334 | 
         
             
            - `pytorch_model.bin`: Model weights
         
     | 
| 
         | 
|
| 350 | 
         
             
                        upload_file(
         
     | 
| 351 | 
         
             
                            path_or_fileobj=str(readme_path),
         
     | 
| 352 | 
         
             
                            path_in_repo="README.md",
         
     | 
| 353 | 
         
            +
                            token=self.token,
         
     | 
| 354 | 
         
            +
                            repo_id=self.repo_name
         
     | 
| 355 | 
         
             
                        )
         
     | 
| 356 | 
         | 
| 357 | 
         
             
                        # Clean up
         
     | 
| 
         | 
|
| 365 | 
         
             
                        return False
         
     | 
| 366 | 
         | 
| 367 | 
         
             
                def log_to_trackio(self, action: str, details: Dict[str, Any]):
         
     | 
| 368 | 
         
            +
                    """Log push action to Trackio and HF Datasets"""
         
     | 
| 369 | 
         
             
                    if self.monitor:
         
     | 
| 370 | 
         
             
                        try:
         
     | 
| 371 | 
         
            +
                            # Log to Trackio
         
     | 
| 372 | 
         
             
                            self.monitor.log_metrics({
         
     | 
| 373 | 
         
             
                                "push_action": action,
         
     | 
| 374 | 
         
             
                                "repo_name": self.repo_name,
         
     | 
| 375 | 
         
             
                                "model_size_gb": self._get_model_size(),
         
     | 
| 376 | 
         
            +
                                "dataset_repo": self.dataset_repo,
         
     | 
| 377 | 
         
            +
                                **details
         
     | 
| 378 | 
         
            +
                            })
         
     | 
| 379 | 
         
            +
                            
         
     | 
| 380 | 
         
            +
                            # Log training summary
         
     | 
| 381 | 
         
            +
                            self.monitor.log_training_summary({
         
     | 
| 382 | 
         
            +
                                "model_push": True,
         
     | 
| 383 | 
         
            +
                                "model_repo": self.repo_name,
         
     | 
| 384 | 
         
            +
                                "dataset_repo": self.dataset_repo,
         
     | 
| 385 | 
         
            +
                                "push_date": datetime.now().isoformat(),
         
     | 
| 386 | 
         
             
                                **details
         
     | 
| 387 | 
         
             
                            })
         
     | 
| 388 | 
         
            +
                            
         
     | 
| 389 | 
         
            +
                            logger.info(f"✅ Logged {action} to Trackio and HF Datasets")
         
     | 
| 390 | 
         
             
                        except Exception as e:
         
     | 
| 391 | 
         
             
                            logger.error(f"❌ Failed to log to Trackio: {e}")
         
     | 
| 392 | 
         | 
| 393 | 
         
             
                def push_model(self, training_config: Optional[Dict[str, Any]] = None, 
         
     | 
| 394 | 
         
             
                               results: Optional[Dict[str, Any]] = None) -> bool:
         
     | 
| 395 | 
         
            +
                    """Complete model push process with HF Datasets integration"""
         
     | 
| 396 | 
         
             
                    logger.info(f"🚀 Starting model push to {self.repo_name}")
         
     | 
| 397 | 
         
            +
                    logger.info(f"📊 Dataset repository: {self.dataset_repo}")
         
     | 
| 398 | 
         | 
| 399 | 
         
             
                    # Validate model path
         
     | 
| 400 | 
         
             
                    if not self.validate_model_path():
         
     | 
| 
         | 
|
| 435 | 
         
             
                    if results:
         
     | 
| 436 | 
         
             
                        self.upload_training_results(str(self.model_path))
         
     | 
| 437 | 
         | 
| 438 | 
         
            +
                    # Log to Trackio and HF Datasets
         
     | 
| 439 | 
         
             
                    self.log_to_trackio("model_push", {
         
     | 
| 440 | 
         
             
                        "model_path": str(self.model_path),
         
     | 
| 441 | 
         
             
                        "repo_name": self.repo_name,
         
     | 
| 
         | 
|
| 445 | 
         
             
                    })
         
     | 
| 446 | 
         | 
| 447 | 
         
             
                    logger.info(f"🎉 Model successfully pushed to: https://huggingface.co/{self.repo_name}")
         
     | 
| 448 | 
         
            +
                    logger.info(f"📊 Experiment data stored in: {self.dataset_repo}")
         
     | 
| 449 | 
         
             
                    return True
         
     | 
| 450 | 
         | 
| 451 | 
         
             
                def _load_training_config(self) -> Dict[str, Any]:
         
     | 
| 
         | 
|
| 474 | 
         | 
| 475 | 
         
             
                # Optional arguments
         
     | 
| 476 | 
         
             
                parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
         
     | 
| 477 | 
         
            +
                parser.add_argument('--hf-token', type=str, default=None, help='Hugging Face token (alternative to --token)')
         
     | 
| 478 | 
         
             
                parser.add_argument('--private', action='store_true', help='Make repository private')
         
     | 
| 479 | 
         
             
                parser.add_argument('--trackio-url', type=str, default=None, help='Trackio Space URL for logging')
         
     | 
| 480 | 
         
             
                parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for Trackio')
         
     | 
| 481 | 
         
            +
                parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
         
     | 
| 482 | 
         | 
| 483 | 
         
             
                return parser.parse_args()
         
     | 
| 484 | 
         | 
| 
         | 
|
| 502 | 
         
             
                        token=args.token,
         
     | 
| 503 | 
         
             
                        private=args.private,
         
     | 
| 504 | 
         
             
                        trackio_url=args.trackio_url,
         
     | 
| 505 | 
         
            +
                        experiment_name=args.experiment_name,
         
     | 
| 506 | 
         
            +
                        dataset_repo=args.dataset_repo,
         
     | 
| 507 | 
         
            +
                        hf_token=args.hf_token
         
     | 
| 508 | 
         
             
                    )
         
     | 
| 509 | 
         | 
| 510 | 
         
             
                    # Push model
         
     | 
| 
         | 
|
| 513 | 
         
             
                    if success:
         
     | 
| 514 | 
         
             
                        logger.info("✅ Model push completed successfully!")
         
     | 
| 515 | 
         
             
                        logger.info(f"🌐 View your model at: https://huggingface.co/{args.repo_name}")
         
     | 
| 516 | 
         
            +
                        if args.dataset_repo:
         
     | 
| 517 | 
         
            +
                            logger.info(f"📊 View experiment data at: https://huggingface.co/datasets/{args.dataset_repo}")
         
     | 
| 518 | 
         
             
                    else:
         
     | 
| 519 | 
         
             
                        logger.error("❌ Model push failed!")
         
     | 
| 520 | 
         
             
                        return 1
         
     | 
    	
        scripts/trackio_tonic/configure_trackio.py
    ADDED
    
    | 
         @@ -0,0 +1,145 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #!/usr/bin/env python3
         
     | 
| 2 | 
         
            +
            """
         
     | 
| 3 | 
         
            +
            Configuration script for Trackio environment variables
         
     | 
| 4 | 
         
            +
            """
         
     | 
| 5 | 
         
            +
             
     | 
| 6 | 
         
            +
            import os
         
     | 
| 7 | 
         
            +
            import json
         
     | 
| 8 | 
         
            +
            from datetime import datetime
         
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            def configure_trackio():
         
     | 
| 11 | 
         
            +
                """Configure Trackio environment variables"""
         
     | 
| 12 | 
         
            +
                
         
     | 
| 13 | 
         
            +
                print("🔧 Trackio Configuration")
         
     | 
| 14 | 
         
            +
                print("=" * 40)
         
     | 
| 15 | 
         
            +
                
         
     | 
| 16 | 
         
            +
                # Current configuration
         
     | 
| 17 | 
         
            +
                current_config = {
         
     | 
| 18 | 
         
            +
                    'HF_TOKEN': os.environ.get('HF_TOKEN', 'Not set'),
         
     | 
| 19 | 
         
            +
                    'TRACKIO_DATASET_REPO': os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments'),
         
     | 
| 20 | 
         
            +
                    'SPACE_ID': os.environ.get('SPACE_ID', 'Not set')
         
     | 
| 21 | 
         
            +
                }
         
     | 
| 22 | 
         
            +
                
         
     | 
| 23 | 
         
            +
                print("📋 Current Configuration:")
         
     | 
| 24 | 
         
            +
                for key, value in current_config.items():
         
     | 
| 25 | 
         
            +
                    status = "✅" if value != "Not set" else "❌"
         
     | 
| 26 | 
         
            +
                    print(f"   {status} {key}: {value}")
         
     | 
| 27 | 
         
            +
                
         
     | 
| 28 | 
         
            +
                print("\n🎯 Configuration Options:")
         
     | 
| 29 | 
         
            +
                print("1. Set HF_TOKEN - Required for dataset access")
         
     | 
| 30 | 
         
            +
                print("2. Set TRACKIO_DATASET_REPO - Dataset repository (optional)")
         
     | 
| 31 | 
         
            +
                print("3. Set SPACE_ID - HF Space ID (auto-detected)")
         
     | 
| 32 | 
         
            +
                
         
     | 
| 33 | 
         
            +
                # Check if running on HF Spaces
         
     | 
| 34 | 
         
            +
                if os.environ.get('SPACE_ID'):
         
     | 
| 35 | 
         
            +
                    print("\n🚀 Running on Hugging Face Spaces")
         
     | 
| 36 | 
         
            +
                    print(f"   Space ID: {os.environ.get('SPACE_ID')}")
         
     | 
| 37 | 
         
            +
                
         
     | 
| 38 | 
         
            +
                # Validate configuration
         
     | 
| 39 | 
         
            +
                print("\n🔍 Configuration Validation:")
         
     | 
| 40 | 
         
            +
                
         
     | 
| 41 | 
         
            +
                # Check HF_TOKEN
         
     | 
| 42 | 
         
            +
                if current_config['HF_TOKEN'] != 'Not set':
         
     | 
| 43 | 
         
            +
                    print("✅ HF_TOKEN is set")
         
     | 
| 44 | 
         
            +
                    print("   This allows the app to read/write to HF Datasets")
         
     | 
| 45 | 
         
            +
                else:
         
     | 
| 46 | 
         
            +
                    print("❌ HF_TOKEN is not set")
         
     | 
| 47 | 
         
            +
                    print("   Please set HF_TOKEN to enable dataset functionality")
         
     | 
| 48 | 
         
            +
                    print("   Get your token from: https://huggingface.co/settings/tokens")
         
     | 
| 49 | 
         
            +
                
         
     | 
| 50 | 
         
            +
                # Check dataset repository
         
     | 
| 51 | 
         
            +
                dataset_repo = current_config['TRACKIO_DATASET_REPO']
         
     | 
| 52 | 
         
            +
                print(f"📊 Dataset Repository: {dataset_repo}")
         
     | 
| 53 | 
         
            +
                
         
     | 
| 54 | 
         
            +
                # Test dataset access if token is available
         
     | 
| 55 | 
         
            +
                if current_config['HF_TOKEN'] != 'Not set':
         
     | 
| 56 | 
         
            +
                    print("\n🧪 Testing Dataset Access...")
         
     | 
| 57 | 
         
            +
                    try:
         
     | 
| 58 | 
         
            +
                        from datasets import load_dataset
         
     | 
| 59 | 
         
            +
                        
         
     | 
| 60 | 
         
            +
                        dataset = load_dataset(dataset_repo, token=current_config['HF_TOKEN'])
         
     | 
| 61 | 
         
            +
                        print(f"✅ Successfully loaded dataset: {dataset_repo}")
         
     | 
| 62 | 
         
            +
                        
         
     | 
| 63 | 
         
            +
                        # Show experiment count
         
     | 
| 64 | 
         
            +
                        if 'train' in dataset:
         
     | 
| 65 | 
         
            +
                            experiment_count = len(dataset['train'])
         
     | 
| 66 | 
         
            +
                            print(f"📈 Found {experiment_count} experiments in dataset")
         
     | 
| 67 | 
         
            +
                            
         
     | 
| 68 | 
         
            +
                            # Show sample experiments
         
     | 
| 69 | 
         
            +
                            if experiment_count > 0:
         
     | 
| 70 | 
         
            +
                                print("🔬 Sample experiments:")
         
     | 
| 71 | 
         
            +
                                for i, row in enumerate(dataset['train'][:3]):  # Show first 3
         
     | 
| 72 | 
         
            +
                                    exp_id = row.get('experiment_id', 'Unknown')
         
     | 
| 73 | 
         
            +
                                    name = row.get('name', 'Unnamed')
         
     | 
| 74 | 
         
            +
                                    print(f"   {i+1}. {exp_id}: {name}")
         
     | 
| 75 | 
         
            +
                        
         
     | 
| 76 | 
         
            +
                    except Exception as e:
         
     | 
| 77 | 
         
            +
                        print(f"❌ Failed to load dataset: {e}")
         
     | 
| 78 | 
         
            +
                        print("   This might be normal if the dataset doesn't exist yet")
         
     | 
| 79 | 
         
            +
                
         
     | 
| 80 | 
         
            +
                # Generate configuration file
         
     | 
| 81 | 
         
            +
                config_file = "trackio_config.json"
         
     | 
| 82 | 
         
            +
                config_data = {
         
     | 
| 83 | 
         
            +
                    'hf_token': current_config['HF_TOKEN'],
         
     | 
| 84 | 
         
            +
                    'dataset_repo': current_config['TRACKIO_DATASET_REPO'],
         
     | 
| 85 | 
         
            +
                    'space_id': current_config['SPACE_ID'],
         
     | 
| 86 | 
         
            +
                    'last_updated': datetime.now().isoformat(),
         
     | 
| 87 | 
         
            +
                    'notes': 'Trackio configuration - set these as environment variables in your HF Space'
         
     | 
| 88 | 
         
            +
                }
         
     | 
| 89 | 
         
            +
                
         
     | 
| 90 | 
         
            +
                with open(config_file, 'w') as f:
         
     | 
| 91 | 
         
            +
                    json.dump(config_data, f, indent=2)
         
     | 
| 92 | 
         
            +
                
         
     | 
| 93 | 
         
            +
                print(f"\n💾 Configuration saved to: {config_file}")
         
     | 
| 94 | 
         
            +
                
         
     | 
| 95 | 
         
            +
                # Show environment variable commands
         
     | 
| 96 | 
         
            +
                print("\n📝 Environment Variables for HF Space:")
         
     | 
| 97 | 
         
            +
                print("=" * 50)
         
     | 
| 98 | 
         
            +
                print(f"HF_TOKEN={current_config['HF_TOKEN']}")
         
     | 
| 99 | 
         
            +
                print(f"TRACKIO_DATASET_REPO={current_config['TRACKIO_DATASET_REPO']}")
         
     | 
| 100 | 
         
            +
                
         
     | 
| 101 | 
         
            +
                print("\n🎯 Next Steps:")
         
     | 
| 102 | 
         
            +
                print("1. Set HF_TOKEN in your HF Space environment variables")
         
     | 
| 103 | 
         
            +
                print("2. Optionally set TRACKIO_DATASET_REPO to use a different dataset")
         
     | 
| 104 | 
         
            +
                print("3. Deploy your updated app.py to the Space")
         
     | 
| 105 | 
         
            +
                print("4. Run setup_hf_dataset.py if you haven't created the dataset yet")
         
     | 
| 106 | 
         
            +
             
     | 
| 107 | 
         
            +
            def show_usage_examples():
                """Print example ``TRACKIO_DATASET_REPO`` values to stdout.

                Every example is listed with a 1-based number, its repository id, a
                short description and the matching environment-variable assignment,
                followed by one blank line. Nothing is returned.
                """
                examples = [
                    {
                        'name': 'Default Dataset',
                        'repo': 'tonic/trackio-experiments',
                        'description': 'Default dataset for your experiments'
                    },
                    {
                        'name': 'Personal Dataset',
                        'repo': 'your-username/trackio-experiments',
                        'description': 'Your personal experiment dataset'
                    },
                    {
                        'name': 'Team Dataset',
                        'repo': 'your-org/team-experiments',
                        'description': 'Shared dataset for team experiments'
                    },
                    {
                        'name': 'Project Dataset',
                        'repo': 'your-username/smollm3-experiments',
                        'description': 'Dataset specific to SmolLM3 experiments'
                    }
                ]

                # Section header.
                print("\n📚 Usage Examples")
                print("=" * 30)

                for number, example in enumerate(examples, start=1):
                    entry_lines = (
                        f"{number}. {example['name']}",
                        f"   Repository: {example['repo']}",
                        f"   Description: {example['description']}",
                        f"   Set with: TRACKIO_DATASET_REPO={example['repo']}",
                        "",  # blank separator line after every example
                    )
                    print("\n".join(entry_lines))
         
     | 
| 142 | 
         
            +
             
     | 
| 143 | 
         
            +
            if __name__ == "__main__":
                # Script entry point: run the configuration step first, then
                # print the example dataset repositories.
                for entry_point in (configure_trackio, show_usage_examples):
                    entry_point()
         
     | 
    	
        deploy_trackio_space.py → scripts/trackio_tonic/deploy_trackio_space.py
    RENAMED
    
    | 
         @@ -95,7 +95,7 @@ class TrackioSpaceDeployer: 
     | 
|
| 95 | 
         | 
| 96 | 
         
             
                        # Write README.md for the space
         
     | 
| 97 | 
         
             
                        space_readme = f"""---
         
     | 
| 98 | 
         
            -
            title: Trackio  
     | 
| 99 | 
         
             
            emoji: 🐠
         
     | 
| 100 | 
         
             
            colorFrom: indigo
         
     | 
| 101 | 
         
             
            colorTo: yellow
         
     | 
| 
         | 
|
| 95 | 
         | 
| 96 | 
         
             
                        # Write README.md for the space
         
     | 
| 97 | 
         
             
                        space_readme = f"""---
         
     | 
| 98 | 
         
            +
            title: Trackio Tonic
         
     | 
| 99 | 
         
             
            emoji: 🐠
         
     | 
| 100 | 
         
             
            colorFrom: indigo
         
     | 
| 101 | 
         
             
            colorTo: yellow
         
     | 
    	
        scripts/trackio_tonic/trackio_api_client.py
    ADDED
    
    | 
         @@ -0,0 +1,286 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #!/usr/bin/env python3
         
     | 
| 2 | 
         
            +
            """
         
     | 
| 3 | 
         
            +
            Trackio API Client for Hugging Face Spaces
         
     | 
| 4 | 
         
            +
            Connects to the Trackio Space using the actual API endpoints
         
     | 
| 5 | 
         
            +
            """
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            import requests
         
     | 
| 8 | 
         
            +
            import json
         
     | 
| 9 | 
         
            +
            import time
         
     | 
| 10 | 
         
            +
            import logging
         
     | 
| 11 | 
         
            +
            from typing import Dict, Any, Optional
         
     | 
| 12 | 
         
            +
            from datetime import datetime
         
     | 
| 13 | 
         
            +
             
     | 
| 14 | 
         
            +
            # Setup logging
         
     | 
| 15 | 
         
            +
            logging.basicConfig(level=logging.INFO)
         
     | 
| 16 | 
         
            +
            logger = logging.getLogger(__name__)
         
     | 
| 17 | 
         
            +
             
     | 
| 18 | 
         
            +
            class TrackioAPIClient:
         
     | 
| 19 | 
         
            +
                """API client for Trackio Space"""
         
     | 
| 20 | 
         
            +
                
         
     | 
| 21 | 
         
            +
                def __init__(self, space_url: str):
         
     | 
| 22 | 
         
            +
                    self.space_url = space_url.rstrip('/')
         
     | 
| 23 | 
         
            +
                    self.base_url = f"{self.space_url}/gradio_api/call"
         
     | 
| 24 | 
         
            +
                    
         
     | 
| 25 | 
         
            +
                def _make_api_call(self, endpoint: str, data: list, max_retries: int = 3) -> Dict[str, Any]:
                    """Call a named endpoint of the Space and return its first output.

                    The call is a two-step exchange: a POST to
                    ``{base_url}/{endpoint}`` yields an event id (key ``event_id``,
                    or ``hash`` as a fallback), then a GET to
                    ``{base_url}/{endpoint}/{event_id}`` yields the result. The result
                    is accepted either as a JSON object with a non-empty ``data``
                    list, or as a Server-Sent Events (SSE) body whose ``data:`` line
                    carries a list.

                    Args:
                        endpoint: Endpoint name appended to ``self.base_url``.
                        data: Positional arguments, sent as ``{"data": data}``.
                        max_retries: Maximum number of attempts. A failed attempt is
                            followed by a ``2 ** attempt`` second pause before the
                            next one.

                    Returns:
                        ``{"success": True, "data": <first output>}`` on success,
                        otherwise a dict with an ``"error"`` message and, for some
                        failures, a ``"raw"`` copy of the unusable payload. Request
                        and parsing errors are reported through the returned dict
                        rather than raised.
                    """
                    # Local import, as in the original code; only the legacy
                    # fallback in the SSE branch needs it.
                    import ast

                    url = f"{self.base_url}/{endpoint}"
                    payload = {"data": data}

                    def fetch_once(attempt: int):
                        """Run one POST/GET exchange and return ``(result, retryable)``."""
                        logger.debug("Attempt %d: Making POST request to %s", attempt + 1, url)

                        # POST request to get EVENT_ID
                        response = requests.post(
                            url,
                            json=payload,
                            headers={"Content-Type": "application/json"},
                            timeout=30,
                        )
                        if response.status_code != 200:
                            logger.error("POST request failed: %s - %s", response.status_code, response.text)
                            return {"error": f"POST failed: {response.status_code}"}, True

                        response_data = response.json()
                        logger.debug("POST response: %s", response_data)

                        # "event_id" is the expected key; "hash" is accepted as a fallback.
                        if "event_id" in response_data:
                            event_id = response_data["event_id"]
                        elif "hash" in response_data:
                            event_id = response_data["hash"]
                        else:
                            logger.error("No event_id or hash in response: %s", response_data)
                            # Not retried: the request itself was accepted.
                            return {"error": "No EVENT_ID in response"}, False

                        # GET request to get results
                        get_url = f"{url}/{event_id}"
                        logger.debug("Making GET request to: %s", get_url)

                        # Wait a bit for the processing to complete
                        time.sleep(1)

                        get_response = requests.get(get_url, timeout=30)
                        if get_response.status_code != 200:
                            logger.error("GET request failed: %s - %s", get_response.status_code, get_response.text)
                            return {"error": f"GET failed: {get_response.status_code}"}, True

                        if not get_response.content:
                            logger.warning("Empty response from GET request (attempt %d)", attempt + 1)
                            return {"error": "Empty response from server"}, True

                        response_text = get_response.text.strip()
                        logger.debug("Raw response: %s", response_text)

                        # Try plain JSON first. ValueError is caught (json.JSONDecodeError
                        # subclasses it) so the SSE fallback does not depend on which
                        # JSON backend produced the exception.
                        try:
                            result_data = json.loads(response_text)
                        except ValueError:
                            logger.debug("Response is not JSON, trying SSE format")
                        else:
                            logger.debug("Parsed as JSON: %s", result_data)
                            if (
                                isinstance(result_data, dict)
                                and "data" in result_data
                                and len(result_data["data"]) > 0
                            ):
                                return {"success": True, "data": result_data["data"][0]}, False
                            logger.warning("No data in JSON response (attempt %d): %s", attempt + 1, result_data)
                            return {"error": "No data in JSON response", "raw": result_data}, True

                        # SSE format: "event: complete\ndata: [\"message\"]".
                        # Prefer the payload of the "complete" event, because other
                        # events (e.g. heartbeats) may precede it; otherwise fall back
                        # to the first data line.
                        data_line = None
                        current_event = None
                        for raw_line in response_text.split('\n'):
                            line = raw_line.rstrip('\r')
                            if not line:
                                current_event = None  # a blank line ends an SSE event
                            elif line.startswith('event:'):
                                current_event = line[len('event:'):].strip()
                            elif line.startswith('data:'):
                                candidate = line[len('data:'):].strip()
                                if current_event == 'complete':
                                    data_line = candidate
                                    break
                                if data_line is None:
                                    data_line = candidate

                        if not data_line:
                            logger.error("No data line found in SSE response")
                            return {"error": "No data line in SSE response", "raw": response_text}, True

                        # The payload is JSON, so true/false/null must be accepted;
                        # ast.literal_eval is kept only as a fallback for payloads
                        # written as Python literals.
                        try:
                            data_array = json.loads(data_line)
                        except ValueError:
                            try:
                                data_array = ast.literal_eval(data_line)
                            except (ValueError, SyntaxError) as e:
                                logger.error("Failed to parse SSE data: %s", e)
                                logger.debug("Raw SSE data: %s", data_line)
                                return {"error": f"Failed to parse SSE data: {e}"}, True

                        if isinstance(data_array, list) and len(data_array) > 0:
                            result_message = data_array[0]
                            logger.debug("Parsed SSE data: %s", result_message)
                            return {"success": True, "data": result_message}, False

                        logger.warning("Invalid SSE data format (attempt %d): %s", attempt + 1, data_array)
                        return {"error": "Invalid SSE data format", "raw": data_array}, True

                    for attempt in range(max_retries):
                        try:
                            result, retryable = fetch_once(attempt)
                        except requests.exceptions.RequestException as e:
                            logger.error("API call failed (attempt %d): %s", attempt + 1, e)
                            result, retryable = {"error": f"Request failed: {e}"}, True
                        except Exception as e:
                            # Boundary handler: callers expect an error dict, not an exception.
                            logger.error("Unexpected error (attempt %d): %s", attempt + 1, e)
                            result, retryable = {"error": f"Unexpected error: {e}"}, True

                        # Success and non-retryable failures return at once; the last
                        # attempt returns its own specific error.
                        if not retryable or attempt >= max_retries - 1:
                            return result
                        time.sleep(2 ** attempt)  # Exponential backoff

                    # Only reached when max_retries is zero or negative.
                    return {"error": f"Failed after {max_retries} attempts"}
         
     | 
| 165 | 
         
            +
                
         
     | 
| 166 | 
         
            +
                def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
                    """Create a new experiment.

                    Sends ``[name, description]`` to the ``create_experiment_interface``
                    API through ``_make_api_call``.

                    Args:
                        name: Name of the experiment to create.
                        description: Optional free-text description (defaults to "").

                    Returns:
                        The dict produced by ``_make_api_call``, unchanged. A dict
                        containing the ``"success"`` key is treated as a success and
                        is expected to also carry ``"data"``; anything else is
                        logged as a failure.
                    """
                    logger.info(f"Creating experiment: {name}")

                    response = self._make_api_call("create_experiment_interface", [name, description])

                    # Failure path first: the response is handed back as-is either way.
                    if "success" not in response:
                        logger.error(f"Failed to create experiment: {response}")
                        return response

                    logger.info(f"Experiment created successfully: {response['data']}")
                    return response
         
     | 
| 178 | 
         
            +
                
         
     | 
| 179 | 
         
            +
                def log_metrics(self, experiment_id: str, metrics: Dict[str, Any], step: Optional[int] = None) -> Dict[str, Any]:
                    """Log a set of metrics for an experiment.

                    The metrics dict is serialized with ``json.dumps`` and the step is
                    sent as a string (empty string when ``step`` is ``None``) to the
                    ``log_metrics_interface`` API through ``_make_api_call``.

                    Args:
                        experiment_id: Identifier of the target experiment.
                        metrics: JSON-serializable mapping of metric values.
                        step: Optional step number the metrics belong to.

                    Returns:
                        The dict produced by ``_make_api_call``, unchanged. Presence
                        of the ``"success"`` key decides which log message is emitted.
                    """
                    serialized_metrics = json.dumps(metrics)
                    # The API receives the step as text; "no step" is an empty string.
                    serialized_step = "" if step is None else str(step)

                    logger.info(f"Logging metrics for experiment {experiment_id} at step {step}")

                    response = self._make_api_call(
                        "log_metrics_interface",
                        [experiment_id, serialized_metrics, serialized_step],
                    )

                    if "success" not in response:
                        logger.error(f"Failed to log metrics: {response}")
                        return response

                    logger.info(f"Metrics logged successfully: {response['data']}")
                    return response
         
     | 
| 194 | 
         
            +
                
         
     | 
| 195 | 
         
            +
                def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
                    """Log parameters for an experiment.

                    The parameters dict is serialized with ``json.dumps`` and sent,
                    together with the experiment id, to the
                    ``log_parameters_interface`` API through ``_make_api_call``.

                    Args:
                        experiment_id: Identifier of the target experiment.
                        parameters: JSON-serializable mapping of parameter values.

                    Returns:
                        The dict produced by ``_make_api_call``, unchanged. Presence
                        of the ``"success"`` key decides which log message is emitted.
                    """
                    serialized_parameters = json.dumps(parameters)

                    logger.info(f"Logging parameters for experiment {experiment_id}")

                    response = self._make_api_call(
                        "log_parameters_interface",
                        [experiment_id, serialized_parameters],
                    )

                    if "success" not in response:
                        logger.error(f"Failed to log parameters: {response}")
                        return response

                    logger.info(f"Parameters logged successfully: {response['data']}")
                    return response
         
     | 
| 209 | 
         
            +
                
         
     | 
| 210 | 
         
            +
                def get_experiment_details(self, experiment_id: str) -> Dict[str, Any]:
                    """Fetch the details of one experiment.

                    Calls the ``get_experiment_details_interface`` API through
                    ``_make_api_call`` with the experiment id as the only argument.

                    Args:
                        experiment_id: Identifier of the experiment to look up.

                    Returns:
                        The dict produced by ``_make_api_call``, unchanged. Presence
                        of the ``"success"`` key decides which log message is emitted.
                    """
                    logger.info(f"Getting details for experiment {experiment_id}")

                    response = self._make_api_call("get_experiment_details_interface", [experiment_id])

                    if "success" not in response:
                        logger.error(f"Failed to get experiment details: {response}")
                        return response

                    logger.info(f"Experiment details retrieved: {response['data']}")
                    return response
         
     | 
| 222 | 
         
            +
                
         
     | 
| 223 | 
         
            +
                def list_experiments(self) -> Dict[str, Any]:
                    """List all experiments.

                    Calls the ``list_experiments_interface`` API through
                    ``_make_api_call`` with an empty argument list.

                    Returns:
                        The dict produced by ``_make_api_call``, unchanged. Presence
                        of the ``"success"`` key decides which log message is emitted.
                    """
                    logger.info("Listing experiments")

                    response = self._make_api_call("list_experiments_interface", [])

                    if "success" not in response:
                        logger.error(f"Failed to list experiments: {response}")
                        return response

                    logger.info(f"Experiments listed successfully: {response['data']}")
                    return response
         
     | 
| 235 | 
         
            +
                
         
     | 
| 236 | 
         
            +
                def update_experiment_status(self, experiment_id: str, status: str) -> Dict[str, Any]:
                    """Update the status of an experiment.

                    Sends ``[experiment_id, status]`` to the
                    ``update_experiment_status_interface`` API through
                    ``_make_api_call``.

                    Args:
                        experiment_id: Identifier of the experiment to update.
                        status: New status value, passed through as given.

                    Returns:
                        The dict produced by ``_make_api_call``, unchanged. Presence
                        of the ``"success"`` key decides which log message is emitted.
                    """
                    logger.info(f"Updating experiment {experiment_id} status to {status}")

                    response = self._make_api_call(
                        "update_experiment_status_interface",
                        [experiment_id, status],
                    )

                    if "success" not in response:
                        logger.error(f"Failed to update experiment status: {response}")
                        return response

                    logger.info(f"Experiment status updated successfully: {response['data']}")
                    return response
         
     | 
| 248 | 
         
            +
                
         
     | 
| 249 | 
         
            +
                def simulate_training_data(self, experiment_id: str) -> Dict[str, Any]:
                    """Ask the API to simulate training data for an experiment (testing aid).

                    Calls the ``simulate_training_data_interface`` API through
                    ``_make_api_call`` with the experiment id as the only argument.

                    Args:
                        experiment_id: Identifier of the target experiment.

                    Returns:
                        The dict produced by ``_make_api_call``, unchanged. Presence
                        of the ``"success"`` key decides which log message is emitted.
                    """
                    logger.info(f"Simulating training data for experiment {experiment_id}")

                    response = self._make_api_call("simulate_training_data_interface", [experiment_id])

                    if "success" not in response:
                        logger.error(f"Failed to simulate training data: {response}")
                        return response

                    logger.info(f"Training data simulated successfully: {response['data']}")
                    return response
         
     | 
| 261 | 
         
            +
                
         
     | 
| 262 | 
         
            +
                def get_training_metrics(self, experiment_id: str) -> Dict[str, Any]:
                    """Fetch the training metrics of an experiment.

                    Calls the ``get_training_metrics_interface`` API through
                    ``_make_api_call`` with the experiment id as the only argument.

                    Args:
                        experiment_id: Identifier of the experiment to query.

                    Returns:
                        The dict produced by ``_make_api_call``, unchanged. Presence
                        of the ``"success"`` key decides which log message is emitted.
                    """
                    logger.info(f"Getting training metrics for experiment {experiment_id}")

                    response = self._make_api_call("get_training_metrics_interface", [experiment_id])

                    if "success" not in response:
                        logger.error(f"Failed to get training metrics: {response}")
                        return response

                    logger.info(f"Training metrics retrieved: {response['data']}")
                    return response
         
     | 
| 274 | 
         
            +
                
         
     | 
| 275 | 
         
            +
                def get_experiment_metrics_history(self, experiment_id: str) -> Dict[str, Any]:
                    """Fetch the metrics history of an experiment.

                    Calls the ``get_experiment_metrics_history_interface`` API through
                    ``_make_api_call`` with the experiment id as the only argument.

                    Args:
                        experiment_id: Identifier of the experiment to query.

                    Returns:
                        The dict produced by ``_make_api_call``, unchanged. Presence
                        of the ``"success"`` key decides which log message is emitted.
                    """
                    logger.info(f"Getting metrics history for experiment {experiment_id}")

                    response = self._make_api_call(
                        "get_experiment_metrics_history_interface",
                        [experiment_id],
                    )

                    if "success" not in response:
                        logger.error(f"Failed to get metrics history: {response}")
                        return response

                    logger.info(f"Metrics history retrieved: {response['data']}")
                    return response
         
     | 
    	
        run_a100_large_experiment.py → scripts/training/train.py
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        setup_launch.py
    ADDED
    
    | 
         @@ -0,0 +1,283 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #!/usr/bin/env python3
         
     | 
| 2 | 
         
            +
            """
         
     | 
| 3 | 
         
            +
            Setup script for the interactive SmolLM3 end-to-end fine-tuning pipeline
         
     | 
| 4 | 
         
            +
            Helps users prepare for the interactive launch script
         
     | 
| 5 | 
         
            +
            """
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            import os
         
     | 
| 8 | 
         
            +
            import re
         
     | 
| 9 | 
         
            +
            from pathlib import Path
         
     | 
| 10 | 
         
            +
             
     | 
| 11 | 
         
            +
            def setup_launch_script():
                """Report whether ``launch.sh`` is available for the interactive pipeline.

                Prints an introductory banner, then checks for ``launch.sh`` using a
                path relative to the current working directory. Nothing is modified
                on disk.

                Returns:
                    bool: ``True`` when ``launch.sh`` exists, ``False`` otherwise.
                """
                banner = (
                    "🚀 SmolLM3 Interactive End-to-End Fine-tuning Setup",
                    "=" * 60,
                    "\n📋 This setup will help you prepare for the interactive pipeline.",
                    "The launch script will now prompt you for all necessary information.",
                )
                for banner_line in banner:
                    print(banner_line)

                # The launch script is looked up relative to the working directory.
                if Path("launch.sh").exists():
                    print("\n✅ launch.sh found - no configuration needed!")
                    print("The script is now interactive and will prompt you for all settings.")
                    return True

                print("❌ launch.sh not found")
                return False
         
     | 
| 30 | 
         
            +
             
     | 
| 31 | 
         
            +
            def create_requirements_check():
                """Write ``check_requirements.py`` into the current working directory.

                The generated script checks the Python version (3.8+), tries to import
                each required package, reports CUDA availability through ``torch``, and
                prints the ``pip install`` command for anything missing.

                The file is written as UTF-8 explicitly: the script text contains emoji,
                so relying on the platform's default encoding can raise
                ``UnicodeEncodeError`` where that default is not UTF-8.

                Returns:
                    None. An existing ``check_requirements.py`` is overwritten.
                """

                # NOTE: the generated CUDA probe catches ``Exception`` rather than using
                # a bare ``except`` so Ctrl-C / SystemExit are not swallowed.
                check_script = """#!/usr/bin/env python3
            \"\"\"
            Requirements check for SmolLM3 fine-tuning
            \"\"\"

            import sys
            import subprocess

            def check_requirements():
                \"\"\"Check if all requirements are met\"\"\"
                
                print("🔍 Checking requirements...")
                
                # Check Python version
                if sys.version_info < (3, 8):
                    print("❌ Python 3.8+ required")
                    return False
                else:
                    print(f"✅ Python {sys.version_info.major}.{sys.version_info.minor}")
                
                # Check required packages
                required_packages = [
                    'torch',
                    'transformers',
                    'datasets',
                    'accelerate',
                    'trl',
                    'huggingface_hub',
                    'requests'
                ]
                
                missing_packages = []
                for package in required_packages:
                    try:
                        __import__(package)
                        print(f"✅ {package}")
                    except ImportError:
                        print(f"❌ {package}")
                        missing_packages.append(package)
                
                if missing_packages:
                    print(f"\\n📦 Install missing packages:")
                    print(f"pip install {' '.join(missing_packages)}")
                    return False
                
                # Check CUDA
                try:
                    import torch
                    if torch.cuda.is_available():
                        print(f"✅ CUDA available: {torch.cuda.get_device_name(0)}")
                    else:
                        print("⚠️  CUDA not available (training will be slower)")
                except Exception:
                    print("⚠️  Could not check CUDA availability")
                
                print("\\n✅ All requirements met!")
                return True

            if __name__ == "__main__":
                check_requirements()
            """

                # Explicit UTF-8 so the emoji in the script survive on any platform.
                with open("check_requirements.py", 'w', encoding="utf-8") as f:
                    f.write(check_script)

                print("✅ Created check_requirements.py")
         
     | 
| 100 | 
         
            +
             
     | 
| 101 | 
         
            +
            def create_quick_start_guide():
         
     | 
| 102 | 
         
            +
                """Create a quick start guide"""
         
     | 
| 103 | 
         
            +
                
         
     | 
| 104 | 
         
            +
                guide = """# SmolLM3 Interactive Pipeline - Quick Start Guide
         
     | 
| 105 | 
         
            +
             
     | 
| 106 | 
         
            +
            ## 🚀 Quick Start
         
     | 
| 107 | 
         
            +
             
     | 
| 108 | 
         
            +
            ### 1. Check Requirements
         
     | 
| 109 | 
         
            +
            ```bash
         
     | 
| 110 | 
         
            +
            python check_requirements.py
         
     | 
| 111 | 
         
            +
            ```
         
     | 
| 112 | 
         
            +
             
     | 
| 113 | 
         
            +
            ### 2. Run the Interactive Pipeline
         
     | 
| 114 | 
         
            +
            ```bash
         
     | 
| 115 | 
         
            +
            chmod +x launch.sh
         
     | 
| 116 | 
         
            +
            ./launch.sh
         
     | 
| 117 | 
         
            +
            ```
         
     | 
| 118 | 
         
            +
             
     | 
| 119 | 
         
            +
            ## 📋 What the Interactive Pipeline Does
         
     | 
| 120 | 
         
            +
             
     | 
| 121 | 
         
            +
            The pipeline will guide you through:
         
     | 
| 122 | 
         
            +
             
     | 
| 123 | 
         
            +
            1. **Authentication** - Enter your HF username and token
         
     | 
| 124 | 
         
            +
            2. **Configuration Selection** - Choose from predefined training configs:
         
     | 
| 125 | 
         
            +
               - Basic Training (SmolLM3 + SmolTalk)
         
     | 
| 126 | 
         
            +
               - H100 Lightweight (Rapid training on H100)
         
     | 
| 127 | 
         
            +
               - A100 Large Scale (SmolLM3 + OpenHermes-FR)
         
     | 
| 128 | 
         
            +
               - Multiple Passes (Extended training)
         
     | 
| 129 | 
         
            +
               - Custom Configuration (User-defined)
         
     | 
| 130 | 
         
            +
            3. **Experiment Setup** - Configure experiment name and repositories
         
     | 
| 131 | 
         
            +
            4. **Training Parameters** - Adjust batch size, learning rate, etc.
         
     | 
| 132 | 
         
            +
            5. **Deployment** - Automatic Trackio Space and HF Dataset setup
         
     | 
| 133 | 
         
            +
            6. **Training** - Monitored fine-tuning with real-time tracking
         
     | 
| 134 | 
         
            +
            7. **Model Push** - Upload to HF Hub with documentation
         
     | 
| 135 | 
         
            +
             
     | 
| 136 | 
         
            +
            ## 🎯 Available Training Configurations
         
     | 
| 137 | 
         
            +
             
     | 
| 138 | 
         
            +
            ### 1. Basic Training (Default)
         
     | 
| 139 | 
         
            +
            - **Model**: SmolLM3-3B
         
     | 
| 140 | 
         
            +
            - **Dataset**: SmolTalk
         
     | 
| 141 | 
         
            +
            - **Epochs**: 3
         
     | 
| 142 | 
         
            +
            - **Batch Size**: 2
         
     | 
| 143 | 
         
            +
            - **Learning Rate**: 5e-6
         
     | 
| 144 | 
         
            +
            - **Best for**: Quick experiments, learning
         
     | 
| 145 | 
         
            +
             
     | 
| 146 | 
         
            +
            ### 2. H100 Lightweight (Rapid)
         
     | 
| 147 | 
         
            +
            - **Model**: SmolLM3-3B
         
     | 
| 148 | 
         
            +
            - **Dataset**: OpenHermes-FR (80K samples)
         
     | 
| 149 | 
         
            +
            - **Epochs**: 1
         
     | 
| 150 | 
         
            +
            - **Batch Size**: 16
         
     | 
| 151 | 
         
            +
            - **Learning Rate**: 8e-6
         
     | 
| 152 | 
         
            +
            - **Sequence Length**: 8192
         
     | 
| 153 | 
         
            +
            - **Best for**: Rapid training on H100
         
     | 
| 154 | 
         
            +
             
     | 
| 155 | 
         
            +
            ### 3. A100 Large Scale
         
     | 
| 156 | 
         
            +
            - **Model**: SmolLM3-3B
         
     | 
| 157 | 
         
            +
            - **Dataset**: OpenHermes-FR
         
     | 
| 158 | 
         
            +
            - **Epochs**: 1.3 passes
         
     | 
| 159 | 
         
            +
            - **Batch Size**: 8
         
     | 
| 160 | 
         
            +
            - **Learning Rate**: 5e-6
         
     | 
| 161 | 
         
            +
            - **Sequence Length**: 8192
         
     | 
| 162 | 
         
            +
            - **Best for**: High-performance training
         
     | 
| 163 | 
         
            +
             
     | 
| 164 | 
         
            +
            ### 4. Multiple Passes
         
     | 
| 165 | 
         
            +
            - **Model**: SmolLM3-3B
         
     | 
| 166 | 
         
            +
            - **Dataset**: OpenHermes-FR
         
     | 
| 167 | 
         
            +
            - **Epochs**: 4 passes
         
     | 
| 168 | 
         
            +
            - **Batch Size**: 6
         
     | 
| 169 | 
         
            +
            - **Learning Rate**: 3e-6
         
     | 
| 170 | 
         
            +
            - **Sequence Length**: 8192
         
     | 
| 171 | 
         
            +
            - **Best for**: Thorough training
         
     | 
| 172 | 
         
            +
             
     | 
| 173 | 
         
            +
            ### 5. Custom Configuration
         
     | 
| 174 | 
         
            +
            - **User-defined parameters**
         
     | 
| 175 | 
         
            +
            - **Flexible model and dataset selection**
         
     | 
| 176 | 
         
            +
            - **Custom training parameters**
         
     | 
| 177 | 
         
            +
             
     | 
| 178 | 
         
            +
            ## 🔧 Prerequisites
         
     | 
| 179 | 
         
            +
             
     | 
| 180 | 
         
            +
            1. **Hugging Face Account**
         
     | 
| 181 | 
         
            +
               - Create account at https://huggingface.co
         
     | 
| 182 | 
         
            +
               - Generate token at https://huggingface.co/settings/tokens
         
     | 
| 183 | 
         
            +
             
     | 
| 184 | 
         
            +
            2. **System Requirements**
         
     | 
| 185 | 
         
            +
               - Python 3.8+
         
     | 
| 186 | 
         
            +
               - CUDA-compatible GPU (recommended)
         
     | 
| 187 | 
         
            +
               - 16GB+ RAM
         
     | 
| 188 | 
         
            +
               - 50GB+ storage
         
     | 
| 189 | 
         
            +
             
     | 
| 190 | 
         
            +
            3. **Dependencies**
         
     | 
| 191 | 
         
            +
               - PyTorch with CUDA
         
     | 
| 192 | 
         
            +
               - Transformers
         
     | 
| 193 | 
         
            +
               - Datasets
         
     | 
| 194 | 
         
            +
               - Accelerate
         
     | 
| 195 | 
         
            +
               - TRL
         
     | 
| 196 | 
         
            +
             
     | 
| 197 | 
         
            +
            ## 📊 Expected Outputs
         
     | 
| 198 | 
         
            +
             
     | 
| 199 | 
         
            +
            After running the pipeline, you'll have:
         
     | 
| 200 | 
         
            +
             
     | 
| 201 | 
         
            +
            - **Model Repository**: `https://huggingface.co/your-username/smollm3-finetuned-YYYYMMDD`
         
     | 
| 202 | 
         
            +
            - **Trackio Space**: `https://huggingface.co/spaces/your-username/trackio-monitoring-YYYYMMDD`
         
     | 
| 203 | 
         
            +
            - **Experiment Dataset**: `https://huggingface.co/datasets/your-username/trackio-experiments`
         
     | 
| 204 | 
         
            +
            - **Training Summary**: `training_summary.md`
         
     | 
| 205 | 
         
            +
             
     | 
| 206 | 
         
            +
            ## 🛠️ Troubleshooting
         
     | 
| 207 | 
         
            +
             
     | 
| 208 | 
         
            +
            ### Common Issues
         
     | 
| 209 | 
         
            +
             
     | 
| 210 | 
         
            +
            1. **HF Token Issues**
         
     | 
| 211 | 
         
            +
               ```bash
         
     | 
| 212 | 
         
            +
               huggingface-cli whoami
         
     | 
| 213 | 
         
            +
               ```
         
     | 
| 214 | 
         
            +
             
     | 
| 215 | 
         
            +
            2. **CUDA Issues**
         
     | 
| 216 | 
         
            +
               ```bash
         
     | 
| 217 | 
         
            +
               python -c "import torch; print(torch.cuda.is_available())"
         
     | 
| 218 | 
         
            +
               ```
         
     | 
| 219 | 
         
            +
             
     | 
| 220 | 
         
            +
            3. **Memory Issues**
         
     | 
| 221 | 
         
            +
               - Reduce batch size in custom configuration
         
     | 
| 222 | 
         
            +
               - Increase gradient accumulation steps
         
     | 
| 223 | 
         
            +
             
     | 
| 224 | 
         
            +
            4. **Network Issues**
         
     | 
| 225 | 
         
            +
               - Check internet connection
         
     | 
| 226 | 
         
            +
               - Verify HF token permissions
         
     | 
| 227 | 
         
            +
             
     | 
| 228 | 
         
            +
            ## 🎯 Tips for Success
         
     | 
| 229 | 
         
            +
             
     | 
| 230 | 
         
            +
            1. **Start with Basic Training** for your first run
         
     | 
| 231 | 
         
            +
            2. **Use H100 Lightweight** for rapid experiments on H100
         
     | 
| 232 | 
         
            +
            3. **Use A100 Large Scale** for serious experiments
         
     | 
| 233 | 
         
            +
            4. **Monitor in Trackio Space** for real-time progress
         
     | 
| 234 | 
         
            +
            5. **Check logs** if something goes wrong
         
     | 
| 235 | 
         
            +
            6. **Test the model** after training completes
         
     | 
| 236 | 
         
            +
             
     | 
| 237 | 
         
            +
            ## 📞 Support
         
     | 
| 238 | 
         
            +
             
     | 
| 239 | 
         
            +
            - Check the troubleshooting section
         
     | 
| 240 | 
         
            +
            - Review logs in `training.log`
         
     | 
| 241 | 
         
            +
            - Monitor progress in Trackio Space
         
     | 
| 242 | 
         
            +
            - Open an issue on GitHub
         
     | 
| 243 | 
         
            +
             
     | 
| 244 | 
         
            +
            ---
         
     | 
| 245 | 
         
            +
             
     | 
| 246 | 
         
            +
            **Happy Fine-tuning! 🚀**
         
     | 
| 247 | 
         
            +
            """
         
     | 
| 248 | 
         
            +
                
         
     | 
| 249 | 
         
            +
                with open("QUICK_START_GUIDE.md", 'w') as f:
         
     | 
| 250 | 
         
            +
                    f.write(guide)
         
     | 
| 251 | 
         
            +
                
         
     | 
| 252 | 
         
            +
                print("✅ Created QUICK_START_GUIDE.md")
         
     | 
| 253 | 
         
            +
             
     | 
| 254 | 
         
            +
            def main():
         
     | 
| 255 | 
         
            +
                """Main setup function"""
         
     | 
| 256 | 
         
            +
                
         
     | 
| 257 | 
         
            +
                print("Welcome to SmolLM3 Interactive End-to-End Fine-tuning Setup!")
         
     | 
| 258 | 
         
            +
                print("This will help you prepare for the interactive pipeline.")
         
     | 
| 259 | 
         
            +
                
         
     | 
| 260 | 
         
            +
                if setup_launch_script():
         
     | 
| 261 | 
         
            +
                    create_requirements_check()
         
     | 
| 262 | 
         
            +
                    create_quick_start_guide()
         
     | 
| 263 | 
         
            +
                    
         
     | 
| 264 | 
         
            +
                    print("\n🎉 Setup completed successfully!")
         
     | 
| 265 | 
         
            +
                    print("\n📋 Files created:")
         
     | 
| 266 | 
         
            +
                    print("  - check_requirements.py (requirement checker)")
         
     | 
| 267 | 
         
            +
                    print("  - QUICK_START_GUIDE.md (usage guide)")
         
     | 
| 268 | 
         
            +
                    
         
     | 
| 269 | 
         
            +
                    print("\n🚀 Ready to start training!")
         
     | 
| 270 | 
         
            +
                    print("Next steps:")
         
     | 
| 271 | 
         
            +
                    print("1. Run: python check_requirements.py")
         
     | 
| 272 | 
         
            +
                    print("2. Run: chmod +x launch.sh")
         
     | 
| 273 | 
         
            +
                    print("3. Run: ./launch.sh")
         
     | 
| 274 | 
         
            +
                    print("4. Follow the interactive prompts")
         
     | 
| 275 | 
         
            +
                    
         
     | 
| 276 | 
         
            +
                    print("\n📚 For detailed information, see:")
         
     | 
| 277 | 
         
            +
                    print("  - QUICK_START_GUIDE.md")
         
     | 
| 278 | 
         
            +
                    print("  - README_END_TO_END.md")
         
     | 
| 279 | 
         
            +
                else:
         
     | 
| 280 | 
         
            +
                    print("\n❌ Setup failed. Please check your input and try again.")
         
     | 
| 281 | 
         
            +
             
     | 
| 282 | 
         
            +
            if __name__ == "__main__":
         
     | 
| 283 | 
         
            +
                main() 
         
     | 
    	
        config.py → src/config.py
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        data.py → src/data.py
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        model.py → src/model.py
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        monitoring.py → src/monitoring.py
    RENAMED
    
    | 
         @@ -1,6 +1,6 @@ 
     | 
|
| 1 | 
         
             
            """
         
     | 
| 2 | 
         
             
            Trackio Monitoring Integration for SmolLM3 Fine-tuning
         
     | 
| 3 | 
         
            -
            Provides comprehensive experiment tracking and monitoring capabilities
         
     | 
| 4 | 
         
             
            """
         
     | 
| 5 | 
         | 
| 6 | 
         
             
            import os
         
     | 
| 
         @@ -13,7 +13,7 @@ from pathlib import Path 
     | 
|
| 13 | 
         | 
| 14 | 
         
             
            # Import the real API client
         
     | 
| 15 | 
         
             
            try:
         
     | 
| 16 | 
         
            -
                from trackio_api_client import TrackioAPIClient
         
     | 
| 17 | 
         
             
                TRACKIO_AVAILABLE = True
         
     | 
| 18 | 
         
             
            except ImportError:
         
     | 
| 19 | 
         
             
                TRACKIO_AVAILABLE = False
         
     | 
| 
         @@ -22,7 +22,7 @@ except ImportError: 
     | 
|
| 22 | 
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 23 | 
         | 
| 24 | 
         
             
            class SmolLM3Monitor:
         
     | 
| 25 | 
         
            -
                """Monitoring and tracking for SmolLM3 fine-tuning experiments"""
         
     | 
| 26 | 
         | 
| 27 | 
         
             
                def __init__(
         
     | 
| 28 | 
         
             
                    self,
         
     | 
| 
         @@ -32,7 +32,9 @@ class SmolLM3Monitor: 
     | 
|
| 32 | 
         
             
                    enable_tracking: bool = True,
         
     | 
| 33 | 
         
             
                    log_artifacts: bool = True,
         
     | 
| 34 | 
         
             
                    log_metrics: bool = True,
         
     | 
| 35 | 
         
            -
                    log_config: bool = True
         
     | 
| 
         | 
|
| 
         | 
|
| 36 | 
         
             
                ):
         
     | 
| 37 | 
         
             
                    self.experiment_name = experiment_name
         
     | 
| 38 | 
         
             
                    self.enable_tracking = enable_tracking and TRACKIO_AVAILABLE
         
     | 
| 
         @@ -40,6 +42,10 @@ class SmolLM3Monitor: 
     | 
|
| 40 | 
         
             
                    self.log_metrics_enabled = log_metrics  # Rename to avoid conflict
         
     | 
| 41 | 
         
             
                    self.log_config_enabled = log_config  # Rename to avoid conflict
         
     | 
| 42 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 43 | 
         
             
                    # Initialize experiment metadata first
         
     | 
| 44 | 
         
             
                    self.experiment_id = None
         
     | 
| 45 | 
         
             
                    self.start_time = datetime.now()
         
     | 
| 
         @@ -51,7 +57,33 @@ class SmolLM3Monitor: 
     | 
|
| 51 | 
         
             
                    if self.enable_tracking:
         
     | 
| 52 | 
         
             
                        self._setup_trackio(trackio_url, trackio_token)
         
     | 
| 53 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 54 | 
         
             
                    logger.info("Initialized monitoring for experiment: %s", experiment_name)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 55 | 
         | 
| 56 | 
         
             
                def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
         
     | 
| 57 | 
         
             
                    """Setup Trackio API client"""
         
     | 
| 
         @@ -91,6 +123,44 @@ class SmolLM3Monitor: 
     | 
|
| 91 | 
         
             
                        logger.error("Failed to initialize Trackio API: %s", e)
         
     | 
| 92 | 
         
             
                        self.enable_tracking = False
         
     | 
| 93 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 94 | 
         
             
                def log_configuration(self, config: Dict[str, Any]):
         
     | 
| 95 | 
         
             
                    """Log experiment configuration"""
         
     | 
| 96 | 
         
             
                    if not self.enable_tracking or not self.log_config_enabled:
         
     | 
| 
         @@ -98,24 +168,30 @@ class SmolLM3Monitor: 
     | 
|
| 98 | 
         | 
| 99 | 
         
             
                    try:
         
     | 
| 100 | 
         
             
                        # Log configuration as parameters
         
     | 
| 101 | 
         
            -
                         
     | 
| 102 | 
         
            -
                             
     | 
| 103 | 
         
            -
             
     | 
| 104 | 
         
            -
             
     | 
| 105 | 
         
            -
                        
         
     | 
| 106 | 
         
            -
                        if "success" in result:
         
     | 
| 107 | 
         
            -
                            # Also save config locally
         
     | 
| 108 | 
         
            -
                            config_path = "config_{}_{}.json".format(
         
     | 
| 109 | 
         
            -
                                self.experiment_name, 
         
     | 
| 110 | 
         
            -
                                self.start_time.strftime('%Y%m%d_%H%M%S')
         
     | 
| 111 | 
         
             
                            )
         
     | 
| 112 | 
         
            -
                            with open(config_path, 'w') as f:
         
     | 
| 113 | 
         
            -
                                json.dump(config, f, indent=2, default=str)
         
     | 
| 114 | 
         | 
| 115 | 
         
            -
                             
     | 
| 116 | 
         
            -
             
     | 
| 117 | 
         
            -
             
     | 
| 118 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 119 | 
         | 
| 120 | 
         
             
                    except Exception as e:
         
     | 
| 121 | 
         
             
                        logger.error("Failed to log configuration: %s", e)
         
     | 
| 
         @@ -136,18 +212,26 @@ class SmolLM3Monitor: 
     | 
|
| 136 | 
         
             
                            metrics['step'] = step
         
     | 
| 137 | 
         | 
| 138 | 
         
             
                        # Log to Trackio
         
     | 
| 139 | 
         
            -
                         
     | 
| 140 | 
         
            -
                             
     | 
| 141 | 
         
            -
             
     | 
| 142 | 
         
            -
             
     | 
| 143 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 144 | 
         | 
| 145 | 
         
            -
                         
     | 
| 146 | 
         
            -
             
     | 
| 147 | 
         
            -
             
     | 
| 148 | 
         
            -
             
     | 
| 149 | 
         
            -
                         
     | 
| 150 | 
         
            -
                             
     | 
| 
         | 
|
| 
         | 
|
| 151 | 
         | 
| 152 | 
         
             
                    except Exception as e:
         
     | 
| 153 | 
         
             
                        logger.error("Failed to log metrics: %s", e)
         
     | 
| 
         @@ -166,16 +250,19 @@ class SmolLM3Monitor: 
     | 
|
| 166 | 
         
             
                            "checkpoint_size": os.path.getsize(checkpoint_path) if os.path.exists(checkpoint_path) else 0
         
     | 
| 167 | 
         
             
                        }
         
     | 
| 168 | 
         | 
| 169 | 
         
            -
                         
     | 
| 170 | 
         
            -
                             
     | 
| 171 | 
         
            -
             
     | 
| 172 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 173 | 
         | 
| 174 | 
         
            -
                         
     | 
| 175 | 
         
            -
             
     | 
| 176 | 
         
            -
                            logger.info("Checkpoint logged: %s", checkpoint_path)
         
     | 
| 177 | 
         
            -
                        else:
         
     | 
| 178 | 
         
            -
                            logger.error("Failed to log checkpoint: %s", result)
         
     | 
| 179 | 
         | 
| 180 | 
         
             
                    except Exception as e:
         
     | 
| 181 | 
         
             
                        logger.error("Failed to log checkpoint: %s", e)
         
     | 
| 
         @@ -245,25 +332,31 @@ class SmolLM3Monitor: 
     | 
|
| 245 | 
         
             
                        summary['experiment_duration_seconds'] = duration
         
     | 
| 246 | 
         
             
                        summary['experiment_duration_hours'] = duration / 3600
         
     | 
| 247 | 
         | 
| 248 | 
         
            -
                        # Log final summary
         
     | 
| 249 | 
         
            -
                         
     | 
| 250 | 
         
            -
                             
     | 
| 251 | 
         
            -
             
     | 
| 252 | 
         
            -
             
     | 
| 253 | 
         
            -
                        
         
     | 
| 254 | 
         
            -
                        if "success" in result:
         
     | 
| 255 | 
         
            -
                            # Save summary locally
         
     | 
| 256 | 
         
            -
                            summary_path = "training_summary_{}_{}.json".format(
         
     | 
| 257 | 
         
            -
                                self.experiment_name,
         
     | 
| 258 | 
         
            -
                                self.start_time.strftime('%Y%m%d_%H%M%S')
         
     | 
| 259 | 
         
             
                            )
         
     | 
| 260 | 
         
            -
                            with open(summary_path, 'w') as f:
         
     | 
| 261 | 
         
            -
                                json.dump(summary, f, indent=2, default=str)
         
     | 
| 262 | 
         | 
| 263 | 
         
            -
                             
     | 
| 264 | 
         
            -
             
     | 
| 265 | 
         
            -
             
     | 
| 266 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 267 | 
         | 
| 268 | 
         
             
                    except Exception as e:
         
     | 
| 269 | 
         
             
                        logger.error("Failed to log training summary: %s", e)
         
     | 
| 
         @@ -356,6 +449,10 @@ class SmolLM3Monitor: 
     | 
|
| 356 | 
         
             
                                logger.error("Failed to close monitoring session: %s", result)
         
     | 
| 357 | 
         
             
                        except Exception as e:
         
     | 
| 358 | 
         
             
                            logger.error("Failed to close monitoring session: %s", e)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 359 | 
         | 
| 360 | 
         
             
            # Utility function to create monitor from config
         
     | 
| 361 | 
         
             
            def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
         
     | 
| 
         @@ -370,5 +467,7 @@ def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> 
     | 
|
| 370 | 
         
             
                    enable_tracking=getattr(config, 'enable_tracking', True),
         
     | 
| 371 | 
         
             
                    log_artifacts=getattr(config, 'log_artifacts', True),
         
     | 
| 372 | 
         
             
                    log_metrics=getattr(config, 'log_metrics', True),
         
     | 
| 373 | 
         
            -
                    log_config=getattr(config, 'log_config', True)
         
     | 
| 
         | 
|
| 
         | 
|
| 374 | 
         
             
                ) 
         
     | 
| 
         | 
|
| 1 | 
         
             
            """
         
     | 
| 2 | 
         
             
            Trackio Monitoring Integration for SmolLM3 Fine-tuning
         
     | 
| 3 | 
         
            +
            Provides comprehensive experiment tracking and monitoring capabilities with HF Datasets support
         
     | 
| 4 | 
         
             
            """
         
     | 
| 5 | 
         | 
| 6 | 
         
             
            import os
         
     | 
| 
         | 
|
| 13 | 
         | 
| 14 | 
         
             
            # Import the real API client
         
     | 
| 15 | 
         
             
            try:
         
     | 
| 16 | 
         
            +
                from scripts.trackio_tonic.trackio_api_client import TrackioAPIClient
         
     | 
| 17 | 
         
             
                TRACKIO_AVAILABLE = True
         
     | 
| 18 | 
         
             
            except ImportError:
         
     | 
| 19 | 
         
             
                TRACKIO_AVAILABLE = False
         
     | 
| 
         | 
|
| 22 | 
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 23 | 
         | 
| 24 | 
         
             
            class SmolLM3Monitor:
         
     | 
| 25 | 
         
            +
                """Monitoring and tracking for SmolLM3 fine-tuning experiments with HF Datasets support"""
         
     | 
| 26 | 
         | 
| 27 | 
         
             
                def __init__(
         
     | 
| 28 | 
         
             
                    self,
         
     | 
| 
         | 
|
| 32 | 
         
             
                    enable_tracking: bool = True,
         
     | 
| 33 | 
         
             
                    log_artifacts: bool = True,
         
     | 
| 34 | 
         
             
                    log_metrics: bool = True,
         
     | 
| 35 | 
         
            +
                    log_config: bool = True,
         
     | 
| 36 | 
         
            +
                    hf_token: Optional[str] = None,
         
     | 
| 37 | 
         
            +
                    dataset_repo: Optional[str] = None
         
     | 
| 38 | 
         
             
                ):
         
     | 
| 39 | 
         
             
                    self.experiment_name = experiment_name
         
     | 
| 40 | 
         
             
                    self.enable_tracking = enable_tracking and TRACKIO_AVAILABLE
         
     | 
| 
         | 
|
| 42 | 
         
             
                    self.log_metrics_enabled = log_metrics  # Rename to avoid conflict
         
     | 
| 43 | 
         
             
                    self.log_config_enabled = log_config  # Rename to avoid conflict
         
     | 
| 44 | 
         | 
| 45 | 
         
            +
                    # HF Datasets configuration
         
     | 
| 46 | 
         
            +
                    self.hf_token = hf_token or os.environ.get('HF_TOKEN')
         
     | 
| 47 | 
         
            +
                    self.dataset_repo = dataset_repo or os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
         
     | 
| 48 | 
         
            +
                    
         
     | 
| 49 | 
         
             
                    # Initialize experiment metadata first
         
     | 
| 50 | 
         
             
                    self.experiment_id = None
         
     | 
| 51 | 
         
             
                    self.start_time = datetime.now()
         
     | 
| 
         | 
|
| 57 | 
         
             
                    if self.enable_tracking:
         
     | 
| 58 | 
         
             
                        self._setup_trackio(trackio_url, trackio_token)
         
     | 
| 59 | 
         | 
| 60 | 
         
            +
                    # Initialize HF Datasets client
         
     | 
| 61 | 
         
            +
                    self.hf_dataset_client = None
         
     | 
| 62 | 
         
            +
                    if self.hf_token:
         
     | 
| 63 | 
         
            +
                        self._setup_hf_datasets()
         
     | 
| 64 | 
         
            +
                    
         
     | 
| 65 | 
         
             
                    logger.info("Initialized monitoring for experiment: %s", experiment_name)
         
     | 
| 66 | 
         
            +
                    logger.info("Dataset repository: %s", self.dataset_repo)
         
     | 
| 67 | 
         
            +
                
         
     | 
| 68 | 
         
            +
                def _setup_hf_datasets(self):
                    """Prepare the Hugging Face handles used for persistent storage.

                    On success ``self.hf_dataset_client`` becomes a dict holding the
                    ``Dataset`` class, the ``HfApi`` class and an ``HfApi`` instance
                    created with ``self.hf_token``. On any failure it is reset to
                    ``None`` and the problem is logged instead of raised.
                    """
                    try:
                        # Imported lazily: a missing package is reported as a warning
                        # below rather than breaking the import of this module.
                        from datasets import Dataset
                        from huggingface_hub import HfApi

                        api_handle = HfApi(token=self.hf_token)
                        self.hf_dataset_client = dict(Dataset=Dataset, HfApi=HfApi, api=api_handle)
                        logger.info("✅ HF Datasets client initialized for %s", self.dataset_repo)
                    except ImportError:
                        self.hf_dataset_client = None
                        logger.warning("⚠️ datasets or huggingface-hub not available. Install with: pip install datasets huggingface-hub")
                    except Exception as err:
                        self.hf_dataset_client = None
                        logger.error("Failed to initialize HF Datasets client: %s", err)
         
     | 
| 87 | 
         | 
| 88 | 
         
             
                def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
         
     | 
| 89 | 
         
             
                    """Setup Trackio API client"""
         
     | 
| 
         | 
|
| 123 | 
         
             
                        logger.error("Failed to initialize Trackio API: %s", e)
         
     | 
| 124 | 
         
             
                        self.enable_tracking = False
         
     | 
| 125 | 
         | 
| 126 | 
         
            +
                def _save_to_hf_dataset(self, experiment_data: Dict[str, Any]):
                    """Push a one-row snapshot of the experiment to the HF Dataset repo.

                    Args:
                        experiment_data: Payload stored JSON-encoded in the
                            ``parameters`` column. When it is a dict carrying a
                            ``status`` key (e.g. ``'completed'`` or ``'failed'``),
                            that value is also written to the record's ``status``
                            column; otherwise the status defaults to ``'running'``.

                    Returns:
                        True when the push succeeded. False when no HF client is
                        configured or when building/pushing the record failed (the
                        error is logged, never raised).
                    """
                    if not self.hf_dataset_client:
                        return False

                    try:
                        now = datetime.now()

                        # Callers signal the end of a run through the payload, so the
                        # record must not stay 'running' forever.
                        status = 'running'
                        if isinstance(experiment_data, dict):
                            status = str(experiment_data.get('status', status))

                        # default=str mirrors the local JSON dumps in this class, so
                        # non-JSON-native values do not abort the remote save.
                        record = {
                            'experiment_id': self.experiment_id or "exp_{}".format(now.strftime('%Y%m%d_%H%M%S')),
                            'name': self.experiment_name,
                            'description': "SmolLM3 fine-tuning experiment",
                            'created_at': self.start_time.isoformat(),
                            'status': status,
                            'metrics': json.dumps(self.metrics_history, default=str),
                            'parameters': json.dumps(experiment_data, default=str),
                            'artifacts': json.dumps(self.artifacts, default=str),
                            'logs': json.dumps([]),
                            'last_updated': now.isoformat()
                        }

                        # Build a single-row dataset from the record.
                        Dataset = self.hf_dataset_client['Dataset']
                        dataset = Dataset.from_list([record])

                        # NOTE(review): presumably this push replaces whatever rows the
                        # repo already holds with this single row — confirm whether
                        # records of earlier experiments are meant to be preserved.
                        dataset.push_to_hub(
                            self.dataset_repo,
                            token=self.hf_token,
                            private=True
                        )

                        logger.info("✅ Saved experiment data to %s", self.dataset_repo)
                        return True

                    except Exception as e:
                        logger.error("Failed to save to HF Dataset: %s", e)
                        return False
         
     | 
| 163 | 
         
            +
                
         
     | 
| 164 | 
         
             
                def log_configuration(self, config: Dict[str, Any]):
         
     | 
| 165 | 
         
             
                    """Log experiment configuration"""
         
     | 
| 166 | 
         
             
                    if not self.enable_tracking or not self.log_config_enabled:
         
     | 
| 
         | 
|
| 168 | 
         | 
| 169 | 
         
             
                    try:
         
     | 
| 170 | 
         
             
                        # Log configuration as parameters
         
     | 
| 171 | 
         
            +
                        if self.trackio_client:
         
     | 
| 172 | 
         
            +
                            result = self.trackio_client.log_parameters(
         
     | 
| 173 | 
         
            +
                                experiment_id=self.experiment_id,
         
     | 
| 174 | 
         
            +
                                parameters=config
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 175 | 
         
             
                            )
         
     | 
| 
         | 
|
| 
         | 
|
| 176 | 
         | 
| 177 | 
         
            +
                            if "success" in result:
         
     | 
| 178 | 
         
            +
                                logger.info("Configuration logged to Trackio")
         
     | 
| 179 | 
         
            +
                            else:
         
     | 
| 180 | 
         
            +
                                logger.error("Failed to log configuration: %s", result)
         
     | 
| 181 | 
         
            +
                        
         
     | 
| 182 | 
         
            +
                        # Save to HF Dataset
         
     | 
| 183 | 
         
            +
                        self._save_to_hf_dataset(config)
         
     | 
| 184 | 
         
            +
                        
         
     | 
| 185 | 
         
            +
                        # Also save config locally
         
     | 
| 186 | 
         
            +
                        config_path = "config_{}_{}.json".format(
         
     | 
| 187 | 
         
            +
                            self.experiment_name, 
         
     | 
| 188 | 
         
            +
                            self.start_time.strftime('%Y%m%d_%H%M%S')
         
     | 
| 189 | 
         
            +
                        )
         
     | 
| 190 | 
         
            +
                        with open(config_path, 'w') as f:
         
     | 
| 191 | 
         
            +
                            json.dump(config, f, indent=2, default=str)
         
     | 
| 192 | 
         
            +
                        
         
     | 
| 193 | 
         
            +
                        self.artifacts.append(config_path)
         
     | 
| 194 | 
         
            +
                        logger.info("Configuration saved to %s", config_path)
         
     | 
| 195 | 
         | 
| 196 | 
         
             
                    except Exception as e:
         
     | 
| 197 | 
         
             
                        logger.error("Failed to log configuration: %s", e)
         
     | 
| 
         | 
|
| 212 | 
         
             
                            metrics['step'] = step
         
     | 
| 213 | 
         | 
| 214 | 
         
             
                        # Log to Trackio
         
     | 
| 215 | 
         
            +
                        if self.trackio_client:
         
     | 
| 216 | 
         
            +
                            result = self.trackio_client.log_metrics(
         
     | 
| 217 | 
         
            +
                                experiment_id=self.experiment_id,
         
     | 
| 218 | 
         
            +
                                metrics=metrics,
         
     | 
| 219 | 
         
            +
                                step=step
         
     | 
| 220 | 
         
            +
                            )
         
     | 
| 221 | 
         
            +
                            
         
     | 
| 222 | 
         
            +
                            if "success" in result:
         
     | 
| 223 | 
         
            +
                                logger.debug("Metrics logged to Trackio")
         
     | 
| 224 | 
         
            +
                            else:
         
     | 
| 225 | 
         
            +
                                logger.error("Failed to log metrics to Trackio: %s", result)
         
     | 
| 226 | 
         | 
| 227 | 
         
            +
                        # Store locally
         
     | 
| 228 | 
         
            +
                        self.metrics_history.append(metrics)
         
     | 
| 229 | 
         
            +
                        
         
     | 
| 230 | 
         
            +
                        # Save to HF Dataset periodically
         
     | 
| 231 | 
         
            +
                        if len(self.metrics_history) % 10 == 0:  # Save every 10 metrics
         
     | 
| 232 | 
         
            +
                            self._save_to_hf_dataset({'metrics': self.metrics_history})
         
     | 
| 233 | 
         
            +
                        
         
     | 
| 234 | 
         
            +
                        logger.debug("Metrics logged: %s", metrics)
         
     | 
| 235 | 
         | 
| 236 | 
         
             
                    except Exception as e:
         
     | 
| 237 | 
         
             
                        logger.error("Failed to log metrics: %s", e)
         
     | 
| 
         | 
|
| 250 | 
         
             
                            "checkpoint_size": os.path.getsize(checkpoint_path) if os.path.exists(checkpoint_path) else 0
         
     | 
| 251 | 
         
             
                        }
         
     | 
| 252 | 
         | 
| 253 | 
         
            +
                        if self.trackio_client:
         
     | 
| 254 | 
         
            +
                            result = self.trackio_client.log_parameters(
         
     | 
| 255 | 
         
            +
                                experiment_id=self.experiment_id,
         
     | 
| 256 | 
         
            +
                                parameters=checkpoint_info
         
     | 
| 257 | 
         
            +
                            )
         
     | 
| 258 | 
         
            +
                            
         
     | 
| 259 | 
         
            +
                            if "success" in result:
         
     | 
| 260 | 
         
            +
                                logger.info("Checkpoint logged to Trackio")
         
     | 
| 261 | 
         
            +
                            else:
         
     | 
| 262 | 
         
            +
                                logger.error("Failed to log checkpoint to Trackio: %s", result)
         
     | 
| 263 | 
         | 
| 264 | 
         
            +
                        self.artifacts.append(checkpoint_path)
         
     | 
| 265 | 
         
            +
                        logger.info("Checkpoint logged: %s", checkpoint_path)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 266 | 
         | 
| 267 | 
         
             
                    except Exception as e:
         
     | 
| 268 | 
         
             
                        logger.error("Failed to log checkpoint: %s", e)
         
     | 
| 
         | 
|
| 332 | 
         
             
                        summary['experiment_duration_seconds'] = duration
         
     | 
| 333 | 
         
             
                        summary['experiment_duration_hours'] = duration / 3600
         
     | 
| 334 | 
         | 
| 335 | 
         
            +
                        # Log final summary to Trackio
         
     | 
| 336 | 
         
            +
                        if self.trackio_client:
         
     | 
| 337 | 
         
            +
                            result = self.trackio_client.log_parameters(
         
     | 
| 338 | 
         
            +
                                experiment_id=self.experiment_id,
         
     | 
| 339 | 
         
            +
                                parameters=summary
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 340 | 
         
             
                            )
         
     | 
| 
         | 
|
| 
         | 
|
| 341 | 
         | 
| 342 | 
         
            +
                            if "success" in result:
         
     | 
| 343 | 
         
            +
                                logger.info("Training summary logged to Trackio")
         
     | 
| 344 | 
         
            +
                            else:
         
     | 
| 345 | 
         
            +
                                logger.error("Failed to log training summary to Trackio: %s", result)
         
     | 
| 346 | 
         
            +
                        
         
     | 
| 347 | 
         
            +
                        # Save to HF Dataset
         
     | 
| 348 | 
         
            +
                        self._save_to_hf_dataset(summary)
         
     | 
| 349 | 
         
            +
                        
         
     | 
| 350 | 
         
            +
                        # Save summary locally
         
     | 
| 351 | 
         
            +
                        summary_path = "training_summary_{}_{}.json".format(
         
     | 
| 352 | 
         
            +
                            self.experiment_name,
         
     | 
| 353 | 
         
            +
                            self.start_time.strftime('%Y%m%d_%H%M%S')
         
     | 
| 354 | 
         
            +
                        )
         
     | 
| 355 | 
         
            +
                        with open(summary_path, 'w') as f:
         
     | 
| 356 | 
         
            +
                            json.dump(summary, f, indent=2, default=str)
         
     | 
| 357 | 
         
            +
                        
         
     | 
| 358 | 
         
            +
                        self.artifacts.append(summary_path)
         
     | 
| 359 | 
         
            +
                        logger.info("Training summary logged and saved to %s", summary_path)
         
     | 
| 360 | 
         | 
| 361 | 
         
             
                    except Exception as e:
         
     | 
| 362 | 
         
             
                        logger.error("Failed to log training summary: %s", e)
         
     | 
| 
         | 
|
| 449 | 
         
             
                                logger.error("Failed to close monitoring session: %s", result)
         
     | 
| 450 | 
         
             
                        except Exception as e:
         
     | 
| 451 | 
         
             
                            logger.error("Failed to close monitoring session: %s", e)
         
     | 
| 452 | 
         
            +
                    
         
     | 
| 453 | 
         
            +
                    # Final save to HF Dataset
         
     | 
| 454 | 
         
            +
                    if self.hf_dataset_client:
         
     | 
| 455 | 
         
            +
                        self._save_to_hf_dataset({'status': 'completed'})
         
     | 
| 456 | 
         | 
| 457 | 
         
             
            # Utility function to create monitor from config
         
     | 
| 458 | 
         
             
            def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
         
     | 
| 
         | 
|
| 467 | 
         
             
                    enable_tracking=getattr(config, 'enable_tracking', True),
         
     | 
| 468 | 
         
             
                    log_artifacts=getattr(config, 'log_artifacts', True),
         
     | 
| 469 | 
         
             
                    log_metrics=getattr(config, 'log_metrics', True),
         
     | 
| 470 | 
         
            +
                    log_config=getattr(config, 'log_config', True),
         
     | 
| 471 | 
         
            +
                    hf_token=getattr(config, 'hf_token', None),
         
     | 
| 472 | 
         
            +
                    dataset_repo=getattr(config, 'dataset_repo', None)
         
     | 
| 473 | 
         
             
                ) 
         
     | 
    	
        train.py → src/train.py
    RENAMED
    
    | 
         @@ -20,6 +20,7 @@ from config import get_config 
     | 
|
| 20 | 
         
             
            from model import SmolLM3Model
         
     | 
| 21 | 
         
             
            from data import SmolLM3Dataset
         
     | 
| 22 | 
         
             
            from trainer import SmolLM3Trainer
         
     | 
| 
         | 
|
| 23 | 
         | 
| 24 | 
         
             
            def setup_logging():
         
     | 
| 25 | 
         
             
                """Setup logging configuration"""
         
     | 
| 
         @@ -86,6 +87,12 @@ def parse_args(): 
     | 
|
| 86 | 
         
             
                parser.add_argument('--experiment_name', type=str, default=None,
         
     | 
| 87 | 
         
             
                                   help='Custom experiment name for tracking')
         
     | 
| 88 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 89 | 
         
             
                return parser.parse_args()
         
     | 
| 90 | 
         | 
| 91 | 
         
             
            def main():
         
     | 
| 
         @@ -119,6 +126,12 @@ def main(): 
     | 
|
| 119 | 
         
             
                if args.experiment_name is not None:
         
     | 
| 120 | 
         
             
                    config.experiment_name = args.experiment_name
         
     | 
| 121 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 122 | 
         
             
                # Setup paths
         
     | 
| 123 | 
         
             
                output_path = args.out_dir
         
     | 
| 124 | 
         | 
| 
         @@ -127,6 +140,22 @@ def main(): 
     | 
|
| 127 | 
         | 
| 128 | 
         
             
                logger.info(f"Output path: {output_path}")
         
     | 
| 129 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 130 | 
         
             
                # Initialize model
         
     | 
| 131 | 
         
             
                model = SmolLM3Model(
         
     | 
| 132 | 
         
             
                    model_name=args.model_name,
         
     | 
| 
         @@ -162,13 +191,60 @@ def main(): 
     | 
|
| 162 | 
         
             
                    init_from=args.init_from
         
     | 
| 163 | 
         
             
                )
         
     | 
| 164 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 165 | 
         
             
                # Start training
         
     | 
| 166 | 
         
             
                try:
         
     | 
| 167 | 
         
             
                    trainer.train()
         
     | 
| 168 | 
         
             
                    logger.info("Training completed successfully!")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 169 | 
         
             
                except Exception as e:
         
     | 
| 170 | 
         
             
                    logger.error(f"Training failed: {e}")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 171 | 
         
             
                    raise
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 172 | 
         | 
| 173 | 
         
             
            if __name__ == '__main__':
         
     | 
| 174 | 
         
             
                main() 
         
     | 
| 
         | 
|
| 20 | 
         
             
            from model import SmolLM3Model
         
     | 
| 21 | 
         
             
            from data import SmolLM3Dataset
         
     | 
| 22 | 
         
             
            from trainer import SmolLM3Trainer
         
     | 
| 23 | 
         
            +
            from monitoring import create_monitor_from_config
         
     | 
| 24 | 
         | 
| 25 | 
         
             
            def setup_logging():
         
     | 
| 26 | 
         
             
                """Setup logging configuration"""
         
     | 
| 
         | 
|
| 87 | 
         
             
                parser.add_argument('--experiment_name', type=str, default=None,
         
     | 
| 88 | 
         
             
                                   help='Custom experiment name for tracking')
         
     | 
| 89 | 
         | 
| 90 | 
         
            +
                # HF Datasets arguments
         
     | 
| 91 | 
         
            +
                parser.add_argument('--hf_token', type=str, default=None,
         
     | 
| 92 | 
         
            +
                                   help='Hugging Face token for dataset access')
         
     | 
| 93 | 
         
            +
                parser.add_argument('--dataset_repo', type=str, default=None,
         
     | 
| 94 | 
         
            +
                                   help='HF Dataset repository for experiment storage')
         
     | 
| 95 | 
         
            +
                
         
     | 
| 96 | 
         
             
                return parser.parse_args()
         
     | 
| 97 | 
         | 
| 98 | 
         
             
            def main():
         
     | 
| 
         | 
|
| 126 | 
         
             
                if args.experiment_name is not None:
         
     | 
| 127 | 
         
             
                    config.experiment_name = args.experiment_name
         
     | 
| 128 | 
         | 
| 129 | 
         
            +
                # Override HF Datasets configuration
         
     | 
| 130 | 
         
            +
                if args.hf_token is not None:
         
     | 
| 131 | 
         
            +
                    os.environ['HF_TOKEN'] = args.hf_token
         
     | 
| 132 | 
         
            +
                if args.dataset_repo is not None:
         
     | 
| 133 | 
         
            +
                    os.environ['TRACKIO_DATASET_REPO'] = args.dataset_repo
         
     | 
| 134 | 
         
            +
                
         
     | 
| 135 | 
         
             
                # Setup paths
         
     | 
| 136 | 
         
             
                output_path = args.out_dir
         
     | 
| 137 | 
         | 
| 
         | 
|
| 140 | 
         | 
| 141 | 
         
             
                logger.info(f"Output path: {output_path}")
         
     | 
| 142 | 
         | 
| 143 | 
         
            +
                # Initialize monitoring
         
     | 
| 144 | 
         
            +
                monitor = None
         
     | 
| 145 | 
         
            +
                if config.enable_tracking:
         
     | 
| 146 | 
         
            +
                    try:
         
     | 
| 147 | 
         
            +
                        monitor = create_monitor_from_config(config, args.experiment_name)
         
     | 
| 148 | 
         
            +
                        logger.info(f"✅ Monitoring initialized for experiment: {monitor.experiment_name}")
         
     | 
| 149 | 
         
            +
                        logger.info(f"📊 Dataset repository: {monitor.dataset_repo}")
         
     | 
| 150 | 
         
            +
                        
         
     | 
| 151 | 
         
            +
                        # Log configuration
         
     | 
| 152 | 
         
            +
                        config_dict = {k: v for k, v in vars(config).items() if not k.startswith('_')}
         
     | 
| 153 | 
         
            +
                        monitor.log_configuration(config_dict)
         
     | 
| 154 | 
         
            +
                        
         
     | 
| 155 | 
         
            +
                    except Exception as e:
         
     | 
| 156 | 
         
            +
                        logger.error(f"Failed to initialize monitoring: {e}")
         
     | 
| 157 | 
         
            +
                        logger.warning("Continuing without monitoring...")
         
     | 
| 158 | 
         
            +
                
         
     | 
| 159 | 
         
             
                # Initialize model
         
     | 
| 160 | 
         
             
                model = SmolLM3Model(
         
     | 
| 161 | 
         
             
                    model_name=args.model_name,
         
     | 
| 
         | 
|
| 191 | 
         
             
                    init_from=args.init_from
         
     | 
| 192 | 
         
             
                )
         
     | 
| 193 | 
         | 
| 194 | 
         
            +
                # Add monitoring callback if available
         
     | 
| 195 | 
         
            +
                if monitor:
         
     | 
| 196 | 
         
            +
                    try:
         
     | 
| 197 | 
         
            +
                        callback = monitor.create_monitoring_callback()
         
     | 
| 198 | 
         
            +
                        trainer.add_callback(callback)
         
     | 
| 199 | 
         
            +
                        logger.info("✅ Monitoring callback added to trainer")
         
     | 
| 200 | 
         
            +
                    except Exception as e:
         
     | 
| 201 | 
         
            +
                        logger.error(f"Failed to add monitoring callback: {e}")
         
     | 
| 202 | 
         
            +
                
         
     | 
| 203 | 
         
             
                # Start training
         
     | 
| 204 | 
         
             
                try:
         
     | 
| 205 | 
         
             
                    trainer.train()
         
     | 
| 206 | 
         
             
                    logger.info("Training completed successfully!")
         
     | 
| 207 | 
         
            +
                    
         
     | 
| 208 | 
         
            +
                    # Log training summary
         
     | 
| 209 | 
         
            +
                    if monitor:
         
     | 
| 210 | 
         
            +
                        try:
         
     | 
| 211 | 
         
            +
                            summary = {
         
     | 
| 212 | 
         
            +
                                'final_loss': getattr(trainer, 'final_loss', None),
         
     | 
| 213 | 
         
            +
                                'total_steps': getattr(trainer, 'total_steps', None),
         
     | 
| 214 | 
         
            +
                                'training_duration': getattr(trainer, 'training_duration', None),
         
     | 
| 215 | 
         
            +
                                'model_path': output_path,
         
     | 
| 216 | 
         
            +
                                'config_file': args.config
         
     | 
| 217 | 
         
            +
                            }
         
     | 
| 218 | 
         
            +
                            monitor.log_training_summary(summary)
         
     | 
| 219 | 
         
            +
                            logger.info("✅ Training summary logged")
         
     | 
| 220 | 
         
            +
                        except Exception as e:
         
     | 
| 221 | 
         
            +
                            logger.error(f"Failed to log training summary: {e}")
         
     | 
| 222 | 
         
            +
                    
         
     | 
| 223 | 
         
             
                except Exception as e:
         
     | 
| 224 | 
         
             
                    logger.error(f"Training failed: {e}")
         
     | 
| 225 | 
         
            +
                    
         
     | 
| 226 | 
         
            +
                    # Log error to monitoring
         
     | 
| 227 | 
         
            +
                    if monitor:
         
     | 
| 228 | 
         
            +
                        try:
         
     | 
| 229 | 
         
            +
                            error_summary = {
         
     | 
| 230 | 
         
            +
                                'error': str(e),
         
     | 
| 231 | 
         
            +
                                'status': 'failed',
         
     | 
| 232 | 
         
            +
                                'model_path': output_path,
         
     | 
| 233 | 
         
            +
                                'config_file': args.config
         
     | 
| 234 | 
         
            +
                            }
         
     | 
| 235 | 
         
            +
                            monitor.log_training_summary(error_summary)
         
     | 
| 236 | 
         
            +
                        except Exception as log_error:
         
     | 
| 237 | 
         
            +
                            logger.error(f"Failed to log error to monitoring: {log_error}")
         
     | 
| 238 | 
         
            +
                    
         
     | 
| 239 | 
         
             
                    raise
         
     | 
| 240 | 
         
            +
                finally:
         
     | 
| 241 | 
         
            +
                    # Close monitoring
         
     | 
| 242 | 
         
            +
                    if monitor:
         
     | 
| 243 | 
         
            +
                        try:
         
     | 
| 244 | 
         
            +
                            monitor.close()
         
     | 
| 245 | 
         
            +
                            logger.info("✅ Monitoring session closed")
         
     | 
| 246 | 
         
            +
                        except Exception as e:
         
     | 
| 247 | 
         
            +
                            logger.error(f"Failed to close monitoring: {e}")
         
     | 
| 248 | 
         | 
| 249 | 
         
             
            if __name__ == '__main__':
         
     | 
| 250 | 
         
             
                main() 
         
     | 
    	
        trainer.py → src/trainer.py
    RENAMED
    
    | 
         
            File without changes
         
     | 
    	
        templates/datasets/readme.md
    ADDED
    
    | 
         
            File without changes
         
     |