	Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py CHANGED (+31 -8)
@@ -2,17 +2,20 @@
 # -*- coding: utf-8 -*-
 
 """
-Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-bnb-4bit using unsloth
+Fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit using unsloth
 RESEARCH TRAINING PHASE ONLY - No output generation
 WORKS WITH PRE-TOKENIZED DATASET - No re-tokenization
+OPTIMIZED FOR L40S GPU (48GB VRAM)
 """
 
 # Set critical environment variables before any imports
 import os
-# Configure PyTorch memory allocator for better memory management with 
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+# Configure PyTorch memory allocator for better memory management with L40S GPU
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
 os.environ["XFORMERS_DISABLED"] = "1"
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
+# L40S-specific CUDA optimization
+os.environ["CUDA_AUTO_BOOST"] = "1"
 
 import json
 import logging
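The allocator settings only take effect if they are in the environment before torch makes its first CUDA allocation, which is why the commit keeps them above every other import. A minimal sketch of the same ordering, with a quick runtime check; the probe allocation and the printed stat key are illustrative, not part of the commit:

# Sketch: the caching-allocator config must be set before torch is
# imported, or at least before the first CUDA allocation happens.
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"

import torch  # imported only after the allocator is configured

if torch.cuda.is_available():
    torch.zeros(1, device="cuda")      # force the first allocation
    stats = torch.cuda.memory_stats()  # reported by the active allocator
    print(stats["allocated_bytes.all.current"])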
@@ -597,10 +600,19 @@ def train(config_path, dataset_name, output_dir):
         # Initialize ds_config_path to None before checking
         ds_config_path = None
 
-        # Optimize batch size for 
-
-
-
+        # Optimize batch size for L40S GPU
+        gpu_info = torch.cuda.get_device_properties(0)
+        logger.info(f"GPU Model: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
+
+        # For L40S GPU, we can use a larger batch size and shard model across the single GPU
+        if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:  # Check if it's L40S (>40GB VRAM)
+            logger.info("Detected L40S GPU - optimizing for high-memory GPU")
+            per_device_train_batch_size = training_config.get("per_device_train_batch_size", 6)
+            logger.info(f"Using optimized batch size for L40S: {per_device_train_batch_size}")
+        else:
+            # Default to a smaller batch size for other GPUs
+            per_device_train_batch_size = 2
+            logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
 
         # Check if DeepSpeed config is available and if MPI is disabled
         deepspeed_config = config.get("deepspeed_config", None)
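The new branch keys off both the device name and the reported VRAM, so any card above 40 GB (an A100 80GB, say) takes the large-batch path, not only the L40S. A self-contained sketch of the heuristic; pick_batch_size is an illustrative name, not a helper in the script:

# Illustrative helper mirroring the detection logic in the commit.
import torch

def pick_batch_size(large: int = 6, small: int = 2) -> int:
    if not torch.cuda.is_available():
        return small
    props = torch.cuda.get_device_properties(0)
    # Match the L40S by name, or any GPU reporting more than ~40 GB of VRAM.
    if "L40S" in props.name or props.total_memory > 40e9:
        return large
    return small

print(f"per-device batch size: {pick_batch_size()}")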
@@ -617,6 +629,17 @@ def train(config_path, dataset_name, output_dir):
            if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
                deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
 
+           # L40S-specific optimization: Enable ZeRO stage 2 with CPU offloading
+           if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
+               logger.info("Configuring DeepSpeed specifically for L40S GPU")
+               # Adjust ZeRO stage for L40S (48GB VRAM)
+               deepspeed_config["zero_optimization"]["stage"] = 2
+               # Enable CPU offloading for optimizer states to save GPU memory
+               deepspeed_config["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
+               # Adjust communication efficiency for single high-end GPU
+               deepspeed_config["reduce_bucket_size"] = 1e9
+               deepspeed_config["allgather_bucket_size"] = 1e9
+
            # Ensure communication backend is set to avoid MPI
            if "communication_data_type" not in deepspeed_config:
                deepspeed_config["communication_data_type"] = "fp16"
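Two details are worth flagging in this hunk. DeepSpeed defines train_batch_size as micro-batch size × gradient-accumulation steps × world size, so resolving "auto" as per_device_train_batch_size * gpu_count assumes a single accumulation step. And reduce_bucket_size / allgather_bucket_size normally live inside the zero_optimization section of a DeepSpeed config rather than at the top level, while the nested offload write raises a KeyError if the loaded JSON has no offload_optimizer block. A hedged sketch of the ZeRO-2 shape this branch appears to be aiming for:

# Sketch only: bucket sizes placed under zero_optimization, where DeepSpeed
# reads them, and setdefault() guarding the nested offload write.
deepspeed_config: dict = {"train_batch_size": "auto"}  # stand-in for the loaded JSON

zero = deepspeed_config.setdefault("zero_optimization", {})
zero["stage"] = 2
zero.setdefault("offload_optimizer", {})["device"] = "cpu"
zero["reduce_bucket_size"] = 1e9
zero["allgather_bucket_size"] = 1e9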
@@ -764,7 +787,7 @@ def train(config_path, dataset_name, output_dir):
         remove_training_marker()
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-4bit model (RESEARCH ONLY)")
+    parser = argparse.ArgumentParser(description="Fine-tune Unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit model (RESEARCH ONLY)")
     parser.add_argument("--config", type=str, default="transformers_config.json", 
                         help="Path to the transformers config JSON file")
     parser.add_argument("--dataset", type=str, default="phi4-cognitive-dataset", 
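Both flags carry defaults, so the script can be launched bare; the explicit equivalent, using only the arguments visible in this hunk (the argument list is truncated here, so further flags may follow), is:

python run_cloud_training.py --config transformers_config.json --dataset phi4-cognitive-dataset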