George-API committed
Commit 3f693be · verified · 1 Parent(s): f3ab403

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py  +15 −51
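For context, the commit message says the file was pushed with the huggingface_hub client. A minimal sketch of such an upload is shown below; the repo id, repo type, and token handling are placeholders and assumptions, not details taken from this commit:

from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="run_cloud_training.py",   # local file to push
    path_in_repo="run_cloud_training.py",      # destination path inside the repo
    repo_id="George-API/your-space-name",      # placeholder repo id
    repo_type="space",                         # assumption: the target is a Space
    commit_message="Upload run_cloud_training.py with huggingface_hub",
)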
run_cloud_training.py CHANGED
@@ -18,8 +18,8 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # L40S-specific CUDA optimization
 os.environ["CUDA_AUTO_BOOST"] = "1"
 
-# Explicitly disable DeepSpeed MPI requirement
-os.environ["DEEPSPEED_MPI_REQUIRED"] = "0"
+# Completely disable DeepSpeed for Hugging Face Spaces to avoid compatibility issues
+os.environ["DISABLE_DEEPSPEED"] = "1"
 
 import json
 import logging
@@ -46,24 +46,16 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-# Set up DeepSpeed without requiring MPI
+# Set up environment variables
 os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "9994"
 os.environ["RANK"] = "0"
 os.environ["LOCAL_RANK"] = "0"
 os.environ["WORLD_SIZE"] = "1"
 
-# Try to import deepspeed, with fallback for environments without MPI
+# DeepSpeed is disabled for Hugging Face Spaces due to compatibility issues
+logger.info("DeepSpeed is disabled for Hugging Face Spaces to avoid compatibility issues")
 deepspeed_available = False
-try:
-    import deepspeed
-    deepspeed_available = True
-    logger.info("DeepSpeed successfully imported")
-except ImportError as e:
-    logger.warning(f"Failed to import DeepSpeed: {e}")
-    logger.warning("Will continue without DeepSpeed support")
-    # Set a flag to disable DeepSpeed
-    os.environ["DISABLE_DEEPSPEED"] = "1"
 
 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
@@ -611,35 +603,11 @@ def train(config_path, dataset_name, output_dir):
         logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
 
     # Check if DeepSpeed config is available and if DeepSpeed is available
-    deepspeed_config = config.get("deepspeed_config", None)
-    if deepspeed_config and deepspeed_available and os.environ.get("DISABLE_DEEPSPEED", "0") != "1":
-        logger.info("DeepSpeed configuration found - enabling DeepSpeed for distributed training")
-
-        # Create a temporary DeepSpeed config file
-        ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
-
-        # Update DeepSpeed config with dynamic values
-        if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
-            deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
-
-        if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
-            deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
-
-        # Write the DeepSpeed config to a file
-        with open(ds_config_path, 'w') as f:
-            json.dump(deepspeed_config, f, indent=2)
-
-        logger.info(f"Created DeepSpeed config at {ds_config_path}")
-        # Set using_deepspeed flag
-        using_deepspeed = True
-    elif os.environ.get("DISABLE_DEEPSPEED", "0") == "1":
-        logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
-        ds_config_path = None
-        using_deepspeed = False
-    else:
-        logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
-        ds_config_path = None
-        using_deepspeed = False
+    # Note: DeepSpeed is now disabled by default for HF Spaces
+    deepspeed_config = None
+    logger.info("DeepSpeed is disabled for Hugging Face Spaces to avoid compatibility issues")
+    ds_config_path = None
+    using_deepspeed = False
 
     # Initialize model with our safe loading function
    logger.info("Loading pre-quantized model with eager attention")
@@ -707,22 +675,18 @@ def train(config_path, dataset_name, output_dir):
     }
 
     # Add DeepSpeed config path if available and enabled
-    if using_deepspeed and ds_config_path:
-        logger.info("Adding DeepSpeed configuration to training arguments")
-        training_args_dict["deepspeed"] = ds_config_path
-    else:
-        logger.info("DeepSpeed is disabled - using standard distributed training")
+    # DeepSpeed is disabled for Hugging Face Spaces
+    logger.info("DeepSpeed is disabled - using standard training")
 
     # Create TrainingArguments with validated parameters
     try:
         training_args = TrainingArguments(**training_args_dict)
     except Exception as e:
-        logger.error(f"Failed to create training arguments with DeepSpeed: {e}")
+        logger.error(f"Failed to create training arguments: {e}")
         if "deepspeed" in training_args_dict:
-            logger.warning("Removing DeepSpeed configuration and trying again")
+            logger.warning("Removing any DeepSpeed configuration")
             del training_args_dict["deepspeed"]
-            training_args = TrainingArguments(**training_args_dict)
-            using_deepspeed = False
+        training_args = TrainingArguments(**training_args_dict)
 
     # Create trainer with pre-tokenized collator
     trainer = Trainer(
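
Taken together, the hunks above reduce the DeepSpeed handling in train() to roughly the following. This is a sketch stitched from the surviving lines of the diff, not the full file; surrounding code and the construction of training_args_dict are omitted:

# DeepSpeed is hard-disabled; no ds_config_temp.json is written anymore
deepspeed_config = None
ds_config_path = None
using_deepspeed = False
logger.info("DeepSpeed is disabled for Hugging Face Spaces to avoid compatibility issues")

# TrainingArguments is built without a "deepspeed" entry; if construction still
# fails and a stale "deepspeed" key is present, drop it and retry once
try:
    training_args = TrainingArguments(**training_args_dict)
except Exception as e:
    logger.error(f"Failed to create training arguments: {e}")
    if "deepspeed" in training_args_dict:
        logger.warning("Removing any DeepSpeed configuration")
        del training_args_dict["deepspeed"]
    training_args = TrainingArguments(**training_args_dict)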