Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  (+15 -51)
@@ -18,8 +18,8 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # L40S-specific CUDA optimization
 os.environ["CUDA_AUTO_BOOST"] = "1"

-#
-os.environ["
+# Completely disable DeepSpeed for Hugging Face Spaces to avoid compatibility issues
+os.environ["DISABLE_DEEPSPEED"] = "1"

 import json
 import logging
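Note on placement: the new kill switch is exported near the top of the file, ahead of the remaining imports, so any later code that checks DISABLE_DEEPSPEED sees it already set. A minimal sketch of the pattern, with a hypothetical helper (not part of the script) for reading the flag:

import os

# Export the kill switch before importing anything that probes for DeepSpeed.
os.environ["DISABLE_DEEPSPEED"] = "1"

def deepspeed_enabled() -> bool:
    # Illustrative helper: one central place to read the flag later on.
    return os.environ.get("DISABLE_DEEPSPEED", "0") != "1"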
@@ -46,24 +46,16 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

-# Set up
+# Set up environment variables
 os.environ["MASTER_ADDR"] = "localhost"
 os.environ["MASTER_PORT"] = "9994"
 os.environ["RANK"] = "0"
 os.environ["LOCAL_RANK"] = "0"
 os.environ["WORLD_SIZE"] = "1"

-#
+# DeepSpeed is disabled for Hugging Face Spaces due to compatibility issues
+logger.info("DeepSpeed is disabled for Hugging Face Spaces to avoid compatibility issues")
 deepspeed_available = False
-try:
-    import deepspeed
-    deepspeed_available = True
-    logger.info("DeepSpeed successfully imported")
-except ImportError as e:
-    logger.warning(f"Failed to import DeepSpeed: {e}")
-    logger.warning("Will continue without DeepSpeed support")
-    # Set a flag to disable DeepSpeed
-    os.environ["DISABLE_DEEPSPEED"] = "1"

 # Disable all attention optimizations that might cause issues
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
@@ -611,35 +603,11 @@ def train(config_path, dataset_name, output_dir):
     logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")

     # Check if DeepSpeed config is available and if DeepSpeed is available
-
-
-
-
-
-        ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
-
-        # Update DeepSpeed config with dynamic values
-        if isinstance(deepspeed_config.get("train_micro_batch_size_per_gpu"), str) and deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
-            deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
-
-        if isinstance(deepspeed_config.get("train_batch_size"), str) and deepspeed_config.get("train_batch_size") == "auto":
-            deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
-
-        # Write the DeepSpeed config to a file
-        with open(ds_config_path, 'w') as f:
-            json.dump(deepspeed_config, f, indent=2)
-
-        logger.info(f"Created DeepSpeed config at {ds_config_path}")
-        # Set using_deepspeed flag
-        using_deepspeed = True
-    elif os.environ.get("DISABLE_DEEPSPEED", "0") == "1":
-        logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
-        ds_config_path = None
-        using_deepspeed = False
-    else:
-        logger.warning("DeepSpeed is disabled - using standard training without DeepSpeed")
-        ds_config_path = None
-        using_deepspeed = False
+    # Note: DeepSpeed is now disabled by default for HF Spaces
+    deepspeed_config = None
+    logger.info("DeepSpeed is disabled for Hugging Face Spaces to avoid compatibility issues")
+    ds_config_path = None
+    using_deepspeed = False

     # Initialize model with our safe loading function
     logger.info("Loading pre-quantized model with eager attention")
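For reference, the deleted branch existed to resolve DeepSpeed's "auto" placeholders into concrete batch sizes and write the result to ds_config_temp.json. Distilled from the removed lines into a standalone function (the materialize_ds_config name is mine, not the script's):

import json
import os

def materialize_ds_config(deepspeed_config, output_dir,
                          per_device_train_batch_size, gpu_count):
    # Swap the literal string "auto" for values computed at runtime...
    if deepspeed_config.get("train_micro_batch_size_per_gpu") == "auto":
        deepspeed_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size
    if deepspeed_config.get("train_batch_size") == "auto":
        deepspeed_config["train_batch_size"] = per_device_train_batch_size * gpu_count
    # ...then persist the config where the Trainer can pick it up.
    ds_config_path = os.path.join(output_dir, "ds_config_temp.json")
    with open(ds_config_path, "w") as f:
        json.dump(deepspeed_config, f, indent=2)
    return ds_config_path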
@@ -707,22 +675,18 @@ def train(config_path, dataset_name, output_dir):
     }

     # Add DeepSpeed config path if available and enabled
-
-
-        training_args_dict["deepspeed"] = ds_config_path
-    else:
-        logger.info("DeepSpeed is disabled - using standard distributed training")
+    # DeepSpeed is disabled for Hugging Face Spaces
+    logger.info("DeepSpeed is disabled - using standard training")

     # Create TrainingArguments with validated parameters
     try:
         training_args = TrainingArguments(**training_args_dict)
     except Exception as e:
-        logger.error(f"Failed to create training arguments
+        logger.error(f"Failed to create training arguments: {e}")
         if "deepspeed" in training_args_dict:
-            logger.warning("Removing DeepSpeed configuration
+            logger.warning("Removing any DeepSpeed configuration")
             del training_args_dict["deepspeed"]
-
-            using_deepspeed = False
+            training_args = TrainingArguments(**training_args_dict)

     # Create trainer with pre-tokenized collator
     trainer = Trainer(
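One real bug fix hides in this hunk: the old except branch deleted the "deepspeed" key but never retried, so training_args was left unbound and the Trainer(...) call below would then fail with a NameError. The added retry line closes that hole. The same fallback, written as a self-contained helper (name is illustrative):

from transformers import TrainingArguments

def build_training_args(training_args_dict):
    # Try the full argument set first; on failure, drop the optional
    # DeepSpeed entry and retry once before propagating the error.
    try:
        return TrainingArguments(**training_args_dict)
    except Exception:
        training_args_dict.pop("deepspeed", None)
        return TrainingArguments(**training_args_dict)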