more checks and fixes for deepspeed and fsdp (#1208) [skip ci]

- deepspeed_configs/zero1.json  +0 -9
- deepspeed_configs/zero2.json  +0 -9
- deepspeed_configs/zero3.json  +0 -9
- deepspeed_configs/zero3_bf16.json  +0 -9
- src/axolotl/utils/config.py  +28 -20
- src/axolotl/utils/models.py  +10 -8
deepspeed_configs/zero1.json CHANGED

@@ -15,15 +15,6 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
   "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
deepspeed_configs/zero2.json CHANGED

@@ -19,15 +19,6 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
   "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
deepspeed_configs/zero3.json CHANGED

@@ -23,15 +23,6 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
   "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
deepspeed_configs/zero3_bf16.json CHANGED

@@ -23,15 +23,6 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
   "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
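Note: the edit is identical across all four shipped ZeRO configs. With the hard-coded AdamW `optimizer` block removed, DeepSpeed no longer constructs its own optimizer and instead wraps the one the training loop builds from the axolotl config, which also avoids tripping the new conflicting-optimizer warning added in config.py below. For anyone carrying a custom DeepSpeed config, a minimal sketch of the same cleanup (the file path is hypothetical, not part of this PR):

import json
from pathlib import Path

# Drop the hard-coded optimizer block from a custom DeepSpeed config so the
# optimizer selected in the axolotl YAML is used instead of DeepSpeed's AdamW.
path = Path("deepspeed_configs/custom.json")  # hypothetical path
ds_cfg = json.loads(path.read_text(encoding="utf-8"))
ds_cfg.pop("optimizer", None)  # no-op if the block is already absent
path.write_text(json.dumps(ds_cfg, indent=2) + "\n", encoding="utf-8")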
src/axolotl/utils/config.py CHANGED

@@ -95,7 +95,7 @@ def normalize_config(cfg):
         save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
         if save_steps < 1.0:  # prevent saves on every step
             cfg.save_steps = save_steps
-    if cfg.evals_per_epoch:
+    if (cfg.val_set_size or cfg.test_datasets) and cfg.evals_per_epoch:
         eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
         if eval_steps < 1.0:  # prevent evals on every step
             cfg.eval_steps = eval_steps

@@ -485,35 +485,43 @@ def validate_config(cfg):
                 "`use_reentrant` must be false when used with partially frozen model."
             )

-    if cfg.flash_attention and cfg.deepspeed and Path(cfg.deepspeed).is_file():
+    if cfg.deepspeed and Path(cfg.deepspeed).is_file():
         with open(cfg.deepspeed, encoding="utf-8") as file:
             contents = file.read()
             deepspeed_cfg: DictDefault = DictDefault(json.loads(contents))
-            if (
-                deepspeed_cfg.zero_optimization
-                and deepspeed_cfg.zero_optimization.stage == 3
-            ):
-                if not (
-                    (
-                        deepspeed_cfg.bf16
-                        and deepspeed_cfg.bf16.enabled  # pylint: disable=no-member
-                        is True
-                    )
-                    or (
-                        deepspeed_cfg.fp16
-                        and deepspeed_cfg.fp16.enabled  # pylint: disable=no-member
-                        is True
-                    )
+            if cfg.flash_attention:
+                if (
+                    deepspeed_cfg.zero_optimization
+                    and deepspeed_cfg.zero_optimization.stage == 3
                 ):
-                    raise ValueError(
-                        "bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
-                    )
+                    if not (
+                        (
+                            deepspeed_cfg.bf16
+                            and deepspeed_cfg.bf16.enabled  # pylint: disable=no-member
+                            is True
+                        )
+                        or (
+                            deepspeed_cfg.fp16
+                            and deepspeed_cfg.fp16.enabled  # pylint: disable=no-member
+                            is True
+                        )
+                    ):
+                        raise ValueError(
+                            "bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
+                        )
+            if "8bit" in cfg.optimizer and deepspeed_cfg.optimizer:
+                LOG.warning(
+                    f"conflicting optimizer: {cfg.optimizer} used alongside deepspeed optimizer."
+                )

     if cfg.test_datasets and cfg.val_set_size:
         raise ValueError(
             "non-zero val_set_size should not be used with test_datasets configuration"
         )

+    if cfg.fsdp and "bnb" in cfg.optimizer:
+        raise ValueError(f"FSDP not compatible with {cfg.optimizer}")
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
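Because the JSON is loaded into a DictDefault, missing keys (`zero_optimization`, `bf16`, `fp16`, `optimizer`) resolve to falsy values instead of raising AttributeError, which is what lets the chained truthiness checks above stay flat. A standalone sketch of the same validations using plain dict lookups (the function name and signature are illustrative, not part of the PR):

import json
from pathlib import Path


def check_deepspeed_cfg(path: str, optimizer: str, flash_attention: bool) -> None:
    # Mirror of the new validate_config checks, with .get() in place of DictDefault.
    ds_cfg = json.loads(Path(path).read_text(encoding="utf-8"))
    if flash_attention and ds_cfg.get("zero_optimization", {}).get("stage") == 3:
        bf16_on = ds_cfg.get("bf16", {}).get("enabled") is True
        fp16_on = ds_cfg.get("fp16", {}).get("enabled") is True
        if not (bf16_on or fp16_on):
            raise ValueError(
                "bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
            )
    if "8bit" in optimizer and "optimizer" in ds_cfg:
        print(f"conflicting optimizer: {optimizer} used alongside deepspeed optimizer.")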
src/axolotl/utils/models.py CHANGED

@@ -642,15 +642,17 @@ def load_model(

     # make sure these are fp32 per Ramesh et al. (2021)
     embedding_modules = get_linear_embedding_layers(cfg.model_config_type)
-    for name, module in model.named_modules():
-        if "norm" in name:
-            module.to(torch.float32)
-        if model_config.model_type == "btlm":
-            # don't upcast lm_head for btlm
-            continue
-        if any(m in name for m in embedding_modules):
-            if hasattr(module, "weight"):
+    if not cfg.fsdp:
+        # FSDP doesn't like mixed Float and BFloat16
+        for name, module in model.named_modules():
+            if any(m in name for m in ["norm", "gate"]):
                 module.to(torch.float32)
+            if model_config.model_type == "btlm":
+                # don't upcast lm_head for btlm
+                continue
+            if any(m in name for m in embedding_modules):
+                if hasattr(module, "weight"):
+                    module.to(torch.float32)

     needs_fa2_dtype = cfg.adapter or cfg.fsdp
     skip_prepare_model_for_kbit_training = False
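The motivation for the new `if not cfg.fsdp:` guard is the comment in the diff: FSDP's flat-parameter sharding generally expects a uniform dtype within each flattened group, so selectively upcasting norm/gate/embedding weights to float32 inside an otherwise bfloat16 model can break wrapping. A small diagnostic sketch (hypothetical helper, not part of this change) for spotting a mixed-dtype model before it reaches FSDP:

from collections import Counter

import torch


def param_dtype_counts(model: torch.nn.Module) -> Counter:
    # Tally parameters by dtype; FSDP generally wants exactly one entry here.
    return Counter(p.dtype for p in model.parameters())


# usage sketch:
# counts = param_dtype_counts(model)
# if len(counts) > 1:
#     print(f"mixed parameter dtypes may break FSDP wrapping: {dict(counts)}")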