andstor commited on
Commit
6ca6353
1 Parent(s): 55d1b4f

Update src/parallelism_utils.py

Browse files
Files changed (1) hide show
  1. src/parallelism_utils.py +6 -12
src/parallelism_utils.py CHANGED
@@ -9,10 +9,10 @@ def get_precision_fac(precision: str):
9
  raise ValueError("Precision must be either 'mixed' or 'single'")
10
 
11
 
12
- def get_params_fac(model_dtype: torch.dtype):
13
- if model_dtype == torch.float16:
14
  return 2
15
- elif model_dtype == torch.float32:
16
  return 4
17
  else:
18
  raise ValueError("Model dtype must be either torch.float16 or torch.float32")
@@ -29,19 +29,13 @@ FP32_PARAM_FACTOR = 4
29
  MASTER_PARAMS_FACTOR = FP32_PARAM_FACTOR
30
 
31
 
32
- # TODO: check if params_fac is needed during full fp32 training.
33
- # Normally, mixed precision training results in 1.5x memory compared to FP32.
34
- # Currently, we are assuming 2x memory for FP32, as deepspeed's ZeRO-2 is optimized for FP16 training.
35
-
36
-
37
-
38
  def estimate_zero1_model_states_mem_needs(total_params,
39
  num_gpus_per_node=1,
40
  num_nodes=1,
41
  cpu_offload=True,
42
  additional_buffer_factor=1.5,
43
  precision="mixed",
44
- model_dtype = torch.float16,
45
  ):
46
 
47
  total_gpus = num_nodes * num_gpus_per_node
@@ -68,7 +62,7 @@ def estimate_zero2_model_states_mem_needs(total_params,
68
  cpu_offload=True,
69
  additional_buffer_factor=1.5,
70
  precision="mixed",
71
- model_dtype = torch.float16,
72
  ):
73
 
74
  total_gpus = num_nodes * num_gpus_per_node
@@ -98,7 +92,7 @@ def estimate_zero3_model_states_mem_needs(total_params,
98
  zero_init=True,
99
  additional_buffer_factor=1.5,
100
  precision="mixed",
101
- model_dtype = torch.float16,
102
  ):
103
 
104
  total_gpus = num_nodes * num_gpus_per_node
 
9
  raise ValueError("Precision must be either 'mixed' or 'single'")
10
 
11
 
12
def get_params_fac(model_dtype: str):
    """Return the number of bytes used per model parameter for a dtype name.

    Args:
        model_dtype: Name of the model's parameter dtype; either
            ``"float16"`` (2 bytes per parameter) or ``"float32"``
            (4 bytes per parameter).

    Returns:
        int: Bytes per parameter (2 or 4).

    Raises:
        ValueError: If ``model_dtype`` is neither ``"float16"`` nor
            ``"float32"``.
    """
    if model_dtype == "float16":
        return 2
    elif model_dtype == "float32":
        return 4
    else:
        # The message must name the string values actually compared above;
        # the previous message still referenced torch.float16/torch.float32
        # even though this function now takes dtype *names* as strings.
        raise ValueError("Model dtype must be either 'float16' or 'float32'")
 
29
  MASTER_PARAMS_FACTOR = FP32_PARAM_FACTOR
30
 
31
 
 
 
 
 
 
 
32
  def estimate_zero1_model_states_mem_needs(total_params,
33
  num_gpus_per_node=1,
34
  num_nodes=1,
35
  cpu_offload=True,
36
  additional_buffer_factor=1.5,
37
  precision="mixed",
38
+ model_dtype = "float16",
39
  ):
40
 
41
  total_gpus = num_nodes * num_gpus_per_node
 
62
  cpu_offload=True,
63
  additional_buffer_factor=1.5,
64
  precision="mixed",
65
+ model_dtype = "float16",
66
  ):
67
 
68
  total_gpus = num_nodes * num_gpus_per_node
 
92
  zero_init=True,
93
  additional_buffer_factor=1.5,
94
  precision="mixed",
95
+ model_dtype = "float16",
96
  ):
97
 
98
  total_gpus = num_nodes * num_gpus_per_node