muellerzr committed
Commit 6b9c5c7
1 Parent(s): 27e2fc4

Update for more explained training methods

Files changed (4):
  1. requirements.txt +1 -1
  2. src/app.py +50 -4
  3. src/hub_utils.py +2 -2
  4. src/model_utils.py +4 -2
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-accelerate @ git+https://github.com/huggingface/accelerate
+accelerate @ git+https://github.com/huggingface/accelerate@improve-model-estimator
 transformers
 timm
 huggingface_hub==0.19.4
src/app.py CHANGED
@@ -1,8 +1,9 @@
 import gradio as gr
 import pandas as pd
+from accelerate.utils import convert_bytes
 from hub_utils import check_for_discussion, report_results
-from model_utils import calculate_memory, get_model
 from huggingface_hub.utils import HfHubHTTPError
+from model_utils import calculate_memory, get_model
 
 
 def get_results(model_name: str, library: str, options: list, access_token: str):
@@ -13,7 +14,46 @@ def get_results(model_name: str, library: str, options: list, access_token: str):
     has_discussion = True
     title = f"## Memory usage for '{model_name}'"
     data = calculate_memory(model, options)
-    return [title, gr.update(visible=True, value=pd.DataFrame(data)), gr.update(visible=not has_discussion)]
+    stages = {"model": [], "gradients": [], "optimizer": [], "step": []}
+    for i, option in enumerate(data):
+        for stage in stages:
+            stages[stage].append(option["Training using Adam"][stage])
+        value = max(data[i]["Training using Adam"].values())
+        if value == -1:
+            value = "N/A"
+        else:
+            value = convert_bytes(value)
+        data[i]["Training using Adam"] = value
+
+    if any(value != -1 for value in stages["model"]):
+        out_explain = "## Training using Adam explained:\n"
+        out_explain += "When training on a batch size of 1, each stage of the training process is expected to have near the following memory results for each precision you selected:\n"
+        memory_values = pd.DataFrame(
+            columns=["dtype", "Model", "Gradient calculation", "Backward pass", "Optimizer step"]
+        )
+        for i, dtype in enumerate(options):
+            if stages["model"][i] != -1:
+                memory_values.loc[len(memory_values)] = [
+                    dtype,
+                    convert_bytes(stages["model"][i]),
+                    convert_bytes(stages["gradients"][i]),
+                    convert_bytes(stages["optimizer"][i]),
+                    convert_bytes(stages["step"][i]),
+                ]
+        return [
+            title,
+            gr.update(visible=True, value=pd.DataFrame(data)),
+            gr.update(visible=True, value=out_explain),
+            gr.update(visible=True, value=memory_values),
+            gr.update(visible=not has_discussion),
+        ]
+    return [
+        title,
+        gr.update(visible=True, value=pd.DataFrame(data)),
+        gr.update(visible=False, value=""),
+        gr.update(visible=False, value=pd.DataFrame()),
+        gr.update(visible=not has_discussion),
+    ]
 
 
 with gr.Blocks() as demo:
@@ -33,7 +73,13 @@ with gr.Blocks() as demo:
     )
     out_text = gr.Markdown()
     out = gr.DataFrame(
-        headers=["dtype", "Largest Layer", "Total Size", "Training using Adam"],
+        headers=["dtype", "Largest Layer", "Total Size", "Training using Adam (Peek vRAM)"],
+        interactive=False,
+        visible=False,
+    )
+    out_explain = gr.Markdown()
+    memory_values = gr.DataFrame(
+        headers=["dtype", "Model", "Gradient calculation", "Backward pass", "Optimizer step"],
         interactive=False,
         visible=False,
     )
@@ -56,7 +102,7 @@ with gr.Blocks() as demo:
     btn.click(
         get_results,
         inputs=[inp, library, options, access_token],
-        outputs=[out_text, out, post_to_hub],
+        outputs=[out_text, out, out_explain, memory_values, post_to_hub],
         api_name=False,
     )
 
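Note on the new aggregation in `get_results`: each entry returned by `calculate_memory` now carries a per-stage byte breakdown under "Training using Adam" (with -1 meaning no estimate is available), and the table cell collapses that to the peak across stages via `convert_bytes`. A minimal sketch of that summarization, assuming that dict shape; the byte counts below are illustrative, not measured:

```python
from accelerate.utils import convert_bytes  # formats raw byte counts as "X.XX GB"-style strings


def summarize_adam_peak(stage_bytes: dict) -> str:
    """Collapse per-stage byte counts into the single peak-vRAM string shown in the table."""
    peak = max(stage_bytes.values())  # the most memory-hungry training stage
    return "N/A" if peak == -1 else convert_bytes(peak)


# Illustrative numbers only (not taken from a real model):
example = {"model": 2_000_000_000, "gradients": 4_000_000_000,
           "optimizer": 10_000_000_000, "step": 12_000_000_000}
print(summarize_adam_peak(example))  # -> roughly "11.18 GB"
```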
 
src/hub_utils.py CHANGED
@@ -27,9 +27,9 @@ def report_results(model_name, library, access_token):
     post = f"""# Model Memory Requirements\n
 
 You will need about {data[1]} VRAM to load this model for inference, and {data[3]} VRAM to train it using Adam.
-
+
 These calculations were measured from the [Model Memory Utility Space](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) on the Hub.
-
+
 The minimum recommended vRAM needed for this model assumes using [Accelerate or `device_map="auto"`](https://huggingface.co/docs/accelerate/usage_guides/big_modeling) and is denoted by the size of the "largest layer".
 When performing inference, expect to add up to an additional 20% to this, as found by [EleutherAI](https://blog.eleuther.ai/transformer-math/). More tests will be performed in the future to get a more accurate benchmark for each model.
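Note on the figures referenced in the post template: the "largest layer" size is the minimum vRAM when offloading with `device_map="auto"`, and inference can need up to roughly 20% more than the loaded model size (the EleutherAI estimate cited above). A small helper restating only those rules of thumb; the numbers in the usage line are made up for illustration:

```python
def inference_vram_guidance(total_size_gb: float, largest_layer_gb: float) -> dict:
    """Back-of-the-envelope figures following the post template's rules of thumb."""
    return {
        "minimum (device_map='auto', largest layer)": largest_layer_gb,
        "fully loaded for inference": total_size_gb,
        "inference with ~20% activation headroom": round(total_size_gb * 1.2, 2),
    }


# Hypothetical 13 GB model whose largest layer is 0.5 GB:
print(inference_vram_guidance(total_size_gb=13.0, largest_layer_gb=0.5))
```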
 
src/model_utils.py CHANGED
@@ -3,7 +3,7 @@ from urllib.parse import urlparse
 
 import gradio as gr
 import torch
-from accelerate.commands.estimate import check_has_model, create_empty_model
+from accelerate.commands.estimate import check_has_model, create_empty_model, estimate_training_usage
 from accelerate.utils import calculate_maximum_sizes, convert_bytes
 from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
 
@@ -84,10 +84,12 @@ def calculate_memory(model: torch.nn.Module, options: list):
         dtype_largest_layer = largest_layer[0]
 
         modifier = DTYPE_MODIFIER[dtype]
+        dtype_training_size = estimate_training_usage(
+            dtype_total_size, dtype if dtype != "float16/bfloat16" else "float16"
+        )
         dtype_total_size /= modifier
         dtype_largest_layer /= modifier
 
-        dtype_training_size = convert_bytes(dtype_total_size * 4)
         dtype_total_size = convert_bytes(dtype_total_size)
         dtype_largest_layer = convert_bytes(dtype_largest_layer)
         data.append(
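Note on the estimator change: the removed line was the old rule of thumb that Adam training takes roughly 4x the model's size in the chosen dtype (weights, gradients, and two optimizer moments). The new `estimate_training_usage` comes from the `improve-model-estimator` branch pinned in requirements.txt; judging from the `src/app.py` changes it returns per-stage byte counts keyed "model" / "gradients" / "optimizer" / "step", but its exact signature and accounting are not shown in this diff. For comparison, a sketch of the old heuristic only:

```python
from accelerate.utils import convert_bytes


def old_adam_training_estimate(dtype_total_size: float) -> str:
    # Old rule of thumb removed above: weights (1x) + gradients (1x)
    # + Adam's two moment buffers (2x) ~= 4x the model size in the chosen dtype.
    return convert_bytes(dtype_total_size * 4)


# Illustrative: a 7B-parameter model held in float16 (~14e9 bytes of weights)
print(old_adam_training_estimate(7e9 * 2))  # -> roughly "52.15 GB"
```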