adamcasson committed
Commit
5e49fae
1 Parent(s): 07c0cb4

fix bug and refactor

Files changed (1):
  app.py  +62 -19
app.py CHANGED
@@ -44,9 +44,11 @@ def calculator(
     d_model: int,
     n_heads: int,
     n_vocab: int,
-    n_ctx: int,
     ff_ratio: int,
+    n_ctx: int,
+    n_tokens: int,
     incl_embed: bool,
+    fwd_only: bool,
 ) -> Tuple[int, int, int]:
     d_attn = d_model // n_heads
     if d_model % n_heads != 0:
@@ -61,37 +63,68 @@ def calculator(
         flops_per_sequence = sum(flops_terms)
         params = sum(params)
     else:
-        flops_per_sequence = sum(flops_terms[1:3])
-        params = sum(params[1:3])
+        flops_per_sequence = sum(flops_terms[1:])
+        params = sum(params[1:])
+
+    flops_per_token = flops_per_sequence / n_ctx
+
+    n_tokens_flops = flops_per_token * n_tokens

-    return params, flops_per_sequence, flops_per_sequence / n_ctx
+    if not fwd_only:
+        flops_per_sequence *= 3
+        flops_per_token *= 3
+        n_tokens_flops *= 3
+
+    return params, flops_per_sequence, flops_per_token, n_tokens_flops


 with gr.Blocks() as iface:
     gr.Markdown(
-        "Calculate how many FLOPs a Transformer language model has using the method described in [DeepMind's Chinchilla scaling law paper](https://arxiv.org/abs/2203.15556) (see Appendix F)."
+        "Calculate how many FLOPs a Transformer language model uses with the method described in [DeepMind's Chinchilla scaling law paper](https://arxiv.org/abs/2203.15556) (see Appendix F)."
     )
     with gr.Row():
         with gr.Column():
+            gr.Markdown("#### Architecture details")
             n_layer = gr.Number(label="Number of layers (n_layer)")
             d_model = gr.Number(label="Model dimensions (d_model)")
             n_heads = gr.Number(label="Number of attention heads per layer (n_heads)")
             n_vocab = gr.Number(label="Vocabulary size (n_vocab)")
-            n_ctx = gr.Number(label="Sequence length")
             ff_ratio = gr.Number(value=4, label="Feedforward ratio")
+            gr.Markdown("#### Data details")
+            n_ctx = gr.Number(label="Sequence length (n_ctx)")
+            n_tokens = gr.Number(
+                value=0,
+                label="Total number of training tokens (n_tokens) (optional)",
+            )
+            gr.Markdown("#### Settings")
             incl_embed = gr.Checkbox(value=True, label="Include embeddings")
+            fwd_only = gr.Checkbox(
+                value=False, label="Calculate FLOPs for only forward pass"
+            )

             btn = gr.Button(value="Enter", variant="primary")

         with gr.Column():
+            gr.Markdown("#### Output")
             params = gr.Number(label="Model parameters")
             flops_per_sequence = gr.Number(label="FLOPs per sequence")
             flops_per_token = gr.Number(label="FLOPs per token")
+            n_tokens_flops = gr.Number(label="Total FLOPs for n_tokens")

     btn.click(
         calculator,
-        inputs=[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
-        outputs=[params, flops_per_sequence, flops_per_token],
+        inputs=[
+            n_layer,
+            d_model,
+            n_heads,
+            n_vocab,
+            ff_ratio,
+            n_ctx,
+            n_tokens,
+            incl_embed,
+            fwd_only,
+        ],
+        outputs=[params, flops_per_sequence, flops_per_token, n_tokens_flops],
     )

     gr.Markdown("### GPT-3 model family examples")
@@ -100,18 +133,28 @@ with gr.Blocks() as iface:
     )
     gr.Examples(
         [
-            [12, 768, 12, 50257, 4096, 4, True],
-            [24, 1024, 16, 50257, 4096, 4, True],
-            [24, 2048, 32, 50257, 4096, 4, True],
-            [32, 2560, 32, 50257, 4096, 4, True],
-            [32, 4096, 32, 50257, 4096, 4, True],
-            [40, 5120, 40, 50257, 4096, 4, True],
-            [48, 7168, 56, 50257, 4096, 4, True],
-            [64, 9216, 72, 50257, 4096, 4, True],
-            [96, 12288, 96, 50257, 4096, 4, True],
+            [12, 768, 12, 50257, 4, 4096, 0, True, False],
+            [24, 1024, 16, 50257, 4, 4096, 0, True, False],
+            [24, 2048, 32, 50257, 4, 4096, 0, True, False],
+            [32, 2560, 32, 50257, 4, 4096, 0, True, False],
+            [32, 4096, 32, 50257, 4, 4096, 0, True, False],
+            [40, 5120, 40, 50257, 4, 4096, 0, True, False],
+            [48, 7168, 56, 50257, 4, 4096, 0, True, False],
+            [64, 9216, 72, 50257, 4, 4096, 0, True, False],
+            [96, 12288, 96, 50257, 4, 4096, 0, True, False],
+        ],
+        [
+            n_layer,
+            d_model,
+            n_heads,
+            n_vocab,
+            ff_ratio,
+            n_ctx,
+            n_tokens,
+            incl_embed,
+            fwd_only,
+        ],
-        [n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
-        [params, flops_per_sequence, flops_per_token],
+        [params, flops_per_sequence, flops_per_token, n_tokens_flops],
         calculator,
         cache_examples=False,
     )
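
For reference, below is a minimal standalone sketch of the arithmetic this commit changes: per-token FLOPs are the per-sequence forward FLOPs divided by n_ctx, training FLOPs are approximated as 3x the forward pass (backward pass roughly 2x forward), and the optional n_tokens total is flops_per_token * n_tokens. The per-component forward-pass terms follow my reading of Chinchilla's Appendix F; the actual flops_terms layout inside app.py is not visible in this diff, so the function name and the term grouping here are assumptions for illustration, not the app's exact code.

from typing import Tuple

def chinchilla_flops_sketch(
    n_layer: int,
    d_model: int,
    n_heads: int,
    n_vocab: int,
    ff_ratio: int,
    n_ctx: int,
    n_tokens: int = 0,
    incl_embed: bool = True,
    fwd_only: bool = False,
) -> Tuple[float, float, float]:
    d_attn = d_model // n_heads  # per-head key/query/value size
    d_ff = d_model * ff_ratio    # feedforward hidden size

    # Forward-pass FLOPs per sequence, one term per component
    # (Appendix F counts a multiply-accumulate as 2 FLOPs).
    embeddings = 2 * n_ctx * n_vocab * d_model
    attn_qkv = 2 * n_ctx * 3 * d_model * (d_attn * n_heads)
    attn_logits = 2 * n_ctx * n_ctx * (d_attn * n_heads)
    attn_softmax = 3 * n_heads * n_ctx * n_ctx
    attn_reduce = 2 * n_ctx * n_ctx * (d_attn * n_heads)
    attn_project = 2 * n_ctx * (d_attn * n_heads) * d_model
    feedforward = 2 * n_ctx * 2 * d_model * d_ff
    logits = 2 * n_ctx * d_model * n_vocab

    per_layer = (attn_qkv + attn_logits + attn_softmax
                 + attn_reduce + attn_project + feedforward)

    # flops_terms[0] is the embedding term, so [1:] drops only embeddings.
    # In app.py the list presumably holds more entries, which is why the
    # old [1:3] slice silently dropped terms and [1:] is the fix.
    flops_terms = [embeddings, n_layer * per_layer, logits]
    flops_per_sequence = sum(flops_terms) if incl_embed else sum(flops_terms[1:])

    flops_per_token = flops_per_sequence / n_ctx
    n_tokens_flops = flops_per_token * n_tokens

    if not fwd_only:
        # Backward pass costs roughly 2x the forward pass, so training ~3x.
        flops_per_sequence *= 3
        flops_per_token *= 3
        n_tokens_flops *= 3

    return flops_per_sequence, flops_per_token, n_tokens_flops

On the largest example row above (96 layers, d_model 12288, 96 heads, n_ctx 4096), this sketch gives training FLOPs per token on the order of 10^12, consistent with the common 6N rule of thumb for a ~175B-parameter model.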