machineuser committed on
Commit
0449a8e
·
1 Parent(s): 5f8c4d2

import from banana

Browse files
app_to_share/loss_vs_compute.csv ADDED
The diff for this file is too large to render. See raw diff
 
app_to_share/optimal_training/app_requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bokeh==2.0.2
2
+ Jinja2==2.11.2
3
+ MarkupSafe==1.1.1
4
+ numpy==1.18.4
5
+ packaging==20.4
6
+ pandas==1.0.3
7
+ Pillow==7.1.2
8
+ pkg-resources==0.0.0
9
+ pyparsing==2.4.7
10
+ python-dateutil==2.8.1
11
+ pytz==2020.1
12
+ PyYAML==5.3.1
13
+ randomcolor==0.4.4.5
14
+ scipy==1.4.1
15
+ six==1.15.0
16
+ tornado==6.0.4
17
+ typing-extensions==3.7.4.2
app_to_share/optimal_training/bokeh_test.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ''' Present an interactive function explorer with slider widgets.
2
+ Scrub the sliders to change the properties of the ``sin`` curve, or
3
+ type into the title text box to update the title of the plot.
4
+ Use the ``bokeh serve`` command to run the example by executing:
5
+ bokeh serve sliders.py
6
+ at your command prompt. Then navigate to the URL
7
+ http://localhost:5006/sliders
8
+ in your browser.
9
+ '''
10
+ import numpy as np
11
+
12
+ from bokeh.io import curdoc
13
+ from bokeh.layouts import column, row
14
+ from bokeh.models import ColumnDataSource, Slider, TextInput
15
+ from bokeh.plotting import figure
16
+
17
+ # Set up data
18
+ N = 200
19
+ x = np.linspace(0, 4 * np.pi, N)
20
+ y = np.sin(x)
21
+ source = ColumnDataSource(data=dict(x=x, y=y))
22
+
23
+ # Set up plot
24
+ plot = figure(plot_height=400, plot_width=400, title="my sine wave",
25
+ tools="crosshair,pan,reset,save,wheel_zoom",
26
+ x_range=[0, 4 * np.pi], y_range=[-2.5, 2.5])
27
+
28
+ plot.line('x', 'y', source=source, line_width=3, line_alpha=0.6)
29
+
30
+ # Set up widgets
31
+ text = TextInput(title="title", value='my sine wave')
32
+ offset = Slider(title="offset", value=0.0, start=-5.0, end=5.0, step=0.1)
33
+ amplitude = Slider(title="amplitude", value=1.0, start=-5.0, end=5.0, step=0.1)
34
+ phase = Slider(title="phase", value=0.0, start=0.0, end=2 * np.pi)
35
+ freq = Slider(title="frequency", value=1.0, start=0.1, end=5.1, step=0.1)
36
+ slider_moves = {"offset": 0, "amplitude": 0, "phase": 0, "freq": 0}
37
+
38
+
39
+ # Set up callbacks
40
def update_title(attrname, old, new):
    """Copy the text box content into the plot title.

    Registered as a Bokeh ``on_change`` callback; the three callback
    arguments are ignored and the current ``text.value`` is read instead.
    """
    current_title = text.value
    plot.title.text = current_title
42
+
43
+
44
+ text.on_change('value', update_title)
45
+
46
+
47
def update_data(attrname, old, new):
    """Redraw the sine curve from the current slider values.

    Registered as a Bokeh ``on_change`` callback; the callback arguments are
    ignored. The curve is ``amplitude * sin(freq * x + phase) + offset``
    sampled on ``N`` points over ``[0, 4*pi]``.
    """
    amp = amplitude.value
    shift = offset.value
    phi = phase.value
    frequency = freq.value

    grid = np.linspace(0, 4 * np.pi, N)
    curve = amp * np.sin(frequency * grid + phi) + shift

    # Replacing the whole dict pushes both columns to the plot at once.
    source.data = dict(x=grid, y=curve)
59
+
60
+
61
def offset_force(attrname, old, new):
    """Force the other sliders to follow the offset slider.

    Each call bumps the ``"offset"`` move counter. Only when the offset
    slider has moved more often than the amplitude slider are the
    amplitude, phase and frequency sliders set to the offset value and the
    curve redrawn; otherwise the call does nothing else.
    """
    slider_moves["offset"] += 1

    # NOTE(review): indentation was lost in the reviewed copy; everything
    # below is assumed to belong to the guarded branch - confirm.
    if not slider_moves["amplitude"] < slider_moves["offset"]:
        return

    # offset.value is re-read before each assignment, as in the original,
    # because assigning to a slider may run other callbacks.
    a = offset.value
    amplitude.value = a
    w = offset.value
    phase.value = w
    k = offset.value
    freq.value = k
    b = offset.value

    grid = np.linspace(0, 4 * np.pi, N)
    curve = a * np.sin(k * grid + w) + b
    source.data = dict(x=grid, y=curve)
73
+
74
+
75
def amp_force(attrname, old, new):
    """Force the other sliders to follow twice the amplitude slider.

    Each call bumps the ``"amplitude"`` move counter. Only when the
    amplitude slider has moved more often than the offset slider are the
    offset, phase and frequency sliders set to ``2 * amplitude.value`` and
    the curve redrawn; otherwise the call does nothing else.
    """
    slider_moves["amplitude"] += 1

    # NOTE(review): indentation was lost in the reviewed copy; everything
    # below is assumed to belong to the guarded branch - confirm.
    if not slider_moves["offset"] < slider_moves["amplitude"]:
        return

    # amplitude.value is re-read before each assignment, as in the original,
    # because assigning to a slider may run other callbacks.
    b = amplitude.value * 2
    offset.value = b
    w = amplitude.value * 2
    phase.value = w
    k = amplitude.value * 2
    freq.value = k
    a = amplitude.value

    grid = np.linspace(0, 4 * np.pi, N)
    curve = a * np.sin(k * grid + w) + b
    source.data = dict(x=grid, y=curve)
87
+
88
+
89
+ for w in [phase, freq]:
90
+ w.on_change('value', update_data)
91
+
92
+ offset.on_change('value', offset_force)
93
+ amplitude.on_change('value', amp_force)
94
+
95
+ # Set up layouts and add to document
96
+ inputs = column(text, offset, amplitude, phase, freq)
97
+
98
+ curdoc().add_root(row(inputs, plot, width=800))
99
+ curdoc().title = "Sliders"
app_to_share/optimal_training/conversions.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ from scipy.optimize import root
4
+
5
+ day_ratio = 24 * 3600
6
+
7
+ depth_width_ratio = 128
8
+
9
+ constants_per_gpu = {
10
+ "V100": [2.21527743e+07, 1.18538628e+00, 1.43150104e+00, 1.66015023e+00,
11
+ 1.32808220e+00, 5.91503856e+00],
12
+ "V100 (without tensor cores and cudnn.benchmark)": [1.82997989e+07, 1.05349588e+00, 1.25312127e+00, 1.67071294e+00,
13
+ 1.44610885e+00, 5.55824273e+00],
14
+ "P100": [6.01863899e+07, 9.23656025e-01, 1.03230702e+00, 1.46733667e+00,
15
+ 1.03031298e+00, 5.38021875e+00],
16
+ "P4": [4.84472202e+07, 9.86822195e-01, 1.23474901e+00, 1.38493518e+00,
17
+ 1.04630858e+00, 1.03572754e+01],
18
+ "K80": [2.58592374e+07, 6.42050890e-01, 7.06115162e-01, 1.44360777e+00,
19
+ 7.50695980e-01, 6.25951436e+00]
20
+
21
+ }
22
+
23
+ price_per_gpu = {
24
+ "K80": 0.584,
25
+ "P4": 0.689,
26
+ "V100": 2.005,
27
+ "V100 (without tensor cores and cudnn.benchmark)": 2.005,
28
+ "P100": 1.416,
29
+ }
30
+
31
+ optimal_batch_size_per_gpu = {
32
+ "P4": 16,
33
+ "V100": 64,
34
+ "V100 (without tensor cores and cudnn.benchmark)": 64,
35
+ "P100": 64,
36
+ "K80": 16
37
+ }
38
+
39
+ features_per_amp_mode = {
40
+ "O0": (1, 0, 0),
41
+ "O1": (0, 1, 0),
42
+ "O2": (0, 0, 1)
43
+ }
44
+
45
+ gpu_consumption = {
46
+ "V100": 119.3495934959e-3,
47
+ "V100 (without tensor cores and cudnn.benchmark)": 119.3495934959e-3,
48
+ "K80": 142.42e-3,
49
+ "P4": 55.27e-3,
50
+ "P100": 139.65e-3
51
+ }
52
+
53
+ co2_intensity = 534 * 1e-3
54
+
55
+
56
def flo_speed(features, constants):
    """Evaluate the fitted speed model for one configuration.

    Args:
        features: 6-tuple ``(o0, o1, o2, x, y, z)``. ``o0`` is unpacked but
            not used. Callers in this file pass the AMP one-hot flags
            followed by ``width / depth_width_ratio``, ``width`` and the
            batch size.
        constants: 6-tuple ``(k, k1, k2, b, c, layer_base)`` of fitted
            coefficients.

    Returns:
        ``k * k1**o1 * k2**o2 * x / (x + layer_base) * y**b * log(z + 1)**c``.
    """
    k, k1, k2, b, c, layer_base = constants
    _, o1, o2, x, y, z = features

    o1_gain = np.power(k1, o1)
    o2_gain = np.power(k2, o2)
    y_term = np.power(y, b)
    z_term = np.power(np.log(z + 1), c)
    # Same left-to-right evaluation order as the one-line formula.
    return k * o1_gain * o2_gain * x / (x + layer_base) * y_term * z_term
60
+
61
+
62
def param_polynomial(width, depth=None, inner=None):
    """Return the parameter-count polynomial for a given model shape.

    Args:
        width: model width.
        depth: number of layers. When ``None`` the depth is tied to the
            width as ``width / depth_width_ratio``.
        inner: optional inner dimension. When ``None`` the formula with the
            coefficients 7 and 8 is used instead of the one that has
            explicit ``inner`` terms.

    Returns:
        The polynomial value for the requested shape.
    """
    if depth is not None:
        if inner is not None:
            return (5 * depth * (width ** 2) + 2 * depth * (width * inner)
                    + 7 * depth * width + depth * inner + 3 * width + 3)
        return 7 * depth * (width ** 2) + 8 * depth * width + 3 * width + 3

    if inner is not None:
        # Substituting depth = width / depth_width_ratio into the explicit
        # formula above means dividing by the ratio; the previous version
        # multiplied by it, which disagreed with every other branch.
        return (5 / depth_width_ratio * (width ** 3)
                + 2 / depth_width_ratio * (width ** 2 * inner)
                + 7 / depth_width_ratio * width ** 2
                + width * inner / depth_width_ratio
                + 3 * width + 3)
    return 7 / depth_width_ratio * (width ** 3) + 8 / depth_width_ratio * (width ** 2) + 3 * width + 3
73
+
74
+
75
def optimal_model_shape(width, param_number, base=8):
    """Pick a (depth, width) pair that matches a parameter budget.

    The depth is ``floor(width / depth_width_ratio)``, but at least 1. The
    width is then the largest root of
    ``7*depth*w**2 + (8*depth + 3)*w + 3 - param_number = 0``, rounded to
    the nearest multiple of ``base``.

    Returns:
        Tuple ``(depth, corresponding_width)``.
    """
    n_layers = max(1, math.floor(width / depth_width_ratio))
    quadratic = np.array([n_layers * 7, n_layers * 8 + 3, 3 - param_number])
    largest_root = max(np.roots(quadratic))
    snapped_width = int(base * round(largest_root / base))
    return n_layers, snapped_width
81
+
82
+
83
def alternate_model_shape(width, param_number, base=8):
    """Pick a deeper alternative (depth, width) pair for a parameter budget.

    The depth is the larger of "linear depth + 1" and
    ``floor(0.3 * width**1.25 / depth_width_ratio)``, where the linear depth
    is ``max(1, floor(width / depth_width_ratio))``. The width is the
    largest root of ``7*depth*w**2 + (8*depth + 3)*w + 3 - param_number = 0``,
    rounded to the nearest multiple of ``base``.

    Returns:
        Tuple ``(depth, corresponding_width)``.
    """
    baseline_depth = max(1, math.floor(width / depth_width_ratio))
    superlinear_depth = math.floor(0.3 * width ** 1.25 / depth_width_ratio)
    n_layers = max(baseline_depth + 1, superlinear_depth)

    quadratic = np.array([n_layers * 7, n_layers * 8 + 3, 3 - param_number])
    largest_root = max(np.roots(quadratic))
    snapped_width = int(base * round(largest_root / base))
    return n_layers, snapped_width
90
+
91
+
92
def hours_to_width(hours, gpu, amp_mode, param_popt):
    """Solve for the model width whose training time equals ``hours``.

    Args:
        hours: wall-time budget in hours.
        gpu: key into ``constants_per_gpu`` and ``optimal_batch_size_per_gpu``.
        amp_mode: key into ``features_per_amp_mode``.
        param_popt: fitted ``(d, e, f)`` of the parameter/compute relation.

    Returns:
        The width found by ``iterative_solutions`` starting from 128. That
        helper may return a non-converged candidate.
    """
    seconds = hours * 3600
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]

    def equation_function(width):
        # Residual: (compute needed / speed), scaled by day_ratio, minus
        # the time budget in seconds.
        return np.power((param_polynomial(width) - f) / d, 1 / e) / flo_speed(
            (*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
            constants) * day_ratio - seconds

    # The previous version also computed an unused `speed` value that only
    # served commented-out debug prints; both were removed.
    return iterative_solutions(equation_function, initial_guess=128)
113
+
114
+
115
def iterative_solutions(equation_function, initial_guess):
    """Find a root of ``equation_function``, retrying from smaller guesses.

    The solver (``scipy.optimize.root``, method ``"hybr"``) is started from
    ``initial_guess``. If the absolute residual at the returned point is not
    below 1, the guess is halved and the solve repeated, as long as the
    guess stays above 16.

    At least one attempt is always made. The previous version skipped the
    loop for ``initial_guess <= 16`` and then failed with
    ``UnboundLocalError``.

    Args:
        equation_function: callable evaluated both on a 1-element array (by
            the solver) and on a scalar (for the residual check).
        initial_guess: starting point for the first attempt.

    Returns:
        The first candidate whose residual is below 1, otherwise the last
        candidate tried (which is then not a converged solution).
    """
    guess = initial_guess
    while True:
        width = root(equation_function, np.array([guess]), method="hybr").x[0]
        if np.abs(equation_function(width)) < 1e0:
            return width
        guess *= 0.5
        # Written as `not >` so a NaN guess terminates instead of looping.
        if not guess > 16:
            return width
125
+
126
+
127
def width_to_flo(width, d, e, f):
    """Invert the fitted parameter law at ``param_polynomial(width)``.

    Returns ``((param_polynomial(width) - f) / d) ** (1 / e) * day_ratio``.
    """
    param_count = param_polynomial(width)
    compute = np.power((param_count - f) / d, 1 / e)
    return compute * day_ratio
129
+
130
+
131
def loss_fit(x, a, b, c):
    """Power-law curve ``a * x**(-b) + c`` used as a fitting function."""
    decay = np.power(x, -b)
    return a * decay + c
133
+
134
+
135
def param_fit(x, d, e, f):
    """Log of the power law ``d * x**e + f``, used as a fitting function."""
    power_law = d * np.power(x, e) + f
    return np.log(power_law)
137
+
138
+
139
def hours_to_dollars(hours, gpu):
    """Multiply ``hours`` by the ``price_per_gpu`` entry for ``gpu``."""
    hourly_price = price_per_gpu[gpu]
    return hours * hourly_price
141
+
142
+
143
def dollars_to_hours(dollars, gpu):
    """Divide ``dollars`` by the ``price_per_gpu`` entry for ``gpu``."""
    hourly_price = price_per_gpu[gpu]
    return dollars / hourly_price
145
+
146
+
147
def hours_to_kWh(hours, gpu):
    """Multiply ``hours`` by the ``gpu_consumption`` entry for ``gpu``.

    NOTE(review): the name suggests the table holds kW per GPU - confirm.
    """
    consumption = gpu_consumption[gpu]
    return hours * consumption
149
+
150
+
151
def hours_to_co2(hours, gpu):
    """Scale the energy used over ``hours`` on ``gpu`` by ``co2_intensity``.

    Computed as ``hours * gpu_consumption[gpu] * co2_intensity``.
    """
    energy = hours * gpu_consumption[gpu]
    return energy * co2_intensity
153
+
154
+
155
def loss_to_flo(loss, a, b, c):
    """Invert ``loss = a * x**(-b) + c`` for ``x``."""
    normalised = (loss - c) / a
    return normalised ** (-1 / b)
157
+
158
+
159
def param_to_flo(param_number, d, e, f):
    """Invert ``param_number = d * x**e + f`` for ``x``."""
    normalised = (param_number - f) / d
    return normalised ** (1 / e)
161
+
162
+
163
def safe_flo_to_param(flo, d, e, f):
    """Evaluate the power law ``d * flo**e + f``."""
    scaled = d * np.power(flo, e)
    return scaled + f
165
+
166
+
167
def param_to_width(param_number):
    """Return the largest real width matching a parameter count.

    Solves ``7/r * w**3 + 8/r * w**2 + 3*w + 3 - param_number = 0`` with
    ``r = depth_width_ratio`` and keeps the largest root whose imaginary
    part is negligible.

    Raises:
        numpy.linalg.LinAlgError: propagated from ``np.roots``.
        ValueError: if no root passes the realness filter.
    """
    poly_params = np.array([7 / depth_width_ratio, 8 / depth_width_ratio, 3, 3 - param_number])
    roots = np.roots(poly_params)
    # Compare the magnitude of the imaginary part. Without abs() the complex
    # root with a negative imaginary part was wrongly accepted as real.
    real_roots = [np.real(candidate) for candidate in roots if abs(np.imag(candidate)) < 1e-5]
    return max(real_roots)
173
+
174
+
175
def safe_param_to_width(param_number):
    """Call ``param_to_width``, retrying with a larger count on failure.

    When ``param_to_width`` raises ``numpy.linalg.LinAlgError`` the call is
    repeated recursively with ``1.5 * param_number``. There is no retry
    limit.
    """
    try:
        width = param_to_width(param_number)
    except np.linalg.LinAlgError:
        width = safe_param_to_width(1.5 * param_number)
    return width
180
+
181
+
182
def width_to_hours(width, gpu, amp_mode, param_popt):
    """Return the training time in hours for a model of the given width.

    The compute implied by ``param_polynomial(width)`` under the fitted
    ``(d, e, f)`` is scaled by ``day_ratio``, divided by the ``flo_speed``
    for this GPU and AMP mode, and converted from seconds to hours.
    """
    d, e, f = param_popt
    features = (*features_per_amp_mode[amp_mode],
                width / depth_width_ratio,
                width,
                optimal_batch_size_per_gpu[gpu])

    required_flos = np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio
    duration_seconds = required_flos / flo_speed(features, constants_per_gpu[gpu])
    return duration_seconds / 3600
191
+
192
+
193
def param_prime(width, depth=None):
    """Derivative of ``param_polynomial`` (no ``inner``) with respect to width.

    Args:
        width: model width at which to evaluate the derivative.
        depth: fixed number of layers. When ``None`` the depth is tied to
            the width as ``width / depth_width_ratio``.

    Returns:
        ``14*depth*width + 8*depth + 3`` for an explicit depth, otherwise
        ``21/r * width**2 + 16/r * width + 3`` with ``r = depth_width_ratio``.
    """
    if depth is not None:
        # d/dw of 7*depth*w**2 + 8*depth*w + 3*w + 3. The leading term is
        # linear in w; the previous version squared it.
        return 14 * depth * width + 8 * depth + 3
    return 21 / depth_width_ratio * (width ** 2) + 16 / depth_width_ratio * width + 3
198
+
199
+
200
def flo_speed_prime(width, gpu, amp_mode):
    """Derivative of the speed model with respect to width.

    This is for the case where ``flo_speed`` is called with
    ``x = width / depth_width_ratio`` and ``y = width``. The speed then
    equals ``m * width**(b + 1) / (width + layer_base * depth_width_ratio)``,
    where ``m`` collects every factor that does not depend on width.
    """
    k, k1, k2, b, c, layer_base = constants_per_gpu[gpu]
    _, o1, o2 = features_per_amp_mode[amp_mode]

    batch_term = np.power(np.log(optimal_batch_size_per_gpu[gpu] + 1), c)
    mult_constant = k * np.power(k1, o1) * np.power(k2, o2) * batch_term

    # Quotient rule on width**(b + 1) / (width + layer_base * ratio).
    numerator_part = (b + 1) * np.power(width, b) / (width + layer_base * depth_width_ratio)
    denominator_part = np.power(width, b + 1) / (width + layer_base * depth_width_ratio) ** 2
    return mult_constant * (numerator_part - denominator_part)
206
+
207
+
208
+ # awful equation; we're trying to find the width for which lowering width actually makes the model less efficient
209
def tipping_point(gpu, amp_mode, param_popt):
    """Find the width below which shrinking the model stops paying off.

    Solves, via ``iterative_solutions`` from an initial guess of 100,

        ((P(w) - f) / d)**-1 / e * P'(w) / d * speed(w) - speed'(w) = 0

    where ``P`` is ``param_polynomial``, ``P'`` is ``param_prime``,
    ``speed`` is ``flo_speed`` and ``speed'`` is ``flo_speed_prime``.

    Returns:
        The width returned by the solver, which may be a non-converged
        candidate.
    """
    d, e, f = param_popt
    amp_flags = features_per_amp_mode[amp_mode]

    def equation_function(width):
        inverse_term = np.power((param_polynomial(width) - f) / d, -1)
        speed = flo_speed(
            (*amp_flags, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
            constants_per_gpu[gpu])
        # Same left-to-right evaluation order as the original expression.
        return inverse_term / e * param_prime(width) / d * speed - flo_speed_prime(width, gpu, amp_mode)

    return iterative_solutions(equation_function, initial_guess=100)
221
+
222
+
223
def update_tip(tip, width, gpu, amp_mode, loss_popt, param_popt):
    """Fill ``tip`` in place with the quantities derived from ``width``.

    Args:
        tip: dict that receives the keys ``"width"``, ``"param_number"``,
            ``"flo"``, ``"loss"`` and ``"hours"``.
        width: tipping width.
        gpu: GPU key forwarded to ``width_to_hours``.
        amp_mode: AMP mode key forwarded to ``width_to_hours``.
        loss_popt: fitted ``(a, b, c)`` for ``loss_fit``.
        param_popt: fitted ``(d, e, f)`` of the parameter/compute relation.

    Returns:
        None; ``tip`` is mutated.
    """
    a, b, c = loss_popt
    d, e, f = param_popt
    tip["width"] = width
    tip["param_number"] = param_polynomial(width)
    # tip["param_number"] is already a parameter count. The previous version
    # passed it through param_polynomial a second time, treating it as a width.
    tip["flo"] = np.power((tip["param_number"] - f) / d, 1 / e)
    tip["loss"] = loss_fit(tip["flo"], a, b, c)
    tip["hours"] = width_to_hours(width, gpu, amp_mode, param_popt)
app_to_share/optimal_training/main.py ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bokeh.io import curdoc
2
+ from bokeh.layouts import column, row
3
+ from bokeh.models import Slider, Select, ColumnDataSource, Span, Div, Button, LogColorMapper, ColorBar, LogTicker
4
+ from bokeh.models.tools import CrosshairTool
5
+ from bokeh.plotting import figure
6
+ from bokeh.events import Tap
7
+ from bokeh.transform import log_cmap
8
+ import pandas as pd
9
+ from scipy.spatial import ConvexHull
10
+ from scipy.optimize import curve_fit
11
+ from time import sleep
12
+
13
+ from utils import *
14
+ from conversions import *
15
+
16
+ ########################################################################################################################
17
+ # Basic dimensions
18
+ ########################################################################################################################
19
+
20
+ plot_width = 1200
21
+ plot_height = 400
22
+ sidebar_width = 400
23
+ in_text_plot_width = 800
24
+ in_text_plot_height = 300
25
+
26
+ ########################################################################################################################
27
+ # Set up data
28
+ ########################################################################################################################
29
+
30
+ df = pd.read_csv("optimal_training/static/loss_vs_compute.csv")
31
+ loss_keys = [key for key in df.keys() if "loss" in key]
32
+
33
+ losses_per_run = {key: np.array(clean_run(list(zip(df["global_step"], df[key])))) for key in loss_keys}
34
+ losses_per_run = {k: v for k, v in losses_per_run.items() if len(v) > 5}
35
+ bounds_per_run = {key: [min(value[:, 0]), max(value[:, 0])] for key, value in losses_per_run.items()}
36
+ params_per_run = {key: param_count(run) for key, run in losses_per_run.items()}
37
+ ordered_keys = sorted(losses_per_run, key=lambda x: params_per_run[x])
38
+ losses_per_run = [losses_per_run[key] for key in ordered_keys]
39
+ bounds_per_run = [bounds_per_run[key] for key in ordered_keys]
40
+ params_per_run = [params_per_run[key] for key in ordered_keys]
41
+ palette = "Viridis256"
42
+ color_mapper = LogColorMapper(palette=palette, low=min(params_per_run), high=max(params_per_run))
43
+ general_bounds = bounds_per_run[2][0], bounds_per_run[-2][1]
44
+ print("{:.4e}, {:.4e}".format(general_bounds[0] * day_ratio, general_bounds[1] * day_ratio))
45
# One colour per run. The previous `"#000000" in params_per_run` was a
# membership test that produced the one-element list [False].
color_list = ["#000000" for _ in params_per_run]
46
+ # there's a bogus point of small coordinates at position 0 to get the ConvexHull facing the origin
47
+ # hacky, but it's the syntax here, qhull_options=QG0 means the ConvexHull facing point 0
48
+ bounded_points = np.array([(10e8, 3, -1)] + [(a, b, i) for i, run in enumerate(losses_per_run) for a, b in run if
49
+ general_bounds[0] < a < general_bounds[1]])
50
+ all_points = np.array([(a, b, i) for i, run in enumerate(losses_per_run) for a, b in run])
51
+ all_hull = ConvexHull(bounded_points[:, :2], qhull_options='QG0')
52
+ log_points = np.array([(np.log(a), b) for a, b, i in bounded_points])
53
+ log_hull = ConvexHull(log_points, qhull_options='QG0')
54
+ indexed_runs = [np.array([(a, b) for a, b in run]) for run in losses_per_run]
55
+
56
+ ########################################################################################################################
57
+ # Set up loss_plot
58
+ ########################################################################################################################
59
+
60
+ color_bar = ColorBar(color_mapper=color_mapper, ticker=LogTicker(), label_standoff=12,
61
+ border_line_color=None, location=(0, 0), title="Num of params")
62
+ loss_plot = figure(plot_height=plot_height, plot_width=plot_width,
63
+ title="Validation loss during training for an array of models of different sizes",
64
+ tools="pan,reset,save,wheel_zoom,tap", active_scroll="wheel_zoom",
65
+ x_range=[min(all_points[:, 0]) * day_ratio, max(all_points[:, 0]) * day_ratio],
66
+ y_range=[min(all_points[:, 1]), max(all_points[:, 1])],
67
+ x_axis_type="log", y_axis_type="log",
68
+ x_axis_label="Floating-point operations (excluding embeddings & softmax)",
69
+ y_axis_label="Validation loss on Wikitext-103", output_backend="webgl")
70
+ loss_plot.add_tools(CrosshairTool(dimensions="width", line_alpha=0.2))
71
+ loss_plot.add_layout(color_bar, "left")
72
+ # for i, run in indexed_runs.items():
73
+ # source = ColumnDataSource(data=dict(x=run[:, 0] * day_ratio, y=run[:, 1]))
74
+ # loss_plot.line('x', 'y', source=source, line_width=1, line_alpha=0.6, color=color_list[i])
75
+ # loss_plot.scatter('x', 'y', source=source, line_width=1, line_alpha=0.6, color=color_list[i])
76
+
77
+ source = ColumnDataSource(data=dict(
78
+ xs=[run[:, 0] * day_ratio for run in indexed_runs], # x coords for each line (list of lists)
79
+ ys=[run[:, 1] for run in indexed_runs], # y coords for each line (list of lists)
80
+ params=params_per_run # data to use for colormapping
81
+ ))
82
+ loss_plot.multi_line('xs', 'ys', source=source,
83
+ color=log_cmap('params', palette, min(params_per_run), max(params_per_run)))
84
+ source = ColumnDataSource(data=dict(
85
+ x=[compute for run in indexed_runs for compute in run[:, 0] * day_ratio], # x coords for each line (list of lists)
86
+ y=[loss for run in indexed_runs for loss in run[:, 1]], # y coords for each line (list of lists)
87
+ params=[repeated_params for i, params in enumerate(params_per_run)
88
+ for repeated_params in [params] * len(indexed_runs[i])] # data to use for colormapping
89
+ ))
90
+ loss_plot.scatter('x', 'y', source=source,
91
+ color=log_cmap('params', palette, min(params_per_run), max(params_per_run)), size=3)
92
+
93
+ hull_indices = set(index for pair in all_hull.simplices[all_hull.good] for index in pair)
94
+ hull_indices = sorted(hull_indices, key=lambda x: bounded_points[x, 0])
95
+
96
+ ########################################################################################################################
97
+ # Fit frontier
98
+ ########################################################################################################################
99
+
100
+ hull_points = np.array([bounded_points[index] for index in hull_indices])
101
+ loss_popt, loss_pcov = curve_fit(loss_fit, hull_points[:, 0], hull_points[:, 1])
102
+ a, b, c = loss_popt
103
+ print(a, b, c)
104
+ display_abscisses = np.array([min(all_points[:, 0]) / 1.25] + sorted(list(all_points[:, 0])) +
105
+ [max(all_points[:, 0]) * 1.25])
106
+ source = ColumnDataSource(
107
+ data=dict(x=sorted(display_abscisses * day_ratio), y=loss_fit(sorted(display_abscisses), *loss_popt)))
108
+ loss_plot.line('x', 'y', source=source, line_width=1, line_alpha=0.8, color="red")
109
+
110
+ ########################################################################################################################
111
+ # Set up param_plot
112
+ ########################################################################################################################
113
+
114
+ param_plot = figure(plot_height=plot_height, plot_width=plot_width,
115
+ title="Optimal number of non-embedding parameters per floating-point operations budget",
116
+ tools="pan,reset,save,wheel_zoom,tap", active_scroll="wheel_zoom",
117
+ x_range=loss_plot.x_range,
118
+ y_range=[min(params_per_run), max(params_per_run)],
119
+ x_axis_type="log", y_axis_type="log",
120
+ x_axis_label="Floating-point operations (excluding embeddings & softmax)",
121
+ y_axis_label="Optimal number of non-embedding parameters", output_backend="webgl")
122
+ param_plot.add_tools(CrosshairTool(dimensions="width", line_alpha=0.2))
123
+ param_plot.add_layout(color_bar, "left")
124
+
125
+ logspace_points = convert_to_logspace(bounded_points, *loss_popt)
126
+ logspace_losses_per_run = [convert_to_logspace(run, *loss_popt) for run in losses_per_run]
127
+ passing_points = []
128
+ for run_index, log_run in enumerate(logspace_losses_per_run):
129
+ current_point = None
130
+ passed = False
131
+ difference = log_run[:, 1] - log_run[:, 0]
132
+ passing_points.append(np.argmax(difference))
133
+ compute_at_passing_points = np.array([(losses_per_run[i][passing_point, 0], params_per_run[i])
134
+ for i, passing_point in enumerate(passing_points)])
135
+ compute_at_hull = np.array([(losses_per_run[i][passing_point, 0], params_per_run[i])
136
+ for i, passing_point in enumerate(passing_points) if i in set(hull_points[:, 2])])
137
+ run_indices_at_hull = [i for i, passing_point in enumerate(passing_points) if i in set(hull_points[:, 2])]
138
+
139
+ param_popt, param_pcov = curve_fit(param_fit, compute_at_hull[:, 0], np.log(compute_at_hull[:, 1]))
140
+ d, e, f = param_popt
141
+
142
+ source = ColumnDataSource(data=dict(x=compute_at_hull[:, 0] * day_ratio,
143
+ y=compute_at_hull[:, 1],
144
+ params=[params for i, params in enumerate(params_per_run) if
145
+ i in set(hull_points[:, 2])]))
146
+ param_plot.scatter('x', 'y', source=source,
147
+ color=log_cmap('params', palette, min(params_per_run), max(params_per_run)))
148
+ display_abscisses = np.array([min(compute_at_hull[:, 0]) / 1.25] + sorted(list(compute_at_hull[:, 0])) +
149
+ [max(compute_at_hull[:, 0]) * 1.25])
150
+ source = ColumnDataSource(data=dict(x=display_abscisses * day_ratio,
151
+ y=safe_flo_to_param(display_abscisses, d, e, f)))
152
+ param_plot.line('x', 'y', source=source, line_width=1, line_alpha=0.8, color="orange")
153
+
154
+ ########################################################################################################################
155
+ # Set up widgets
156
+ ########################################################################################################################
157
+
158
+ hours_end = 24
159
+ hours_initial = 3.23
160
+ gpu_dropdown = Select(title="GPU",
161
+ options=["V100", "P100", "P4", "K80", ],
162
+ value="V100", width=sidebar_width, sizing_mode="stretch_width")
163
+ amp_mode_dropdown = Select(title="AMP mode", options=["O0", "O1", "O2"], value="O0", width=sidebar_width,
164
+ sizing_mode="stretch_width")
165
+ tipping_width = tipping_point(gpu_dropdown.value, amp_mode_dropdown.value, param_popt)
166
+ tip = {}
167
+ update_tip(tip, tipping_width, gpu_dropdown.value, amp_mode_dropdown.value, loss_popt, param_popt)
168
# Wall-time slider; its lower bound is the tipping-point training time.
hours_slider = Slider(title="Wall time (hours)", value=hours_initial, start=tip["hours"], end=hours_end, step=1 / 100,
                      width=sidebar_width, sizing_mode="stretch_width")
# Budget slider, expressed in dollars. Its lower bound is the tipping-point
# time converted from hours to dollars; the previous version applied
# dollars_to_hours to an hours value here.
dollars_slider = Slider(title="Budget (dollars)", value=hours_to_dollars(hours_initial, gpu_dropdown.value),
                        start=hours_to_dollars(tip["hours"], gpu_dropdown.value),
                        end=hours_to_dollars(hours_end, gpu_dropdown.value),
                        step=1 / 100, width=sidebar_width, sizing_mode="stretch_width")
174
+ input_buffer = Div(text="", width=sidebar_width, height=10,
175
+ style={"display": "block", "margin": "0 auto", "width": f"{sidebar_width}px",
176
+ "text-align": 'center'})
177
+ top_sidebar_div_style = {"display": "block", "margin": "0 auto", 'font-size': "125%",
178
+ "width": f"{sidebar_width}px", "text-align": 'center'}
179
+ energy_text = Div(text=energy_fill(hours_to_kWh(hours_slider.value, gpu_dropdown.value),
180
+ hours_to_co2(hours_slider.value, gpu_dropdown.value)),
181
+ width=sidebar_width, height=45,
182
+ style=top_sidebar_div_style)
183
+ slider_moves = {"hours": 0, "dollars": 0, "kWh": 0, "co2": 0}
184
+ n_sliders = len(slider_moves)
185
+
186
+ width = hours_to_width(hours_slider.value, gpu_dropdown.value, amp_mode_dropdown.value, param_popt)
187
+ flo = width_to_flo(width, *param_popt)
188
+ optimal_params = safe_flo_to_param(flo / 24 / 3600, *param_popt)
189
+ final_loss = loss_fit(flo / 24 / 3600, *loss_popt)
190
+ example_shape = {}
191
+ example_shape['example_depth'], example_shape['example_width'] = optimal_model_shape(width, optimal_params)
192
+ example_shape['alternate_depth'], example_shape['alternate_width'] = alternate_model_shape(width, optimal_params)
193
+
194
+ flo_line = Span(location=flo, line_alpha=0.7,
195
+ dimension='height', line_color='purple',
196
+ line_dash='dashed', line_width=1)
197
+ loss_line = Span(location=final_loss, line_alpha=0.7,
198
+ dimension='width', line_color='red',
199
+ line_dash='dashed', line_width=1)
200
+ param_line = Span(location=optimal_params, line_alpha=0.7,
201
+ dimension='width', line_color='orange',
202
+ line_dash='dashed', line_width=1)
203
+ loss_plot.add_layout(flo_line)
204
+ loss_plot.add_layout(loss_line)
205
+ param_plot.add_layout(flo_line)
206
+ param_plot.add_layout(param_line)
207
+
208
+ sidebar_div_style = {"display": "block", "margin": "0 auto", "width": f"{sidebar_width}px", "text-align": 'center'}
209
+ big_sidebar_div_style = {"display": "block", "margin": "0 auto", "width": f"{sidebar_width}px",
210
+ "text-align": 'center', 'font-size': "200%", 'font-weight': "bold"}
211
+ static_loss_text = Div(text="Expected wt-103 validation loss:", width=sidebar_width, height=10, style=sidebar_div_style)
212
+ optimal_loss_text = Div(text="{:.2f}".format(final_loss), width=sidebar_width, height=45,
213
+ style={"display": "block", "margin": "0 auto", 'font-size': "200%",
214
+ 'font-weight': "bold", "width": f"{sidebar_width}px", "text-align": 'center'})
215
+ static_param_text = Div(text="Optimal number of non-embedding parameters:", width=sidebar_width, height=10,
216
+ style=sidebar_div_style)
217
+ optimal_param_text = Div(text="{:.2e}".format(optimal_params), width=sidebar_width, height=45,
218
+ style=big_sidebar_div_style)
219
+ static_shape_text = Div(text="For example, this could be a model of", width=sidebar_width, height=10,
220
+ style=sidebar_div_style)
221
+ optimal_shape_text = Div(text=f"{example_shape['example_depth']} layers of {example_shape['example_width']} dimensions",
222
+ width=sidebar_width, height=30, style=big_sidebar_div_style)
223
+ static_altshape_text = Div(text="Or a model of", width=sidebar_width, height=10, style=sidebar_div_style)
224
+ optimal_altshape_text = Div(
225
+ text=f"{example_shape['alternate_depth']} layers of {example_shape['alternate_width']} dimensions",
226
+ width=sidebar_width, height=30, style=big_sidebar_div_style)
227
+
228
+
229
def compare_and_update(width):
    """Apply ``width`` to the display, never going below the tipping width.

    A width below ``tip["width"]`` is replaced by ``tip["width"]``. The
    display is refreshed through ``update_width`` and the hours slider is
    set to the training time for the applied width.

    The previous version reached the tipping width by recursing in steps of
    5 and redrawing at each step. That could exceed the recursion limit for
    a width far below the tip, and ended in the same state as this clamp.
    """
    # Written as `not >=` so a NaN width is also replaced, as before.
    if not width >= tip["width"]:
        width = tip["width"]
    update_width(width)
    hours_slider.value = width_to_hours(width, gpu_dropdown.value, amp_mode_dropdown.value, param_popt)
238
+
239
+
240
def update_width(width):
    """Refresh the marker lines, example shapes and sidebar text for ``width``.

    Moves ``flo_line``, ``loss_line`` and ``param_line``, rewrites the four
    entries of ``example_shape``, and updates the shape, parameter and loss
    text widgets.
    """
    flo = width_to_flo(width, *param_popt)
    # The fits take compute divided by 24 * 3600; the line uses `flo` as is.
    rescaled_flo = flo / 24 / 3600
    optimal_params = safe_flo_to_param(rescaled_flo, *param_popt)
    final_loss = loss_fit(rescaled_flo, *loss_popt)

    flo_line.location = flo
    loss_line.location = final_loss
    param_line.location = optimal_params

    depth, model_width = optimal_model_shape(width, optimal_params)
    example_shape['example_depth'], example_shape['example_width'] = depth, model_width
    alt_depth, alt_width = alternate_model_shape(width, optimal_params)
    example_shape['alternate_depth'], example_shape['alternate_width'] = alt_depth, alt_width

    optimal_shape_text.text = f"{example_shape['example_depth']} layers of {example_shape['example_width']} dimensions"
    optimal_altshape_text.text = f"{example_shape['alternate_depth']} layers of {example_shape['alternate_width']} dimensions"
    optimal_param_text.text = "{:.2e}".format(optimal_params)
    optimal_loss_text.text = "{:.2f}".format(final_loss)
253
+
254
+
255
def hours_update(attrname, old, new):
    """React to a change of the wall-time slider.

    Bumps the ``"hours"`` move counter. If the counters indicate that the
    hours slider moved first, the dollars slider and the energy text are
    synchronised. The width-dependent display is then refreshed.
    """
    slider_moves["hours"] += 1
    gpu = gpu_dropdown.value

    # NOTE(review): indentation was lost in the reviewed copy. The energy
    # text refresh is assumed to be inside the guard and the width refresh
    # outside it - confirm.
    moved_first = sum(slider_moves.values()) <= n_sliders * slider_moves["hours"] - n_sliders + 1
    if moved_first:
        dollars_slider.value = hours_to_dollars(hours_slider.value, gpu)
        energy_text.text = energy_fill(hours_to_kWh(hours_slider.value, gpu),
                                       hours_to_co2(hours_slider.value, gpu))

    new_width = hours_to_width(hours_slider.value, gpu, amp_mode_dropdown.value, param_popt)
    update_width(new_width)
266
+
267
+
268
def dollars_update(attrname, old, new):
    """React to a change of the budget slider.

    Bumps the ``"dollars"`` move counter. If the counters indicate that the
    dollars slider moved first, the hours slider is set from the budget and
    the energy text is refreshed.
    """
    slider_moves["dollars"] += 1
    gpu = gpu_dropdown.value

    # The guard checks whether *dollars* was the first updated slider.
    # NOTE(review): indentation was lost in the reviewed copy; both updates
    # are assumed to be inside the guard - confirm.
    moved_first = sum(slider_moves.values()) <= n_sliders * slider_moves["dollars"] - n_sliders + 1
    if not moved_first:
        return

    hours_slider.value = dollars_to_hours(dollars_slider.value, gpu)
    energy_text.text = energy_fill(hours_to_kWh(hours_slider.value, gpu),
                                   hours_to_co2(hours_slider.value, gpu))
276
+
277
+
278
def gpu_update(attrname, old, new):
    """React to a change of the GPU dropdown.

    Recomputes the tipping point for the selected GPU and AMP mode and moves
    the lower bounds of both sliders to it. If the budget already
    corresponds exactly to the current hours, the width is recomputed and
    applied. Otherwise the budget slider's upper bound is refreshed and the
    hours slider is set from the budget. The energy text is refreshed last.
    """
    gpu = gpu_dropdown.value
    amp_mode = amp_mode_dropdown.value

    tipping_width = tipping_point(gpu, amp_mode, param_popt)
    update_tip(tip, tipping_width, gpu, amp_mode, loss_popt, param_popt)
    hours_slider.start = tip["hours"]
    dollars_slider.start = hours_to_dollars(tip["hours"], gpu)

    budget_matches_hours = dollars_to_hours(dollars_slider.value, gpu) == hours_slider.value
    if budget_matches_hours:
        compare_and_update(hours_to_width(hours_slider.value, gpu, amp_mode, param_popt))
    else:
        dollars_slider.end = hours_to_dollars(hours_end, new)
        hours_slider.value = dollars_to_hours(dollars_slider.value, gpu)

    # NOTE(review): indentation was lost in the reviewed copy; the energy
    # text refresh is assumed to run after both branches - confirm.
    energy_text.text = energy_fill(hours_to_kWh(hours_slider.value, gpu),
                                   hours_to_co2(hours_slider.value, gpu))
291
+
292
+
293
def amp_update(attrname, old, new):
    """React to a change of the AMP mode dropdown.

    Recomputes the tipping point, solves for the width matching the current
    hours, moves the lower bounds of both sliders to the new tipping point,
    applies the width through ``compare_and_update`` and refreshes the
    energy text.
    """
    gpu = gpu_dropdown.value
    amp_mode = amp_mode_dropdown.value

    tipping_width = tipping_point(gpu, amp_mode, param_popt)
    update_tip(tip, tipping_width, gpu, amp_mode, loss_popt, param_popt)

    # The width is solved before the slider bounds move, as in the original.
    target_width = hours_to_width(hours_slider.value, gpu, amp_mode, param_popt)
    hours_slider.start = tip["hours"]
    dollars_slider.start = hours_to_dollars(tip["hours"], gpu)
    compare_and_update(target_width)

    energy_text.text = energy_fill(hours_to_kWh(hours_slider.value, gpu),
                                   hours_to_co2(hours_slider.value, gpu))
302
+
303
+
304
def loss_tap(event):
    """Tap handler for the loss plot.

    Takes the y coordinate of the tap as a loss value, converts it to a compute
    budget with ``loss_to_flo``, then to a parameter count and a width, and
    hands that width to ``compare_and_update``. The x coordinate is ignored.
    """
    tapped_loss = event.y
    compute_budget = loss_to_flo(tapped_loss, *loss_popt)
    compare_and_update(param_to_width(safe_flo_to_param(compute_budget, *param_popt)))
310
+
311
+
312
+ loss_plot.on_event(Tap, loss_tap)
313
+
314
+
315
def param_tap(event):
    """Tap handler for the parameter plot.

    Takes the y coordinate of the tap as a parameter count, converts it to a
    width and then to a number of hours for the selected GPU and AMP mode, and
    writes that number into the hours slider. The x coordinate is ignored.
    """
    tapped_params = event.y
    hours_slider.value = width_to_hours(param_to_width(tapped_params), gpu_dropdown.value,
                                        amp_mode_dropdown.value, param_popt)
320
+
321
+
322
+ param_plot.on_event(Tap, param_tap)
323
+
324
+ hours_slider.on_change('value', hours_update)
325
+ dollars_slider.on_change('value', dollars_update)
326
+ gpu_dropdown.on_change("value", gpu_update)
327
+ amp_mode_dropdown.on_change("value", amp_update)
328
+
329
+
330
+ ########################################################################################################################
331
+ # Buttons
332
+ ########################################################################################################################
333
+
334
def on_optimal_click():
    """Show the transformers snippet for the example shape in the code box."""
    width, depth = example_shape['example_width'], example_shape['example_depth']
    code_box.text = hf_code(width, depth)
336
+
337
+
338
def on_alternate_click():
    """Show the transformers snippet for the alternate shape in the code box."""
    width, depth = example_shape['alternate_width'], example_shape['alternate_depth']
    code_box.text = hf_code(width, depth)
340
+
341
+
342
+ input_text = Div(text="Choose a GPU, AMP mode, and budget:", width=sidebar_width, height=30,
343
+ style={"display": "block", "margin": "0 auto", 'font-size': "125%",
344
+ 'font-weight': "bold", "width": f"{sidebar_width}px", "text-align": 'center'})
345
+ initialize_optimal = Button(width=175, label="Initialize in 🤗transformers!")
346
+ initialize_optimal.align = "center"
347
+ initialize_optimal.on_click(on_optimal_click)
348
+ results_buffer = Div(text="", width=sidebar_width, height=5, style=sidebar_div_style)
349
+ initialize_alternate = Button(width=175, label="Initialize in 🤗transformers!")
350
+ initialize_alternate.align = "center"
351
+ initialize_alternate.on_click(on_alternate_click)
352
+
353
+ code_box_style = {"display": "block", "margin": "0 auto", "width": f"{sidebar_width + plot_width}px",
354
+ "text-align": 'center',
355
+ "white-space": "pre-wrap", "background": "#f4f4f4",
356
+ "border": "1px solid #ddd",
357
+ "border-left": "3px solid #f36d33",
358
+ "color": "#666",
359
+ "page-break-inside": "avoid",
360
+ "font-family": "monospace",
361
+ "font-size": "15px",
362
+ "line-height": "1.6",
363
+ "max-width": "100%",
364
+ "overflow": "hidden",
365
+ "min-height": "30px",
366
+ "word-wrap": "break-word"}
367
+ code_box = Div(text="Find the right model for you with the curves and sliders then click the buttons to display the "
368
+ "corresponding 🤗transformers code here!", width=sidebar_width + plot_width, style=code_box_style,
369
+ sizing_mode="scale_width")
370
+ code_box.align = "center"
371
+
372
+ ########################################################################################################################
373
+ # Add write-up text
374
+ ########################################################################################################################
375
+
376
+ text_width = "800px"
377
+ main_text_style = {"min-height": "100px",
378
+ "overflow": "hidden",
379
+ "display": "block",
380
+ "margin": "auto",
381
+ "width": text_width,
382
+ "font-size": "18px"}
383
+
384
+ formula_img_style_1 = {"min-height": "25px",
385
+ "display": "block",
386
+ "margin": "0 auto",
387
+ "width": text_width,
388
+ "height": "auto",
389
+ "max-width": "100%",
390
+ "max-height": "100%"}
391
+
392
+ formula_img_style_2 = {"min-height": "50px",
393
+ "display": "block",
394
+ "margin": "0 auto",
395
+ "width": text_width,
396
+ "height": "auto",
397
+ "max-width": "100%",
398
+ "max-height": "100%"}
399
+
400
+ text_1 = Div(text=md1, style=main_text_style)
401
+ text_2 = Div(text=md2, style=main_text_style)
402
+ text_3 = Div(text=md3, style=main_text_style)
403
+ text_4 = Div(text=md4, style=main_text_style)
404
+
405
+ ########################################################################################################################
406
+ # Loss plot in write-up
407
+ ########################################################################################################################
408
+
409
+ in_text_loss_plot = figure(plot_height=in_text_plot_height, plot_width=in_text_plot_width,
410
+ title="Validation loss during training for an array of models of different sizes",
411
+ tools="pan,reset,save,wheel_zoom,tap", active_scroll="wheel_zoom",
412
+ x_range=[min(all_points[:, 0]) * day_ratio, max(all_points[:, 0]) * day_ratio],
413
+ y_range=[min(all_points[:, 1]), max(all_points[:, 1])],
414
+ x_axis_type="log", y_axis_type="log",
415
+ x_axis_label="Floating-point operations (excluding embeddings & softmax)",
416
+ y_axis_label="Validation loss on Wikitext-103", output_backend="webgl")
417
+ in_text_loss_plot.add_layout(color_bar, "left")
418
+ in_text_loss_plot.align = "center"
419
+
420
+ source = ColumnDataSource(data=dict(
421
+ xs=[run[:, 0] * day_ratio for run in indexed_runs], # x coords for each line (list of lists)
422
+ ys=[run[:, 1] for run in indexed_runs], # y coords for each line (list of lists)
423
+ params=params_per_run # data to use for colormapping
424
+ ))
425
+ in_text_loss_plot.multi_line('xs', 'ys', source=source,
426
+ color=log_cmap('params', palette, min(params_per_run), max(params_per_run)))
427
+ source = ColumnDataSource(data=dict(
428
+ x=[compute for run in indexed_runs for compute in run[:, 0] * day_ratio], # x coords for each line (list of lists)
429
+ y=[loss for run in indexed_runs for loss in run[:, 1]], # y coords for each line (list of lists)
430
+ params=[repeated_params for i, params in enumerate(params_per_run)
431
+ for repeated_params in [params] * len(indexed_runs[i])] # data to use for colormapping
432
+ ))
433
+ in_text_loss_plot.scatter('x', 'y', source=source,
434
+ color=log_cmap('params', palette, min(params_per_run), max(params_per_run)), size=3)
435
+ # for i, run in indexed_runs.items():
436
+ # source = ColumnDataSource(data=dict(x=run[:, 0] * day_ratio, y=run[:, 1]))
437
+ # in_text_loss_plot.line('x', 'y', source=source, line_width=1, line_alpha=0.6, color=color_list[i])
438
+ # in_text_loss_plot.scatter('x', 'y', source=source, line_width=1, line_alpha=0.6, color=color_list[i])
439
+
440
+ in_text_param_plot = figure(plot_height=in_text_plot_height, plot_width=in_text_plot_width,
441
+ title="Optimal number of non-embedding parameters per floating-point operations budget",
442
+ tools="pan,reset,save,wheel_zoom,tap", active_scroll="wheel_zoom",
443
+ x_range=in_text_loss_plot.x_range,
444
+ y_range=[min(params_per_run), max(params_per_run)],
445
+ x_axis_type="log", y_axis_type="log",
446
+ x_axis_label="Floating-point operations (excluding embeddings & softmax)",
447
+ y_axis_label="Optimal number of non-embedding parameters", output_backend="webgl")
448
+ in_text_param_plot.add_layout(color_bar, "left")
449
+ in_text_param_plot.align = "center"
450
+ # for i, run_apex in enumerate(compute_at_hull):
451
+ # source = ColumnDataSource(data=dict(x=[compute_at_hull[i, 0] * day_ratio], y=[compute_at_hull[i, 1]]))
452
+ # in_text_param_plot.scatter('x', 'y', source=source, color=color_list[run_indices_at_hull[i]])
453
+
454
# Scatter source for the in-text parameter plot: one point per hull entry, coloured by the parameter
# count of the run it belongs to.
# The membership set is loop-invariant, so build it once instead of once per element of params_per_run.
_hull_run_indices = set(hull_points[:, 2])
source = ColumnDataSource(data=dict(x=compute_at_hull[:, 0] * day_ratio, y=compute_at_hull[:, 1],
                                    params=[params for i, params in enumerate(params_per_run)
                                            if i in _hull_run_indices]))
in_text_param_plot.scatter('x', 'y', source=source,
                           color=log_cmap('params', palette, min(params_per_run), max(params_per_run)))
459
+
460
+ training_button = Button(width=175, label="Fit!")
461
+ training_button.align = "center"
462
+ fit_button = Button(width=175, label="Fit!")
463
+ fit_button.align = "center"
464
+
465
+
466
def on_train_click():
    """Click handler for ``training_button``: draw the fitted loss curve in red on the in-text loss plot.

    The curve is evaluated with ``loss_fit`` and ``loss_popt`` at every compute
    value found in ``all_points``, plus one extra point 25% beyond each end.
    """
    # Ascending compute values, padded with min / 1.25 in front and max * 1.25 at the back.
    display_abscisses = np.array([min(all_points[:, 0]) / 1.25] + sorted(list(all_points[:, 0])) +
                                 [max(all_points[:, 0]) * 1.25])
    # x is scaled by day_ratio for display; y is evaluated on the unscaled values.
    # NOTE(review): display_abscisses is already built in ascending order, so both sorted() calls look
    # redundant; they also turn the arrays into plain lists - confirm loss_fit does not rely on that.
    source = ColumnDataSource(
        data=dict(x=sorted(display_abscisses * day_ratio), y=loss_fit(sorted(display_abscisses), *loss_popt)))
    # NOTE(review): every click adds a new line glyph to the plot; nothing here removes an earlier one.
    in_text_loss_plot.line('x', 'y', source=source, line_width=1, line_alpha=1, color="red")
472
+
473
+
474
def on_fit_click():
    """Click handler for ``fit_button``: draw the fitted parameter curve in orange on the in-text parameter plot.

    The curve is evaluated with ``safe_flo_to_param`` at every compute value in
    ``compute_at_hull``, plus one extra point 25% beyond each end.
    """
    # Ascending hull compute values, padded with min / 1.25 in front and max * 1.25 at the back.
    display_abscisses = np.array([min(compute_at_hull[:, 0]) / 1.25] + sorted(list(compute_at_hull[:, 0])) +
                                 [max(compute_at_hull[:, 0]) * 1.25])
    # x is scaled by day_ratio for display; y is evaluated on the unscaled values.
    # NOTE(review): d, e, f come from outside this function; presumably they are the fitted coefficients
    # that other callbacks pass as *param_popt - confirm they stay in sync.
    source = ColumnDataSource(data=dict(x=display_abscisses * day_ratio,
                                        y=safe_flo_to_param(display_abscisses, d, e, f)))
    # NOTE(review): every click adds a new line glyph to the plot; nothing here removes an earlier one.
    in_text_param_plot.line('x', 'y', source=source, line_width=1, line_alpha=0.8, color="orange")
480
+
481
+
482
+ training_button.on_click(on_train_click)
483
+ fit_button.on_click(on_fit_click)
484
+
485
+ before_text = column(text_1, training_button, in_text_loss_plot, text_2, fit_button, in_text_param_plot, text_3)
486
+ after_text = column(text_4)
487
+
488
+ ########################################################################################################################
489
+ # Set up layouts and add to document
490
+ ########################################################################################################################
491
+
492
+ inputs = column(input_text, gpu_dropdown, amp_mode_dropdown, hours_slider, dollars_slider, input_buffer, energy_text,
493
+ sizing_mode="scale_width", width=sidebar_width, height=plot_height)
494
+
495
+ results = column(static_loss_text,
496
+ optimal_loss_text,
497
+ static_param_text,
498
+ optimal_param_text,
499
+ static_shape_text,
500
+ optimal_shape_text,
501
+ initialize_optimal,
502
+ results_buffer,
503
+ static_altshape_text,
504
+ optimal_altshape_text,
505
+ initialize_alternate, sizing_mode="scale_width", width=sidebar_width, height=plot_height)
506
+
507
+ # app = column(row(inputs, loss_plot, sizing_mode="scale_width"), row(results, param_plot, sizing_mode="scale_width"),
508
+ # code_box, sizing_mode="scale_width")
509
+ app = column(row(column(inputs, results, sizing_mode="fixed"),
510
+ column(loss_plot, param_plot, sizing_mode="stretch_width", )),
511
+ code_box, sizing_mode="scale_width")
512
+ before_text.align = "center"
513
+ app.align = "center"
514
+ after_text.align = "center"
515
+
516
+ main_body = column(before_text, app, after_text, sizing_mode="scale_width")
517
+
518
+ curdoc().add_root(main_body)
519
+ curdoc().title = "How big should my language model be ?"
app_to_share/optimal_training/static/formula_1.png ADDED
app_to_share/optimal_training/static/formula_2.png ADDED
app_to_share/optimal_training/static/loss_vs_compute.csv ADDED
The diff for this file is too large to render. See raw diff
 
app_to_share/optimal_training/utils.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import numpy as np
3
+
4
+ from conversions import day_ratio
5
+
6
+
7
def clean_run(run):
    """Return the (first, second) pairs of ``run`` with "undefined" entries removed.

    Pairs whose second element equals the string "undefined" are dropped; the
    second element of every remaining pair is converted with ``float``.
    """
    cleaned = []
    for step, value in run:
        if value == "undefined":
            continue
        cleaned.append((step, float(value)))
    return cleaned
9
+
10
+
11
def param_count(run):
    """Return a rounded count derived from the first value of the first entry of ``run``.

    The value is divided by 4000, 150, 60 and 6 in turn, multiplied by
    ``day_ratio`` and rounded.
    """
    # presumably the divisors encode steps per evaluation, sequence length, batch size and a
    # FLOs-per-parameter factor - confirm against the training setup
    first_eval_compute = run[0][0]
    scaled = first_eval_compute / 4000 / 150 / 60 / 6
    return round(scaled * day_ratio)
14
+
15
+
16
def convert_to_logspace(run, a, b, c):
    """Return a copy of ``run`` with both columns mapped to log space.

    Column 0 becomes ``b * log(column 0)`` and column 1 becomes
    ``log(a) - log(column 1 - c)``. The input array is left unmodified.
    """
    converted = copy.deepcopy(run)
    compute, loss = run[:, 0], run[:, 1]
    converted[:, 0] = np.log(compute) * b
    converted[:, 1] = np.log(a) - np.log(loss - c)
    return converted
21
+
22
+
23
+ # OpenAI used another unit for floating-point operations with a ratio of the number of seconds in a day; we'll display
24
+ # the raw number, but do the calculations with the ratio as it can overflow without it (convex hull notably fails)
25
+
26
+
27
def hf_code(width, depth):
    """Return an HTML-highlighted transformers snippet building a Transformer-XL model of the given shape.

    The snippet creates a ``TransfoXLConfig`` with ``d_model``, ``d_embed`` and
    ``d_inner`` all set to ``width``, ``n_head`` fixed at 8, ``d_head`` set to
    ``int(width / 8)``, ``n_layer`` set to ``depth`` and ``tgt_len`` /
    ``mem_len`` fixed at 152, then instantiates ``TransfoXLModel`` from it.
    The returned string contains ``<span>`` markup and is meant to be shown as HTML.
    """
    return f"""<span style="color: #008800; font-weight: bold">import</span> <span style="color: #0e84b5; font-weight: bold">transformers</span>
config <span style="color: #333333">=</span> transformers<span style="color: #333333">.</span>TransfoXLConfig(d_model<span style="color: #333333">=</span><span style="color: #40a070">{width}</span>, d_embed<span style="color: #333333">=</span><span style="color: #40a070">{width}</span>, n_head<span style="color: #333333">=</span><span style="color: #40a070">8</span>, d_head<span style="color: #333333">=</span><span style="color: #40a070">{int(width / 8)}</span>, d_inner<span style="color: #333333">=</span><span style="color: #40a070">{width}</span>, n_layer<span style="color: #333333">=</span><span style="color: #40a070">{depth}</span>, tgt_len<span style="color: #333333">=</span><span style="color: #40a070">152</span>, mem_len<span style="color: #333333">=</span><span style="color: #40a070">152</span>)
model <span style="color: #333333">=</span> transformers<span style="color: #333333">.</span>TransfoXLModel(config)"""
32
+
33
+
34
def co2_to_trees(co2):
    """Convert an amount of CO2 into the tree-growing figure shown by ``energy_fill``.

    Despite the name, the caller presents the result as a number of days of
    growing one tree. The value is ``co2 / 60 * 3650``.
    """
    # presumably 60 kg of CO2 absorbed by one tree over 3650 days (ten years) - confirm against the EPA reference
    co2_per_tree = 60
    days_per_tree = 3650
    return co2 / co2_per_tree * days_per_tree
36
+
37
+
38
def co2_to_kms(co2):
    """Convert an amount of CO2 into the equivalent car distance in kilometres.

    The value is ``co2 / 0.403 * 1.60934``.
    """
    # presumably 0.403 kg of CO2 emitted per mile driven - confirm against the EPA reference
    co2_per_mile = 0.403
    km_per_mile = 1.60934
    return co2 / co2_per_mile * km_per_mile
40
+
41
+
42
def energy_fill(kWh, co2):
    """Build the HTML sentence summarising energy use and CO2 emissions.

    Four figures are inserted, each formatted with two decimals and shown in
    bold: ``kWh``, ``co2``, ``co2_to_kms(co2)`` and ``co2_to_trees(co2)``.
    The sentence ends with a footnote link to the EPA equivalencies page.
    """
    bold_number = '<span style="font-weight: bold">{:.2f}</span> '
    template = "".join((
        'This will consume about ', bold_number,
        'kWh, releasing ', bold_number,
        'kgs of CO2. That is equivalent to ', bold_number,
        'kms with an average American passenger car and could be offset by growing a tree for ', bold_number,
        'days.<sup><a href=',
        '"https://www.epa.gov/energy/greenhouse-gases-equivalencies-calculator-calculations-and-references#miles"',
        '>1</a></sup>',
    ))
    return template.format(kWh, co2, co2_to_kms(co2), co2_to_trees(co2))
51
+
52
+
53
+ md1 = """<h1 id="how-big-should-my-language-model-be">How Big Should My Language Model Be?</h1>
54
+ <img class='center' style='height: 5em; float: right;' src='https://raw.githubusercontent.com/TevenLeScao/transformer-xl/master/pytorch/assets/avatar_logo_joint.png' alt='avatar'>
55
+ <h4>Published on June 08, 2020.</h4>
56
+ <h4>Teven Le Scao, researcher at Hugging Face • <a href="https://twitter.com/Fluke_Ellington">@Fluke_Ellington</a> </h4>
57
+ <p>Natural Language Processing can sometimes feel like model size is optimized for headlines. <a href="https://arxiv.org/abs/2005.14165">175 billion parameters</a> is certainly an eye-catching number! Why not just train more efficiently with a smaller model? One surprising scaling effect of deep learning is that <strong>bigger neural networks are actually compute-efficient.</strong> This is something OpenAI in particular has explored in papers like <em><a href="https://arxiv.org/abs/2001.08361">Scaling Laws for Neural Language Models</a></em>. Research at Hugging Face also leverages this phenomenon, and we&#39;ve combined it with GPU speed estimations to ensure model size is just right for the compute budget of the experiment (when in doubt, it&#39;s bigger than you think!). This blog post will show how this impacts architecture decisions on a standard language modeling benchmark: <strong>we replicate the 14-layer state-of-the-art result from <a href="https://arxiv.org/pdf/1901.02860.pdf">Dai et al.&#39;s Transformer-XL paper</a> without any hyper-parameter optimization and saving 25% of training time</strong>. We also estimate that <strong>the 18-layer model from the same paper trained for an order of magnitude too many training steps.</strong> <a name="start">Wanna</a> play with our demo before reading? Just click <a href="#demo">here</a>!</p>
58
+ <h2 id="1-there-is-an-optimal-time-to-stop-training-and-its-earlier-than-you-think">1. There is an optimal time to stop training (and it&#39;s earlier than you think)</h2>
59
+ <p>Let&#39;s look at some loss curves. For our example, the task will be training Transformer-XL, the state-of-the-art in language modeling, on Wikitext-103, a standard, medium-size benchmark. GPT-2 doesn&#39;t perform well on this dataset scale. As training progresses, we&#39;ll look at the performance of the model (as measured by validation loss) depending on compute cost (as measured by floating point operations). Let&#39;s run a few experiments! In the following plot, every line of colour corresponds to a Transformer-XL run of 200000 steps with a different number and size of layers, with all other hyperparameters kept the same. This spans models from a mere thousand to a hundred million parameters (excluding embeddings). Bigger models are on the right as they require more compute for every step. Don&#39;t worry, we&#39;ve already run them so you don&#39;t have to. All graphs are interactive, play with them!</p>"""
60
+ md2 = """
61
+ <p>As introduced in <em>Scaling Laws</em>, we plot the validation loss against non-embedding floating-point operations (neFLOs). There seems to be a frontier of performance for a given neFLO budget that no model manages to beat, depicted here in red. In <em>Scaling Laws</em>, it is referred to as the compute frontier. Every run reaches it, or comes close, after an initial phase of quick loss improvement, then tapers away as the end of training is not as efficient. This has a very practical implication: if you have a certain budget in floating-point operations, to reach the best performance, you should choose a model size that reaches the compute frontier after that many operations and stop it at that moment. This is actually way before model convergence, which usually happens around 10 times later! In essence, if you have extra compute budget, you should invest most of it in a bigger model, and only a small part in more training steps. In <em>Scaling Laws</em>, the OpenAI team fitted a power law to the compute frontier on GPT-2 training. This still seems to be a good fit in our task. In addition, we also fitted a power law between the compute budget and the number of parameters of the model that is optimal for that budget. It is pictured in the following plot.</p>
62
+
63
+ """
64
+ md3 = """
65
+ <p>As good models tend to spend considerable time tangent on the compute frontier, there is a bit of noise in the relationship. However, this also means that there is more tolerance in the estimation even if the model size we predict is a bit off, as the imperfect model will still be very close to optimal. We find that <strong>if the compute budget is multiplied by 10, the optimal model size is multiplied by 7.41 and the number of optimal training steps by only 1.35</strong>. Extrapolating with this rule to the much-bigger 18-layer SOTA model from Dai et al., we find that <strong>its optimal number of training steps was around 250000</strong>. Even if this number is imprecise due to the change of scale, <strong>it is much smaller than the 4 million steps from their replication script</strong>. Starting from an even bigger model and stopping earlier would have yielded a better loss for that (huge) compute budget.</p>
66
+ <h2 id="2-gpus-are-optimized-for-large-wide-models">2. GPUs are optimized for large, wide models</h2>
67
+ <p>We now have a rule connecting performance and optimal size with neFLOs. However, neFLOs are a bit hard to picture. Can we translate that into a more immediate resource, like training time? Whether you are constrained by temporal or financial constraints, the main resource is GPU time. In order to establish a connection between neFLOs and GPU time, we benchmarked different Transformer-XL model sizes on 4 different GPUs available on Google Cloud Platform across tens of thousands of runs, taking into account mixed precision training. Here are our findings:</p>
68
+ <h5 id="speed-estimation">Speed estimation</h5>
69
+ <p>neFLOs per second speed can be modeled as a factorized multivariate function (sounds scary, but this just means the equation can be written simply as below) of model width (the number of neurons per layer), depth (the number of layers) and batch size, by increasing order of importance. In our estimations, the maximum prediction error was 15% of the observed speed.</p>
70
+ <img class='center' style='height: 1.25em;' src='https://raw.githubusercontent.com/TevenLeScao/transformer-xl/master/pytorch/assets/formula_1.png' alt='formula_1'>
71
+ <h5 id="width">Width</h5>
72
+ <p>GPUs are optimized for the large feed-forward layers of wide transformers. In all of our experiments, neFLOs per second depended on model width as <strong>a power law of exponent around 1.6</strong>. This means that a model that&#39;s twice as wide, which requires 4 times more operations, also goes through those operations around 3.16 times faster, <strong>nearly offsetting the additional compute cost</strong>.</p>
73
+ <h5 id="depth">Depth</h5>
74
+ <p>neFLOs per second were also positively correlated with depth. Our best results were attained by modeling this connection as proportional to depth * (depth + additive constant). This is coherent with the fact that Transformers must process layers serially. In essence, <strong>deeper models aren&#39;t actually faster, but they appear to be so as their overhead is smaller relative to the more productive operations</strong>. The additive constant, which represents this overhead, was consistently around 5 in our experiments, which essentially means that data loading to the GPU, embeddings, and softmax operations, represent around 5 transformer layers&#39; worth of time.</p>
75
+ <h5 id="batch-size">Batch size</h5>
76
+ <p>Batch size played the least role. It was <strong>positively correlated with speed for small values, but quickly saturated</strong> (and even seemed to hurt at high values, after 64 on the V100 and P100 and 16 on the K80 and P4). We modeled its contribution as a logarithmic function to keep things simple as it was also the variable for which the factorized independence assumption was the weakest. We ran all our experiments at size 64 on a single GPU. This is another perk of big models: <strong>as bigger batch sizes don&#39;t seem to help much, if your model is too big to fit on a GPU, you could just use a smaller batch size and gradient accumulation.</strong></p>
77
+ <h5 id="powers-of-2-still-matter-in-2020">Powers of 2 still matter in 2020!</h5>
78
+ <p>Finally, one surprising takeaway was that <strong>hyperparameters whose width or batch size were powers of 2 out-performed the others</strong>. That was the case on GPUs with and without Tensor Core capability. On Tensor Core GPUs like the V100, NVIDIA recommends tensor shapes that are multiples of 8; however, we kept seeing improvements beyond that, up to multiples of 512. In the end, we only fitted on powers of 2 as fitting on all data points meant a poor fit quality that consistently under-estimated speed for powers of 2 points, and one might as well choose the fastest parameters.</p>
79
+ <p>In the end, our final estimation of operation speed was as follows:</p>
80
+ <img class='center' style='height: 2.5em;' src='https://raw.githubusercontent.com/TevenLeScao/transformer-xl/master/pytorch/assets/formula_2.png' alt='formula_2'>
81
+ <p>with, for example on a V100 GPU without mixed precision, k=2.21*10<sup>7</sup>, a=1.66, b=5.92, and c=1.33. Different GPUs had close results with a different multiplicative constant.</p>
82
+ <h2 id="3-demonstration-on-a-language-modeling-task-wikitext-103">3. <a name="demo">Demonstration on a language modeling task: Wikitext-103</a></h2>
83
+ <p>Now that we have obtained a relation between model size and training speed, we can predict, for a certain GPU time or price budget, the optimal model size on the task and the performance it will achieve.</p>
84
+
85
+ """
86
+ md4 = """<p>Prices are indicated for Google Cloud Platform. The energy consumption was estimated thanks to Peter Henderson&#39;s <a href="https://github.com/Breakend/experiment-impact-tracker">Experiment impact tracker</a> and the CO2 emissions with <a href="https://www.electricitymap.org/zone/NL">Electricity map</a> Netherlands data (where Google&#39;s European servers are located). Even though huge training costs make headlines, it is still possible to replicate a state-of-the-art result on a medium-size dataset for thirty bucks! A single V100 with properly optimized training is already quite a powerful weapon.</p>
87
+ <p>Data shown is for single-GPU training at batch size 60 on Wikitext-103 for a target and memory length of 150, following CMU&#39;s Transformer-XL <a href="https://github.com/kimiyoung/transformer-xl">repo</a>. In order to leverage the Tensor Core capability of the V100, we set batch size 64 and sequence length 152 on that GPU. In our model size and speed predictions, we assumed that the inner feed-forward layer dimension was the same as the embedding and attention dimensions, and that the width-to-depth ratio was constant. This is a good way to save memory, as <em><a href="https://arxiv.org/abs/2001.04451">Reformer</a></em> has shown. <em><a href="https://arxiv.org/abs/2001.08361">Scaling Laws</a></em> has observed that shape doesn&#39;t impact performance significantly in GPT-2. However, for large scales, we found that the final performance of taller models with a bigger feed-forward layer was consistently better, which is why we give two possible model shapes. </p>
88
+ <p>In order to replicate the result of the medium-size Transformer-XL pre-trained model (3.15 loss), we tweaked our example model size to add a bigger feed-forward dimension and have high powers of 2 while keeping the same number of parameters. This gave us a model of 14 layers with 768 hidden dimensions and 1024 feed-forward dimensions. In comparison, the CMU pre-trained model was found through aggressive hyper-parameter search with a much more unusual shape of 16 layers of 410 hidden dimensions and 2100 feed-forward dimensions. In our experiment, even though it was 50% bigger, our model was actually 20% faster per batch on an NVIDIA RTX Titan as its shapes were high powers of 2, and it was a shorter, wider model. For that model, the script provided by the CMU team was already very close to optimal stopping time; in the end, we obtained the same performance with <strong>25% less training time</strong>. Most importantly, this was the case even though the pre-trained model&#39;s hyper-parameter tuning gave it a much more optimized shape, and we had also kept the same random seed it was tuned with. Since we calculated our scaling laws with much smaller-scale trainings, saving on parameter search might actually be the bigger gain here. If you took the shortcut to the demo before reading, you can come back to the start <a href="#start">here</a>!</p>
89
+ <h2 id="4-takeaways">4. Takeaways</h2>
90
+ <ul>
91
+ <li>Big models are surprisingly efficient!</li>
92
+ <li>Training until convergence is not efficient at all.</li>
93
+ <li>Benchmarking smaller-scale runs allows us to predict model performance and optimal stopping time for production-scale models.</li>
94
+ <li>Using larger models stopped earlier and optimizing model size for speed lowers training costs.</li>
95
+ </ul>
96
+ <p>I built this tool automatically using the data from our Transformer-XL runs. If you are interested in having this feature available for other NLP tasks as part of the Hugging Face repository, you can contact me on Twitter at <a href="https://twitter.com/Fluke_Ellington">@Fluke_Ellington</a>, drop me a mail at <code>teven@huggingface.co</code>, or add a reaction on <a href="https://github.com/huggingface/transformers/issues/4847">our Github issue</a>!</p>
97
+
98
+ """
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bokeh==2.0.2
2
+ Jinja2==2.11.2
3
+ MarkupSafe==1.1.1
4
+ numpy==1.18.4
5
+ packaging==20.4
6
+ pandas==1.0.3
7
+ Pillow==7.1.2
8
+ pyparsing==2.4.7
9
+ python-dateutil==2.8.1
10
+ pytz==2020.1
11
+ PyYAML==5.3.1
12
+ randomcolor==0.4.4.5
13
+ scipy==1.4.1
14
+ six==1.15.0
15
+ tornado==6.0.4
16
+ typing-extensions==3.7.4.2
17
+