import math
import numpy as np
from scipy.optimize import root
day_ratio = 24 * 3600  # seconds per day; converts between raw FLOPs and the day-scaled compute used in the fits
depth_width_ratio = 128  # assumed aspect ratio of the reference models: depth = width / 128
# Fitted throughput-model constants per GPU: (k, k1, k2, b, c, layer_base),
# consumed by flo_speed below.
constants_per_gpu = {
"V100": [2.21527743e+07, 1.18538628e+00, 1.43150104e+00, 1.66015023e+00,
1.32808220e+00, 5.91503856e+00],
"V100 (without tensor cores and cudnn.benchmark)": [1.82997989e+07, 1.05349588e+00, 1.25312127e+00, 1.67071294e+00,
1.44610885e+00, 5.55824273e+00],
"P100": [6.01863899e+07, 9.23656025e-01, 1.03230702e+00, 1.46733667e+00,
1.03031298e+00, 5.38021875e+00],
"P4": [4.84472202e+07, 9.86822195e-01, 1.23474901e+00, 1.38493518e+00,
1.04630858e+00, 1.03572754e+01],
"K80": [2.58592374e+07, 6.42050890e-01, 7.06115162e-01, 1.44360777e+00,
7.50695980e-01, 6.25951436e+00]
}
# Hourly cloud price per GPU, in USD.
price_per_gpu = {
"K80": 0.584,
"P4": 0.689,
"V100": 2.005,
"V100 (without tensor cores and cudnn.benchmark)": 2.005,
"P100": 1.416,
}
# Batch size with the best measured throughput on each GPU.
optimal_batch_size_per_gpu = {
"P4": 16,
"V100": 64,
"V100 (without tensor cores and cudnn.benchmark)": 64,
"P100": 64,
"K80": 16
}
# One-hot encoding of the mixed-precision (AMP) optimization level.
features_per_amp_mode = {
"O0": (1, 0, 0),
"O1": (0, 1, 0),
"O2": (0, 0, 1)
}
# Average power draw per GPU, in kW (so hours * consumption = kWh).
gpu_consumption = {
"V100": 119.3495934959e-3,
"V100 (without tensor cores and cudnn.benchmark)": 119.3495934959e-3,
"K80": 142.42e-3,
"P4": 55.27e-3,
"P100": 139.65e-3
}
# Grid carbon intensity: 534 g of CO2 per kWh, converted to kg per kWh.
co2_intensity = 534 * 1e-3
def flo_speed(features, constants):
    """Fitted GPU throughput in FLOPs per second:
    k * k1^o1 * k2^o2 * depth / (depth + layer_base) * width^b * log(batch_size + 1)^c,
    where (o0, o1, o2) one-hot encodes the AMP mode."""
    k, k1, k2, b, c, layer_base = constants
    o0, o1, o2, x, y, z = features
    return k * np.power(k1, o1) * np.power(k2, o2) * x / (x + layer_base) * np.power(y, b) * np.power(np.log(z + 1), c)
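# For instance (illustrative numbers, not a benchmark): the fitted V100 speed
# for a width-512, depth-4 model at its optimal batch size of 64 under AMP O1,
#   flo_speed((0, 1, 0, 4, 512, 64), constants_per_gpu["V100"])
# comes out to roughly 2e12 FLOPs per second.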
def param_polynomial(width, depth=None, inner=None):
    """Parameter count of the reference architecture. When depth (and
    optionally the feed-forward inner dimension) is given, counts directly;
    otherwise substitutes depth = width / depth_width_ratio. Without inner,
    the inner dimension defaults to the width."""
    if depth is not None:
        if inner is not None:
            return 5 * depth * (width ** 2) + 2 * depth * (width * inner) + 7 * depth * width + depth * inner + 3 * width + 3
        else:
            return 7 * depth * (width ** 2) + 8 * depth * width + 3 * width + 3
    else:
        if inner is not None:
            return 5 / depth_width_ratio * (width ** 3) + 2 / depth_width_ratio * (width ** 2 * inner) + 7 / depth_width_ratio * width ** 2 + 1 / depth_width_ratio * width * inner + 3 * width + 3
        else:
            return 7 / depth_width_ratio * (width ** 3) + 8 / depth_width_ratio * (width ** 2) + 3 * width + 3
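# Sanity check: param_polynomial(512, depth=4)
# = 7 * 4 * 512 ** 2 + 8 * 4 * 512 + 3 * 512 + 3 = 7,357,955, i.e. about 7.4M
# parameters for a width-512, depth-4 model.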
def optimal_model_shape(width, param_number, base=8):
    """Given a reference width and a parameter budget, picks the depth implied
    by depth_width_ratio, then solves the quadratic
    7d * w^2 + (8d + 3) * w + 3 = param_number for the width, rounded to a
    multiple of `base`."""
    depth = max(1, math.floor(width / depth_width_ratio))
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width
def alternate_model_shape(width, param_number, base=8):
    """Like optimal_model_shape, but deliberately deeper: depth grows as
    0.3 * width^1.25 / depth_width_ratio (and is at least one layer deeper
    than the linear rule), with the width re-solved for the same budget."""
    linear_depth = max(1, math.floor(width / depth_width_ratio))
    depth = max(linear_depth + 1, math.floor(0.3 * width ** 1.25 / depth_width_ratio))
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width
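# For instance, optimal_model_shape(1024, 1e8) picks depth = 1024 // 128 = 8,
# then solves 56 * w ** 2 + 67 * w + 3 = 1e8 for w, giving roughly (8, 1336).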
def hours_to_width(hours, gpu, amp_mode, param_popt):
    """Inverts the compute pipeline: finds the width whose reference-shaped
    model exhausts `hours` of training on the given GPU and AMP mode, using
    the fitted params-from-FLOPs coefficients in param_popt."""
    seconds = hours * 3600
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]
    def equation_function(width):
        # Training time implied by the width (FLOPs from the inverted
        # parameter fit, divided by fitted GPU speed) minus the time budget.
        return np.power((param_polynomial(width) - f) / d, 1 / e) / flo_speed(
            (*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
            constants) * day_ratio - seconds
    width = iterative_solutions(equation_function, initial_guess=128)
    return width
def iterative_solutions(equation_function, initial_guess):
    """Root-finds equation_function, halving the initial guess (down to 16)
    until the residual at the solution is below 1.0; returns the last
    candidate even if none converged."""
    width = initial_guess
    while initial_guess > 16:
        solution_array = root(equation_function, np.array([initial_guess]), method="hybr").x
        width = solution_array[0]
        should_be_zero = equation_function(width)
        if np.abs(should_be_zero) < 1e0:
            return width
        else:
            initial_guess *= 0.5
    return width
def width_to_flo(width, d, e, f):
    # FLOPs needed to train a width-w reference model, via the inverted parameter fit.
    return np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio
def loss_fit(x, a, b, c):
    # Power-law fit of final loss against compute: a * x^-b + c.
    return a * np.power(x, -b) + c
def param_fit(x, d, e, f):
    # Fit of log(parameter count) against compute: log(d * x^e + f).
    return np.log(d * np.power(x, e) + f)
def hours_to_dollars(hours, gpu):
    return hours * price_per_gpu[gpu]
def dollars_to_hours(dollars, gpu):
    return dollars / price_per_gpu[gpu]
def hours_to_kWh(hours, gpu):
    return hours * gpu_consumption[gpu]
def hours_to_co2(hours, gpu):
    return hours * gpu_consumption[gpu] * co2_intensity
def loss_to_flo(loss, a, b, c):
    # Inverse of loss_fit: compute needed to reach a target loss.
    return ((loss - c) / a) ** (-1 / b)
def param_to_flo(param_number, d, e, f):
    # Inverse of the parameter fit: compute at which the fit predicts param_number.
    return ((param_number - f) / d) ** (1 / e)
def safe_flo_to_param(flo, d, e, f):
    # Forward parameter fit without the log, safe for any non-negative flo.
    return d * np.power(flo, e) + f
def param_to_width(param_number):
    # Solves the depth = width / depth_width_ratio cubic for the width whose
    # model has param_number parameters, keeping the largest real root.
    poly_params = np.array([7 / depth_width_ratio, 8 / depth_width_ratio, 3, 3 - param_number])
    roots = np.roots(poly_params)
    real_roots = [np.real(candidate) for candidate in roots if np.abs(np.imag(candidate)) < 1e-5]
    width = max(real_roots)
    return width
def safe_param_to_width(param_number):
    # np.roots can fail to converge; retry on a 1.5x larger budget if it does.
    try:
        return param_to_width(param_number)
    except np.linalg.LinAlgError:
        return safe_param_to_width(1.5 * param_number)
def width_to_hours(width, gpu, amp_mode, param_popt):
    """Forward pipeline: width -> parameter count -> FLOPs (via the inverted
    parameter fit) -> seconds at the fitted GPU speed -> hours."""
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]
    flos_from_params = np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio
    speed = flo_speed((*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]), constants)
    seconds = flos_from_params / speed
    return seconds / 3600
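# With the same placeholder coefficients as in the demo at the bottom of this
# file, param_popt = (100.0, 0.5, 0.0), width_to_hours(512, "V100", "O1",
# param_popt) predicts on the order of a few minutes of GPU time.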
def param_prime(width, depth=None):
    # Derivative of param_polynomial (without inner) with respect to width.
    if depth is not None:
        return 14 * depth * width + 8 * depth + 3
    else:
        return 21 / depth_width_ratio * (width ** 2) + 16 / depth_width_ratio * width + 3
def flo_speed_prime(width, gpu, amp_mode):
    # Derivative of flo_speed with respect to width after substituting
    # depth = width / depth_width_ratio, so that speed is proportional to
    # width^(b + 1) / (width + layer_base * depth_width_ratio).
    k, k1, k2, b, c, layer_base = constants_per_gpu[gpu]
    o0, o1, o2 = features_per_amp_mode[amp_mode]
    mult_constant = k * np.power(k1, o1) * np.power(k2, o2) * np.power(np.log(optimal_batch_size_per_gpu[gpu] + 1), c)
    return mult_constant * ((b + 1) * np.power(width, b) / (width + layer_base * depth_width_ratio)
                            - np.power(width, b + 1) / (width + layer_base * depth_width_ratio) ** 2)
# Awful equation: we look for the width below which shrinking the model no
# longer saves training time, i.e. the stationary point of training time
# (FLOPs from the parameter fit, divided by speed) as a function of width.
def tipping_point(gpu, amp_mode, param_popt):
    d, e, f = param_popt
    o0, o1, o2 = features_per_amp_mode[amp_mode]
    def equation_function(width):
        # Vanishes exactly where d(training time)/d(width) = 0.
        return np.power((param_polynomial(width) - f) / d, -1) / e * param_prime(width) / d \
            * flo_speed((o0, o1, o2, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
                        constants_per_gpu[gpu]) - \
            flo_speed_prime(width, gpu, amp_mode)
    tipping_width = iterative_solutions(equation_function, initial_guess=100)
    return tipping_width
def update_tip(tip, width, gpu, amp_mode, loss_popt, param_popt):
    # Fills in the tipping-point summary: model shape, compute, predicted loss
    # and training time at the given width.
    a, b, c = loss_popt
    d, e, f = param_popt
    tip["width"] = width
    tip["param_number"] = param_polynomial(width)
    tip["flo"] = param_to_flo(tip["param_number"], d, e, f)
    tip["loss"] = loss_fit(tip["flo"], a, b, c)
    tip["hours"] = width_to_hours(width, gpu, amp_mode, param_popt)