|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 10580, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1890359168241966, |
|
"grad_norm": 0.37213513255119324, |
|
"learning_rate": 0.0003, |
|
"loss": 1.9201, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3780718336483932, |
|
"grad_norm": 0.5696528553962708, |
|
"learning_rate": 0.0003, |
|
"loss": 1.8659, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5671077504725898, |
|
"grad_norm": 0.44589683413505554, |
|
"learning_rate": 0.0003, |
|
"loss": 1.8382, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7561436672967864, |
|
"grad_norm": 0.4036942720413208, |
|
"learning_rate": 0.0003, |
|
"loss": 1.8431, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.945179584120983, |
|
"grad_norm": 0.45892009139060974, |
|
"learning_rate": 0.0003, |
|
"loss": 1.8369, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.575076923076923, |
|
"eval_loss": 1.6031557321548462, |
|
"eval_runtime": 5.71, |
|
"eval_samples_per_second": 87.566, |
|
"eval_steps_per_second": 11.033, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.1342155009451795, |
|
"grad_norm": 0.4805261194705963, |
|
"learning_rate": 0.0003, |
|
"loss": 1.7006, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3232514177693762, |
|
"grad_norm": 0.4189806878566742, |
|
"learning_rate": 0.0003, |
|
"loss": 1.6276, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5122873345935728, |
|
"grad_norm": 0.4777927100658417, |
|
"learning_rate": 0.0003, |
|
"loss": 1.6294, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7013232514177694, |
|
"grad_norm": 0.4425320327281952, |
|
"learning_rate": 0.0003, |
|
"loss": 1.6196, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8903591682419658, |
|
"grad_norm": 0.5979380011558533, |
|
"learning_rate": 0.0003, |
|
"loss": 1.6451, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5745641025641026, |
|
"eval_loss": 1.635698676109314, |
|
"eval_runtime": 5.9951, |
|
"eval_samples_per_second": 83.401, |
|
"eval_steps_per_second": 10.509, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.0793950850661624, |
|
"grad_norm": 0.5178828239440918, |
|
"learning_rate": 0.0003, |
|
"loss": 1.514, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.268431001890359, |
|
"grad_norm": 0.7963550090789795, |
|
"learning_rate": 0.0003, |
|
"loss": 1.3597, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.4574669187145557, |
|
"grad_norm": 0.7210434675216675, |
|
"learning_rate": 0.0003, |
|
"loss": 1.3774, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.6465028355387523, |
|
"grad_norm": 0.6484540700912476, |
|
"learning_rate": 0.0003, |
|
"loss": 1.374, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.835538752362949, |
|
"grad_norm": 0.6798349618911743, |
|
"learning_rate": 0.0003, |
|
"loss": 1.3703, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.5716410256410256, |
|
"eval_loss": 1.7677161693572998, |
|
"eval_runtime": 5.9596, |
|
"eval_samples_per_second": 83.899, |
|
"eval_steps_per_second": 10.571, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 3.0245746691871456, |
|
"grad_norm": 0.6527106761932373, |
|
"learning_rate": 0.0003, |
|
"loss": 1.3419, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.213610586011342, |
|
"grad_norm": 0.6613947749137878, |
|
"learning_rate": 0.0003, |
|
"loss": 1.0932, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.402646502835539, |
|
"grad_norm": 0.7362242341041565, |
|
"learning_rate": 0.0003, |
|
"loss": 1.1188, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.5916824196597354, |
|
"grad_norm": 0.6629425287246704, |
|
"learning_rate": 0.0003, |
|
"loss": 1.142, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.780718336483932, |
|
"grad_norm": 0.789070725440979, |
|
"learning_rate": 0.0003, |
|
"loss": 1.1661, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.9697542533081287, |
|
"grad_norm": 0.7567113637924194, |
|
"learning_rate": 0.0003, |
|
"loss": 1.1817, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5718461538461539, |
|
"eval_loss": 1.8587489128112793, |
|
"eval_runtime": 5.7528, |
|
"eval_samples_per_second": 86.914, |
|
"eval_steps_per_second": 10.951, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 4.158790170132325, |
|
"grad_norm": 0.7041072249412537, |
|
"learning_rate": 0.0003, |
|
"loss": 0.9335, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 0.7739288210868835, |
|
"learning_rate": 0.0003, |
|
"loss": 0.9257, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.536862003780718, |
|
"grad_norm": 0.9699936509132385, |
|
"learning_rate": 0.0003, |
|
"loss": 0.9234, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.725897920604915, |
|
"grad_norm": 0.8464515209197998, |
|
"learning_rate": 0.0003, |
|
"loss": 0.9547, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.914933837429111, |
|
"grad_norm": 0.7238239049911499, |
|
"learning_rate": 0.0003, |
|
"loss": 0.9674, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.5712820512820512, |
|
"eval_loss": 1.931915283203125, |
|
"eval_runtime": 5.7956, |
|
"eval_samples_per_second": 86.272, |
|
"eval_steps_per_second": 10.87, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.103969754253308, |
|
"grad_norm": 0.8031799793243408, |
|
"learning_rate": 0.0003, |
|
"loss": 0.8353, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.293005671077505, |
|
"grad_norm": 0.862354576587677, |
|
"learning_rate": 0.0003, |
|
"loss": 0.731, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.482041587901701, |
|
"grad_norm": 0.9067243337631226, |
|
"learning_rate": 0.0003, |
|
"loss": 0.7515, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.671077504725898, |
|
"grad_norm": 0.9791676998138428, |
|
"learning_rate": 0.0003, |
|
"loss": 0.7769, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.8601134215500945, |
|
"grad_norm": 0.9806828498840332, |
|
"learning_rate": 0.0003, |
|
"loss": 0.7936, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.5704102564102564, |
|
"eval_loss": 1.993375301361084, |
|
"eval_runtime": 5.8166, |
|
"eval_samples_per_second": 85.96, |
|
"eval_steps_per_second": 10.831, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 6.049149338374291, |
|
"grad_norm": 0.8496273159980774, |
|
"learning_rate": 0.0003, |
|
"loss": 0.749, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.238185255198488, |
|
"grad_norm": 0.8800780177116394, |
|
"learning_rate": 0.0003, |
|
"loss": 0.5822, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.427221172022684, |
|
"grad_norm": 0.8136658668518066, |
|
"learning_rate": 0.0003, |
|
"loss": 0.6114, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.616257088846881, |
|
"grad_norm": 1.0112674236297607, |
|
"learning_rate": 0.0003, |
|
"loss": 0.6329, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.805293005671078, |
|
"grad_norm": 0.8546850681304932, |
|
"learning_rate": 0.0003, |
|
"loss": 0.6499, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.994328922495274, |
|
"grad_norm": 0.947127640247345, |
|
"learning_rate": 0.0003, |
|
"loss": 0.67, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.5683589743589743, |
|
"eval_loss": 2.046682357788086, |
|
"eval_runtime": 5.9678, |
|
"eval_samples_per_second": 83.784, |
|
"eval_steps_per_second": 10.557, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 7.183364839319471, |
|
"grad_norm": 1.0099776983261108, |
|
"learning_rate": 0.0003, |
|
"loss": 0.477, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.3724007561436675, |
|
"grad_norm": 0.8788864016532898, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4951, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.561436672967864, |
|
"grad_norm": 0.9243162274360657, |
|
"learning_rate": 0.0003, |
|
"loss": 0.5141, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.750472589792061, |
|
"grad_norm": 1.0089187622070312, |
|
"learning_rate": 0.0003, |
|
"loss": 0.5317, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 7.939508506616257, |
|
"grad_norm": 0.9674586057662964, |
|
"learning_rate": 0.0003, |
|
"loss": 0.5604, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.5692820512820512, |
|
"eval_loss": 2.121832847595215, |
|
"eval_runtime": 5.6926, |
|
"eval_samples_per_second": 87.833, |
|
"eval_steps_per_second": 11.067, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 8.128544423440454, |
|
"grad_norm": 0.8500985503196716, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4532, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.31758034026465, |
|
"grad_norm": 0.8404316902160645, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4173, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 8.506616257088847, |
|
"grad_norm": 0.9724924564361572, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4386, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"grad_norm": 0.9974843859672546, |
|
"learning_rate": 0.0003, |
|
"loss": 0.453, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 8.88468809073724, |
|
"grad_norm": 0.9835983514785767, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4747, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.5682051282051283, |
|
"eval_loss": 2.1342432498931885, |
|
"eval_runtime": 5.6644, |
|
"eval_samples_per_second": 88.271, |
|
"eval_steps_per_second": 11.122, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 9.073724007561436, |
|
"grad_norm": 0.7832907438278198, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4297, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.262759924385634, |
|
"grad_norm": 0.9495198130607605, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3607, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 9.45179584120983, |
|
"grad_norm": 0.8801999688148499, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3836, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.640831758034027, |
|
"grad_norm": 0.9312089681625366, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4013, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 9.829867674858223, |
|
"grad_norm": 1.0668907165527344, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4191, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.5673846153846154, |
|
"eval_loss": 2.1679139137268066, |
|
"eval_runtime": 5.6062, |
|
"eval_samples_per_second": 89.186, |
|
"eval_steps_per_second": 11.237, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 10.01890359168242, |
|
"grad_norm": 0.8078038096427917, |
|
"learning_rate": 0.0003, |
|
"loss": 0.42, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 10.207939508506616, |
|
"grad_norm": 0.9549182653427124, |
|
"learning_rate": 0.0003, |
|
"loss": 0.329, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 10.396975425330814, |
|
"grad_norm": 1.0073105096817017, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3521, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 10.58601134215501, |
|
"grad_norm": 1.0606422424316406, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3671, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 10.775047258979207, |
|
"grad_norm": 0.8719451427459717, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3818, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 10.964083175803403, |
|
"grad_norm": 1.0333566665649414, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3971, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.5657948717948718, |
|
"eval_loss": 2.2080512046813965, |
|
"eval_runtime": 5.6645, |
|
"eval_samples_per_second": 88.27, |
|
"eval_steps_per_second": 11.122, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 11.1531190926276, |
|
"grad_norm": 0.84638911485672, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3323, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 11.342155009451796, |
|
"grad_norm": 0.8573931455612183, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3304, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 11.531190926275993, |
|
"grad_norm": 1.0611106157302856, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3432, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 11.720226843100189, |
|
"grad_norm": 0.8618783950805664, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3601, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 11.909262759924385, |
|
"grad_norm": 1.0319652557373047, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3753, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.5663589743589743, |
|
"eval_loss": 2.184002637863159, |
|
"eval_runtime": 5.7032, |
|
"eval_samples_per_second": 87.67, |
|
"eval_steps_per_second": 11.046, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 12.098298676748582, |
|
"grad_norm": 0.9860585331916809, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3454, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 12.287334593572778, |
|
"grad_norm": 0.8168879747390747, |
|
"learning_rate": 0.0003, |
|
"loss": 0.317, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 12.476370510396976, |
|
"grad_norm": 0.8994132876396179, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3286, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 12.665406427221171, |
|
"grad_norm": 0.9378811120986938, |
|
"learning_rate": 0.0003, |
|
"loss": 0.34, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 12.854442344045369, |
|
"grad_norm": 0.8742515444755554, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3571, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.5633846153846154, |
|
"eval_loss": 2.232431411743164, |
|
"eval_runtime": 5.6334, |
|
"eval_samples_per_second": 88.756, |
|
"eval_steps_per_second": 11.183, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 13.043478260869565, |
|
"grad_norm": 0.8000278472900391, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3518, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 13.232514177693762, |
|
"grad_norm": 0.7633320689201355, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3046, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 13.421550094517958, |
|
"grad_norm": 0.9149838089942932, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3217, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 13.610586011342155, |
|
"grad_norm": 0.9108119010925293, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3315, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 13.799621928166351, |
|
"grad_norm": 0.8181828260421753, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3454, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 13.988657844990549, |
|
"grad_norm": 0.9064919352531433, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3526, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.5631794871794872, |
|
"eval_loss": 2.2189676761627197, |
|
"eval_runtime": 5.8178, |
|
"eval_samples_per_second": 85.943, |
|
"eval_steps_per_second": 10.829, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 14.177693761814744, |
|
"grad_norm": 0.8295121788978577, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3011, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 14.366729678638942, |
|
"grad_norm": 0.78739994764328, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3091, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 14.555765595463138, |
|
"grad_norm": 0.9230947494506836, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3178, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 14.744801512287335, |
|
"grad_norm": 0.8687440752983093, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3298, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 14.93383742911153, |
|
"grad_norm": 0.7883646488189697, |
|
"learning_rate": 0.0003, |
|
"loss": 0.35, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.5638974358974359, |
|
"eval_loss": 2.208575487136841, |
|
"eval_runtime": 5.8338, |
|
"eval_samples_per_second": 85.707, |
|
"eval_steps_per_second": 10.799, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 15.122873345935728, |
|
"grad_norm": 0.8580202460289001, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3089, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 15.311909262759924, |
|
"grad_norm": 1.0185339450836182, |
|
"learning_rate": 0.0003, |
|
"loss": 0.2966, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 15.500945179584122, |
|
"grad_norm": 1.0962390899658203, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3135, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 15.689981096408317, |
|
"grad_norm": 0.9543150663375854, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3258, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 15.879017013232515, |
|
"grad_norm": 0.9073179364204407, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3323, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.5653846153846154, |
|
"eval_loss": 2.2654531002044678, |
|
"eval_runtime": 5.7427, |
|
"eval_samples_per_second": 87.067, |
|
"eval_steps_per_second": 10.97, |
|
"step": 8464 |
|
}, |
|
{ |
|
"epoch": 16.068052930056712, |
|
"grad_norm": 0.9761775135993958, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3298, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 16.257088846880908, |
|
"grad_norm": 0.8946503400802612, |
|
"learning_rate": 0.0003, |
|
"loss": 0.2945, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 16.446124763705104, |
|
"grad_norm": 0.8116426467895508, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3051, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 16.6351606805293, |
|
"grad_norm": 0.8611814379692078, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3145, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 16.8241965973535, |
|
"grad_norm": 0.9202536940574646, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3281, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.5667179487179487, |
|
"eval_loss": 2.244356870651245, |
|
"eval_runtime": 5.9955, |
|
"eval_samples_per_second": 83.396, |
|
"eval_steps_per_second": 10.508, |
|
"step": 8993 |
|
}, |
|
{ |
|
"epoch": 17.013232514177695, |
|
"grad_norm": 0.7778891324996948, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3344, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 17.20226843100189, |
|
"grad_norm": 0.7352868914604187, |
|
"learning_rate": 0.0003, |
|
"loss": 0.2843, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 17.391304347826086, |
|
"grad_norm": 0.7922298908233643, |
|
"learning_rate": 0.0003, |
|
"loss": 0.2941, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 17.58034026465028, |
|
"grad_norm": 1.0191751718521118, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3164, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 17.76937618147448, |
|
"grad_norm": 1.0436335802078247, |
|
"learning_rate": 0.0003, |
|
"loss": 0.321, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 17.958412098298677, |
|
"grad_norm": 0.8792089223861694, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3328, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.5626153846153846, |
|
"eval_loss": 2.2596943378448486, |
|
"eval_runtime": 6.0856, |
|
"eval_samples_per_second": 82.161, |
|
"eval_steps_per_second": 10.352, |
|
"step": 9522 |
|
}, |
|
{ |
|
"epoch": 18.147448015122873, |
|
"grad_norm": 0.6146367788314819, |
|
"learning_rate": 0.0003, |
|
"loss": 0.2967, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 18.33648393194707, |
|
"grad_norm": 0.8163793683052063, |
|
"learning_rate": 0.0003, |
|
"loss": 0.2958, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 18.525519848771268, |
|
"grad_norm": 0.8955701589584351, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3065, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 18.714555765595463, |
|
"grad_norm": 0.7934162020683289, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3121, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 18.90359168241966, |
|
"grad_norm": 1.07945716381073, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3305, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.5633333333333334, |
|
"eval_loss": 2.2682220935821533, |
|
"eval_runtime": 5.8794, |
|
"eval_samples_per_second": 85.042, |
|
"eval_steps_per_second": 10.715, |
|
"step": 10051 |
|
}, |
|
{ |
|
"epoch": 19.092627599243855, |
|
"grad_norm": 0.694956362247467, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3069, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 19.281663516068054, |
|
"grad_norm": 0.8144003748893738, |
|
"learning_rate": 0.0003, |
|
"loss": 0.2879, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 19.47069943289225, |
|
"grad_norm": 0.7943779826164246, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3002, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 19.659735349716446, |
|
"grad_norm": 0.7781754732131958, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3107, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 19.84877126654064, |
|
"grad_norm": 0.7696000933647156, |
|
"learning_rate": 0.0003, |
|
"loss": 0.3228, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.5640512820512821, |
|
"eval_loss": 2.253227472305298, |
|
"eval_runtime": 5.7574, |
|
"eval_samples_per_second": 86.845, |
|
"eval_steps_per_second": 10.943, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 10580, |
|
"total_flos": 6.517631969856061e+17, |
|
"train_loss": 0.6476908660340625, |
|
"train_runtime": 24281.4349, |
|
"train_samples_per_second": 13.941, |
|
"train_steps_per_second": 0.436 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10580, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 6.517631969856061e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|