|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 10580, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1890359168241966, |
|
"grad_norm": 0.45108333230018616, |
|
"learning_rate": 0.0005, |
|
"loss": 1.9159, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3780718336483932, |
|
"grad_norm": 0.6339304447174072, |
|
"learning_rate": 0.0005, |
|
"loss": 1.8756, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5671077504725898, |
|
"grad_norm": 0.5402090549468994, |
|
"learning_rate": 0.0005, |
|
"loss": 1.8511, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7561436672967864, |
|
"grad_norm": 0.43492260575294495, |
|
"learning_rate": 0.0005, |
|
"loss": 1.8581, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.945179584120983, |
|
"grad_norm": 0.5317010879516602, |
|
"learning_rate": 0.0005, |
|
"loss": 1.8583, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5726153846153846, |
|
"eval_loss": 1.6375669240951538, |
|
"eval_runtime": 5.9206, |
|
"eval_samples_per_second": 84.451, |
|
"eval_steps_per_second": 10.641, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.1342155009451795, |
|
"grad_norm": 0.5848847031593323, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6655, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3232514177693762, |
|
"grad_norm": 0.642599880695343, |
|
"learning_rate": 0.0005, |
|
"loss": 1.581, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5122873345935728, |
|
"grad_norm": 0.5563580393791199, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5945, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7013232514177694, |
|
"grad_norm": 0.576246976852417, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5962, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8903591682419658, |
|
"grad_norm": 0.6526830196380615, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6329, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5712820512820512, |
|
"eval_loss": 1.6881264448165894, |
|
"eval_runtime": 6.002, |
|
"eval_samples_per_second": 83.305, |
|
"eval_steps_per_second": 10.496, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.0793950850661624, |
|
"grad_norm": 0.614975094795227, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4729, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.268431001890359, |
|
"grad_norm": 0.7056101560592651, |
|
"learning_rate": 0.0005, |
|
"loss": 1.2845, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.4574669187145557, |
|
"grad_norm": 0.8371734619140625, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3214, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.6465028355387523, |
|
"grad_norm": 0.8234328627586365, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3326, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.835538752362949, |
|
"grad_norm": 0.7623058557510376, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3464, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.5663076923076923, |
|
"eval_loss": 1.8256189823150635, |
|
"eval_runtime": 5.9355, |
|
"eval_samples_per_second": 84.239, |
|
"eval_steps_per_second": 10.614, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 3.0245746691871456, |
|
"grad_norm": 0.8697965145111084, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3199, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.213610586011342, |
|
"grad_norm": 0.7469469308853149, |
|
"learning_rate": 0.0005, |
|
"loss": 1.0191, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.402646502835539, |
|
"grad_norm": 0.8800496459007263, |
|
"learning_rate": 0.0005, |
|
"loss": 1.0606, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.5916824196597354, |
|
"grad_norm": 0.7127139568328857, |
|
"learning_rate": 0.0005, |
|
"loss": 1.0988, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.780718336483932, |
|
"grad_norm": 0.9053574204444885, |
|
"learning_rate": 0.0005, |
|
"loss": 1.1355, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.9697542533081287, |
|
"grad_norm": 0.9217483997344971, |
|
"learning_rate": 0.0005, |
|
"loss": 1.1624, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5651794871794872, |
|
"eval_loss": 1.9222664833068848, |
|
"eval_runtime": 5.7781, |
|
"eval_samples_per_second": 86.533, |
|
"eval_steps_per_second": 10.903, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 4.158790170132325, |
|
"grad_norm": 0.8502592444419861, |
|
"learning_rate": 0.0005, |
|
"loss": 0.8795, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 0.8051011562347412, |
|
"learning_rate": 0.0005, |
|
"loss": 0.8713, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.536862003780718, |
|
"grad_norm": 1.0111184120178223, |
|
"learning_rate": 0.0005, |
|
"loss": 0.8974, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.725897920604915, |
|
"grad_norm": 0.9804710745811462, |
|
"learning_rate": 0.0005, |
|
"loss": 0.9396, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.914933837429111, |
|
"grad_norm": 1.1629979610443115, |
|
"learning_rate": 0.0005, |
|
"loss": 0.964, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.5642564102564103, |
|
"eval_loss": 1.9719713926315308, |
|
"eval_runtime": 6.1201, |
|
"eval_samples_per_second": 81.698, |
|
"eval_steps_per_second": 10.294, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.103969754253308, |
|
"grad_norm": 0.8145145177841187, |
|
"learning_rate": 0.0005, |
|
"loss": 0.8208, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.293005671077505, |
|
"grad_norm": 0.7975667715072632, |
|
"learning_rate": 0.0005, |
|
"loss": 0.7022, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.482041587901701, |
|
"grad_norm": 0.8971696496009827, |
|
"learning_rate": 0.0005, |
|
"loss": 0.7435, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.671077504725898, |
|
"grad_norm": 1.0234569311141968, |
|
"learning_rate": 0.0005, |
|
"loss": 0.7798, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.8601134215500945, |
|
"grad_norm": 1.0058211088180542, |
|
"learning_rate": 0.0005, |
|
"loss": 0.8117, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.5647179487179487, |
|
"eval_loss": 2.001645565032959, |
|
"eval_runtime": 6.1282, |
|
"eval_samples_per_second": 81.59, |
|
"eval_steps_per_second": 10.28, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 6.049149338374291, |
|
"grad_norm": 0.9231036901473999, |
|
"learning_rate": 0.0005, |
|
"loss": 0.7673, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.238185255198488, |
|
"grad_norm": 0.9058907628059387, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5828, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.427221172022684, |
|
"grad_norm": 0.8709693551063538, |
|
"learning_rate": 0.0005, |
|
"loss": 0.6272, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.616257088846881, |
|
"grad_norm": 0.9873590469360352, |
|
"learning_rate": 0.0005, |
|
"loss": 0.6599, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.805293005671078, |
|
"grad_norm": 0.9257177114486694, |
|
"learning_rate": 0.0005, |
|
"loss": 0.6918, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.994328922495274, |
|
"grad_norm": 1.0284956693649292, |
|
"learning_rate": 0.0005, |
|
"loss": 0.7242, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.5638974358974359, |
|
"eval_loss": 2.0784664154052734, |
|
"eval_runtime": 6.1613, |
|
"eval_samples_per_second": 81.152, |
|
"eval_steps_per_second": 10.225, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 7.183364839319471, |
|
"grad_norm": 1.0036342144012451, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5099, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.3724007561436675, |
|
"grad_norm": 0.9070968627929688, |
|
"learning_rate": 0.0005, |
|
"loss": 0.541, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.561436672967864, |
|
"grad_norm": 1.035317063331604, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5681, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.750472589792061, |
|
"grad_norm": 1.0337880849838257, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5991, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 7.939508506616257, |
|
"grad_norm": 0.9707123637199402, |
|
"learning_rate": 0.0005, |
|
"loss": 0.6381, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.5644615384615385, |
|
"eval_loss": 2.0954012870788574, |
|
"eval_runtime": 5.9732, |
|
"eval_samples_per_second": 83.708, |
|
"eval_steps_per_second": 10.547, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 8.128544423440454, |
|
"grad_norm": 0.9024428129196167, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5113, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.31758034026465, |
|
"grad_norm": 0.8392314314842224, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4752, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 8.506616257088847, |
|
"grad_norm": 0.9310020208358765, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5088, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"grad_norm": 1.115989327430725, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5358, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 8.88468809073724, |
|
"grad_norm": 1.1282142400741577, |
|
"learning_rate": 0.0005, |
|
"loss": 0.573, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.5623076923076923, |
|
"eval_loss": 2.106734275817871, |
|
"eval_runtime": 5.9846, |
|
"eval_samples_per_second": 83.548, |
|
"eval_steps_per_second": 10.527, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 9.073724007561436, |
|
"grad_norm": 0.7718724012374878, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5165, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.262759924385634, |
|
"grad_norm": 1.0482347011566162, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4291, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 9.45179584120983, |
|
"grad_norm": 1.1258679628372192, |
|
"learning_rate": 0.0005, |
|
"loss": 0.467, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.640831758034027, |
|
"grad_norm": 1.0340867042541504, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4992, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 9.829867674858223, |
|
"grad_norm": 0.9999015927314758, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5269, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.5646153846153846, |
|
"eval_loss": 2.1355865001678467, |
|
"eval_runtime": 5.6591, |
|
"eval_samples_per_second": 88.353, |
|
"eval_steps_per_second": 11.132, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 10.01890359168242, |
|
"grad_norm": 0.8213431239128113, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5325, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 10.207939508506616, |
|
"grad_norm": 1.023992896080017, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3966, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 10.396975425330814, |
|
"grad_norm": 0.8211169242858887, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4307, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 10.58601134215501, |
|
"grad_norm": 0.9888399839401245, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4604, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 10.775047258979207, |
|
"grad_norm": 0.9565384387969971, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4888, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 10.964083175803403, |
|
"grad_norm": 1.0463844537734985, |
|
"learning_rate": 0.0005, |
|
"loss": 0.5144, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.5616410256410257, |
|
"eval_loss": 2.195107936859131, |
|
"eval_runtime": 5.722, |
|
"eval_samples_per_second": 87.382, |
|
"eval_steps_per_second": 11.01, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 11.1531190926276, |
|
"grad_norm": 0.9159551858901978, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4094, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 11.342155009451796, |
|
"grad_norm": 0.9027566909790039, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4083, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 11.531190926275993, |
|
"grad_norm": 0.9325262904167175, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4339, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 11.720226843100189, |
|
"grad_norm": 1.0173691511154175, |
|
"learning_rate": 0.0005, |
|
"loss": 0.464, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 11.909262759924385, |
|
"grad_norm": 0.9742277264595032, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4887, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.5631282051282052, |
|
"eval_loss": 2.1778857707977295, |
|
"eval_runtime": 6.0538, |
|
"eval_samples_per_second": 82.593, |
|
"eval_steps_per_second": 10.407, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 12.098298676748582, |
|
"grad_norm": 1.0347404479980469, |
|
"learning_rate": 0.0005, |
|
"loss": 0.438, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 12.287334593572778, |
|
"grad_norm": 0.8733549118041992, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3893, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 12.476370510396976, |
|
"grad_norm": 0.8693263530731201, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4142, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 12.665406427221171, |
|
"grad_norm": 1.1597702503204346, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4379, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 12.854442344045369, |
|
"grad_norm": 1.018873929977417, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4636, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.561076923076923, |
|
"eval_loss": 2.175729274749756, |
|
"eval_runtime": 5.683, |
|
"eval_samples_per_second": 87.981, |
|
"eval_steps_per_second": 11.086, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 13.043478260869565, |
|
"grad_norm": 0.8772552013397217, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4552, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 13.232514177693762, |
|
"grad_norm": 0.8089916110038757, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3698, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 13.421550094517958, |
|
"grad_norm": 1.0659642219543457, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3971, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 13.610586011342155, |
|
"grad_norm": 1.2195113897323608, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4213, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 13.799621928166351, |
|
"grad_norm": 1.1026618480682373, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4487, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 13.988657844990549, |
|
"grad_norm": 1.120952844619751, |
|
"learning_rate": 0.0005, |
|
"loss": 0.467, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.5624102564102564, |
|
"eval_loss": 2.1781177520751953, |
|
"eval_runtime": 5.7664, |
|
"eval_samples_per_second": 86.709, |
|
"eval_steps_per_second": 10.925, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 14.177693761814744, |
|
"grad_norm": 0.9397806525230408, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3685, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 14.366729678638942, |
|
"grad_norm": 0.8353122472763062, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3828, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 14.555765595463138, |
|
"grad_norm": 1.0425219535827637, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4026, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 14.744801512287335, |
|
"grad_norm": 1.0660319328308105, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4287, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 14.93383742911153, |
|
"grad_norm": 1.131705403327942, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4613, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.5611794871794872, |
|
"eval_loss": 2.231194019317627, |
|
"eval_runtime": 5.8033, |
|
"eval_samples_per_second": 86.158, |
|
"eval_steps_per_second": 10.856, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 15.122873345935728, |
|
"grad_norm": 0.8532615303993225, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3889, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 15.311909262759924, |
|
"grad_norm": 0.9682427048683167, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3675, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 15.500945179584122, |
|
"grad_norm": 0.9799471497535706, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3968, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 15.689981096408317, |
|
"grad_norm": 0.9278863072395325, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4198, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 15.879017013232515, |
|
"grad_norm": 1.0345618724822998, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4405, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.562923076923077, |
|
"eval_loss": 2.1799755096435547, |
|
"eval_runtime": 5.8871, |
|
"eval_samples_per_second": 84.931, |
|
"eval_steps_per_second": 10.701, |
|
"step": 8464 |
|
}, |
|
{ |
|
"epoch": 16.068052930056712, |
|
"grad_norm": 0.9081653356552124, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4241, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 16.257088846880908, |
|
"grad_norm": 0.8579433560371399, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3634, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 16.446124763705104, |
|
"grad_norm": 1.042361855506897, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3837, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 16.6351606805293, |
|
"grad_norm": 1.032463550567627, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4071, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 16.8241965973535, |
|
"grad_norm": 1.0003693103790283, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4308, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.5627692307692308, |
|
"eval_loss": 2.1960062980651855, |
|
"eval_runtime": 6.0711, |
|
"eval_samples_per_second": 82.358, |
|
"eval_steps_per_second": 10.377, |
|
"step": 8993 |
|
}, |
|
{ |
|
"epoch": 17.013232514177695, |
|
"grad_norm": 0.793249249458313, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4471, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 17.20226843100189, |
|
"grad_norm": 0.9027940630912781, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3499, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 17.391304347826086, |
|
"grad_norm": 0.9174290299415588, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3698, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 17.58034026465028, |
|
"grad_norm": 1.135392665863037, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4027, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 17.76937618147448, |
|
"grad_norm": 0.9852614998817444, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4179, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 17.958412098298677, |
|
"grad_norm": 0.9487043619155884, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4401, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.560974358974359, |
|
"eval_loss": 2.2354702949523926, |
|
"eval_runtime": 5.7571, |
|
"eval_samples_per_second": 86.85, |
|
"eval_steps_per_second": 10.943, |
|
"step": 9522 |
|
}, |
|
{ |
|
"epoch": 18.147448015122873, |
|
"grad_norm": 0.7868739366531372, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3637, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 18.33648393194707, |
|
"grad_norm": 0.962329626083374, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3642, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 18.525519848771268, |
|
"grad_norm": 0.9594590663909912, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3865, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 18.714555765595463, |
|
"grad_norm": 1.0574795007705688, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4081, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 18.90359168241966, |
|
"grad_norm": 1.0566685199737549, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4334, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.5608205128205128, |
|
"eval_loss": 2.2380332946777344, |
|
"eval_runtime": 5.7808, |
|
"eval_samples_per_second": 86.493, |
|
"eval_steps_per_second": 10.898, |
|
"step": 10051 |
|
}, |
|
{ |
|
"epoch": 19.092627599243855, |
|
"grad_norm": 0.6632242798805237, |
|
"learning_rate": 0.0005, |
|
"loss": 0.393, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 19.281663516068054, |
|
"grad_norm": 0.8786609768867493, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3528, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 19.47069943289225, |
|
"grad_norm": 1.0305085182189941, |
|
"learning_rate": 0.0005, |
|
"loss": 0.3752, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 19.659735349716446, |
|
"grad_norm": 0.9528347849845886, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4023, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 19.84877126654064, |
|
"grad_norm": 0.9051763415336609, |
|
"learning_rate": 0.0005, |
|
"loss": 0.4218, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.560974358974359, |
|
"eval_loss": 2.2417104244232178, |
|
"eval_runtime": 5.7479, |
|
"eval_samples_per_second": 86.989, |
|
"eval_steps_per_second": 10.961, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 10580, |
|
"total_flos": 6.517631969856061e+17, |
|
"train_loss": 0.6977483630405708, |
|
"train_runtime": 24342.3236, |
|
"train_samples_per_second": 13.906, |
|
"train_steps_per_second": 0.435 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10580, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 6.517631969856061e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|