|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 10580, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1890359168241966, |
|
"grad_norm": 0.24623066186904907, |
|
"learning_rate": 3e-05, |
|
"loss": 2.0175, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3780718336483932, |
|
"grad_norm": 0.3154822587966919, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9025, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5671077504725898, |
|
"grad_norm": 0.2886754274368286, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8754, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7561436672967864, |
|
"grad_norm": 0.33380141854286194, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8795, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.945179584120983, |
|
"grad_norm": 0.3052491247653961, |
|
"learning_rate": 3e-05, |
|
"loss": 1.86, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.605677130044843, |
|
"eval_loss": 1.6920065879821777, |
|
"eval_runtime": 6.0828, |
|
"eval_samples_per_second": 82.199, |
|
"eval_steps_per_second": 10.357, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.1342155009451795, |
|
"grad_norm": 0.38703393936157227, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8651, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3232514177693762, |
|
"grad_norm": 0.5257138609886169, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8238, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5122873345935728, |
|
"grad_norm": 0.5151342749595642, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8252, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7013232514177694, |
|
"grad_norm": 0.4788012206554413, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8178, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8903591682419658, |
|
"grad_norm": 0.506370484828949, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8271, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6118744394618834, |
|
"eval_loss": 1.6425946950912476, |
|
"eval_runtime": 6.042, |
|
"eval_samples_per_second": 82.754, |
|
"eval_steps_per_second": 10.427, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 2.0793950850661624, |
|
"grad_norm": 0.537693440914154, |
|
"learning_rate": 3e-05, |
|
"loss": 1.795, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.268431001890359, |
|
"grad_norm": 0.6806929111480713, |
|
"learning_rate": 3e-05, |
|
"loss": 1.773, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.4574669187145557, |
|
"grad_norm": 0.7064961194992065, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7609, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.6465028355387523, |
|
"grad_norm": 0.7571425437927246, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7536, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.835538752362949, |
|
"grad_norm": 0.8069006204605103, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7324, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.6163408071748879, |
|
"eval_loss": 1.59978187084198, |
|
"eval_runtime": 6.4372, |
|
"eval_samples_per_second": 77.673, |
|
"eval_steps_per_second": 9.787, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 3.0245746691871456, |
|
"grad_norm": 0.7909080982208252, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7206, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.213610586011342, |
|
"grad_norm": 0.8461490869522095, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6696, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.402646502835539, |
|
"grad_norm": 0.9591927528381348, |
|
"learning_rate": 3e-05, |
|
"loss": 1.672, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.5916824196597354, |
|
"grad_norm": 1.0703200101852417, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6712, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.780718336483932, |
|
"grad_norm": 0.9409723281860352, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6877, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.9697542533081287, |
|
"grad_norm": 0.9087811708450317, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6818, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.621982062780269, |
|
"eval_loss": 1.5579897165298462, |
|
"eval_runtime": 6.0535, |
|
"eval_samples_per_second": 82.597, |
|
"eval_steps_per_second": 10.407, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 4.158790170132325, |
|
"grad_norm": 1.0507543087005615, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6195, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 1.1318222284317017, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6039, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.536862003780718, |
|
"grad_norm": 1.1028773784637451, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5924, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.725897920604915, |
|
"grad_norm": 1.1365679502487183, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5903, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.914933837429111, |
|
"grad_norm": 1.0776621103286743, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5864, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.6278206278026905, |
|
"eval_loss": 1.5211377143859863, |
|
"eval_runtime": 6.0103, |
|
"eval_samples_per_second": 83.19, |
|
"eval_steps_per_second": 10.482, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.103969754253308, |
|
"grad_norm": 1.1829136610031128, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5478, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.293005671077505, |
|
"grad_norm": 1.1871124505996704, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5151, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.482041587901701, |
|
"grad_norm": 1.2089450359344482, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5159, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.671077504725898, |
|
"grad_norm": 1.3170595169067383, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5241, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.8601134215500945, |
|
"grad_norm": 1.439405083656311, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5204, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.6326726457399103, |
|
"eval_loss": 1.4863040447235107, |
|
"eval_runtime": 6.0604, |
|
"eval_samples_per_second": 82.502, |
|
"eval_steps_per_second": 10.395, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 6.049149338374291, |
|
"grad_norm": 1.3573708534240723, |
|
"learning_rate": 3e-05, |
|
"loss": 1.506, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.238185255198488, |
|
"grad_norm": 1.3852615356445312, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4401, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.427221172022684, |
|
"grad_norm": 1.391204595565796, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4469, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.616257088846881, |
|
"grad_norm": 1.5318756103515625, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4608, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.805293005671078, |
|
"grad_norm": 1.4233124256134033, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4419, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.994328922495274, |
|
"grad_norm": 1.4444003105163574, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4481, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.6371928251121076, |
|
"eval_loss": 1.4516844749450684, |
|
"eval_runtime": 6.0218, |
|
"eval_samples_per_second": 83.032, |
|
"eval_steps_per_second": 10.462, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 7.183364839319471, |
|
"grad_norm": 1.607857346534729, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3742, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.3724007561436675, |
|
"grad_norm": 1.5741080045700073, |
|
"learning_rate": 3e-05, |
|
"loss": 1.383, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.561436672967864, |
|
"grad_norm": 1.6515594720840454, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3734, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.750472589792061, |
|
"grad_norm": 1.704078197479248, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3527, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 7.939508506616257, |
|
"grad_norm": 1.7636491060256958, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3768, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.6429058295964125, |
|
"eval_loss": 1.4120872020721436, |
|
"eval_runtime": 6.7429, |
|
"eval_samples_per_second": 74.152, |
|
"eval_steps_per_second": 9.343, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 8.128544423440454, |
|
"grad_norm": 1.691927433013916, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3254, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.31758034026465, |
|
"grad_norm": 1.695786476135254, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2819, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 8.506616257088847, |
|
"grad_norm": 1.8343569040298462, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3225, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"grad_norm": 1.678654670715332, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2961, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 8.88468809073724, |
|
"grad_norm": 2.0212690830230713, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2946, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.6481973094170403, |
|
"eval_loss": 1.3739463090896606, |
|
"eval_runtime": 6.0696, |
|
"eval_samples_per_second": 82.377, |
|
"eval_steps_per_second": 10.38, |
|
"step": 4761 |
|
}, |
|
{ |
|
"epoch": 9.073724007561436, |
|
"grad_norm": 1.8944847583770752, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2831, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.262759924385634, |
|
"grad_norm": 1.9275157451629639, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2022, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 9.45179584120983, |
|
"grad_norm": 1.927822470664978, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2259, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.640831758034027, |
|
"grad_norm": 1.8706145286560059, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2231, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 9.829867674858223, |
|
"grad_norm": 1.858546257019043, |
|
"learning_rate": 3e-05, |
|
"loss": 1.243, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.653237668161435, |
|
"eval_loss": 1.3363806009292603, |
|
"eval_runtime": 6.6274, |
|
"eval_samples_per_second": 75.444, |
|
"eval_steps_per_second": 9.506, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 10.01890359168242, |
|
"grad_norm": 2.2799806594848633, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2223, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 10.207939508506616, |
|
"grad_norm": 2.0888822078704834, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1567, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 10.396975425330814, |
|
"grad_norm": 2.1389222145080566, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1421, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 10.58601134215501, |
|
"grad_norm": 2.253117322921753, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1525, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 10.775047258979207, |
|
"grad_norm": 2.3171472549438477, |
|
"learning_rate": 3e-05, |
|
"loss": 1.168, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 10.964083175803403, |
|
"grad_norm": 2.1451621055603027, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1425, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.6593991031390134, |
|
"eval_loss": 1.2968207597732544, |
|
"eval_runtime": 5.9969, |
|
"eval_samples_per_second": 83.377, |
|
"eval_steps_per_second": 10.506, |
|
"step": 5819 |
|
}, |
|
{ |
|
"epoch": 11.1531190926276, |
|
"grad_norm": 2.0139310359954834, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0814, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 11.342155009451796, |
|
"grad_norm": 2.2693676948547363, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0731, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 11.531190926275993, |
|
"grad_norm": 2.4319376945495605, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0842, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 11.720226843100189, |
|
"grad_norm": 2.103640079498291, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0839, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 11.909262759924385, |
|
"grad_norm": 2.309508800506592, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0847, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.6652017937219731, |
|
"eval_loss": 1.2538820505142212, |
|
"eval_runtime": 6.0994, |
|
"eval_samples_per_second": 81.975, |
|
"eval_steps_per_second": 10.329, |
|
"step": 6348 |
|
}, |
|
{ |
|
"epoch": 12.098298676748582, |
|
"grad_norm": 2.184951066970825, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0154, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 12.287334593572778, |
|
"grad_norm": 2.6142184734344482, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0129, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 12.476370510396976, |
|
"grad_norm": 2.377812147140503, |
|
"learning_rate": 3e-05, |
|
"loss": 1.01, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 12.665406427221171, |
|
"grad_norm": 2.252695083618164, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0132, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 12.854442344045369, |
|
"grad_norm": 2.4628734588623047, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0152, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.670609865470852, |
|
"eval_loss": 1.216369390487671, |
|
"eval_runtime": 6.115, |
|
"eval_samples_per_second": 81.766, |
|
"eval_steps_per_second": 10.302, |
|
"step": 6877 |
|
}, |
|
{ |
|
"epoch": 13.043478260869565, |
|
"grad_norm": 2.2027320861816406, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0047, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 13.232514177693762, |
|
"grad_norm": 2.1473910808563232, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9308, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 13.421550094517958, |
|
"grad_norm": 2.633467435836792, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9265, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 13.610586011342155, |
|
"grad_norm": 2.5691654682159424, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9485, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 13.799621928166351, |
|
"grad_norm": 2.6028811931610107, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9502, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 13.988657844990549, |
|
"grad_norm": 2.6901166439056396, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9498, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.6757937219730942, |
|
"eval_loss": 1.176974892616272, |
|
"eval_runtime": 6.0622, |
|
"eval_samples_per_second": 82.479, |
|
"eval_steps_per_second": 10.392, |
|
"step": 7406 |
|
}, |
|
{ |
|
"epoch": 14.177693761814744, |
|
"grad_norm": 2.624717950820923, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8708, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 14.366729678638942, |
|
"grad_norm": 2.5228078365325928, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8685, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 14.555765595463138, |
|
"grad_norm": 2.8440725803375244, |
|
"learning_rate": 3e-05, |
|
"loss": 0.88, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 14.744801512287335, |
|
"grad_norm": 2.5001230239868164, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8968, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 14.93383742911153, |
|
"grad_norm": 2.868446111679077, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8652, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.6820717488789237, |
|
"eval_loss": 1.1323232650756836, |
|
"eval_runtime": 6.0411, |
|
"eval_samples_per_second": 82.767, |
|
"eval_steps_per_second": 10.429, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 15.122873345935728, |
|
"grad_norm": 3.236562728881836, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8354, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 15.311909262759924, |
|
"grad_norm": 3.0811691284179688, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8042, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 15.500945179584122, |
|
"grad_norm": 2.6125237941741943, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8097, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 15.689981096408317, |
|
"grad_norm": 2.8925766944885254, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8133, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 15.879017013232515, |
|
"grad_norm": 2.927067518234253, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8265, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.690134529147982, |
|
"eval_loss": 1.0825910568237305, |
|
"eval_runtime": 6.0821, |
|
"eval_samples_per_second": 82.209, |
|
"eval_steps_per_second": 10.358, |
|
"step": 8464 |
|
}, |
|
{ |
|
"epoch": 16.068052930056712, |
|
"grad_norm": 2.8401355743408203, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7933, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 16.257088846880908, |
|
"grad_norm": 3.2201480865478516, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7449, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 16.446124763705104, |
|
"grad_norm": 3.2903711795806885, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7565, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 16.6351606805293, |
|
"grad_norm": 3.153130292892456, |
|
"learning_rate": 3e-05, |
|
"loss": 0.773, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 16.8241965973535, |
|
"grad_norm": 2.8367865085601807, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7432, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.6959910313901345, |
|
"eval_loss": 1.04633367061615, |
|
"eval_runtime": 6.1125, |
|
"eval_samples_per_second": 81.8, |
|
"eval_steps_per_second": 10.307, |
|
"step": 8993 |
|
}, |
|
{ |
|
"epoch": 17.013232514177695, |
|
"grad_norm": 3.268256664276123, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7576, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 17.20226843100189, |
|
"grad_norm": 3.0888004302978516, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6879, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 17.391304347826086, |
|
"grad_norm": 2.908324718475342, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6869, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 17.58034026465028, |
|
"grad_norm": 4.358062267303467, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7027, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 17.76937618147448, |
|
"grad_norm": 3.1521716117858887, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7188, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 17.958412098298677, |
|
"grad_norm": 3.1939432621002197, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7106, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.7021973094170404, |
|
"eval_loss": 1.0097612142562866, |
|
"eval_runtime": 5.6404, |
|
"eval_samples_per_second": 88.646, |
|
"eval_steps_per_second": 11.169, |
|
"step": 9522 |
|
}, |
|
{ |
|
"epoch": 18.147448015122873, |
|
"grad_norm": 3.3058857917785645, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6526, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 18.33648393194707, |
|
"grad_norm": 2.845630645751953, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6375, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 18.525519848771268, |
|
"grad_norm": 3.524501323699951, |
|
"learning_rate": 3e-05, |
|
"loss": 0.65, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 18.714555765595463, |
|
"grad_norm": 3.5869174003601074, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6529, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 18.90359168241966, |
|
"grad_norm": 2.8486292362213135, |
|
"learning_rate": 3e-05, |
|
"loss": 0.669, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.7077937219730942, |
|
"eval_loss": 0.9695693254470825, |
|
"eval_runtime": 6.1993, |
|
"eval_samples_per_second": 80.654, |
|
"eval_steps_per_second": 10.162, |
|
"step": 10051 |
|
}, |
|
{ |
|
"epoch": 19.092627599243855, |
|
"grad_norm": 2.8552756309509277, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6183, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 19.281663516068054, |
|
"grad_norm": 2.949084758758545, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5824, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 19.47069943289225, |
|
"grad_norm": 3.137747049331665, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6093, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 19.659735349716446, |
|
"grad_norm": 2.7827863693237305, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6155, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 19.84877126654064, |
|
"grad_norm": 3.2223191261291504, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6043, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.7134618834080717, |
|
"eval_loss": 0.9358564019203186, |
|
"eval_runtime": 6.1149, |
|
"eval_samples_per_second": 81.767, |
|
"eval_steps_per_second": 10.303, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 10580, |
|
"total_flos": 9.64242245391745e+17, |
|
"train_loss": 1.2100651977192927, |
|
"train_runtime": 22841.5406, |
|
"train_samples_per_second": 14.819, |
|
"train_steps_per_second": 0.463 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10580, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 9.64242245391745e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|