|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 50, |
|
"global_step": 1516, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002638522427440633, |
|
"eval_loss": 0.3697243332862854, |
|
"eval_runtime": 31.4109, |
|
"eval_samples_per_second": 63.672, |
|
"eval_steps_per_second": 0.255, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.052770448548812667, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00010526315789473685, |
|
"loss": 0.3823, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.10554089709762533, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.0001997845988152935, |
|
"loss": 0.2239, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13192612137203166, |
|
"eval_loss": 0.11808302253484726, |
|
"eval_runtime": 29.4538, |
|
"eval_samples_per_second": 67.903, |
|
"eval_steps_per_second": 0.272, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.158311345646438, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.00019763058696822833, |
|
"loss": 0.1799, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21108179419525067, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0001954765751211632, |
|
"loss": 0.1651, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2638522427440633, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.00019332256327409802, |
|
"loss": 0.1571, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2638522427440633, |
|
"eval_loss": 0.09250890463590622, |
|
"eval_runtime": 28.2273, |
|
"eval_samples_per_second": 70.853, |
|
"eval_steps_per_second": 0.283, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.316622691292876, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.00019116855142703286, |
|
"loss": 0.1535, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.36939313984168864, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.00018901453957996772, |
|
"loss": 0.1456, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.39577836411609496, |
|
"eval_loss": 0.08707328885793686, |
|
"eval_runtime": 27.6259, |
|
"eval_samples_per_second": 72.396, |
|
"eval_steps_per_second": 0.29, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.42216358839050133, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.00018686052773290255, |
|
"loss": 0.1402, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.47493403693931396, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0001847065158858374, |
|
"loss": 0.142, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5277044854881267, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 0.00018255250403877222, |
|
"loss": 0.1318, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5277044854881267, |
|
"eval_loss": 0.080934077501297, |
|
"eval_runtime": 27.3743, |
|
"eval_samples_per_second": 73.061, |
|
"eval_steps_per_second": 0.292, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5804749340369393, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.00018039849219170706, |
|
"loss": 0.1301, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.633245382585752, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 0.0001782444803446419, |
|
"loss": 0.1317, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6596306068601583, |
|
"eval_loss": 0.0750429555773735, |
|
"eval_runtime": 27.7505, |
|
"eval_samples_per_second": 72.071, |
|
"eval_steps_per_second": 0.288, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6860158311345647, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.00017609046849757676, |
|
"loss": 0.1269, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7387862796833773, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.0001739364566505116, |
|
"loss": 0.1267, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7915567282321899, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 0.00017178244480344642, |
|
"loss": 0.1226, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7915567282321899, |
|
"eval_loss": 0.07792137563228607, |
|
"eval_runtime": 27.3248, |
|
"eval_samples_per_second": 73.194, |
|
"eval_steps_per_second": 0.293, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8443271767810027, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.00016962843295638126, |
|
"loss": 0.1222, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8970976253298153, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 0.0001674744211093161, |
|
"loss": 0.1254, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9234828496042217, |
|
"eval_loss": 0.07484881579875946, |
|
"eval_runtime": 27.8135, |
|
"eval_samples_per_second": 71.907, |
|
"eval_steps_per_second": 0.288, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9498680738786279, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.00016532040926225093, |
|
"loss": 0.1177, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0026385224274406, |
|
"grad_norm": 0.1220703125, |
|
"learning_rate": 0.0001631663974151858, |
|
"loss": 0.1207, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0554089709762533, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 0.00016101238556812063, |
|
"loss": 0.1046, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0554089709762533, |
|
"eval_loss": 0.0715707540512085, |
|
"eval_runtime": 27.7758, |
|
"eval_samples_per_second": 72.005, |
|
"eval_steps_per_second": 0.288, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.108179419525066, |
|
"grad_norm": 0.1142578125, |
|
"learning_rate": 0.0001588583737210555, |
|
"loss": 0.1041, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1609498680738786, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.00015670436187399032, |
|
"loss": 0.1034, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.187335092348285, |
|
"eval_loss": 0.0693235993385315, |
|
"eval_runtime": 27.7658, |
|
"eval_samples_per_second": 72.031, |
|
"eval_steps_per_second": 0.288, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2137203166226913, |
|
"grad_norm": 0.1630859375, |
|
"learning_rate": 0.00015455035002692516, |
|
"loss": 0.1042, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.266490765171504, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.00015239633817986, |
|
"loss": 0.1032, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.3192612137203166, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 0.00015024232633279485, |
|
"loss": 0.1021, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3192612137203166, |
|
"eval_loss": 0.06579812616109848, |
|
"eval_runtime": 27.42, |
|
"eval_samples_per_second": 72.939, |
|
"eval_steps_per_second": 0.292, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3720316622691293, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.0001480883144857297, |
|
"loss": 0.1041, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.424802110817942, |
|
"grad_norm": 0.11474609375, |
|
"learning_rate": 0.00014593430263866452, |
|
"loss": 0.1006, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.4511873350923483, |
|
"eval_loss": 0.06417644023895264, |
|
"eval_runtime": 27.5371, |
|
"eval_samples_per_second": 72.629, |
|
"eval_steps_per_second": 0.291, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.4775725593667546, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 0.00014378029079159936, |
|
"loss": 0.1001, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.5303430079155673, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 0.0001416262789445342, |
|
"loss": 0.1013, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.58311345646438, |
|
"grad_norm": 0.1591796875, |
|
"learning_rate": 0.00013947226709746903, |
|
"loss": 0.1, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.58311345646438, |
|
"eval_loss": 0.06583409756422043, |
|
"eval_runtime": 28.0223, |
|
"eval_samples_per_second": 71.372, |
|
"eval_steps_per_second": 0.285, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6358839050131926, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.0001373182552504039, |
|
"loss": 0.1021, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.6886543535620053, |
|
"grad_norm": 0.14453125, |
|
"learning_rate": 0.00013516424340333873, |
|
"loss": 0.1002, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.7150395778364116, |
|
"eval_loss": 0.06498919427394867, |
|
"eval_runtime": 28.3581, |
|
"eval_samples_per_second": 70.527, |
|
"eval_steps_per_second": 0.282, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.741424802110818, |
|
"grad_norm": 0.111328125, |
|
"learning_rate": 0.00013301023155627356, |
|
"loss": 0.0967, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.7941952506596306, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.0001308562197092084, |
|
"loss": 0.1004, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.8469656992084431, |
|
"grad_norm": 0.13671875, |
|
"learning_rate": 0.00012870220786214323, |
|
"loss": 0.0992, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8469656992084431, |
|
"eval_loss": 0.06491042673587799, |
|
"eval_runtime": 27.748, |
|
"eval_samples_per_second": 72.077, |
|
"eval_steps_per_second": 0.288, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.899736147757256, |
|
"grad_norm": 0.15234375, |
|
"learning_rate": 0.0001265481960150781, |
|
"loss": 0.0967, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.9525065963060686, |
|
"grad_norm": 0.12451171875, |
|
"learning_rate": 0.00012439418416801293, |
|
"loss": 0.0956, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.978891820580475, |
|
"eval_loss": 0.06425958126783371, |
|
"eval_runtime": 27.654, |
|
"eval_samples_per_second": 72.322, |
|
"eval_steps_per_second": 0.289, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.005277044854881, |
|
"grad_norm": 0.12060546875, |
|
"learning_rate": 0.0001222401723209478, |
|
"loss": 0.0934, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.058047493403694, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 0.00012008616047388261, |
|
"loss": 0.0907, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.1108179419525066, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 0.00011793214862681745, |
|
"loss": 0.0861, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.1108179419525066, |
|
"eval_loss": 0.06223862245678902, |
|
"eval_runtime": 27.4046, |
|
"eval_samples_per_second": 72.981, |
|
"eval_steps_per_second": 0.292, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.163588390501319, |
|
"grad_norm": 0.134765625, |
|
"learning_rate": 0.0001157781367797523, |
|
"loss": 0.0864, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.216358839050132, |
|
"grad_norm": 0.123046875, |
|
"learning_rate": 0.00011362412493268713, |
|
"loss": 0.0842, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.242744063324538, |
|
"eval_loss": 0.060463495552539825, |
|
"eval_runtime": 27.4597, |
|
"eval_samples_per_second": 72.834, |
|
"eval_steps_per_second": 0.291, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.2691292875989446, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 0.00011147011308562199, |
|
"loss": 0.0863, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.321899736147757, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 0.00010931610123855683, |
|
"loss": 0.0858, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.37467018469657, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 0.00010716208939149166, |
|
"loss": 0.0866, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.37467018469657, |
|
"eval_loss": 0.06099672615528107, |
|
"eval_runtime": 27.7635, |
|
"eval_samples_per_second": 72.037, |
|
"eval_steps_per_second": 0.288, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.4274406332453826, |
|
"grad_norm": 0.1376953125, |
|
"learning_rate": 0.0001050080775444265, |
|
"loss": 0.0873, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.480211081794195, |
|
"grad_norm": 0.158203125, |
|
"learning_rate": 0.00010285406569736133, |
|
"loss": 0.0853, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.5065963060686016, |
|
"eval_loss": 0.06115744262933731, |
|
"eval_runtime": 27.8521, |
|
"eval_samples_per_second": 71.808, |
|
"eval_steps_per_second": 0.287, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.532981530343008, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 0.00010070005385029618, |
|
"loss": 0.0849, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.5857519788918206, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 9.854604200323103e-05, |
|
"loss": 0.0814, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"grad_norm": 0.1376953125, |
|
"learning_rate": 9.639203015616588e-05, |
|
"loss": 0.0864, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.638522427440633, |
|
"eval_loss": 0.05968466028571129, |
|
"eval_runtime": 27.6897, |
|
"eval_samples_per_second": 72.229, |
|
"eval_steps_per_second": 0.289, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.691292875989446, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 9.423801830910071e-05, |
|
"loss": 0.0869, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.7440633245382586, |
|
"grad_norm": 0.12890625, |
|
"learning_rate": 9.208400646203555e-05, |
|
"loss": 0.0821, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.7704485488126647, |
|
"eval_loss": 0.059157080948352814, |
|
"eval_runtime": 27.7435, |
|
"eval_samples_per_second": 72.089, |
|
"eval_steps_per_second": 0.288, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.796833773087071, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 8.99299946149704e-05, |
|
"loss": 0.0842, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.849604221635884, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 8.777598276790523e-05, |
|
"loss": 0.0846, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.9023746701846966, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 8.562197092084006e-05, |
|
"loss": 0.0841, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.9023746701846966, |
|
"eval_loss": 0.05879725515842438, |
|
"eval_runtime": 27.612, |
|
"eval_samples_per_second": 72.432, |
|
"eval_steps_per_second": 0.29, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.955145118733509, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 8.346795907377491e-05, |
|
"loss": 0.0809, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.007915567282322, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 8.131394722670975e-05, |
|
"loss": 0.0815, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.034300791556728, |
|
"eval_loss": 0.05831225588917732, |
|
"eval_runtime": 27.6258, |
|
"eval_samples_per_second": 72.396, |
|
"eval_steps_per_second": 0.29, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.0606860158311346, |
|
"grad_norm": 0.130859375, |
|
"learning_rate": 7.91599353796446e-05, |
|
"loss": 0.0793, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.113456464379947, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 7.700592353257944e-05, |
|
"loss": 0.0775, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.16622691292876, |
|
"grad_norm": 0.1357421875, |
|
"learning_rate": 7.485191168551428e-05, |
|
"loss": 0.0795, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.16622691292876, |
|
"eval_loss": 0.0580158606171608, |
|
"eval_runtime": 27.9777, |
|
"eval_samples_per_second": 71.485, |
|
"eval_steps_per_second": 0.286, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.2189973614775726, |
|
"grad_norm": 0.1220703125, |
|
"learning_rate": 7.269789983844911e-05, |
|
"loss": 0.0766, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.271767810026385, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 7.054388799138396e-05, |
|
"loss": 0.0732, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.2981530343007917, |
|
"eval_loss": 0.057783834636211395, |
|
"eval_runtime": 28.6683, |
|
"eval_samples_per_second": 69.763, |
|
"eval_steps_per_second": 0.279, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.324538258575198, |
|
"grad_norm": 0.130859375, |
|
"learning_rate": 6.83898761443188e-05, |
|
"loss": 0.0754, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.3773087071240107, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 6.623586429725363e-05, |
|
"loss": 0.0793, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.430079155672823, |
|
"grad_norm": 0.1181640625, |
|
"learning_rate": 6.408185245018848e-05, |
|
"loss": 0.076, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.430079155672823, |
|
"eval_loss": 0.05801219865679741, |
|
"eval_runtime": 28.2125, |
|
"eval_samples_per_second": 70.891, |
|
"eval_steps_per_second": 0.284, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.4828496042216357, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 6.192784060312333e-05, |
|
"loss": 0.0745, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.5356200527704487, |
|
"grad_norm": 0.1142578125, |
|
"learning_rate": 5.9773828756058156e-05, |
|
"loss": 0.0766, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.5620052770448547, |
|
"eval_loss": 0.05800151824951172, |
|
"eval_runtime": 27.919, |
|
"eval_samples_per_second": 71.636, |
|
"eval_steps_per_second": 0.287, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.588390501319261, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 5.7619816908993005e-05, |
|
"loss": 0.0753, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.641160949868074, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 5.5465805061927846e-05, |
|
"loss": 0.0772, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.6939313984168867, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 5.331179321486268e-05, |
|
"loss": 0.0716, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.6939313984168867, |
|
"eval_loss": 0.057653266936540604, |
|
"eval_runtime": 28.2955, |
|
"eval_samples_per_second": 70.683, |
|
"eval_steps_per_second": 0.283, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.746701846965699, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 5.115778136779753e-05, |
|
"loss": 0.0744, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.7994722955145117, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 4.9003769520732365e-05, |
|
"loss": 0.0777, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.825857519788918, |
|
"eval_loss": 0.05697743222117424, |
|
"eval_runtime": 28.2563, |
|
"eval_samples_per_second": 70.781, |
|
"eval_steps_per_second": 0.283, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.8522427440633247, |
|
"grad_norm": 0.1640625, |
|
"learning_rate": 4.6849757673667206e-05, |
|
"loss": 0.0736, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.905013192612137, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 4.469574582660205e-05, |
|
"loss": 0.0753, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.9577836411609497, |
|
"grad_norm": 0.12255859375, |
|
"learning_rate": 4.254173397953689e-05, |
|
"loss": 0.0745, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.9577836411609497, |
|
"eval_loss": 0.05676369369029999, |
|
"eval_runtime": 27.6767, |
|
"eval_samples_per_second": 72.263, |
|
"eval_steps_per_second": 0.289, |
|
"step": 1500 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 1895, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.03702605821971e+19, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|