|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.99695843190267,
  "eval_steps": 500,
  "global_step": 1107,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.027036160865157147,
      "grad_norm": 3.3627634794530086,
      "learning_rate": 5e-06,
      "loss": 0.8839,
      "step": 10
    },
    {
      "epoch": 0.054072321730314295,
      "grad_norm": 1.9703511972746102,
      "learning_rate": 5e-06,
      "loss": 0.6886,
      "step": 20
    },
    {
      "epoch": 0.08110848259547145,
      "grad_norm": 1.895618934030067,
      "learning_rate": 5e-06,
      "loss": 0.649,
      "step": 30
    },
    {
      "epoch": 0.10814464346062859,
      "grad_norm": 3.018653004689794,
      "learning_rate": 5e-06,
      "loss": 0.6375,
      "step": 40
    },
    {
      "epoch": 0.13518080432578575,
      "grad_norm": 2.253174703767421,
      "learning_rate": 5e-06,
      "loss": 0.6233,
      "step": 50
    },
    {
      "epoch": 0.1622169651909429,
      "grad_norm": 2.5770804247981793,
      "learning_rate": 5e-06,
      "loss": 0.6164,
      "step": 60
    },
    {
      "epoch": 0.18925312605610004,
      "grad_norm": 1.9685443465813035,
      "learning_rate": 5e-06,
      "loss": 0.6127,
      "step": 70
    },
    {
      "epoch": 0.21628928692125718,
      "grad_norm": 2.1325268775254034,
      "learning_rate": 5e-06,
      "loss": 0.6039,
      "step": 80
    },
    {
      "epoch": 0.24332544778641432,
      "grad_norm": 1.4745840951249662,
      "learning_rate": 5e-06,
      "loss": 0.601,
      "step": 90
    },
    {
      "epoch": 0.2703616086515715,
      "grad_norm": 1.971048371029162,
      "learning_rate": 5e-06,
      "loss": 0.6032,
      "step": 100
    },
    {
      "epoch": 0.29739776951672864,
      "grad_norm": 1.6569004063408272,
      "learning_rate": 5e-06,
      "loss": 0.6012,
      "step": 110
    },
    {
      "epoch": 0.3244339303818858,
      "grad_norm": 2.057401724353791,
      "learning_rate": 5e-06,
      "loss": 0.5971,
      "step": 120
    },
    {
      "epoch": 0.3514700912470429,
      "grad_norm": 1.525579699451329,
      "learning_rate": 5e-06,
      "loss": 0.5993,
      "step": 130
    },
    {
      "epoch": 0.37850625211220007,
      "grad_norm": 1.60308792852181,
      "learning_rate": 5e-06,
      "loss": 0.5985,
      "step": 140
    },
    {
      "epoch": 0.4055424129773572,
      "grad_norm": 1.6398129211146073,
      "learning_rate": 5e-06,
      "loss": 0.5895,
      "step": 150
    },
    {
      "epoch": 0.43257857384251436,
      "grad_norm": 1.9310976828981994,
      "learning_rate": 5e-06,
      "loss": 0.5918,
      "step": 160
    },
    {
      "epoch": 0.4596147347076715,
      "grad_norm": 1.385584059931609,
      "learning_rate": 5e-06,
      "loss": 0.593,
      "step": 170
    },
    {
      "epoch": 0.48665089557282865,
      "grad_norm": 1.2861159852159507,
      "learning_rate": 5e-06,
      "loss": 0.5918,
      "step": 180
    },
    {
      "epoch": 0.5136870564379858,
      "grad_norm": 1.444176832285908,
      "learning_rate": 5e-06,
      "loss": 0.5885,
      "step": 190
    },
    {
      "epoch": 0.540723217303143,
      "grad_norm": 1.4862672671337698,
      "learning_rate": 5e-06,
      "loss": 0.5867,
      "step": 200
    },
    {
      "epoch": 0.5677593781683001,
      "grad_norm": 1.5075706410261351,
      "learning_rate": 5e-06,
      "loss": 0.5885,
      "step": 210
    },
    {
      "epoch": 0.5947955390334573,
      "grad_norm": 1.2766576640698686,
      "learning_rate": 5e-06,
      "loss": 0.5831,
      "step": 220
    },
    {
      "epoch": 0.6218316998986144,
      "grad_norm": 1.3449220128756942,
      "learning_rate": 5e-06,
      "loss": 0.5853,
      "step": 230
    },
    {
      "epoch": 0.6488678607637716,
      "grad_norm": 1.2820825044433828,
      "learning_rate": 5e-06,
      "loss": 0.5816,
      "step": 240
    },
    {
      "epoch": 0.6759040216289287,
      "grad_norm": 2.0436410705486927,
      "learning_rate": 5e-06,
      "loss": 0.575,
      "step": 250
    },
    {
      "epoch": 0.7029401824940859,
      "grad_norm": 1.5111257680075043,
      "learning_rate": 5e-06,
      "loss": 0.5781,
      "step": 260
    },
    {
      "epoch": 0.729976343359243,
      "grad_norm": 1.4284810364538454,
      "learning_rate": 5e-06,
      "loss": 0.5805,
      "step": 270
    },
    {
      "epoch": 0.7570125042244001,
      "grad_norm": 1.2912410275065467,
      "learning_rate": 5e-06,
      "loss": 0.5753,
      "step": 280
    },
    {
      "epoch": 0.7840486650895573,
      "grad_norm": 1.3589332122580937,
      "learning_rate": 5e-06,
      "loss": 0.5757,
      "step": 290
    },
    {
      "epoch": 0.8110848259547144,
      "grad_norm": 1.4270507706028406,
      "learning_rate": 5e-06,
      "loss": 0.5759,
      "step": 300
    },
    {
      "epoch": 0.8381209868198716,
      "grad_norm": 1.4167431698605149,
      "learning_rate": 5e-06,
      "loss": 0.5728,
      "step": 310
    },
    {
      "epoch": 0.8651571476850287,
      "grad_norm": 1.476127949628171,
      "learning_rate": 5e-06,
      "loss": 0.5711,
      "step": 320
    },
    {
      "epoch": 0.8921933085501859,
      "grad_norm": 1.3615337414773585,
      "learning_rate": 5e-06,
      "loss": 0.5729,
      "step": 330
    },
    {
      "epoch": 0.919229469415343,
      "grad_norm": 1.3530496841079478,
      "learning_rate": 5e-06,
      "loss": 0.5797,
      "step": 340
    },
    {
      "epoch": 0.9462656302805001,
      "grad_norm": 1.6939163932161898,
      "learning_rate": 5e-06,
      "loss": 0.5713,
      "step": 350
    },
    {
      "epoch": 0.9733017911456573,
      "grad_norm": 1.2503611827765622,
      "learning_rate": 5e-06,
      "loss": 0.5711,
      "step": 360
    },
    {
      "epoch": 0.9976343359242987,
      "eval_loss": 0.07110526412725449,
      "eval_runtime": 379.7453,
      "eval_samples_per_second": 26.241,
      "eval_steps_per_second": 0.411,
      "step": 369
    },
    {
      "epoch": 1.0023656640757013,
      "grad_norm": 2.951009083063046,
      "learning_rate": 5e-06,
      "loss": 0.5659,
      "step": 370
    },
    {
      "epoch": 1.0294018249408583,
      "grad_norm": 2.275911553065513,
      "learning_rate": 5e-06,
      "loss": 0.4791,
      "step": 380
    },
    {
      "epoch": 1.0564379858060156,
      "grad_norm": 1.9146887520712665,
      "learning_rate": 5e-06,
      "loss": 0.4742,
      "step": 390
    },
    {
      "epoch": 1.0834741466711728,
      "grad_norm": 1.7060151561064856,
      "learning_rate": 5e-06,
      "loss": 0.4754,
      "step": 400
    },
    {
      "epoch": 1.1105103075363298,
      "grad_norm": 1.4307333510541713,
      "learning_rate": 5e-06,
      "loss": 0.4723,
      "step": 410
    },
    {
      "epoch": 1.1375464684014869,
      "grad_norm": 1.3914100165165024,
      "learning_rate": 5e-06,
      "loss": 0.4744,
      "step": 420
    },
    {
      "epoch": 1.1645826292666441,
      "grad_norm": 1.7655561621577454,
      "learning_rate": 5e-06,
      "loss": 0.4823,
      "step": 430
    },
    {
      "epoch": 1.1916187901318014,
      "grad_norm": 1.750541560120252,
      "learning_rate": 5e-06,
      "loss": 0.4746,
      "step": 440
    },
    {
      "epoch": 1.2186549509969584,
      "grad_norm": 1.3542180298546558,
      "learning_rate": 5e-06,
      "loss": 0.4795,
      "step": 450
    },
    {
      "epoch": 1.2456911118621157,
      "grad_norm": 1.3487709786995525,
      "learning_rate": 5e-06,
      "loss": 0.4811,
      "step": 460
    },
    {
      "epoch": 1.2727272727272727,
      "grad_norm": 1.4890081594482487,
      "learning_rate": 5e-06,
      "loss": 0.4838,
      "step": 470
    },
    {
      "epoch": 1.29976343359243,
      "grad_norm": 1.6710110403178111,
      "learning_rate": 5e-06,
      "loss": 0.4803,
      "step": 480
    },
    {
      "epoch": 1.326799594457587,
      "grad_norm": 1.3946918887630642,
      "learning_rate": 5e-06,
      "loss": 0.4814,
      "step": 490
    },
    {
      "epoch": 1.3538357553227442,
      "grad_norm": 1.349529389304425,
      "learning_rate": 5e-06,
      "loss": 0.4838,
      "step": 500
    },
    {
      "epoch": 1.3808719161879013,
      "grad_norm": 1.7196111346917198,
      "learning_rate": 5e-06,
      "loss": 0.4834,
      "step": 510
    },
    {
      "epoch": 1.4079080770530585,
      "grad_norm": 1.3671419685233817,
      "learning_rate": 5e-06,
      "loss": 0.4868,
      "step": 520
    },
    {
      "epoch": 1.4349442379182156,
      "grad_norm": 1.6390806735333066,
      "learning_rate": 5e-06,
      "loss": 0.4797,
      "step": 530
    },
    {
      "epoch": 1.4619803987833728,
      "grad_norm": 2.033732375443223,
      "learning_rate": 5e-06,
      "loss": 0.4838,
      "step": 540
    },
    {
      "epoch": 1.4890165596485299,
      "grad_norm": 1.6780750098228978,
      "learning_rate": 5e-06,
      "loss": 0.4828,
      "step": 550
    },
    {
      "epoch": 1.5160527205136871,
      "grad_norm": 1.4650147086655332,
      "learning_rate": 5e-06,
      "loss": 0.4833,
      "step": 560
    },
    {
      "epoch": 1.5430888813788441,
      "grad_norm": 1.6474963909234748,
      "learning_rate": 5e-06,
      "loss": 0.4801,
      "step": 570
    },
    {
      "epoch": 1.5701250422440014,
      "grad_norm": 1.2672048074445312,
      "learning_rate": 5e-06,
      "loss": 0.4821,
      "step": 580
    },
    {
      "epoch": 1.5971612031091587,
      "grad_norm": 1.2455105330473952,
      "learning_rate": 5e-06,
      "loss": 0.4858,
      "step": 590
    },
    {
      "epoch": 1.6241973639743157,
      "grad_norm": 1.3365218119955355,
      "learning_rate": 5e-06,
      "loss": 0.4826,
      "step": 600
    },
    {
      "epoch": 1.6512335248394727,
      "grad_norm": 1.453771936609652,
      "learning_rate": 5e-06,
      "loss": 0.4824,
      "step": 610
    },
    {
      "epoch": 1.67826968570463,
      "grad_norm": 1.3826705153584662,
      "learning_rate": 5e-06,
      "loss": 0.4872,
      "step": 620
    },
    {
      "epoch": 1.7053058465697872,
      "grad_norm": 1.3948399069255963,
      "learning_rate": 5e-06,
      "loss": 0.4923,
      "step": 630
    },
    {
      "epoch": 1.7323420074349443,
      "grad_norm": 1.3015399889788772,
      "learning_rate": 5e-06,
      "loss": 0.4897,
      "step": 640
    },
    {
      "epoch": 1.7593781683001013,
      "grad_norm": 1.5649098361174691,
      "learning_rate": 5e-06,
      "loss": 0.4882,
      "step": 650
    },
    {
      "epoch": 1.7864143291652586,
      "grad_norm": 1.4369334673977943,
      "learning_rate": 5e-06,
      "loss": 0.4856,
      "step": 660
    },
    {
      "epoch": 1.8134504900304158,
      "grad_norm": 1.3582444869164498,
      "learning_rate": 5e-06,
      "loss": 0.4872,
      "step": 670
    },
    {
      "epoch": 1.8404866508955728,
      "grad_norm": 1.4410245166819187,
      "learning_rate": 5e-06,
      "loss": 0.4902,
      "step": 680
    },
    {
      "epoch": 1.8675228117607299,
      "grad_norm": 1.2401548016118424,
      "learning_rate": 5e-06,
      "loss": 0.4876,
      "step": 690
    },
    {
      "epoch": 1.8945589726258871,
      "grad_norm": 1.3435104700539477,
      "learning_rate": 5e-06,
      "loss": 0.4906,
      "step": 700
    },
    {
      "epoch": 1.9215951334910444,
      "grad_norm": 1.4535634930233825,
      "learning_rate": 5e-06,
      "loss": 0.4955,
      "step": 710
    },
    {
      "epoch": 1.9486312943562014,
      "grad_norm": 1.267760090624259,
      "learning_rate": 5e-06,
      "loss": 0.4887,
      "step": 720
    },
    {
      "epoch": 1.9756674552213584,
      "grad_norm": 1.2488776839475728,
      "learning_rate": 5e-06,
      "loss": 0.4845,
      "step": 730
    },
    {
      "epoch": 1.9972963839134843,
      "eval_loss": 0.07172359526157379,
      "eval_runtime": 380.951,
      "eval_samples_per_second": 26.158,
      "eval_steps_per_second": 0.41,
      "step": 738
    },
    {
      "epoch": 2.0047313281514025,
      "grad_norm": 3.0892791498740326,
      "learning_rate": 5e-06,
      "loss": 0.4728,
      "step": 740
    },
    {
      "epoch": 2.0317674890165596,
      "grad_norm": 2.0695271423861037,
      "learning_rate": 5e-06,
      "loss": 0.3816,
      "step": 750
    },
    {
      "epoch": 2.0588036498817166,
      "grad_norm": 1.694338271769957,
      "learning_rate": 5e-06,
      "loss": 0.3823,
      "step": 760
    },
    {
      "epoch": 2.085839810746874,
      "grad_norm": 1.492703018660841,
      "learning_rate": 5e-06,
      "loss": 0.3768,
      "step": 770
    },
    {
      "epoch": 2.112875971612031,
      "grad_norm": 1.6552377461262013,
      "learning_rate": 5e-06,
      "loss": 0.3769,
      "step": 780
    },
    {
      "epoch": 2.139912132477188,
      "grad_norm": 1.9420361419361558,
      "learning_rate": 5e-06,
      "loss": 0.3761,
      "step": 790
    },
    {
      "epoch": 2.1669482933423456,
      "grad_norm": 1.5563624108859666,
      "learning_rate": 5e-06,
      "loss": 0.3826,
      "step": 800
    },
    {
      "epoch": 2.1939844542075027,
      "grad_norm": 1.7915429380933352,
      "learning_rate": 5e-06,
      "loss": 0.3804,
      "step": 810
    },
    {
      "epoch": 2.2210206150726597,
      "grad_norm": 1.5584683923881884,
      "learning_rate": 5e-06,
      "loss": 0.3816,
      "step": 820
    },
    {
      "epoch": 2.2480567759378167,
      "grad_norm": 1.60097536516568,
      "learning_rate": 5e-06,
      "loss": 0.3844,
      "step": 830
    },
    {
      "epoch": 2.2750929368029738,
      "grad_norm": 1.6548064908062865,
      "learning_rate": 5e-06,
      "loss": 0.3865,
      "step": 840
    },
    {
      "epoch": 2.3021290976681312,
      "grad_norm": 1.7027619140998314,
      "learning_rate": 5e-06,
      "loss": 0.3818,
      "step": 850
    },
    {
      "epoch": 2.3291652585332883,
      "grad_norm": 1.6016849568444829,
      "learning_rate": 5e-06,
      "loss": 0.3847,
      "step": 860
    },
    {
      "epoch": 2.3562014193984453,
      "grad_norm": 1.8796231385046944,
      "learning_rate": 5e-06,
      "loss": 0.39,
      "step": 870
    },
    {
      "epoch": 2.3832375802636028,
      "grad_norm": 1.5319470307978418,
      "learning_rate": 5e-06,
      "loss": 0.3892,
      "step": 880
    },
    {
      "epoch": 2.41027374112876,
      "grad_norm": 1.7017719120193255,
      "learning_rate": 5e-06,
      "loss": 0.3881,
      "step": 890
    },
    {
      "epoch": 2.437309901993917,
      "grad_norm": 1.5344718368107968,
      "learning_rate": 5e-06,
      "loss": 0.3873,
      "step": 900
    },
    {
      "epoch": 2.464346062859074,
      "grad_norm": 1.6102507634771308,
      "learning_rate": 5e-06,
      "loss": 0.3854,
      "step": 910
    },
    {
      "epoch": 2.4913822237242313,
      "grad_norm": 1.6690069949519504,
      "learning_rate": 5e-06,
      "loss": 0.3872,
      "step": 920
    },
    {
      "epoch": 2.5184183845893884,
      "grad_norm": 1.5743935677314018,
      "learning_rate": 5e-06,
      "loss": 0.3867,
      "step": 930
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 1.5720807503966818,
      "learning_rate": 5e-06,
      "loss": 0.394,
      "step": 940
    },
    {
      "epoch": 2.5724907063197024,
      "grad_norm": 1.4596744684339498,
      "learning_rate": 5e-06,
      "loss": 0.3896,
      "step": 950
    },
    {
      "epoch": 2.59952686718486,
      "grad_norm": 1.4774112887538513,
      "learning_rate": 5e-06,
      "loss": 0.3959,
      "step": 960
    },
    {
      "epoch": 2.626563028050017,
      "grad_norm": 1.6927054304904465,
      "learning_rate": 5e-06,
      "loss": 0.3946,
      "step": 970
    },
    {
      "epoch": 2.653599188915174,
      "grad_norm": 1.6990634986226298,
      "learning_rate": 5e-06,
      "loss": 0.399,
      "step": 980
    },
    {
      "epoch": 2.6806353497803315,
      "grad_norm": 1.5811069605653503,
      "learning_rate": 5e-06,
      "loss": 0.3968,
      "step": 990
    },
    {
      "epoch": 2.7076715106454885,
      "grad_norm": 1.929742002611046,
      "learning_rate": 5e-06,
      "loss": 0.3906,
      "step": 1000
    },
    {
      "epoch": 2.7347076715106455,
      "grad_norm": 1.4332871535309044,
      "learning_rate": 5e-06,
      "loss": 0.3984,
      "step": 1010
    },
    {
      "epoch": 2.7617438323758026,
      "grad_norm": 1.6711055842838813,
      "learning_rate": 5e-06,
      "loss": 0.4002,
      "step": 1020
    },
    {
      "epoch": 2.7887799932409596,
      "grad_norm": 1.6261611517040526,
      "learning_rate": 5e-06,
      "loss": 0.3984,
      "step": 1030
    },
    {
      "epoch": 2.815816154106117,
      "grad_norm": 1.4326621075330992,
      "learning_rate": 5e-06,
      "loss": 0.3972,
      "step": 1040
    },
    {
      "epoch": 2.842852314971274,
      "grad_norm": 1.4683518261050355,
      "learning_rate": 5e-06,
      "loss": 0.399,
      "step": 1050
    },
    {
      "epoch": 2.869888475836431,
      "grad_norm": 1.4432147424830148,
      "learning_rate": 5e-06,
      "loss": 0.3953,
      "step": 1060
    },
    {
      "epoch": 2.8969246367015886,
      "grad_norm": 1.4795447507798194,
      "learning_rate": 5e-06,
      "loss": 0.4029,
      "step": 1070
    },
    {
      "epoch": 2.9239607975667457,
      "grad_norm": 1.54599126728265,
      "learning_rate": 5e-06,
      "loss": 0.3982,
      "step": 1080
    },
    {
      "epoch": 2.9509969584319027,
      "grad_norm": 1.4383101466258315,
      "learning_rate": 5e-06,
      "loss": 0.4018,
      "step": 1090
    },
    {
      "epoch": 2.9780331192970597,
      "grad_norm": 1.3572001471611468,
      "learning_rate": 5e-06,
      "loss": 0.401,
      "step": 1100
    },
    {
      "epoch": 2.99695843190267,
      "eval_loss": 0.07649821043014526,
      "eval_runtime": 382.4679,
      "eval_samples_per_second": 26.054,
      "eval_steps_per_second": 0.408,
      "step": 1107
    },
    {
      "epoch": 2.99695843190267,
      "step": 1107,
      "total_flos": 1854056851046400.0,
      "train_loss": 0.49201587243670264,
      "train_runtime": 63342.9422,
      "train_samples_per_second": 8.967,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 1107,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1854056851046400.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|