{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99695843190267, "eval_steps": 500, "global_step": 1107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027036160865157147, "grad_norm": 3.3627634794530086, "learning_rate": 5e-06, "loss": 0.8839, "step": 10 }, { "epoch": 0.054072321730314295, "grad_norm": 1.9703511972746102, "learning_rate": 5e-06, "loss": 0.6886, "step": 20 }, { "epoch": 0.08110848259547145, "grad_norm": 1.895618934030067, "learning_rate": 5e-06, "loss": 0.649, "step": 30 }, { "epoch": 0.10814464346062859, "grad_norm": 3.018653004689794, "learning_rate": 5e-06, "loss": 0.6375, "step": 40 }, { "epoch": 0.13518080432578575, "grad_norm": 2.253174703767421, "learning_rate": 5e-06, "loss": 0.6233, "step": 50 }, { "epoch": 0.1622169651909429, "grad_norm": 2.5770804247981793, "learning_rate": 5e-06, "loss": 0.6164, "step": 60 }, { "epoch": 0.18925312605610004, "grad_norm": 1.9685443465813035, "learning_rate": 5e-06, "loss": 0.6127, "step": 70 }, { "epoch": 0.21628928692125718, "grad_norm": 2.1325268775254034, "learning_rate": 5e-06, "loss": 0.6039, "step": 80 }, { "epoch": 0.24332544778641432, "grad_norm": 1.4745840951249662, "learning_rate": 5e-06, "loss": 0.601, "step": 90 }, { "epoch": 0.2703616086515715, "grad_norm": 1.971048371029162, "learning_rate": 5e-06, "loss": 0.6032, "step": 100 }, { "epoch": 0.29739776951672864, "grad_norm": 1.6569004063408272, "learning_rate": 5e-06, "loss": 0.6012, "step": 110 }, { "epoch": 0.3244339303818858, "grad_norm": 2.057401724353791, "learning_rate": 5e-06, "loss": 0.5971, "step": 120 }, { "epoch": 0.3514700912470429, "grad_norm": 1.525579699451329, "learning_rate": 5e-06, "loss": 0.5993, "step": 130 }, { "epoch": 0.37850625211220007, "grad_norm": 1.60308792852181, "learning_rate": 5e-06, "loss": 0.5985, "step": 140 }, { "epoch": 0.4055424129773572, "grad_norm": 1.6398129211146073, "learning_rate": 5e-06, "loss": 0.5895, "step": 150 }, { "epoch": 0.43257857384251436, "grad_norm": 1.9310976828981994, "learning_rate": 5e-06, "loss": 0.5918, "step": 160 }, { "epoch": 0.4596147347076715, "grad_norm": 1.385584059931609, "learning_rate": 5e-06, "loss": 0.593, "step": 170 }, { "epoch": 0.48665089557282865, "grad_norm": 1.2861159852159507, "learning_rate": 5e-06, "loss": 0.5918, "step": 180 }, { "epoch": 0.5136870564379858, "grad_norm": 1.444176832285908, "learning_rate": 5e-06, "loss": 0.5885, "step": 190 }, { "epoch": 0.540723217303143, "grad_norm": 1.4862672671337698, "learning_rate": 5e-06, "loss": 0.5867, "step": 200 }, { "epoch": 0.5677593781683001, "grad_norm": 1.5075706410261351, "learning_rate": 5e-06, "loss": 0.5885, "step": 210 }, { "epoch": 0.5947955390334573, "grad_norm": 1.2766576640698686, "learning_rate": 5e-06, "loss": 0.5831, "step": 220 }, { "epoch": 0.6218316998986144, "grad_norm": 1.3449220128756942, "learning_rate": 5e-06, "loss": 0.5853, "step": 230 }, { "epoch": 0.6488678607637716, "grad_norm": 1.2820825044433828, "learning_rate": 5e-06, "loss": 0.5816, "step": 240 }, { "epoch": 0.6759040216289287, "grad_norm": 2.0436410705486927, "learning_rate": 5e-06, "loss": 0.575, "step": 250 }, { "epoch": 0.7029401824940859, "grad_norm": 1.5111257680075043, "learning_rate": 5e-06, "loss": 0.5781, "step": 260 }, { "epoch": 0.729976343359243, "grad_norm": 1.4284810364538454, "learning_rate": 5e-06, "loss": 0.5805, "step": 270 }, { "epoch": 0.7570125042244001, "grad_norm": 1.2912410275065467, "learning_rate": 5e-06, "loss": 0.5753, "step": 280 }, { 
"epoch": 0.7840486650895573, "grad_norm": 1.3589332122580937, "learning_rate": 5e-06, "loss": 0.5757, "step": 290 }, { "epoch": 0.8110848259547144, "grad_norm": 1.4270507706028406, "learning_rate": 5e-06, "loss": 0.5759, "step": 300 }, { "epoch": 0.8381209868198716, "grad_norm": 1.4167431698605149, "learning_rate": 5e-06, "loss": 0.5728, "step": 310 }, { "epoch": 0.8651571476850287, "grad_norm": 1.476127949628171, "learning_rate": 5e-06, "loss": 0.5711, "step": 320 }, { "epoch": 0.8921933085501859, "grad_norm": 1.3615337414773585, "learning_rate": 5e-06, "loss": 0.5729, "step": 330 }, { "epoch": 0.919229469415343, "grad_norm": 1.3530496841079478, "learning_rate": 5e-06, "loss": 0.5797, "step": 340 }, { "epoch": 0.9462656302805001, "grad_norm": 1.6939163932161898, "learning_rate": 5e-06, "loss": 0.5713, "step": 350 }, { "epoch": 0.9733017911456573, "grad_norm": 1.2503611827765622, "learning_rate": 5e-06, "loss": 0.5711, "step": 360 }, { "epoch": 0.9976343359242987, "eval_loss": 0.07110526412725449, "eval_runtime": 379.7453, "eval_samples_per_second": 26.241, "eval_steps_per_second": 0.411, "step": 369 }, { "epoch": 1.0023656640757013, "grad_norm": 2.951009083063046, "learning_rate": 5e-06, "loss": 0.5659, "step": 370 }, { "epoch": 1.0294018249408583, "grad_norm": 2.275911553065513, "learning_rate": 5e-06, "loss": 0.4791, "step": 380 }, { "epoch": 1.0564379858060156, "grad_norm": 1.9146887520712665, "learning_rate": 5e-06, "loss": 0.4742, "step": 390 }, { "epoch": 1.0834741466711728, "grad_norm": 1.7060151561064856, "learning_rate": 5e-06, "loss": 0.4754, "step": 400 }, { "epoch": 1.1105103075363298, "grad_norm": 1.4307333510541713, "learning_rate": 5e-06, "loss": 0.4723, "step": 410 }, { "epoch": 1.1375464684014869, "grad_norm": 1.3914100165165024, "learning_rate": 5e-06, "loss": 0.4744, "step": 420 }, { "epoch": 1.1645826292666441, "grad_norm": 1.7655561621577454, "learning_rate": 5e-06, "loss": 0.4823, "step": 430 }, { "epoch": 1.1916187901318014, "grad_norm": 1.750541560120252, "learning_rate": 5e-06, "loss": 0.4746, "step": 440 }, { "epoch": 1.2186549509969584, "grad_norm": 1.3542180298546558, "learning_rate": 5e-06, "loss": 0.4795, "step": 450 }, { "epoch": 1.2456911118621157, "grad_norm": 1.3487709786995525, "learning_rate": 5e-06, "loss": 0.4811, "step": 460 }, { "epoch": 1.2727272727272727, "grad_norm": 1.4890081594482487, "learning_rate": 5e-06, "loss": 0.4838, "step": 470 }, { "epoch": 1.29976343359243, "grad_norm": 1.6710110403178111, "learning_rate": 5e-06, "loss": 0.4803, "step": 480 }, { "epoch": 1.326799594457587, "grad_norm": 1.3946918887630642, "learning_rate": 5e-06, "loss": 0.4814, "step": 490 }, { "epoch": 1.3538357553227442, "grad_norm": 1.349529389304425, "learning_rate": 5e-06, "loss": 0.4838, "step": 500 }, { "epoch": 1.3808719161879013, "grad_norm": 1.7196111346917198, "learning_rate": 5e-06, "loss": 0.4834, "step": 510 }, { "epoch": 1.4079080770530585, "grad_norm": 1.3671419685233817, "learning_rate": 5e-06, "loss": 0.4868, "step": 520 }, { "epoch": 1.4349442379182156, "grad_norm": 1.6390806735333066, "learning_rate": 5e-06, "loss": 0.4797, "step": 530 }, { "epoch": 1.4619803987833728, "grad_norm": 2.033732375443223, "learning_rate": 5e-06, "loss": 0.4838, "step": 540 }, { "epoch": 1.4890165596485299, "grad_norm": 1.6780750098228978, "learning_rate": 5e-06, "loss": 0.4828, "step": 550 }, { "epoch": 1.5160527205136871, "grad_norm": 1.4650147086655332, "learning_rate": 5e-06, "loss": 0.4833, "step": 560 }, { "epoch": 1.5430888813788441, "grad_norm": 
1.6474963909234748, "learning_rate": 5e-06, "loss": 0.4801, "step": 570 }, { "epoch": 1.5701250422440014, "grad_norm": 1.2672048074445312, "learning_rate": 5e-06, "loss": 0.4821, "step": 580 }, { "epoch": 1.5971612031091587, "grad_norm": 1.2455105330473952, "learning_rate": 5e-06, "loss": 0.4858, "step": 590 }, { "epoch": 1.6241973639743157, "grad_norm": 1.3365218119955355, "learning_rate": 5e-06, "loss": 0.4826, "step": 600 }, { "epoch": 1.6512335248394727, "grad_norm": 1.453771936609652, "learning_rate": 5e-06, "loss": 0.4824, "step": 610 }, { "epoch": 1.67826968570463, "grad_norm": 1.3826705153584662, "learning_rate": 5e-06, "loss": 0.4872, "step": 620 }, { "epoch": 1.7053058465697872, "grad_norm": 1.3948399069255963, "learning_rate": 5e-06, "loss": 0.4923, "step": 630 }, { "epoch": 1.7323420074349443, "grad_norm": 1.3015399889788772, "learning_rate": 5e-06, "loss": 0.4897, "step": 640 }, { "epoch": 1.7593781683001013, "grad_norm": 1.5649098361174691, "learning_rate": 5e-06, "loss": 0.4882, "step": 650 }, { "epoch": 1.7864143291652586, "grad_norm": 1.4369334673977943, "learning_rate": 5e-06, "loss": 0.4856, "step": 660 }, { "epoch": 1.8134504900304158, "grad_norm": 1.3582444869164498, "learning_rate": 5e-06, "loss": 0.4872, "step": 670 }, { "epoch": 1.8404866508955728, "grad_norm": 1.4410245166819187, "learning_rate": 5e-06, "loss": 0.4902, "step": 680 }, { "epoch": 1.8675228117607299, "grad_norm": 1.2401548016118424, "learning_rate": 5e-06, "loss": 0.4876, "step": 690 }, { "epoch": 1.8945589726258871, "grad_norm": 1.3435104700539477, "learning_rate": 5e-06, "loss": 0.4906, "step": 700 }, { "epoch": 1.9215951334910444, "grad_norm": 1.4535634930233825, "learning_rate": 5e-06, "loss": 0.4955, "step": 710 }, { "epoch": 1.9486312943562014, "grad_norm": 1.267760090624259, "learning_rate": 5e-06, "loss": 0.4887, "step": 720 }, { "epoch": 1.9756674552213584, "grad_norm": 1.2488776839475728, "learning_rate": 5e-06, "loss": 0.4845, "step": 730 }, { "epoch": 1.9972963839134843, "eval_loss": 0.07172359526157379, "eval_runtime": 380.951, "eval_samples_per_second": 26.158, "eval_steps_per_second": 0.41, "step": 738 }, { "epoch": 2.0047313281514025, "grad_norm": 3.0892791498740326, "learning_rate": 5e-06, "loss": 0.4728, "step": 740 }, { "epoch": 2.0317674890165596, "grad_norm": 2.0695271423861037, "learning_rate": 5e-06, "loss": 0.3816, "step": 750 }, { "epoch": 2.0588036498817166, "grad_norm": 1.694338271769957, "learning_rate": 5e-06, "loss": 0.3823, "step": 760 }, { "epoch": 2.085839810746874, "grad_norm": 1.492703018660841, "learning_rate": 5e-06, "loss": 0.3768, "step": 770 }, { "epoch": 2.112875971612031, "grad_norm": 1.6552377461262013, "learning_rate": 5e-06, "loss": 0.3769, "step": 780 }, { "epoch": 2.139912132477188, "grad_norm": 1.9420361419361558, "learning_rate": 5e-06, "loss": 0.3761, "step": 790 }, { "epoch": 2.1669482933423456, "grad_norm": 1.5563624108859666, "learning_rate": 5e-06, "loss": 0.3826, "step": 800 }, { "epoch": 2.1939844542075027, "grad_norm": 1.7915429380933352, "learning_rate": 5e-06, "loss": 0.3804, "step": 810 }, { "epoch": 2.2210206150726597, "grad_norm": 1.5584683923881884, "learning_rate": 5e-06, "loss": 0.3816, "step": 820 }, { "epoch": 2.2480567759378167, "grad_norm": 1.60097536516568, "learning_rate": 5e-06, "loss": 0.3844, "step": 830 }, { "epoch": 2.2750929368029738, "grad_norm": 1.6548064908062865, "learning_rate": 5e-06, "loss": 0.3865, "step": 840 }, { "epoch": 2.3021290976681312, "grad_norm": 1.7027619140998314, "learning_rate": 5e-06, "loss": 0.3818, 
"step": 850 }, { "epoch": 2.3291652585332883, "grad_norm": 1.6016849568444829, "learning_rate": 5e-06, "loss": 0.3847, "step": 860 }, { "epoch": 2.3562014193984453, "grad_norm": 1.8796231385046944, "learning_rate": 5e-06, "loss": 0.39, "step": 870 }, { "epoch": 2.3832375802636028, "grad_norm": 1.5319470307978418, "learning_rate": 5e-06, "loss": 0.3892, "step": 880 }, { "epoch": 2.41027374112876, "grad_norm": 1.7017719120193255, "learning_rate": 5e-06, "loss": 0.3881, "step": 890 }, { "epoch": 2.437309901993917, "grad_norm": 1.5344718368107968, "learning_rate": 5e-06, "loss": 0.3873, "step": 900 }, { "epoch": 2.464346062859074, "grad_norm": 1.6102507634771308, "learning_rate": 5e-06, "loss": 0.3854, "step": 910 }, { "epoch": 2.4913822237242313, "grad_norm": 1.6690069949519504, "learning_rate": 5e-06, "loss": 0.3872, "step": 920 }, { "epoch": 2.5184183845893884, "grad_norm": 1.5743935677314018, "learning_rate": 5e-06, "loss": 0.3867, "step": 930 }, { "epoch": 2.5454545454545454, "grad_norm": 1.5720807503966818, "learning_rate": 5e-06, "loss": 0.394, "step": 940 }, { "epoch": 2.5724907063197024, "grad_norm": 1.4596744684339498, "learning_rate": 5e-06, "loss": 0.3896, "step": 950 }, { "epoch": 2.59952686718486, "grad_norm": 1.4774112887538513, "learning_rate": 5e-06, "loss": 0.3959, "step": 960 }, { "epoch": 2.626563028050017, "grad_norm": 1.6927054304904465, "learning_rate": 5e-06, "loss": 0.3946, "step": 970 }, { "epoch": 2.653599188915174, "grad_norm": 1.6990634986226298, "learning_rate": 5e-06, "loss": 0.399, "step": 980 }, { "epoch": 2.6806353497803315, "grad_norm": 1.5811069605653503, "learning_rate": 5e-06, "loss": 0.3968, "step": 990 }, { "epoch": 2.7076715106454885, "grad_norm": 1.929742002611046, "learning_rate": 5e-06, "loss": 0.3906, "step": 1000 }, { "epoch": 2.7347076715106455, "grad_norm": 1.4332871535309044, "learning_rate": 5e-06, "loss": 0.3984, "step": 1010 }, { "epoch": 2.7617438323758026, "grad_norm": 1.6711055842838813, "learning_rate": 5e-06, "loss": 0.4002, "step": 1020 }, { "epoch": 2.7887799932409596, "grad_norm": 1.6261611517040526, "learning_rate": 5e-06, "loss": 0.3984, "step": 1030 }, { "epoch": 2.815816154106117, "grad_norm": 1.4326621075330992, "learning_rate": 5e-06, "loss": 0.3972, "step": 1040 }, { "epoch": 2.842852314971274, "grad_norm": 1.4683518261050355, "learning_rate": 5e-06, "loss": 0.399, "step": 1050 }, { "epoch": 2.869888475836431, "grad_norm": 1.4432147424830148, "learning_rate": 5e-06, "loss": 0.3953, "step": 1060 }, { "epoch": 2.8969246367015886, "grad_norm": 1.4795447507798194, "learning_rate": 5e-06, "loss": 0.4029, "step": 1070 }, { "epoch": 2.9239607975667457, "grad_norm": 1.54599126728265, "learning_rate": 5e-06, "loss": 0.3982, "step": 1080 }, { "epoch": 2.9509969584319027, "grad_norm": 1.4383101466258315, "learning_rate": 5e-06, "loss": 0.4018, "step": 1090 }, { "epoch": 2.9780331192970597, "grad_norm": 1.3572001471611468, "learning_rate": 5e-06, "loss": 0.401, "step": 1100 }, { "epoch": 2.99695843190267, "eval_loss": 0.07649821043014526, "eval_runtime": 382.4679, "eval_samples_per_second": 26.054, "eval_steps_per_second": 0.408, "step": 1107 }, { "epoch": 2.99695843190267, "step": 1107, "total_flos": 1854056851046400.0, "train_loss": 0.49201587243670264, "train_runtime": 63342.9422, "train_samples_per_second": 8.967, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 1107, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": 
false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1854056851046400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }