{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 50, "global_step": 1516, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002638522427440633, "eval_loss": 0.3697243332862854, "eval_runtime": 31.4109, "eval_samples_per_second": 63.672, "eval_steps_per_second": 0.255, "step": 1 }, { "epoch": 0.052770448548812667, "grad_norm": 0.26953125, "learning_rate": 0.00010526315789473685, "loss": 0.3823, "step": 20 }, { "epoch": 0.10554089709762533, "grad_norm": 0.201171875, "learning_rate": 0.0001997845988152935, "loss": 0.2239, "step": 40 }, { "epoch": 0.13192612137203166, "eval_loss": 0.11808302253484726, "eval_runtime": 29.4538, "eval_samples_per_second": 67.903, "eval_steps_per_second": 0.272, "step": 50 }, { "epoch": 0.158311345646438, "grad_norm": 0.1962890625, "learning_rate": 0.00019763058696822833, "loss": 0.1799, "step": 60 }, { "epoch": 0.21108179419525067, "grad_norm": 0.1943359375, "learning_rate": 0.0001954765751211632, "loss": 0.1651, "step": 80 }, { "epoch": 0.2638522427440633, "grad_norm": 0.2255859375, "learning_rate": 0.00019332256327409802, "loss": 0.1571, "step": 100 }, { "epoch": 0.2638522427440633, "eval_loss": 0.09250890463590622, "eval_runtime": 28.2273, "eval_samples_per_second": 70.853, "eval_steps_per_second": 0.283, "step": 100 }, { "epoch": 0.316622691292876, "grad_norm": 0.2333984375, "learning_rate": 0.00019116855142703286, "loss": 0.1535, "step": 120 }, { "epoch": 0.36939313984168864, "grad_norm": 0.1611328125, "learning_rate": 0.00018901453957996772, "loss": 0.1456, "step": 140 }, { "epoch": 0.39577836411609496, "eval_loss": 0.08707328885793686, "eval_runtime": 27.6259, "eval_samples_per_second": 72.396, "eval_steps_per_second": 0.29, "step": 150 }, { "epoch": 0.42216358839050133, "grad_norm": 0.1884765625, "learning_rate": 0.00018686052773290255, "loss": 0.1402, "step": 160 }, { "epoch": 0.47493403693931396, "grad_norm": 0.2109375, "learning_rate": 0.0001847065158858374, "loss": 0.142, "step": 180 }, { "epoch": 0.5277044854881267, "grad_norm": 0.1533203125, "learning_rate": 0.00018255250403877222, "loss": 0.1318, "step": 200 }, { "epoch": 0.5277044854881267, "eval_loss": 0.080934077501297, "eval_runtime": 27.3743, "eval_samples_per_second": 73.061, "eval_steps_per_second": 0.292, "step": 200 }, { "epoch": 0.5804749340369393, "grad_norm": 0.216796875, "learning_rate": 0.00018039849219170706, "loss": 0.1301, "step": 220 }, { "epoch": 0.633245382585752, "grad_norm": 0.162109375, "learning_rate": 0.0001782444803446419, "loss": 0.1317, "step": 240 }, { "epoch": 0.6596306068601583, "eval_loss": 0.0750429555773735, "eval_runtime": 27.7505, "eval_samples_per_second": 72.071, "eval_steps_per_second": 0.288, "step": 250 }, { "epoch": 0.6860158311345647, "grad_norm": 0.185546875, "learning_rate": 0.00017609046849757676, "loss": 0.1269, "step": 260 }, { "epoch": 0.7387862796833773, "grad_norm": 0.203125, "learning_rate": 0.0001739364566505116, "loss": 0.1267, "step": 280 }, { "epoch": 0.7915567282321899, "grad_norm": 0.1455078125, "learning_rate": 0.00017178244480344642, "loss": 0.1226, "step": 300 }, { "epoch": 0.7915567282321899, "eval_loss": 0.07792137563228607, "eval_runtime": 27.3248, "eval_samples_per_second": 73.194, "eval_steps_per_second": 0.293, "step": 300 }, { "epoch": 0.8443271767810027, "grad_norm": 0.1630859375, "learning_rate": 0.00016962843295638126, "loss": 0.1222, "step": 320 }, { "epoch": 0.8970976253298153, "grad_norm": 0.173828125, "learning_rate": 0.0001674744211093161, "loss": 0.1254, "step": 340 }, { "epoch": 0.9234828496042217, "eval_loss": 0.07484881579875946, "eval_runtime": 27.8135, "eval_samples_per_second": 71.907, "eval_steps_per_second": 0.288, "step": 350 }, { "epoch": 0.9498680738786279, "grad_norm": 0.1728515625, "learning_rate": 0.00016532040926225093, "loss": 0.1177, "step": 360 }, { "epoch": 1.0026385224274406, "grad_norm": 0.1220703125, "learning_rate": 0.0001631663974151858, "loss": 0.1207, "step": 380 }, { "epoch": 1.0554089709762533, "grad_norm": 0.1591796875, "learning_rate": 0.00016101238556812063, "loss": 0.1046, "step": 400 }, { "epoch": 1.0554089709762533, "eval_loss": 0.0715707540512085, "eval_runtime": 27.7758, "eval_samples_per_second": 72.005, "eval_steps_per_second": 0.288, "step": 400 }, { "epoch": 1.108179419525066, "grad_norm": 0.1142578125, "learning_rate": 0.0001588583737210555, "loss": 0.1041, "step": 420 }, { "epoch": 1.1609498680738786, "grad_norm": 0.177734375, "learning_rate": 0.00015670436187399032, "loss": 0.1034, "step": 440 }, { "epoch": 1.187335092348285, "eval_loss": 0.0693235993385315, "eval_runtime": 27.7658, "eval_samples_per_second": 72.031, "eval_steps_per_second": 0.288, "step": 450 }, { "epoch": 1.2137203166226913, "grad_norm": 0.1630859375, "learning_rate": 0.00015455035002692516, "loss": 0.1042, "step": 460 }, { "epoch": 1.266490765171504, "grad_norm": 0.1611328125, "learning_rate": 0.00015239633817986, "loss": 0.1032, "step": 480 }, { "epoch": 1.3192612137203166, "grad_norm": 0.169921875, "learning_rate": 0.00015024232633279485, "loss": 0.1021, "step": 500 }, { "epoch": 1.3192612137203166, "eval_loss": 0.06579812616109848, "eval_runtime": 27.42, "eval_samples_per_second": 72.939, "eval_steps_per_second": 0.292, "step": 500 }, { "epoch": 1.3720316622691293, "grad_norm": 0.1611328125, "learning_rate": 0.0001480883144857297, "loss": 0.1041, "step": 520 }, { "epoch": 1.424802110817942, "grad_norm": 0.11474609375, "learning_rate": 0.00014593430263866452, "loss": 0.1006, "step": 540 }, { "epoch": 1.4511873350923483, "eval_loss": 0.06417644023895264, "eval_runtime": 27.5371, "eval_samples_per_second": 72.629, "eval_steps_per_second": 0.291, "step": 550 }, { "epoch": 1.4775725593667546, "grad_norm": 0.1259765625, "learning_rate": 0.00014378029079159936, "loss": 0.1001, "step": 560 }, { "epoch": 1.5303430079155673, "grad_norm": 0.146484375, "learning_rate": 0.0001416262789445342, "loss": 0.1013, "step": 580 }, { "epoch": 1.58311345646438, "grad_norm": 0.1591796875, "learning_rate": 0.00013947226709746903, "loss": 0.1, "step": 600 }, { "epoch": 1.58311345646438, "eval_loss": 0.06583409756422043, "eval_runtime": 28.0223, "eval_samples_per_second": 71.372, "eval_steps_per_second": 0.285, "step": 600 }, { "epoch": 1.6358839050131926, "grad_norm": 0.1611328125, "learning_rate": 0.0001373182552504039, "loss": 0.1021, "step": 620 }, { "epoch": 1.6886543535620053, "grad_norm": 0.14453125, "learning_rate": 0.00013516424340333873, "loss": 0.1002, "step": 640 }, { "epoch": 1.7150395778364116, "eval_loss": 0.06498919427394867, "eval_runtime": 28.3581, "eval_samples_per_second": 70.527, "eval_steps_per_second": 0.282, "step": 650 }, { "epoch": 1.741424802110818, "grad_norm": 0.111328125, "learning_rate": 0.00013301023155627356, "loss": 0.0967, "step": 660 }, { "epoch": 1.7941952506596306, "grad_norm": 0.1884765625, "learning_rate": 0.0001308562197092084, "loss": 0.1004, "step": 680 }, { "epoch": 1.8469656992084431, "grad_norm": 0.13671875, "learning_rate": 0.00012870220786214323, "loss": 0.0992, "step": 700 }, { "epoch": 1.8469656992084431, "eval_loss": 0.06491042673587799, "eval_runtime": 27.748, "eval_samples_per_second": 72.077, "eval_steps_per_second": 0.288, "step": 700 }, { "epoch": 1.899736147757256, "grad_norm": 0.15234375, "learning_rate": 0.0001265481960150781, "loss": 0.0967, "step": 720 }, { "epoch": 1.9525065963060686, "grad_norm": 0.12451171875, "learning_rate": 0.00012439418416801293, "loss": 0.0956, "step": 740 }, { "epoch": 1.978891820580475, "eval_loss": 0.06425958126783371, "eval_runtime": 27.654, "eval_samples_per_second": 72.322, "eval_steps_per_second": 0.289, "step": 750 }, { "epoch": 2.005277044854881, "grad_norm": 0.12060546875, "learning_rate": 0.0001222401723209478, "loss": 0.0934, "step": 760 }, { "epoch": 2.058047493403694, "grad_norm": 0.171875, "learning_rate": 0.00012008616047388261, "loss": 0.0907, "step": 780 }, { "epoch": 2.1108179419525066, "grad_norm": 0.16796875, "learning_rate": 0.00011793214862681745, "loss": 0.0861, "step": 800 }, { "epoch": 2.1108179419525066, "eval_loss": 0.06223862245678902, "eval_runtime": 27.4046, "eval_samples_per_second": 72.981, "eval_steps_per_second": 0.292, "step": 800 }, { "epoch": 2.163588390501319, "grad_norm": 0.134765625, "learning_rate": 0.0001157781367797523, "loss": 0.0864, "step": 820 }, { "epoch": 2.216358839050132, "grad_norm": 0.123046875, "learning_rate": 0.00011362412493268713, "loss": 0.0842, "step": 840 }, { "epoch": 2.242744063324538, "eval_loss": 0.060463495552539825, "eval_runtime": 27.4597, "eval_samples_per_second": 72.834, "eval_steps_per_second": 0.291, "step": 850 }, { "epoch": 2.2691292875989446, "grad_norm": 0.1435546875, "learning_rate": 0.00011147011308562199, "loss": 0.0863, "step": 860 }, { "epoch": 2.321899736147757, "grad_norm": 0.1494140625, "learning_rate": 0.00010931610123855683, "loss": 0.0858, "step": 880 }, { "epoch": 2.37467018469657, "grad_norm": 0.1259765625, "learning_rate": 0.00010716208939149166, "loss": 0.0866, "step": 900 }, { "epoch": 2.37467018469657, "eval_loss": 0.06099672615528107, "eval_runtime": 27.7635, "eval_samples_per_second": 72.037, "eval_steps_per_second": 0.288, "step": 900 }, { "epoch": 2.4274406332453826, "grad_norm": 0.1376953125, "learning_rate": 0.0001050080775444265, "loss": 0.0873, "step": 920 }, { "epoch": 2.480211081794195, "grad_norm": 0.158203125, "learning_rate": 0.00010285406569736133, "loss": 0.0853, "step": 940 }, { "epoch": 2.5065963060686016, "eval_loss": 0.06115744262933731, "eval_runtime": 27.8521, "eval_samples_per_second": 71.808, "eval_steps_per_second": 0.287, "step": 950 }, { "epoch": 2.532981530343008, "grad_norm": 0.1259765625, "learning_rate": 0.00010070005385029618, "loss": 0.0849, "step": 960 }, { "epoch": 2.5857519788918206, "grad_norm": 0.1318359375, "learning_rate": 9.854604200323103e-05, "loss": 0.0814, "step": 980 }, { "epoch": 2.638522427440633, "grad_norm": 0.1376953125, "learning_rate": 9.639203015616588e-05, "loss": 0.0864, "step": 1000 }, { "epoch": 2.638522427440633, "eval_loss": 0.05968466028571129, "eval_runtime": 27.6897, "eval_samples_per_second": 72.229, "eval_steps_per_second": 0.289, "step": 1000 }, { "epoch": 2.691292875989446, "grad_norm": 0.16015625, "learning_rate": 9.423801830910071e-05, "loss": 0.0869, "step": 1020 }, { "epoch": 2.7440633245382586, "grad_norm": 0.12890625, "learning_rate": 9.208400646203555e-05, "loss": 0.0821, "step": 1040 }, { "epoch": 2.7704485488126647, "eval_loss": 0.059157080948352814, "eval_runtime": 27.7435, "eval_samples_per_second": 72.089, "eval_steps_per_second": 0.288, "step": 1050 }, { "epoch": 2.796833773087071, "grad_norm": 0.1337890625, "learning_rate": 8.99299946149704e-05, "loss": 0.0842, "step": 1060 }, { "epoch": 2.849604221635884, "grad_norm": 0.1513671875, "learning_rate": 8.777598276790523e-05, "loss": 0.0846, "step": 1080 }, { "epoch": 2.9023746701846966, "grad_norm": 0.1328125, "learning_rate": 8.562197092084006e-05, "loss": 0.0841, "step": 1100 }, { "epoch": 2.9023746701846966, "eval_loss": 0.05879725515842438, "eval_runtime": 27.612, "eval_samples_per_second": 72.432, "eval_steps_per_second": 0.29, "step": 1100 }, { "epoch": 2.955145118733509, "grad_norm": 0.1455078125, "learning_rate": 8.346795907377491e-05, "loss": 0.0809, "step": 1120 }, { "epoch": 3.007915567282322, "grad_norm": 0.1259765625, "learning_rate": 8.131394722670975e-05, "loss": 0.0815, "step": 1140 }, { "epoch": 3.034300791556728, "eval_loss": 0.05831225588917732, "eval_runtime": 27.6258, "eval_samples_per_second": 72.396, "eval_steps_per_second": 0.29, "step": 1150 }, { "epoch": 3.0606860158311346, "grad_norm": 0.130859375, "learning_rate": 7.91599353796446e-05, "loss": 0.0793, "step": 1160 }, { "epoch": 3.113456464379947, "grad_norm": 0.1435546875, "learning_rate": 7.700592353257944e-05, "loss": 0.0775, "step": 1180 }, { "epoch": 3.16622691292876, "grad_norm": 0.1357421875, "learning_rate": 7.485191168551428e-05, "loss": 0.0795, "step": 1200 }, { "epoch": 3.16622691292876, "eval_loss": 0.0580158606171608, "eval_runtime": 27.9777, "eval_samples_per_second": 71.485, "eval_steps_per_second": 0.286, "step": 1200 }, { "epoch": 3.2189973614775726, "grad_norm": 0.1220703125, "learning_rate": 7.269789983844911e-05, "loss": 0.0766, "step": 1220 }, { "epoch": 3.271767810026385, "grad_norm": 0.1318359375, "learning_rate": 7.054388799138396e-05, "loss": 0.0732, "step": 1240 }, { "epoch": 3.2981530343007917, "eval_loss": 0.057783834636211395, "eval_runtime": 28.6683, "eval_samples_per_second": 69.763, "eval_steps_per_second": 0.279, "step": 1250 }, { "epoch": 3.324538258575198, "grad_norm": 0.130859375, "learning_rate": 6.83898761443188e-05, "loss": 0.0754, "step": 1260 }, { "epoch": 3.3773087071240107, "grad_norm": 0.1611328125, "learning_rate": 6.623586429725363e-05, "loss": 0.0793, "step": 1280 }, { "epoch": 3.430079155672823, "grad_norm": 0.1181640625, "learning_rate": 6.408185245018848e-05, "loss": 0.076, "step": 1300 }, { "epoch": 3.430079155672823, "eval_loss": 0.05801219865679741, "eval_runtime": 28.2125, "eval_samples_per_second": 70.891, "eval_steps_per_second": 0.284, "step": 1300 }, { "epoch": 3.4828496042216357, "grad_norm": 0.1611328125, "learning_rate": 6.192784060312333e-05, "loss": 0.0745, "step": 1320 }, { "epoch": 3.5356200527704487, "grad_norm": 0.1142578125, "learning_rate": 5.9773828756058156e-05, "loss": 0.0766, "step": 1340 }, { "epoch": 3.5620052770448547, "eval_loss": 0.05800151824951172, "eval_runtime": 27.919, "eval_samples_per_second": 71.636, "eval_steps_per_second": 0.287, "step": 1350 }, { "epoch": 3.588390501319261, "grad_norm": 0.140625, "learning_rate": 5.7619816908993005e-05, "loss": 0.0753, "step": 1360 }, { "epoch": 3.641160949868074, "grad_norm": 0.1328125, "learning_rate": 5.5465805061927846e-05, "loss": 0.0772, "step": 1380 }, { "epoch": 3.6939313984168867, "grad_norm": 0.1328125, "learning_rate": 5.331179321486268e-05, "loss": 0.0716, "step": 1400 }, { "epoch": 3.6939313984168867, "eval_loss": 0.057653266936540604, "eval_runtime": 28.2955, "eval_samples_per_second": 70.683, "eval_steps_per_second": 0.283, "step": 1400 }, { "epoch": 3.746701846965699, "grad_norm": 0.1513671875, "learning_rate": 5.115778136779753e-05, "loss": 0.0744, "step": 1420 }, { "epoch": 3.7994722955145117, "grad_norm": 0.1259765625, "learning_rate": 4.9003769520732365e-05, "loss": 0.0777, "step": 1440 }, { "epoch": 3.825857519788918, "eval_loss": 0.05697743222117424, "eval_runtime": 28.2563, "eval_samples_per_second": 70.781, "eval_steps_per_second": 0.283, "step": 1450 }, { "epoch": 3.8522427440633247, "grad_norm": 0.1640625, "learning_rate": 4.6849757673667206e-05, "loss": 0.0736, "step": 1460 }, { "epoch": 3.905013192612137, "grad_norm": 0.1318359375, "learning_rate": 4.469574582660205e-05, "loss": 0.0753, "step": 1480 }, { "epoch": 3.9577836411609497, "grad_norm": 0.12255859375, "learning_rate": 4.254173397953689e-05, "loss": 0.0745, "step": 1500 }, { "epoch": 3.9577836411609497, "eval_loss": 0.05676369369029999, "eval_runtime": 27.6767, "eval_samples_per_second": 72.263, "eval_steps_per_second": 0.289, "step": 1500 } ], "logging_steps": 20, "max_steps": 1895, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.03702605821971e+19, "train_batch_size": 128, "trial_name": null, "trial_params": null }