{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99626400996264, "eval_steps": 500, "global_step": 1203, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024906600249066, "grad_norm": 10.55736720812015, "learning_rate": 5e-06, "loss": 0.899, "step": 10 }, { "epoch": 0.049813200498132, "grad_norm": 2.4056192000192786, "learning_rate": 5e-06, "loss": 0.7632, "step": 20 }, { "epoch": 0.074719800747198, "grad_norm": 1.0776649113170846, "learning_rate": 5e-06, "loss": 0.7213, "step": 30 }, { "epoch": 0.099626400996264, "grad_norm": 1.2974590316911792, "learning_rate": 5e-06, "loss": 0.6892, "step": 40 }, { "epoch": 0.12453300124533001, "grad_norm": 0.8664841939523225, "learning_rate": 5e-06, "loss": 0.6708, "step": 50 }, { "epoch": 0.149439601494396, "grad_norm": 0.7916914326975599, "learning_rate": 5e-06, "loss": 0.6504, "step": 60 }, { "epoch": 0.17434620174346202, "grad_norm": 0.8355113234133088, "learning_rate": 5e-06, "loss": 0.6407, "step": 70 }, { "epoch": 0.199252801992528, "grad_norm": 0.6185509662301869, "learning_rate": 5e-06, "loss": 0.6377, "step": 80 }, { "epoch": 0.22415940224159403, "grad_norm": 0.5793150300164642, "learning_rate": 5e-06, "loss": 0.6276, "step": 90 }, { "epoch": 0.24906600249066002, "grad_norm": 0.5522899454819212, "learning_rate": 5e-06, "loss": 0.6229, "step": 100 }, { "epoch": 0.273972602739726, "grad_norm": 0.6471095275689863, "learning_rate": 5e-06, "loss": 0.6154, "step": 110 }, { "epoch": 0.298879202988792, "grad_norm": 0.6264851589702339, "learning_rate": 5e-06, "loss": 0.6104, "step": 120 }, { "epoch": 0.32378580323785805, "grad_norm": 0.705297908293922, "learning_rate": 5e-06, "loss": 0.6094, "step": 130 }, { "epoch": 0.34869240348692404, "grad_norm": 0.620653193497072, "learning_rate": 5e-06, "loss": 0.6079, "step": 140 }, { "epoch": 0.37359900373599003, "grad_norm": 0.9444217688486999, "learning_rate": 5e-06, "loss": 0.5983, "step": 150 }, { "epoch": 0.398505603985056, "grad_norm": 0.6034795779576418, "learning_rate": 5e-06, "loss": 0.6044, "step": 160 }, { "epoch": 0.42341220423412207, "grad_norm": 0.4974203412589217, "learning_rate": 5e-06, "loss": 0.5962, "step": 170 }, { "epoch": 0.44831880448318806, "grad_norm": 0.728230577535382, "learning_rate": 5e-06, "loss": 0.5975, "step": 180 }, { "epoch": 0.47322540473225405, "grad_norm": 0.49214457212025137, "learning_rate": 5e-06, "loss": 0.5986, "step": 190 }, { "epoch": 0.49813200498132004, "grad_norm": 0.5432395369586009, "learning_rate": 5e-06, "loss": 0.5885, "step": 200 }, { "epoch": 0.523038605230386, "grad_norm": 0.5473973456491285, "learning_rate": 5e-06, "loss": 0.5901, "step": 210 }, { "epoch": 0.547945205479452, "grad_norm": 0.5759522711461309, "learning_rate": 5e-06, "loss": 0.586, "step": 220 }, { "epoch": 0.572851805728518, "grad_norm": 0.5583334053126371, "learning_rate": 5e-06, "loss": 0.5878, "step": 230 }, { "epoch": 0.597758405977584, "grad_norm": 0.7058646192825392, "learning_rate": 5e-06, "loss": 0.5818, "step": 240 }, { "epoch": 0.6226650062266501, "grad_norm": 0.5788660860127395, "learning_rate": 5e-06, "loss": 0.5853, "step": 250 }, { "epoch": 0.6475716064757161, "grad_norm": 0.6402936090502463, "learning_rate": 5e-06, "loss": 0.5769, "step": 260 }, { "epoch": 0.6724782067247821, "grad_norm": 0.5496743019012175, "learning_rate": 5e-06, "loss": 0.5762, "step": 270 }, { "epoch": 0.6973848069738481, "grad_norm": 0.5455503536049673, "learning_rate": 5e-06, "loss": 0.5808, "step": 280 }, { "epoch": 0.7222914072229141, "grad_norm": 0.5883422768851626, "learning_rate": 5e-06, "loss": 0.5747, "step": 290 }, { "epoch": 0.7471980074719801, "grad_norm": 0.7956726883043881, "learning_rate": 5e-06, "loss": 0.5717, "step": 300 }, { "epoch": 0.772104607721046, "grad_norm": 0.6813019491351078, "learning_rate": 5e-06, "loss": 0.5722, "step": 310 }, { "epoch": 0.797011207970112, "grad_norm": 0.5185381604929972, "learning_rate": 5e-06, "loss": 0.5724, "step": 320 }, { "epoch": 0.821917808219178, "grad_norm": 0.6909227667005327, "learning_rate": 5e-06, "loss": 0.5691, "step": 330 }, { "epoch": 0.8468244084682441, "grad_norm": 0.4796775398588659, "learning_rate": 5e-06, "loss": 0.5652, "step": 340 }, { "epoch": 0.8717310087173101, "grad_norm": 0.5758165163544798, "learning_rate": 5e-06, "loss": 0.5736, "step": 350 }, { "epoch": 0.8966376089663761, "grad_norm": 0.5892321233613834, "learning_rate": 5e-06, "loss": 0.5663, "step": 360 }, { "epoch": 0.9215442092154421, "grad_norm": 0.5088131774112012, "learning_rate": 5e-06, "loss": 0.5674, "step": 370 }, { "epoch": 0.9464508094645081, "grad_norm": 0.5017508892526636, "learning_rate": 5e-06, "loss": 0.563, "step": 380 }, { "epoch": 0.9713574097135741, "grad_norm": 0.6797455724948757, "learning_rate": 5e-06, "loss": 0.5674, "step": 390 }, { "epoch": 0.9962640099626401, "grad_norm": 0.5319255477795009, "learning_rate": 5e-06, "loss": 0.562, "step": 400 }, { "epoch": 0.9987546699875467, "eval_loss": 0.5648990869522095, "eval_runtime": 215.2105, "eval_samples_per_second": 50.272, "eval_steps_per_second": 0.395, "step": 401 }, { "epoch": 1.0211706102117062, "grad_norm": 0.6332423226665538, "learning_rate": 5e-06, "loss": 0.5373, "step": 410 }, { "epoch": 1.046077210460772, "grad_norm": 0.5250206863412192, "learning_rate": 5e-06, "loss": 0.5262, "step": 420 }, { "epoch": 1.0709838107098382, "grad_norm": 0.6270808658082102, "learning_rate": 5e-06, "loss": 0.5275, "step": 430 }, { "epoch": 1.095890410958904, "grad_norm": 0.5516867253896128, "learning_rate": 5e-06, "loss": 0.5262, "step": 440 }, { "epoch": 1.1207970112079702, "grad_norm": 0.5358792491430266, "learning_rate": 5e-06, "loss": 0.5319, "step": 450 }, { "epoch": 1.145703611457036, "grad_norm": 0.7694988300392477, "learning_rate": 5e-06, "loss": 0.5251, "step": 460 }, { "epoch": 1.1706102117061021, "grad_norm": 0.5425248792129497, "learning_rate": 5e-06, "loss": 0.5255, "step": 470 }, { "epoch": 1.195516811955168, "grad_norm": 0.6608067800821592, "learning_rate": 5e-06, "loss": 0.5312, "step": 480 }, { "epoch": 1.2204234122042341, "grad_norm": 0.5892306715949471, "learning_rate": 5e-06, "loss": 0.5292, "step": 490 }, { "epoch": 1.2453300124533002, "grad_norm": 0.6342257885292061, "learning_rate": 5e-06, "loss": 0.5282, "step": 500 }, { "epoch": 1.270236612702366, "grad_norm": 0.6358865929230534, "learning_rate": 5e-06, "loss": 0.5256, "step": 510 }, { "epoch": 1.2951432129514322, "grad_norm": 0.5977464268642516, "learning_rate": 5e-06, "loss": 0.5216, "step": 520 }, { "epoch": 1.320049813200498, "grad_norm": 0.5713712938047752, "learning_rate": 5e-06, "loss": 0.5246, "step": 530 }, { "epoch": 1.3449564134495642, "grad_norm": 0.5806626902728351, "learning_rate": 5e-06, "loss": 0.522, "step": 540 }, { "epoch": 1.36986301369863, "grad_norm": 0.6614502597905593, "learning_rate": 5e-06, "loss": 0.5232, "step": 550 }, { "epoch": 1.3947696139476962, "grad_norm": 0.5826714843440824, "learning_rate": 5e-06, "loss": 0.5243, "step": 560 }, { "epoch": 1.419676214196762, "grad_norm": 0.5870354154595686, "learning_rate": 5e-06, "loss": 0.5277, "step": 570 }, { "epoch": 1.4445828144458281, "grad_norm": 0.5517404762107168, "learning_rate": 5e-06, "loss": 0.5185, "step": 580 }, { "epoch": 1.4694894146948942, "grad_norm": 0.6548505841520894, "learning_rate": 5e-06, "loss": 0.5254, "step": 590 }, { "epoch": 1.4943960149439601, "grad_norm": 0.4798373315690756, "learning_rate": 5e-06, "loss": 0.5223, "step": 600 }, { "epoch": 1.519302615193026, "grad_norm": 0.561260368162772, "learning_rate": 5e-06, "loss": 0.5208, "step": 610 }, { "epoch": 1.544209215442092, "grad_norm": 0.4776435289710049, "learning_rate": 5e-06, "loss": 0.5169, "step": 620 }, { "epoch": 1.5691158156911582, "grad_norm": 0.5736062837326033, "learning_rate": 5e-06, "loss": 0.518, "step": 630 }, { "epoch": 1.5940224159402243, "grad_norm": 0.5605397398468376, "learning_rate": 5e-06, "loss": 0.519, "step": 640 }, { "epoch": 1.6189290161892902, "grad_norm": 0.5414003173485504, "learning_rate": 5e-06, "loss": 0.5141, "step": 650 }, { "epoch": 1.643835616438356, "grad_norm": 0.5029725162688677, "learning_rate": 5e-06, "loss": 0.5123, "step": 660 }, { "epoch": 1.6687422166874222, "grad_norm": 0.49094415690364374, "learning_rate": 5e-06, "loss": 0.5103, "step": 670 }, { "epoch": 1.6936488169364883, "grad_norm": 0.6443717801298894, "learning_rate": 5e-06, "loss": 0.5173, "step": 680 }, { "epoch": 1.7185554171855542, "grad_norm": 0.6131661065579275, "learning_rate": 5e-06, "loss": 0.5123, "step": 690 }, { "epoch": 1.74346201743462, "grad_norm": 0.5102402228309177, "learning_rate": 5e-06, "loss": 0.5117, "step": 700 }, { "epoch": 1.7683686176836861, "grad_norm": 0.6532768204087063, "learning_rate": 5e-06, "loss": 0.5116, "step": 710 }, { "epoch": 1.7932752179327522, "grad_norm": 0.5359103919656766, "learning_rate": 5e-06, "loss": 0.5112, "step": 720 }, { "epoch": 1.8181818181818183, "grad_norm": 0.4654117626073739, "learning_rate": 5e-06, "loss": 0.5108, "step": 730 }, { "epoch": 1.8430884184308842, "grad_norm": 0.4816605018327504, "learning_rate": 5e-06, "loss": 0.5125, "step": 740 }, { "epoch": 1.86799501867995, "grad_norm": 0.5078445355540058, "learning_rate": 5e-06, "loss": 0.5174, "step": 750 }, { "epoch": 1.8929016189290162, "grad_norm": 0.5123586401271214, "learning_rate": 5e-06, "loss": 0.5103, "step": 760 }, { "epoch": 1.9178082191780823, "grad_norm": 0.512594100070009, "learning_rate": 5e-06, "loss": 0.5125, "step": 770 }, { "epoch": 1.9427148194271482, "grad_norm": 0.5251535719075618, "learning_rate": 5e-06, "loss": 0.5097, "step": 780 }, { "epoch": 1.967621419676214, "grad_norm": 0.5598844434696433, "learning_rate": 5e-06, "loss": 0.5114, "step": 790 }, { "epoch": 1.9925280199252802, "grad_norm": 0.5580903372050761, "learning_rate": 5e-06, "loss": 0.5112, "step": 800 }, { "epoch": 2.0, "eval_loss": 0.5391710996627808, "eval_runtime": 215.2899, "eval_samples_per_second": 50.253, "eval_steps_per_second": 0.395, "step": 803 }, { "epoch": 2.0174346201743463, "grad_norm": 0.7489739683512228, "learning_rate": 5e-06, "loss": 0.4772, "step": 810 }, { "epoch": 2.0423412204234124, "grad_norm": 0.6206410773051714, "learning_rate": 5e-06, "loss": 0.4773, "step": 820 }, { "epoch": 2.067247820672478, "grad_norm": 0.5964416221117852, "learning_rate": 5e-06, "loss": 0.4774, "step": 830 }, { "epoch": 2.092154420921544, "grad_norm": 0.7072977668899793, "learning_rate": 5e-06, "loss": 0.4734, "step": 840 }, { "epoch": 2.1170610211706102, "grad_norm": 0.4869213806678473, "learning_rate": 5e-06, "loss": 0.4738, "step": 850 }, { "epoch": 2.1419676214196763, "grad_norm": 0.5559708671866962, "learning_rate": 5e-06, "loss": 0.4739, "step": 860 }, { "epoch": 2.166874221668742, "grad_norm": 0.590838180422237, "learning_rate": 5e-06, "loss": 0.4781, "step": 870 }, { "epoch": 2.191780821917808, "grad_norm": 0.5449563311605545, "learning_rate": 5e-06, "loss": 0.4768, "step": 880 }, { "epoch": 2.216687422166874, "grad_norm": 0.49509703126867355, "learning_rate": 5e-06, "loss": 0.4702, "step": 890 }, { "epoch": 2.2415940224159403, "grad_norm": 0.583235606431074, "learning_rate": 5e-06, "loss": 0.4785, "step": 900 }, { "epoch": 2.2665006226650064, "grad_norm": 0.5224340118110492, "learning_rate": 5e-06, "loss": 0.4688, "step": 910 }, { "epoch": 2.291407222914072, "grad_norm": 0.5751594164855693, "learning_rate": 5e-06, "loss": 0.4685, "step": 920 }, { "epoch": 2.316313823163138, "grad_norm": 0.5440703036591379, "learning_rate": 5e-06, "loss": 0.4773, "step": 930 }, { "epoch": 2.3412204234122043, "grad_norm": 0.7610931862664628, "learning_rate": 5e-06, "loss": 0.472, "step": 940 }, { "epoch": 2.3661270236612704, "grad_norm": 0.5531311055353737, "learning_rate": 5e-06, "loss": 0.4708, "step": 950 }, { "epoch": 2.391033623910336, "grad_norm": 0.511600827539314, "learning_rate": 5e-06, "loss": 0.4764, "step": 960 }, { "epoch": 2.415940224159402, "grad_norm": 0.5621388473604839, "learning_rate": 5e-06, "loss": 0.4792, "step": 970 }, { "epoch": 2.4408468244084682, "grad_norm": 0.574647600528262, "learning_rate": 5e-06, "loss": 0.4722, "step": 980 }, { "epoch": 2.4657534246575343, "grad_norm": 0.5174894983673209, "learning_rate": 5e-06, "loss": 0.4752, "step": 990 }, { "epoch": 2.4906600249066004, "grad_norm": 0.49985821335778424, "learning_rate": 5e-06, "loss": 0.4824, "step": 1000 }, { "epoch": 2.515566625155666, "grad_norm": 0.6533346544154589, "learning_rate": 5e-06, "loss": 0.468, "step": 1010 }, { "epoch": 2.540473225404732, "grad_norm": 0.5759385792339666, "learning_rate": 5e-06, "loss": 0.4759, "step": 1020 }, { "epoch": 2.5653798256537983, "grad_norm": 0.5706835517348989, "learning_rate": 5e-06, "loss": 0.473, "step": 1030 }, { "epoch": 2.5902864259028644, "grad_norm": 0.48023447741714353, "learning_rate": 5e-06, "loss": 0.4758, "step": 1040 }, { "epoch": 2.61519302615193, "grad_norm": 0.48468151801419096, "learning_rate": 5e-06, "loss": 0.4721, "step": 1050 }, { "epoch": 2.640099626400996, "grad_norm": 0.5091065263641408, "learning_rate": 5e-06, "loss": 0.479, "step": 1060 }, { "epoch": 2.6650062266500623, "grad_norm": 0.5876312291375192, "learning_rate": 5e-06, "loss": 0.471, "step": 1070 }, { "epoch": 2.6899128268991284, "grad_norm": 0.5815028633704035, "learning_rate": 5e-06, "loss": 0.4762, "step": 1080 }, { "epoch": 2.7148194271481945, "grad_norm": 0.5605250992780186, "learning_rate": 5e-06, "loss": 0.4735, "step": 1090 }, { "epoch": 2.73972602739726, "grad_norm": 0.5741942655339728, "learning_rate": 5e-06, "loss": 0.4747, "step": 1100 }, { "epoch": 2.7646326276463262, "grad_norm": 0.4775069856578055, "learning_rate": 5e-06, "loss": 0.4757, "step": 1110 }, { "epoch": 2.7895392278953923, "grad_norm": 0.498224788090082, "learning_rate": 5e-06, "loss": 0.4741, "step": 1120 }, { "epoch": 2.8144458281444584, "grad_norm": 0.48637748558305866, "learning_rate": 5e-06, "loss": 0.4767, "step": 1130 }, { "epoch": 2.839352428393524, "grad_norm": 0.5090237580915907, "learning_rate": 5e-06, "loss": 0.4734, "step": 1140 }, { "epoch": 2.86425902864259, "grad_norm": 0.4947938642895003, "learning_rate": 5e-06, "loss": 0.4782, "step": 1150 }, { "epoch": 2.8891656288916563, "grad_norm": 0.5195966815447782, "learning_rate": 5e-06, "loss": 0.4803, "step": 1160 }, { "epoch": 2.9140722291407224, "grad_norm": 0.5212585367090384, "learning_rate": 5e-06, "loss": 0.4722, "step": 1170 }, { "epoch": 2.9389788293897885, "grad_norm": 0.514479905781623, "learning_rate": 5e-06, "loss": 0.4752, "step": 1180 }, { "epoch": 2.963885429638854, "grad_norm": 0.5039971892926054, "learning_rate": 5e-06, "loss": 0.475, "step": 1190 }, { "epoch": 2.9887920298879203, "grad_norm": 0.4924909134578761, "learning_rate": 5e-06, "loss": 0.4697, "step": 1200 }, { "epoch": 2.99626400996264, "eval_loss": 0.5331025123596191, "eval_runtime": 217.1359, "eval_samples_per_second": 49.826, "eval_steps_per_second": 0.391, "step": 1203 }, { "epoch": 2.99626400996264, "step": 1203, "total_flos": 2014651046952960.0, "train_loss": 0.5350556444943397, "train_runtime": 35909.5127, "train_samples_per_second": 17.172, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 1203, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2014651046952960.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }