{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9957446808510637, "eval_steps": 500, "global_step": 1320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02269503546099291, "grad_norm": 2.4400690431145335, "learning_rate": 5e-06, "loss": 0.9537, "step": 10 }, { "epoch": 0.04539007092198582, "grad_norm": 1.0371553805334768, "learning_rate": 5e-06, "loss": 0.8304, "step": 20 }, { "epoch": 0.06808510638297872, "grad_norm": 0.8207371359291633, "learning_rate": 5e-06, "loss": 0.7801, "step": 30 }, { "epoch": 0.09078014184397164, "grad_norm": 0.7210586495004421, "learning_rate": 5e-06, "loss": 0.7522, "step": 40 }, { "epoch": 0.11347517730496454, "grad_norm": 0.815177982428463, "learning_rate": 5e-06, "loss": 0.739, "step": 50 }, { "epoch": 0.13617021276595745, "grad_norm": 2.1607006669022053, "learning_rate": 5e-06, "loss": 0.731, "step": 60 }, { "epoch": 0.15886524822695036, "grad_norm": 0.8260032479277283, "learning_rate": 5e-06, "loss": 0.7245, "step": 70 }, { "epoch": 0.18156028368794327, "grad_norm": 0.6043557996158753, "learning_rate": 5e-06, "loss": 0.709, "step": 80 }, { "epoch": 0.20425531914893616, "grad_norm": 0.6817812588474443, "learning_rate": 5e-06, "loss": 0.7065, "step": 90 }, { "epoch": 0.22695035460992907, "grad_norm": 0.6633270827705589, "learning_rate": 5e-06, "loss": 0.7042, "step": 100 }, { "epoch": 0.24964539007092199, "grad_norm": 0.8788640026205816, "learning_rate": 5e-06, "loss": 0.6937, "step": 110 }, { "epoch": 0.2723404255319149, "grad_norm": 0.6318189161156693, "learning_rate": 5e-06, "loss": 0.6875, "step": 120 }, { "epoch": 0.2950354609929078, "grad_norm": 0.6116775844736505, "learning_rate": 5e-06, "loss": 0.6882, "step": 130 }, { "epoch": 0.3177304964539007, "grad_norm": 0.5442399734240484, "learning_rate": 5e-06, "loss": 0.6829, "step": 140 }, { "epoch": 0.3404255319148936, "grad_norm": 0.772917712294573, "learning_rate": 5e-06, "loss": 0.6778, "step": 150 }, { "epoch": 0.36312056737588655, "grad_norm": 0.5042750571740913, "learning_rate": 5e-06, "loss": 0.6826, "step": 160 }, { "epoch": 0.38581560283687943, "grad_norm": 0.5272476451832978, "learning_rate": 5e-06, "loss": 0.6791, "step": 170 }, { "epoch": 0.4085106382978723, "grad_norm": 0.6477667523961554, "learning_rate": 5e-06, "loss": 0.6778, "step": 180 }, { "epoch": 0.43120567375886526, "grad_norm": 0.6825001542564298, "learning_rate": 5e-06, "loss": 0.6727, "step": 190 }, { "epoch": 0.45390070921985815, "grad_norm": 0.7223077265284692, "learning_rate": 5e-06, "loss": 0.6766, "step": 200 }, { "epoch": 0.4765957446808511, "grad_norm": 0.7610671419515379, "learning_rate": 5e-06, "loss": 0.6754, "step": 210 }, { "epoch": 0.49929078014184397, "grad_norm": 0.5918397478370985, "learning_rate": 5e-06, "loss": 0.6707, "step": 220 }, { "epoch": 0.5219858156028369, "grad_norm": 0.5937553589475805, "learning_rate": 5e-06, "loss": 0.6695, "step": 230 }, { "epoch": 0.5446808510638298, "grad_norm": 0.6308694971546095, "learning_rate": 5e-06, "loss": 0.6706, "step": 240 }, { "epoch": 0.5673758865248227, "grad_norm": 0.5145635067713508, "learning_rate": 5e-06, "loss": 0.6638, "step": 250 }, { "epoch": 0.5900709219858156, "grad_norm": 0.8960582295099536, "learning_rate": 5e-06, "loss": 0.659, "step": 260 }, { "epoch": 0.6127659574468085, "grad_norm": 0.5714201206969344, "learning_rate": 5e-06, "loss": 0.6648, "step": 270 }, { "epoch": 0.6354609929078014, "grad_norm": 0.5218865972600176, "learning_rate": 5e-06, "loss": 0.6692, "step": 280 }, { "epoch": 0.6581560283687943, "grad_norm": 0.908425551953448, "learning_rate": 5e-06, "loss": 0.6603, "step": 290 }, { "epoch": 0.6808510638297872, "grad_norm": 0.5937599038511239, "learning_rate": 5e-06, "loss": 0.6616, "step": 300 }, { "epoch": 0.7035460992907802, "grad_norm": 0.5791492049959488, "learning_rate": 5e-06, "loss": 0.6573, "step": 310 }, { "epoch": 0.7262411347517731, "grad_norm": 0.5544912073084471, "learning_rate": 5e-06, "loss": 0.6561, "step": 320 }, { "epoch": 0.7489361702127659, "grad_norm": 0.6161595885325214, "learning_rate": 5e-06, "loss": 0.6535, "step": 330 }, { "epoch": 0.7716312056737589, "grad_norm": 0.5292955751782062, "learning_rate": 5e-06, "loss": 0.6575, "step": 340 }, { "epoch": 0.7943262411347518, "grad_norm": 0.5125989549019483, "learning_rate": 5e-06, "loss": 0.6508, "step": 350 }, { "epoch": 0.8170212765957446, "grad_norm": 0.4999207740358057, "learning_rate": 5e-06, "loss": 0.652, "step": 360 }, { "epoch": 0.8397163120567376, "grad_norm": 0.5737129637592092, "learning_rate": 5e-06, "loss": 0.6487, "step": 370 }, { "epoch": 0.8624113475177305, "grad_norm": 0.697556121614893, "learning_rate": 5e-06, "loss": 0.6537, "step": 380 }, { "epoch": 0.8851063829787233, "grad_norm": 0.8803924578376542, "learning_rate": 5e-06, "loss": 0.6528, "step": 390 }, { "epoch": 0.9078014184397163, "grad_norm": 0.7209729745886211, "learning_rate": 5e-06, "loss": 0.6499, "step": 400 }, { "epoch": 0.9304964539007092, "grad_norm": 0.631225301000223, "learning_rate": 5e-06, "loss": 0.6493, "step": 410 }, { "epoch": 0.9531914893617022, "grad_norm": 0.5302593707285189, "learning_rate": 5e-06, "loss": 0.651, "step": 420 }, { "epoch": 0.975886524822695, "grad_norm": 0.5216458807768983, "learning_rate": 5e-06, "loss": 0.6438, "step": 430 }, { "epoch": 0.9985815602836879, "grad_norm": 0.5024039028278129, "learning_rate": 5e-06, "loss": 0.6478, "step": 440 }, { "epoch": 0.9985815602836879, "eval_loss": 0.64774489402771, "eval_runtime": 312.1779, "eval_samples_per_second": 38.036, "eval_steps_per_second": 0.596, "step": 440 }, { "epoch": 1.0212765957446808, "grad_norm": 0.6519926768853314, "learning_rate": 5e-06, "loss": 0.6161, "step": 450 }, { "epoch": 1.0439716312056737, "grad_norm": 0.575055735064131, "learning_rate": 5e-06, "loss": 0.589, "step": 460 }, { "epoch": 1.0666666666666667, "grad_norm": 0.9082080666134971, "learning_rate": 5e-06, "loss": 0.5912, "step": 470 }, { "epoch": 1.0893617021276596, "grad_norm": 0.583715919536177, "learning_rate": 5e-06, "loss": 0.5913, "step": 480 }, { "epoch": 1.1120567375886525, "grad_norm": 0.5975048919359871, "learning_rate": 5e-06, "loss": 0.5976, "step": 490 }, { "epoch": 1.1347517730496455, "grad_norm": 0.5418419185217853, "learning_rate": 5e-06, "loss": 0.5931, "step": 500 }, { "epoch": 1.1574468085106382, "grad_norm": 0.638492552707852, "learning_rate": 5e-06, "loss": 0.5913, "step": 510 }, { "epoch": 1.1801418439716311, "grad_norm": 0.5450441529585099, "learning_rate": 5e-06, "loss": 0.5888, "step": 520 }, { "epoch": 1.202836879432624, "grad_norm": 0.667596474347774, "learning_rate": 5e-06, "loss": 0.5926, "step": 530 }, { "epoch": 1.225531914893617, "grad_norm": 0.5901671600943978, "learning_rate": 5e-06, "loss": 0.5905, "step": 540 }, { "epoch": 1.24822695035461, "grad_norm": 0.5680819366472465, "learning_rate": 5e-06, "loss": 0.5915, "step": 550 }, { "epoch": 1.270921985815603, "grad_norm": 0.7905201905390314, "learning_rate": 5e-06, "loss": 0.5948, "step": 560 }, { "epoch": 1.2936170212765958, "grad_norm": 0.5581341122325792, "learning_rate": 5e-06, "loss": 0.5904, "step": 570 }, { "epoch": 1.3163120567375888, "grad_norm": 0.5968971489169257, "learning_rate": 5e-06, "loss": 0.5927, "step": 580 }, { "epoch": 1.3390070921985815, "grad_norm": 0.6127986193776547, "learning_rate": 5e-06, "loss": 0.5889, "step": 590 }, { "epoch": 1.3617021276595744, "grad_norm": 0.5540746342979398, "learning_rate": 5e-06, "loss": 0.5998, "step": 600 }, { "epoch": 1.3843971631205674, "grad_norm": 0.6578848756357453, "learning_rate": 5e-06, "loss": 0.5933, "step": 610 }, { "epoch": 1.4070921985815603, "grad_norm": 0.752983431690844, "learning_rate": 5e-06, "loss": 0.5967, "step": 620 }, { "epoch": 1.4297872340425533, "grad_norm": 0.6009296481522326, "learning_rate": 5e-06, "loss": 0.5886, "step": 630 }, { "epoch": 1.452482269503546, "grad_norm": 0.6855926736224828, "learning_rate": 5e-06, "loss": 0.5931, "step": 640 }, { "epoch": 1.475177304964539, "grad_norm": 0.5390133364015494, "learning_rate": 5e-06, "loss": 0.5926, "step": 650 }, { "epoch": 1.4978723404255319, "grad_norm": 0.6338310115530009, "learning_rate": 5e-06, "loss": 0.5969, "step": 660 }, { "epoch": 1.5205673758865248, "grad_norm": 0.5298069642997515, "learning_rate": 5e-06, "loss": 0.5868, "step": 670 }, { "epoch": 1.5432624113475177, "grad_norm": 0.5254529387411777, "learning_rate": 5e-06, "loss": 0.5893, "step": 680 }, { "epoch": 1.5659574468085107, "grad_norm": 0.5090891733108535, "learning_rate": 5e-06, "loss": 0.5937, "step": 690 }, { "epoch": 1.5886524822695036, "grad_norm": 0.6276632546884366, "learning_rate": 5e-06, "loss": 0.5856, "step": 700 }, { "epoch": 1.6113475177304966, "grad_norm": 0.5618088874975952, "learning_rate": 5e-06, "loss": 0.5796, "step": 710 }, { "epoch": 1.6340425531914895, "grad_norm": 0.5451154576203426, "learning_rate": 5e-06, "loss": 0.584, "step": 720 }, { "epoch": 1.6567375886524822, "grad_norm": 0.5751145914968212, "learning_rate": 5e-06, "loss": 0.5882, "step": 730 }, { "epoch": 1.6794326241134752, "grad_norm": 0.5457326197852193, "learning_rate": 5e-06, "loss": 0.5893, "step": 740 }, { "epoch": 1.702127659574468, "grad_norm": 0.5538870750639828, "learning_rate": 5e-06, "loss": 0.5895, "step": 750 }, { "epoch": 1.724822695035461, "grad_norm": 0.6482629759445236, "learning_rate": 5e-06, "loss": 0.5883, "step": 760 }, { "epoch": 1.7475177304964538, "grad_norm": 0.604796917686035, "learning_rate": 5e-06, "loss": 0.5908, "step": 770 }, { "epoch": 1.7702127659574467, "grad_norm": 0.5570179578262068, "learning_rate": 5e-06, "loss": 0.5935, "step": 780 }, { "epoch": 1.7929078014184396, "grad_norm": 0.5359469482507023, "learning_rate": 5e-06, "loss": 0.5833, "step": 790 }, { "epoch": 1.8156028368794326, "grad_norm": 0.6340496302965002, "learning_rate": 5e-06, "loss": 0.5853, "step": 800 }, { "epoch": 1.8382978723404255, "grad_norm": 0.5742298835017674, "learning_rate": 5e-06, "loss": 0.5823, "step": 810 }, { "epoch": 1.8609929078014185, "grad_norm": 0.5827541219871901, "learning_rate": 5e-06, "loss": 0.5903, "step": 820 }, { "epoch": 1.8836879432624114, "grad_norm": 0.528321132004614, "learning_rate": 5e-06, "loss": 0.5832, "step": 830 }, { "epoch": 1.9063829787234043, "grad_norm": 0.565101788703942, "learning_rate": 5e-06, "loss": 0.5855, "step": 840 }, { "epoch": 1.9290780141843973, "grad_norm": 0.601792497070637, "learning_rate": 5e-06, "loss": 0.5842, "step": 850 }, { "epoch": 1.9517730496453902, "grad_norm": 0.5088677439673144, "learning_rate": 5e-06, "loss": 0.5821, "step": 860 }, { "epoch": 1.974468085106383, "grad_norm": 0.6353626361966195, "learning_rate": 5e-06, "loss": 0.5865, "step": 870 }, { "epoch": 1.9971631205673759, "grad_norm": 0.627955882058217, "learning_rate": 5e-06, "loss": 0.5869, "step": 880 }, { "epoch": 1.9994326241134752, "eval_loss": 0.63065505027771, "eval_runtime": 299.005, "eval_samples_per_second": 39.712, "eval_steps_per_second": 0.622, "step": 881 }, { "epoch": 2.0198581560283686, "grad_norm": 0.7421753177971604, "learning_rate": 5e-06, "loss": 0.5575, "step": 890 }, { "epoch": 2.0425531914893615, "grad_norm": 0.7475116659953, "learning_rate": 5e-06, "loss": 0.5333, "step": 900 }, { "epoch": 2.0652482269503545, "grad_norm": 0.6094678843754806, "learning_rate": 5e-06, "loss": 0.5281, "step": 910 }, { "epoch": 2.0879432624113474, "grad_norm": 0.6416679206326653, "learning_rate": 5e-06, "loss": 0.5307, "step": 920 }, { "epoch": 2.1106382978723404, "grad_norm": 0.6149270041451872, "learning_rate": 5e-06, "loss": 0.5253, "step": 930 }, { "epoch": 2.1333333333333333, "grad_norm": 0.5944382793137648, "learning_rate": 5e-06, "loss": 0.5321, "step": 940 }, { "epoch": 2.1560283687943262, "grad_norm": 0.5206404660542666, "learning_rate": 5e-06, "loss": 0.5327, "step": 950 }, { "epoch": 2.178723404255319, "grad_norm": 0.5421603421972422, "learning_rate": 5e-06, "loss": 0.5283, "step": 960 }, { "epoch": 2.201418439716312, "grad_norm": 0.630367827768556, "learning_rate": 5e-06, "loss": 0.5312, "step": 970 }, { "epoch": 2.224113475177305, "grad_norm": 0.5289786942278032, "learning_rate": 5e-06, "loss": 0.5281, "step": 980 }, { "epoch": 2.246808510638298, "grad_norm": 0.6186809984064454, "learning_rate": 5e-06, "loss": 0.5287, "step": 990 }, { "epoch": 2.269503546099291, "grad_norm": 0.5941474628916863, "learning_rate": 5e-06, "loss": 0.5349, "step": 1000 }, { "epoch": 2.2921985815602834, "grad_norm": 0.570443876715086, "learning_rate": 5e-06, "loss": 0.5295, "step": 1010 }, { "epoch": 2.3148936170212764, "grad_norm": 0.6792218051762158, "learning_rate": 5e-06, "loss": 0.5323, "step": 1020 }, { "epoch": 2.3375886524822693, "grad_norm": 0.5535124105821935, "learning_rate": 5e-06, "loss": 0.5277, "step": 1030 }, { "epoch": 2.3602836879432623, "grad_norm": 0.667112587037914, "learning_rate": 5e-06, "loss": 0.5276, "step": 1040 }, { "epoch": 2.382978723404255, "grad_norm": 0.5987387148760719, "learning_rate": 5e-06, "loss": 0.5295, "step": 1050 }, { "epoch": 2.405673758865248, "grad_norm": 0.5484898675236806, "learning_rate": 5e-06, "loss": 0.5328, "step": 1060 }, { "epoch": 2.428368794326241, "grad_norm": 0.5736373550799053, "learning_rate": 5e-06, "loss": 0.534, "step": 1070 }, { "epoch": 2.451063829787234, "grad_norm": 0.5626598264859632, "learning_rate": 5e-06, "loss": 0.5356, "step": 1080 }, { "epoch": 2.473758865248227, "grad_norm": 0.6153434121318484, "learning_rate": 5e-06, "loss": 0.5306, "step": 1090 }, { "epoch": 2.49645390070922, "grad_norm": 0.7252891365142108, "learning_rate": 5e-06, "loss": 0.5307, "step": 1100 }, { "epoch": 2.519148936170213, "grad_norm": 0.6153968835692192, "learning_rate": 5e-06, "loss": 0.5331, "step": 1110 }, { "epoch": 2.541843971631206, "grad_norm": 0.5969808676825302, "learning_rate": 5e-06, "loss": 0.5363, "step": 1120 }, { "epoch": 2.5645390070921987, "grad_norm": 0.5692435999805617, "learning_rate": 5e-06, "loss": 0.5391, "step": 1130 }, { "epoch": 2.5872340425531917, "grad_norm": 0.6180618030016519, "learning_rate": 5e-06, "loss": 0.5299, "step": 1140 }, { "epoch": 2.6099290780141846, "grad_norm": 0.543137746234749, "learning_rate": 5e-06, "loss": 0.5374, "step": 1150 }, { "epoch": 2.6326241134751776, "grad_norm": 0.5200265379748215, "learning_rate": 5e-06, "loss": 0.5309, "step": 1160 }, { "epoch": 2.65531914893617, "grad_norm": 0.5194882503023576, "learning_rate": 5e-06, "loss": 0.5293, "step": 1170 }, { "epoch": 2.678014184397163, "grad_norm": 0.5570786417431203, "learning_rate": 5e-06, "loss": 0.5316, "step": 1180 }, { "epoch": 2.700709219858156, "grad_norm": 0.6007520350434941, "learning_rate": 5e-06, "loss": 0.5347, "step": 1190 }, { "epoch": 2.723404255319149, "grad_norm": 0.5662193506846984, "learning_rate": 5e-06, "loss": 0.5339, "step": 1200 }, { "epoch": 2.746099290780142, "grad_norm": 0.6675198025626778, "learning_rate": 5e-06, "loss": 0.5346, "step": 1210 }, { "epoch": 2.7687943262411348, "grad_norm": 0.6689385730256848, "learning_rate": 5e-06, "loss": 0.5303, "step": 1220 }, { "epoch": 2.7914893617021277, "grad_norm": 0.6400833035990827, "learning_rate": 5e-06, "loss": 0.5315, "step": 1230 }, { "epoch": 2.8141843971631206, "grad_norm": 0.6835204230746162, "learning_rate": 5e-06, "loss": 0.5367, "step": 1240 }, { "epoch": 2.8368794326241136, "grad_norm": 0.5862756798841194, "learning_rate": 5e-06, "loss": 0.5346, "step": 1250 }, { "epoch": 2.8595744680851065, "grad_norm": 0.5727844470422598, "learning_rate": 5e-06, "loss": 0.5376, "step": 1260 }, { "epoch": 2.8822695035460995, "grad_norm": 0.5445857583169009, "learning_rate": 5e-06, "loss": 0.5294, "step": 1270 }, { "epoch": 2.904964539007092, "grad_norm": 0.6161117247407584, "learning_rate": 5e-06, "loss": 0.5309, "step": 1280 }, { "epoch": 2.927659574468085, "grad_norm": 0.6027552334532725, "learning_rate": 5e-06, "loss": 0.5359, "step": 1290 }, { "epoch": 2.950354609929078, "grad_norm": 0.6005574689486347, "learning_rate": 5e-06, "loss": 0.531, "step": 1300 }, { "epoch": 2.9730496453900708, "grad_norm": 0.5590382510647179, "learning_rate": 5e-06, "loss": 0.5348, "step": 1310 }, { "epoch": 2.9957446808510637, "grad_norm": 0.5507647179314145, "learning_rate": 5e-06, "loss": 0.5325, "step": 1320 }, { "epoch": 2.9957446808510637, "eval_loss": 0.6309003829956055, "eval_runtime": 301.6902, "eval_samples_per_second": 39.358, "eval_steps_per_second": 0.617, "step": 1320 }, { "epoch": 2.9957446808510637, "step": 1320, "total_flos": 2210839784325120.0, "train_loss": 0.6033726125052481, "train_runtime": 44159.109, "train_samples_per_second": 15.326, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 1320, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2210839784325120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }