{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9946777054997042, "eval_steps": 500, "global_step": 1266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02365464222353637, "grad_norm": 39.04922181795541, "learning_rate": 5e-06, "loss": 0.9233, "step": 10 }, { "epoch": 0.04730928444707274, "grad_norm": 8.275301054500245, "learning_rate": 5e-06, "loss": 0.8198, "step": 20 }, { "epoch": 0.0709639266706091, "grad_norm": 16.91349063149131, "learning_rate": 5e-06, "loss": 0.8094, "step": 30 }, { "epoch": 0.09461856889414548, "grad_norm": 14.148988171179118, "learning_rate": 5e-06, "loss": 0.7679, "step": 40 }, { "epoch": 0.11827321111768184, "grad_norm": 1.026404466612184, "learning_rate": 5e-06, "loss": 0.7523, "step": 50 }, { "epoch": 0.1419278533412182, "grad_norm": 0.8542225183892468, "learning_rate": 5e-06, "loss": 0.725, "step": 60 }, { "epoch": 0.16558249556475457, "grad_norm": 0.7426518873227826, "learning_rate": 5e-06, "loss": 0.7185, "step": 70 }, { "epoch": 0.18923713778829096, "grad_norm": 0.6764382829611749, "learning_rate": 5e-06, "loss": 0.6999, "step": 80 }, { "epoch": 0.21289178001182732, "grad_norm": 0.5663920137594394, "learning_rate": 5e-06, "loss": 0.7, "step": 90 }, { "epoch": 0.23654642223536368, "grad_norm": 0.6218835377066562, "learning_rate": 5e-06, "loss": 0.6968, "step": 100 }, { "epoch": 0.26020106445890007, "grad_norm": 0.601497886261039, "learning_rate": 5e-06, "loss": 0.6885, "step": 110 }, { "epoch": 0.2838557066824364, "grad_norm": 0.8786039473525534, "learning_rate": 5e-06, "loss": 0.6863, "step": 120 }, { "epoch": 0.3075103489059728, "grad_norm": 0.5373447312315734, "learning_rate": 5e-06, "loss": 0.6792, "step": 130 }, { "epoch": 0.33116499112950915, "grad_norm": 0.5195473153997355, "learning_rate": 5e-06, "loss": 0.6746, "step": 140 }, { "epoch": 0.35481963335304556, "grad_norm": 0.5999120946052041, "learning_rate": 5e-06, "loss": 0.68, "step": 150 }, { "epoch": 0.3784742755765819, "grad_norm": 0.5351205582865509, "learning_rate": 5e-06, "loss": 0.6766, "step": 160 }, { "epoch": 0.4021289178001183, "grad_norm": 0.5425010920017291, "learning_rate": 5e-06, "loss": 0.6839, "step": 170 }, { "epoch": 0.42578356002365464, "grad_norm": 0.702582958286065, "learning_rate": 5e-06, "loss": 0.6711, "step": 180 }, { "epoch": 0.449438202247191, "grad_norm": 1.2564450479675764, "learning_rate": 5e-06, "loss": 0.6776, "step": 190 }, { "epoch": 0.47309284447072736, "grad_norm": 0.6226292820244915, "learning_rate": 5e-06, "loss": 0.6695, "step": 200 }, { "epoch": 0.4967474866942638, "grad_norm": 0.6729451633589562, "learning_rate": 5e-06, "loss": 0.6656, "step": 210 }, { "epoch": 0.5204021289178001, "grad_norm": 0.72360498041814, "learning_rate": 5e-06, "loss": 0.6699, "step": 220 }, { "epoch": 0.5440567711413364, "grad_norm": 0.5730443009789895, "learning_rate": 5e-06, "loss": 0.6693, "step": 230 }, { "epoch": 0.5677114133648729, "grad_norm": 0.6304204873609028, "learning_rate": 5e-06, "loss": 0.6564, "step": 240 }, { "epoch": 0.5913660555884093, "grad_norm": 0.6327686473365169, "learning_rate": 5e-06, "loss": 0.6707, "step": 250 }, { "epoch": 0.6150206978119456, "grad_norm": 0.5640335014419563, "learning_rate": 5e-06, "loss": 0.6617, "step": 260 }, { "epoch": 0.638675340035482, "grad_norm": 0.6026872513783592, "learning_rate": 5e-06, "loss": 0.6711, "step": 270 }, { "epoch": 0.6623299822590183, "grad_norm": 0.5194797310260643, "learning_rate": 5e-06, "loss": 0.6561, "step": 280 }, { "epoch": 0.6859846244825547, "grad_norm": 0.6658270095766984, "learning_rate": 5e-06, "loss": 0.663, "step": 290 }, { "epoch": 0.7096392667060911, "grad_norm": 0.5259962549449988, "learning_rate": 5e-06, "loss": 0.6511, "step": 300 }, { "epoch": 0.7332939089296274, "grad_norm": 0.5776532705758929, "learning_rate": 5e-06, "loss": 0.6649, "step": 310 }, { "epoch": 0.7569485511531638, "grad_norm": 0.5249892835904177, "learning_rate": 5e-06, "loss": 0.6648, "step": 320 }, { "epoch": 0.7806031933767001, "grad_norm": 0.5092145613062358, "learning_rate": 5e-06, "loss": 0.6614, "step": 330 }, { "epoch": 0.8042578356002366, "grad_norm": 0.5273167065626364, "learning_rate": 5e-06, "loss": 0.6468, "step": 340 }, { "epoch": 0.8279124778237729, "grad_norm": 0.5666036582386984, "learning_rate": 5e-06, "loss": 0.6562, "step": 350 }, { "epoch": 0.8515671200473093, "grad_norm": 0.6164474600239763, "learning_rate": 5e-06, "loss": 0.6544, "step": 360 }, { "epoch": 0.8752217622708457, "grad_norm": 0.5854672267431167, "learning_rate": 5e-06, "loss": 0.6547, "step": 370 }, { "epoch": 0.898876404494382, "grad_norm": 0.5313196039449892, "learning_rate": 5e-06, "loss": 0.6584, "step": 380 }, { "epoch": 0.9225310467179184, "grad_norm": 0.5598019500152581, "learning_rate": 5e-06, "loss": 0.6533, "step": 390 }, { "epoch": 0.9461856889414547, "grad_norm": 0.6003027055491813, "learning_rate": 5e-06, "loss": 0.6458, "step": 400 }, { "epoch": 0.9698403311649911, "grad_norm": 0.6341763962447327, "learning_rate": 5e-06, "loss": 0.656, "step": 410 }, { "epoch": 0.9934949733885275, "grad_norm": 0.6083507877729101, "learning_rate": 5e-06, "loss": 0.6538, "step": 420 }, { "epoch": 0.9982259018332348, "eval_loss": 0.6558582186698914, "eval_runtime": 224.885, "eval_samples_per_second": 50.635, "eval_steps_per_second": 0.396, "step": 422 }, { "epoch": 1.0171496156120639, "grad_norm": 0.9361782463296332, "learning_rate": 5e-06, "loss": 0.6157, "step": 430 }, { "epoch": 1.0408042578356003, "grad_norm": 0.6204113010596938, "learning_rate": 5e-06, "loss": 0.6065, "step": 440 }, { "epoch": 1.0644589000591367, "grad_norm": 0.6564336264095381, "learning_rate": 5e-06, "loss": 0.6048, "step": 450 }, { "epoch": 1.0881135422826729, "grad_norm": 0.5555833545533679, "learning_rate": 5e-06, "loss": 0.6072, "step": 460 }, { "epoch": 1.1117681845062093, "grad_norm": 0.568736434370096, "learning_rate": 5e-06, "loss": 0.6026, "step": 470 }, { "epoch": 1.1354228267297457, "grad_norm": 0.5963174527245159, "learning_rate": 5e-06, "loss": 0.6041, "step": 480 }, { "epoch": 1.1590774689532821, "grad_norm": 0.6296624775692966, "learning_rate": 5e-06, "loss": 0.6036, "step": 490 }, { "epoch": 1.1827321111768185, "grad_norm": 0.7667349314546962, "learning_rate": 5e-06, "loss": 0.6068, "step": 500 }, { "epoch": 1.2063867534003547, "grad_norm": 0.6034621980970892, "learning_rate": 5e-06, "loss": 0.6104, "step": 510 }, { "epoch": 1.2300413956238911, "grad_norm": 0.5825117431703367, "learning_rate": 5e-06, "loss": 0.6026, "step": 520 }, { "epoch": 1.2536960378474276, "grad_norm": 0.5671081402783421, "learning_rate": 5e-06, "loss": 0.604, "step": 530 }, { "epoch": 1.277350680070964, "grad_norm": 0.5309591912112671, "learning_rate": 5e-06, "loss": 0.6132, "step": 540 }, { "epoch": 1.3010053222945004, "grad_norm": 0.5636046858771947, "learning_rate": 5e-06, "loss": 0.605, "step": 550 }, { "epoch": 1.3246599645180366, "grad_norm": 0.6623955102141809, "learning_rate": 5e-06, "loss": 0.6082, "step": 560 }, { "epoch": 1.348314606741573, "grad_norm": 0.5742305096790601, "learning_rate": 5e-06, "loss": 0.605, "step": 570 }, { "epoch": 1.3719692489651094, "grad_norm": 0.5167065988140831, "learning_rate": 5e-06, "loss": 0.6091, "step": 580 }, { "epoch": 1.3956238911886458, "grad_norm": 0.5112713876137833, "learning_rate": 5e-06, "loss": 0.6049, "step": 590 }, { "epoch": 1.4192785334121822, "grad_norm": 0.515536375353522, "learning_rate": 5e-06, "loss": 0.6079, "step": 600 }, { "epoch": 1.4429331756357184, "grad_norm": 0.5943800369847494, "learning_rate": 5e-06, "loss": 0.602, "step": 610 }, { "epoch": 1.4665878178592548, "grad_norm": 0.5570413849081146, "learning_rate": 5e-06, "loss": 0.6074, "step": 620 }, { "epoch": 1.4902424600827913, "grad_norm": 0.5383074416990815, "learning_rate": 5e-06, "loss": 0.6055, "step": 630 }, { "epoch": 1.5138971023063275, "grad_norm": 0.6221748842819845, "learning_rate": 5e-06, "loss": 0.6048, "step": 640 }, { "epoch": 1.537551744529864, "grad_norm": 0.623130737085543, "learning_rate": 5e-06, "loss": 0.6124, "step": 650 }, { "epoch": 1.5612063867534003, "grad_norm": 0.7728758992657894, "learning_rate": 5e-06, "loss": 0.6073, "step": 660 }, { "epoch": 1.5848610289769367, "grad_norm": 0.5531126954202661, "learning_rate": 5e-06, "loss": 0.6023, "step": 670 }, { "epoch": 1.6085156712004731, "grad_norm": 0.8207249388527519, "learning_rate": 5e-06, "loss": 0.6041, "step": 680 }, { "epoch": 1.6321703134240093, "grad_norm": 0.7382668830128054, "learning_rate": 5e-06, "loss": 0.6027, "step": 690 }, { "epoch": 1.655824955647546, "grad_norm": 0.8181634349883082, "learning_rate": 5e-06, "loss": 0.6077, "step": 700 }, { "epoch": 1.6794795978710821, "grad_norm": 0.5715750112816181, "learning_rate": 5e-06, "loss": 0.603, "step": 710 }, { "epoch": 1.7031342400946186, "grad_norm": 0.564060422032355, "learning_rate": 5e-06, "loss": 0.6137, "step": 720 }, { "epoch": 1.726788882318155, "grad_norm": 0.5112934215435977, "learning_rate": 5e-06, "loss": 0.6041, "step": 730 }, { "epoch": 1.7504435245416912, "grad_norm": 0.6498040890743698, "learning_rate": 5e-06, "loss": 0.6038, "step": 740 }, { "epoch": 1.7740981667652278, "grad_norm": 0.6625174306160165, "learning_rate": 5e-06, "loss": 0.604, "step": 750 }, { "epoch": 1.797752808988764, "grad_norm": 0.5200239654238437, "learning_rate": 5e-06, "loss": 0.604, "step": 760 }, { "epoch": 1.8214074512123004, "grad_norm": 0.5056250667365105, "learning_rate": 5e-06, "loss": 0.601, "step": 770 }, { "epoch": 1.8450620934358368, "grad_norm": 0.5465224841554837, "learning_rate": 5e-06, "loss": 0.6043, "step": 780 }, { "epoch": 1.868716735659373, "grad_norm": 0.5173445168820222, "learning_rate": 5e-06, "loss": 0.6051, "step": 790 }, { "epoch": 1.8923713778829097, "grad_norm": 0.5037163086029489, "learning_rate": 5e-06, "loss": 0.6071, "step": 800 }, { "epoch": 1.9160260201064458, "grad_norm": 0.5032092904194995, "learning_rate": 5e-06, "loss": 0.6097, "step": 810 }, { "epoch": 1.9396806623299823, "grad_norm": 0.5153373413177225, "learning_rate": 5e-06, "loss": 0.6017, "step": 820 }, { "epoch": 1.9633353045535187, "grad_norm": 0.48445434626456924, "learning_rate": 5e-06, "loss": 0.6097, "step": 830 }, { "epoch": 1.9869899467770549, "grad_norm": 0.5493690840998718, "learning_rate": 5e-06, "loss": 0.5996, "step": 840 }, { "epoch": 1.9988172678888232, "eval_loss": 0.6458428502082825, "eval_runtime": 226.2611, "eval_samples_per_second": 50.327, "eval_steps_per_second": 0.393, "step": 845 }, { "epoch": 2.0106445890005915, "grad_norm": 0.6225484532666892, "learning_rate": 5e-06, "loss": 0.5795, "step": 850 }, { "epoch": 2.0342992312241277, "grad_norm": 0.5819857678964343, "learning_rate": 5e-06, "loss": 0.5553, "step": 860 }, { "epoch": 2.057953873447664, "grad_norm": 0.7185360534865078, "learning_rate": 5e-06, "loss": 0.5506, "step": 870 }, { "epoch": 2.0816085156712005, "grad_norm": 0.5134284842767335, "learning_rate": 5e-06, "loss": 0.5539, "step": 880 }, { "epoch": 2.1052631578947367, "grad_norm": 0.5986326239884353, "learning_rate": 5e-06, "loss": 0.5609, "step": 890 }, { "epoch": 2.1289178001182734, "grad_norm": 0.579714763513885, "learning_rate": 5e-06, "loss": 0.5546, "step": 900 }, { "epoch": 2.1525724423418096, "grad_norm": 0.570292196409214, "learning_rate": 5e-06, "loss": 0.5586, "step": 910 }, { "epoch": 2.1762270845653457, "grad_norm": 0.5811117001743673, "learning_rate": 5e-06, "loss": 0.5585, "step": 920 }, { "epoch": 2.1998817267888824, "grad_norm": 0.554144816987719, "learning_rate": 5e-06, "loss": 0.5562, "step": 930 }, { "epoch": 2.2235363690124186, "grad_norm": 0.6493826388527278, "learning_rate": 5e-06, "loss": 0.5565, "step": 940 }, { "epoch": 2.247191011235955, "grad_norm": 0.5220557856218626, "learning_rate": 5e-06, "loss": 0.5694, "step": 950 }, { "epoch": 2.2708456534594914, "grad_norm": 0.6378102281048501, "learning_rate": 5e-06, "loss": 0.5602, "step": 960 }, { "epoch": 2.2945002956830276, "grad_norm": 0.5497371876386185, "learning_rate": 5e-06, "loss": 0.5628, "step": 970 }, { "epoch": 2.3181549379065642, "grad_norm": 0.6521682175920844, "learning_rate": 5e-06, "loss": 0.5565, "step": 980 }, { "epoch": 2.3418095801301004, "grad_norm": 0.5734936169662879, "learning_rate": 5e-06, "loss": 0.5674, "step": 990 }, { "epoch": 2.365464222353637, "grad_norm": 0.5394258314592499, "learning_rate": 5e-06, "loss": 0.5569, "step": 1000 }, { "epoch": 2.3891188645771733, "grad_norm": 0.5306593171364488, "learning_rate": 5e-06, "loss": 0.5502, "step": 1010 }, { "epoch": 2.4127735068007095, "grad_norm": 0.5344199954837688, "learning_rate": 5e-06, "loss": 0.5549, "step": 1020 }, { "epoch": 2.436428149024246, "grad_norm": 0.5892751227456119, "learning_rate": 5e-06, "loss": 0.5533, "step": 1030 }, { "epoch": 2.4600827912477823, "grad_norm": 0.6529042003930223, "learning_rate": 5e-06, "loss": 0.5613, "step": 1040 }, { "epoch": 2.483737433471319, "grad_norm": 0.5765438425321338, "learning_rate": 5e-06, "loss": 0.5646, "step": 1050 }, { "epoch": 2.507392075694855, "grad_norm": 0.6764490044193554, "learning_rate": 5e-06, "loss": 0.567, "step": 1060 }, { "epoch": 2.5310467179183913, "grad_norm": 0.5365218264481744, "learning_rate": 5e-06, "loss": 0.5532, "step": 1070 }, { "epoch": 2.554701360141928, "grad_norm": 0.6033785399498255, "learning_rate": 5e-06, "loss": 0.5622, "step": 1080 }, { "epoch": 2.578356002365464, "grad_norm": 0.8004909937255467, "learning_rate": 5e-06, "loss": 0.5661, "step": 1090 }, { "epoch": 2.6020106445890008, "grad_norm": 0.5819582134735406, "learning_rate": 5e-06, "loss": 0.5616, "step": 1100 }, { "epoch": 2.625665286812537, "grad_norm": 0.5537773395049099, "learning_rate": 5e-06, "loss": 0.5628, "step": 1110 }, { "epoch": 2.649319929036073, "grad_norm": 0.5539615560141525, "learning_rate": 5e-06, "loss": 0.5648, "step": 1120 }, { "epoch": 2.67297457125961, "grad_norm": 0.6206027218523953, "learning_rate": 5e-06, "loss": 0.5643, "step": 1130 }, { "epoch": 2.696629213483146, "grad_norm": 0.5108322877934205, "learning_rate": 5e-06, "loss": 0.5586, "step": 1140 }, { "epoch": 2.7202838557066826, "grad_norm": 0.48797735494965916, "learning_rate": 5e-06, "loss": 0.5563, "step": 1150 }, { "epoch": 2.743938497930219, "grad_norm": 0.5823974142352172, "learning_rate": 5e-06, "loss": 0.5671, "step": 1160 }, { "epoch": 2.767593140153755, "grad_norm": 0.8599218035136146, "learning_rate": 5e-06, "loss": 0.5723, "step": 1170 }, { "epoch": 2.7912477823772917, "grad_norm": 0.6555716714163583, "learning_rate": 5e-06, "loss": 0.5633, "step": 1180 }, { "epoch": 2.814902424600828, "grad_norm": 0.49879910164951613, "learning_rate": 5e-06, "loss": 0.5581, "step": 1190 }, { "epoch": 2.8385570668243645, "grad_norm": 0.5241725506783274, "learning_rate": 5e-06, "loss": 0.5623, "step": 1200 }, { "epoch": 2.8622117090479007, "grad_norm": 0.6173811070502804, "learning_rate": 5e-06, "loss": 0.569, "step": 1210 }, { "epoch": 2.885866351271437, "grad_norm": 0.5397292738316359, "learning_rate": 5e-06, "loss": 0.5642, "step": 1220 }, { "epoch": 2.9095209934949735, "grad_norm": 0.7053290870019903, "learning_rate": 5e-06, "loss": 0.5593, "step": 1230 }, { "epoch": 2.9331756357185097, "grad_norm": 0.5500348460578961, "learning_rate": 5e-06, "loss": 0.5591, "step": 1240 }, { "epoch": 2.9568302779420463, "grad_norm": 0.5833114667049699, "learning_rate": 5e-06, "loss": 0.5649, "step": 1250 }, { "epoch": 2.9804849201655825, "grad_norm": 0.569413301750619, "learning_rate": 5e-06, "loss": 0.5577, "step": 1260 }, { "epoch": 2.9946777054997042, "eval_loss": 0.648719847202301, "eval_runtime": 225.8886, "eval_samples_per_second": 50.41, "eval_steps_per_second": 0.394, "step": 1266 }, { "epoch": 2.9946777054997042, "step": 1266, "total_flos": 2120178393415680.0, "train_loss": 0.6180764295478568, "train_runtime": 37891.5513, "train_samples_per_second": 17.129, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1266, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2120178393415680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }