{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9946777054997042, "eval_steps": 500, "global_step": 1266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02365464222353637, "grad_norm": 4.438574633904732, "learning_rate": 5e-06, "loss": 0.8864, "step": 10 }, { "epoch": 0.04730928444707274, "grad_norm": 3.743436107422786, "learning_rate": 5e-06, "loss": 0.7865, "step": 20 }, { "epoch": 0.0709639266706091, "grad_norm": 1.3773266130731683, "learning_rate": 5e-06, "loss": 0.7718, "step": 30 }, { "epoch": 0.09461856889414548, "grad_norm": 1.0328014979269187, "learning_rate": 5e-06, "loss": 0.7321, "step": 40 }, { "epoch": 0.11827321111768184, "grad_norm": 0.9419322611015702, "learning_rate": 5e-06, "loss": 0.7263, "step": 50 }, { "epoch": 0.1419278533412182, "grad_norm": 1.0108429467163185, "learning_rate": 5e-06, "loss": 0.7095, "step": 60 }, { "epoch": 0.16558249556475457, "grad_norm": 0.7116397296212089, "learning_rate": 5e-06, "loss": 0.7076, "step": 70 }, { "epoch": 0.18923713778829096, "grad_norm": 0.6226004481870809, "learning_rate": 5e-06, "loss": 0.6923, "step": 80 }, { "epoch": 0.21289178001182732, "grad_norm": 0.7788334547916798, "learning_rate": 5e-06, "loss": 0.6939, "step": 90 }, { "epoch": 0.23654642223536368, "grad_norm": 0.5579006889125326, "learning_rate": 5e-06, "loss": 0.6916, "step": 100 }, { "epoch": 0.26020106445890007, "grad_norm": 0.8348224630921526, "learning_rate": 5e-06, "loss": 0.6841, "step": 110 }, { "epoch": 0.2838557066824364, "grad_norm": 1.0981466464621934, "learning_rate": 5e-06, "loss": 0.6822, "step": 120 }, { "epoch": 0.3075103489059728, "grad_norm": 0.8917486118802607, "learning_rate": 5e-06, "loss": 0.6761, "step": 130 }, { "epoch": 0.33116499112950915, "grad_norm": 0.6543556998725211, "learning_rate": 5e-06, "loss": 0.6715, "step": 140 }, { "epoch": 0.35481963335304556, "grad_norm": 0.8125188565799322, "learning_rate": 5e-06, "loss": 0.6773, "step": 150 }, { "epoch": 0.3784742755765819, "grad_norm": 0.5688602721338011, "learning_rate": 5e-06, "loss": 0.674, "step": 160 }, { "epoch": 0.4021289178001183, "grad_norm": 0.491756225050921, "learning_rate": 5e-06, "loss": 0.6815, "step": 170 }, { "epoch": 0.42578356002365464, "grad_norm": 0.6509377411475789, "learning_rate": 5e-06, "loss": 0.669, "step": 180 }, { "epoch": 0.449438202247191, "grad_norm": 0.6207735885705108, "learning_rate": 5e-06, "loss": 0.6755, "step": 190 }, { "epoch": 0.47309284447072736, "grad_norm": 0.528808960645126, "learning_rate": 5e-06, "loss": 0.6674, "step": 200 }, { "epoch": 0.4967474866942638, "grad_norm": 0.8472585224068009, "learning_rate": 5e-06, "loss": 0.6637, "step": 210 }, { "epoch": 0.5204021289178001, "grad_norm": 0.5848774410670773, "learning_rate": 5e-06, "loss": 0.6681, "step": 220 }, { "epoch": 0.5440567711413364, "grad_norm": 0.6344280914148243, "learning_rate": 5e-06, "loss": 0.6675, "step": 230 }, { "epoch": 0.5677114133648729, "grad_norm": 0.7485250474806475, "learning_rate": 5e-06, "loss": 0.6549, "step": 240 }, { "epoch": 0.5913660555884093, "grad_norm": 0.7332796111883003, "learning_rate": 5e-06, "loss": 0.6693, "step": 250 }, { "epoch": 0.6150206978119456, "grad_norm": 0.5097881381268425, "learning_rate": 5e-06, "loss": 0.6603, "step": 260 }, { "epoch": 0.638675340035482, "grad_norm": 0.49176638410796597, "learning_rate": 5e-06, "loss": 0.6697, "step": 270 }, { "epoch": 0.6623299822590183, "grad_norm": 0.4404068786810332, "learning_rate": 5e-06, "loss": 0.6547, "step": 280 }, { "epoch": 0.6859846244825547, "grad_norm": 0.45956579197536424, "learning_rate": 5e-06, "loss": 0.6617, "step": 290 }, { "epoch": 0.7096392667060911, "grad_norm": 0.5489471352518822, "learning_rate": 5e-06, "loss": 0.65, "step": 300 }, { "epoch": 0.7332939089296274, "grad_norm": 0.5071925000559494, "learning_rate": 5e-06, "loss": 0.6639, "step": 310 }, { "epoch": 0.7569485511531638, "grad_norm": 0.4479532221316009, "learning_rate": 5e-06, "loss": 0.6638, "step": 320 }, { "epoch": 0.7806031933767001, "grad_norm": 0.5768991016278898, "learning_rate": 5e-06, "loss": 0.6601, "step": 330 }, { "epoch": 0.8042578356002366, "grad_norm": 0.47440990509293773, "learning_rate": 5e-06, "loss": 0.6457, "step": 340 }, { "epoch": 0.8279124778237729, "grad_norm": 0.5535872360742707, "learning_rate": 5e-06, "loss": 0.6552, "step": 350 }, { "epoch": 0.8515671200473093, "grad_norm": 0.5692309710225549, "learning_rate": 5e-06, "loss": 0.6533, "step": 360 }, { "epoch": 0.8752217622708457, "grad_norm": 0.477161966378483, "learning_rate": 5e-06, "loss": 0.6538, "step": 370 }, { "epoch": 0.898876404494382, "grad_norm": 0.5742339288892304, "learning_rate": 5e-06, "loss": 0.6576, "step": 380 }, { "epoch": 0.9225310467179184, "grad_norm": 0.6276883665595651, "learning_rate": 5e-06, "loss": 0.6524, "step": 390 }, { "epoch": 0.9461856889414547, "grad_norm": 0.5739867998426434, "learning_rate": 5e-06, "loss": 0.645, "step": 400 }, { "epoch": 0.9698403311649911, "grad_norm": 0.6714991896688636, "learning_rate": 5e-06, "loss": 0.6552, "step": 410 }, { "epoch": 0.9934949733885275, "grad_norm": 0.5532850958647405, "learning_rate": 5e-06, "loss": 0.6528, "step": 420 }, { "epoch": 0.9982259018332348, "eval_loss": 0.6549943685531616, "eval_runtime": 226.5831, "eval_samples_per_second": 50.255, "eval_steps_per_second": 0.393, "step": 422 }, { "epoch": 1.0171496156120639, "grad_norm": 0.6577034498689445, "learning_rate": 5e-06, "loss": 0.6186, "step": 430 }, { "epoch": 1.0408042578356003, "grad_norm": 0.6919211184212087, "learning_rate": 5e-06, "loss": 0.611, "step": 440 }, { "epoch": 1.0644589000591367, "grad_norm": 0.49818432774881954, "learning_rate": 5e-06, "loss": 0.6094, "step": 450 }, { "epoch": 1.0881135422826729, "grad_norm": 0.5304613381757841, "learning_rate": 5e-06, "loss": 0.6118, "step": 460 }, { "epoch": 1.1117681845062093, "grad_norm": 0.5661240677080396, "learning_rate": 5e-06, "loss": 0.6069, "step": 470 }, { "epoch": 1.1354228267297457, "grad_norm": 0.4725657759678031, "learning_rate": 5e-06, "loss": 0.6081, "step": 480 }, { "epoch": 1.1590774689532821, "grad_norm": 0.5348982555181953, "learning_rate": 5e-06, "loss": 0.608, "step": 490 }, { "epoch": 1.1827321111768185, "grad_norm": 0.7009257467225577, "learning_rate": 5e-06, "loss": 0.6107, "step": 500 }, { "epoch": 1.2063867534003547, "grad_norm": 0.516755234113577, "learning_rate": 5e-06, "loss": 0.6145, "step": 510 }, { "epoch": 1.2300413956238911, "grad_norm": 0.5565870238553596, "learning_rate": 5e-06, "loss": 0.6064, "step": 520 }, { "epoch": 1.2536960378474276, "grad_norm": 0.5176934237005286, "learning_rate": 5e-06, "loss": 0.6078, "step": 530 }, { "epoch": 1.277350680070964, "grad_norm": 0.5399011019791115, "learning_rate": 5e-06, "loss": 0.6173, "step": 540 }, { "epoch": 1.3010053222945004, "grad_norm": 0.48804065232921706, "learning_rate": 5e-06, "loss": 0.6089, "step": 550 }, { "epoch": 1.3246599645180366, "grad_norm": 0.6247022748083035, "learning_rate": 5e-06, "loss": 0.612, "step": 560 }, { "epoch": 1.348314606741573, "grad_norm": 0.5205181494692162, "learning_rate": 5e-06, "loss": 0.6087, "step": 570 }, { "epoch": 1.3719692489651094, "grad_norm": 0.4444906716754459, "learning_rate": 5e-06, "loss": 0.6129, "step": 580 }, { "epoch": 1.3956238911886458, "grad_norm": 0.4699507974891951, "learning_rate": 5e-06, "loss": 0.6084, "step": 590 }, { "epoch": 1.4192785334121822, "grad_norm": 0.438759746705871, "learning_rate": 5e-06, "loss": 0.6118, "step": 600 }, { "epoch": 1.4429331756357184, "grad_norm": 0.492167276336904, "learning_rate": 5e-06, "loss": 0.6057, "step": 610 }, { "epoch": 1.4665878178592548, "grad_norm": 0.508896134049524, "learning_rate": 5e-06, "loss": 0.6111, "step": 620 }, { "epoch": 1.4902424600827913, "grad_norm": 0.5044935497801236, "learning_rate": 5e-06, "loss": 0.6091, "step": 630 }, { "epoch": 1.5138971023063275, "grad_norm": 0.611513828523435, "learning_rate": 5e-06, "loss": 0.6084, "step": 640 }, { "epoch": 1.537551744529864, "grad_norm": 0.4284886945389684, "learning_rate": 5e-06, "loss": 0.6161, "step": 650 }, { "epoch": 1.5612063867534003, "grad_norm": 0.7397737868115762, "learning_rate": 5e-06, "loss": 0.611, "step": 660 }, { "epoch": 1.5848610289769367, "grad_norm": 0.45834776518516607, "learning_rate": 5e-06, "loss": 0.6054, "step": 670 }, { "epoch": 1.6085156712004731, "grad_norm": 0.9830107568320281, "learning_rate": 5e-06, "loss": 0.6077, "step": 680 }, { "epoch": 1.6321703134240093, "grad_norm": 0.7316301006660809, "learning_rate": 5e-06, "loss": 0.6066, "step": 690 }, { "epoch": 1.655824955647546, "grad_norm": 0.8713979852654485, "learning_rate": 5e-06, "loss": 0.6112, "step": 700 }, { "epoch": 1.6794795978710821, "grad_norm": 0.6609904307136948, "learning_rate": 5e-06, "loss": 0.6063, "step": 710 }, { "epoch": 1.7031342400946186, "grad_norm": 0.5730385784454821, "learning_rate": 5e-06, "loss": 0.6174, "step": 720 }, { "epoch": 1.726788882318155, "grad_norm": 0.428675321624077, "learning_rate": 5e-06, "loss": 0.6075, "step": 730 }, { "epoch": 1.7504435245416912, "grad_norm": 0.5747845078803645, "learning_rate": 5e-06, "loss": 0.6073, "step": 740 }, { "epoch": 1.7740981667652278, "grad_norm": 0.6019599666582006, "learning_rate": 5e-06, "loss": 0.6074, "step": 750 }, { "epoch": 1.797752808988764, "grad_norm": 0.484871680178572, "learning_rate": 5e-06, "loss": 0.6076, "step": 760 }, { "epoch": 1.8214074512123004, "grad_norm": 0.4801879662753807, "learning_rate": 5e-06, "loss": 0.6046, "step": 770 }, { "epoch": 1.8450620934358368, "grad_norm": 0.44588625446373603, "learning_rate": 5e-06, "loss": 0.6078, "step": 780 }, { "epoch": 1.868716735659373, "grad_norm": 0.43921853360548113, "learning_rate": 5e-06, "loss": 0.6086, "step": 790 }, { "epoch": 1.8923713778829097, "grad_norm": 0.4461189307976923, "learning_rate": 5e-06, "loss": 0.6105, "step": 800 }, { "epoch": 1.9160260201064458, "grad_norm": 0.4949761836327779, "learning_rate": 5e-06, "loss": 0.613, "step": 810 }, { "epoch": 1.9396806623299823, "grad_norm": 0.44062812260467765, "learning_rate": 5e-06, "loss": 0.6049, "step": 820 }, { "epoch": 1.9633353045535187, "grad_norm": 0.4460160929815086, "learning_rate": 5e-06, "loss": 0.613, "step": 830 }, { "epoch": 1.9869899467770549, "grad_norm": 0.5175110899521405, "learning_rate": 5e-06, "loss": 0.6028, "step": 840 }, { "epoch": 1.9988172678888232, "eval_loss": 0.6445377469062805, "eval_runtime": 227.0244, "eval_samples_per_second": 50.158, "eval_steps_per_second": 0.392, "step": 845 }, { "epoch": 2.0106445890005915, "grad_norm": 0.5892375412389526, "learning_rate": 5e-06, "loss": 0.5851, "step": 850 }, { "epoch": 2.0342992312241277, "grad_norm": 0.5097516172118646, "learning_rate": 5e-06, "loss": 0.5643, "step": 860 }, { "epoch": 2.057953873447664, "grad_norm": 0.6536746176311915, "learning_rate": 5e-06, "loss": 0.5592, "step": 870 }, { "epoch": 2.0816085156712005, "grad_norm": 0.47983268810356666, "learning_rate": 5e-06, "loss": 0.5626, "step": 880 }, { "epoch": 2.1052631578947367, "grad_norm": 0.6017282349204336, "learning_rate": 5e-06, "loss": 0.5697, "step": 890 }, { "epoch": 2.1289178001182734, "grad_norm": 0.5728407157654074, "learning_rate": 5e-06, "loss": 0.5632, "step": 900 }, { "epoch": 2.1525724423418096, "grad_norm": 0.5680779384221303, "learning_rate": 5e-06, "loss": 0.5672, "step": 910 }, { "epoch": 2.1762270845653457, "grad_norm": 0.48858908601906337, "learning_rate": 5e-06, "loss": 0.567, "step": 920 }, { "epoch": 2.1998817267888824, "grad_norm": 0.5005707887249943, "learning_rate": 5e-06, "loss": 0.5646, "step": 930 }, { "epoch": 2.2235363690124186, "grad_norm": 0.5829558904651037, "learning_rate": 5e-06, "loss": 0.5648, "step": 940 }, { "epoch": 2.247191011235955, "grad_norm": 0.48798199303667406, "learning_rate": 5e-06, "loss": 0.578, "step": 950 }, { "epoch": 2.2708456534594914, "grad_norm": 0.582446153234459, "learning_rate": 5e-06, "loss": 0.5682, "step": 960 }, { "epoch": 2.2945002956830276, "grad_norm": 0.46970294592756995, "learning_rate": 5e-06, "loss": 0.571, "step": 970 }, { "epoch": 2.3181549379065642, "grad_norm": 0.5759020549520256, "learning_rate": 5e-06, "loss": 0.5647, "step": 980 }, { "epoch": 2.3418095801301004, "grad_norm": 0.549340588982862, "learning_rate": 5e-06, "loss": 0.5755, "step": 990 }, { "epoch": 2.365464222353637, "grad_norm": 0.46429208051701265, "learning_rate": 5e-06, "loss": 0.5648, "step": 1000 }, { "epoch": 2.3891188645771733, "grad_norm": 0.5160254392452897, "learning_rate": 5e-06, "loss": 0.558, "step": 1010 }, { "epoch": 2.4127735068007095, "grad_norm": 0.4799281597192369, "learning_rate": 5e-06, "loss": 0.5627, "step": 1020 }, { "epoch": 2.436428149024246, "grad_norm": 0.5121330286207769, "learning_rate": 5e-06, "loss": 0.5608, "step": 1030 }, { "epoch": 2.4600827912477823, "grad_norm": 0.5841580086447481, "learning_rate": 5e-06, "loss": 0.5693, "step": 1040 }, { "epoch": 2.483737433471319, "grad_norm": 0.557020183414569, "learning_rate": 5e-06, "loss": 0.5726, "step": 1050 }, { "epoch": 2.507392075694855, "grad_norm": 0.6374112998842234, "learning_rate": 5e-06, "loss": 0.5747, "step": 1060 }, { "epoch": 2.5310467179183913, "grad_norm": 0.5343754869995426, "learning_rate": 5e-06, "loss": 0.5607, "step": 1070 }, { "epoch": 2.554701360141928, "grad_norm": 0.5496554790900547, "learning_rate": 5e-06, "loss": 0.5698, "step": 1080 }, { "epoch": 2.578356002365464, "grad_norm": 0.6822773581077988, "learning_rate": 5e-06, "loss": 0.5738, "step": 1090 }, { "epoch": 2.6020106445890008, "grad_norm": 0.49632724788385346, "learning_rate": 5e-06, "loss": 0.5692, "step": 1100 }, { "epoch": 2.625665286812537, "grad_norm": 0.4859614320386073, "learning_rate": 5e-06, "loss": 0.5704, "step": 1110 }, { "epoch": 2.649319929036073, "grad_norm": 0.5005521245693028, "learning_rate": 5e-06, "loss": 0.5721, "step": 1120 }, { "epoch": 2.67297457125961, "grad_norm": 0.5418331476470847, "learning_rate": 5e-06, "loss": 0.5719, "step": 1130 }, { "epoch": 2.696629213483146, "grad_norm": 0.4518235189693759, "learning_rate": 5e-06, "loss": 0.5658, "step": 1140 }, { "epoch": 2.7202838557066826, "grad_norm": 0.45229828628235735, "learning_rate": 5e-06, "loss": 0.5638, "step": 1150 }, { "epoch": 2.743938497930219, "grad_norm": 0.5051148207876189, "learning_rate": 5e-06, "loss": 0.5748, "step": 1160 }, { "epoch": 2.767593140153755, "grad_norm": 0.7455413514573421, "learning_rate": 5e-06, "loss": 0.5797, "step": 1170 }, { "epoch": 2.7912477823772917, "grad_norm": 0.5362145068936747, "learning_rate": 5e-06, "loss": 0.5705, "step": 1180 }, { "epoch": 2.814902424600828, "grad_norm": 0.46118669511344673, "learning_rate": 5e-06, "loss": 0.5653, "step": 1190 }, { "epoch": 2.8385570668243645, "grad_norm": 0.5498761802579338, "learning_rate": 5e-06, "loss": 0.5694, "step": 1200 }, { "epoch": 2.8622117090479007, "grad_norm": 0.5720658060375756, "learning_rate": 5e-06, "loss": 0.5761, "step": 1210 }, { "epoch": 2.885866351271437, "grad_norm": 0.4735883791639776, "learning_rate": 5e-06, "loss": 0.5714, "step": 1220 }, { "epoch": 2.9095209934949735, "grad_norm": 0.6126626053091963, "learning_rate": 5e-06, "loss": 0.5665, "step": 1230 }, { "epoch": 2.9331756357185097, "grad_norm": 0.5724885076669786, "learning_rate": 5e-06, "loss": 0.5666, "step": 1240 }, { "epoch": 2.9568302779420463, "grad_norm": 0.5144727847784881, "learning_rate": 5e-06, "loss": 0.5721, "step": 1250 }, { "epoch": 2.9804849201655825, "grad_norm": 0.4637250585550989, "learning_rate": 5e-06, "loss": 0.5645, "step": 1260 }, { "epoch": 2.9946777054997042, "eval_loss": 0.6453979015350342, "eval_runtime": 226.895, "eval_samples_per_second": 50.186, "eval_steps_per_second": 0.392, "step": 1266 }, { "epoch": 2.9946777054997042, "step": 1266, "total_flos": 2120178393415680.0, "train_loss": 0.6197805719164687, "train_runtime": 38167.6556, "train_samples_per_second": 17.005, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1266, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2120178393415680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }