{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9697529438928654, "eval_steps": 500, "global_step": 525, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018471484645578389, "grad_norm": 0.2646295130252838, "learning_rate": 3.636363636363636e-06, "loss": 0.9117, "step": 1 }, { "epoch": 0.009235742322789195, "grad_norm": 0.29179853200912476, "learning_rate": 1.8181818181818182e-05, "loss": 0.9579, "step": 5 }, { "epoch": 0.01847148464557839, "grad_norm": 0.30512726306915283, "learning_rate": 3.6363636363636364e-05, "loss": 0.9489, "step": 10 }, { "epoch": 0.027707226968367582, "grad_norm": 0.08514489233493805, "learning_rate": 5.4545454545454546e-05, "loss": 0.9274, "step": 15 }, { "epoch": 0.03694296929115678, "grad_norm": 0.07719692587852478, "learning_rate": 7.272727272727273e-05, "loss": 0.9257, "step": 20 }, { "epoch": 0.04617871161394597, "grad_norm": 0.08602738380432129, "learning_rate": 9.090909090909092e-05, "loss": 0.9255, "step": 25 }, { "epoch": 0.055414453936735164, "grad_norm": 0.08664494752883911, "learning_rate": 0.00010909090909090909, "loss": 0.8889, "step": 30 }, { "epoch": 0.06465019625952435, "grad_norm": 0.07322084158658981, "learning_rate": 0.00012727272727272728, "loss": 0.8809, "step": 35 }, { "epoch": 0.07388593858231356, "grad_norm": 0.08361256867647171, "learning_rate": 0.00014545454545454546, "loss": 0.8901, "step": 40 }, { "epoch": 0.08312168090510275, "grad_norm": 0.0712098553776741, "learning_rate": 0.00016363636363636366, "loss": 0.8712, "step": 45 }, { "epoch": 0.09235742322789194, "grad_norm": 0.07038867473602295, "learning_rate": 0.00018181818181818183, "loss": 0.8779, "step": 50 }, { "epoch": 0.10159316555068114, "grad_norm": 0.08133558183908463, "learning_rate": 0.0002, "loss": 0.8549, "step": 55 }, { "epoch": 0.11082890787347033, "grad_norm": 0.07199383527040482, "learning_rate": 0.00019994777247895855, "loss": 0.8655, "step": 60 }, { "epoch": 0.12006465019625952, "grad_norm": 0.08243429660797119, "learning_rate": 0.00019979114447011323, "loss": 0.891, "step": 65 }, { "epoch": 0.1293003925190487, "grad_norm": 0.0855724886059761, "learning_rate": 0.00019953027957931658, "loss": 0.8675, "step": 70 }, { "epoch": 0.1385361348418379, "grad_norm": 0.0847950428724289, "learning_rate": 0.00019916545029310012, "loss": 0.8579, "step": 75 }, { "epoch": 0.1477718771646271, "grad_norm": 0.07739102840423584, "learning_rate": 0.00019869703769404828, "loss": 0.8643, "step": 80 }, { "epoch": 0.1570076194874163, "grad_norm": 0.08714427053928375, "learning_rate": 0.00019812553106273847, "loss": 0.8766, "step": 85 }, { "epoch": 0.1662433618102055, "grad_norm": 0.06926970183849335, "learning_rate": 0.00019745152736666302, "loss": 0.8539, "step": 90 }, { "epoch": 0.1754791041329947, "grad_norm": 0.08098109066486359, "learning_rate": 0.0001966757306366662, "loss": 0.8823, "step": 95 }, { "epoch": 0.18471484645578387, "grad_norm": 0.09138432890176773, "learning_rate": 0.0001957989512315489, "loss": 0.8601, "step": 100 }, { "epoch": 0.19395058877857307, "grad_norm": 0.09067590534687042, "learning_rate": 0.00019482210499160765, "loss": 0.8551, "step": 105 }, { "epoch": 0.20318633110136228, "grad_norm": 0.08557430654764175, "learning_rate": 0.0001937462122819935, "loss": 0.8582, "step": 110 }, { "epoch": 0.21242207342415145, "grad_norm": 0.07583803683519363, "learning_rate": 0.00019257239692688907, "loss": 0.8468, "step": 115 }, { "epoch": 0.22165781574694066, "grad_norm": 0.10376883298158646, "learning_rate": 0.00019130188503561741, "loss": 0.8722, "step": 120 }, { "epoch": 0.23089355806972986, "grad_norm": 0.07388196140527725, "learning_rate": 0.00018993600372190932, "loss": 0.8715, "step": 125 }, { "epoch": 0.24012930039251904, "grad_norm": 0.07768449187278748, "learning_rate": 0.00018847617971766577, "loss": 0.8632, "step": 130 }, { "epoch": 0.24936504271530824, "grad_norm": 0.07902154326438904, "learning_rate": 0.00018692393788266479, "loss": 0.8746, "step": 135 }, { "epoch": 0.2586007850380974, "grad_norm": 0.07420278340578079, "learning_rate": 0.0001852808996117683, "loss": 0.8676, "step": 140 }, { "epoch": 0.26783652736088664, "grad_norm": 0.07562129944562912, "learning_rate": 0.00018354878114129367, "loss": 0.8829, "step": 145 }, { "epoch": 0.2770722696836758, "grad_norm": 0.07764951884746552, "learning_rate": 0.00018172939175631808, "loss": 0.8766, "step": 150 }, { "epoch": 0.286308012006465, "grad_norm": 0.07714657485485077, "learning_rate": 0.0001798246319007893, "loss": 0.8802, "step": 155 }, { "epoch": 0.2955437543292542, "grad_norm": 0.08166080713272095, "learning_rate": 0.00017783649119241602, "loss": 0.8504, "step": 160 }, { "epoch": 0.3047794966520434, "grad_norm": 0.07295581698417664, "learning_rate": 0.0001757670463444118, "loss": 0.8578, "step": 165 }, { "epoch": 0.3140152389748326, "grad_norm": 0.08463213592767715, "learning_rate": 0.00017361845899626355, "loss": 0.8465, "step": 170 }, { "epoch": 0.3232509812976218, "grad_norm": 0.07825674116611481, "learning_rate": 0.00017139297345578994, "loss": 0.8581, "step": 175 }, { "epoch": 0.332486723620411, "grad_norm": 0.08157385140657425, "learning_rate": 0.0001690929143548488, "loss": 0.8598, "step": 180 }, { "epoch": 0.34172246594320016, "grad_norm": 0.08528893440961838, "learning_rate": 0.00016672068422114196, "loss": 0.85, "step": 185 }, { "epoch": 0.3509582082659894, "grad_norm": 0.07343052327632904, "learning_rate": 0.00016427876096865394, "loss": 0.8515, "step": 190 }, { "epoch": 0.36019395058877857, "grad_norm": 0.07675015926361084, "learning_rate": 0.00016176969530934572, "loss": 0.8628, "step": 195 }, { "epoch": 0.36942969291156774, "grad_norm": 0.07422856241464615, "learning_rate": 0.0001591961080888076, "loss": 0.866, "step": 200 }, { "epoch": 0.378665435234357, "grad_norm": 0.0853537917137146, "learning_rate": 0.00015656068754865387, "loss": 0.8652, "step": 205 }, { "epoch": 0.38790117755714615, "grad_norm": 0.07879694551229477, "learning_rate": 0.0001538661865185188, "loss": 0.8613, "step": 210 }, { "epoch": 0.3971369198799353, "grad_norm": 0.0811944380402565, "learning_rate": 0.00015111541954058734, "loss": 0.8723, "step": 215 }, { "epoch": 0.40637266220272455, "grad_norm": 0.08038283884525299, "learning_rate": 0.00014831125992966385, "loss": 0.8709, "step": 220 }, { "epoch": 0.41560840452551373, "grad_norm": 0.07797664403915405, "learning_rate": 0.00014545663677185006, "loss": 0.8583, "step": 225 }, { "epoch": 0.4248441468483029, "grad_norm": 0.08307761698961258, "learning_rate": 0.00014255453186496673, "loss": 0.8467, "step": 230 }, { "epoch": 0.43407988917109214, "grad_norm": 0.07691530138254166, "learning_rate": 0.0001396079766039157, "loss": 0.8435, "step": 235 }, { "epoch": 0.4433156314938813, "grad_norm": 0.09281527251005173, "learning_rate": 0.0001366200488142348, "loss": 0.8605, "step": 240 }, { "epoch": 0.4525513738166705, "grad_norm": 0.07799265533685684, "learning_rate": 0.00013359386953715421, "loss": 0.85, "step": 245 }, { "epoch": 0.4617871161394597, "grad_norm": 0.07675463706254959, "learning_rate": 0.00013053259976951133, "loss": 0.8434, "step": 250 }, { "epoch": 0.4710228584622489, "grad_norm": 0.09029978513717651, "learning_rate": 0.00012743943716193016, "loss": 0.8587, "step": 255 }, { "epoch": 0.48025860078503807, "grad_norm": 0.07997170835733414, "learning_rate": 0.00012431761267871417, "loss": 0.8436, "step": 260 }, { "epoch": 0.4894943431078273, "grad_norm": 0.08595962822437286, "learning_rate": 0.0001211703872229411, "loss": 0.8682, "step": 265 }, { "epoch": 0.4987300854306165, "grad_norm": 0.08949116617441177, "learning_rate": 0.00011800104823028515, "loss": 0.8663, "step": 270 }, { "epoch": 0.5079658277534057, "grad_norm": 0.08509814739227295, "learning_rate": 0.0001148129062351249, "loss": 0.8359, "step": 275 }, { "epoch": 0.5172015700761948, "grad_norm": 0.08007588982582092, "learning_rate": 0.00011160929141252303, "loss": 0.8494, "step": 280 }, { "epoch": 0.5264373123989841, "grad_norm": 0.07660423964262009, "learning_rate": 0.00010839355009969068, "loss": 0.8403, "step": 285 }, { "epoch": 0.5356730547217733, "grad_norm": 0.07779201865196228, "learning_rate": 0.00010516904130056946, "loss": 0.863, "step": 290 }, { "epoch": 0.5449087970445624, "grad_norm": 0.07425861805677414, "learning_rate": 0.00010193913317718244, "loss": 0.8732, "step": 295 }, { "epoch": 0.5541445393673516, "grad_norm": 0.08780544251203537, "learning_rate": 9.870719953141917e-05, "loss": 0.856, "step": 300 }, { "epoch": 0.5633802816901409, "grad_norm": 0.07546891272068024, "learning_rate": 9.547661628092937e-05, "loss": 0.8418, "step": 305 }, { "epoch": 0.57261602401293, "grad_norm": 0.08621185272932053, "learning_rate": 9.225075793280692e-05, "loss": 0.8463, "step": 310 }, { "epoch": 0.5818517663357192, "grad_norm": 0.07618840038776398, "learning_rate": 8.903299405874684e-05, "loss": 0.8257, "step": 315 }, { "epoch": 0.5910875086585085, "grad_norm": 0.07749903202056885, "learning_rate": 8.582668577535797e-05, "loss": 0.8442, "step": 320 }, { "epoch": 0.6003232509812976, "grad_norm": 0.08626076579093933, "learning_rate": 8.263518223330697e-05, "loss": 0.8739, "step": 325 }, { "epoch": 0.6095589933040868, "grad_norm": 0.07912192493677139, "learning_rate": 7.94618171189618e-05, "loss": 0.816, "step": 330 }, { "epoch": 0.618794735626876, "grad_norm": 0.08324088156223297, "learning_rate": 7.630990517218808e-05, "loss": 0.853, "step": 335 }, { "epoch": 0.6280304779496652, "grad_norm": 0.08155480027198792, "learning_rate": 7.318273872393625e-05, "loss": 0.86, "step": 340 }, { "epoch": 0.6372662202724544, "grad_norm": 0.10541233420372009, "learning_rate": 7.008358425723585e-05, "loss": 0.8674, "step": 345 }, { "epoch": 0.6465019625952436, "grad_norm": 0.08856544643640518, "learning_rate": 6.701567899518924e-05, "loss": 0.8542, "step": 350 }, { "epoch": 0.6557377049180327, "grad_norm": 0.0975981131196022, "learning_rate": 6.398222751952899e-05, "loss": 0.8441, "step": 355 }, { "epoch": 0.664973447240822, "grad_norm": 0.0872284546494484, "learning_rate": 6.098639842327052e-05, "loss": 0.8661, "step": 360 }, { "epoch": 0.6742091895636112, "grad_norm": 0.08079314976930618, "learning_rate": 5.80313210009571e-05, "loss": 0.8465, "step": 365 }, { "epoch": 0.6834449318864003, "grad_norm": 0.08326224982738495, "learning_rate": 5.5120081979953785e-05, "loss": 0.8551, "step": 370 }, { "epoch": 0.6926806742091896, "grad_norm": 0.07942940294742584, "learning_rate": 5.22557222962051e-05, "loss": 0.8673, "step": 375 }, { "epoch": 0.7019164165319788, "grad_norm": 0.08273901045322418, "learning_rate": 4.9441233917824106e-05, "loss": 0.8424, "step": 380 }, { "epoch": 0.7111521588547679, "grad_norm": 0.07538026571273804, "learning_rate": 4.66795567198309e-05, "loss": 0.8448, "step": 385 }, { "epoch": 0.7203879011775571, "grad_norm": 0.07920888811349869, "learning_rate": 4.397357541330476e-05, "loss": 0.874, "step": 390 }, { "epoch": 0.7296236435003464, "grad_norm": 0.09400610625743866, "learning_rate": 4.132611653215822e-05, "loss": 0.8487, "step": 395 }, { "epoch": 0.7388593858231355, "grad_norm": 0.08254476636648178, "learning_rate": 3.873994548067972e-05, "loss": 0.836, "step": 400 }, { "epoch": 0.7480951281459247, "grad_norm": 0.08779824525117874, "learning_rate": 3.621776364492939e-05, "loss": 0.8621, "step": 405 }, { "epoch": 0.757330870468714, "grad_norm": 0.08715569227933884, "learning_rate": 3.376220557100523e-05, "loss": 0.8413, "step": 410 }, { "epoch": 0.7665666127915031, "grad_norm": 0.08005767315626144, "learning_rate": 3.137583621312665e-05, "loss": 0.8563, "step": 415 }, { "epoch": 0.7758023551142923, "grad_norm": 0.08362836390733719, "learning_rate": 2.906114825441072e-05, "loss": 0.8431, "step": 420 }, { "epoch": 0.7850380974370815, "grad_norm": 0.08525697141885757, "learning_rate": 2.6820559503138797e-05, "loss": 0.8619, "step": 425 }, { "epoch": 0.7942738397598706, "grad_norm": 0.08668297529220581, "learning_rate": 2.465641036723393e-05, "loss": 0.8525, "step": 430 }, { "epoch": 0.8035095820826599, "grad_norm": 0.08923713862895966, "learning_rate": 2.2570961409586754e-05, "loss": 0.854, "step": 435 }, { "epoch": 0.8127453244054491, "grad_norm": 0.08294524252414703, "learning_rate": 2.0566390986783646e-05, "loss": 0.867, "step": 440 }, { "epoch": 0.8219810667282382, "grad_norm": 0.08329101651906967, "learning_rate": 1.864479297370325e-05, "loss": 0.8454, "step": 445 }, { "epoch": 0.8312168090510275, "grad_norm": 0.08947557955980301, "learning_rate": 1.6808174576358848e-05, "loss": 0.8663, "step": 450 }, { "epoch": 0.8404525513738167, "grad_norm": 0.08388309925794601, "learning_rate": 1.505845423527027e-05, "loss": 0.8663, "step": 455 }, { "epoch": 0.8496882936966058, "grad_norm": 0.07886148244142532, "learning_rate": 1.339745962155613e-05, "loss": 0.8484, "step": 460 }, { "epoch": 0.858924036019395, "grad_norm": 0.08021736145019531, "learning_rate": 1.18269257278392e-05, "loss": 0.8613, "step": 465 }, { "epoch": 0.8681597783421843, "grad_norm": 0.0815872773528099, "learning_rate": 1.0348493055959062e-05, "loss": 0.8361, "step": 470 }, { "epoch": 0.8773955206649734, "grad_norm": 0.08512595295906067, "learning_rate": 8.963705903385345e-06, "loss": 0.8236, "step": 475 }, { "epoch": 0.8866312629877626, "grad_norm": 0.07890893518924713, "learning_rate": 7.674010750120964e-06, "loss": 0.8398, "step": 480 }, { "epoch": 0.8958670053105519, "grad_norm": 0.08390273153781891, "learning_rate": 6.480754747781037e-06, "loss": 0.8392, "step": 485 }, { "epoch": 0.905102747633341, "grad_norm": 0.07941864430904388, "learning_rate": 5.385184312424974e-06, "loss": 0.8344, "step": 490 }, { "epoch": 0.9143384899561302, "grad_norm": 0.0822635293006897, "learning_rate": 4.3884438226120424e-06, "loss": 0.8492, "step": 495 }, { "epoch": 0.9235742322789194, "grad_norm": 0.07574615627527237, "learning_rate": 3.4915744240403558e-06, "loss": 0.8499, "step": 500 }, { "epoch": 0.9328099746017086, "grad_norm": 0.08466410636901855, "learning_rate": 2.6955129420176196e-06, "loss": 0.848, "step": 505 }, { "epoch": 0.9420457169244978, "grad_norm": 0.0831914022564888, "learning_rate": 2.0010909028998827e-06, "loss": 0.8442, "step": 510 }, { "epoch": 0.951281459247287, "grad_norm": 0.08431433886289597, "learning_rate": 1.409033665520354e-06, "loss": 0.8491, "step": 515 }, { "epoch": 0.9605172015700761, "grad_norm": 0.08757560700178146, "learning_rate": 9.199596635154683e-07, "loss": 0.8526, "step": 520 }, { "epoch": 0.9697529438928654, "grad_norm": 0.0839475765824318, "learning_rate": 5.343797593398536e-07, "loss": 0.8663, "step": 525 }, { "epoch": 0.9697529438928654, "step": 525, "total_flos": 2.6262151886631076e+18, "train_loss": 0.0, "train_runtime": 0.0108, "train_samples_per_second": 9612825.336, "train_steps_per_second": 37551.544 } ], "logging_steps": 5, "max_steps": 406, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.6262151886631076e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }