{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02603918028660458, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002603918028660458, "grad_norm": 0.4500846266746521, "learning_rate": 5.194805194805195e-06, "loss": 1.0381, "step": 10 }, { "epoch": 0.0005207836057320916, "grad_norm": 0.35188010334968567, "learning_rate": 1.038961038961039e-05, "loss": 1.0108, "step": 20 }, { "epoch": 0.0007811754085981374, "grad_norm": 0.2300374060869217, "learning_rate": 1.5584415584415583e-05, "loss": 0.9668, "step": 30 }, { "epoch": 0.0010415672114641832, "grad_norm": 0.16189467906951904, "learning_rate": 2.077922077922078e-05, "loss": 0.918, "step": 40 }, { "epoch": 0.001301959014330229, "grad_norm": 0.18843211233615875, "learning_rate": 2.5974025974025972e-05, "loss": 0.9265, "step": 50 }, { "epoch": 0.0015623508171962747, "grad_norm": 0.20334510505199432, "learning_rate": 3.1168831168831166e-05, "loss": 0.9234, "step": 60 }, { "epoch": 0.0018227426200623205, "grad_norm": 0.1745327115058899, "learning_rate": 3.6363636363636364e-05, "loss": 0.881, "step": 70 }, { "epoch": 0.0020831344229283663, "grad_norm": 0.18667331337928772, "learning_rate": 4.155844155844156e-05, "loss": 0.8592, "step": 80 }, { "epoch": 0.002343526225794412, "grad_norm": 0.1848158985376358, "learning_rate": 4.675324675324675e-05, "loss": 0.8537, "step": 90 }, { "epoch": 0.002603918028660458, "grad_norm": 0.17589879035949707, "learning_rate": 5.1948051948051944e-05, "loss": 0.8518, "step": 100 }, { "epoch": 0.0028643098315265037, "grad_norm": 0.2132624089717865, "learning_rate": 5.714285714285714e-05, "loss": 0.8511, "step": 110 }, { "epoch": 0.0031247016343925495, "grad_norm": 0.23070092499256134, "learning_rate": 6.233766233766233e-05, "loss": 0.7975, "step": 120 }, { "epoch": 0.0033850934372585953, "grad_norm": 0.25368157029151917, "learning_rate": 6.753246753246754e-05, "loss": 0.8134, "step": 130 }, { "epoch": 0.003645485240124641, "grad_norm": 0.22897231578826904, "learning_rate": 7.272727272727273e-05, "loss": 0.8322, "step": 140 }, { "epoch": 0.003905877042990687, "grad_norm": 0.19932536780834198, "learning_rate": 7.792207792207793e-05, "loss": 0.7959, "step": 150 }, { "epoch": 0.004166268845856733, "grad_norm": 0.21011792123317719, "learning_rate": 8.311688311688312e-05, "loss": 0.8102, "step": 160 }, { "epoch": 0.004426660648722778, "grad_norm": 0.20594824850559235, "learning_rate": 8.831168831168831e-05, "loss": 0.8128, "step": 170 }, { "epoch": 0.004687052451588824, "grad_norm": 0.20465536415576935, "learning_rate": 9.35064935064935e-05, "loss": 0.7989, "step": 180 }, { "epoch": 0.00494744425445487, "grad_norm": 0.4109392762184143, "learning_rate": 9.870129870129871e-05, "loss": 0.8108, "step": 190 }, { "epoch": 0.005207836057320916, "grad_norm": 0.4293076694011688, "learning_rate": 0.00010389610389610389, "loss": 0.8101, "step": 200 }, { "epoch": 0.005468227860186962, "grad_norm": 0.31628963351249695, "learning_rate": 0.00010909090909090909, "loss": 0.7989, "step": 210 }, { "epoch": 0.005728619663053007, "grad_norm": 0.24642810225486755, "learning_rate": 0.00011428571428571428, "loss": 0.7751, "step": 220 }, { "epoch": 0.005989011465919053, "grad_norm": 0.3599106967449188, "learning_rate": 0.00011948051948051949, "loss": 0.8063, "step": 230 }, { "epoch": 0.006249403268785099, "grad_norm": 0.17053447663784027, "learning_rate": 0.00012467532467532467, "loss": 0.7751, "step": 240 }, { "epoch": 0.006509795071651145, "grad_norm": 0.17303769290447235, "learning_rate": 0.00012987012987012987, "loss": 0.7883, "step": 250 }, { "epoch": 0.0067701868745171905, "grad_norm": 0.1815861016511917, "learning_rate": 0.00013506493506493507, "loss": 0.788, "step": 260 }, { "epoch": 0.007030578677383236, "grad_norm": 0.24125365912914276, "learning_rate": 0.00014025974025974028, "loss": 0.8018, "step": 270 }, { "epoch": 0.007290970480249282, "grad_norm": 0.19443446397781372, "learning_rate": 0.00014545454545454546, "loss": 0.7908, "step": 280 }, { "epoch": 0.007551362283115328, "grad_norm": 0.17829768359661102, "learning_rate": 0.00015064935064935066, "loss": 0.8033, "step": 290 }, { "epoch": 0.007811754085981374, "grad_norm": 0.19535653293132782, "learning_rate": 0.00015584415584415587, "loss": 0.7997, "step": 300 }, { "epoch": 0.008072145888847419, "grad_norm": 0.19930541515350342, "learning_rate": 0.00016103896103896104, "loss": 0.7945, "step": 310 }, { "epoch": 0.008332537691713465, "grad_norm": 0.2156297266483307, "learning_rate": 0.00016623376623376625, "loss": 0.8018, "step": 320 }, { "epoch": 0.00859292949457951, "grad_norm": 0.1924206018447876, "learning_rate": 0.00017142857142857143, "loss": 0.7746, "step": 330 }, { "epoch": 0.008853321297445557, "grad_norm": 0.2294880747795105, "learning_rate": 0.00017662337662337663, "loss": 0.8152, "step": 340 }, { "epoch": 0.009113713100311602, "grad_norm": 0.16817067563533783, "learning_rate": 0.00018181818181818183, "loss": 0.7972, "step": 350 }, { "epoch": 0.009374104903177648, "grad_norm": 0.18544812500476837, "learning_rate": 0.000187012987012987, "loss": 0.7801, "step": 360 }, { "epoch": 0.009634496706043693, "grad_norm": 0.19597066938877106, "learning_rate": 0.00019220779220779222, "loss": 0.7706, "step": 370 }, { "epoch": 0.00989488850890974, "grad_norm": 0.40291881561279297, "learning_rate": 0.00019740259740259742, "loss": 0.7911, "step": 380 }, { "epoch": 0.010155280311775785, "grad_norm": 0.23841074109077454, "learning_rate": 0.00019999996515752773, "loss": 0.7861, "step": 390 }, { "epoch": 0.010415672114641832, "grad_norm": 0.1675388514995575, "learning_rate": 0.00019999968641789507, "loss": 0.788, "step": 400 }, { "epoch": 0.010676063917507876, "grad_norm": 1.8860758543014526, "learning_rate": 0.0001999991289394067, "loss": 0.7632, "step": 410 }, { "epoch": 0.010936455720373923, "grad_norm": 0.17022117972373962, "learning_rate": 0.00019999829272361654, "loss": 0.784, "step": 420 }, { "epoch": 0.011196847523239968, "grad_norm": 0.21460269391536713, "learning_rate": 0.00019999717777285545, "loss": 0.761, "step": 430 }, { "epoch": 0.011457239326106015, "grad_norm": 0.19413785636425018, "learning_rate": 0.00019999578409023126, "loss": 0.7772, "step": 440 }, { "epoch": 0.01171763112897206, "grad_norm": 0.20223405957221985, "learning_rate": 0.00019999411167962868, "loss": 0.7811, "step": 450 }, { "epoch": 0.011978022931838106, "grad_norm": 0.15166303515434265, "learning_rate": 0.00019999216054570942, "loss": 0.7709, "step": 460 }, { "epoch": 0.012238414734704151, "grad_norm": 0.16307081282138824, "learning_rate": 0.00019998993069391205, "loss": 0.7811, "step": 470 }, { "epoch": 0.012498806537570198, "grad_norm": 0.15996049344539642, "learning_rate": 0.00019998742213045206, "loss": 0.7599, "step": 480 }, { "epoch": 0.012759198340436243, "grad_norm": 0.17560279369354248, "learning_rate": 0.00019998463486232179, "loss": 0.7572, "step": 490 }, { "epoch": 0.01301959014330229, "grad_norm": 0.17571642994880676, "learning_rate": 0.0001999815688972905, "loss": 0.7643, "step": 500 }, { "epoch": 0.013279981946168334, "grad_norm": 0.17719799280166626, "learning_rate": 0.00019997822424390422, "loss": 0.7923, "step": 510 }, { "epoch": 0.013540373749034381, "grad_norm": 0.19846616685390472, "learning_rate": 0.00019997460091148586, "loss": 0.7674, "step": 520 }, { "epoch": 0.013800765551900426, "grad_norm": 0.2715558111667633, "learning_rate": 0.00019997069891013503, "loss": 0.7421, "step": 530 }, { "epoch": 0.014061157354766473, "grad_norm": 0.1725197583436966, "learning_rate": 0.00019996651825072826, "loss": 0.7663, "step": 540 }, { "epoch": 0.014321549157632518, "grad_norm": 0.15060502290725708, "learning_rate": 0.00019996205894491856, "loss": 0.7794, "step": 550 }, { "epoch": 0.014581940960498564, "grad_norm": 0.16645808517932892, "learning_rate": 0.00019995732100513592, "loss": 0.752, "step": 560 }, { "epoch": 0.014842332763364609, "grad_norm": 0.1736789345741272, "learning_rate": 0.00019995230444458682, "loss": 0.7788, "step": 570 }, { "epoch": 0.015102724566230656, "grad_norm": 0.15416319668293, "learning_rate": 0.0001999470092772544, "loss": 0.7656, "step": 580 }, { "epoch": 0.0153631163690967, "grad_norm": 0.16610187292099, "learning_rate": 0.00019994143551789839, "loss": 0.7676, "step": 590 }, { "epoch": 0.015623508171962747, "grad_norm": 0.15843011438846588, "learning_rate": 0.00019993558318205507, "loss": 0.7746, "step": 600 }, { "epoch": 0.015883899974828794, "grad_norm": 0.26837801933288574, "learning_rate": 0.00019992945228603724, "loss": 0.7617, "step": 610 }, { "epoch": 0.016144291777694837, "grad_norm": 0.15099173784255981, "learning_rate": 0.0001999230428469341, "loss": 0.7601, "step": 620 }, { "epoch": 0.016404683580560884, "grad_norm": 0.15511856973171234, "learning_rate": 0.00019991635488261138, "loss": 0.7647, "step": 630 }, { "epoch": 0.01666507538342693, "grad_norm": 0.14919579029083252, "learning_rate": 0.00019990938841171104, "loss": 0.7692, "step": 640 }, { "epoch": 0.016925467186292977, "grad_norm": 0.15838642418384552, "learning_rate": 0.0001999021434536514, "loss": 0.7763, "step": 650 }, { "epoch": 0.01718585898915902, "grad_norm": 0.15956635773181915, "learning_rate": 0.00019989462002862704, "loss": 0.7598, "step": 660 }, { "epoch": 0.017446250792025067, "grad_norm": 0.1499069333076477, "learning_rate": 0.0001998868181576088, "loss": 0.7626, "step": 670 }, { "epoch": 0.017706642594891114, "grad_norm": 0.2170073390007019, "learning_rate": 0.00019987873786234348, "loss": 0.7569, "step": 680 }, { "epoch": 0.01796703439775716, "grad_norm": 0.17841948568820953, "learning_rate": 0.00019987037916535417, "loss": 0.7494, "step": 690 }, { "epoch": 0.018227426200623204, "grad_norm": 0.2066909372806549, "learning_rate": 0.0001998617420899398, "loss": 0.7609, "step": 700 }, { "epoch": 0.01848781800348925, "grad_norm": 0.17015361785888672, "learning_rate": 0.0001998528266601754, "loss": 0.7761, "step": 710 }, { "epoch": 0.018748209806355297, "grad_norm": 0.22166290879249573, "learning_rate": 0.0001998436329009118, "loss": 0.7573, "step": 720 }, { "epoch": 0.01900860160922134, "grad_norm": 0.15084640681743622, "learning_rate": 0.00019983416083777563, "loss": 0.7775, "step": 730 }, { "epoch": 0.019268993412087387, "grad_norm": 0.17800921201705933, "learning_rate": 0.0001998244104971693, "loss": 0.7359, "step": 740 }, { "epoch": 0.019529385214953433, "grad_norm": 0.17354707419872284, "learning_rate": 0.0001998143819062709, "loss": 0.7415, "step": 750 }, { "epoch": 0.01978977701781948, "grad_norm": 0.16408118605613708, "learning_rate": 0.00019980407509303413, "loss": 0.7708, "step": 760 }, { "epoch": 0.020050168820685523, "grad_norm": 0.16820089519023895, "learning_rate": 0.00019979349008618808, "loss": 0.791, "step": 770 }, { "epoch": 0.02031056062355157, "grad_norm": 0.15958388149738312, "learning_rate": 0.00019978262691523743, "loss": 0.7412, "step": 780 }, { "epoch": 0.020570952426417616, "grad_norm": 0.1646542251110077, "learning_rate": 0.00019977148561046217, "loss": 0.7529, "step": 790 }, { "epoch": 0.020831344229283663, "grad_norm": 0.17032025754451752, "learning_rate": 0.0001997600662029175, "loss": 0.7656, "step": 800 }, { "epoch": 0.021091736032149706, "grad_norm": 0.17189227044582367, "learning_rate": 0.00019974836872443388, "loss": 0.7433, "step": 810 }, { "epoch": 0.021352127835015753, "grad_norm": 0.16334249079227448, "learning_rate": 0.0001997363932076168, "loss": 0.7703, "step": 820 }, { "epoch": 0.0216125196378818, "grad_norm": 0.1676424890756607, "learning_rate": 0.00019972413968584682, "loss": 0.7603, "step": 830 }, { "epoch": 0.021872911440747846, "grad_norm": 0.16826209425926208, "learning_rate": 0.0001997116081932793, "loss": 0.7569, "step": 840 }, { "epoch": 0.02213330324361389, "grad_norm": 0.1876436173915863, "learning_rate": 0.0001996987987648446, "loss": 0.7553, "step": 850 }, { "epoch": 0.022393695046479936, "grad_norm": 0.17252250015735626, "learning_rate": 0.0001996857114362476, "loss": 0.7644, "step": 860 }, { "epoch": 0.022654086849345983, "grad_norm": 0.1632252335548401, "learning_rate": 0.00019967234624396793, "loss": 0.7568, "step": 870 }, { "epoch": 0.02291447865221203, "grad_norm": 0.1818259060382843, "learning_rate": 0.00019965870322525965, "loss": 0.7672, "step": 880 }, { "epoch": 0.023174870455078073, "grad_norm": 0.15418195724487305, "learning_rate": 0.0001996447824181513, "loss": 0.7642, "step": 890 }, { "epoch": 0.02343526225794412, "grad_norm": 0.17383505403995514, "learning_rate": 0.0001996305838614457, "loss": 0.7607, "step": 900 }, { "epoch": 0.023695654060810166, "grad_norm": 0.17794272303581238, "learning_rate": 0.00019961610759471984, "loss": 0.7588, "step": 910 }, { "epoch": 0.023956045863676213, "grad_norm": 0.1909121572971344, "learning_rate": 0.00019960135365832486, "loss": 0.7438, "step": 920 }, { "epoch": 0.024216437666542256, "grad_norm": 0.17758873105049133, "learning_rate": 0.00019958632209338587, "loss": 0.7323, "step": 930 }, { "epoch": 0.024476829469408302, "grad_norm": 0.15553662180900574, "learning_rate": 0.00019957101294180174, "loss": 0.7508, "step": 940 }, { "epoch": 0.02473722127227435, "grad_norm": 0.15310749411582947, "learning_rate": 0.00019955542624624522, "loss": 0.7563, "step": 950 }, { "epoch": 0.024997613075140396, "grad_norm": 0.1628728210926056, "learning_rate": 0.00019953956205016256, "loss": 0.7524, "step": 960 }, { "epoch": 0.02525800487800644, "grad_norm": 0.16211454570293427, "learning_rate": 0.00019952342039777362, "loss": 0.7564, "step": 970 }, { "epoch": 0.025518396680872486, "grad_norm": 0.15663012862205505, "learning_rate": 0.00019950700133407163, "loss": 0.7395, "step": 980 }, { "epoch": 0.025778788483738532, "grad_norm": 0.1684863567352295, "learning_rate": 0.00019949030490482296, "loss": 0.753, "step": 990 }, { "epoch": 0.02603918028660458, "grad_norm": 0.1561436653137207, "learning_rate": 0.0001994733311565673, "loss": 0.7409, "step": 1000 } ], "logging_steps": 10, "max_steps": 19202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.6916733763584e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }