{ "best_metric": 0.4560202658176422, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.11421229209793704, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005710614604896852, "grad_norm": 9.655982971191406, "learning_rate": 2.333333333333333e-06, "loss": 4.9642, "step": 1 }, { "epoch": 0.0005710614604896852, "eval_loss": 0.7650408744812012, "eval_runtime": 593.1149, "eval_samples_per_second": 4.974, "eval_steps_per_second": 1.244, "step": 1 }, { "epoch": 0.0011421229209793704, "grad_norm": 8.363570213317871, "learning_rate": 4.666666666666666e-06, "loss": 5.0461, "step": 2 }, { "epoch": 0.0017131843814690555, "grad_norm": 8.605849266052246, "learning_rate": 7e-06, "loss": 4.6528, "step": 3 }, { "epoch": 0.002284245841958741, "grad_norm": 10.32148551940918, "learning_rate": 9.333333333333333e-06, "loss": 5.4813, "step": 4 }, { "epoch": 0.002855307302448426, "grad_norm": 7.543620586395264, "learning_rate": 1.1666666666666665e-05, "loss": 4.938, "step": 5 }, { "epoch": 0.003426368762938111, "grad_norm": 7.9145426750183105, "learning_rate": 1.4e-05, "loss": 5.5469, "step": 6 }, { "epoch": 0.003997430223427797, "grad_norm": 7.476115703582764, "learning_rate": 1.633333333333333e-05, "loss": 5.4585, "step": 7 }, { "epoch": 0.004568491683917482, "grad_norm": 7.091987133026123, "learning_rate": 1.8666666666666665e-05, "loss": 5.2762, "step": 8 }, { "epoch": 0.005139553144407167, "grad_norm": 5.893141746520996, "learning_rate": 2.1e-05, "loss": 4.9478, "step": 9 }, { "epoch": 0.005710614604896852, "grad_norm": 5.861713886260986, "learning_rate": 2.333333333333333e-05, "loss": 5.3339, "step": 10 }, { "epoch": 0.006281676065386537, "grad_norm": 5.187582015991211, "learning_rate": 2.5666666666666663e-05, "loss": 4.6863, "step": 11 }, { "epoch": 0.006852737525876222, "grad_norm": 5.725977420806885, "learning_rate": 2.8e-05, "loss": 4.2163, "step": 12 }, { "epoch": 0.007423798986365908, "grad_norm": 6.390807628631592, "learning_rate": 3.0333333333333333e-05, "loss": 4.7941, "step": 13 }, { "epoch": 0.007994860446855594, "grad_norm": 6.105741500854492, "learning_rate": 3.266666666666666e-05, "loss": 5.1002, "step": 14 }, { "epoch": 0.008565921907345278, "grad_norm": 6.119107723236084, "learning_rate": 3.5e-05, "loss": 4.9678, "step": 15 }, { "epoch": 0.009136983367834963, "grad_norm": 5.37454891204834, "learning_rate": 3.733333333333333e-05, "loss": 4.6234, "step": 16 }, { "epoch": 0.00970804482832465, "grad_norm": 5.023365497589111, "learning_rate": 3.9666666666666664e-05, "loss": 4.5956, "step": 17 }, { "epoch": 0.010279106288814333, "grad_norm": 5.116684436798096, "learning_rate": 4.2e-05, "loss": 4.4032, "step": 18 }, { "epoch": 0.010850167749304019, "grad_norm": 4.848908424377441, "learning_rate": 4.4333333333333324e-05, "loss": 4.1467, "step": 19 }, { "epoch": 0.011421229209793705, "grad_norm": 5.300320625305176, "learning_rate": 4.666666666666666e-05, "loss": 4.711, "step": 20 }, { "epoch": 0.011992290670283389, "grad_norm": 4.577368259429932, "learning_rate": 4.899999999999999e-05, "loss": 4.1949, "step": 21 }, { "epoch": 0.012563352130773075, "grad_norm": 4.3569560050964355, "learning_rate": 5.1333333333333325e-05, "loss": 4.6698, "step": 22 }, { "epoch": 0.01313441359126276, "grad_norm": 4.732718467712402, "learning_rate": 5.3666666666666666e-05, "loss": 4.4054, "step": 23 }, { "epoch": 0.013705475051752444, "grad_norm": 4.412967681884766, "learning_rate": 5.6e-05, "loss": 4.3697, "step": 24 }, { "epoch": 0.01427653651224213, "grad_norm": 4.235969543457031, "learning_rate": 5.833333333333333e-05, "loss": 4.3441, "step": 25 }, { "epoch": 0.014847597972731816, "grad_norm": 4.274826526641846, "learning_rate": 6.0666666666666666e-05, "loss": 4.6215, "step": 26 }, { "epoch": 0.0154186594332215, "grad_norm": 4.126132488250732, "learning_rate": 6.3e-05, "loss": 4.3444, "step": 27 }, { "epoch": 0.015989720893711187, "grad_norm": 4.711647987365723, "learning_rate": 6.533333333333333e-05, "loss": 4.4501, "step": 28 }, { "epoch": 0.01656078235420087, "grad_norm": 4.457403659820557, "learning_rate": 6.766666666666667e-05, "loss": 4.5348, "step": 29 }, { "epoch": 0.017131843814690555, "grad_norm": 4.318757057189941, "learning_rate": 7e-05, "loss": 4.8379, "step": 30 }, { "epoch": 0.01770290527518024, "grad_norm": 4.192445755004883, "learning_rate": 6.999402376603183e-05, "loss": 4.8049, "step": 31 }, { "epoch": 0.018273966735669927, "grad_norm": 4.142071723937988, "learning_rate": 6.99760971050058e-05, "loss": 4.4266, "step": 32 }, { "epoch": 0.018845028196159613, "grad_norm": 4.6366167068481445, "learning_rate": 6.994622613886018e-05, "loss": 5.603, "step": 33 }, { "epoch": 0.0194160896566493, "grad_norm": 4.616098880767822, "learning_rate": 6.990442106850258e-05, "loss": 5.0995, "step": 34 }, { "epoch": 0.01998715111713898, "grad_norm": 4.354705333709717, "learning_rate": 6.98506961703262e-05, "loss": 4.8896, "step": 35 }, { "epoch": 0.020558212577628666, "grad_norm": 4.386688709259033, "learning_rate": 6.978506979133457e-05, "loss": 4.4294, "step": 36 }, { "epoch": 0.021129274038118352, "grad_norm": 4.3963093757629395, "learning_rate": 6.9707564342876e-05, "loss": 4.2744, "step": 37 }, { "epoch": 0.021700335498608038, "grad_norm": 4.365737438201904, "learning_rate": 6.96182062929901e-05, "loss": 3.9286, "step": 38 }, { "epoch": 0.022271396959097724, "grad_norm": 4.8030571937561035, "learning_rate": 6.951702615736908e-05, "loss": 4.4058, "step": 39 }, { "epoch": 0.02284245841958741, "grad_norm": 5.323755264282227, "learning_rate": 6.940405848893656e-05, "loss": 4.145, "step": 40 }, { "epoch": 0.023413519880077092, "grad_norm": 4.974182605743408, "learning_rate": 6.92793418660478e-05, "loss": 4.5518, "step": 41 }, { "epoch": 0.023984581340566778, "grad_norm": 5.099968910217285, "learning_rate": 6.914291887931528e-05, "loss": 4.0907, "step": 42 }, { "epoch": 0.024555642801056463, "grad_norm": 4.776078224182129, "learning_rate": 6.899483611706398e-05, "loss": 4.1122, "step": 43 }, { "epoch": 0.02512670426154615, "grad_norm": 4.876911640167236, "learning_rate": 6.883514414942155e-05, "loss": 4.1617, "step": 44 }, { "epoch": 0.025697765722035835, "grad_norm": 5.060526371002197, "learning_rate": 6.866389751104867e-05, "loss": 3.7848, "step": 45 }, { "epoch": 0.02626882718252552, "grad_norm": 5.239251136779785, "learning_rate": 6.848115468251542e-05, "loss": 4.0855, "step": 46 }, { "epoch": 0.026839888643015203, "grad_norm": 5.517544269561768, "learning_rate": 6.828697807033038e-05, "loss": 4.1965, "step": 47 }, { "epoch": 0.02741095010350489, "grad_norm": 5.758817672729492, "learning_rate": 6.808143398562868e-05, "loss": 4.0623, "step": 48 }, { "epoch": 0.027982011563994574, "grad_norm": 5.886153697967529, "learning_rate": 6.786459262152698e-05, "loss": 3.7384, "step": 49 }, { "epoch": 0.02855307302448426, "grad_norm": 8.272394180297852, "learning_rate": 6.763652802915244e-05, "loss": 3.7424, "step": 50 }, { "epoch": 0.02855307302448426, "eval_loss": 0.5147433876991272, "eval_runtime": 595.5275, "eval_samples_per_second": 4.954, "eval_steps_per_second": 1.239, "step": 50 }, { "epoch": 0.029124134484973946, "grad_norm": 4.8110432624816895, "learning_rate": 6.739731809235446e-05, "loss": 3.3424, "step": 51 }, { "epoch": 0.02969519594546363, "grad_norm": 4.688547611236572, "learning_rate": 6.71470445011073e-05, "loss": 3.6607, "step": 52 }, { "epoch": 0.030266257405953317, "grad_norm": 4.069744110107422, "learning_rate": 6.688579272361309e-05, "loss": 3.8212, "step": 53 }, { "epoch": 0.030837318866443, "grad_norm": 3.885037422180176, "learning_rate": 6.66136519771145e-05, "loss": 3.7556, "step": 54 }, { "epoch": 0.03140838032693269, "grad_norm": 3.9566140174865723, "learning_rate": 6.633071519742718e-05, "loss": 3.6849, "step": 55 }, { "epoch": 0.031979441787422375, "grad_norm": 4.207897186279297, "learning_rate": 6.603707900720217e-05, "loss": 3.3258, "step": 56 }, { "epoch": 0.03255050324791205, "grad_norm": 4.095637321472168, "learning_rate": 6.573284368292943e-05, "loss": 3.8785, "step": 57 }, { "epoch": 0.03312156470840174, "grad_norm": 3.729644536972046, "learning_rate": 6.541811312069348e-05, "loss": 3.2926, "step": 58 }, { "epoch": 0.033692626168891425, "grad_norm": 4.160499572753906, "learning_rate": 6.509299480069303e-05, "loss": 3.6083, "step": 59 }, { "epoch": 0.03426368762938111, "grad_norm": 3.895054578781128, "learning_rate": 6.47575997505365e-05, "loss": 3.4853, "step": 60 }, { "epoch": 0.034834749089870796, "grad_norm": 3.4832022190093994, "learning_rate": 6.441204250732624e-05, "loss": 3.1942, "step": 61 }, { "epoch": 0.03540581055036048, "grad_norm": 3.5753769874572754, "learning_rate": 6.405644107854427e-05, "loss": 3.6576, "step": 62 }, { "epoch": 0.03597687201085017, "grad_norm": 3.4942517280578613, "learning_rate": 6.369091690175273e-05, "loss": 3.5523, "step": 63 }, { "epoch": 0.036547933471339854, "grad_norm": 3.4585907459259033, "learning_rate": 6.331559480312315e-05, "loss": 3.4847, "step": 64 }, { "epoch": 0.03711899493182954, "grad_norm": 3.7623393535614014, "learning_rate": 6.293060295480838e-05, "loss": 3.9728, "step": 65 }, { "epoch": 0.037690056392319225, "grad_norm": 3.7405989170074463, "learning_rate": 6.25360728311719e-05, "loss": 3.6524, "step": 66 }, { "epoch": 0.03826111785280891, "grad_norm": 3.857524871826172, "learning_rate": 6.213213916388954e-05, "loss": 4.1654, "step": 67 }, { "epoch": 0.0388321793132986, "grad_norm": 3.8793821334838867, "learning_rate": 6.171893989593859e-05, "loss": 3.9822, "step": 68 }, { "epoch": 0.039403240773788276, "grad_norm": 3.9082939624786377, "learning_rate": 6.129661613449057e-05, "loss": 4.042, "step": 69 }, { "epoch": 0.03997430223427796, "grad_norm": 3.849015474319458, "learning_rate": 6.086531210272306e-05, "loss": 3.68, "step": 70 }, { "epoch": 0.04054536369476765, "grad_norm": 3.643329381942749, "learning_rate": 6.042517509056784e-05, "loss": 3.4066, "step": 71 }, { "epoch": 0.04111642515525733, "grad_norm": 3.8051083087921143, "learning_rate": 5.997635540441133e-05, "loss": 3.9113, "step": 72 }, { "epoch": 0.04168748661574702, "grad_norm": 3.7576425075531006, "learning_rate": 5.9519006315765176e-05, "loss": 3.9575, "step": 73 }, { "epoch": 0.042258548076236704, "grad_norm": 4.091893196105957, "learning_rate": 5.9053284008924185e-05, "loss": 4.044, "step": 74 }, { "epoch": 0.04282960953672639, "grad_norm": 3.665947437286377, "learning_rate": 5.85793475276295e-05, "loss": 3.7113, "step": 75 }, { "epoch": 0.043400670997216076, "grad_norm": 3.7637410163879395, "learning_rate": 5.809735872075529e-05, "loss": 3.9625, "step": 76 }, { "epoch": 0.04397173245770576, "grad_norm": 3.848707437515259, "learning_rate": 5.760748218703755e-05, "loss": 4.3046, "step": 77 }, { "epoch": 0.04454279391819545, "grad_norm": 3.8582873344421387, "learning_rate": 5.710988521886378e-05, "loss": 4.0915, "step": 78 }, { "epoch": 0.04511385537868513, "grad_norm": 3.9476304054260254, "learning_rate": 5.660473774514275e-05, "loss": 3.9352, "step": 79 }, { "epoch": 0.04568491683917482, "grad_norm": 3.7469491958618164, "learning_rate": 5.6092212273273975e-05, "loss": 3.8668, "step": 80 }, { "epoch": 0.046255978299664505, "grad_norm": 3.9498250484466553, "learning_rate": 5.557248383023655e-05, "loss": 4.3159, "step": 81 }, { "epoch": 0.046827039760154184, "grad_norm": 3.702901601791382, "learning_rate": 5.5045729902817676e-05, "loss": 4.057, "step": 82 }, { "epoch": 0.04739810122064387, "grad_norm": 4.221633434295654, "learning_rate": 5.4512130377000987e-05, "loss": 4.4756, "step": 83 }, { "epoch": 0.047969162681133555, "grad_norm": 3.9935688972473145, "learning_rate": 5.397186747653573e-05, "loss": 4.9131, "step": 84 }, { "epoch": 0.04854022414162324, "grad_norm": 3.9818077087402344, "learning_rate": 5.342512570070745e-05, "loss": 4.3006, "step": 85 }, { "epoch": 0.04911128560211293, "grad_norm": 3.9484941959381104, "learning_rate": 5.287209176133174e-05, "loss": 3.9452, "step": 86 }, { "epoch": 0.04968234706260261, "grad_norm": 3.9193077087402344, "learning_rate": 5.231295451899226e-05, "loss": 3.8865, "step": 87 }, { "epoch": 0.0502534085230923, "grad_norm": 4.1261305809021, "learning_rate": 5.174790491854502e-05, "loss": 4.026, "step": 88 }, { "epoch": 0.050824469983581984, "grad_norm": 4.0768351554870605, "learning_rate": 5.117713592391096e-05, "loss": 3.7968, "step": 89 }, { "epoch": 0.05139553144407167, "grad_norm": 4.356061935424805, "learning_rate": 5.060084245217884e-05, "loss": 4.048, "step": 90 }, { "epoch": 0.051966592904561355, "grad_norm": 4.245785236358643, "learning_rate": 5.0019221307041306e-05, "loss": 3.5643, "step": 91 }, { "epoch": 0.05253765436505104, "grad_norm": 4.084421634674072, "learning_rate": 4.943247111158662e-05, "loss": 3.5692, "step": 92 }, { "epoch": 0.05310871582554073, "grad_norm": 4.277196407318115, "learning_rate": 4.884079224046898e-05, "loss": 4.2165, "step": 93 }, { "epoch": 0.053679777286030406, "grad_norm": 4.16418981552124, "learning_rate": 4.824438675148086e-05, "loss": 3.3093, "step": 94 }, { "epoch": 0.05425083874652009, "grad_norm": 4.666293144226074, "learning_rate": 4.764345831655036e-05, "loss": 3.5172, "step": 95 }, { "epoch": 0.05482190020700978, "grad_norm": 4.601908206939697, "learning_rate": 4.703821215218748e-05, "loss": 3.3629, "step": 96 }, { "epoch": 0.05539296166749946, "grad_norm": 4.77185583114624, "learning_rate": 4.642885494940291e-05, "loss": 3.5901, "step": 97 }, { "epoch": 0.05596402312798915, "grad_norm": 5.006587028503418, "learning_rate": 4.581559480312316e-05, "loss": 3.2387, "step": 98 }, { "epoch": 0.056535084588478834, "grad_norm": 5.511896133422852, "learning_rate": 4.519864114112636e-05, "loss": 3.5257, "step": 99 }, { "epoch": 0.05710614604896852, "grad_norm": 7.265841007232666, "learning_rate": 4.45782046525229e-05, "loss": 3.6848, "step": 100 }, { "epoch": 0.05710614604896852, "eval_loss": 0.4799676537513733, "eval_runtime": 595.5075, "eval_samples_per_second": 4.954, "eval_steps_per_second": 1.239, "step": 100 }, { "epoch": 0.057677207509458206, "grad_norm": 4.521103382110596, "learning_rate": 4.3954497215805244e-05, "loss": 4.1905, "step": 101 }, { "epoch": 0.05824826896994789, "grad_norm": 3.9009766578674316, "learning_rate": 4.332773182649165e-05, "loss": 3.408, "step": 102 }, { "epoch": 0.05881933043043758, "grad_norm": 3.8264896869659424, "learning_rate": 4.2698122524388405e-05, "loss": 3.433, "step": 103 }, { "epoch": 0.05939039189092726, "grad_norm": 3.7116568088531494, "learning_rate": 4.206588432049535e-05, "loss": 3.5134, "step": 104 }, { "epoch": 0.05996145335141695, "grad_norm": 3.5707197189331055, "learning_rate": 4.143123312357996e-05, "loss": 3.0987, "step": 105 }, { "epoch": 0.060532514811906635, "grad_norm": 3.62729811668396, "learning_rate": 4.079438566644454e-05, "loss": 3.7314, "step": 106 }, { "epoch": 0.061103576272396314, "grad_norm": 3.6416237354278564, "learning_rate": 4.015555943191231e-05, "loss": 3.4506, "step": 107 }, { "epoch": 0.061674637732886, "grad_norm": 3.462261199951172, "learning_rate": 3.9514972578557114e-05, "loss": 3.3004, "step": 108 }, { "epoch": 0.062245699193375685, "grad_norm": 3.664212942123413, "learning_rate": 3.8872843866202525e-05, "loss": 3.3434, "step": 109 }, { "epoch": 0.06281676065386538, "grad_norm": 3.6109557151794434, "learning_rate": 3.8229392581215565e-05, "loss": 3.6783, "step": 110 }, { "epoch": 0.06338782211435506, "grad_norm": 3.6711537837982178, "learning_rate": 3.7584838461620587e-05, "loss": 3.5776, "step": 111 }, { "epoch": 0.06395888357484475, "grad_norm": 3.6779439449310303, "learning_rate": 3.693940162205895e-05, "loss": 3.6493, "step": 112 }, { "epoch": 0.06452994503533442, "grad_norm": 3.770522356033325, "learning_rate": 3.629330247862007e-05, "loss": 3.4767, "step": 113 }, { "epoch": 0.0651010064958241, "grad_norm": 3.7113234996795654, "learning_rate": 3.564676167356954e-05, "loss": 3.3844, "step": 114 }, { "epoch": 0.06567206795631379, "grad_norm": 3.5234830379486084, "learning_rate": 3.5e-05, "loss": 3.3633, "step": 115 }, { "epoch": 0.06624312941680348, "grad_norm": 3.8543918132781982, "learning_rate": 3.435323832643046e-05, "loss": 4.189, "step": 116 }, { "epoch": 0.06681419087729316, "grad_norm": 3.756317138671875, "learning_rate": 3.370669752137993e-05, "loss": 3.4077, "step": 117 }, { "epoch": 0.06738525233778285, "grad_norm": 3.5420141220092773, "learning_rate": 3.306059837794105e-05, "loss": 3.577, "step": 118 }, { "epoch": 0.06795631379827254, "grad_norm": 3.767037868499756, "learning_rate": 3.241516153837941e-05, "loss": 4.1839, "step": 119 }, { "epoch": 0.06852737525876222, "grad_norm": 3.63523268699646, "learning_rate": 3.177060741878443e-05, "loss": 3.8141, "step": 120 }, { "epoch": 0.06909843671925191, "grad_norm": 3.623882532119751, "learning_rate": 3.1127156133797475e-05, "loss": 3.5584, "step": 121 }, { "epoch": 0.06966949817974159, "grad_norm": 3.6835837364196777, "learning_rate": 3.048502742144289e-05, "loss": 3.6744, "step": 122 }, { "epoch": 0.07024055964023128, "grad_norm": 3.5777692794799805, "learning_rate": 2.984444056808768e-05, "loss": 3.8484, "step": 123 }, { "epoch": 0.07081162110072096, "grad_norm": 3.7121081352233887, "learning_rate": 2.9205614333555444e-05, "loss": 3.9062, "step": 124 }, { "epoch": 0.07138268256121065, "grad_norm": 3.627039909362793, "learning_rate": 2.856876687642003e-05, "loss": 3.8178, "step": 125 }, { "epoch": 0.07195374402170034, "grad_norm": 3.5459561347961426, "learning_rate": 2.7934115679504645e-05, "loss": 3.4396, "step": 126 }, { "epoch": 0.07252480548219002, "grad_norm": 3.5370867252349854, "learning_rate": 2.7301877475611606e-05, "loss": 3.5644, "step": 127 }, { "epoch": 0.07309586694267971, "grad_norm": 3.6710925102233887, "learning_rate": 2.667226817350835e-05, "loss": 3.8682, "step": 128 }, { "epoch": 0.0736669284031694, "grad_norm": 3.8747775554656982, "learning_rate": 2.604550278419475e-05, "loss": 4.4182, "step": 129 }, { "epoch": 0.07423798986365908, "grad_norm": 3.7661991119384766, "learning_rate": 2.54217953474771e-05, "loss": 4.1295, "step": 130 }, { "epoch": 0.07480905132414876, "grad_norm": 3.6781368255615234, "learning_rate": 2.4801358858873636e-05, "loss": 4.1132, "step": 131 }, { "epoch": 0.07538011278463845, "grad_norm": 3.6991829872131348, "learning_rate": 2.4184405196876842e-05, "loss": 4.3835, "step": 132 }, { "epoch": 0.07595117424512814, "grad_norm": 3.730729341506958, "learning_rate": 2.3571145050597088e-05, "loss": 4.2776, "step": 133 }, { "epoch": 0.07652223570561782, "grad_norm": 3.956230401992798, "learning_rate": 2.296178784781251e-05, "loss": 4.8423, "step": 134 }, { "epoch": 0.07709329716610751, "grad_norm": 3.769970417022705, "learning_rate": 2.2356541683449646e-05, "loss": 4.1285, "step": 135 }, { "epoch": 0.0776643586265972, "grad_norm": 4.056671619415283, "learning_rate": 2.175561324851914e-05, "loss": 4.1466, "step": 136 }, { "epoch": 0.07823542008708688, "grad_norm": 3.9425039291381836, "learning_rate": 2.1159207759531013e-05, "loss": 3.734, "step": 137 }, { "epoch": 0.07880648154757655, "grad_norm": 3.9520437717437744, "learning_rate": 2.0567528888413382e-05, "loss": 3.4524, "step": 138 }, { "epoch": 0.07937754300806624, "grad_norm": 3.9661831855773926, "learning_rate": 1.9980778692958684e-05, "loss": 3.6408, "step": 139 }, { "epoch": 0.07994860446855592, "grad_norm": 4.106236934661865, "learning_rate": 1.9399157547821162e-05, "loss": 3.5565, "step": 140 }, { "epoch": 0.08051966592904561, "grad_norm": 4.123175621032715, "learning_rate": 1.882286407608904e-05, "loss": 3.8822, "step": 141 }, { "epoch": 0.0810907273895353, "grad_norm": 4.1223859786987305, "learning_rate": 1.825209508145497e-05, "loss": 3.5751, "step": 142 }, { "epoch": 0.08166178885002498, "grad_norm": 4.237730979919434, "learning_rate": 1.7687045481007746e-05, "loss": 3.8547, "step": 143 }, { "epoch": 0.08223285031051467, "grad_norm": 4.241462707519531, "learning_rate": 1.712790823866826e-05, "loss": 3.5782, "step": 144 }, { "epoch": 0.08280391177100435, "grad_norm": 4.347367763519287, "learning_rate": 1.657487429929254e-05, "loss": 3.3161, "step": 145 }, { "epoch": 0.08337497323149404, "grad_norm": 4.680622577667236, "learning_rate": 1.602813252346427e-05, "loss": 3.7808, "step": 146 }, { "epoch": 0.08394603469198372, "grad_norm": 4.457460403442383, "learning_rate": 1.5487869622999004e-05, "loss": 3.2924, "step": 147 }, { "epoch": 0.08451709615247341, "grad_norm": 4.6479172706604, "learning_rate": 1.4954270097182317e-05, "loss": 3.3637, "step": 148 }, { "epoch": 0.0850881576129631, "grad_norm": 5.357884883880615, "learning_rate": 1.4427516169763444e-05, "loss": 3.3677, "step": 149 }, { "epoch": 0.08565921907345278, "grad_norm": 6.834778308868408, "learning_rate": 1.3907787726726029e-05, "loss": 3.6711, "step": 150 }, { "epoch": 0.08565921907345278, "eval_loss": 0.46051260828971863, "eval_runtime": 596.2501, "eval_samples_per_second": 4.948, "eval_steps_per_second": 1.238, "step": 150 }, { "epoch": 0.08623028053394247, "grad_norm": 3.791814088821411, "learning_rate": 1.339526225485725e-05, "loss": 3.4645, "step": 151 }, { "epoch": 0.08680134199443215, "grad_norm": 3.796891212463379, "learning_rate": 1.2890114781136224e-05, "loss": 3.4475, "step": 152 }, { "epoch": 0.08737240345492184, "grad_norm": 3.6403725147247314, "learning_rate": 1.239251781296245e-05, "loss": 3.1833, "step": 153 }, { "epoch": 0.08794346491541152, "grad_norm": 3.6974196434020996, "learning_rate": 1.1902641279244715e-05, "loss": 3.3696, "step": 154 }, { "epoch": 0.08851452637590121, "grad_norm": 3.578291893005371, "learning_rate": 1.1420652472370497e-05, "loss": 3.1136, "step": 155 }, { "epoch": 0.0890855878363909, "grad_norm": 3.4341652393341064, "learning_rate": 1.0946715991075805e-05, "loss": 2.9641, "step": 156 }, { "epoch": 0.08965664929688058, "grad_norm": 3.451934576034546, "learning_rate": 1.0480993684234815e-05, "loss": 3.1792, "step": 157 }, { "epoch": 0.09022771075737027, "grad_norm": 3.3800251483917236, "learning_rate": 1.0023644595588671e-05, "loss": 3.3144, "step": 158 }, { "epoch": 0.09079877221785995, "grad_norm": 3.479318618774414, "learning_rate": 9.57482490943216e-06, "loss": 3.1648, "step": 159 }, { "epoch": 0.09136983367834964, "grad_norm": 3.491649866104126, "learning_rate": 9.134687897276934e-06, "loss": 3.3233, "step": 160 }, { "epoch": 0.09194089513883932, "grad_norm": 3.484065055847168, "learning_rate": 8.703383865509432e-06, "loss": 3.5239, "step": 161 }, { "epoch": 0.09251195659932901, "grad_norm": 3.6290812492370605, "learning_rate": 8.281060104061394e-06, "loss": 3.2998, "step": 162 }, { "epoch": 0.09308301805981868, "grad_norm": 3.6152350902557373, "learning_rate": 7.867860836110453e-06, "loss": 3.7931, "step": 163 }, { "epoch": 0.09365407952030837, "grad_norm": 3.5166842937469482, "learning_rate": 7.463927168828087e-06, "loss": 3.5036, "step": 164 }, { "epoch": 0.09422514098079805, "grad_norm": 3.266004800796509, "learning_rate": 7.069397045191617e-06, "loss": 2.7839, "step": 165 }, { "epoch": 0.09479620244128774, "grad_norm": 3.5984013080596924, "learning_rate": 6.684405196876842e-06, "loss": 3.7792, "step": 166 }, { "epoch": 0.09536726390177742, "grad_norm": 3.4622771739959717, "learning_rate": 6.309083098247264e-06, "loss": 3.4876, "step": 167 }, { "epoch": 0.09593832536226711, "grad_norm": 3.397523880004883, "learning_rate": 5.943558921455733e-06, "loss": 3.202, "step": 168 }, { "epoch": 0.0965093868227568, "grad_norm": 3.561593770980835, "learning_rate": 5.587957492673759e-06, "loss": 3.5857, "step": 169 }, { "epoch": 0.09708044828324648, "grad_norm": 3.7165884971618652, "learning_rate": 5.2424002494635095e-06, "loss": 3.6163, "step": 170 }, { "epoch": 0.09765150974373617, "grad_norm": 3.710353374481201, "learning_rate": 4.9070051993069636e-06, "loss": 3.9078, "step": 171 }, { "epoch": 0.09822257120422585, "grad_norm": 3.53379225730896, "learning_rate": 4.581886879306507e-06, "loss": 3.5316, "step": 172 }, { "epoch": 0.09879363266471554, "grad_norm": 3.5342020988464355, "learning_rate": 4.2671563170705725e-06, "loss": 3.5556, "step": 173 }, { "epoch": 0.09936469412520522, "grad_norm": 3.5446951389312744, "learning_rate": 3.962920992797834e-06, "loss": 3.4027, "step": 174 }, { "epoch": 0.09993575558569491, "grad_norm": 3.7609660625457764, "learning_rate": 3.6692848025728216e-06, "loss": 4.0196, "step": 175 }, { "epoch": 0.1005068170461846, "grad_norm": 3.5949559211730957, "learning_rate": 3.38634802288549e-06, "loss": 3.7018, "step": 176 }, { "epoch": 0.10107787850667428, "grad_norm": 3.4899206161499023, "learning_rate": 3.1142072763869042e-06, "loss": 3.4402, "step": 177 }, { "epoch": 0.10164893996716397, "grad_norm": 3.4958362579345703, "learning_rate": 2.852955498892694e-06, "loss": 3.6609, "step": 178 }, { "epoch": 0.10222000142765365, "grad_norm": 3.7367665767669678, "learning_rate": 2.6026819076455325e-06, "loss": 3.9386, "step": 179 }, { "epoch": 0.10279106288814334, "grad_norm": 3.6101346015930176, "learning_rate": 2.36347197084755e-06, "loss": 3.589, "step": 180 }, { "epoch": 0.10336212434863302, "grad_norm": 3.714442491531372, "learning_rate": 2.1354073784730253e-06, "loss": 3.9742, "step": 181 }, { "epoch": 0.10393318580912271, "grad_norm": 3.8775899410247803, "learning_rate": 1.9185660143713184e-06, "loss": 4.4065, "step": 182 }, { "epoch": 0.1045042472696124, "grad_norm": 4.041444301605225, "learning_rate": 1.7130219296696263e-06, "loss": 4.5485, "step": 183 }, { "epoch": 0.10507530873010208, "grad_norm": 3.8321800231933594, "learning_rate": 1.5188453174845743e-06, "loss": 4.2913, "step": 184 }, { "epoch": 0.10564637019059177, "grad_norm": 3.849231481552124, "learning_rate": 1.3361024889513333e-06, "loss": 4.6328, "step": 185 }, { "epoch": 0.10621743165108145, "grad_norm": 3.960550546646118, "learning_rate": 1.16485585057844e-06, "loss": 4.6204, "step": 186 }, { "epoch": 0.10678849311157114, "grad_norm": 3.7694177627563477, "learning_rate": 1.0051638829360127e-06, "loss": 3.6095, "step": 187 }, { "epoch": 0.10735955457206081, "grad_norm": 3.936204671859741, "learning_rate": 8.570811206847189e-07, "loss": 4.067, "step": 188 }, { "epoch": 0.1079306160325505, "grad_norm": 4.248170852661133, "learning_rate": 7.206581339521939e-07, "loss": 4.3996, "step": 189 }, { "epoch": 0.10850167749304018, "grad_norm": 4.024265766143799, "learning_rate": 5.959415110634375e-07, "loss": 3.9334, "step": 190 }, { "epoch": 0.10907273895352987, "grad_norm": 4.193387508392334, "learning_rate": 4.829738426309099e-07, "loss": 3.9381, "step": 191 }, { "epoch": 0.10964380041401955, "grad_norm": 4.102433204650879, "learning_rate": 3.817937070098914e-07, "loss": 3.7026, "step": 192 }, { "epoch": 0.11021486187450924, "grad_norm": 4.2977142333984375, "learning_rate": 2.9243565712400384e-07, "loss": 3.4472, "step": 193 }, { "epoch": 0.11078592333499893, "grad_norm": 4.350393295288086, "learning_rate": 2.1493020866542365e-07, "loss": 3.5126, "step": 194 }, { "epoch": 0.11135698479548861, "grad_norm": 4.65367317199707, "learning_rate": 1.4930382967379363e-07, "loss": 3.6799, "step": 195 }, { "epoch": 0.1119280462559783, "grad_norm": 4.893271446228027, "learning_rate": 9.557893149741924e-08, "loss": 3.6397, "step": 196 }, { "epoch": 0.11249910771646798, "grad_norm": 4.931285381317139, "learning_rate": 5.377386113981197e-08, "loss": 3.6552, "step": 197 }, { "epoch": 0.11307016917695767, "grad_norm": 5.327000141143799, "learning_rate": 2.3902894994198286e-08, "loss": 3.3647, "step": 198 }, { "epoch": 0.11364123063744735, "grad_norm": 6.226569175720215, "learning_rate": 5.976233968155164e-09, "loss": 3.279, "step": 199 }, { "epoch": 0.11421229209793704, "grad_norm": 8.364439010620117, "learning_rate": 0.0, "loss": 3.8711, "step": 200 }, { "epoch": 0.11421229209793704, "eval_loss": 0.4560202658176422, "eval_runtime": 596.8378, "eval_samples_per_second": 4.943, "eval_steps_per_second": 1.237, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.849985165910344e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }