{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9993073193257908,
  "eval_steps": 500,
  "global_step": 541,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0018471484645578389,
      "grad_norm": 0.2646295130252838,
      "learning_rate": 3.636363636363636e-06,
      "loss": 0.9117,
      "step": 1
    },
    {
      "epoch": 0.009235742322789195,
      "grad_norm": 0.29179853200912476,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.9579,
      "step": 5
    },
    {
      "epoch": 0.01847148464557839,
      "grad_norm": 0.30512726306915283,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.9489,
      "step": 10
    },
    {
      "epoch": 0.027707226968367582,
      "grad_norm": 0.08514489233493805,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 0.9274,
      "step": 15
    },
    {
      "epoch": 0.03694296929115678,
      "grad_norm": 0.07719692587852478,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.9257,
      "step": 20
    },
    {
      "epoch": 0.04617871161394597,
      "grad_norm": 0.08602738380432129,
      "learning_rate": 9.090909090909092e-05,
      "loss": 0.9255,
      "step": 25
    },
    {
      "epoch": 0.055414453936735164,
      "grad_norm": 0.08664494752883911,
      "learning_rate": 0.00010909090909090909,
      "loss": 0.8889,
      "step": 30
    },
    {
      "epoch": 0.06465019625952435,
      "grad_norm": 0.07322084158658981,
      "learning_rate": 0.00012727272727272728,
      "loss": 0.8809,
      "step": 35
    },
    {
      "epoch": 0.07388593858231356,
      "grad_norm": 0.08361256867647171,
      "learning_rate": 0.00014545454545454546,
      "loss": 0.8901,
      "step": 40
    },
    {
      "epoch": 0.08312168090510275,
      "grad_norm": 0.0712098553776741,
      "learning_rate": 0.00016363636363636366,
      "loss": 0.8712,
      "step": 45
    },
    {
      "epoch": 0.09235742322789194,
      "grad_norm": 0.07038867473602295,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.8779,
      "step": 50
    },
    {
      "epoch": 0.10159316555068114,
      "grad_norm": 0.08133558183908463,
      "learning_rate": 0.0002,
      "loss": 0.8549,
      "step": 55
    },
    {
      "epoch": 0.11082890787347033,
      "grad_norm": 0.07199383527040482,
      "learning_rate": 0.00019994777247895855,
      "loss": 0.8655,
      "step": 60
    },
    {
      "epoch": 0.12006465019625952,
      "grad_norm": 0.08243429660797119,
      "learning_rate": 0.00019979114447011323,
      "loss": 0.891,
      "step": 65
    },
    {
      "epoch": 0.1293003925190487,
      "grad_norm": 0.0855724886059761,
      "learning_rate": 0.00019953027957931658,
      "loss": 0.8675,
      "step": 70
    },
    {
      "epoch": 0.1385361348418379,
      "grad_norm": 0.0847950428724289,
      "learning_rate": 0.00019916545029310012,
      "loss": 0.8579,
      "step": 75
    },
    {
      "epoch": 0.1477718771646271,
      "grad_norm": 0.07739102840423584,
      "learning_rate": 0.00019869703769404828,
      "loss": 0.8643,
      "step": 80
    },
    {
      "epoch": 0.1570076194874163,
      "grad_norm": 0.08714427053928375,
      "learning_rate": 0.00019812553106273847,
      "loss": 0.8766,
      "step": 85
    },
    {
      "epoch": 0.1662433618102055,
      "grad_norm": 0.06926970183849335,
      "learning_rate": 0.00019745152736666302,
      "loss": 0.8539,
      "step": 90
    },
    {
      "epoch": 0.1754791041329947,
      "grad_norm": 0.08098109066486359,
      "learning_rate": 0.0001966757306366662,
      "loss": 0.8823,
      "step": 95
    },
    {
      "epoch": 0.18471484645578387,
      "grad_norm": 0.09138432890176773,
      "learning_rate": 0.0001957989512315489,
      "loss": 0.8601,
      "step": 100
    },
    {
      "epoch": 0.19395058877857307,
      "grad_norm": 0.09067590534687042,
      "learning_rate": 0.00019482210499160765,
      "loss": 0.8551,
      "step": 105
    },
    {
      "epoch": 0.20318633110136228,
      "grad_norm": 0.08557430654764175,
      "learning_rate": 0.0001937462122819935,
      "loss": 0.8582,
      "step": 110
    },
    {
      "epoch": 0.21242207342415145,
      "grad_norm": 0.07583803683519363,
      "learning_rate": 0.00019257239692688907,
      "loss": 0.8468,
      "step": 115
    },
    {
      "epoch": 0.22165781574694066,
      "grad_norm": 0.10376883298158646,
      "learning_rate": 0.00019130188503561741,
      "loss": 0.8722,
      "step": 120
    },
    {
      "epoch": 0.23089355806972986,
      "grad_norm": 0.07388196140527725,
      "learning_rate": 0.00018993600372190932,
      "loss": 0.8715,
      "step": 125
    },
    {
      "epoch": 0.24012930039251904,
      "grad_norm": 0.07768449187278748,
      "learning_rate": 0.00018847617971766577,
      "loss": 0.8632,
      "step": 130
    },
    {
      "epoch": 0.24936504271530824,
      "grad_norm": 0.07902154326438904,
      "learning_rate": 0.00018692393788266479,
      "loss": 0.8746,
      "step": 135
    },
    {
      "epoch": 0.2586007850380974,
      "grad_norm": 0.07420278340578079,
      "learning_rate": 0.0001852808996117683,
      "loss": 0.8676,
      "step": 140
    },
    {
      "epoch": 0.26783652736088664,
      "grad_norm": 0.07562129944562912,
      "learning_rate": 0.00018354878114129367,
      "loss": 0.8829,
      "step": 145
    },
    {
      "epoch": 0.2770722696836758,
      "grad_norm": 0.07764951884746552,
      "learning_rate": 0.00018172939175631808,
      "loss": 0.8766,
      "step": 150
    },
    {
      "epoch": 0.286308012006465,
      "grad_norm": 0.07714657485485077,
      "learning_rate": 0.0001798246319007893,
      "loss": 0.8802,
      "step": 155
    },
    {
      "epoch": 0.2955437543292542,
      "grad_norm": 0.08166080713272095,
      "learning_rate": 0.00017783649119241602,
      "loss": 0.8504,
      "step": 160
    },
    {
      "epoch": 0.3047794966520434,
      "grad_norm": 0.07295581698417664,
      "learning_rate": 0.0001757670463444118,
      "loss": 0.8578,
      "step": 165
    },
    {
      "epoch": 0.3140152389748326,
      "grad_norm": 0.08463213592767715,
      "learning_rate": 0.00017361845899626355,
      "loss": 0.8465,
      "step": 170
    },
    {
      "epoch": 0.3232509812976218,
      "grad_norm": 0.07825674116611481,
      "learning_rate": 0.00017139297345578994,
      "loss": 0.8581,
      "step": 175
    },
    {
      "epoch": 0.332486723620411,
      "grad_norm": 0.08157385140657425,
      "learning_rate": 0.0001690929143548488,
      "loss": 0.8598,
      "step": 180
    },
    {
      "epoch": 0.34172246594320016,
      "grad_norm": 0.08528893440961838,
      "learning_rate": 0.00016672068422114196,
      "loss": 0.85,
      "step": 185
    },
    {
      "epoch": 0.3509582082659894,
      "grad_norm": 0.07343052327632904,
      "learning_rate": 0.00016427876096865394,
      "loss": 0.8515,
      "step": 190
    },
    {
      "epoch": 0.36019395058877857,
      "grad_norm": 0.07675015926361084,
      "learning_rate": 0.00016176969530934572,
      "loss": 0.8628,
      "step": 195
    },
    {
      "epoch": 0.36942969291156774,
      "grad_norm": 0.07422856241464615,
      "learning_rate": 0.0001591961080888076,
      "loss": 0.866,
      "step": 200
    },
    {
      "epoch": 0.378665435234357,
      "grad_norm": 0.0853537917137146,
      "learning_rate": 0.00015656068754865387,
      "loss": 0.8652,
      "step": 205
    },
    {
      "epoch": 0.38790117755714615,
      "grad_norm": 0.07879694551229477,
      "learning_rate": 0.0001538661865185188,
      "loss": 0.8613,
      "step": 210
    },
    {
      "epoch": 0.3971369198799353,
      "grad_norm": 0.0811944380402565,
      "learning_rate": 0.00015111541954058734,
      "loss": 0.8723,
      "step": 215
    },
    {
      "epoch": 0.40637266220272455,
      "grad_norm": 0.08038283884525299,
      "learning_rate": 0.00014831125992966385,
      "loss": 0.8709,
      "step": 220
    },
    {
      "epoch": 0.41560840452551373,
      "grad_norm": 0.07797664403915405,
      "learning_rate": 0.00014545663677185006,
      "loss": 0.8583,
      "step": 225
    },
    {
      "epoch": 0.4248441468483029,
      "grad_norm": 0.08307761698961258,
      "learning_rate": 0.00014255453186496673,
      "loss": 0.8467,
      "step": 230
    },
    {
      "epoch": 0.43407988917109214,
      "grad_norm": 0.07691530138254166,
      "learning_rate": 0.0001396079766039157,
      "loss": 0.8435,
      "step": 235
    },
    {
      "epoch": 0.4433156314938813,
      "grad_norm": 0.09281527251005173,
      "learning_rate": 0.0001366200488142348,
      "loss": 0.8605,
      "step": 240
    },
    {
      "epoch": 0.4525513738166705,
      "grad_norm": 0.07799265533685684,
      "learning_rate": 0.00013359386953715421,
      "loss": 0.85,
      "step": 245
    },
    {
      "epoch": 0.4617871161394597,
      "grad_norm": 0.07675463706254959,
      "learning_rate": 0.00013053259976951133,
      "loss": 0.8434,
      "step": 250
    },
    {
      "epoch": 0.4710228584622489,
      "grad_norm": 0.09029978513717651,
      "learning_rate": 0.00012743943716193016,
      "loss": 0.8587,
      "step": 255
    },
    {
      "epoch": 0.48025860078503807,
      "grad_norm": 0.07997170835733414,
      "learning_rate": 0.00012431761267871417,
      "loss": 0.8436,
      "step": 260
    },
    {
      "epoch": 0.4894943431078273,
      "grad_norm": 0.08595962822437286,
      "learning_rate": 0.0001211703872229411,
      "loss": 0.8682,
      "step": 265
    },
    {
      "epoch": 0.4987300854306165,
      "grad_norm": 0.08949116617441177,
      "learning_rate": 0.00011800104823028515,
      "loss": 0.8663,
      "step": 270
    },
    {
      "epoch": 0.5079658277534057,
      "grad_norm": 0.08509814739227295,
      "learning_rate": 0.0001148129062351249,
      "loss": 0.8359,
      "step": 275
    },
    {
      "epoch": 0.5172015700761948,
      "grad_norm": 0.08007588982582092,
      "learning_rate": 0.00011160929141252303,
      "loss": 0.8494,
      "step": 280
    },
    {
      "epoch": 0.5264373123989841,
      "grad_norm": 0.07660423964262009,
      "learning_rate": 0.00010839355009969068,
      "loss": 0.8403,
      "step": 285
    },
    {
      "epoch": 0.5356730547217733,
      "grad_norm": 0.07779201865196228,
      "learning_rate": 0.00010516904130056946,
      "loss": 0.863,
      "step": 290
    },
    {
      "epoch": 0.5449087970445624,
      "grad_norm": 0.07425861805677414,
      "learning_rate": 0.00010193913317718244,
      "loss": 0.8732,
      "step": 295
    },
    {
      "epoch": 0.5541445393673516,
      "grad_norm": 0.08780544251203537,
      "learning_rate": 9.870719953141917e-05,
      "loss": 0.856,
      "step": 300
    },
    {
      "epoch": 0.5633802816901409,
      "grad_norm": 0.07546891272068024,
      "learning_rate": 9.547661628092937e-05,
      "loss": 0.8418,
      "step": 305
    },
    {
      "epoch": 0.57261602401293,
      "grad_norm": 0.08621185272932053,
      "learning_rate": 9.225075793280692e-05,
      "loss": 0.8463,
      "step": 310
    },
    {
      "epoch": 0.5818517663357192,
      "grad_norm": 0.07618840038776398,
      "learning_rate": 8.903299405874684e-05,
      "loss": 0.8257,
      "step": 315
    },
    {
      "epoch": 0.5910875086585085,
      "grad_norm": 0.07749903202056885,
      "learning_rate": 8.582668577535797e-05,
      "loss": 0.8442,
      "step": 320
    },
    {
      "epoch": 0.6003232509812976,
      "grad_norm": 0.08626076579093933,
      "learning_rate": 8.263518223330697e-05,
      "loss": 0.8739,
      "step": 325
    },
    {
      "epoch": 0.6095589933040868,
      "grad_norm": 0.07912192493677139,
      "learning_rate": 7.94618171189618e-05,
      "loss": 0.816,
      "step": 330
    },
    {
      "epoch": 0.618794735626876,
      "grad_norm": 0.08324088156223297,
      "learning_rate": 7.630990517218808e-05,
      "loss": 0.853,
      "step": 335
    },
    {
      "epoch": 0.6280304779496652,
      "grad_norm": 0.08155480027198792,
      "learning_rate": 7.318273872393625e-05,
      "loss": 0.86,
      "step": 340
    },
    {
      "epoch": 0.6372662202724544,
      "grad_norm": 0.10541233420372009,
      "learning_rate": 7.008358425723585e-05,
      "loss": 0.8674,
      "step": 345
    },
    {
      "epoch": 0.6465019625952436,
      "grad_norm": 0.08856544643640518,
      "learning_rate": 6.701567899518924e-05,
      "loss": 0.8542,
      "step": 350
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 0.0975981131196022,
      "learning_rate": 6.398222751952899e-05,
      "loss": 0.8441,
      "step": 355
    },
    {
      "epoch": 0.664973447240822,
      "grad_norm": 0.0872284546494484,
      "learning_rate": 6.098639842327052e-05,
      "loss": 0.8661,
      "step": 360
    },
    {
      "epoch": 0.6742091895636112,
      "grad_norm": 0.08079314976930618,
      "learning_rate": 5.80313210009571e-05,
      "loss": 0.8465,
      "step": 365
    },
    {
      "epoch": 0.6834449318864003,
      "grad_norm": 0.08326224982738495,
      "learning_rate": 5.5120081979953785e-05,
      "loss": 0.8551,
      "step": 370
    },
    {
      "epoch": 0.6926806742091896,
      "grad_norm": 0.07942940294742584,
      "learning_rate": 5.22557222962051e-05,
      "loss": 0.8673,
      "step": 375
    },
    {
      "epoch": 0.7019164165319788,
      "grad_norm": 0.08273901045322418,
      "learning_rate": 4.9441233917824106e-05,
      "loss": 0.8424,
      "step": 380
    },
    {
      "epoch": 0.7111521588547679,
      "grad_norm": 0.07538026571273804,
      "learning_rate": 4.66795567198309e-05,
      "loss": 0.8448,
      "step": 385
    },
    {
      "epoch": 0.7203879011775571,
      "grad_norm": 0.07920888811349869,
      "learning_rate": 4.397357541330476e-05,
      "loss": 0.874,
      "step": 390
    },
    {
      "epoch": 0.7296236435003464,
      "grad_norm": 0.09400610625743866,
      "learning_rate": 4.132611653215822e-05,
      "loss": 0.8487,
      "step": 395
    },
    {
      "epoch": 0.7388593858231355,
      "grad_norm": 0.08254476636648178,
      "learning_rate": 3.873994548067972e-05,
      "loss": 0.836,
      "step": 400
    },
    {
      "epoch": 0.7480951281459247,
      "grad_norm": 0.08779824525117874,
      "learning_rate": 3.621776364492939e-05,
      "loss": 0.8621,
      "step": 405
    },
    {
      "epoch": 0.757330870468714,
      "grad_norm": 0.08715569227933884,
      "learning_rate": 3.376220557100523e-05,
      "loss": 0.8413,
      "step": 410
    },
    {
      "epoch": 0.7665666127915031,
      "grad_norm": 0.08005767315626144,
      "learning_rate": 3.137583621312665e-05,
      "loss": 0.8563,
      "step": 415
    },
    {
      "epoch": 0.7758023551142923,
      "grad_norm": 0.08362836390733719,
      "learning_rate": 2.906114825441072e-05,
      "loss": 0.8431,
      "step": 420
    },
    {
      "epoch": 0.7850380974370815,
      "grad_norm": 0.08525697141885757,
      "learning_rate": 2.6820559503138797e-05,
      "loss": 0.8619,
      "step": 425
    },
    {
      "epoch": 0.7942738397598706,
      "grad_norm": 0.08668297529220581,
      "learning_rate": 2.465641036723393e-05,
      "loss": 0.8525,
      "step": 430
    },
    {
      "epoch": 0.8035095820826599,
      "grad_norm": 0.08923713862895966,
      "learning_rate": 2.2570961409586754e-05,
      "loss": 0.854,
      "step": 435
    },
    {
      "epoch": 0.8127453244054491,
      "grad_norm": 0.08294524252414703,
      "learning_rate": 2.0566390986783646e-05,
      "loss": 0.867,
      "step": 440
    },
    {
      "epoch": 0.8219810667282382,
      "grad_norm": 0.08329101651906967,
      "learning_rate": 1.864479297370325e-05,
      "loss": 0.8454,
      "step": 445
    },
    {
      "epoch": 0.8312168090510275,
      "grad_norm": 0.08947557955980301,
      "learning_rate": 1.6808174576358848e-05,
      "loss": 0.8663,
      "step": 450
    },
    {
      "epoch": 0.8404525513738167,
      "grad_norm": 0.08388309925794601,
      "learning_rate": 1.505845423527027e-05,
      "loss": 0.8663,
      "step": 455
    },
    {
      "epoch": 0.8496882936966058,
      "grad_norm": 0.07886148244142532,
      "learning_rate": 1.339745962155613e-05,
      "loss": 0.8484,
      "step": 460
    },
    {
      "epoch": 0.858924036019395,
      "grad_norm": 0.08021736145019531,
      "learning_rate": 1.18269257278392e-05,
      "loss": 0.8613,
      "step": 465
    },
    {
      "epoch": 0.8681597783421843,
      "grad_norm": 0.0815872773528099,
      "learning_rate": 1.0348493055959062e-05,
      "loss": 0.8361,
      "step": 470
    },
    {
      "epoch": 0.8773955206649734,
      "grad_norm": 0.08512595295906067,
      "learning_rate": 8.963705903385345e-06,
      "loss": 0.8236,
      "step": 475
    },
    {
      "epoch": 0.8866312629877626,
      "grad_norm": 0.07890893518924713,
      "learning_rate": 7.674010750120964e-06,
      "loss": 0.8398,
      "step": 480
    },
    {
      "epoch": 0.8958670053105519,
      "grad_norm": 0.08390273153781891,
      "learning_rate": 6.480754747781037e-06,
      "loss": 0.8392,
      "step": 485
    },
    {
      "epoch": 0.905102747633341,
      "grad_norm": 0.07941864430904388,
      "learning_rate": 5.385184312424974e-06,
      "loss": 0.8344,
      "step": 490
    },
    {
      "epoch": 0.9143384899561302,
      "grad_norm": 0.0822635293006897,
      "learning_rate": 4.3884438226120424e-06,
      "loss": 0.8492,
      "step": 495
    },
    {
      "epoch": 0.9235742322789194,
      "grad_norm": 0.07574615627527237,
      "learning_rate": 3.4915744240403558e-06,
      "loss": 0.8499,
      "step": 500
    },
    {
      "epoch": 0.9328099746017086,
      "grad_norm": 0.08466410636901855,
      "learning_rate": 2.6955129420176196e-06,
      "loss": 0.848,
      "step": 505
    },
    {
      "epoch": 0.9420457169244978,
      "grad_norm": 0.0831914022564888,
      "learning_rate": 2.0010909028998827e-06,
      "loss": 0.8442,
      "step": 510
    },
    {
      "epoch": 0.951281459247287,
      "grad_norm": 0.08431433886289597,
      "learning_rate": 1.409033665520354e-06,
      "loss": 0.8491,
      "step": 515
    },
    {
      "epoch": 0.9605172015700761,
      "grad_norm": 0.08757560700178146,
      "learning_rate": 9.199596635154683e-07,
      "loss": 0.8526,
      "step": 520
    },
    {
      "epoch": 0.9697529438928654,
      "grad_norm": 0.0839475765824318,
      "learning_rate": 5.343797593398536e-07,
      "loss": 0.8663,
      "step": 525
    },
    {
      "epoch": 0.9789886862156546,
      "grad_norm": 0.07811986654996872,
      "learning_rate": 2.5269671064467313e-07,
      "loss": 0.8473,
      "step": 530
    },
    {
      "epoch": 0.9882244285384437,
      "grad_norm": 0.08186525851488113,
      "learning_rate": 7.520474957699586e-08,
      "loss": 0.8295,
      "step": 535
    },
    {
      "epoch": 0.997460170861233,
      "grad_norm": 0.0831933468580246,
      "learning_rate": 2.0892754394208346e-09,
      "loss": 0.8639,
      "step": 540
    },
    {
      "epoch": 0.9993073193257908,
      "eval_loss": 1.0852999687194824,
      "eval_runtime": 177.1887,
      "eval_samples_per_second": 6.518,
      "eval_steps_per_second": 0.547,
      "step": 541
    },
    {
      "epoch": 0.9993073193257908,
      "step": 541,
      "total_flos": 2.705944628841939e+18,
      "train_loss": 0.860630649956229,
      "train_runtime": 42250.2981,
      "train_samples_per_second": 2.46,
      "train_steps_per_second": 0.013
    }
  ],
  "logging_steps": 5,
  "max_steps": 541,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.705944628841939e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}