{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018656716417910446, "grad_norm": 2.403158187866211, "learning_rate": 4.9998282347929784e-05, "loss": 3.3875, "step": 5 }, { "epoch": 0.03731343283582089, "grad_norm": 2.301710367202759, "learning_rate": 4.99931296277454e-05, "loss": 2.9015, "step": 10 }, { "epoch": 0.055970149253731345, "grad_norm": 1.271048665046692, "learning_rate": 4.998454254749331e-05, "loss": 2.6229, "step": 15 }, { "epoch": 0.07462686567164178, "grad_norm": 1.069893717765808, "learning_rate": 4.997252228714279e-05, "loss": 2.3704, "step": 20 }, { "epoch": 0.09328358208955224, "grad_norm": 0.9044906497001648, "learning_rate": 4.9957070498423854e-05, "loss": 2.3782, "step": 25 }, { "epoch": 0.11194029850746269, "grad_norm": 0.9635376334190369, "learning_rate": 4.993818930460026e-05, "loss": 2.3576, "step": 30 }, { "epoch": 0.13059701492537312, "grad_norm": 0.8513979315757751, "learning_rate": 4.9915881300177725e-05, "loss": 2.4603, "step": 35 }, { "epoch": 0.14925373134328357, "grad_norm": 0.845267117023468, "learning_rate": 4.9890149550547454e-05, "loss": 2.2033, "step": 40 }, { "epoch": 0.16791044776119404, "grad_norm": 0.6632418036460876, "learning_rate": 4.98609975915649e-05, "loss": 2.1851, "step": 45 }, { "epoch": 0.1865671641791045, "grad_norm": 0.6857479810714722, "learning_rate": 4.982842942906386e-05, "loss": 2.3592, "step": 50 }, { "epoch": 0.20522388059701493, "grad_norm": 0.7204287648200989, "learning_rate": 4.979244953830608e-05, "loss": 2.1323, "step": 55 }, { "epoch": 0.22388059701492538, "grad_norm": 0.6864420175552368, "learning_rate": 4.9753062863366276e-05, "loss": 2.2138, "step": 60 }, { "epoch": 0.24253731343283583, "grad_norm": 0.7536088228225708, "learning_rate": 4.971027481645274e-05, "loss": 2.2584, "step": 65 }, { "epoch": 0.26119402985074625, "grad_norm": 0.9708526134490967, "learning_rate": 4.966409127716367e-05, "loss": 2.2669, "step": 70 }, { "epoch": 0.2798507462686567, "grad_norm": 0.7516190409660339, "learning_rate": 4.96145185916792e-05, "loss": 2.2133, "step": 75 }, { "epoch": 0.29850746268656714, "grad_norm": 0.7864778637886047, "learning_rate": 4.95615635718894e-05, "loss": 2.1683, "step": 80 }, { "epoch": 0.31716417910447764, "grad_norm": 0.7846741080284119, "learning_rate": 4.950523349445824e-05, "loss": 2.1274, "step": 85 }, { "epoch": 0.3358208955223881, "grad_norm": 0.816838800907135, "learning_rate": 4.944553609982363e-05, "loss": 2.2033, "step": 90 }, { "epoch": 0.35447761194029853, "grad_norm": 0.7661916017532349, "learning_rate": 4.938247959113386e-05, "loss": 2.1492, "step": 95 }, { "epoch": 0.373134328358209, "grad_norm": 0.8964986205101013, "learning_rate": 4.931607263312032e-05, "loss": 2.0862, "step": 100 }, { "epoch": 0.3917910447761194, "grad_norm": 0.8603547215461731, "learning_rate": 4.924632435090696e-05, "loss": 2.1444, "step": 105 }, { "epoch": 0.41044776119402987, "grad_norm": 0.8611045479774475, "learning_rate": 4.917324432875627e-05, "loss": 2.1202, "step": 110 }, { "epoch": 0.4291044776119403, "grad_norm": 0.9499636888504028, "learning_rate": 4.909684260875235e-05, "loss": 2.1285, "step": 115 }, { "epoch": 0.44776119402985076, "grad_norm": 0.8490393161773682, "learning_rate": 4.9017129689421e-05, "loss": 2.236, "step": 120 }, { "epoch": 0.4664179104477612, "grad_norm": 0.9628555178642273, "learning_rate": 4.893411652428712e-05, "loss": 2.1219, "step": 125 }, { "epoch": 0.48507462686567165, "grad_norm": 1.1119599342346191, "learning_rate": 4.8847814520369475e-05, "loss": 2.2537, "step": 130 }, { "epoch": 0.503731343283582, "grad_norm": 0.9489665627479553, "learning_rate": 4.875823553661334e-05, "loss": 2.1018, "step": 135 }, { "epoch": 0.5223880597014925, "grad_norm": 0.9434083700180054, "learning_rate": 4.8665391882260856e-05, "loss": 2.0809, "step": 140 }, { "epoch": 0.5410447761194029, "grad_norm": 0.8856557607650757, "learning_rate": 4.856929631515964e-05, "loss": 2.0807, "step": 145 }, { "epoch": 0.5597014925373134, "grad_norm": 0.8770031929016113, "learning_rate": 4.846996204000967e-05, "loss": 2.0843, "step": 150 }, { "epoch": 0.5783582089552238, "grad_norm": 0.8374930620193481, "learning_rate": 4.8367402706548805e-05, "loss": 2.1869, "step": 155 }, { "epoch": 0.5970149253731343, "grad_norm": 1.0829132795333862, "learning_rate": 4.8261632407677174e-05, "loss": 2.028, "step": 160 }, { "epoch": 0.6156716417910447, "grad_norm": 0.9735206365585327, "learning_rate": 4.815266567752059e-05, "loss": 2.0966, "step": 165 }, { "epoch": 0.6343283582089553, "grad_norm": 1.087944746017456, "learning_rate": 4.804051748943343e-05, "loss": 2.0863, "step": 170 }, { "epoch": 0.6529850746268657, "grad_norm": 0.8176729083061218, "learning_rate": 4.792520325394111e-05, "loss": 2.1135, "step": 175 }, { "epoch": 0.6716417910447762, "grad_norm": 0.9173070788383484, "learning_rate": 4.780673881662242e-05, "loss": 2.0564, "step": 180 }, { "epoch": 0.6902985074626866, "grad_norm": 0.9463202953338623, "learning_rate": 4.7685140455932267e-05, "loss": 2.1579, "step": 185 }, { "epoch": 0.7089552238805971, "grad_norm": 1.149950385093689, "learning_rate": 4.756042488096471e-05, "loss": 2.1447, "step": 190 }, { "epoch": 0.7276119402985075, "grad_norm": 0.940965473651886, "learning_rate": 4.743260922915701e-05, "loss": 2.0823, "step": 195 }, { "epoch": 0.746268656716418, "grad_norm": 0.9384671449661255, "learning_rate": 4.730171106393466e-05, "loss": 2.1445, "step": 200 }, { "epoch": 0.7649253731343284, "grad_norm": 0.8937250971794128, "learning_rate": 4.716774837229804e-05, "loss": 2.014, "step": 205 }, { "epoch": 0.7835820895522388, "grad_norm": 0.8928058743476868, "learning_rate": 4.7030739562350713e-05, "loss": 2.1882, "step": 210 }, { "epoch": 0.8022388059701493, "grad_norm": 1.0239906311035156, "learning_rate": 4.6890703460769955e-05, "loss": 2.1042, "step": 215 }, { "epoch": 0.8208955223880597, "grad_norm": 1.0555064678192139, "learning_rate": 4.674765931021976e-05, "loss": 2.015, "step": 220 }, { "epoch": 0.8395522388059702, "grad_norm": 1.084709882736206, "learning_rate": 4.6601626766706626e-05, "loss": 2.0603, "step": 225 }, { "epoch": 0.8582089552238806, "grad_norm": 0.9265861511230469, "learning_rate": 4.645262589687861e-05, "loss": 2.1006, "step": 230 }, { "epoch": 0.8768656716417911, "grad_norm": 1.0058296918869019, "learning_rate": 4.6300677175267914e-05, "loss": 2.063, "step": 235 }, { "epoch": 0.8955223880597015, "grad_norm": 1.0766576528549194, "learning_rate": 4.614580148147744e-05, "loss": 2.0781, "step": 240 }, { "epoch": 0.914179104477612, "grad_norm": 1.0215730667114258, "learning_rate": 4.598802009731167e-05, "loss": 2.1774, "step": 245 }, { "epoch": 0.9328358208955224, "grad_norm": 0.9870419502258301, "learning_rate": 4.582735470385229e-05, "loss": 1.9636, "step": 250 }, { "epoch": 0.9514925373134329, "grad_norm": 1.1921675205230713, "learning_rate": 4.5663827378478975e-05, "loss": 2.0141, "step": 255 }, { "epoch": 0.9701492537313433, "grad_norm": 1.0618964433670044, "learning_rate": 4.5497460591835615e-05, "loss": 2.0508, "step": 260 }, { "epoch": 0.9888059701492538, "grad_norm": 0.9723111391067505, "learning_rate": 4.532827720474268e-05, "loss": 2.0312, "step": 265 }, { "epoch": 1.007462686567164, "grad_norm": 0.9339023232460022, "learning_rate": 4.515630046505575e-05, "loss": 2.1107, "step": 270 }, { "epoch": 1.0261194029850746, "grad_norm": 1.0588074922561646, "learning_rate": 4.498155400447107e-05, "loss": 2.0963, "step": 275 }, { "epoch": 1.044776119402985, "grad_norm": 1.0709750652313232, "learning_rate": 4.480406183527823e-05, "loss": 2.0359, "step": 280 }, { "epoch": 1.0634328358208955, "grad_norm": 1.2172249555587769, "learning_rate": 4.462384834706058e-05, "loss": 2.1083, "step": 285 }, { "epoch": 1.0820895522388059, "grad_norm": 1.1719626188278198, "learning_rate": 4.4440938303343804e-05, "loss": 2.1259, "step": 290 }, { "epoch": 1.1007462686567164, "grad_norm": 1.051269292831421, "learning_rate": 4.425535683819312e-05, "loss": 2.0901, "step": 295 }, { "epoch": 1.1194029850746268, "grad_norm": 1.3167760372161865, "learning_rate": 4.406712945275955e-05, "loss": 2.0032, "step": 300 }, { "epoch": 1.1380597014925373, "grad_norm": 1.2565367221832275, "learning_rate": 4.387628201177577e-05, "loss": 2.0148, "step": 305 }, { "epoch": 1.1567164179104479, "grad_norm": 1.1141688823699951, "learning_rate": 4.368284074000193e-05, "loss": 2.0217, "step": 310 }, { "epoch": 1.1753731343283582, "grad_norm": 1.1642612218856812, "learning_rate": 4.348683221862212e-05, "loss": 2.0194, "step": 315 }, { "epoch": 1.1940298507462686, "grad_norm": 1.1613104343414307, "learning_rate": 4.328828338159173e-05, "loss": 1.9371, "step": 320 }, { "epoch": 1.212686567164179, "grad_norm": 1.2319557666778564, "learning_rate": 4.3087221511936434e-05, "loss": 2.0227, "step": 325 }, { "epoch": 1.2313432835820897, "grad_norm": 1.2520420551300049, "learning_rate": 4.288367423800319e-05, "loss": 1.9883, "step": 330 }, { "epoch": 1.25, "grad_norm": 1.0452089309692383, "learning_rate": 4.267766952966369e-05, "loss": 1.9912, "step": 335 }, { "epoch": 1.2686567164179103, "grad_norm": 0.9965611100196838, "learning_rate": 4.2469235694471043e-05, "loss": 1.983, "step": 340 }, { "epoch": 1.287313432835821, "grad_norm": 1.0808607339859009, "learning_rate": 4.225840137376993e-05, "loss": 1.9514, "step": 345 }, { "epoch": 1.3059701492537314, "grad_norm": 1.102575659751892, "learning_rate": 4.204519553876095e-05, "loss": 2.0286, "step": 350 }, { "epoch": 1.3246268656716418, "grad_norm": 1.0246608257293701, "learning_rate": 4.1829647486519596e-05, "loss": 2.0265, "step": 355 }, { "epoch": 1.3432835820895521, "grad_norm": 1.0723367929458618, "learning_rate": 4.161178683597054e-05, "loss": 2.0077, "step": 360 }, { "epoch": 1.3619402985074627, "grad_norm": 1.4298617839813232, "learning_rate": 4.139164352381758e-05, "loss": 2.0898, "step": 365 }, { "epoch": 1.3805970149253732, "grad_norm": 1.1437115669250488, "learning_rate": 4.116924780042997e-05, "loss": 2.024, "step": 370 }, { "epoch": 1.3992537313432836, "grad_norm": 1.326556921005249, "learning_rate": 4.094463022568569e-05, "loss": 2.2252, "step": 375 }, { "epoch": 1.417910447761194, "grad_norm": 1.2549344301223755, "learning_rate": 4.071782166477213e-05, "loss": 1.9777, "step": 380 }, { "epoch": 1.4365671641791045, "grad_norm": 1.1226497888565063, "learning_rate": 4.0488853283944806e-05, "loss": 2.0062, "step": 385 }, { "epoch": 1.455223880597015, "grad_norm": 1.2250981330871582, "learning_rate": 4.0257756546244804e-05, "loss": 1.9147, "step": 390 }, { "epoch": 1.4738805970149254, "grad_norm": 1.3552589416503906, "learning_rate": 4.0024563207175316e-05, "loss": 1.9709, "step": 395 }, { "epoch": 1.4925373134328357, "grad_norm": 1.3661599159240723, "learning_rate": 3.978930531033807e-05, "loss": 1.9748, "step": 400 }, { "epoch": 1.5111940298507462, "grad_norm": 1.1794605255126953, "learning_rate": 3.9552015183030136e-05, "loss": 2.0367, "step": 405 }, { "epoch": 1.5298507462686568, "grad_norm": 1.19724440574646, "learning_rate": 3.93127254318018e-05, "loss": 1.9545, "step": 410 }, { "epoch": 1.5485074626865671, "grad_norm": 1.310658574104309, "learning_rate": 3.907146893797599e-05, "loss": 1.9933, "step": 415 }, { "epoch": 1.5671641791044775, "grad_norm": 1.2032736539840698, "learning_rate": 3.882827885312999e-05, "loss": 2.0442, "step": 420 }, { "epoch": 1.585820895522388, "grad_norm": 1.2670124769210815, "learning_rate": 3.858318859454001e-05, "loss": 1.974, "step": 425 }, { "epoch": 1.6044776119402986, "grad_norm": 1.5301685333251953, "learning_rate": 3.833623184058926e-05, "loss": 2.0865, "step": 430 }, { "epoch": 1.623134328358209, "grad_norm": 1.3863707780838013, "learning_rate": 3.808744252614012e-05, "loss": 1.9614, "step": 435 }, { "epoch": 1.6417910447761193, "grad_norm": 1.2591431140899658, "learning_rate": 3.783685483787105e-05, "loss": 1.949, "step": 440 }, { "epoch": 1.6604477611940298, "grad_norm": 1.2093037366867065, "learning_rate": 3.758450320957899e-05, "loss": 1.9618, "step": 445 }, { "epoch": 1.6791044776119404, "grad_norm": 1.1593824625015259, "learning_rate": 3.7330422317447685e-05, "loss": 2.0124, "step": 450 }, { "epoch": 1.6977611940298507, "grad_norm": 1.7013437747955322, "learning_rate": 3.707464707528275e-05, "loss": 2.0613, "step": 455 }, { "epoch": 1.716417910447761, "grad_norm": 1.2550350427627563, "learning_rate": 3.681721262971413e-05, "loss": 2.1354, "step": 460 }, { "epoch": 1.7350746268656716, "grad_norm": 1.1735903024673462, "learning_rate": 3.6558154355366506e-05, "loss": 1.9618, "step": 465 }, { "epoch": 1.7537313432835822, "grad_norm": 1.331148624420166, "learning_rate": 3.6297507849998344e-05, "loss": 1.9245, "step": 470 }, { "epoch": 1.7723880597014925, "grad_norm": 1.3502494096755981, "learning_rate": 3.6035308929610446e-05, "loss": 1.9758, "step": 475 }, { "epoch": 1.7910447761194028, "grad_norm": 1.2406198978424072, "learning_rate": 3.5771593623524265e-05, "loss": 1.9824, "step": 480 }, { "epoch": 1.8097014925373134, "grad_norm": 1.224885106086731, "learning_rate": 3.550639816943111e-05, "loss": 2.069, "step": 485 }, { "epoch": 1.828358208955224, "grad_norm": 1.4666011333465576, "learning_rate": 3.5239759008412666e-05, "loss": 2.0797, "step": 490 }, { "epoch": 1.8470149253731343, "grad_norm": 1.2758076190948486, "learning_rate": 3.497171277993346e-05, "loss": 2.0195, "step": 495 }, { "epoch": 1.8656716417910446, "grad_norm": 1.1991291046142578, "learning_rate": 3.4702296316806244e-05, "loss": 1.9558, "step": 500 }, { "epoch": 1.8843283582089554, "grad_norm": 1.2548415660858154, "learning_rate": 3.443154664013067e-05, "loss": 1.9805, "step": 505 }, { "epoch": 1.9029850746268657, "grad_norm": 1.5407222509384155, "learning_rate": 3.415950095420616e-05, "loss": 1.9152, "step": 510 }, { "epoch": 1.921641791044776, "grad_norm": 1.285704493522644, "learning_rate": 3.3886196641419545e-05, "loss": 2.0442, "step": 515 }, { "epoch": 1.9402985074626866, "grad_norm": 1.377465844154358, "learning_rate": 3.361167125710832e-05, "loss": 2.0537, "step": 520 }, { "epoch": 1.9589552238805972, "grad_norm": 1.186889410018921, "learning_rate": 3.333596252440008e-05, "loss": 1.9798, "step": 525 }, { "epoch": 1.9776119402985075, "grad_norm": 1.4855142831802368, "learning_rate": 3.305910832902884e-05, "loss": 2.0984, "step": 530 }, { "epoch": 1.9962686567164178, "grad_norm": 1.47159743309021, "learning_rate": 3.278114671412917e-05, "loss": 1.9932, "step": 535 }, { "epoch": 2.014925373134328, "grad_norm": 1.4078409671783447, "learning_rate": 3.2502115875008524e-05, "loss": 1.9457, "step": 540 }, { "epoch": 2.033582089552239, "grad_norm": 1.1386340856552124, "learning_rate": 3.222205415389877e-05, "loss": 1.9334, "step": 545 }, { "epoch": 2.0522388059701493, "grad_norm": 1.666084885597229, "learning_rate": 3.1941000034687515e-05, "loss": 1.9716, "step": 550 }, { "epoch": 2.0708955223880596, "grad_norm": 1.3137987852096558, "learning_rate": 3.165899213762995e-05, "loss": 1.9189, "step": 555 }, { "epoch": 2.08955223880597, "grad_norm": 1.2372797727584839, "learning_rate": 3.1376069214041913e-05, "loss": 2.0234, "step": 560 }, { "epoch": 2.1082089552238807, "grad_norm": 1.3149720430374146, "learning_rate": 3.109227014097505e-05, "loss": 2.0271, "step": 565 }, { "epoch": 2.126865671641791, "grad_norm": 1.4162675142288208, "learning_rate": 3.0807633915874584e-05, "loss": 1.8236, "step": 570 }, { "epoch": 2.1455223880597014, "grad_norm": 1.4029136896133423, "learning_rate": 3.052219965122062e-05, "loss": 2.1821, "step": 575 }, { "epoch": 2.1641791044776117, "grad_norm": 1.5424753427505493, "learning_rate": 3.0236006569153617e-05, "loss": 1.9496, "step": 580 }, { "epoch": 2.1828358208955225, "grad_norm": 1.274217963218689, "learning_rate": 2.9949093996084747e-05, "loss": 2.0439, "step": 585 }, { "epoch": 2.201492537313433, "grad_norm": 1.2068248987197876, "learning_rate": 2.9661501357292033e-05, "loss": 2.0805, "step": 590 }, { "epoch": 2.220149253731343, "grad_norm": 1.2352491617202759, "learning_rate": 2.9373268171502777e-05, "loss": 1.975, "step": 595 }, { "epoch": 2.2388059701492535, "grad_norm": 1.3039956092834473, "learning_rate": 2.9084434045463255e-05, "loss": 1.9834, "step": 600 }, { "epoch": 2.2574626865671643, "grad_norm": 1.3400136232376099, "learning_rate": 2.8795038668496222e-05, "loss": 1.84, "step": 605 }, { "epoch": 2.2761194029850746, "grad_norm": 1.458132028579712, "learning_rate": 2.850512180704715e-05, "loss": 1.9518, "step": 610 }, { "epoch": 2.294776119402985, "grad_norm": 1.446595311164856, "learning_rate": 2.821472329921981e-05, "loss": 1.8977, "step": 615 }, { "epoch": 2.3134328358208958, "grad_norm": 1.432244062423706, "learning_rate": 2.792388304930207e-05, "loss": 1.9742, "step": 620 }, { "epoch": 2.332089552238806, "grad_norm": 1.499017596244812, "learning_rate": 2.7632641022282502e-05, "loss": 1.9379, "step": 625 }, { "epoch": 2.3507462686567164, "grad_norm": 1.6504281759262085, "learning_rate": 2.7341037238358774e-05, "loss": 1.9175, "step": 630 }, { "epoch": 2.3694029850746268, "grad_norm": 1.6585911512374878, "learning_rate": 2.704911176743833e-05, "loss": 2.0449, "step": 635 }, { "epoch": 2.388059701492537, "grad_norm": 1.545623779296875, "learning_rate": 2.6756904723632324e-05, "loss": 2.0096, "step": 640 }, { "epoch": 2.406716417910448, "grad_norm": 1.468853235244751, "learning_rate": 2.646445625974347e-05, "loss": 1.878, "step": 645 }, { "epoch": 2.425373134328358, "grad_norm": 1.3598605394363403, "learning_rate": 2.6171806561748502e-05, "loss": 1.9625, "step": 650 }, { "epoch": 2.4440298507462686, "grad_norm": 1.3197077512741089, "learning_rate": 2.5878995843276204e-05, "loss": 1.9375, "step": 655 }, { "epoch": 2.4626865671641793, "grad_norm": 1.5469880104064941, "learning_rate": 2.5586064340081516e-05, "loss": 1.8402, "step": 660 }, { "epoch": 2.4813432835820897, "grad_norm": 1.4435440301895142, "learning_rate": 2.529305230451666e-05, "loss": 1.8795, "step": 665 }, { "epoch": 2.5, "grad_norm": 1.505194067955017, "learning_rate": 2.5e-05, "loss": 1.9938, "step": 670 }, { "epoch": 2.5186567164179103, "grad_norm": 1.3251738548278809, "learning_rate": 2.4706947695483348e-05, "loss": 1.956, "step": 675 }, { "epoch": 2.5373134328358207, "grad_norm": 1.4197183847427368, "learning_rate": 2.441393565991849e-05, "loss": 1.906, "step": 680 }, { "epoch": 2.5559701492537314, "grad_norm": 1.4905989170074463, "learning_rate": 2.4121004156723802e-05, "loss": 1.9073, "step": 685 }, { "epoch": 2.574626865671642, "grad_norm": 1.3891818523406982, "learning_rate": 2.3828193438251497e-05, "loss": 2.1399, "step": 690 }, { "epoch": 2.593283582089552, "grad_norm": 1.6372982263565063, "learning_rate": 2.3535543740256536e-05, "loss": 1.873, "step": 695 }, { "epoch": 2.611940298507463, "grad_norm": 1.5683703422546387, "learning_rate": 2.3243095276367685e-05, "loss": 1.8899, "step": 700 }, { "epoch": 2.6305970149253732, "grad_norm": 1.585425615310669, "learning_rate": 2.2950888232561672e-05, "loss": 2.0511, "step": 705 }, { "epoch": 2.6492537313432836, "grad_norm": 1.3682692050933838, "learning_rate": 2.2658962761641232e-05, "loss": 2.0364, "step": 710 }, { "epoch": 2.667910447761194, "grad_norm": 1.7755306959152222, "learning_rate": 2.23673589777175e-05, "loss": 2.0033, "step": 715 }, { "epoch": 2.6865671641791042, "grad_norm": 1.4118067026138306, "learning_rate": 2.207611695069794e-05, "loss": 2.102, "step": 720 }, { "epoch": 2.705223880597015, "grad_norm": 1.5786772966384888, "learning_rate": 2.17852767007802e-05, "loss": 1.9894, "step": 725 }, { "epoch": 2.7238805970149254, "grad_norm": 1.4233230352401733, "learning_rate": 2.1494878192952855e-05, "loss": 1.9355, "step": 730 }, { "epoch": 2.7425373134328357, "grad_norm": 1.5830904245376587, "learning_rate": 2.1204961331503787e-05, "loss": 1.9399, "step": 735 }, { "epoch": 2.7611940298507465, "grad_norm": 1.2974706888198853, "learning_rate": 2.0915565954536744e-05, "loss": 1.9814, "step": 740 }, { "epoch": 2.779850746268657, "grad_norm": 1.2366008758544922, "learning_rate": 2.0626731828497225e-05, "loss": 1.9275, "step": 745 }, { "epoch": 2.798507462686567, "grad_norm": 1.5165388584136963, "learning_rate": 2.0338498642707977e-05, "loss": 1.9444, "step": 750 }, { "epoch": 2.8171641791044775, "grad_norm": 1.429136037826538, "learning_rate": 2.005090600391526e-05, "loss": 1.9831, "step": 755 }, { "epoch": 2.835820895522388, "grad_norm": 1.4274283647537231, "learning_rate": 1.9763993430846395e-05, "loss": 2.0005, "step": 760 }, { "epoch": 2.8544776119402986, "grad_norm": 1.502812147140503, "learning_rate": 1.947780034877938e-05, "loss": 2.0224, "step": 765 }, { "epoch": 2.873134328358209, "grad_norm": 1.556489109992981, "learning_rate": 1.9192366084125425e-05, "loss": 1.9519, "step": 770 }, { "epoch": 2.8917910447761193, "grad_norm": 1.467826008796692, "learning_rate": 1.890772985902496e-05, "loss": 1.9947, "step": 775 }, { "epoch": 2.91044776119403, "grad_norm": 1.6837282180786133, "learning_rate": 1.8623930785958092e-05, "loss": 1.9335, "step": 780 }, { "epoch": 2.9291044776119404, "grad_norm": 1.446560025215149, "learning_rate": 1.8341007862370056e-05, "loss": 1.9258, "step": 785 }, { "epoch": 2.9477611940298507, "grad_norm": 1.453008770942688, "learning_rate": 1.8058999965312484e-05, "loss": 1.9039, "step": 790 }, { "epoch": 2.966417910447761, "grad_norm": 1.3427950143814087, "learning_rate": 1.777794584610124e-05, "loss": 1.8156, "step": 795 }, { "epoch": 2.9850746268656714, "grad_norm": 1.7210839986801147, "learning_rate": 1.749788412499149e-05, "loss": 2.0007, "step": 800 }, { "epoch": 3.003731343283582, "grad_norm": 1.8247441053390503, "learning_rate": 1.721885328587083e-05, "loss": 1.8995, "step": 805 }, { "epoch": 3.0223880597014925, "grad_norm": 1.3744760751724243, "learning_rate": 1.694089167097116e-05, "loss": 1.9604, "step": 810 }, { "epoch": 3.041044776119403, "grad_norm": 1.1527031660079956, "learning_rate": 1.6664037475599923e-05, "loss": 1.8479, "step": 815 }, { "epoch": 3.0597014925373136, "grad_norm": 1.412294626235962, "learning_rate": 1.638832874289168e-05, "loss": 1.9622, "step": 820 }, { "epoch": 3.078358208955224, "grad_norm": 1.5206471681594849, "learning_rate": 1.611380335858047e-05, "loss": 1.8965, "step": 825 }, { "epoch": 3.0970149253731343, "grad_norm": 1.426445484161377, "learning_rate": 1.5840499045793843e-05, "loss": 1.9118, "step": 830 }, { "epoch": 3.1156716417910446, "grad_norm": 1.556396245956421, "learning_rate": 1.5568453359869334e-05, "loss": 1.8189, "step": 835 }, { "epoch": 3.1343283582089554, "grad_norm": 1.5185908079147339, "learning_rate": 1.5297703683193752e-05, "loss": 1.9363, "step": 840 }, { "epoch": 3.1529850746268657, "grad_norm": 1.4425839185714722, "learning_rate": 1.502828722006655e-05, "loss": 1.9708, "step": 845 }, { "epoch": 3.171641791044776, "grad_norm": 1.6175637245178223, "learning_rate": 1.4760240991587337e-05, "loss": 1.9008, "step": 850 }, { "epoch": 3.1902985074626864, "grad_norm": 1.5075782537460327, "learning_rate": 1.4493601830568887e-05, "loss": 1.9626, "step": 855 }, { "epoch": 3.208955223880597, "grad_norm": 1.7610998153686523, "learning_rate": 1.4228406376475742e-05, "loss": 1.9749, "step": 860 }, { "epoch": 3.2276119402985075, "grad_norm": 1.538076400756836, "learning_rate": 1.396469107038956e-05, "loss": 1.9565, "step": 865 }, { "epoch": 3.246268656716418, "grad_norm": 1.4104888439178467, "learning_rate": 1.3702492150001659e-05, "loss": 1.9042, "step": 870 }, { "epoch": 3.264925373134328, "grad_norm": 1.5483851432800293, "learning_rate": 1.34418456446335e-05, "loss": 1.8595, "step": 875 }, { "epoch": 3.283582089552239, "grad_norm": 1.8045192956924438, "learning_rate": 1.3182787370285865e-05, "loss": 1.8968, "step": 880 }, { "epoch": 3.3022388059701493, "grad_norm": 1.5665298700332642, "learning_rate": 1.292535292471726e-05, "loss": 1.8853, "step": 885 }, { "epoch": 3.3208955223880596, "grad_norm": 1.4902681112289429, "learning_rate": 1.2669577682552319e-05, "loss": 1.8916, "step": 890 }, { "epoch": 3.33955223880597, "grad_norm": 1.3823623657226562, "learning_rate": 1.2415496790421011e-05, "loss": 1.8614, "step": 895 }, { "epoch": 3.3582089552238807, "grad_norm": 1.4400016069412231, "learning_rate": 1.2163145162128947e-05, "loss": 1.9052, "step": 900 }, { "epoch": 3.376865671641791, "grad_norm": 1.7787601947784424, "learning_rate": 1.1912557473859895e-05, "loss": 2.0061, "step": 905 }, { "epoch": 3.3955223880597014, "grad_norm": 1.5302358865737915, "learning_rate": 1.1663768159410748e-05, "loss": 1.9656, "step": 910 }, { "epoch": 3.4141791044776117, "grad_norm": 1.6571131944656372, "learning_rate": 1.1416811405459993e-05, "loss": 1.9289, "step": 915 }, { "epoch": 3.4328358208955225, "grad_norm": 1.8324801921844482, "learning_rate": 1.1171721146870015e-05, "loss": 1.8982, "step": 920 }, { "epoch": 3.451492537313433, "grad_norm": 1.5971417427062988, "learning_rate": 1.0928531062024017e-05, "loss": 1.9105, "step": 925 }, { "epoch": 3.470149253731343, "grad_norm": 1.5357367992401123, "learning_rate": 1.0687274568198208e-05, "loss": 1.9997, "step": 930 }, { "epoch": 3.4888059701492535, "grad_norm": 1.6085304021835327, "learning_rate": 1.0447984816969874e-05, "loss": 1.9139, "step": 935 }, { "epoch": 3.5074626865671643, "grad_norm": 1.3676837682724, "learning_rate": 1.021069468966194e-05, "loss": 1.9389, "step": 940 }, { "epoch": 3.5261194029850746, "grad_norm": 1.6692901849746704, "learning_rate": 9.975436792824691e-06, "loss": 1.835, "step": 945 }, { "epoch": 3.544776119402985, "grad_norm": 1.579232931137085, "learning_rate": 9.742243453755202e-06, "loss": 1.822, "step": 950 }, { "epoch": 3.5634328358208958, "grad_norm": 1.587336778640747, "learning_rate": 9.5111467160552e-06, "loss": 1.9564, "step": 955 }, { "epoch": 3.582089552238806, "grad_norm": 1.6467562913894653, "learning_rate": 9.282178335227884e-06, "loss": 1.9029, "step": 960 }, { "epoch": 3.6007462686567164, "grad_norm": 1.3579665422439575, "learning_rate": 9.05536977431431e-06, "loss": 2.0225, "step": 965 }, { "epoch": 3.6194029850746268, "grad_norm": 1.641695261001587, "learning_rate": 8.830752199570033e-06, "loss": 1.939, "step": 970 }, { "epoch": 3.638059701492537, "grad_norm": 1.5209190845489502, "learning_rate": 8.608356476182424e-06, "loss": 2.01, "step": 975 }, { "epoch": 3.656716417910448, "grad_norm": 1.769853115081787, "learning_rate": 8.38821316402946e-06, "loss": 1.9972, "step": 980 }, { "epoch": 3.675373134328358, "grad_norm": 1.7627779245376587, "learning_rate": 8.170352513480408e-06, "loss": 1.8508, "step": 985 }, { "epoch": 3.6940298507462686, "grad_norm": 1.8104168176651, "learning_rate": 7.954804461239053e-06, "loss": 1.994, "step": 990 }, { "epoch": 3.7126865671641793, "grad_norm": 1.6434147357940674, "learning_rate": 7.741598626230079e-06, "loss": 1.9354, "step": 995 }, { "epoch": 3.7313432835820897, "grad_norm": 1.871103286743164, "learning_rate": 7.530764305528959e-06, "loss": 2.004, "step": 1000 }, { "epoch": 3.75, "grad_norm": 1.7736434936523438, "learning_rate": 7.3223304703363135e-06, "loss": 2.0723, "step": 1005 }, { "epoch": 3.7686567164179103, "grad_norm": 1.3735415935516357, "learning_rate": 7.116325761996817e-06, "loss": 1.9994, "step": 1010 }, { "epoch": 3.7873134328358207, "grad_norm": 1.5601403713226318, "learning_rate": 6.91277848806356e-06, "loss": 2.0251, "step": 1015 }, { "epoch": 3.8059701492537314, "grad_norm": 1.7761123180389404, "learning_rate": 6.711716618408281e-06, "loss": 1.9377, "step": 1020 }, { "epoch": 3.824626865671642, "grad_norm": 1.8018087148666382, "learning_rate": 6.513167781377885e-06, "loss": 1.9374, "step": 1025 }, { "epoch": 3.843283582089552, "grad_norm": 1.8684526681900024, "learning_rate": 6.317159259998073e-06, "loss": 1.8859, "step": 1030 }, { "epoch": 3.861940298507463, "grad_norm": 1.4688338041305542, "learning_rate": 6.123717988224237e-06, "loss": 1.8078, "step": 1035 }, { "epoch": 3.8805970149253732, "grad_norm": 1.566017508506775, "learning_rate": 5.932870547240454e-06, "loss": 1.82, "step": 1040 }, { "epoch": 3.8992537313432836, "grad_norm": 1.6024701595306396, "learning_rate": 5.74464316180689e-06, "loss": 1.9388, "step": 1045 }, { "epoch": 3.917910447761194, "grad_norm": 1.615678310394287, "learning_rate": 5.559061696656198e-06, "loss": 1.8754, "step": 1050 }, { "epoch": 3.9365671641791042, "grad_norm": 1.7218761444091797, "learning_rate": 5.37615165293942e-06, "loss": 1.8457, "step": 1055 }, { "epoch": 3.955223880597015, "grad_norm": 1.5426040887832642, "learning_rate": 5.1959381647217666e-06, "loss": 1.944, "step": 1060 }, { "epoch": 3.9738805970149254, "grad_norm": 1.607737421989441, "learning_rate": 5.018445995528931e-06, "loss": 1.8135, "step": 1065 }, { "epoch": 3.9925373134328357, "grad_norm": 1.6999655961990356, "learning_rate": 4.843699534944257e-06, "loss": 1.8664, "step": 1070 }, { "epoch": 4.0111940298507465, "grad_norm": 1.7595158815383911, "learning_rate": 4.671722795257327e-06, "loss": 1.8825, "step": 1075 }, { "epoch": 4.029850746268656, "grad_norm": 1.6721343994140625, "learning_rate": 4.502539408164386e-06, "loss": 1.7858, "step": 1080 }, { "epoch": 4.048507462686567, "grad_norm": 1.8451169729232788, "learning_rate": 4.336172621521034e-06, "loss": 1.9469, "step": 1085 }, { "epoch": 4.067164179104478, "grad_norm": 1.5332179069519043, "learning_rate": 4.1726452961477146e-06, "loss": 1.8153, "step": 1090 }, { "epoch": 4.085820895522388, "grad_norm": 1.5114800930023193, "learning_rate": 4.01197990268834e-06, "loss": 1.92, "step": 1095 }, { "epoch": 4.104477611940299, "grad_norm": 1.642686367034912, "learning_rate": 3.8541985185225645e-06, "loss": 1.9352, "step": 1100 }, { "epoch": 4.123134328358209, "grad_norm": 2.106177568435669, "learning_rate": 3.6993228247320877e-06, "loss": 1.9699, "step": 1105 }, { "epoch": 4.141791044776119, "grad_norm": 1.5525238513946533, "learning_rate": 3.547374103121398e-06, "loss": 1.7852, "step": 1110 }, { "epoch": 4.16044776119403, "grad_norm": 1.49571692943573, "learning_rate": 3.398373233293378e-06, "loss": 1.8704, "step": 1115 }, { "epoch": 4.17910447761194, "grad_norm": 1.4679092168807983, "learning_rate": 3.252340689780245e-06, "loss": 1.8484, "step": 1120 }, { "epoch": 4.197761194029851, "grad_norm": 1.7375657558441162, "learning_rate": 3.1092965392300417e-06, "loss": 1.884, "step": 1125 }, { "epoch": 4.2164179104477615, "grad_norm": 1.5738627910614014, "learning_rate": 2.969260437649293e-06, "loss": 1.9437, "step": 1130 }, { "epoch": 4.235074626865671, "grad_norm": 1.9191651344299316, "learning_rate": 2.8322516277019624e-06, "loss": 1.9058, "step": 1135 }, { "epoch": 4.253731343283582, "grad_norm": 1.5124653577804565, "learning_rate": 2.6982889360653377e-06, "loss": 1.844, "step": 1140 }, { "epoch": 4.272388059701493, "grad_norm": 1.4586025476455688, "learning_rate": 2.5673907708429976e-06, "loss": 1.8923, "step": 1145 }, { "epoch": 4.291044776119403, "grad_norm": 1.423386812210083, "learning_rate": 2.4395751190352924e-06, "loss": 1.8745, "step": 1150 }, { "epoch": 4.309701492537314, "grad_norm": 1.5810245275497437, "learning_rate": 2.3148595440677405e-06, "loss": 1.9122, "step": 1155 }, { "epoch": 4.3283582089552235, "grad_norm": 1.562049388885498, "learning_rate": 2.1932611833775846e-06, "loss": 1.9031, "step": 1160 }, { "epoch": 4.347014925373134, "grad_norm": 1.7122998237609863, "learning_rate": 2.074796746058896e-06, "loss": 1.9303, "step": 1165 }, { "epoch": 4.365671641791045, "grad_norm": 1.9016716480255127, "learning_rate": 1.9594825105665654e-06, "loss": 1.9301, "step": 1170 }, { "epoch": 4.384328358208955, "grad_norm": 1.3229435682296753, "learning_rate": 1.847334322479413e-06, "loss": 2.0382, "step": 1175 }, { "epoch": 4.402985074626866, "grad_norm": 1.5966112613677979, "learning_rate": 1.738367592322837e-06, "loss": 1.8987, "step": 1180 }, { "epoch": 4.4216417910447765, "grad_norm": 1.6126375198364258, "learning_rate": 1.6325972934512018e-06, "loss": 1.7628, "step": 1185 }, { "epoch": 4.440298507462686, "grad_norm": 1.605992317199707, "learning_rate": 1.5300379599903409e-06, "loss": 1.9645, "step": 1190 }, { "epoch": 4.458955223880597, "grad_norm": 1.3877547979354858, "learning_rate": 1.4307036848403648e-06, "loss": 1.8797, "step": 1195 }, { "epoch": 4.477611940298507, "grad_norm": 1.6082713603973389, "learning_rate": 1.3346081177391472e-06, "loss": 1.8971, "step": 1200 }, { "epoch": 4.496268656716418, "grad_norm": 1.373641848564148, "learning_rate": 1.2417644633866632e-06, "loss": 2.0187, "step": 1205 }, { "epoch": 4.514925373134329, "grad_norm": 1.615536093711853, "learning_rate": 1.1521854796305242e-06, "loss": 1.9572, "step": 1210 }, { "epoch": 4.5335820895522385, "grad_norm": 1.8367860317230225, "learning_rate": 1.0658834757128838e-06, "loss": 1.9083, "step": 1215 }, { "epoch": 4.552238805970149, "grad_norm": 1.6231968402862549, "learning_rate": 9.828703105789983e-07, "loss": 1.8875, "step": 1220 }, { "epoch": 4.57089552238806, "grad_norm": 1.6421841382980347, "learning_rate": 9.031573912476554e-07, "loss": 1.9755, "step": 1225 }, { "epoch": 4.58955223880597, "grad_norm": 1.8085827827453613, "learning_rate": 8.267556712437341e-07, "loss": 1.9347, "step": 1230 }, { "epoch": 4.608208955223881, "grad_norm": 1.6334691047668457, "learning_rate": 7.536756490930358e-07, "loss": 1.8515, "step": 1235 }, { "epoch": 4.6268656716417915, "grad_norm": 1.4685897827148438, "learning_rate": 6.839273668796747e-07, "loss": 1.8517, "step": 1240 }, { "epoch": 4.645522388059701, "grad_norm": 1.932681918144226, "learning_rate": 6.175204088661485e-07, "loss": 2.0031, "step": 1245 }, { "epoch": 4.664179104477612, "grad_norm": 1.6335207223892212, "learning_rate": 5.544639001763718e-07, "loss": 1.9175, "step": 1250 }, { "epoch": 4.682835820895522, "grad_norm": 1.8198968172073364, "learning_rate": 4.947665055417605e-07, "loss": 1.913, "step": 1255 }, { "epoch": 4.701492537313433, "grad_norm": 1.649850845336914, "learning_rate": 4.3843642811059737e-07, "loss": 1.941, "step": 1260 }, { "epoch": 4.720149253731344, "grad_norm": 1.6591604948043823, "learning_rate": 3.854814083208064e-07, "loss": 1.9476, "step": 1265 }, { "epoch": 4.7388059701492535, "grad_norm": 1.6544086933135986, "learning_rate": 3.3590872283633944e-07, "loss": 1.9025, "step": 1270 }, { "epoch": 4.757462686567164, "grad_norm": 1.705462098121643, "learning_rate": 2.8972518354725977e-07, "loss": 1.9331, "step": 1275 }, { "epoch": 4.776119402985074, "grad_norm": 1.4553812742233276, "learning_rate": 2.4693713663372644e-07, "loss": 1.8251, "step": 1280 }, { "epoch": 4.794776119402985, "grad_norm": 1.7195388078689575, "learning_rate": 2.0755046169392e-07, "loss": 1.9025, "step": 1285 }, { "epoch": 4.813432835820896, "grad_norm": 1.7135190963745117, "learning_rate": 1.7157057093614703e-07, "loss": 1.9516, "step": 1290 }, { "epoch": 4.832089552238806, "grad_norm": 1.727378010749817, "learning_rate": 1.3900240843510993e-07, "loss": 2.0013, "step": 1295 }, { "epoch": 4.850746268656716, "grad_norm": 1.5725924968719482, "learning_rate": 1.0985044945254764e-07, "loss": 1.8973, "step": 1300 }, { "epoch": 4.869402985074627, "grad_norm": 1.6664009094238281, "learning_rate": 8.411869982228038e-08, "loss": 1.9408, "step": 1305 }, { "epoch": 4.888059701492537, "grad_norm": 1.7553932666778564, "learning_rate": 6.181069539974716e-08, "loss": 1.9985, "step": 1310 }, { "epoch": 4.906716417910448, "grad_norm": 1.7037988901138306, "learning_rate": 4.292950157614717e-08, "loss": 1.7886, "step": 1315 }, { "epoch": 4.925373134328359, "grad_norm": 1.474507212638855, "learning_rate": 2.7477712857215677e-08, "loss": 1.8797, "step": 1320 }, { "epoch": 4.9440298507462686, "grad_norm": 1.886070728302002, "learning_rate": 1.5457452506698056e-08, "loss": 1.821, "step": 1325 }, { "epoch": 4.962686567164179, "grad_norm": 1.5862762928009033, "learning_rate": 6.870372254602631e-09, "loss": 1.9146, "step": 1330 }, { "epoch": 4.981343283582089, "grad_norm": 1.5918110609054565, "learning_rate": 1.7176520702238964e-09, "loss": 1.9365, "step": 1335 }, { "epoch": 5.0, "grad_norm": 1.725006341934204, "learning_rate": 0.0, "loss": 1.9107, "step": 1340 }, { "epoch": 5.0, "step": 1340, "total_flos": 1.3943478340182344e+18, "train_loss": 2.0002955956245536, "train_runtime": 31085.0395, "train_samples_per_second": 1.379, "train_steps_per_second": 0.043 } ], "logging_steps": 5, "max_steps": 1340, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3943478340182344e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }