diff --git "a/checkpoint-118500/trainer_state.json" "b/checkpoint-118500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-118500/trainer_state.json" @@ -0,0 +1,8328 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3859897134555688, + "eval_steps": 500, + "global_step": 118500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003257297159962606, + "grad_norm": 2.2308592796325684, + "learning_rate": 4.99853416853153e-05, + "loss": 1.4483, + "step": 100 + }, + { + "epoch": 0.0006514594319925212, + "grad_norm": 2.3997225761413574, + "learning_rate": 4.996905466899897e-05, + "loss": 1.3276, + "step": 200 + }, + { + "epoch": 0.0009771891479887819, + "grad_norm": 1.4687339067459106, + "learning_rate": 4.995276765268264e-05, + "loss": 1.3394, + "step": 300 + }, + { + "epoch": 0.0013029188639850425, + "grad_norm": 0.6583470702171326, + "learning_rate": 4.993648063636631e-05, + "loss": 1.3245, + "step": 400 + }, + { + "epoch": 0.0016286485799813031, + "grad_norm": 1.6252340078353882, + "learning_rate": 4.992019362004997e-05, + "loss": 1.3249, + "step": 500 + }, + { + "epoch": 0.0019543782959775637, + "grad_norm": 2.0806777477264404, + "learning_rate": 4.9903906603733634e-05, + "loss": 1.32, + "step": 600 + }, + { + "epoch": 0.002280108011973824, + "grad_norm": 1.376539707183838, + "learning_rate": 4.988761958741731e-05, + "loss": 1.3133, + "step": 700 + }, + { + "epoch": 0.002605837727970085, + "grad_norm": 2.234644889831543, + "learning_rate": 4.987133257110097e-05, + "loss": 1.3179, + "step": 800 + }, + { + "epoch": 0.0029315674439663454, + "grad_norm": 1.4599684476852417, + "learning_rate": 4.985504555478464e-05, + "loss": 1.3097, + "step": 900 + }, + { + "epoch": 0.0032572971599626062, + "grad_norm": 1.7078094482421875, + "learning_rate": 4.9838758538468304e-05, + "loss": 1.3083, + "step": 1000 + }, + { + "epoch": 0.0035830268759588666, + "grad_norm": 0.6953567266464233, + "learning_rate": 4.9822471522151976e-05, + "loss": 1.3075, + "step": 1100 + }, + { + "epoch": 0.0039087565919551275, + "grad_norm": 1.225602626800537, + "learning_rate": 4.980618450583564e-05, + "loss": 1.3054, + "step": 1200 + }, + { + "epoch": 0.004234486307951388, + "grad_norm": 1.3010519742965698, + "learning_rate": 4.978989748951931e-05, + "loss": 1.3066, + "step": 1300 + }, + { + "epoch": 0.004560216023947648, + "grad_norm": 0.6475724577903748, + "learning_rate": 4.9773610473202974e-05, + "loss": 1.3109, + "step": 1400 + }, + { + "epoch": 0.004885945739943909, + "grad_norm": 1.046614646911621, + "learning_rate": 4.975732345688664e-05, + "loss": 1.3074, + "step": 1500 + }, + { + "epoch": 0.00521167545594017, + "grad_norm": 1.113573670387268, + "learning_rate": 4.974103644057031e-05, + "loss": 1.3083, + "step": 1600 + }, + { + "epoch": 0.005537405171936431, + "grad_norm": 1.4273550510406494, + "learning_rate": 4.972474942425398e-05, + "loss": 1.3018, + "step": 1700 + }, + { + "epoch": 0.005863134887932691, + "grad_norm": 0.5519908666610718, + "learning_rate": 4.970846240793764e-05, + "loss": 1.2945, + "step": 1800 + }, + { + "epoch": 0.006188864603928952, + "grad_norm": 0.6653416156768799, + "learning_rate": 4.969217539162131e-05, + "loss": 1.3004, + "step": 1900 + }, + { + "epoch": 0.0065145943199252125, + "grad_norm": 0.732170581817627, + "learning_rate": 4.9675888375304975e-05, + "loss": 1.3014, + "step": 2000 + }, + { + "epoch": 0.006840324035921473, + "grad_norm": 0.405608594417572, + "learning_rate": 4.965960135898865e-05, + "loss": 1.2939, + "step": 2100 + }, + { + "epoch": 0.007166053751917733, + "grad_norm": 0.9849847555160522, + "learning_rate": 4.9643314342672306e-05, + "loss": 1.2922, + "step": 2200 + }, + { + "epoch": 0.007491783467913994, + "grad_norm": 0.7152832746505737, + "learning_rate": 4.962702732635598e-05, + "loss": 1.2905, + "step": 2300 + }, + { + "epoch": 0.007817513183910255, + "grad_norm": 1.1164734363555908, + "learning_rate": 4.9610740310039644e-05, + "loss": 1.3024, + "step": 2400 + }, + { + "epoch": 0.008143242899906516, + "grad_norm": 0.574243426322937, + "learning_rate": 4.959445329372332e-05, + "loss": 1.2944, + "step": 2500 + }, + { + "epoch": 0.008468972615902777, + "grad_norm": 0.6976324319839478, + "learning_rate": 4.9578166277406976e-05, + "loss": 1.2939, + "step": 2600 + }, + { + "epoch": 0.008794702331899037, + "grad_norm": 0.4648737609386444, + "learning_rate": 4.956187926109064e-05, + "loss": 1.2841, + "step": 2700 + }, + { + "epoch": 0.009120432047895297, + "grad_norm": 1.189271092414856, + "learning_rate": 4.9545592244774314e-05, + "loss": 1.294, + "step": 2800 + }, + { + "epoch": 0.009446161763891557, + "grad_norm": 0.6437670588493347, + "learning_rate": 4.952930522845798e-05, + "loss": 1.2882, + "step": 2900 + }, + { + "epoch": 0.009771891479887818, + "grad_norm": 1.591304898262024, + "learning_rate": 4.9513018212141646e-05, + "loss": 1.2805, + "step": 3000 + }, + { + "epoch": 0.010097621195884079, + "grad_norm": 0.2836475670337677, + "learning_rate": 4.949673119582531e-05, + "loss": 1.2802, + "step": 3100 + }, + { + "epoch": 0.01042335091188034, + "grad_norm": 1.304417610168457, + "learning_rate": 4.9480444179508984e-05, + "loss": 1.2833, + "step": 3200 + }, + { + "epoch": 0.0107490806278766, + "grad_norm": 0.27579864859580994, + "learning_rate": 4.946415716319265e-05, + "loss": 1.2852, + "step": 3300 + }, + { + "epoch": 0.011074810343872862, + "grad_norm": 1.1080585718154907, + "learning_rate": 4.9447870146876315e-05, + "loss": 1.289, + "step": 3400 + }, + { + "epoch": 0.011400540059869122, + "grad_norm": 0.2783690392971039, + "learning_rate": 4.943158313055998e-05, + "loss": 1.2885, + "step": 3500 + }, + { + "epoch": 0.011726269775865382, + "grad_norm": 0.6603112816810608, + "learning_rate": 4.941529611424365e-05, + "loss": 1.2882, + "step": 3600 + }, + { + "epoch": 0.012051999491861642, + "grad_norm": 0.9498095512390137, + "learning_rate": 4.939900909792732e-05, + "loss": 1.2835, + "step": 3700 + }, + { + "epoch": 0.012377729207857903, + "grad_norm": 0.5274548530578613, + "learning_rate": 4.9382722081610985e-05, + "loss": 1.279, + "step": 3800 + }, + { + "epoch": 0.012703458923854164, + "grad_norm": 0.5299821496009827, + "learning_rate": 4.936643506529465e-05, + "loss": 1.2879, + "step": 3900 + }, + { + "epoch": 0.013029188639850425, + "grad_norm": 1.0898863077163696, + "learning_rate": 4.9350148048978316e-05, + "loss": 1.2913, + "step": 4000 + }, + { + "epoch": 0.013354918355846686, + "grad_norm": 0.6892501711845398, + "learning_rate": 4.933386103266198e-05, + "loss": 1.2835, + "step": 4100 + }, + { + "epoch": 0.013680648071842947, + "grad_norm": 0.9103847146034241, + "learning_rate": 4.9317574016345655e-05, + "loss": 1.2876, + "step": 4200 + }, + { + "epoch": 0.014006377787839207, + "grad_norm": 0.8750960826873779, + "learning_rate": 4.9301287000029314e-05, + "loss": 1.2761, + "step": 4300 + }, + { + "epoch": 0.014332107503835467, + "grad_norm": 1.7296843528747559, + "learning_rate": 4.9284999983712986e-05, + "loss": 1.2825, + "step": 4400 + }, + { + "epoch": 0.014657837219831727, + "grad_norm": 0.7019387483596802, + "learning_rate": 4.926871296739665e-05, + "loss": 1.2774, + "step": 4500 + }, + { + "epoch": 0.014983566935827988, + "grad_norm": 0.9353660345077515, + "learning_rate": 4.9252425951080324e-05, + "loss": 1.2701, + "step": 4600 + }, + { + "epoch": 0.015309296651824249, + "grad_norm": 0.7081932425498962, + "learning_rate": 4.923613893476399e-05, + "loss": 1.276, + "step": 4700 + }, + { + "epoch": 0.01563502636782051, + "grad_norm": 0.8366962671279907, + "learning_rate": 4.9219851918447656e-05, + "loss": 1.2767, + "step": 4800 + }, + { + "epoch": 0.01596075608381677, + "grad_norm": 1.765871286392212, + "learning_rate": 4.920356490213132e-05, + "loss": 1.2617, + "step": 4900 + }, + { + "epoch": 0.01628648579981303, + "grad_norm": 0.2926379442214966, + "learning_rate": 4.918727788581499e-05, + "loss": 1.2762, + "step": 5000 + }, + { + "epoch": 0.01661221551580929, + "grad_norm": 1.1176525354385376, + "learning_rate": 4.917099086949866e-05, + "loss": 1.2647, + "step": 5100 + }, + { + "epoch": 0.016937945231805553, + "grad_norm": 0.384264200925827, + "learning_rate": 4.915470385318232e-05, + "loss": 1.2628, + "step": 5200 + }, + { + "epoch": 0.017263674947801812, + "grad_norm": 1.5339140892028809, + "learning_rate": 4.913841683686599e-05, + "loss": 1.2692, + "step": 5300 + }, + { + "epoch": 0.017589404663798075, + "grad_norm": 1.2026703357696533, + "learning_rate": 4.912212982054966e-05, + "loss": 1.2618, + "step": 5400 + }, + { + "epoch": 0.017915134379794334, + "grad_norm": 0.6754997968673706, + "learning_rate": 4.910584280423333e-05, + "loss": 1.2495, + "step": 5500 + }, + { + "epoch": 0.018240864095790593, + "grad_norm": 0.8240428566932678, + "learning_rate": 4.908955578791699e-05, + "loss": 1.2498, + "step": 5600 + }, + { + "epoch": 0.018566593811786856, + "grad_norm": 0.6363087892532349, + "learning_rate": 4.9073268771600654e-05, + "loss": 1.2514, + "step": 5700 + }, + { + "epoch": 0.018892323527783115, + "grad_norm": 1.393833875656128, + "learning_rate": 4.905698175528433e-05, + "loss": 1.2509, + "step": 5800 + }, + { + "epoch": 0.019218053243779377, + "grad_norm": 0.6422170996665955, + "learning_rate": 4.904069473896799e-05, + "loss": 1.2405, + "step": 5900 + }, + { + "epoch": 0.019543782959775637, + "grad_norm": 0.7575420141220093, + "learning_rate": 4.902440772265166e-05, + "loss": 1.2241, + "step": 6000 + }, + { + "epoch": 0.0198695126757719, + "grad_norm": 0.7148196697235107, + "learning_rate": 4.9008120706335324e-05, + "loss": 1.2372, + "step": 6100 + }, + { + "epoch": 0.020195242391768158, + "grad_norm": 1.1207329034805298, + "learning_rate": 4.8991833690018996e-05, + "loss": 1.2372, + "step": 6200 + }, + { + "epoch": 0.02052097210776442, + "grad_norm": 1.3915568590164185, + "learning_rate": 4.897554667370266e-05, + "loss": 1.2129, + "step": 6300 + }, + { + "epoch": 0.02084670182376068, + "grad_norm": 0.8674553036689758, + "learning_rate": 4.895925965738633e-05, + "loss": 1.2262, + "step": 6400 + }, + { + "epoch": 0.02117243153975694, + "grad_norm": 0.7640644311904907, + "learning_rate": 4.8942972641069994e-05, + "loss": 1.1998, + "step": 6500 + }, + { + "epoch": 0.0214981612557532, + "grad_norm": 0.7928606271743774, + "learning_rate": 4.892668562475366e-05, + "loss": 1.1776, + "step": 6600 + }, + { + "epoch": 0.02182389097174946, + "grad_norm": 1.1644946336746216, + "learning_rate": 4.891039860843733e-05, + "loss": 1.1916, + "step": 6700 + }, + { + "epoch": 0.022149620687745723, + "grad_norm": 1.1310213804244995, + "learning_rate": 4.8894111592121e-05, + "loss": 1.1786, + "step": 6800 + }, + { + "epoch": 0.022475350403741982, + "grad_norm": 1.3858141899108887, + "learning_rate": 4.887782457580466e-05, + "loss": 1.1728, + "step": 6900 + }, + { + "epoch": 0.022801080119738245, + "grad_norm": 3.814767360687256, + "learning_rate": 4.886153755948833e-05, + "loss": 1.1384, + "step": 7000 + }, + { + "epoch": 0.023126809835734504, + "grad_norm": 1.2411885261535645, + "learning_rate": 4.8845250543171995e-05, + "loss": 1.1588, + "step": 7100 + }, + { + "epoch": 0.023452539551730763, + "grad_norm": 1.4492881298065186, + "learning_rate": 4.882896352685567e-05, + "loss": 1.1266, + "step": 7200 + }, + { + "epoch": 0.023778269267727026, + "grad_norm": 0.8389878869056702, + "learning_rate": 4.8812676510539326e-05, + "loss": 1.1446, + "step": 7300 + }, + { + "epoch": 0.024103998983723285, + "grad_norm": 0.33955487608909607, + "learning_rate": 4.8796389494223e-05, + "loss": 1.1111, + "step": 7400 + }, + { + "epoch": 0.024429728699719547, + "grad_norm": 0.7004753351211548, + "learning_rate": 4.8780102477906664e-05, + "loss": 1.0954, + "step": 7500 + }, + { + "epoch": 0.024755458415715807, + "grad_norm": 0.7213209271430969, + "learning_rate": 4.876381546159034e-05, + "loss": 1.1123, + "step": 7600 + }, + { + "epoch": 0.02508118813171207, + "grad_norm": 0.960991382598877, + "learning_rate": 4.8747528445273996e-05, + "loss": 1.0982, + "step": 7700 + }, + { + "epoch": 0.025406917847708328, + "grad_norm": 0.6955804228782654, + "learning_rate": 4.873124142895766e-05, + "loss": 1.0827, + "step": 7800 + }, + { + "epoch": 0.02573264756370459, + "grad_norm": 0.47498619556427, + "learning_rate": 4.8714954412641334e-05, + "loss": 1.1043, + "step": 7900 + }, + { + "epoch": 0.02605837727970085, + "grad_norm": 0.304063618183136, + "learning_rate": 4.8698667396325e-05, + "loss": 1.0699, + "step": 8000 + }, + { + "epoch": 0.02638410699569711, + "grad_norm": 0.9996088743209839, + "learning_rate": 4.8682380380008666e-05, + "loss": 1.0697, + "step": 8100 + }, + { + "epoch": 0.02670983671169337, + "grad_norm": 0.5986392498016357, + "learning_rate": 4.866609336369233e-05, + "loss": 1.0733, + "step": 8200 + }, + { + "epoch": 0.02703556642768963, + "grad_norm": 0.41347017884254456, + "learning_rate": 4.8649806347376004e-05, + "loss": 1.0643, + "step": 8300 + }, + { + "epoch": 0.027361296143685893, + "grad_norm": 0.3976612687110901, + "learning_rate": 4.863351933105967e-05, + "loss": 1.0401, + "step": 8400 + }, + { + "epoch": 0.027687025859682152, + "grad_norm": 1.1716387271881104, + "learning_rate": 4.8617232314743335e-05, + "loss": 1.0298, + "step": 8500 + }, + { + "epoch": 0.028012755575678415, + "grad_norm": 0.7384105324745178, + "learning_rate": 4.8600945298427e-05, + "loss": 1.0223, + "step": 8600 + }, + { + "epoch": 0.028338485291674674, + "grad_norm": 0.517280638217926, + "learning_rate": 4.858465828211067e-05, + "loss": 1.0445, + "step": 8700 + }, + { + "epoch": 0.028664215007670933, + "grad_norm": 0.7129126787185669, + "learning_rate": 4.856837126579434e-05, + "loss": 1.0508, + "step": 8800 + }, + { + "epoch": 0.028989944723667196, + "grad_norm": 0.35596320033073425, + "learning_rate": 4.8552084249478005e-05, + "loss": 1.0296, + "step": 8900 + }, + { + "epoch": 0.029315674439663455, + "grad_norm": 0.9362590909004211, + "learning_rate": 4.853579723316167e-05, + "loss": 1.0785, + "step": 9000 + }, + { + "epoch": 0.029641404155659717, + "grad_norm": 0.8223775625228882, + "learning_rate": 4.8519510216845336e-05, + "loss": 1.043, + "step": 9100 + }, + { + "epoch": 0.029967133871655977, + "grad_norm": 0.7149192690849304, + "learning_rate": 4.8503223200529e-05, + "loss": 1.0036, + "step": 9200 + }, + { + "epoch": 0.03029286358765224, + "grad_norm": 0.5907948017120361, + "learning_rate": 4.8486936184212675e-05, + "loss": 1.0408, + "step": 9300 + }, + { + "epoch": 0.030618593303648498, + "grad_norm": 0.6083859801292419, + "learning_rate": 4.847064916789634e-05, + "loss": 1.0313, + "step": 9400 + }, + { + "epoch": 0.03094432301964476, + "grad_norm": 0.5470224618911743, + "learning_rate": 4.8454362151580006e-05, + "loss": 1.0395, + "step": 9500 + }, + { + "epoch": 0.03127005273564102, + "grad_norm": 0.9455150961875916, + "learning_rate": 4.843807513526367e-05, + "loss": 1.0132, + "step": 9600 + }, + { + "epoch": 0.03159578245163728, + "grad_norm": 0.9068177938461304, + "learning_rate": 4.8421788118947344e-05, + "loss": 1.0219, + "step": 9700 + }, + { + "epoch": 0.03192151216763354, + "grad_norm": 0.6018943190574646, + "learning_rate": 4.840550110263101e-05, + "loss": 0.9966, + "step": 9800 + }, + { + "epoch": 0.032247241883629804, + "grad_norm": 1.1521615982055664, + "learning_rate": 4.838921408631467e-05, + "loss": 0.9782, + "step": 9900 + }, + { + "epoch": 0.03257297159962606, + "grad_norm": 0.33281368017196655, + "learning_rate": 4.837292706999834e-05, + "loss": 1.0325, + "step": 10000 + }, + { + "epoch": 0.03289870131562232, + "grad_norm": 0.8903327584266663, + "learning_rate": 4.835664005368201e-05, + "loss": 0.9889, + "step": 10100 + }, + { + "epoch": 0.03322443103161858, + "grad_norm": 0.5526803731918335, + "learning_rate": 4.834035303736568e-05, + "loss": 1.0018, + "step": 10200 + }, + { + "epoch": 0.03355016074761485, + "grad_norm": 0.8086706399917603, + "learning_rate": 4.832406602104934e-05, + "loss": 1.0189, + "step": 10300 + }, + { + "epoch": 0.03387589046361111, + "grad_norm": 0.6990864276885986, + "learning_rate": 4.830777900473301e-05, + "loss": 0.996, + "step": 10400 + }, + { + "epoch": 0.034201620179607366, + "grad_norm": 0.4859602451324463, + "learning_rate": 4.829149198841668e-05, + "loss": 0.992, + "step": 10500 + }, + { + "epoch": 0.034527349895603625, + "grad_norm": 1.2284592390060425, + "learning_rate": 4.827520497210034e-05, + "loss": 1.0139, + "step": 10600 + }, + { + "epoch": 0.034853079611599884, + "grad_norm": 0.6529733538627625, + "learning_rate": 4.825891795578401e-05, + "loss": 1.025, + "step": 10700 + }, + { + "epoch": 0.03517880932759615, + "grad_norm": 0.6755232810974121, + "learning_rate": 4.8242630939467674e-05, + "loss": 1.0123, + "step": 10800 + }, + { + "epoch": 0.03550453904359241, + "grad_norm": 0.9006055593490601, + "learning_rate": 4.8226343923151347e-05, + "loss": 0.9936, + "step": 10900 + }, + { + "epoch": 0.03583026875958867, + "grad_norm": 0.7058572769165039, + "learning_rate": 4.821005690683501e-05, + "loss": 0.934, + "step": 11000 + }, + { + "epoch": 0.03615599847558493, + "grad_norm": 0.4535008668899536, + "learning_rate": 4.819376989051868e-05, + "loss": 1.0269, + "step": 11100 + }, + { + "epoch": 0.036481728191581186, + "grad_norm": 0.39823395013809204, + "learning_rate": 4.8177482874202344e-05, + "loss": 0.9866, + "step": 11200 + }, + { + "epoch": 0.03680745790757745, + "grad_norm": 0.8109054565429688, + "learning_rate": 4.816119585788601e-05, + "loss": 1.0209, + "step": 11300 + }, + { + "epoch": 0.03713318762357371, + "grad_norm": 0.760396420955658, + "learning_rate": 4.814490884156968e-05, + "loss": 0.9711, + "step": 11400 + }, + { + "epoch": 0.03745891733956997, + "grad_norm": 0.8584955334663391, + "learning_rate": 4.812862182525335e-05, + "loss": 1.0151, + "step": 11500 + }, + { + "epoch": 0.03778464705556623, + "grad_norm": 1.104041576385498, + "learning_rate": 4.8112334808937013e-05, + "loss": 0.9826, + "step": 11600 + }, + { + "epoch": 0.038110376771562496, + "grad_norm": 0.6111257672309875, + "learning_rate": 4.809604779262068e-05, + "loss": 0.9524, + "step": 11700 + }, + { + "epoch": 0.038436106487558755, + "grad_norm": 0.6601366996765137, + "learning_rate": 4.807976077630435e-05, + "loss": 0.9527, + "step": 11800 + }, + { + "epoch": 0.038761836203555014, + "grad_norm": 0.4624398350715637, + "learning_rate": 4.806347375998802e-05, + "loss": 1.0077, + "step": 11900 + }, + { + "epoch": 0.03908756591955127, + "grad_norm": 0.2786065638065338, + "learning_rate": 4.8047186743671676e-05, + "loss": 0.956, + "step": 12000 + }, + { + "epoch": 0.03941329563554753, + "grad_norm": 1.0275955200195312, + "learning_rate": 4.803089972735535e-05, + "loss": 0.9484, + "step": 12100 + }, + { + "epoch": 0.0397390253515438, + "grad_norm": 0.6198407411575317, + "learning_rate": 4.8014612711039015e-05, + "loss": 0.9847, + "step": 12200 + }, + { + "epoch": 0.04006475506754006, + "grad_norm": 0.5880489945411682, + "learning_rate": 4.799832569472269e-05, + "loss": 0.9559, + "step": 12300 + }, + { + "epoch": 0.040390484783536316, + "grad_norm": 0.39753594994544983, + "learning_rate": 4.7982038678406346e-05, + "loss": 0.9489, + "step": 12400 + }, + { + "epoch": 0.040716214499532576, + "grad_norm": 0.5815085768699646, + "learning_rate": 4.796575166209002e-05, + "loss": 0.9567, + "step": 12500 + }, + { + "epoch": 0.04104194421552884, + "grad_norm": 0.8463611602783203, + "learning_rate": 4.7949464645773684e-05, + "loss": 0.9706, + "step": 12600 + }, + { + "epoch": 0.0413676739315251, + "grad_norm": 0.7260481715202332, + "learning_rate": 4.793317762945736e-05, + "loss": 1.0032, + "step": 12700 + }, + { + "epoch": 0.04169340364752136, + "grad_norm": 0.6970434188842773, + "learning_rate": 4.7916890613141016e-05, + "loss": 0.9559, + "step": 12800 + }, + { + "epoch": 0.04201913336351762, + "grad_norm": 0.6083927750587463, + "learning_rate": 4.790060359682468e-05, + "loss": 0.9558, + "step": 12900 + }, + { + "epoch": 0.04234486307951388, + "grad_norm": 0.4736403524875641, + "learning_rate": 4.7884316580508354e-05, + "loss": 0.9444, + "step": 13000 + }, + { + "epoch": 0.042670592795510144, + "grad_norm": 0.34586021304130554, + "learning_rate": 4.786802956419202e-05, + "loss": 0.9186, + "step": 13100 + }, + { + "epoch": 0.0429963225115064, + "grad_norm": 0.5979019403457642, + "learning_rate": 4.7851742547875685e-05, + "loss": 0.9367, + "step": 13200 + }, + { + "epoch": 0.04332205222750266, + "grad_norm": 1.0827624797821045, + "learning_rate": 4.783545553155935e-05, + "loss": 0.9324, + "step": 13300 + }, + { + "epoch": 0.04364778194349892, + "grad_norm": 1.1920030117034912, + "learning_rate": 4.7819168515243024e-05, + "loss": 0.9367, + "step": 13400 + }, + { + "epoch": 0.04397351165949519, + "grad_norm": 0.6469812989234924, + "learning_rate": 4.780288149892669e-05, + "loss": 0.9815, + "step": 13500 + }, + { + "epoch": 0.04429924137549145, + "grad_norm": 0.8156530857086182, + "learning_rate": 4.7786594482610355e-05, + "loss": 0.9679, + "step": 13600 + }, + { + "epoch": 0.044624971091487706, + "grad_norm": 1.2997325658798218, + "learning_rate": 4.777030746629402e-05, + "loss": 0.9358, + "step": 13700 + }, + { + "epoch": 0.044950700807483965, + "grad_norm": 0.42360150814056396, + "learning_rate": 4.7754020449977687e-05, + "loss": 0.9326, + "step": 13800 + }, + { + "epoch": 0.045276430523480224, + "grad_norm": 0.7316247820854187, + "learning_rate": 4.773773343366136e-05, + "loss": 0.9283, + "step": 13900 + }, + { + "epoch": 0.04560216023947649, + "grad_norm": 0.5978175401687622, + "learning_rate": 4.7721446417345025e-05, + "loss": 0.9699, + "step": 14000 + }, + { + "epoch": 0.04592788995547275, + "grad_norm": 0.5278334617614746, + "learning_rate": 4.770515940102869e-05, + "loss": 0.99, + "step": 14100 + }, + { + "epoch": 0.04625361967146901, + "grad_norm": 0.7452822327613831, + "learning_rate": 4.7688872384712356e-05, + "loss": 0.8824, + "step": 14200 + }, + { + "epoch": 0.04657934938746527, + "grad_norm": 0.4158065617084503, + "learning_rate": 4.767258536839602e-05, + "loss": 0.9076, + "step": 14300 + }, + { + "epoch": 0.046905079103461526, + "grad_norm": 0.6929590106010437, + "learning_rate": 4.7656298352079694e-05, + "loss": 0.926, + "step": 14400 + }, + { + "epoch": 0.04723080881945779, + "grad_norm": 0.8249752521514893, + "learning_rate": 4.764001133576336e-05, + "loss": 0.9342, + "step": 14500 + }, + { + "epoch": 0.04755653853545405, + "grad_norm": 0.6523115038871765, + "learning_rate": 4.7623724319447026e-05, + "loss": 0.9312, + "step": 14600 + }, + { + "epoch": 0.04788226825145031, + "grad_norm": 0.7809571027755737, + "learning_rate": 4.760743730313069e-05, + "loss": 0.927, + "step": 14700 + }, + { + "epoch": 0.04820799796744657, + "grad_norm": 0.4370424747467041, + "learning_rate": 4.7591150286814364e-05, + "loss": 0.9275, + "step": 14800 + }, + { + "epoch": 0.048533727683442836, + "grad_norm": 0.8082228302955627, + "learning_rate": 4.757486327049803e-05, + "loss": 0.9524, + "step": 14900 + }, + { + "epoch": 0.048859457399439095, + "grad_norm": 0.7073273658752441, + "learning_rate": 4.755857625418169e-05, + "loss": 0.9069, + "step": 15000 + }, + { + "epoch": 0.049185187115435354, + "grad_norm": 0.9150802493095398, + "learning_rate": 4.754228923786536e-05, + "loss": 0.9669, + "step": 15100 + }, + { + "epoch": 0.04951091683143161, + "grad_norm": 0.6621295809745789, + "learning_rate": 4.752600222154903e-05, + "loss": 0.9117, + "step": 15200 + }, + { + "epoch": 0.04983664654742787, + "grad_norm": 1.1658425331115723, + "learning_rate": 4.75097152052327e-05, + "loss": 0.9061, + "step": 15300 + }, + { + "epoch": 0.05016237626342414, + "grad_norm": 1.1669522523880005, + "learning_rate": 4.749342818891636e-05, + "loss": 0.9625, + "step": 15400 + }, + { + "epoch": 0.0504881059794204, + "grad_norm": 0.6995384693145752, + "learning_rate": 4.747714117260003e-05, + "loss": 0.9098, + "step": 15500 + }, + { + "epoch": 0.050813835695416656, + "grad_norm": 0.5169076919555664, + "learning_rate": 4.74608541562837e-05, + "loss": 0.9243, + "step": 15600 + }, + { + "epoch": 0.051139565411412916, + "grad_norm": 0.33565372228622437, + "learning_rate": 4.744456713996736e-05, + "loss": 0.9375, + "step": 15700 + }, + { + "epoch": 0.05146529512740918, + "grad_norm": 0.4140024781227112, + "learning_rate": 4.742828012365103e-05, + "loss": 0.919, + "step": 15800 + }, + { + "epoch": 0.05179102484340544, + "grad_norm": 0.9499224424362183, + "learning_rate": 4.7411993107334694e-05, + "loss": 0.9034, + "step": 15900 + }, + { + "epoch": 0.0521167545594017, + "grad_norm": 0.8801336288452148, + "learning_rate": 4.7395706091018366e-05, + "loss": 0.881, + "step": 16000 + }, + { + "epoch": 0.05244248427539796, + "grad_norm": 0.7208696007728577, + "learning_rate": 4.737941907470203e-05, + "loss": 0.8518, + "step": 16100 + }, + { + "epoch": 0.05276821399139422, + "grad_norm": 0.5132054686546326, + "learning_rate": 4.73631320583857e-05, + "loss": 0.8933, + "step": 16200 + }, + { + "epoch": 0.053093943707390484, + "grad_norm": 0.6521860957145691, + "learning_rate": 4.7346845042069364e-05, + "loss": 0.9332, + "step": 16300 + }, + { + "epoch": 0.05341967342338674, + "grad_norm": 0.7121620178222656, + "learning_rate": 4.733055802575303e-05, + "loss": 0.9067, + "step": 16400 + }, + { + "epoch": 0.053745403139383, + "grad_norm": 0.5065134763717651, + "learning_rate": 4.73142710094367e-05, + "loss": 0.9062, + "step": 16500 + }, + { + "epoch": 0.05407113285537926, + "grad_norm": 0.5855521559715271, + "learning_rate": 4.729798399312037e-05, + "loss": 0.915, + "step": 16600 + }, + { + "epoch": 0.05439686257137553, + "grad_norm": 0.5392531156539917, + "learning_rate": 4.728169697680403e-05, + "loss": 0.9124, + "step": 16700 + }, + { + "epoch": 0.05472259228737179, + "grad_norm": 0.6617989540100098, + "learning_rate": 4.72654099604877e-05, + "loss": 0.8594, + "step": 16800 + }, + { + "epoch": 0.055048322003368046, + "grad_norm": 0.6459785103797913, + "learning_rate": 4.724912294417137e-05, + "loss": 0.9262, + "step": 16900 + }, + { + "epoch": 0.055374051719364305, + "grad_norm": 0.34565970301628113, + "learning_rate": 4.723283592785504e-05, + "loss": 0.8747, + "step": 17000 + }, + { + "epoch": 0.055699781435360564, + "grad_norm": 0.9510948061943054, + "learning_rate": 4.7216548911538696e-05, + "loss": 0.9027, + "step": 17100 + }, + { + "epoch": 0.05602551115135683, + "grad_norm": 0.577192485332489, + "learning_rate": 4.720026189522237e-05, + "loss": 0.9192, + "step": 17200 + }, + { + "epoch": 0.05635124086735309, + "grad_norm": 0.38653406500816345, + "learning_rate": 4.7183974878906034e-05, + "loss": 0.8759, + "step": 17300 + }, + { + "epoch": 0.05667697058334935, + "grad_norm": 0.6405381560325623, + "learning_rate": 4.716768786258971e-05, + "loss": 0.8486, + "step": 17400 + }, + { + "epoch": 0.05700270029934561, + "grad_norm": 0.6968704462051392, + "learning_rate": 4.7151400846273366e-05, + "loss": 0.903, + "step": 17500 + }, + { + "epoch": 0.057328430015341866, + "grad_norm": 0.8094695210456848, + "learning_rate": 4.713511382995704e-05, + "loss": 0.864, + "step": 17600 + }, + { + "epoch": 0.05765415973133813, + "grad_norm": 0.8325287103652954, + "learning_rate": 4.7118826813640704e-05, + "loss": 0.8886, + "step": 17700 + }, + { + "epoch": 0.05797988944733439, + "grad_norm": 0.5068339705467224, + "learning_rate": 4.710253979732437e-05, + "loss": 0.8767, + "step": 17800 + }, + { + "epoch": 0.05830561916333065, + "grad_norm": 0.7535611391067505, + "learning_rate": 4.7086252781008036e-05, + "loss": 0.8661, + "step": 17900 + }, + { + "epoch": 0.05863134887932691, + "grad_norm": 0.9104974865913391, + "learning_rate": 4.70699657646917e-05, + "loss": 0.8612, + "step": 18000 + }, + { + "epoch": 0.058957078595323176, + "grad_norm": 0.9106101989746094, + "learning_rate": 4.7053678748375374e-05, + "loss": 0.8885, + "step": 18100 + }, + { + "epoch": 0.059282808311319435, + "grad_norm": 0.9990994334220886, + "learning_rate": 4.703739173205904e-05, + "loss": 0.9097, + "step": 18200 + }, + { + "epoch": 0.059608538027315694, + "grad_norm": 0.6219133138656616, + "learning_rate": 4.7021104715742705e-05, + "loss": 0.8349, + "step": 18300 + }, + { + "epoch": 0.05993426774331195, + "grad_norm": 0.28884798288345337, + "learning_rate": 4.700481769942637e-05, + "loss": 0.8359, + "step": 18400 + }, + { + "epoch": 0.06025999745930821, + "grad_norm": 0.6142743229866028, + "learning_rate": 4.698853068311004e-05, + "loss": 0.8686, + "step": 18500 + }, + { + "epoch": 0.06058572717530448, + "grad_norm": 0.7121238708496094, + "learning_rate": 4.697224366679371e-05, + "loss": 0.8318, + "step": 18600 + }, + { + "epoch": 0.06091145689130074, + "grad_norm": 0.3502013683319092, + "learning_rate": 4.6955956650477375e-05, + "loss": 0.8353, + "step": 18700 + }, + { + "epoch": 0.061237186607296996, + "grad_norm": 0.869159460067749, + "learning_rate": 4.693966963416104e-05, + "loss": 0.8811, + "step": 18800 + }, + { + "epoch": 0.061562916323293256, + "grad_norm": 0.4008027911186218, + "learning_rate": 4.6923382617844706e-05, + "loss": 0.8595, + "step": 18900 + }, + { + "epoch": 0.06188864603928952, + "grad_norm": 0.6609760522842407, + "learning_rate": 4.690709560152838e-05, + "loss": 0.8591, + "step": 19000 + }, + { + "epoch": 0.06221437575528578, + "grad_norm": 0.41599878668785095, + "learning_rate": 4.6890808585212045e-05, + "loss": 0.8792, + "step": 19100 + }, + { + "epoch": 0.06254010547128204, + "grad_norm": 0.8219528794288635, + "learning_rate": 4.687452156889571e-05, + "loss": 0.8469, + "step": 19200 + }, + { + "epoch": 0.0628658351872783, + "grad_norm": 0.5383628010749817, + "learning_rate": 4.6858234552579376e-05, + "loss": 0.8619, + "step": 19300 + }, + { + "epoch": 0.06319156490327456, + "grad_norm": 1.0892442464828491, + "learning_rate": 4.684194753626304e-05, + "loss": 0.8219, + "step": 19400 + }, + { + "epoch": 0.06351729461927082, + "grad_norm": 0.7258702516555786, + "learning_rate": 4.6825660519946714e-05, + "loss": 0.8243, + "step": 19500 + }, + { + "epoch": 0.06384302433526708, + "grad_norm": 1.2622634172439575, + "learning_rate": 4.680937350363038e-05, + "loss": 0.8619, + "step": 19600 + }, + { + "epoch": 0.06416875405126335, + "grad_norm": 0.3901592195034027, + "learning_rate": 4.6793086487314046e-05, + "loss": 0.8315, + "step": 19700 + }, + { + "epoch": 0.06449448376725961, + "grad_norm": 0.5976518392562866, + "learning_rate": 4.677679947099771e-05, + "loss": 0.8193, + "step": 19800 + }, + { + "epoch": 0.06482021348325587, + "grad_norm": 1.0668984651565552, + "learning_rate": 4.676051245468138e-05, + "loss": 0.8381, + "step": 19900 + }, + { + "epoch": 0.06514594319925213, + "grad_norm": 0.6844903826713562, + "learning_rate": 4.674422543836505e-05, + "loss": 0.8202, + "step": 20000 + }, + { + "epoch": 0.06547167291524839, + "grad_norm": 0.6987929344177246, + "learning_rate": 4.672793842204871e-05, + "loss": 0.844, + "step": 20100 + }, + { + "epoch": 0.06579740263124464, + "grad_norm": 1.0227413177490234, + "learning_rate": 4.671165140573238e-05, + "loss": 0.8093, + "step": 20200 + }, + { + "epoch": 0.0661231323472409, + "grad_norm": 0.5901645421981812, + "learning_rate": 4.669536438941605e-05, + "loss": 0.8068, + "step": 20300 + }, + { + "epoch": 0.06644886206323716, + "grad_norm": 0.7951213717460632, + "learning_rate": 4.667907737309972e-05, + "loss": 0.8581, + "step": 20400 + }, + { + "epoch": 0.06677459177923342, + "grad_norm": 0.617341160774231, + "learning_rate": 4.666279035678338e-05, + "loss": 0.8427, + "step": 20500 + }, + { + "epoch": 0.0671003214952297, + "grad_norm": 0.694558322429657, + "learning_rate": 4.6646503340467044e-05, + "loss": 0.8619, + "step": 20600 + }, + { + "epoch": 0.06742605121122595, + "grad_norm": 0.6441329717636108, + "learning_rate": 4.663021632415072e-05, + "loss": 0.8866, + "step": 20700 + }, + { + "epoch": 0.06775178092722221, + "grad_norm": 0.46440285444259644, + "learning_rate": 4.661392930783438e-05, + "loss": 0.8435, + "step": 20800 + }, + { + "epoch": 0.06807751064321847, + "grad_norm": 0.42911046743392944, + "learning_rate": 4.659764229151805e-05, + "loss": 0.8145, + "step": 20900 + }, + { + "epoch": 0.06840324035921473, + "grad_norm": 0.7508918046951294, + "learning_rate": 4.6581355275201714e-05, + "loss": 0.8576, + "step": 21000 + }, + { + "epoch": 0.06872897007521099, + "grad_norm": 0.6361901164054871, + "learning_rate": 4.6565068258885386e-05, + "loss": 0.7982, + "step": 21100 + }, + { + "epoch": 0.06905469979120725, + "grad_norm": 0.804426372051239, + "learning_rate": 4.654878124256905e-05, + "loss": 0.8386, + "step": 21200 + }, + { + "epoch": 0.06938042950720351, + "grad_norm": 0.5336636304855347, + "learning_rate": 4.653249422625272e-05, + "loss": 0.8296, + "step": 21300 + }, + { + "epoch": 0.06970615922319977, + "grad_norm": 0.5880811810493469, + "learning_rate": 4.6516207209936384e-05, + "loss": 0.8065, + "step": 21400 + }, + { + "epoch": 0.07003188893919603, + "grad_norm": 0.4607875347137451, + "learning_rate": 4.649992019362005e-05, + "loss": 0.8601, + "step": 21500 + }, + { + "epoch": 0.0703576186551923, + "grad_norm": 0.6503331065177917, + "learning_rate": 4.648363317730372e-05, + "loss": 0.7925, + "step": 21600 + }, + { + "epoch": 0.07068334837118856, + "grad_norm": 0.7841913104057312, + "learning_rate": 4.646734616098739e-05, + "loss": 0.8218, + "step": 21700 + }, + { + "epoch": 0.07100907808718482, + "grad_norm": 0.45437848567962646, + "learning_rate": 4.645105914467105e-05, + "loss": 0.8663, + "step": 21800 + }, + { + "epoch": 0.07133480780318108, + "grad_norm": 0.6052650213241577, + "learning_rate": 4.643477212835472e-05, + "loss": 0.8634, + "step": 21900 + }, + { + "epoch": 0.07166053751917734, + "grad_norm": 0.5301306247711182, + "learning_rate": 4.641848511203839e-05, + "loss": 0.8215, + "step": 22000 + }, + { + "epoch": 0.0719862672351736, + "grad_norm": 0.8724095821380615, + "learning_rate": 4.640219809572206e-05, + "loss": 0.8304, + "step": 22100 + }, + { + "epoch": 0.07231199695116985, + "grad_norm": 0.8219661116600037, + "learning_rate": 4.6385911079405716e-05, + "loss": 0.8515, + "step": 22200 + }, + { + "epoch": 0.07263772666716611, + "grad_norm": 0.6308414936065674, + "learning_rate": 4.636962406308939e-05, + "loss": 0.7233, + "step": 22300 + }, + { + "epoch": 0.07296345638316237, + "grad_norm": 0.35772112011909485, + "learning_rate": 4.6353337046773054e-05, + "loss": 0.7792, + "step": 22400 + }, + { + "epoch": 0.07328918609915865, + "grad_norm": 0.519975483417511, + "learning_rate": 4.633705003045673e-05, + "loss": 0.8265, + "step": 22500 + }, + { + "epoch": 0.0736149158151549, + "grad_norm": 0.8935458660125732, + "learning_rate": 4.6320763014140386e-05, + "loss": 0.8276, + "step": 22600 + }, + { + "epoch": 0.07394064553115116, + "grad_norm": 0.4765929877758026, + "learning_rate": 4.630447599782406e-05, + "loss": 0.8088, + "step": 22700 + }, + { + "epoch": 0.07426637524714742, + "grad_norm": 0.5910876989364624, + "learning_rate": 4.6288188981507724e-05, + "loss": 0.8003, + "step": 22800 + }, + { + "epoch": 0.07459210496314368, + "grad_norm": 0.6108260154724121, + "learning_rate": 4.627190196519139e-05, + "loss": 0.7949, + "step": 22900 + }, + { + "epoch": 0.07491783467913994, + "grad_norm": 0.9665610194206238, + "learning_rate": 4.625561494887506e-05, + "loss": 0.7989, + "step": 23000 + }, + { + "epoch": 0.0752435643951362, + "grad_norm": 0.43020346760749817, + "learning_rate": 4.623932793255872e-05, + "loss": 0.8052, + "step": 23100 + }, + { + "epoch": 0.07556929411113246, + "grad_norm": 0.3901965022087097, + "learning_rate": 4.6223040916242394e-05, + "loss": 0.7756, + "step": 23200 + }, + { + "epoch": 0.07589502382712872, + "grad_norm": 0.8132317066192627, + "learning_rate": 4.620675389992606e-05, + "loss": 0.797, + "step": 23300 + }, + { + "epoch": 0.07622075354312499, + "grad_norm": 0.6211370825767517, + "learning_rate": 4.619046688360973e-05, + "loss": 0.7698, + "step": 23400 + }, + { + "epoch": 0.07654648325912125, + "grad_norm": 0.8378313779830933, + "learning_rate": 4.617417986729339e-05, + "loss": 0.805, + "step": 23500 + }, + { + "epoch": 0.07687221297511751, + "grad_norm": 0.9225132465362549, + "learning_rate": 4.615789285097706e-05, + "loss": 0.7999, + "step": 23600 + }, + { + "epoch": 0.07719794269111377, + "grad_norm": 0.46878713369369507, + "learning_rate": 4.614160583466073e-05, + "loss": 0.75, + "step": 23700 + }, + { + "epoch": 0.07752367240711003, + "grad_norm": 0.409138560295105, + "learning_rate": 4.6125318818344395e-05, + "loss": 0.7944, + "step": 23800 + }, + { + "epoch": 0.07784940212310629, + "grad_norm": 0.4791303277015686, + "learning_rate": 4.610903180202806e-05, + "loss": 0.7912, + "step": 23900 + }, + { + "epoch": 0.07817513183910255, + "grad_norm": 0.8759014010429382, + "learning_rate": 4.6092744785711726e-05, + "loss": 0.8198, + "step": 24000 + }, + { + "epoch": 0.0785008615550988, + "grad_norm": 0.47595012187957764, + "learning_rate": 4.60764577693954e-05, + "loss": 0.7984, + "step": 24100 + }, + { + "epoch": 0.07882659127109506, + "grad_norm": 0.7923133373260498, + "learning_rate": 4.6060170753079065e-05, + "loss": 0.7436, + "step": 24200 + }, + { + "epoch": 0.07915232098709134, + "grad_norm": 0.39254361391067505, + "learning_rate": 4.604388373676273e-05, + "loss": 0.7771, + "step": 24300 + }, + { + "epoch": 0.0794780507030876, + "grad_norm": 0.6828033924102783, + "learning_rate": 4.6027596720446396e-05, + "loss": 0.8083, + "step": 24400 + }, + { + "epoch": 0.07980378041908386, + "grad_norm": 0.6189585328102112, + "learning_rate": 4.601130970413006e-05, + "loss": 0.7885, + "step": 24500 + }, + { + "epoch": 0.08012951013508011, + "grad_norm": 0.6750975847244263, + "learning_rate": 4.5995022687813734e-05, + "loss": 0.759, + "step": 24600 + }, + { + "epoch": 0.08045523985107637, + "grad_norm": 0.6616020798683167, + "learning_rate": 4.59787356714974e-05, + "loss": 0.8226, + "step": 24700 + }, + { + "epoch": 0.08078096956707263, + "grad_norm": 0.7598117589950562, + "learning_rate": 4.5962448655181066e-05, + "loss": 0.7806, + "step": 24800 + }, + { + "epoch": 0.08110669928306889, + "grad_norm": 0.41183263063430786, + "learning_rate": 4.594616163886473e-05, + "loss": 0.7939, + "step": 24900 + }, + { + "epoch": 0.08143242899906515, + "grad_norm": 0.40911582112312317, + "learning_rate": 4.59298746225484e-05, + "loss": 0.7635, + "step": 25000 + }, + { + "epoch": 0.08175815871506141, + "grad_norm": 0.8820083737373352, + "learning_rate": 4.591358760623207e-05, + "loss": 0.7886, + "step": 25100 + }, + { + "epoch": 0.08208388843105768, + "grad_norm": 0.9055482745170593, + "learning_rate": 4.589730058991573e-05, + "loss": 0.7487, + "step": 25200 + }, + { + "epoch": 0.08240961814705394, + "grad_norm": 0.5680561065673828, + "learning_rate": 4.58810135735994e-05, + "loss": 0.7505, + "step": 25300 + }, + { + "epoch": 0.0827353478630502, + "grad_norm": 0.5064377188682556, + "learning_rate": 4.586472655728307e-05, + "loss": 0.768, + "step": 25400 + }, + { + "epoch": 0.08306107757904646, + "grad_norm": 0.462200403213501, + "learning_rate": 4.584843954096674e-05, + "loss": 0.7399, + "step": 25500 + }, + { + "epoch": 0.08338680729504272, + "grad_norm": 0.7820500731468201, + "learning_rate": 4.58321525246504e-05, + "loss": 0.8109, + "step": 25600 + }, + { + "epoch": 0.08371253701103898, + "grad_norm": 0.4833464026451111, + "learning_rate": 4.5815865508334064e-05, + "loss": 0.764, + "step": 25700 + }, + { + "epoch": 0.08403826672703524, + "grad_norm": 0.3821680247783661, + "learning_rate": 4.5799578492017737e-05, + "loss": 0.7397, + "step": 25800 + }, + { + "epoch": 0.0843639964430315, + "grad_norm": 0.5084909200668335, + "learning_rate": 4.57832914757014e-05, + "loss": 0.7428, + "step": 25900 + }, + { + "epoch": 0.08468972615902776, + "grad_norm": 0.925619900226593, + "learning_rate": 4.576700445938507e-05, + "loss": 0.7386, + "step": 26000 + }, + { + "epoch": 0.08501545587502403, + "grad_norm": 0.8126088380813599, + "learning_rate": 4.5750717443068734e-05, + "loss": 0.7798, + "step": 26100 + }, + { + "epoch": 0.08534118559102029, + "grad_norm": 1.0178046226501465, + "learning_rate": 4.5734430426752406e-05, + "loss": 0.7796, + "step": 26200 + }, + { + "epoch": 0.08566691530701655, + "grad_norm": 0.4879295229911804, + "learning_rate": 4.571814341043607e-05, + "loss": 0.7762, + "step": 26300 + }, + { + "epoch": 0.0859926450230128, + "grad_norm": 0.6722548604011536, + "learning_rate": 4.570185639411974e-05, + "loss": 0.7234, + "step": 26400 + }, + { + "epoch": 0.08631837473900907, + "grad_norm": 0.6326486468315125, + "learning_rate": 4.5685569377803403e-05, + "loss": 0.72, + "step": 26500 + }, + { + "epoch": 0.08664410445500532, + "grad_norm": 0.4354076087474823, + "learning_rate": 4.566928236148707e-05, + "loss": 0.7704, + "step": 26600 + }, + { + "epoch": 0.08696983417100158, + "grad_norm": 0.7113054394721985, + "learning_rate": 4.565299534517074e-05, + "loss": 0.7623, + "step": 26700 + }, + { + "epoch": 0.08729556388699784, + "grad_norm": 0.595664381980896, + "learning_rate": 4.563670832885441e-05, + "loss": 0.765, + "step": 26800 + }, + { + "epoch": 0.0876212936029941, + "grad_norm": 0.5344740152359009, + "learning_rate": 4.562042131253807e-05, + "loss": 0.7201, + "step": 26900 + }, + { + "epoch": 0.08794702331899037, + "grad_norm": 0.5330939292907715, + "learning_rate": 4.560413429622174e-05, + "loss": 0.7617, + "step": 27000 + }, + { + "epoch": 0.08827275303498663, + "grad_norm": 0.45265939831733704, + "learning_rate": 4.5587847279905405e-05, + "loss": 0.7806, + "step": 27100 + }, + { + "epoch": 0.0885984827509829, + "grad_norm": 0.5947338342666626, + "learning_rate": 4.557156026358908e-05, + "loss": 0.7524, + "step": 27200 + }, + { + "epoch": 0.08892421246697915, + "grad_norm": 0.8656592965126038, + "learning_rate": 4.555527324727274e-05, + "loss": 0.7599, + "step": 27300 + }, + { + "epoch": 0.08924994218297541, + "grad_norm": 0.645728349685669, + "learning_rate": 4.553898623095641e-05, + "loss": 0.7629, + "step": 27400 + }, + { + "epoch": 0.08957567189897167, + "grad_norm": 0.8474392890930176, + "learning_rate": 4.5522699214640074e-05, + "loss": 0.7641, + "step": 27500 + }, + { + "epoch": 0.08990140161496793, + "grad_norm": 0.7386724948883057, + "learning_rate": 4.550641219832375e-05, + "loss": 0.7523, + "step": 27600 + }, + { + "epoch": 0.09022713133096419, + "grad_norm": 0.9216130971908569, + "learning_rate": 4.549012518200741e-05, + "loss": 0.7562, + "step": 27700 + }, + { + "epoch": 0.09055286104696045, + "grad_norm": 0.8789349794387817, + "learning_rate": 4.547383816569107e-05, + "loss": 0.7229, + "step": 27800 + }, + { + "epoch": 0.0908785907629567, + "grad_norm": 0.582091748714447, + "learning_rate": 4.5457551149374744e-05, + "loss": 0.7274, + "step": 27900 + }, + { + "epoch": 0.09120432047895298, + "grad_norm": 0.6011328101158142, + "learning_rate": 4.544126413305841e-05, + "loss": 0.7297, + "step": 28000 + }, + { + "epoch": 0.09153005019494924, + "grad_norm": 0.6041598916053772, + "learning_rate": 4.542497711674208e-05, + "loss": 0.7409, + "step": 28100 + }, + { + "epoch": 0.0918557799109455, + "grad_norm": 0.7190874814987183, + "learning_rate": 4.540869010042574e-05, + "loss": 0.7149, + "step": 28200 + }, + { + "epoch": 0.09218150962694176, + "grad_norm": 0.5705780982971191, + "learning_rate": 4.5392403084109414e-05, + "loss": 0.76, + "step": 28300 + }, + { + "epoch": 0.09250723934293802, + "grad_norm": 0.7988401651382446, + "learning_rate": 4.537611606779308e-05, + "loss": 0.7594, + "step": 28400 + }, + { + "epoch": 0.09283296905893428, + "grad_norm": 0.48971208930015564, + "learning_rate": 4.5359829051476745e-05, + "loss": 0.7505, + "step": 28500 + }, + { + "epoch": 0.09315869877493053, + "grad_norm": 0.6600379347801208, + "learning_rate": 4.534354203516041e-05, + "loss": 0.7902, + "step": 28600 + }, + { + "epoch": 0.0934844284909268, + "grad_norm": 0.6095920205116272, + "learning_rate": 4.5327255018844077e-05, + "loss": 0.7166, + "step": 28700 + }, + { + "epoch": 0.09381015820692305, + "grad_norm": 0.6808424592018127, + "learning_rate": 4.531096800252775e-05, + "loss": 0.7148, + "step": 28800 + }, + { + "epoch": 0.09413588792291933, + "grad_norm": 0.9923068284988403, + "learning_rate": 4.5294680986211415e-05, + "loss": 0.7226, + "step": 28900 + }, + { + "epoch": 0.09446161763891558, + "grad_norm": 0.8952274918556213, + "learning_rate": 4.527839396989508e-05, + "loss": 0.7645, + "step": 29000 + }, + { + "epoch": 0.09478734735491184, + "grad_norm": 0.7416999936103821, + "learning_rate": 4.5262106953578746e-05, + "loss": 0.7503, + "step": 29100 + }, + { + "epoch": 0.0951130770709081, + "grad_norm": 0.7862002849578857, + "learning_rate": 4.524581993726242e-05, + "loss": 0.7469, + "step": 29200 + }, + { + "epoch": 0.09543880678690436, + "grad_norm": 0.6296769380569458, + "learning_rate": 4.5229532920946085e-05, + "loss": 0.6873, + "step": 29300 + }, + { + "epoch": 0.09576453650290062, + "grad_norm": 0.9056894779205322, + "learning_rate": 4.521324590462975e-05, + "loss": 0.7126, + "step": 29400 + }, + { + "epoch": 0.09609026621889688, + "grad_norm": 0.624724268913269, + "learning_rate": 4.5196958888313416e-05, + "loss": 0.7668, + "step": 29500 + }, + { + "epoch": 0.09641599593489314, + "grad_norm": 0.680957555770874, + "learning_rate": 4.518067187199708e-05, + "loss": 0.7783, + "step": 29600 + }, + { + "epoch": 0.0967417256508894, + "grad_norm": 0.5778472423553467, + "learning_rate": 4.5164384855680754e-05, + "loss": 0.7355, + "step": 29700 + }, + { + "epoch": 0.09706745536688567, + "grad_norm": 0.6346442699432373, + "learning_rate": 4.514809783936442e-05, + "loss": 0.7276, + "step": 29800 + }, + { + "epoch": 0.09739318508288193, + "grad_norm": 0.9289300441741943, + "learning_rate": 4.5131810823048086e-05, + "loss": 0.7179, + "step": 29900 + }, + { + "epoch": 0.09771891479887819, + "grad_norm": 0.7473464012145996, + "learning_rate": 4.511552380673175e-05, + "loss": 0.7172, + "step": 30000 + }, + { + "epoch": 0.09804464451487445, + "grad_norm": 0.6801792979240417, + "learning_rate": 4.509923679041542e-05, + "loss": 0.7074, + "step": 30100 + }, + { + "epoch": 0.09837037423087071, + "grad_norm": 0.6129624247550964, + "learning_rate": 4.508294977409909e-05, + "loss": 0.7166, + "step": 30200 + }, + { + "epoch": 0.09869610394686697, + "grad_norm": 0.8195613026618958, + "learning_rate": 4.506666275778275e-05, + "loss": 0.7709, + "step": 30300 + }, + { + "epoch": 0.09902183366286323, + "grad_norm": 0.4703550934791565, + "learning_rate": 4.505037574146642e-05, + "loss": 0.7037, + "step": 30400 + }, + { + "epoch": 0.09934756337885949, + "grad_norm": 0.7674877047538757, + "learning_rate": 4.503408872515009e-05, + "loss": 0.7202, + "step": 30500 + }, + { + "epoch": 0.09967329309485574, + "grad_norm": 0.8670388460159302, + "learning_rate": 4.501780170883376e-05, + "loss": 0.7183, + "step": 30600 + }, + { + "epoch": 0.09999902281085202, + "grad_norm": 0.280652791261673, + "learning_rate": 4.500151469251742e-05, + "loss": 0.6998, + "step": 30700 + }, + { + "epoch": 0.10032475252684828, + "grad_norm": 0.7346746325492859, + "learning_rate": 4.4985227676201084e-05, + "loss": 0.7358, + "step": 30800 + }, + { + "epoch": 0.10065048224284454, + "grad_norm": 0.978670060634613, + "learning_rate": 4.4968940659884756e-05, + "loss": 0.7259, + "step": 30900 + }, + { + "epoch": 0.1009762119588408, + "grad_norm": 0.5910704135894775, + "learning_rate": 4.495265364356842e-05, + "loss": 0.7074, + "step": 31000 + }, + { + "epoch": 0.10130194167483705, + "grad_norm": 0.7966532707214355, + "learning_rate": 4.493636662725209e-05, + "loss": 0.7117, + "step": 31100 + }, + { + "epoch": 0.10162767139083331, + "grad_norm": 0.9344640374183655, + "learning_rate": 4.4920079610935754e-05, + "loss": 0.7349, + "step": 31200 + }, + { + "epoch": 0.10195340110682957, + "grad_norm": 0.8043787479400635, + "learning_rate": 4.4903792594619426e-05, + "loss": 0.7361, + "step": 31300 + }, + { + "epoch": 0.10227913082282583, + "grad_norm": 0.6786687970161438, + "learning_rate": 4.488750557830309e-05, + "loss": 0.6969, + "step": 31400 + }, + { + "epoch": 0.10260486053882209, + "grad_norm": 0.4679253399372101, + "learning_rate": 4.487121856198676e-05, + "loss": 0.7157, + "step": 31500 + }, + { + "epoch": 0.10293059025481836, + "grad_norm": 0.5903817415237427, + "learning_rate": 4.485493154567042e-05, + "loss": 0.7352, + "step": 31600 + }, + { + "epoch": 0.10325631997081462, + "grad_norm": 0.715834379196167, + "learning_rate": 4.483864452935409e-05, + "loss": 0.7532, + "step": 31700 + }, + { + "epoch": 0.10358204968681088, + "grad_norm": 0.6664106249809265, + "learning_rate": 4.482235751303776e-05, + "loss": 0.6853, + "step": 31800 + }, + { + "epoch": 0.10390777940280714, + "grad_norm": 0.700243353843689, + "learning_rate": 4.480607049672143e-05, + "loss": 0.6835, + "step": 31900 + }, + { + "epoch": 0.1042335091188034, + "grad_norm": 0.7481942772865295, + "learning_rate": 4.478978348040509e-05, + "loss": 0.7343, + "step": 32000 + }, + { + "epoch": 0.10455923883479966, + "grad_norm": 0.5347774028778076, + "learning_rate": 4.477349646408876e-05, + "loss": 0.6688, + "step": 32100 + }, + { + "epoch": 0.10488496855079592, + "grad_norm": 0.541346549987793, + "learning_rate": 4.4757209447772425e-05, + "loss": 0.7088, + "step": 32200 + }, + { + "epoch": 0.10521069826679218, + "grad_norm": 0.6126936674118042, + "learning_rate": 4.47409224314561e-05, + "loss": 0.7333, + "step": 32300 + }, + { + "epoch": 0.10553642798278844, + "grad_norm": 0.952684760093689, + "learning_rate": 4.472463541513976e-05, + "loss": 0.7242, + "step": 32400 + }, + { + "epoch": 0.10586215769878471, + "grad_norm": 0.72658771276474, + "learning_rate": 4.470834839882343e-05, + "loss": 0.7422, + "step": 32500 + }, + { + "epoch": 0.10618788741478097, + "grad_norm": 0.5741873383522034, + "learning_rate": 4.4692061382507094e-05, + "loss": 0.7307, + "step": 32600 + }, + { + "epoch": 0.10651361713077723, + "grad_norm": 0.646496057510376, + "learning_rate": 4.467577436619077e-05, + "loss": 0.7138, + "step": 32700 + }, + { + "epoch": 0.10683934684677349, + "grad_norm": 0.40007448196411133, + "learning_rate": 4.465948734987443e-05, + "loss": 0.7045, + "step": 32800 + }, + { + "epoch": 0.10716507656276975, + "grad_norm": 0.6594932675361633, + "learning_rate": 4.464320033355809e-05, + "loss": 0.6874, + "step": 32900 + }, + { + "epoch": 0.107490806278766, + "grad_norm": 0.7663995623588562, + "learning_rate": 4.4626913317241764e-05, + "loss": 0.7303, + "step": 33000 + }, + { + "epoch": 0.10781653599476226, + "grad_norm": 0.5867152810096741, + "learning_rate": 4.461062630092543e-05, + "loss": 0.7072, + "step": 33100 + }, + { + "epoch": 0.10814226571075852, + "grad_norm": 0.5017038583755493, + "learning_rate": 4.45943392846091e-05, + "loss": 0.6879, + "step": 33200 + }, + { + "epoch": 0.10846799542675478, + "grad_norm": 0.6196131110191345, + "learning_rate": 4.457805226829276e-05, + "loss": 0.7094, + "step": 33300 + }, + { + "epoch": 0.10879372514275105, + "grad_norm": 0.643118679523468, + "learning_rate": 4.4561765251976434e-05, + "loss": 0.6763, + "step": 33400 + }, + { + "epoch": 0.10911945485874731, + "grad_norm": 0.516583263874054, + "learning_rate": 4.45454782356601e-05, + "loss": 0.6744, + "step": 33500 + }, + { + "epoch": 0.10944518457474357, + "grad_norm": 0.6565887928009033, + "learning_rate": 4.4529191219343765e-05, + "loss": 0.6818, + "step": 33600 + }, + { + "epoch": 0.10977091429073983, + "grad_norm": 0.644209623336792, + "learning_rate": 4.451290420302743e-05, + "loss": 0.6795, + "step": 33700 + }, + { + "epoch": 0.11009664400673609, + "grad_norm": 0.5720322132110596, + "learning_rate": 4.4496617186711096e-05, + "loss": 0.6444, + "step": 33800 + }, + { + "epoch": 0.11042237372273235, + "grad_norm": 0.7580476999282837, + "learning_rate": 4.448033017039477e-05, + "loss": 0.7067, + "step": 33900 + }, + { + "epoch": 0.11074810343872861, + "grad_norm": 0.3334468603134155, + "learning_rate": 4.4464043154078435e-05, + "loss": 0.7245, + "step": 34000 + }, + { + "epoch": 0.11107383315472487, + "grad_norm": 0.7232679724693298, + "learning_rate": 4.44477561377621e-05, + "loss": 0.6476, + "step": 34100 + }, + { + "epoch": 0.11139956287072113, + "grad_norm": 0.49447712302207947, + "learning_rate": 4.4431469121445766e-05, + "loss": 0.6813, + "step": 34200 + }, + { + "epoch": 0.11172529258671739, + "grad_norm": 0.9112755656242371, + "learning_rate": 4.441518210512943e-05, + "loss": 0.7039, + "step": 34300 + }, + { + "epoch": 0.11205102230271366, + "grad_norm": 0.9391865134239197, + "learning_rate": 4.4398895088813104e-05, + "loss": 0.7154, + "step": 34400 + }, + { + "epoch": 0.11237675201870992, + "grad_norm": 0.6869890689849854, + "learning_rate": 4.438260807249677e-05, + "loss": 0.7462, + "step": 34500 + }, + { + "epoch": 0.11270248173470618, + "grad_norm": 0.6954273581504822, + "learning_rate": 4.4366321056180436e-05, + "loss": 0.7151, + "step": 34600 + }, + { + "epoch": 0.11302821145070244, + "grad_norm": 0.8512132167816162, + "learning_rate": 4.43500340398641e-05, + "loss": 0.7157, + "step": 34700 + }, + { + "epoch": 0.1133539411666987, + "grad_norm": 0.7044045329093933, + "learning_rate": 4.4333747023547774e-05, + "loss": 0.6649, + "step": 34800 + }, + { + "epoch": 0.11367967088269496, + "grad_norm": 0.6773298978805542, + "learning_rate": 4.431746000723144e-05, + "loss": 0.6137, + "step": 34900 + }, + { + "epoch": 0.11400540059869121, + "grad_norm": 0.544491171836853, + "learning_rate": 4.43011729909151e-05, + "loss": 0.6577, + "step": 35000 + }, + { + "epoch": 0.11433113031468747, + "grad_norm": 0.543596625328064, + "learning_rate": 4.428488597459877e-05, + "loss": 0.6699, + "step": 35100 + }, + { + "epoch": 0.11465686003068373, + "grad_norm": 0.7878594398498535, + "learning_rate": 4.426859895828244e-05, + "loss": 0.709, + "step": 35200 + }, + { + "epoch": 0.11498258974668, + "grad_norm": 0.8226998448371887, + "learning_rate": 4.425231194196611e-05, + "loss": 0.6954, + "step": 35300 + }, + { + "epoch": 0.11530831946267626, + "grad_norm": 0.48608875274658203, + "learning_rate": 4.423602492564977e-05, + "loss": 0.7502, + "step": 35400 + }, + { + "epoch": 0.11563404917867252, + "grad_norm": 0.6490182280540466, + "learning_rate": 4.421973790933344e-05, + "loss": 0.7085, + "step": 35500 + }, + { + "epoch": 0.11595977889466878, + "grad_norm": 0.3032003343105316, + "learning_rate": 4.420345089301711e-05, + "loss": 0.6778, + "step": 35600 + }, + { + "epoch": 0.11628550861066504, + "grad_norm": 0.7003344297409058, + "learning_rate": 4.418716387670077e-05, + "loss": 0.71, + "step": 35700 + }, + { + "epoch": 0.1166112383266613, + "grad_norm": 0.6569785475730896, + "learning_rate": 4.417087686038444e-05, + "loss": 0.653, + "step": 35800 + }, + { + "epoch": 0.11693696804265756, + "grad_norm": 0.5428867936134338, + "learning_rate": 4.4154589844068104e-05, + "loss": 0.6733, + "step": 35900 + }, + { + "epoch": 0.11726269775865382, + "grad_norm": 0.6179760098457336, + "learning_rate": 4.4138302827751776e-05, + "loss": 0.7081, + "step": 36000 + }, + { + "epoch": 0.11758842747465008, + "grad_norm": 0.7397803068161011, + "learning_rate": 4.412201581143544e-05, + "loss": 0.6894, + "step": 36100 + }, + { + "epoch": 0.11791415719064635, + "grad_norm": 0.725395679473877, + "learning_rate": 4.410572879511911e-05, + "loss": 0.6874, + "step": 36200 + }, + { + "epoch": 0.11823988690664261, + "grad_norm": 0.45658519864082336, + "learning_rate": 4.4089441778802774e-05, + "loss": 0.6821, + "step": 36300 + }, + { + "epoch": 0.11856561662263887, + "grad_norm": 0.9002487063407898, + "learning_rate": 4.407315476248644e-05, + "loss": 0.641, + "step": 36400 + }, + { + "epoch": 0.11889134633863513, + "grad_norm": 0.8738647103309631, + "learning_rate": 4.405686774617011e-05, + "loss": 0.6763, + "step": 36500 + }, + { + "epoch": 0.11921707605463139, + "grad_norm": 1.0051002502441406, + "learning_rate": 4.404058072985378e-05, + "loss": 0.6775, + "step": 36600 + }, + { + "epoch": 0.11954280577062765, + "grad_norm": 0.8074469566345215, + "learning_rate": 4.402429371353744e-05, + "loss": 0.7408, + "step": 36700 + }, + { + "epoch": 0.1198685354866239, + "grad_norm": 0.485388845205307, + "learning_rate": 4.400800669722111e-05, + "loss": 0.6729, + "step": 36800 + }, + { + "epoch": 0.12019426520262017, + "grad_norm": 0.7123886942863464, + "learning_rate": 4.399171968090478e-05, + "loss": 0.661, + "step": 36900 + }, + { + "epoch": 0.12051999491861642, + "grad_norm": 0.4587586522102356, + "learning_rate": 4.397543266458845e-05, + "loss": 0.6662, + "step": 37000 + }, + { + "epoch": 0.1208457246346127, + "grad_norm": 0.7726449966430664, + "learning_rate": 4.395914564827211e-05, + "loss": 0.7469, + "step": 37100 + }, + { + "epoch": 0.12117145435060896, + "grad_norm": 0.8636273741722107, + "learning_rate": 4.394285863195578e-05, + "loss": 0.6669, + "step": 37200 + }, + { + "epoch": 0.12149718406660522, + "grad_norm": 0.6817033886909485, + "learning_rate": 4.3926571615639444e-05, + "loss": 0.6874, + "step": 37300 + }, + { + "epoch": 0.12182291378260147, + "grad_norm": 0.5549355149269104, + "learning_rate": 4.391028459932312e-05, + "loss": 0.6939, + "step": 37400 + }, + { + "epoch": 0.12214864349859773, + "grad_norm": 0.6180316805839539, + "learning_rate": 4.389399758300678e-05, + "loss": 0.6299, + "step": 37500 + }, + { + "epoch": 0.12247437321459399, + "grad_norm": 0.7779985070228577, + "learning_rate": 4.387771056669045e-05, + "loss": 0.7181, + "step": 37600 + }, + { + "epoch": 0.12280010293059025, + "grad_norm": 0.7182669043540955, + "learning_rate": 4.3861423550374114e-05, + "loss": 0.6703, + "step": 37700 + }, + { + "epoch": 0.12312583264658651, + "grad_norm": 0.7191387414932251, + "learning_rate": 4.3845136534057787e-05, + "loss": 0.6802, + "step": 37800 + }, + { + "epoch": 0.12345156236258277, + "grad_norm": 0.6137369275093079, + "learning_rate": 4.382884951774145e-05, + "loss": 0.7028, + "step": 37900 + }, + { + "epoch": 0.12377729207857904, + "grad_norm": 0.7508791089057922, + "learning_rate": 4.381256250142511e-05, + "loss": 0.642, + "step": 38000 + }, + { + "epoch": 0.1241030217945753, + "grad_norm": 0.6414891481399536, + "learning_rate": 4.3796275485108784e-05, + "loss": 0.6255, + "step": 38100 + }, + { + "epoch": 0.12442875151057156, + "grad_norm": 0.6669697165489197, + "learning_rate": 4.377998846879245e-05, + "loss": 0.6691, + "step": 38200 + }, + { + "epoch": 0.12475448122656782, + "grad_norm": 0.8991898894309998, + "learning_rate": 4.376370145247612e-05, + "loss": 0.6727, + "step": 38300 + }, + { + "epoch": 0.12508021094256408, + "grad_norm": 0.4924679398536682, + "learning_rate": 4.374741443615978e-05, + "loss": 0.6661, + "step": 38400 + }, + { + "epoch": 0.12540594065856034, + "grad_norm": 0.3712103068828583, + "learning_rate": 4.3731127419843453e-05, + "loss": 0.7306, + "step": 38500 + }, + { + "epoch": 0.1257316703745566, + "grad_norm": 0.9136518836021423, + "learning_rate": 4.371484040352712e-05, + "loss": 0.6453, + "step": 38600 + }, + { + "epoch": 0.12605740009055286, + "grad_norm": 0.6828204393386841, + "learning_rate": 4.3698553387210785e-05, + "loss": 0.6587, + "step": 38700 + }, + { + "epoch": 0.12638312980654912, + "grad_norm": 0.6366333961486816, + "learning_rate": 4.368226637089445e-05, + "loss": 0.6606, + "step": 38800 + }, + { + "epoch": 0.12670885952254538, + "grad_norm": 0.39375558495521545, + "learning_rate": 4.3665979354578116e-05, + "loss": 0.6937, + "step": 38900 + }, + { + "epoch": 0.12703458923854163, + "grad_norm": 0.46293410658836365, + "learning_rate": 4.364969233826179e-05, + "loss": 0.6504, + "step": 39000 + }, + { + "epoch": 0.1273603189545379, + "grad_norm": 0.9897958040237427, + "learning_rate": 4.3633405321945455e-05, + "loss": 0.7126, + "step": 39100 + }, + { + "epoch": 0.12768604867053415, + "grad_norm": 0.5616987347602844, + "learning_rate": 4.361711830562912e-05, + "loss": 0.5956, + "step": 39200 + }, + { + "epoch": 0.1280117783865304, + "grad_norm": 0.4081191122531891, + "learning_rate": 4.3600831289312786e-05, + "loss": 0.6648, + "step": 39300 + }, + { + "epoch": 0.1283375081025267, + "grad_norm": 0.485188364982605, + "learning_rate": 4.358454427299645e-05, + "loss": 0.6694, + "step": 39400 + }, + { + "epoch": 0.12866323781852296, + "grad_norm": 0.7212422490119934, + "learning_rate": 4.3568257256680124e-05, + "loss": 0.6767, + "step": 39500 + }, + { + "epoch": 0.12898896753451922, + "grad_norm": 0.5502139925956726, + "learning_rate": 4.355197024036379e-05, + "loss": 0.6721, + "step": 39600 + }, + { + "epoch": 0.12931469725051548, + "grad_norm": 0.49975594878196716, + "learning_rate": 4.3535683224047456e-05, + "loss": 0.6669, + "step": 39700 + }, + { + "epoch": 0.12964042696651173, + "grad_norm": 0.4203544557094574, + "learning_rate": 4.351939620773112e-05, + "loss": 0.6716, + "step": 39800 + }, + { + "epoch": 0.129966156682508, + "grad_norm": 0.5464275479316711, + "learning_rate": 4.3503109191414794e-05, + "loss": 0.6544, + "step": 39900 + }, + { + "epoch": 0.13029188639850425, + "grad_norm": 0.6473097801208496, + "learning_rate": 4.348682217509846e-05, + "loss": 0.6977, + "step": 40000 + }, + { + "epoch": 0.1306176161145005, + "grad_norm": 0.39890334010124207, + "learning_rate": 4.347053515878212e-05, + "loss": 0.6704, + "step": 40100 + }, + { + "epoch": 0.13094334583049677, + "grad_norm": 1.0785876512527466, + "learning_rate": 4.345424814246579e-05, + "loss": 0.6196, + "step": 40200 + }, + { + "epoch": 0.13126907554649303, + "grad_norm": 0.6607077121734619, + "learning_rate": 4.343796112614946e-05, + "loss": 0.6608, + "step": 40300 + }, + { + "epoch": 0.1315948052624893, + "grad_norm": 0.5987501740455627, + "learning_rate": 4.342167410983313e-05, + "loss": 0.6334, + "step": 40400 + }, + { + "epoch": 0.13192053497848555, + "grad_norm": 0.3443163335323334, + "learning_rate": 4.340538709351679e-05, + "loss": 0.6621, + "step": 40500 + }, + { + "epoch": 0.1322462646944818, + "grad_norm": 0.9362694025039673, + "learning_rate": 4.338910007720046e-05, + "loss": 0.6404, + "step": 40600 + }, + { + "epoch": 0.13257199441047807, + "grad_norm": 0.5049243569374084, + "learning_rate": 4.3372813060884127e-05, + "loss": 0.6426, + "step": 40700 + }, + { + "epoch": 0.13289772412647433, + "grad_norm": 0.787389874458313, + "learning_rate": 4.335652604456779e-05, + "loss": 0.6432, + "step": 40800 + }, + { + "epoch": 0.13322345384247058, + "grad_norm": 0.8065658211708069, + "learning_rate": 4.334023902825146e-05, + "loss": 0.6477, + "step": 40900 + }, + { + "epoch": 0.13354918355846684, + "grad_norm": 0.5166397094726562, + "learning_rate": 4.3323952011935124e-05, + "loss": 0.6384, + "step": 41000 + }, + { + "epoch": 0.1338749132744631, + "grad_norm": 0.9597229957580566, + "learning_rate": 4.3307664995618796e-05, + "loss": 0.6832, + "step": 41100 + }, + { + "epoch": 0.1342006429904594, + "grad_norm": 0.5936517715454102, + "learning_rate": 4.329137797930246e-05, + "loss": 0.6767, + "step": 41200 + }, + { + "epoch": 0.13452637270645565, + "grad_norm": 0.8391766548156738, + "learning_rate": 4.3275090962986135e-05, + "loss": 0.6215, + "step": 41300 + }, + { + "epoch": 0.1348521024224519, + "grad_norm": 0.977497398853302, + "learning_rate": 4.3258803946669793e-05, + "loss": 0.6307, + "step": 41400 + }, + { + "epoch": 0.13517783213844817, + "grad_norm": 0.6750873923301697, + "learning_rate": 4.324251693035346e-05, + "loss": 0.631, + "step": 41500 + }, + { + "epoch": 0.13550356185444443, + "grad_norm": 0.4655423164367676, + "learning_rate": 4.322622991403713e-05, + "loss": 0.7025, + "step": 41600 + }, + { + "epoch": 0.13582929157044069, + "grad_norm": 0.43544334173202515, + "learning_rate": 4.32099428977208e-05, + "loss": 0.6555, + "step": 41700 + }, + { + "epoch": 0.13615502128643694, + "grad_norm": 0.7595189213752747, + "learning_rate": 4.319365588140446e-05, + "loss": 0.6197, + "step": 41800 + }, + { + "epoch": 0.1364807510024332, + "grad_norm": 0.4422534108161926, + "learning_rate": 4.317736886508813e-05, + "loss": 0.5798, + "step": 41900 + }, + { + "epoch": 0.13680648071842946, + "grad_norm": 0.4622032344341278, + "learning_rate": 4.31610818487718e-05, + "loss": 0.6493, + "step": 42000 + }, + { + "epoch": 0.13713221043442572, + "grad_norm": 0.7267939448356628, + "learning_rate": 4.314479483245547e-05, + "loss": 0.6228, + "step": 42100 + }, + { + "epoch": 0.13745794015042198, + "grad_norm": 0.66838139295578, + "learning_rate": 4.312850781613913e-05, + "loss": 0.6507, + "step": 42200 + }, + { + "epoch": 0.13778366986641824, + "grad_norm": 0.40865644812583923, + "learning_rate": 4.31122207998228e-05, + "loss": 0.6388, + "step": 42300 + }, + { + "epoch": 0.1381093995824145, + "grad_norm": 0.7203364968299866, + "learning_rate": 4.3095933783506464e-05, + "loss": 0.589, + "step": 42400 + }, + { + "epoch": 0.13843512929841076, + "grad_norm": 0.7719990015029907, + "learning_rate": 4.307964676719014e-05, + "loss": 0.6446, + "step": 42500 + }, + { + "epoch": 0.13876085901440702, + "grad_norm": 0.35780540108680725, + "learning_rate": 4.30633597508738e-05, + "loss": 0.683, + "step": 42600 + }, + { + "epoch": 0.13908658873040328, + "grad_norm": 0.5952534675598145, + "learning_rate": 4.304707273455747e-05, + "loss": 0.6697, + "step": 42700 + }, + { + "epoch": 0.13941231844639954, + "grad_norm": 0.539117157459259, + "learning_rate": 4.3030785718241134e-05, + "loss": 0.6582, + "step": 42800 + }, + { + "epoch": 0.1397380481623958, + "grad_norm": 0.8181525468826294, + "learning_rate": 4.30144987019248e-05, + "loss": 0.6695, + "step": 42900 + }, + { + "epoch": 0.14006377787839205, + "grad_norm": 0.8720047473907471, + "learning_rate": 4.299821168560847e-05, + "loss": 0.5931, + "step": 43000 + }, + { + "epoch": 0.14038950759438834, + "grad_norm": 0.9138098955154419, + "learning_rate": 4.298192466929213e-05, + "loss": 0.6874, + "step": 43100 + }, + { + "epoch": 0.1407152373103846, + "grad_norm": 0.8015493750572205, + "learning_rate": 4.2965637652975804e-05, + "loss": 0.6574, + "step": 43200 + }, + { + "epoch": 0.14104096702638086, + "grad_norm": 0.8426867723464966, + "learning_rate": 4.294935063665947e-05, + "loss": 0.6662, + "step": 43300 + }, + { + "epoch": 0.14136669674237712, + "grad_norm": 0.3480939567089081, + "learning_rate": 4.293306362034314e-05, + "loss": 0.6351, + "step": 43400 + }, + { + "epoch": 0.14169242645837338, + "grad_norm": 0.5666735172271729, + "learning_rate": 4.29167766040268e-05, + "loss": 0.641, + "step": 43500 + }, + { + "epoch": 0.14201815617436964, + "grad_norm": 0.9445961117744446, + "learning_rate": 4.2900489587710467e-05, + "loss": 0.6608, + "step": 43600 + }, + { + "epoch": 0.1423438858903659, + "grad_norm": 0.7916907072067261, + "learning_rate": 4.288420257139414e-05, + "loss": 0.6615, + "step": 43700 + }, + { + "epoch": 0.14266961560636215, + "grad_norm": 0.9159532785415649, + "learning_rate": 4.2867915555077805e-05, + "loss": 0.5919, + "step": 43800 + }, + { + "epoch": 0.1429953453223584, + "grad_norm": 0.5766249895095825, + "learning_rate": 4.285162853876147e-05, + "loss": 0.6724, + "step": 43900 + }, + { + "epoch": 0.14332107503835467, + "grad_norm": 0.753519594669342, + "learning_rate": 4.2835341522445136e-05, + "loss": 0.6995, + "step": 44000 + }, + { + "epoch": 0.14364680475435093, + "grad_norm": 1.1004271507263184, + "learning_rate": 4.281905450612881e-05, + "loss": 0.6636, + "step": 44100 + }, + { + "epoch": 0.1439725344703472, + "grad_norm": 0.7064334154129028, + "learning_rate": 4.2802767489812475e-05, + "loss": 0.6793, + "step": 44200 + }, + { + "epoch": 0.14429826418634345, + "grad_norm": 0.5158839225769043, + "learning_rate": 4.278648047349614e-05, + "loss": 0.6336, + "step": 44300 + }, + { + "epoch": 0.1446239939023397, + "grad_norm": 1.0451433658599854, + "learning_rate": 4.2770193457179806e-05, + "loss": 0.6227, + "step": 44400 + }, + { + "epoch": 0.14494972361833597, + "grad_norm": 0.5956864356994629, + "learning_rate": 4.275390644086347e-05, + "loss": 0.6517, + "step": 44500 + }, + { + "epoch": 0.14527545333433223, + "grad_norm": 0.9525729417800903, + "learning_rate": 4.2737619424547144e-05, + "loss": 0.6245, + "step": 44600 + }, + { + "epoch": 0.1456011830503285, + "grad_norm": 0.7456961274147034, + "learning_rate": 4.272133240823081e-05, + "loss": 0.6577, + "step": 44700 + }, + { + "epoch": 0.14592691276632475, + "grad_norm": 0.5686585307121277, + "learning_rate": 4.2705045391914476e-05, + "loss": 0.6675, + "step": 44800 + }, + { + "epoch": 0.14625264248232103, + "grad_norm": 0.5127500295639038, + "learning_rate": 4.268875837559814e-05, + "loss": 0.5966, + "step": 44900 + }, + { + "epoch": 0.1465783721983173, + "grad_norm": 0.6099263429641724, + "learning_rate": 4.267247135928181e-05, + "loss": 0.6259, + "step": 45000 + }, + { + "epoch": 0.14690410191431355, + "grad_norm": 0.5734119415283203, + "learning_rate": 4.265618434296548e-05, + "loss": 0.6251, + "step": 45100 + }, + { + "epoch": 0.1472298316303098, + "grad_norm": 0.40758875012397766, + "learning_rate": 4.263989732664914e-05, + "loss": 0.5856, + "step": 45200 + }, + { + "epoch": 0.14755556134630607, + "grad_norm": 0.5974459052085876, + "learning_rate": 4.262361031033281e-05, + "loss": 0.6443, + "step": 45300 + }, + { + "epoch": 0.14788129106230233, + "grad_norm": 0.48085859417915344, + "learning_rate": 4.260732329401648e-05, + "loss": 0.6612, + "step": 45400 + }, + { + "epoch": 0.1482070207782986, + "grad_norm": 0.5771530270576477, + "learning_rate": 4.259103627770015e-05, + "loss": 0.6272, + "step": 45500 + }, + { + "epoch": 0.14853275049429485, + "grad_norm": 0.8463455438613892, + "learning_rate": 4.2574749261383815e-05, + "loss": 0.6008, + "step": 45600 + }, + { + "epoch": 0.1488584802102911, + "grad_norm": 0.7014292478561401, + "learning_rate": 4.255846224506748e-05, + "loss": 0.5353, + "step": 45700 + }, + { + "epoch": 0.14918420992628736, + "grad_norm": 0.6181588768959045, + "learning_rate": 4.2542175228751146e-05, + "loss": 0.6139, + "step": 45800 + }, + { + "epoch": 0.14950993964228362, + "grad_norm": 0.6540141701698303, + "learning_rate": 4.252588821243481e-05, + "loss": 0.5997, + "step": 45900 + }, + { + "epoch": 0.14983566935827988, + "grad_norm": 0.47981733083724976, + "learning_rate": 4.2509601196118485e-05, + "loss": 0.6511, + "step": 46000 + }, + { + "epoch": 0.15016139907427614, + "grad_norm": 0.964857816696167, + "learning_rate": 4.2493314179802144e-05, + "loss": 0.6365, + "step": 46100 + }, + { + "epoch": 0.1504871287902724, + "grad_norm": 0.6706714034080505, + "learning_rate": 4.2477027163485816e-05, + "loss": 0.664, + "step": 46200 + }, + { + "epoch": 0.15081285850626866, + "grad_norm": 0.5073367953300476, + "learning_rate": 4.246074014716948e-05, + "loss": 0.5633, + "step": 46300 + }, + { + "epoch": 0.15113858822226492, + "grad_norm": 0.37114378809928894, + "learning_rate": 4.2444453130853154e-05, + "loss": 0.6498, + "step": 46400 + }, + { + "epoch": 0.15146431793826118, + "grad_norm": 1.153325080871582, + "learning_rate": 4.242816611453681e-05, + "loss": 0.6254, + "step": 46500 + }, + { + "epoch": 0.15179004765425744, + "grad_norm": 0.7353873252868652, + "learning_rate": 4.241187909822048e-05, + "loss": 0.6573, + "step": 46600 + }, + { + "epoch": 0.15211577737025372, + "grad_norm": 0.5379579067230225, + "learning_rate": 4.239559208190415e-05, + "loss": 0.6642, + "step": 46700 + }, + { + "epoch": 0.15244150708624998, + "grad_norm": 0.341907799243927, + "learning_rate": 4.237930506558782e-05, + "loss": 0.6294, + "step": 46800 + }, + { + "epoch": 0.15276723680224624, + "grad_norm": 0.3866462707519531, + "learning_rate": 4.236301804927148e-05, + "loss": 0.6212, + "step": 46900 + }, + { + "epoch": 0.1530929665182425, + "grad_norm": 0.6686252951622009, + "learning_rate": 4.234673103295515e-05, + "loss": 0.64, + "step": 47000 + }, + { + "epoch": 0.15341869623423876, + "grad_norm": 0.6398385167121887, + "learning_rate": 4.233044401663882e-05, + "loss": 0.6156, + "step": 47100 + }, + { + "epoch": 0.15374442595023502, + "grad_norm": 0.8679475784301758, + "learning_rate": 4.231415700032249e-05, + "loss": 0.6492, + "step": 47200 + }, + { + "epoch": 0.15407015566623128, + "grad_norm": 0.6425623297691345, + "learning_rate": 4.229786998400615e-05, + "loss": 0.6661, + "step": 47300 + }, + { + "epoch": 0.15439588538222754, + "grad_norm": 0.7811526656150818, + "learning_rate": 4.228158296768982e-05, + "loss": 0.6416, + "step": 47400 + }, + { + "epoch": 0.1547216150982238, + "grad_norm": 0.6820793747901917, + "learning_rate": 4.2265295951373484e-05, + "loss": 0.6426, + "step": 47500 + }, + { + "epoch": 0.15504734481422006, + "grad_norm": 0.8748511672019958, + "learning_rate": 4.224900893505716e-05, + "loss": 0.6038, + "step": 47600 + }, + { + "epoch": 0.15537307453021632, + "grad_norm": 0.6828723549842834, + "learning_rate": 4.223272191874082e-05, + "loss": 0.6408, + "step": 47700 + }, + { + "epoch": 0.15569880424621257, + "grad_norm": 1.01051926612854, + "learning_rate": 4.221643490242449e-05, + "loss": 0.6218, + "step": 47800 + }, + { + "epoch": 0.15602453396220883, + "grad_norm": 0.6920143961906433, + "learning_rate": 4.2200147886108154e-05, + "loss": 0.63, + "step": 47900 + }, + { + "epoch": 0.1563502636782051, + "grad_norm": 0.6410394310951233, + "learning_rate": 4.218386086979182e-05, + "loss": 0.6176, + "step": 48000 + }, + { + "epoch": 0.15667599339420135, + "grad_norm": 0.5157743692398071, + "learning_rate": 4.216757385347549e-05, + "loss": 0.5947, + "step": 48100 + }, + { + "epoch": 0.1570017231101976, + "grad_norm": 0.6770983934402466, + "learning_rate": 4.215128683715915e-05, + "loss": 0.6192, + "step": 48200 + }, + { + "epoch": 0.15732745282619387, + "grad_norm": 0.49714550375938416, + "learning_rate": 4.2134999820842824e-05, + "loss": 0.6121, + "step": 48300 + }, + { + "epoch": 0.15765318254219013, + "grad_norm": 0.3486001789569855, + "learning_rate": 4.211871280452649e-05, + "loss": 0.5821, + "step": 48400 + }, + { + "epoch": 0.15797891225818642, + "grad_norm": 0.4202999770641327, + "learning_rate": 4.210242578821016e-05, + "loss": 0.5909, + "step": 48500 + }, + { + "epoch": 0.15830464197418267, + "grad_norm": 0.44769522547721863, + "learning_rate": 4.208613877189382e-05, + "loss": 0.6369, + "step": 48600 + }, + { + "epoch": 0.15863037169017893, + "grad_norm": 0.6501901745796204, + "learning_rate": 4.2069851755577486e-05, + "loss": 0.6187, + "step": 48700 + }, + { + "epoch": 0.1589561014061752, + "grad_norm": 0.8261470794677734, + "learning_rate": 4.205356473926116e-05, + "loss": 0.6136, + "step": 48800 + }, + { + "epoch": 0.15928183112217145, + "grad_norm": 0.9979439973831177, + "learning_rate": 4.2037277722944825e-05, + "loss": 0.623, + "step": 48900 + }, + { + "epoch": 0.1596075608381677, + "grad_norm": 0.5651659369468689, + "learning_rate": 4.202099070662849e-05, + "loss": 0.6742, + "step": 49000 + }, + { + "epoch": 0.15993329055416397, + "grad_norm": 0.7412470579147339, + "learning_rate": 4.2004703690312156e-05, + "loss": 0.6272, + "step": 49100 + }, + { + "epoch": 0.16025902027016023, + "grad_norm": 0.43271690607070923, + "learning_rate": 4.198841667399583e-05, + "loss": 0.5729, + "step": 49200 + }, + { + "epoch": 0.1605847499861565, + "grad_norm": 0.5117851495742798, + "learning_rate": 4.1972129657679494e-05, + "loss": 0.6156, + "step": 49300 + }, + { + "epoch": 0.16091047970215275, + "grad_norm": 0.7106539011001587, + "learning_rate": 4.195584264136316e-05, + "loss": 0.6052, + "step": 49400 + }, + { + "epoch": 0.161236209418149, + "grad_norm": 0.6146919131278992, + "learning_rate": 4.1939555625046826e-05, + "loss": 0.5932, + "step": 49500 + }, + { + "epoch": 0.16156193913414527, + "grad_norm": 0.49088531732559204, + "learning_rate": 4.192326860873049e-05, + "loss": 0.568, + "step": 49600 + }, + { + "epoch": 0.16188766885014153, + "grad_norm": 0.9923317432403564, + "learning_rate": 4.1906981592414164e-05, + "loss": 0.596, + "step": 49700 + }, + { + "epoch": 0.16221339856613778, + "grad_norm": 0.3995937705039978, + "learning_rate": 4.189069457609783e-05, + "loss": 0.6442, + "step": 49800 + }, + { + "epoch": 0.16253912828213404, + "grad_norm": 0.5258984565734863, + "learning_rate": 4.1874407559781496e-05, + "loss": 0.5601, + "step": 49900 + }, + { + "epoch": 0.1628648579981303, + "grad_norm": 0.19585928320884705, + "learning_rate": 4.185812054346516e-05, + "loss": 0.6509, + "step": 50000 + }, + { + "epoch": 0.16319058771412656, + "grad_norm": 0.625548243522644, + "learning_rate": 4.184183352714883e-05, + "loss": 0.6411, + "step": 50100 + }, + { + "epoch": 0.16351631743012282, + "grad_norm": 0.7014303207397461, + "learning_rate": 4.18255465108325e-05, + "loss": 0.6125, + "step": 50200 + }, + { + "epoch": 0.16384204714611908, + "grad_norm": 0.5523779988288879, + "learning_rate": 4.1809259494516165e-05, + "loss": 0.5811, + "step": 50300 + }, + { + "epoch": 0.16416777686211537, + "grad_norm": 0.5742841958999634, + "learning_rate": 4.179297247819983e-05, + "loss": 0.6282, + "step": 50400 + }, + { + "epoch": 0.16449350657811163, + "grad_norm": 0.5776492357254028, + "learning_rate": 4.17766854618835e-05, + "loss": 0.6622, + "step": 50500 + }, + { + "epoch": 0.16481923629410788, + "grad_norm": 0.7464694380760193, + "learning_rate": 4.176039844556717e-05, + "loss": 0.6309, + "step": 50600 + }, + { + "epoch": 0.16514496601010414, + "grad_norm": 0.5271546244621277, + "learning_rate": 4.1744111429250835e-05, + "loss": 0.645, + "step": 50700 + }, + { + "epoch": 0.1654706957261004, + "grad_norm": 0.6904231905937195, + "learning_rate": 4.1727824412934494e-05, + "loss": 0.5927, + "step": 50800 + }, + { + "epoch": 0.16579642544209666, + "grad_norm": 0.578195333480835, + "learning_rate": 4.1711537396618166e-05, + "loss": 0.5812, + "step": 50900 + }, + { + "epoch": 0.16612215515809292, + "grad_norm": 0.8716936707496643, + "learning_rate": 4.169525038030183e-05, + "loss": 0.6261, + "step": 51000 + }, + { + "epoch": 0.16644788487408918, + "grad_norm": 0.6577697992324829, + "learning_rate": 4.1678963363985505e-05, + "loss": 0.6101, + "step": 51100 + }, + { + "epoch": 0.16677361459008544, + "grad_norm": 0.7431929111480713, + "learning_rate": 4.1662676347669164e-05, + "loss": 0.6227, + "step": 51200 + }, + { + "epoch": 0.1670993443060817, + "grad_norm": 0.9198315739631653, + "learning_rate": 4.1646389331352836e-05, + "loss": 0.6399, + "step": 51300 + }, + { + "epoch": 0.16742507402207796, + "grad_norm": 0.5159572958946228, + "learning_rate": 4.16301023150365e-05, + "loss": 0.6329, + "step": 51400 + }, + { + "epoch": 0.16775080373807422, + "grad_norm": 0.7744697332382202, + "learning_rate": 4.161381529872017e-05, + "loss": 0.5579, + "step": 51500 + }, + { + "epoch": 0.16807653345407048, + "grad_norm": 0.4429173767566681, + "learning_rate": 4.159752828240383e-05, + "loss": 0.5786, + "step": 51600 + }, + { + "epoch": 0.16840226317006673, + "grad_norm": 0.7796801924705505, + "learning_rate": 4.15812412660875e-05, + "loss": 0.6353, + "step": 51700 + }, + { + "epoch": 0.168727992886063, + "grad_norm": 0.43117523193359375, + "learning_rate": 4.156495424977117e-05, + "loss": 0.5807, + "step": 51800 + }, + { + "epoch": 0.16905372260205925, + "grad_norm": 0.44315412640571594, + "learning_rate": 4.154866723345484e-05, + "loss": 0.5979, + "step": 51900 + }, + { + "epoch": 0.1693794523180555, + "grad_norm": 0.4306319057941437, + "learning_rate": 4.15323802171385e-05, + "loss": 0.6498, + "step": 52000 + }, + { + "epoch": 0.16970518203405177, + "grad_norm": 0.283033549785614, + "learning_rate": 4.151609320082217e-05, + "loss": 0.6329, + "step": 52100 + }, + { + "epoch": 0.17003091175004806, + "grad_norm": 0.4118421673774719, + "learning_rate": 4.1499806184505834e-05, + "loss": 0.5933, + "step": 52200 + }, + { + "epoch": 0.17035664146604432, + "grad_norm": 0.9130700826644897, + "learning_rate": 4.148351916818951e-05, + "loss": 0.5349, + "step": 52300 + }, + { + "epoch": 0.17068237118204058, + "grad_norm": 0.33348548412323, + "learning_rate": 4.146723215187317e-05, + "loss": 0.6182, + "step": 52400 + }, + { + "epoch": 0.17100810089803684, + "grad_norm": 0.6642253398895264, + "learning_rate": 4.145094513555684e-05, + "loss": 0.5989, + "step": 52500 + }, + { + "epoch": 0.1713338306140331, + "grad_norm": 0.7113855481147766, + "learning_rate": 4.1434658119240504e-05, + "loss": 0.6063, + "step": 52600 + }, + { + "epoch": 0.17165956033002935, + "grad_norm": 1.0840643644332886, + "learning_rate": 4.1418371102924177e-05, + "loss": 0.615, + "step": 52700 + }, + { + "epoch": 0.1719852900460256, + "grad_norm": 0.5277838706970215, + "learning_rate": 4.140208408660784e-05, + "loss": 0.6234, + "step": 52800 + }, + { + "epoch": 0.17231101976202187, + "grad_norm": 0.5993104577064514, + "learning_rate": 4.13857970702915e-05, + "loss": 0.5905, + "step": 52900 + }, + { + "epoch": 0.17263674947801813, + "grad_norm": 0.7363581657409668, + "learning_rate": 4.1369510053975174e-05, + "loss": 0.6032, + "step": 53000 + }, + { + "epoch": 0.1729624791940144, + "grad_norm": 0.6299027800559998, + "learning_rate": 4.135322303765884e-05, + "loss": 0.5717, + "step": 53100 + }, + { + "epoch": 0.17328820891001065, + "grad_norm": 0.49232372641563416, + "learning_rate": 4.133693602134251e-05, + "loss": 0.6031, + "step": 53200 + }, + { + "epoch": 0.1736139386260069, + "grad_norm": 0.7371428608894348, + "learning_rate": 4.132064900502617e-05, + "loss": 0.5608, + "step": 53300 + }, + { + "epoch": 0.17393966834200317, + "grad_norm": 1.0730559825897217, + "learning_rate": 4.1304361988709843e-05, + "loss": 0.6026, + "step": 53400 + }, + { + "epoch": 0.17426539805799943, + "grad_norm": 0.674548327922821, + "learning_rate": 4.128807497239351e-05, + "loss": 0.5721, + "step": 53500 + }, + { + "epoch": 0.17459112777399569, + "grad_norm": 0.5990965962409973, + "learning_rate": 4.1271787956077175e-05, + "loss": 0.6185, + "step": 53600 + }, + { + "epoch": 0.17491685748999194, + "grad_norm": 0.61868816614151, + "learning_rate": 4.125550093976084e-05, + "loss": 0.6089, + "step": 53700 + }, + { + "epoch": 0.1752425872059882, + "grad_norm": 0.4897661507129669, + "learning_rate": 4.1239213923444506e-05, + "loss": 0.6025, + "step": 53800 + }, + { + "epoch": 0.17556831692198446, + "grad_norm": 0.2856525480747223, + "learning_rate": 4.122292690712818e-05, + "loss": 0.5609, + "step": 53900 + }, + { + "epoch": 0.17589404663798075, + "grad_norm": 0.5488519668579102, + "learning_rate": 4.1206639890811845e-05, + "loss": 0.5781, + "step": 54000 + }, + { + "epoch": 0.176219776353977, + "grad_norm": 0.7812597155570984, + "learning_rate": 4.119035287449551e-05, + "loss": 0.665, + "step": 54100 + }, + { + "epoch": 0.17654550606997327, + "grad_norm": 0.5567785501480103, + "learning_rate": 4.1174065858179176e-05, + "loss": 0.6178, + "step": 54200 + }, + { + "epoch": 0.17687123578596953, + "grad_norm": 0.7302952408790588, + "learning_rate": 4.115777884186285e-05, + "loss": 0.5912, + "step": 54300 + }, + { + "epoch": 0.1771969655019658, + "grad_norm": 0.6872962713241577, + "learning_rate": 4.1141491825546514e-05, + "loss": 0.5698, + "step": 54400 + }, + { + "epoch": 0.17752269521796205, + "grad_norm": 0.6139744520187378, + "learning_rate": 4.112520480923018e-05, + "loss": 0.6148, + "step": 54500 + }, + { + "epoch": 0.1778484249339583, + "grad_norm": 0.6646268367767334, + "learning_rate": 4.1108917792913846e-05, + "loss": 0.5222, + "step": 54600 + }, + { + "epoch": 0.17817415464995456, + "grad_norm": 0.4842844009399414, + "learning_rate": 4.109263077659751e-05, + "loss": 0.6225, + "step": 54700 + }, + { + "epoch": 0.17849988436595082, + "grad_norm": 0.6158716082572937, + "learning_rate": 4.1076343760281184e-05, + "loss": 0.634, + "step": 54800 + }, + { + "epoch": 0.17882561408194708, + "grad_norm": 0.5122677683830261, + "learning_rate": 4.106005674396485e-05, + "loss": 0.6355, + "step": 54900 + }, + { + "epoch": 0.17915134379794334, + "grad_norm": 0.6086121201515198, + "learning_rate": 4.1043769727648515e-05, + "loss": 0.5787, + "step": 55000 + }, + { + "epoch": 0.1794770735139396, + "grad_norm": 0.5853461623191833, + "learning_rate": 4.102748271133218e-05, + "loss": 0.5935, + "step": 55100 + }, + { + "epoch": 0.17980280322993586, + "grad_norm": 0.9216148853302002, + "learning_rate": 4.101119569501585e-05, + "loss": 0.575, + "step": 55200 + }, + { + "epoch": 0.18012853294593212, + "grad_norm": 0.6602348685264587, + "learning_rate": 4.099490867869952e-05, + "loss": 0.6324, + "step": 55300 + }, + { + "epoch": 0.18045426266192838, + "grad_norm": 0.7494210004806519, + "learning_rate": 4.0978621662383185e-05, + "loss": 0.5859, + "step": 55400 + }, + { + "epoch": 0.18077999237792464, + "grad_norm": 0.6391832232475281, + "learning_rate": 4.096233464606685e-05, + "loss": 0.6172, + "step": 55500 + }, + { + "epoch": 0.1811057220939209, + "grad_norm": 0.5824201107025146, + "learning_rate": 4.0946047629750517e-05, + "loss": 0.6298, + "step": 55600 + }, + { + "epoch": 0.18143145180991715, + "grad_norm": 0.6924212574958801, + "learning_rate": 4.092976061343419e-05, + "loss": 0.6105, + "step": 55700 + }, + { + "epoch": 0.1817571815259134, + "grad_norm": 0.4423877000808716, + "learning_rate": 4.0913473597117855e-05, + "loss": 0.5613, + "step": 55800 + }, + { + "epoch": 0.1820829112419097, + "grad_norm": 0.6090314984321594, + "learning_rate": 4.0897186580801514e-05, + "loss": 0.6643, + "step": 55900 + }, + { + "epoch": 0.18240864095790596, + "grad_norm": 0.7554407119750977, + "learning_rate": 4.0880899564485186e-05, + "loss": 0.6017, + "step": 56000 + }, + { + "epoch": 0.18273437067390222, + "grad_norm": 0.8148972988128662, + "learning_rate": 4.086461254816885e-05, + "loss": 0.6539, + "step": 56100 + }, + { + "epoch": 0.18306010038989848, + "grad_norm": 0.5610066652297974, + "learning_rate": 4.0848325531852525e-05, + "loss": 0.5872, + "step": 56200 + }, + { + "epoch": 0.18338583010589474, + "grad_norm": 0.6361645460128784, + "learning_rate": 4.0832038515536183e-05, + "loss": 0.5815, + "step": 56300 + }, + { + "epoch": 0.183711559821891, + "grad_norm": 0.4567771553993225, + "learning_rate": 4.0815751499219856e-05, + "loss": 0.5799, + "step": 56400 + }, + { + "epoch": 0.18403728953788726, + "grad_norm": 0.8705578446388245, + "learning_rate": 4.079946448290352e-05, + "loss": 0.6088, + "step": 56500 + }, + { + "epoch": 0.18436301925388351, + "grad_norm": 0.8278294801712036, + "learning_rate": 4.078317746658719e-05, + "loss": 0.6064, + "step": 56600 + }, + { + "epoch": 0.18468874896987977, + "grad_norm": 0.38864201307296753, + "learning_rate": 4.076689045027085e-05, + "loss": 0.5705, + "step": 56700 + }, + { + "epoch": 0.18501447868587603, + "grad_norm": 0.6986147165298462, + "learning_rate": 4.075060343395452e-05, + "loss": 0.6071, + "step": 56800 + }, + { + "epoch": 0.1853402084018723, + "grad_norm": 0.9127377867698669, + "learning_rate": 4.073431641763819e-05, + "loss": 0.608, + "step": 56900 + }, + { + "epoch": 0.18566593811786855, + "grad_norm": 0.5072229504585266, + "learning_rate": 4.071802940132186e-05, + "loss": 0.583, + "step": 57000 + }, + { + "epoch": 0.1859916678338648, + "grad_norm": 0.47545337677001953, + "learning_rate": 4.070174238500552e-05, + "loss": 0.5826, + "step": 57100 + }, + { + "epoch": 0.18631739754986107, + "grad_norm": 0.5175743103027344, + "learning_rate": 4.068545536868919e-05, + "loss": 0.6184, + "step": 57200 + }, + { + "epoch": 0.18664312726585733, + "grad_norm": 0.7252177596092224, + "learning_rate": 4.0669168352372854e-05, + "loss": 0.6042, + "step": 57300 + }, + { + "epoch": 0.1869688569818536, + "grad_norm": 0.21297673881053925, + "learning_rate": 4.065288133605653e-05, + "loss": 0.5874, + "step": 57400 + }, + { + "epoch": 0.18729458669784985, + "grad_norm": 0.6985592246055603, + "learning_rate": 4.063659431974019e-05, + "loss": 0.5641, + "step": 57500 + }, + { + "epoch": 0.1876203164138461, + "grad_norm": 0.35783612728118896, + "learning_rate": 4.062030730342386e-05, + "loss": 0.5743, + "step": 57600 + }, + { + "epoch": 0.1879460461298424, + "grad_norm": 0.40871796011924744, + "learning_rate": 4.0604020287107524e-05, + "loss": 0.6418, + "step": 57700 + }, + { + "epoch": 0.18827177584583865, + "grad_norm": 0.6412025094032288, + "learning_rate": 4.0587733270791197e-05, + "loss": 0.6048, + "step": 57800 + }, + { + "epoch": 0.1885975055618349, + "grad_norm": 0.6944416165351868, + "learning_rate": 4.057144625447486e-05, + "loss": 0.5647, + "step": 57900 + }, + { + "epoch": 0.18892323527783117, + "grad_norm": 0.8592963218688965, + "learning_rate": 4.055515923815852e-05, + "loss": 0.5703, + "step": 58000 + }, + { + "epoch": 0.18924896499382743, + "grad_norm": 0.7240419983863831, + "learning_rate": 4.0538872221842194e-05, + "loss": 0.6025, + "step": 58100 + }, + { + "epoch": 0.1895746947098237, + "grad_norm": 0.3861270546913147, + "learning_rate": 4.052258520552586e-05, + "loss": 0.5864, + "step": 58200 + }, + { + "epoch": 0.18990042442581995, + "grad_norm": 0.6718447208404541, + "learning_rate": 4.050629818920953e-05, + "loss": 0.6139, + "step": 58300 + }, + { + "epoch": 0.1902261541418162, + "grad_norm": 0.7049744129180908, + "learning_rate": 4.049001117289319e-05, + "loss": 0.5697, + "step": 58400 + }, + { + "epoch": 0.19055188385781247, + "grad_norm": 0.39576876163482666, + "learning_rate": 4.047372415657686e-05, + "loss": 0.5987, + "step": 58500 + }, + { + "epoch": 0.19087761357380872, + "grad_norm": 0.7814981341362, + "learning_rate": 4.045743714026053e-05, + "loss": 0.5715, + "step": 58600 + }, + { + "epoch": 0.19120334328980498, + "grad_norm": 1.0083011388778687, + "learning_rate": 4.0441150123944195e-05, + "loss": 0.6355, + "step": 58700 + }, + { + "epoch": 0.19152907300580124, + "grad_norm": 0.7083866596221924, + "learning_rate": 4.042486310762786e-05, + "loss": 0.6666, + "step": 58800 + }, + { + "epoch": 0.1918548027217975, + "grad_norm": 0.4740765690803528, + "learning_rate": 4.0408576091311526e-05, + "loss": 0.5773, + "step": 58900 + }, + { + "epoch": 0.19218053243779376, + "grad_norm": 0.3599790632724762, + "learning_rate": 4.03922890749952e-05, + "loss": 0.5916, + "step": 59000 + }, + { + "epoch": 0.19250626215379002, + "grad_norm": 0.6107310652732849, + "learning_rate": 4.0376002058678865e-05, + "loss": 0.63, + "step": 59100 + }, + { + "epoch": 0.19283199186978628, + "grad_norm": 0.6388813257217407, + "learning_rate": 4.035971504236253e-05, + "loss": 0.6197, + "step": 59200 + }, + { + "epoch": 0.19315772158578254, + "grad_norm": 0.4137844145298004, + "learning_rate": 4.0343428026046196e-05, + "loss": 0.6185, + "step": 59300 + }, + { + "epoch": 0.1934834513017788, + "grad_norm": 0.6289616823196411, + "learning_rate": 4.032714100972986e-05, + "loss": 0.6367, + "step": 59400 + }, + { + "epoch": 0.19380918101777508, + "grad_norm": 0.7528841495513916, + "learning_rate": 4.0310853993413534e-05, + "loss": 0.5783, + "step": 59500 + }, + { + "epoch": 0.19413491073377134, + "grad_norm": 0.7345238924026489, + "learning_rate": 4.02945669770972e-05, + "loss": 0.6378, + "step": 59600 + }, + { + "epoch": 0.1944606404497676, + "grad_norm": 0.7652753591537476, + "learning_rate": 4.0278279960780866e-05, + "loss": 0.5419, + "step": 59700 + }, + { + "epoch": 0.19478637016576386, + "grad_norm": 0.3726235032081604, + "learning_rate": 4.026199294446453e-05, + "loss": 0.5933, + "step": 59800 + }, + { + "epoch": 0.19511209988176012, + "grad_norm": 0.475990355014801, + "learning_rate": 4.0245705928148204e-05, + "loss": 0.5421, + "step": 59900 + }, + { + "epoch": 0.19543782959775638, + "grad_norm": 0.8618846535682678, + "learning_rate": 4.022941891183187e-05, + "loss": 0.6149, + "step": 60000 + }, + { + "epoch": 0.19576355931375264, + "grad_norm": 0.3643835484981537, + "learning_rate": 4.0213131895515535e-05, + "loss": 0.5898, + "step": 60100 + }, + { + "epoch": 0.1960892890297489, + "grad_norm": 0.6492701172828674, + "learning_rate": 4.01968448791992e-05, + "loss": 0.6115, + "step": 60200 + }, + { + "epoch": 0.19641501874574516, + "grad_norm": 0.46400219202041626, + "learning_rate": 4.018055786288287e-05, + "loss": 0.6093, + "step": 60300 + }, + { + "epoch": 0.19674074846174142, + "grad_norm": 0.6529611349105835, + "learning_rate": 4.016427084656654e-05, + "loss": 0.5663, + "step": 60400 + }, + { + "epoch": 0.19706647817773768, + "grad_norm": 0.8332497477531433, + "learning_rate": 4.0147983830250205e-05, + "loss": 0.557, + "step": 60500 + }, + { + "epoch": 0.19739220789373393, + "grad_norm": 0.43394774198532104, + "learning_rate": 4.013169681393387e-05, + "loss": 0.5864, + "step": 60600 + }, + { + "epoch": 0.1977179376097302, + "grad_norm": 0.3713783919811249, + "learning_rate": 4.0115409797617537e-05, + "loss": 0.597, + "step": 60700 + }, + { + "epoch": 0.19804366732572645, + "grad_norm": 0.5605040788650513, + "learning_rate": 4.00991227813012e-05, + "loss": 0.5965, + "step": 60800 + }, + { + "epoch": 0.1983693970417227, + "grad_norm": 0.4591531455516815, + "learning_rate": 4.0082835764984875e-05, + "loss": 0.5718, + "step": 60900 + }, + { + "epoch": 0.19869512675771897, + "grad_norm": 0.7599985003471375, + "learning_rate": 4.0066548748668534e-05, + "loss": 0.6088, + "step": 61000 + }, + { + "epoch": 0.19902085647371523, + "grad_norm": 0.7234918475151062, + "learning_rate": 4.0050261732352206e-05, + "loss": 0.6022, + "step": 61100 + }, + { + "epoch": 0.1993465861897115, + "grad_norm": 0.8344034552574158, + "learning_rate": 4.003397471603587e-05, + "loss": 0.5978, + "step": 61200 + }, + { + "epoch": 0.19967231590570778, + "grad_norm": 0.7539324164390564, + "learning_rate": 4.0017687699719544e-05, + "loss": 0.5979, + "step": 61300 + }, + { + "epoch": 0.19999804562170403, + "grad_norm": 0.7535436153411865, + "learning_rate": 4.00014006834032e-05, + "loss": 0.5632, + "step": 61400 + }, + { + "epoch": 0.2003237753377003, + "grad_norm": 1.0253859758377075, + "learning_rate": 3.998511366708687e-05, + "loss": 0.6245, + "step": 61500 + }, + { + "epoch": 0.20064950505369655, + "grad_norm": 0.8442240357398987, + "learning_rate": 3.996882665077054e-05, + "loss": 0.56, + "step": 61600 + }, + { + "epoch": 0.2009752347696928, + "grad_norm": 0.7696794867515564, + "learning_rate": 3.995253963445421e-05, + "loss": 0.5525, + "step": 61700 + }, + { + "epoch": 0.20130096448568907, + "grad_norm": 1.0839108228683472, + "learning_rate": 3.993625261813787e-05, + "loss": 0.576, + "step": 61800 + }, + { + "epoch": 0.20162669420168533, + "grad_norm": 0.4837821125984192, + "learning_rate": 3.991996560182154e-05, + "loss": 0.6654, + "step": 61900 + }, + { + "epoch": 0.2019524239176816, + "grad_norm": 0.8696286082267761, + "learning_rate": 3.990367858550521e-05, + "loss": 0.5237, + "step": 62000 + }, + { + "epoch": 0.20227815363367785, + "grad_norm": 0.5389662384986877, + "learning_rate": 3.988739156918888e-05, + "loss": 0.5765, + "step": 62100 + }, + { + "epoch": 0.2026038833496741, + "grad_norm": 0.39996546506881714, + "learning_rate": 3.987110455287254e-05, + "loss": 0.5666, + "step": 62200 + }, + { + "epoch": 0.20292961306567037, + "grad_norm": 0.5612654685974121, + "learning_rate": 3.985481753655621e-05, + "loss": 0.5975, + "step": 62300 + }, + { + "epoch": 0.20325534278166663, + "grad_norm": 0.4764688014984131, + "learning_rate": 3.9838530520239874e-05, + "loss": 0.5973, + "step": 62400 + }, + { + "epoch": 0.20358107249766288, + "grad_norm": 0.538745105266571, + "learning_rate": 3.982224350392355e-05, + "loss": 0.6108, + "step": 62500 + }, + { + "epoch": 0.20390680221365914, + "grad_norm": 0.6589317321777344, + "learning_rate": 3.980595648760721e-05, + "loss": 0.5482, + "step": 62600 + }, + { + "epoch": 0.2042325319296554, + "grad_norm": 0.8373557925224304, + "learning_rate": 3.978966947129088e-05, + "loss": 0.5671, + "step": 62700 + }, + { + "epoch": 0.20455826164565166, + "grad_norm": 0.6305526494979858, + "learning_rate": 3.9773382454974544e-05, + "loss": 0.6205, + "step": 62800 + }, + { + "epoch": 0.20488399136164792, + "grad_norm": 0.6550065875053406, + "learning_rate": 3.9757095438658216e-05, + "loss": 0.5805, + "step": 62900 + }, + { + "epoch": 0.20520972107764418, + "grad_norm": 0.6951280236244202, + "learning_rate": 3.974080842234188e-05, + "loss": 0.6103, + "step": 63000 + }, + { + "epoch": 0.20553545079364044, + "grad_norm": 0.5202652215957642, + "learning_rate": 3.972452140602554e-05, + "loss": 0.5623, + "step": 63100 + }, + { + "epoch": 0.20586118050963673, + "grad_norm": 1.0889042615890503, + "learning_rate": 3.9708234389709214e-05, + "loss": 0.5879, + "step": 63200 + }, + { + "epoch": 0.20618691022563299, + "grad_norm": 0.4142896234989166, + "learning_rate": 3.969194737339288e-05, + "loss": 0.6148, + "step": 63300 + }, + { + "epoch": 0.20651263994162924, + "grad_norm": 0.6650342345237732, + "learning_rate": 3.967566035707655e-05, + "loss": 0.5902, + "step": 63400 + }, + { + "epoch": 0.2068383696576255, + "grad_norm": 0.42452552914619446, + "learning_rate": 3.965937334076021e-05, + "loss": 0.4877, + "step": 63500 + }, + { + "epoch": 0.20716409937362176, + "grad_norm": 0.6702756881713867, + "learning_rate": 3.964308632444388e-05, + "loss": 0.5943, + "step": 63600 + }, + { + "epoch": 0.20748982908961802, + "grad_norm": 0.9007012248039246, + "learning_rate": 3.962679930812755e-05, + "loss": 0.5652, + "step": 63700 + }, + { + "epoch": 0.20781555880561428, + "grad_norm": 0.8962705135345459, + "learning_rate": 3.9610512291811215e-05, + "loss": 0.5731, + "step": 63800 + }, + { + "epoch": 0.20814128852161054, + "grad_norm": 0.8256299495697021, + "learning_rate": 3.959422527549489e-05, + "loss": 0.5596, + "step": 63900 + }, + { + "epoch": 0.2084670182376068, + "grad_norm": 0.5674106478691101, + "learning_rate": 3.9577938259178546e-05, + "loss": 0.557, + "step": 64000 + }, + { + "epoch": 0.20879274795360306, + "grad_norm": 0.564755916595459, + "learning_rate": 3.956165124286222e-05, + "loss": 0.5735, + "step": 64100 + }, + { + "epoch": 0.20911847766959932, + "grad_norm": 1.0437874794006348, + "learning_rate": 3.9545364226545884e-05, + "loss": 0.5371, + "step": 64200 + }, + { + "epoch": 0.20944420738559558, + "grad_norm": 0.877699077129364, + "learning_rate": 3.952907721022956e-05, + "loss": 0.538, + "step": 64300 + }, + { + "epoch": 0.20976993710159184, + "grad_norm": 0.6481153964996338, + "learning_rate": 3.9512790193913216e-05, + "loss": 0.5763, + "step": 64400 + }, + { + "epoch": 0.2100956668175881, + "grad_norm": 0.7963904142379761, + "learning_rate": 3.949650317759688e-05, + "loss": 0.5617, + "step": 64500 + }, + { + "epoch": 0.21042139653358435, + "grad_norm": 1.1034698486328125, + "learning_rate": 3.9480216161280554e-05, + "loss": 0.5876, + "step": 64600 + }, + { + "epoch": 0.2107471262495806, + "grad_norm": 0.7540128827095032, + "learning_rate": 3.946392914496422e-05, + "loss": 0.574, + "step": 64700 + }, + { + "epoch": 0.21107285596557687, + "grad_norm": 0.7184910178184509, + "learning_rate": 3.9447642128647886e-05, + "loss": 0.5328, + "step": 64800 + }, + { + "epoch": 0.21139858568157313, + "grad_norm": 0.7150009274482727, + "learning_rate": 3.943135511233155e-05, + "loss": 0.6049, + "step": 64900 + }, + { + "epoch": 0.21172431539756942, + "grad_norm": 0.4451941251754761, + "learning_rate": 3.9415068096015224e-05, + "loss": 0.5958, + "step": 65000 + }, + { + "epoch": 0.21205004511356568, + "grad_norm": 1.00858736038208, + "learning_rate": 3.939878107969889e-05, + "loss": 0.5752, + "step": 65100 + }, + { + "epoch": 0.21237577482956194, + "grad_norm": 0.7953845858573914, + "learning_rate": 3.9382494063382555e-05, + "loss": 0.5555, + "step": 65200 + }, + { + "epoch": 0.2127015045455582, + "grad_norm": 0.5992127060890198, + "learning_rate": 3.936620704706622e-05, + "loss": 0.59, + "step": 65300 + }, + { + "epoch": 0.21302723426155445, + "grad_norm": 0.5878809690475464, + "learning_rate": 3.934992003074989e-05, + "loss": 0.5881, + "step": 65400 + }, + { + "epoch": 0.2133529639775507, + "grad_norm": 0.9159529805183411, + "learning_rate": 3.933363301443356e-05, + "loss": 0.5951, + "step": 65500 + }, + { + "epoch": 0.21367869369354697, + "grad_norm": 0.6340069770812988, + "learning_rate": 3.9317345998117225e-05, + "loss": 0.5799, + "step": 65600 + }, + { + "epoch": 0.21400442340954323, + "grad_norm": 0.8940368890762329, + "learning_rate": 3.930105898180089e-05, + "loss": 0.5273, + "step": 65700 + }, + { + "epoch": 0.2143301531255395, + "grad_norm": 0.7908622622489929, + "learning_rate": 3.9284771965484556e-05, + "loss": 0.5472, + "step": 65800 + }, + { + "epoch": 0.21465588284153575, + "grad_norm": 0.9964277744293213, + "learning_rate": 3.926848494916822e-05, + "loss": 0.5719, + "step": 65900 + }, + { + "epoch": 0.214981612557532, + "grad_norm": 0.6497515439987183, + "learning_rate": 3.9252197932851895e-05, + "loss": 0.5338, + "step": 66000 + }, + { + "epoch": 0.21530734227352827, + "grad_norm": 0.8303185105323792, + "learning_rate": 3.9235910916535554e-05, + "loss": 0.5237, + "step": 66100 + }, + { + "epoch": 0.21563307198952453, + "grad_norm": 0.8530830144882202, + "learning_rate": 3.9219623900219226e-05, + "loss": 0.5328, + "step": 66200 + }, + { + "epoch": 0.2159588017055208, + "grad_norm": 0.9482616782188416, + "learning_rate": 3.920333688390289e-05, + "loss": 0.5548, + "step": 66300 + }, + { + "epoch": 0.21628453142151705, + "grad_norm": 0.430633008480072, + "learning_rate": 3.9187049867586564e-05, + "loss": 0.551, + "step": 66400 + }, + { + "epoch": 0.2166102611375133, + "grad_norm": 0.5612674355506897, + "learning_rate": 3.917076285127022e-05, + "loss": 0.5571, + "step": 66500 + }, + { + "epoch": 0.21693599085350956, + "grad_norm": 0.7157821655273438, + "learning_rate": 3.915447583495389e-05, + "loss": 0.555, + "step": 66600 + }, + { + "epoch": 0.21726172056950582, + "grad_norm": 0.6013966202735901, + "learning_rate": 3.913818881863756e-05, + "loss": 0.585, + "step": 66700 + }, + { + "epoch": 0.2175874502855021, + "grad_norm": 0.4616648554801941, + "learning_rate": 3.912190180232123e-05, + "loss": 0.5832, + "step": 66800 + }, + { + "epoch": 0.21791318000149837, + "grad_norm": 0.6870980858802795, + "learning_rate": 3.910561478600489e-05, + "loss": 0.5944, + "step": 66900 + }, + { + "epoch": 0.21823890971749463, + "grad_norm": 0.629490315914154, + "learning_rate": 3.908932776968856e-05, + "loss": 0.5279, + "step": 67000 + }, + { + "epoch": 0.2185646394334909, + "grad_norm": 0.5478650331497192, + "learning_rate": 3.907304075337223e-05, + "loss": 0.5815, + "step": 67100 + }, + { + "epoch": 0.21889036914948715, + "grad_norm": 0.6581255793571472, + "learning_rate": 3.90567537370559e-05, + "loss": 0.5661, + "step": 67200 + }, + { + "epoch": 0.2192160988654834, + "grad_norm": 0.7738802433013916, + "learning_rate": 3.904046672073956e-05, + "loss": 0.5901, + "step": 67300 + }, + { + "epoch": 0.21954182858147966, + "grad_norm": 0.5748447179794312, + "learning_rate": 3.902417970442323e-05, + "loss": 0.5813, + "step": 67400 + }, + { + "epoch": 0.21986755829747592, + "grad_norm": 0.7152987718582153, + "learning_rate": 3.9007892688106894e-05, + "loss": 0.5359, + "step": 67500 + }, + { + "epoch": 0.22019328801347218, + "grad_norm": 0.867574155330658, + "learning_rate": 3.899160567179057e-05, + "loss": 0.5419, + "step": 67600 + }, + { + "epoch": 0.22051901772946844, + "grad_norm": 0.8477634787559509, + "learning_rate": 3.897531865547423e-05, + "loss": 0.5788, + "step": 67700 + }, + { + "epoch": 0.2208447474454647, + "grad_norm": 0.7993571758270264, + "learning_rate": 3.89590316391579e-05, + "loss": 0.528, + "step": 67800 + }, + { + "epoch": 0.22117047716146096, + "grad_norm": 0.6607359647750854, + "learning_rate": 3.8942744622841564e-05, + "loss": 0.5647, + "step": 67900 + }, + { + "epoch": 0.22149620687745722, + "grad_norm": 0.6910780072212219, + "learning_rate": 3.892645760652523e-05, + "loss": 0.5418, + "step": 68000 + }, + { + "epoch": 0.22182193659345348, + "grad_norm": 0.4793308675289154, + "learning_rate": 3.89101705902089e-05, + "loss": 0.5913, + "step": 68100 + }, + { + "epoch": 0.22214766630944974, + "grad_norm": 0.7222141027450562, + "learning_rate": 3.889388357389257e-05, + "loss": 0.6128, + "step": 68200 + }, + { + "epoch": 0.222473396025446, + "grad_norm": 0.43712884187698364, + "learning_rate": 3.8877596557576233e-05, + "loss": 0.583, + "step": 68300 + }, + { + "epoch": 0.22279912574144226, + "grad_norm": 0.5187420845031738, + "learning_rate": 3.88613095412599e-05, + "loss": 0.5758, + "step": 68400 + }, + { + "epoch": 0.22312485545743851, + "grad_norm": 0.5550572872161865, + "learning_rate": 3.884502252494357e-05, + "loss": 0.5269, + "step": 68500 + }, + { + "epoch": 0.22345058517343477, + "grad_norm": 0.7551735639572144, + "learning_rate": 3.882873550862724e-05, + "loss": 0.6005, + "step": 68600 + }, + { + "epoch": 0.22377631488943106, + "grad_norm": 0.7213869690895081, + "learning_rate": 3.8812448492310896e-05, + "loss": 0.5174, + "step": 68700 + }, + { + "epoch": 0.22410204460542732, + "grad_norm": 0.6445099115371704, + "learning_rate": 3.879616147599457e-05, + "loss": 0.5501, + "step": 68800 + }, + { + "epoch": 0.22442777432142358, + "grad_norm": 0.7937589883804321, + "learning_rate": 3.8779874459678235e-05, + "loss": 0.5598, + "step": 68900 + }, + { + "epoch": 0.22475350403741984, + "grad_norm": 0.5327324271202087, + "learning_rate": 3.876358744336191e-05, + "loss": 0.531, + "step": 69000 + }, + { + "epoch": 0.2250792337534161, + "grad_norm": 0.7627710103988647, + "learning_rate": 3.8747300427045566e-05, + "loss": 0.578, + "step": 69100 + }, + { + "epoch": 0.22540496346941236, + "grad_norm": 0.5054932832717896, + "learning_rate": 3.873101341072924e-05, + "loss": 0.5905, + "step": 69200 + }, + { + "epoch": 0.22573069318540862, + "grad_norm": 0.6468352675437927, + "learning_rate": 3.8714726394412904e-05, + "loss": 0.5931, + "step": 69300 + }, + { + "epoch": 0.22605642290140487, + "grad_norm": 0.37974539399147034, + "learning_rate": 3.869843937809657e-05, + "loss": 0.5777, + "step": 69400 + }, + { + "epoch": 0.22638215261740113, + "grad_norm": 0.8011950850486755, + "learning_rate": 3.8682152361780236e-05, + "loss": 0.5187, + "step": 69500 + }, + { + "epoch": 0.2267078823333974, + "grad_norm": 0.40006023645401, + "learning_rate": 3.86658653454639e-05, + "loss": 0.5292, + "step": 69600 + }, + { + "epoch": 0.22703361204939365, + "grad_norm": 0.42605412006378174, + "learning_rate": 3.8649578329147574e-05, + "loss": 0.5704, + "step": 69700 + }, + { + "epoch": 0.2273593417653899, + "grad_norm": 0.820277988910675, + "learning_rate": 3.863329131283124e-05, + "loss": 0.5641, + "step": 69800 + }, + { + "epoch": 0.22768507148138617, + "grad_norm": 0.6671209931373596, + "learning_rate": 3.8617004296514905e-05, + "loss": 0.5942, + "step": 69900 + }, + { + "epoch": 0.22801080119738243, + "grad_norm": 0.7214267253875732, + "learning_rate": 3.860071728019857e-05, + "loss": 0.6078, + "step": 70000 + }, + { + "epoch": 0.2283365309133787, + "grad_norm": 0.5705024003982544, + "learning_rate": 3.858443026388224e-05, + "loss": 0.5111, + "step": 70100 + }, + { + "epoch": 0.22866226062937495, + "grad_norm": 0.7017680406570435, + "learning_rate": 3.856814324756591e-05, + "loss": 0.5386, + "step": 70200 + }, + { + "epoch": 0.2289879903453712, + "grad_norm": 0.36700716614723206, + "learning_rate": 3.8551856231249575e-05, + "loss": 0.5947, + "step": 70300 + }, + { + "epoch": 0.22931372006136747, + "grad_norm": 1.018539309501648, + "learning_rate": 3.853556921493324e-05, + "loss": 0.5739, + "step": 70400 + }, + { + "epoch": 0.22963944977736375, + "grad_norm": 0.8273037672042847, + "learning_rate": 3.851928219861691e-05, + "loss": 0.5247, + "step": 70500 + }, + { + "epoch": 0.22996517949336, + "grad_norm": 1.0655425786972046, + "learning_rate": 3.850299518230058e-05, + "loss": 0.5397, + "step": 70600 + }, + { + "epoch": 0.23029090920935627, + "grad_norm": 0.38495421409606934, + "learning_rate": 3.8486708165984245e-05, + "loss": 0.5844, + "step": 70700 + }, + { + "epoch": 0.23061663892535253, + "grad_norm": 0.9659711122512817, + "learning_rate": 3.847042114966791e-05, + "loss": 0.5873, + "step": 70800 + }, + { + "epoch": 0.2309423686413488, + "grad_norm": 0.7230137586593628, + "learning_rate": 3.8454134133351576e-05, + "loss": 0.593, + "step": 70900 + }, + { + "epoch": 0.23126809835734505, + "grad_norm": 0.9325969219207764, + "learning_rate": 3.843784711703524e-05, + "loss": 0.5965, + "step": 71000 + }, + { + "epoch": 0.2315938280733413, + "grad_norm": 0.6791651248931885, + "learning_rate": 3.8421560100718915e-05, + "loss": 0.6223, + "step": 71100 + }, + { + "epoch": 0.23191955778933757, + "grad_norm": 0.8241651058197021, + "learning_rate": 3.8405273084402573e-05, + "loss": 0.5257, + "step": 71200 + }, + { + "epoch": 0.23224528750533383, + "grad_norm": 0.8813059329986572, + "learning_rate": 3.8388986068086246e-05, + "loss": 0.5965, + "step": 71300 + }, + { + "epoch": 0.23257101722133008, + "grad_norm": 0.7717010378837585, + "learning_rate": 3.837269905176991e-05, + "loss": 0.5502, + "step": 71400 + }, + { + "epoch": 0.23289674693732634, + "grad_norm": 0.39482927322387695, + "learning_rate": 3.8356412035453584e-05, + "loss": 0.5618, + "step": 71500 + }, + { + "epoch": 0.2332224766533226, + "grad_norm": 0.8985998630523682, + "learning_rate": 3.834012501913724e-05, + "loss": 0.5247, + "step": 71600 + }, + { + "epoch": 0.23354820636931886, + "grad_norm": 0.4451032876968384, + "learning_rate": 3.832383800282091e-05, + "loss": 0.565, + "step": 71700 + }, + { + "epoch": 0.23387393608531512, + "grad_norm": 0.46427956223487854, + "learning_rate": 3.830755098650458e-05, + "loss": 0.5511, + "step": 71800 + }, + { + "epoch": 0.23419966580131138, + "grad_norm": 1.1371232271194458, + "learning_rate": 3.829126397018825e-05, + "loss": 0.5867, + "step": 71900 + }, + { + "epoch": 0.23452539551730764, + "grad_norm": 0.5856015086174011, + "learning_rate": 3.827497695387191e-05, + "loss": 0.5425, + "step": 72000 + }, + { + "epoch": 0.2348511252333039, + "grad_norm": 0.5723338723182678, + "learning_rate": 3.825868993755558e-05, + "loss": 0.5828, + "step": 72100 + }, + { + "epoch": 0.23517685494930016, + "grad_norm": 0.6274189352989197, + "learning_rate": 3.824240292123925e-05, + "loss": 0.4961, + "step": 72200 + }, + { + "epoch": 0.23550258466529644, + "grad_norm": 0.5841485857963562, + "learning_rate": 3.822611590492292e-05, + "loss": 0.5639, + "step": 72300 + }, + { + "epoch": 0.2358283143812927, + "grad_norm": 0.9061130285263062, + "learning_rate": 3.820982888860658e-05, + "loss": 0.5126, + "step": 72400 + }, + { + "epoch": 0.23615404409728896, + "grad_norm": 0.9499684572219849, + "learning_rate": 3.819354187229025e-05, + "loss": 0.5684, + "step": 72500 + }, + { + "epoch": 0.23647977381328522, + "grad_norm": 0.7132393717765808, + "learning_rate": 3.8177254855973914e-05, + "loss": 0.5287, + "step": 72600 + }, + { + "epoch": 0.23680550352928148, + "grad_norm": 0.8645475506782532, + "learning_rate": 3.8160967839657587e-05, + "loss": 0.564, + "step": 72700 + }, + { + "epoch": 0.23713123324527774, + "grad_norm": 0.8675580024719238, + "learning_rate": 3.814468082334125e-05, + "loss": 0.5435, + "step": 72800 + }, + { + "epoch": 0.237456962961274, + "grad_norm": 0.7194923162460327, + "learning_rate": 3.812839380702492e-05, + "loss": 0.5843, + "step": 72900 + }, + { + "epoch": 0.23778269267727026, + "grad_norm": 0.782618522644043, + "learning_rate": 3.8112106790708584e-05, + "loss": 0.5609, + "step": 73000 + }, + { + "epoch": 0.23810842239326652, + "grad_norm": 0.6671516299247742, + "learning_rate": 3.809581977439225e-05, + "loss": 0.4925, + "step": 73100 + }, + { + "epoch": 0.23843415210926278, + "grad_norm": 0.8488081097602844, + "learning_rate": 3.807953275807592e-05, + "loss": 0.5536, + "step": 73200 + }, + { + "epoch": 0.23875988182525903, + "grad_norm": 0.7259848117828369, + "learning_rate": 3.806324574175959e-05, + "loss": 0.5372, + "step": 73300 + }, + { + "epoch": 0.2390856115412553, + "grad_norm": 0.5849174857139587, + "learning_rate": 3.8046958725443253e-05, + "loss": 0.5602, + "step": 73400 + }, + { + "epoch": 0.23941134125725155, + "grad_norm": 0.36567142605781555, + "learning_rate": 3.803067170912692e-05, + "loss": 0.5976, + "step": 73500 + }, + { + "epoch": 0.2397370709732478, + "grad_norm": 0.8540560007095337, + "learning_rate": 3.801438469281059e-05, + "loss": 0.576, + "step": 73600 + }, + { + "epoch": 0.24006280068924407, + "grad_norm": 0.7733421921730042, + "learning_rate": 3.799809767649426e-05, + "loss": 0.5446, + "step": 73700 + }, + { + "epoch": 0.24038853040524033, + "grad_norm": 0.6541240811347961, + "learning_rate": 3.7981810660177916e-05, + "loss": 0.5302, + "step": 73800 + }, + { + "epoch": 0.2407142601212366, + "grad_norm": 0.6777580976486206, + "learning_rate": 3.796552364386159e-05, + "loss": 0.5742, + "step": 73900 + }, + { + "epoch": 0.24103998983723285, + "grad_norm": 1.1045103073120117, + "learning_rate": 3.7949236627545255e-05, + "loss": 0.5391, + "step": 74000 + }, + { + "epoch": 0.2413657195532291, + "grad_norm": 1.223781943321228, + "learning_rate": 3.793294961122893e-05, + "loss": 0.5754, + "step": 74100 + }, + { + "epoch": 0.2416914492692254, + "grad_norm": 0.7645404934883118, + "learning_rate": 3.7916662594912586e-05, + "loss": 0.5424, + "step": 74200 + }, + { + "epoch": 0.24201717898522165, + "grad_norm": 0.8637171983718872, + "learning_rate": 3.790037557859626e-05, + "loss": 0.5577, + "step": 74300 + }, + { + "epoch": 0.2423429087012179, + "grad_norm": 0.633642315864563, + "learning_rate": 3.7884088562279924e-05, + "loss": 0.5513, + "step": 74400 + }, + { + "epoch": 0.24266863841721417, + "grad_norm": 0.48609936237335205, + "learning_rate": 3.786780154596359e-05, + "loss": 0.6002, + "step": 74500 + }, + { + "epoch": 0.24299436813321043, + "grad_norm": 0.3668748140335083, + "learning_rate": 3.7851514529647256e-05, + "loss": 0.5947, + "step": 74600 + }, + { + "epoch": 0.2433200978492067, + "grad_norm": 0.735894501209259, + "learning_rate": 3.783522751333092e-05, + "loss": 0.5862, + "step": 74700 + }, + { + "epoch": 0.24364582756520295, + "grad_norm": 0.8264063000679016, + "learning_rate": 3.7818940497014594e-05, + "loss": 0.5749, + "step": 74800 + }, + { + "epoch": 0.2439715572811992, + "grad_norm": 0.482183575630188, + "learning_rate": 3.780265348069826e-05, + "loss": 0.5553, + "step": 74900 + }, + { + "epoch": 0.24429728699719547, + "grad_norm": 0.6649850606918335, + "learning_rate": 3.7786366464381925e-05, + "loss": 0.6042, + "step": 75000 + }, + { + "epoch": 0.24462301671319173, + "grad_norm": 0.5215208530426025, + "learning_rate": 3.777007944806559e-05, + "loss": 0.5134, + "step": 75100 + }, + { + "epoch": 0.24494874642918799, + "grad_norm": 0.6028915643692017, + "learning_rate": 3.775379243174926e-05, + "loss": 0.5, + "step": 75200 + }, + { + "epoch": 0.24527447614518424, + "grad_norm": 0.5038050413131714, + "learning_rate": 3.773750541543293e-05, + "loss": 0.6081, + "step": 75300 + }, + { + "epoch": 0.2456002058611805, + "grad_norm": 0.568586528301239, + "learning_rate": 3.7721218399116595e-05, + "loss": 0.5484, + "step": 75400 + }, + { + "epoch": 0.24592593557717676, + "grad_norm": 0.4442402720451355, + "learning_rate": 3.770493138280026e-05, + "loss": 0.5983, + "step": 75500 + }, + { + "epoch": 0.24625166529317302, + "grad_norm": 0.775284469127655, + "learning_rate": 3.7688644366483927e-05, + "loss": 0.549, + "step": 75600 + }, + { + "epoch": 0.24657739500916928, + "grad_norm": 0.7132833003997803, + "learning_rate": 3.76723573501676e-05, + "loss": 0.5317, + "step": 75700 + }, + { + "epoch": 0.24690312472516554, + "grad_norm": 0.7935360074043274, + "learning_rate": 3.7656070333851265e-05, + "loss": 0.5389, + "step": 75800 + }, + { + "epoch": 0.2472288544411618, + "grad_norm": 0.5749487280845642, + "learning_rate": 3.7639783317534924e-05, + "loss": 0.5918, + "step": 75900 + }, + { + "epoch": 0.2475545841571581, + "grad_norm": 0.6536827087402344, + "learning_rate": 3.7623496301218596e-05, + "loss": 0.5245, + "step": 76000 + }, + { + "epoch": 0.24788031387315435, + "grad_norm": 0.7014347314834595, + "learning_rate": 3.760720928490226e-05, + "loss": 0.5661, + "step": 76100 + }, + { + "epoch": 0.2482060435891506, + "grad_norm": 0.8436623811721802, + "learning_rate": 3.7590922268585934e-05, + "loss": 0.5714, + "step": 76200 + }, + { + "epoch": 0.24853177330514686, + "grad_norm": 0.6371897459030151, + "learning_rate": 3.7574635252269593e-05, + "loss": 0.5767, + "step": 76300 + }, + { + "epoch": 0.24885750302114312, + "grad_norm": 0.7796430587768555, + "learning_rate": 3.7558348235953266e-05, + "loss": 0.5308, + "step": 76400 + }, + { + "epoch": 0.24918323273713938, + "grad_norm": 0.6565324664115906, + "learning_rate": 3.754206121963693e-05, + "loss": 0.5377, + "step": 76500 + }, + { + "epoch": 0.24950896245313564, + "grad_norm": 0.6670543551445007, + "learning_rate": 3.75257742033206e-05, + "loss": 0.6095, + "step": 76600 + }, + { + "epoch": 0.2498346921691319, + "grad_norm": 0.8650514483451843, + "learning_rate": 3.750948718700426e-05, + "loss": 0.5586, + "step": 76700 + }, + { + "epoch": 0.25016042188512816, + "grad_norm": 0.42015933990478516, + "learning_rate": 3.749320017068793e-05, + "loss": 0.5274, + "step": 76800 + }, + { + "epoch": 0.2504861516011244, + "grad_norm": 0.5667533278465271, + "learning_rate": 3.74769131543716e-05, + "loss": 0.5628, + "step": 76900 + }, + { + "epoch": 0.2508118813171207, + "grad_norm": 0.6887187361717224, + "learning_rate": 3.746062613805527e-05, + "loss": 0.5663, + "step": 77000 + }, + { + "epoch": 0.25113761103311694, + "grad_norm": 0.4367005527019501, + "learning_rate": 3.744433912173893e-05, + "loss": 0.5368, + "step": 77100 + }, + { + "epoch": 0.2514633407491132, + "grad_norm": 0.3392166197299957, + "learning_rate": 3.74280521054226e-05, + "loss": 0.5353, + "step": 77200 + }, + { + "epoch": 0.25178907046510945, + "grad_norm": 0.5449352860450745, + "learning_rate": 3.7411765089106264e-05, + "loss": 0.5611, + "step": 77300 + }, + { + "epoch": 0.2521148001811057, + "grad_norm": 0.6924061179161072, + "learning_rate": 3.739547807278994e-05, + "loss": 0.5918, + "step": 77400 + }, + { + "epoch": 0.252440529897102, + "grad_norm": 0.8356592655181885, + "learning_rate": 3.73791910564736e-05, + "loss": 0.5713, + "step": 77500 + }, + { + "epoch": 0.25276625961309823, + "grad_norm": 0.9207838177680969, + "learning_rate": 3.736290404015727e-05, + "loss": 0.5078, + "step": 77600 + }, + { + "epoch": 0.2530919893290945, + "grad_norm": 0.6466575860977173, + "learning_rate": 3.7346617023840934e-05, + "loss": 0.5274, + "step": 77700 + }, + { + "epoch": 0.25341771904509075, + "grad_norm": 0.5351524353027344, + "learning_rate": 3.7330330007524606e-05, + "loss": 0.5411, + "step": 77800 + }, + { + "epoch": 0.253743448761087, + "grad_norm": 0.7786761522293091, + "learning_rate": 3.731404299120827e-05, + "loss": 0.4859, + "step": 77900 + }, + { + "epoch": 0.25406917847708327, + "grad_norm": 0.6750699281692505, + "learning_rate": 3.729775597489194e-05, + "loss": 0.5689, + "step": 78000 + }, + { + "epoch": 0.2543949081930795, + "grad_norm": 0.7088775038719177, + "learning_rate": 3.7281468958575604e-05, + "loss": 0.5325, + "step": 78100 + }, + { + "epoch": 0.2547206379090758, + "grad_norm": 0.8920672535896301, + "learning_rate": 3.726518194225927e-05, + "loss": 0.5284, + "step": 78200 + }, + { + "epoch": 0.25504636762507205, + "grad_norm": 0.6582838296890259, + "learning_rate": 3.724889492594294e-05, + "loss": 0.511, + "step": 78300 + }, + { + "epoch": 0.2553720973410683, + "grad_norm": 0.6662094593048096, + "learning_rate": 3.723260790962661e-05, + "loss": 0.5618, + "step": 78400 + }, + { + "epoch": 0.25569782705706456, + "grad_norm": 0.4346591830253601, + "learning_rate": 3.721632089331027e-05, + "loss": 0.54, + "step": 78500 + }, + { + "epoch": 0.2560235567730608, + "grad_norm": 0.7967207431793213, + "learning_rate": 3.720003387699394e-05, + "loss": 0.5884, + "step": 78600 + }, + { + "epoch": 0.25634928648905714, + "grad_norm": 0.4879821538925171, + "learning_rate": 3.7183746860677605e-05, + "loss": 0.5557, + "step": 78700 + }, + { + "epoch": 0.2566750162050534, + "grad_norm": 0.5626016855239868, + "learning_rate": 3.716745984436128e-05, + "loss": 0.498, + "step": 78800 + }, + { + "epoch": 0.25700074592104966, + "grad_norm": 0.5859974026679993, + "learning_rate": 3.7151172828044936e-05, + "loss": 0.5218, + "step": 78900 + }, + { + "epoch": 0.2573264756370459, + "grad_norm": 0.7462596893310547, + "learning_rate": 3.713488581172861e-05, + "loss": 0.5093, + "step": 79000 + }, + { + "epoch": 0.2576522053530422, + "grad_norm": 0.9555974006652832, + "learning_rate": 3.7118598795412274e-05, + "loss": 0.5348, + "step": 79100 + }, + { + "epoch": 0.25797793506903843, + "grad_norm": 0.7466504573822021, + "learning_rate": 3.710231177909595e-05, + "loss": 0.5383, + "step": 79200 + }, + { + "epoch": 0.2583036647850347, + "grad_norm": 0.8801865577697754, + "learning_rate": 3.7086024762779606e-05, + "loss": 0.4767, + "step": 79300 + }, + { + "epoch": 0.25862939450103095, + "grad_norm": 0.48174184560775757, + "learning_rate": 3.706973774646328e-05, + "loss": 0.5528, + "step": 79400 + }, + { + "epoch": 0.2589551242170272, + "grad_norm": 0.7198649048805237, + "learning_rate": 3.7053450730146944e-05, + "loss": 0.5953, + "step": 79500 + }, + { + "epoch": 0.25928085393302347, + "grad_norm": 0.4515075385570526, + "learning_rate": 3.703716371383061e-05, + "loss": 0.5505, + "step": 79600 + }, + { + "epoch": 0.25960658364901973, + "grad_norm": 0.706524670124054, + "learning_rate": 3.7020876697514276e-05, + "loss": 0.6011, + "step": 79700 + }, + { + "epoch": 0.259932313365016, + "grad_norm": 0.6895307302474976, + "learning_rate": 3.700458968119794e-05, + "loss": 0.5188, + "step": 79800 + }, + { + "epoch": 0.26025804308101225, + "grad_norm": 0.7927341461181641, + "learning_rate": 3.6988302664881614e-05, + "loss": 0.5739, + "step": 79900 + }, + { + "epoch": 0.2605837727970085, + "grad_norm": 0.8496550917625427, + "learning_rate": 3.697201564856528e-05, + "loss": 0.5152, + "step": 80000 + }, + { + "epoch": 0.26090950251300477, + "grad_norm": 0.47138693928718567, + "learning_rate": 3.6955728632248945e-05, + "loss": 0.5475, + "step": 80100 + }, + { + "epoch": 0.261235232229001, + "grad_norm": 0.8020485639572144, + "learning_rate": 3.693944161593261e-05, + "loss": 0.5489, + "step": 80200 + }, + { + "epoch": 0.2615609619449973, + "grad_norm": 0.6385429501533508, + "learning_rate": 3.692315459961628e-05, + "loss": 0.5457, + "step": 80300 + }, + { + "epoch": 0.26188669166099354, + "grad_norm": 0.6027743220329285, + "learning_rate": 3.690686758329995e-05, + "loss": 0.5412, + "step": 80400 + }, + { + "epoch": 0.2622124213769898, + "grad_norm": 0.6040454506874084, + "learning_rate": 3.6890580566983615e-05, + "loss": 0.5348, + "step": 80500 + }, + { + "epoch": 0.26253815109298606, + "grad_norm": 0.6697177290916443, + "learning_rate": 3.687429355066728e-05, + "loss": 0.509, + "step": 80600 + }, + { + "epoch": 0.2628638808089823, + "grad_norm": 0.8428653478622437, + "learning_rate": 3.6858006534350946e-05, + "loss": 0.5505, + "step": 80700 + }, + { + "epoch": 0.2631896105249786, + "grad_norm": 0.9421257972717285, + "learning_rate": 3.684171951803462e-05, + "loss": 0.5587, + "step": 80800 + }, + { + "epoch": 0.26351534024097484, + "grad_norm": 0.7752894759178162, + "learning_rate": 3.6825432501718285e-05, + "loss": 0.5308, + "step": 80900 + }, + { + "epoch": 0.2638410699569711, + "grad_norm": 0.9658520817756653, + "learning_rate": 3.6809145485401944e-05, + "loss": 0.5394, + "step": 81000 + }, + { + "epoch": 0.26416679967296736, + "grad_norm": 0.3100132644176483, + "learning_rate": 3.6792858469085616e-05, + "loss": 0.5616, + "step": 81100 + }, + { + "epoch": 0.2644925293889636, + "grad_norm": 1.0838834047317505, + "learning_rate": 3.677657145276928e-05, + "loss": 0.5374, + "step": 81200 + }, + { + "epoch": 0.2648182591049599, + "grad_norm": 0.9311345219612122, + "learning_rate": 3.6760284436452954e-05, + "loss": 0.5353, + "step": 81300 + }, + { + "epoch": 0.26514398882095613, + "grad_norm": 0.32365360856056213, + "learning_rate": 3.674399742013661e-05, + "loss": 0.5493, + "step": 81400 + }, + { + "epoch": 0.2654697185369524, + "grad_norm": 0.6390203833580017, + "learning_rate": 3.6727710403820286e-05, + "loss": 0.5205, + "step": 81500 + }, + { + "epoch": 0.26579544825294865, + "grad_norm": 0.6106113195419312, + "learning_rate": 3.671142338750395e-05, + "loss": 0.5161, + "step": 81600 + }, + { + "epoch": 0.2661211779689449, + "grad_norm": 0.4415883421897888, + "learning_rate": 3.669513637118762e-05, + "loss": 0.5235, + "step": 81700 + }, + { + "epoch": 0.26644690768494117, + "grad_norm": 0.8828484416007996, + "learning_rate": 3.667884935487128e-05, + "loss": 0.5214, + "step": 81800 + }, + { + "epoch": 0.26677263740093743, + "grad_norm": 0.8186760544776917, + "learning_rate": 3.666256233855495e-05, + "loss": 0.5435, + "step": 81900 + }, + { + "epoch": 0.2670983671169337, + "grad_norm": 0.43989554047584534, + "learning_rate": 3.664627532223862e-05, + "loss": 0.5653, + "step": 82000 + }, + { + "epoch": 0.26742409683292995, + "grad_norm": 1.083422303199768, + "learning_rate": 3.662998830592229e-05, + "loss": 0.5338, + "step": 82100 + }, + { + "epoch": 0.2677498265489262, + "grad_norm": 0.40522611141204834, + "learning_rate": 3.661370128960596e-05, + "loss": 0.4892, + "step": 82200 + }, + { + "epoch": 0.26807555626492247, + "grad_norm": 0.7010061740875244, + "learning_rate": 3.659741427328962e-05, + "loss": 0.5372, + "step": 82300 + }, + { + "epoch": 0.2684012859809188, + "grad_norm": 0.9971382021903992, + "learning_rate": 3.6581127256973284e-05, + "loss": 0.501, + "step": 82400 + }, + { + "epoch": 0.26872701569691504, + "grad_norm": 0.5222276449203491, + "learning_rate": 3.656484024065696e-05, + "loss": 0.5194, + "step": 82500 + }, + { + "epoch": 0.2690527454129113, + "grad_norm": 0.724824845790863, + "learning_rate": 3.654855322434062e-05, + "loss": 0.499, + "step": 82600 + }, + { + "epoch": 0.26937847512890756, + "grad_norm": 0.48272421956062317, + "learning_rate": 3.653226620802429e-05, + "loss": 0.486, + "step": 82700 + }, + { + "epoch": 0.2697042048449038, + "grad_norm": 0.8187432885169983, + "learning_rate": 3.6515979191707954e-05, + "loss": 0.5634, + "step": 82800 + }, + { + "epoch": 0.2700299345609001, + "grad_norm": 0.46917855739593506, + "learning_rate": 3.6499692175391626e-05, + "loss": 0.5468, + "step": 82900 + }, + { + "epoch": 0.27035566427689633, + "grad_norm": 0.5338607430458069, + "learning_rate": 3.648340515907529e-05, + "loss": 0.481, + "step": 83000 + }, + { + "epoch": 0.2706813939928926, + "grad_norm": 0.5420836806297302, + "learning_rate": 3.646711814275896e-05, + "loss": 0.5391, + "step": 83100 + }, + { + "epoch": 0.27100712370888885, + "grad_norm": 0.5124307870864868, + "learning_rate": 3.6450831126442624e-05, + "loss": 0.5446, + "step": 83200 + }, + { + "epoch": 0.2713328534248851, + "grad_norm": 0.5944223403930664, + "learning_rate": 3.643454411012629e-05, + "loss": 0.5759, + "step": 83300 + }, + { + "epoch": 0.27165858314088137, + "grad_norm": 1.1431384086608887, + "learning_rate": 3.641825709380996e-05, + "loss": 0.5416, + "step": 83400 + }, + { + "epoch": 0.27198431285687763, + "grad_norm": 0.9613766670227051, + "learning_rate": 3.640197007749363e-05, + "loss": 0.521, + "step": 83500 + }, + { + "epoch": 0.2723100425728739, + "grad_norm": 0.7477935552597046, + "learning_rate": 3.638568306117729e-05, + "loss": 0.558, + "step": 83600 + }, + { + "epoch": 0.27263577228887015, + "grad_norm": 0.47112804651260376, + "learning_rate": 3.636939604486096e-05, + "loss": 0.5083, + "step": 83700 + }, + { + "epoch": 0.2729615020048664, + "grad_norm": 0.5914379954338074, + "learning_rate": 3.6353109028544625e-05, + "loss": 0.5776, + "step": 83800 + }, + { + "epoch": 0.27328723172086267, + "grad_norm": 0.5500662326812744, + "learning_rate": 3.63368220122283e-05, + "loss": 0.5194, + "step": 83900 + }, + { + "epoch": 0.2736129614368589, + "grad_norm": 0.41591793298721313, + "learning_rate": 3.6320534995911956e-05, + "loss": 0.5266, + "step": 84000 + }, + { + "epoch": 0.2739386911528552, + "grad_norm": 1.080356478691101, + "learning_rate": 3.630424797959563e-05, + "loss": 0.4964, + "step": 84100 + }, + { + "epoch": 0.27426442086885144, + "grad_norm": 0.40892690420150757, + "learning_rate": 3.6287960963279294e-05, + "loss": 0.5163, + "step": 84200 + }, + { + "epoch": 0.2745901505848477, + "grad_norm": 0.7729841470718384, + "learning_rate": 3.627167394696297e-05, + "loss": 0.5336, + "step": 84300 + }, + { + "epoch": 0.27491588030084396, + "grad_norm": 0.6264617443084717, + "learning_rate": 3.6255386930646626e-05, + "loss": 0.5762, + "step": 84400 + }, + { + "epoch": 0.2752416100168402, + "grad_norm": 0.8050372004508972, + "learning_rate": 3.623909991433029e-05, + "loss": 0.4509, + "step": 84500 + }, + { + "epoch": 0.2755673397328365, + "grad_norm": 0.621804416179657, + "learning_rate": 3.6222812898013964e-05, + "loss": 0.5174, + "step": 84600 + }, + { + "epoch": 0.27589306944883274, + "grad_norm": 0.5717790126800537, + "learning_rate": 3.620652588169763e-05, + "loss": 0.5431, + "step": 84700 + }, + { + "epoch": 0.276218799164829, + "grad_norm": 0.394345223903656, + "learning_rate": 3.6190238865381295e-05, + "loss": 0.5294, + "step": 84800 + }, + { + "epoch": 0.27654452888082526, + "grad_norm": 0.8917814493179321, + "learning_rate": 3.617395184906496e-05, + "loss": 0.4955, + "step": 84900 + }, + { + "epoch": 0.2768702585968215, + "grad_norm": 0.721481442451477, + "learning_rate": 3.6157664832748634e-05, + "loss": 0.5433, + "step": 85000 + }, + { + "epoch": 0.2771959883128178, + "grad_norm": 0.6476948857307434, + "learning_rate": 3.61413778164323e-05, + "loss": 0.563, + "step": 85100 + }, + { + "epoch": 0.27752171802881404, + "grad_norm": 0.38036003708839417, + "learning_rate": 3.6125090800115965e-05, + "loss": 0.516, + "step": 85200 + }, + { + "epoch": 0.2778474477448103, + "grad_norm": 0.6185033917427063, + "learning_rate": 3.610880378379963e-05, + "loss": 0.5178, + "step": 85300 + }, + { + "epoch": 0.27817317746080655, + "grad_norm": 0.8313725590705872, + "learning_rate": 3.60925167674833e-05, + "loss": 0.5296, + "step": 85400 + }, + { + "epoch": 0.2784989071768028, + "grad_norm": 0.5369439721107483, + "learning_rate": 3.607622975116697e-05, + "loss": 0.5803, + "step": 85500 + }, + { + "epoch": 0.27882463689279907, + "grad_norm": 0.7777513265609741, + "learning_rate": 3.6059942734850635e-05, + "loss": 0.4875, + "step": 85600 + }, + { + "epoch": 0.27915036660879533, + "grad_norm": 0.5527925491333008, + "learning_rate": 3.60436557185343e-05, + "loss": 0.5141, + "step": 85700 + }, + { + "epoch": 0.2794760963247916, + "grad_norm": 0.8335199356079102, + "learning_rate": 3.6027368702217966e-05, + "loss": 0.4851, + "step": 85800 + }, + { + "epoch": 0.27980182604078785, + "grad_norm": 0.7015230059623718, + "learning_rate": 3.601108168590163e-05, + "loss": 0.5395, + "step": 85900 + }, + { + "epoch": 0.2801275557567841, + "grad_norm": 0.7245033979415894, + "learning_rate": 3.5994794669585305e-05, + "loss": 0.5204, + "step": 86000 + }, + { + "epoch": 0.2804532854727804, + "grad_norm": 0.8472508192062378, + "learning_rate": 3.5978507653268964e-05, + "loss": 0.5087, + "step": 86100 + }, + { + "epoch": 0.2807790151887767, + "grad_norm": 0.7517431974411011, + "learning_rate": 3.5962220636952636e-05, + "loss": 0.5176, + "step": 86200 + }, + { + "epoch": 0.28110474490477294, + "grad_norm": 0.5864343643188477, + "learning_rate": 3.59459336206363e-05, + "loss": 0.5828, + "step": 86300 + }, + { + "epoch": 0.2814304746207692, + "grad_norm": 0.8981267809867859, + "learning_rate": 3.5929646604319974e-05, + "loss": 0.5309, + "step": 86400 + }, + { + "epoch": 0.28175620433676546, + "grad_norm": 0.8167164325714111, + "learning_rate": 3.591335958800364e-05, + "loss": 0.5513, + "step": 86500 + }, + { + "epoch": 0.2820819340527617, + "grad_norm": 0.7764830589294434, + "learning_rate": 3.58970725716873e-05, + "loss": 0.5249, + "step": 86600 + }, + { + "epoch": 0.282407663768758, + "grad_norm": 0.7545201182365417, + "learning_rate": 3.588078555537097e-05, + "loss": 0.5293, + "step": 86700 + }, + { + "epoch": 0.28273339348475424, + "grad_norm": 0.6954336166381836, + "learning_rate": 3.586449853905464e-05, + "loss": 0.5532, + "step": 86800 + }, + { + "epoch": 0.2830591232007505, + "grad_norm": 0.6742025017738342, + "learning_rate": 3.584821152273831e-05, + "loss": 0.5356, + "step": 86900 + }, + { + "epoch": 0.28338485291674675, + "grad_norm": 0.731679379940033, + "learning_rate": 3.583192450642197e-05, + "loss": 0.5128, + "step": 87000 + }, + { + "epoch": 0.283710582632743, + "grad_norm": 0.7906468510627747, + "learning_rate": 3.581563749010564e-05, + "loss": 0.5359, + "step": 87100 + }, + { + "epoch": 0.2840363123487393, + "grad_norm": 0.36753523349761963, + "learning_rate": 3.579935047378931e-05, + "loss": 0.5366, + "step": 87200 + }, + { + "epoch": 0.28436204206473553, + "grad_norm": 0.6043976545333862, + "learning_rate": 3.578306345747298e-05, + "loss": 0.4995, + "step": 87300 + }, + { + "epoch": 0.2846877717807318, + "grad_norm": 0.7573038339614868, + "learning_rate": 3.576677644115664e-05, + "loss": 0.5093, + "step": 87400 + }, + { + "epoch": 0.28501350149672805, + "grad_norm": 0.25290992856025696, + "learning_rate": 3.5750489424840304e-05, + "loss": 0.4948, + "step": 87500 + }, + { + "epoch": 0.2853392312127243, + "grad_norm": 0.6551434397697449, + "learning_rate": 3.5734202408523977e-05, + "loss": 0.5116, + "step": 87600 + }, + { + "epoch": 0.28566496092872057, + "grad_norm": 0.6715214252471924, + "learning_rate": 3.571791539220764e-05, + "loss": 0.6104, + "step": 87700 + }, + { + "epoch": 0.2859906906447168, + "grad_norm": 0.7275449633598328, + "learning_rate": 3.570162837589131e-05, + "loss": 0.506, + "step": 87800 + }, + { + "epoch": 0.2863164203607131, + "grad_norm": 0.2885235846042633, + "learning_rate": 3.5685341359574974e-05, + "loss": 0.4684, + "step": 87900 + }, + { + "epoch": 0.28664215007670935, + "grad_norm": 0.9342713356018066, + "learning_rate": 3.5669054343258646e-05, + "loss": 0.5293, + "step": 88000 + }, + { + "epoch": 0.2869678797927056, + "grad_norm": 1.0423755645751953, + "learning_rate": 3.565276732694231e-05, + "loss": 0.5466, + "step": 88100 + }, + { + "epoch": 0.28729360950870186, + "grad_norm": 1.0259456634521484, + "learning_rate": 3.563648031062598e-05, + "loss": 0.4885, + "step": 88200 + }, + { + "epoch": 0.2876193392246981, + "grad_norm": 0.8733958601951599, + "learning_rate": 3.5620193294309643e-05, + "loss": 0.5353, + "step": 88300 + }, + { + "epoch": 0.2879450689406944, + "grad_norm": 0.33869871497154236, + "learning_rate": 3.560390627799331e-05, + "loss": 0.5465, + "step": 88400 + }, + { + "epoch": 0.28827079865669064, + "grad_norm": 0.5838894844055176, + "learning_rate": 3.558761926167698e-05, + "loss": 0.555, + "step": 88500 + }, + { + "epoch": 0.2885965283726869, + "grad_norm": 0.8616543412208557, + "learning_rate": 3.557133224536065e-05, + "loss": 0.5173, + "step": 88600 + }, + { + "epoch": 0.28892225808868316, + "grad_norm": 0.8486323356628418, + "learning_rate": 3.555504522904431e-05, + "loss": 0.5258, + "step": 88700 + }, + { + "epoch": 0.2892479878046794, + "grad_norm": 0.6569567918777466, + "learning_rate": 3.553875821272798e-05, + "loss": 0.5097, + "step": 88800 + }, + { + "epoch": 0.2895737175206757, + "grad_norm": 0.6821163296699524, + "learning_rate": 3.5522471196411645e-05, + "loss": 0.5428, + "step": 88900 + }, + { + "epoch": 0.28989944723667194, + "grad_norm": 0.6147534251213074, + "learning_rate": 3.550618418009532e-05, + "loss": 0.5544, + "step": 89000 + }, + { + "epoch": 0.2902251769526682, + "grad_norm": 0.42478904128074646, + "learning_rate": 3.5489897163778976e-05, + "loss": 0.5376, + "step": 89100 + }, + { + "epoch": 0.29055090666866445, + "grad_norm": 0.5254961252212524, + "learning_rate": 3.547361014746265e-05, + "loss": 0.4964, + "step": 89200 + }, + { + "epoch": 0.2908766363846607, + "grad_norm": 0.6934669017791748, + "learning_rate": 3.5457323131146314e-05, + "loss": 0.4835, + "step": 89300 + }, + { + "epoch": 0.291202366100657, + "grad_norm": 0.4250465929508209, + "learning_rate": 3.544103611482999e-05, + "loss": 0.4954, + "step": 89400 + }, + { + "epoch": 0.29152809581665323, + "grad_norm": 0.6067728996276855, + "learning_rate": 3.5424749098513646e-05, + "loss": 0.4926, + "step": 89500 + }, + { + "epoch": 0.2918538255326495, + "grad_norm": 0.5424463748931885, + "learning_rate": 3.540846208219731e-05, + "loss": 0.5627, + "step": 89600 + }, + { + "epoch": 0.2921795552486458, + "grad_norm": 0.5810889005661011, + "learning_rate": 3.5392175065880984e-05, + "loss": 0.4316, + "step": 89700 + }, + { + "epoch": 0.29250528496464206, + "grad_norm": 0.4583912491798401, + "learning_rate": 3.537588804956465e-05, + "loss": 0.4987, + "step": 89800 + }, + { + "epoch": 0.2928310146806383, + "grad_norm": 0.4320780634880066, + "learning_rate": 3.5359601033248315e-05, + "loss": 0.5204, + "step": 89900 + }, + { + "epoch": 0.2931567443966346, + "grad_norm": 0.6955101490020752, + "learning_rate": 3.534331401693198e-05, + "loss": 0.5179, + "step": 90000 + }, + { + "epoch": 0.29348247411263084, + "grad_norm": 0.512250542640686, + "learning_rate": 3.5327027000615654e-05, + "loss": 0.4909, + "step": 90100 + }, + { + "epoch": 0.2938082038286271, + "grad_norm": 0.7975231409072876, + "learning_rate": 3.531073998429932e-05, + "loss": 0.4845, + "step": 90200 + }, + { + "epoch": 0.29413393354462336, + "grad_norm": 0.25338149070739746, + "learning_rate": 3.5294452967982985e-05, + "loss": 0.4963, + "step": 90300 + }, + { + "epoch": 0.2944596632606196, + "grad_norm": 0.43115437030792236, + "learning_rate": 3.527816595166665e-05, + "loss": 0.5203, + "step": 90400 + }, + { + "epoch": 0.2947853929766159, + "grad_norm": 0.830754280090332, + "learning_rate": 3.5261878935350317e-05, + "loss": 0.4916, + "step": 90500 + }, + { + "epoch": 0.29511112269261214, + "grad_norm": 0.8370751738548279, + "learning_rate": 3.524559191903399e-05, + "loss": 0.547, + "step": 90600 + }, + { + "epoch": 0.2954368524086084, + "grad_norm": 0.7122400403022766, + "learning_rate": 3.5229304902717655e-05, + "loss": 0.5126, + "step": 90700 + }, + { + "epoch": 0.29576258212460466, + "grad_norm": 0.4084763824939728, + "learning_rate": 3.521301788640132e-05, + "loss": 0.4971, + "step": 90800 + }, + { + "epoch": 0.2960883118406009, + "grad_norm": 0.8079352974891663, + "learning_rate": 3.5196730870084986e-05, + "loss": 0.4992, + "step": 90900 + }, + { + "epoch": 0.2964140415565972, + "grad_norm": 0.25352516770362854, + "learning_rate": 3.518044385376865e-05, + "loss": 0.5333, + "step": 91000 + }, + { + "epoch": 0.29673977127259343, + "grad_norm": 0.5390329957008362, + "learning_rate": 3.5164156837452324e-05, + "loss": 0.5007, + "step": 91100 + }, + { + "epoch": 0.2970655009885897, + "grad_norm": 0.6617804765701294, + "learning_rate": 3.514786982113599e-05, + "loss": 0.548, + "step": 91200 + }, + { + "epoch": 0.29739123070458595, + "grad_norm": 0.7202132940292358, + "learning_rate": 3.5131582804819656e-05, + "loss": 0.5417, + "step": 91300 + }, + { + "epoch": 0.2977169604205822, + "grad_norm": 0.28012895584106445, + "learning_rate": 3.511529578850332e-05, + "loss": 0.4883, + "step": 91400 + }, + { + "epoch": 0.29804269013657847, + "grad_norm": 0.3527827560901642, + "learning_rate": 3.5099008772186994e-05, + "loss": 0.523, + "step": 91500 + }, + { + "epoch": 0.29836841985257473, + "grad_norm": 0.7193790078163147, + "learning_rate": 3.508272175587066e-05, + "loss": 0.5148, + "step": 91600 + }, + { + "epoch": 0.298694149568571, + "grad_norm": 0.9702345728874207, + "learning_rate": 3.506643473955432e-05, + "loss": 0.4781, + "step": 91700 + }, + { + "epoch": 0.29901987928456725, + "grad_norm": 0.7323670983314514, + "learning_rate": 3.505014772323799e-05, + "loss": 0.5394, + "step": 91800 + }, + { + "epoch": 0.2993456090005635, + "grad_norm": 0.6757960915565491, + "learning_rate": 3.503386070692166e-05, + "loss": 0.4984, + "step": 91900 + }, + { + "epoch": 0.29967133871655977, + "grad_norm": 0.7119109630584717, + "learning_rate": 3.501757369060533e-05, + "loss": 0.5502, + "step": 92000 + }, + { + "epoch": 0.299997068432556, + "grad_norm": 0.6820542216300964, + "learning_rate": 3.500128667428899e-05, + "loss": 0.5498, + "step": 92100 + }, + { + "epoch": 0.3003227981485523, + "grad_norm": 0.784050703048706, + "learning_rate": 3.498499965797266e-05, + "loss": 0.5445, + "step": 92200 + }, + { + "epoch": 0.30064852786454854, + "grad_norm": 0.6549366116523743, + "learning_rate": 3.496871264165633e-05, + "loss": 0.5326, + "step": 92300 + }, + { + "epoch": 0.3009742575805448, + "grad_norm": 0.4872061014175415, + "learning_rate": 3.495242562533999e-05, + "loss": 0.5093, + "step": 92400 + }, + { + "epoch": 0.30129998729654106, + "grad_norm": 0.3646996319293976, + "learning_rate": 3.493613860902366e-05, + "loss": 0.5476, + "step": 92500 + }, + { + "epoch": 0.3016257170125373, + "grad_norm": 0.5709706544876099, + "learning_rate": 3.4919851592707324e-05, + "loss": 0.4513, + "step": 92600 + }, + { + "epoch": 0.3019514467285336, + "grad_norm": 0.6031984090805054, + "learning_rate": 3.4903564576390996e-05, + "loss": 0.5044, + "step": 92700 + }, + { + "epoch": 0.30227717644452984, + "grad_norm": 0.8381587862968445, + "learning_rate": 3.488727756007466e-05, + "loss": 0.5128, + "step": 92800 + }, + { + "epoch": 0.3026029061605261, + "grad_norm": 1.0859401226043701, + "learning_rate": 3.487099054375833e-05, + "loss": 0.5328, + "step": 92900 + }, + { + "epoch": 0.30292863587652236, + "grad_norm": 0.34642109274864197, + "learning_rate": 3.4854703527441994e-05, + "loss": 0.4852, + "step": 93000 + }, + { + "epoch": 0.3032543655925186, + "grad_norm": 0.6529460549354553, + "learning_rate": 3.483841651112566e-05, + "loss": 0.5032, + "step": 93100 + }, + { + "epoch": 0.3035800953085149, + "grad_norm": 0.7026881575584412, + "learning_rate": 3.482212949480933e-05, + "loss": 0.6338, + "step": 93200 + }, + { + "epoch": 0.30390582502451113, + "grad_norm": 0.49741417169570923, + "learning_rate": 3.4805842478493e-05, + "loss": 0.5231, + "step": 93300 + }, + { + "epoch": 0.30423155474050745, + "grad_norm": 0.6611301898956299, + "learning_rate": 3.478955546217666e-05, + "loss": 0.5189, + "step": 93400 + }, + { + "epoch": 0.3045572844565037, + "grad_norm": 0.6907228827476501, + "learning_rate": 3.477326844586033e-05, + "loss": 0.5256, + "step": 93500 + }, + { + "epoch": 0.30488301417249997, + "grad_norm": 0.5975654721260071, + "learning_rate": 3.4756981429544e-05, + "loss": 0.522, + "step": 93600 + }, + { + "epoch": 0.3052087438884962, + "grad_norm": 0.6043006777763367, + "learning_rate": 3.474069441322767e-05, + "loss": 0.5018, + "step": 93700 + }, + { + "epoch": 0.3055344736044925, + "grad_norm": 0.5697898864746094, + "learning_rate": 3.4724407396911326e-05, + "loss": 0.5009, + "step": 93800 + }, + { + "epoch": 0.30586020332048874, + "grad_norm": 0.40364518761634827, + "learning_rate": 3.4708120380595e-05, + "loss": 0.4642, + "step": 93900 + }, + { + "epoch": 0.306185933036485, + "grad_norm": 0.940877377986908, + "learning_rate": 3.4691833364278664e-05, + "loss": 0.5136, + "step": 94000 + }, + { + "epoch": 0.30651166275248126, + "grad_norm": 0.7497209310531616, + "learning_rate": 3.467554634796234e-05, + "loss": 0.5261, + "step": 94100 + }, + { + "epoch": 0.3068373924684775, + "grad_norm": 0.8120318651199341, + "learning_rate": 3.4659259331645996e-05, + "loss": 0.4756, + "step": 94200 + }, + { + "epoch": 0.3071631221844738, + "grad_norm": 0.6802115440368652, + "learning_rate": 3.464297231532967e-05, + "loss": 0.5257, + "step": 94300 + }, + { + "epoch": 0.30748885190047004, + "grad_norm": 0.43083488941192627, + "learning_rate": 3.4626685299013334e-05, + "loss": 0.5365, + "step": 94400 + }, + { + "epoch": 0.3078145816164663, + "grad_norm": 0.6194273233413696, + "learning_rate": 3.4610398282697e-05, + "loss": 0.5157, + "step": 94500 + }, + { + "epoch": 0.30814031133246256, + "grad_norm": 0.5603410601615906, + "learning_rate": 3.4594111266380666e-05, + "loss": 0.51, + "step": 94600 + }, + { + "epoch": 0.3084660410484588, + "grad_norm": 1.0651506185531616, + "learning_rate": 3.457782425006433e-05, + "loss": 0.4759, + "step": 94700 + }, + { + "epoch": 0.3087917707644551, + "grad_norm": 0.7674971222877502, + "learning_rate": 3.4561537233748004e-05, + "loss": 0.467, + "step": 94800 + }, + { + "epoch": 0.30911750048045133, + "grad_norm": 0.9666951298713684, + "learning_rate": 3.454525021743167e-05, + "loss": 0.5524, + "step": 94900 + }, + { + "epoch": 0.3094432301964476, + "grad_norm": 0.6148163080215454, + "learning_rate": 3.4528963201115335e-05, + "loss": 0.5345, + "step": 95000 + }, + { + "epoch": 0.30976895991244385, + "grad_norm": 0.7641096711158752, + "learning_rate": 3.4512676184799e-05, + "loss": 0.4872, + "step": 95100 + }, + { + "epoch": 0.3100946896284401, + "grad_norm": 0.6152538657188416, + "learning_rate": 3.449638916848267e-05, + "loss": 0.4832, + "step": 95200 + }, + { + "epoch": 0.31042041934443637, + "grad_norm": 0.7761083841323853, + "learning_rate": 3.448010215216634e-05, + "loss": 0.4761, + "step": 95300 + }, + { + "epoch": 0.31074614906043263, + "grad_norm": 0.6005348563194275, + "learning_rate": 3.4463815135850005e-05, + "loss": 0.4585, + "step": 95400 + }, + { + "epoch": 0.3110718787764289, + "grad_norm": 0.7649496793746948, + "learning_rate": 3.444752811953367e-05, + "loss": 0.5283, + "step": 95500 + }, + { + "epoch": 0.31139760849242515, + "grad_norm": 0.9503573179244995, + "learning_rate": 3.4431241103217336e-05, + "loss": 0.5032, + "step": 95600 + }, + { + "epoch": 0.3117233382084214, + "grad_norm": 0.8403215408325195, + "learning_rate": 3.441495408690101e-05, + "loss": 0.5172, + "step": 95700 + }, + { + "epoch": 0.31204906792441767, + "grad_norm": 0.5137957334518433, + "learning_rate": 3.4398667070584675e-05, + "loss": 0.5551, + "step": 95800 + }, + { + "epoch": 0.3123747976404139, + "grad_norm": 0.6618998646736145, + "learning_rate": 3.438238005426834e-05, + "loss": 0.5237, + "step": 95900 + }, + { + "epoch": 0.3127005273564102, + "grad_norm": 0.3272695541381836, + "learning_rate": 3.4366093037952006e-05, + "loss": 0.4556, + "step": 96000 + }, + { + "epoch": 0.31302625707240644, + "grad_norm": 0.7416215538978577, + "learning_rate": 3.434980602163567e-05, + "loss": 0.5039, + "step": 96100 + }, + { + "epoch": 0.3133519867884027, + "grad_norm": 0.9183087944984436, + "learning_rate": 3.4333519005319344e-05, + "loss": 0.5408, + "step": 96200 + }, + { + "epoch": 0.31367771650439896, + "grad_norm": 0.3782617151737213, + "learning_rate": 3.431723198900301e-05, + "loss": 0.5113, + "step": 96300 + }, + { + "epoch": 0.3140034462203952, + "grad_norm": 0.6314922571182251, + "learning_rate": 3.4300944972686676e-05, + "loss": 0.4955, + "step": 96400 + }, + { + "epoch": 0.3143291759363915, + "grad_norm": 0.3009500801563263, + "learning_rate": 3.428465795637034e-05, + "loss": 0.5114, + "step": 96500 + }, + { + "epoch": 0.31465490565238774, + "grad_norm": 0.8378229737281799, + "learning_rate": 3.4268370940054014e-05, + "loss": 0.5287, + "step": 96600 + }, + { + "epoch": 0.314980635368384, + "grad_norm": 0.7249593138694763, + "learning_rate": 3.425208392373768e-05, + "loss": 0.5209, + "step": 96700 + }, + { + "epoch": 0.31530636508438026, + "grad_norm": 0.45489412546157837, + "learning_rate": 3.423579690742134e-05, + "loss": 0.5745, + "step": 96800 + }, + { + "epoch": 0.3156320948003765, + "grad_norm": 0.6379255056381226, + "learning_rate": 3.421950989110501e-05, + "loss": 0.5199, + "step": 96900 + }, + { + "epoch": 0.31595782451637283, + "grad_norm": 0.8550392389297485, + "learning_rate": 3.420322287478868e-05, + "loss": 0.5374, + "step": 97000 + }, + { + "epoch": 0.3162835542323691, + "grad_norm": 0.5571677684783936, + "learning_rate": 3.418693585847235e-05, + "loss": 0.5057, + "step": 97100 + }, + { + "epoch": 0.31660928394836535, + "grad_norm": 0.48302140831947327, + "learning_rate": 3.417064884215601e-05, + "loss": 0.5496, + "step": 97200 + }, + { + "epoch": 0.3169350136643616, + "grad_norm": 0.7864711284637451, + "learning_rate": 3.415436182583968e-05, + "loss": 0.5132, + "step": 97300 + }, + { + "epoch": 0.31726074338035787, + "grad_norm": 0.5517250299453735, + "learning_rate": 3.413807480952335e-05, + "loss": 0.4826, + "step": 97400 + }, + { + "epoch": 0.3175864730963541, + "grad_norm": 0.7834230065345764, + "learning_rate": 3.412178779320701e-05, + "loss": 0.5186, + "step": 97500 + }, + { + "epoch": 0.3179122028123504, + "grad_norm": 0.938097357749939, + "learning_rate": 3.410550077689068e-05, + "loss": 0.4817, + "step": 97600 + }, + { + "epoch": 0.31823793252834665, + "grad_norm": 0.25078582763671875, + "learning_rate": 3.4089213760574344e-05, + "loss": 0.4996, + "step": 97700 + }, + { + "epoch": 0.3185636622443429, + "grad_norm": 0.7896013259887695, + "learning_rate": 3.4072926744258016e-05, + "loss": 0.5163, + "step": 97800 + }, + { + "epoch": 0.31888939196033916, + "grad_norm": 0.6857266426086426, + "learning_rate": 3.405663972794168e-05, + "loss": 0.4952, + "step": 97900 + }, + { + "epoch": 0.3192151216763354, + "grad_norm": 0.5710707306861877, + "learning_rate": 3.404035271162535e-05, + "loss": 0.5273, + "step": 98000 + }, + { + "epoch": 0.3195408513923317, + "grad_norm": 0.5274339914321899, + "learning_rate": 3.4024065695309014e-05, + "loss": 0.5385, + "step": 98100 + }, + { + "epoch": 0.31986658110832794, + "grad_norm": 0.27135804295539856, + "learning_rate": 3.400777867899268e-05, + "loss": 0.5042, + "step": 98200 + }, + { + "epoch": 0.3201923108243242, + "grad_norm": 0.6852828860282898, + "learning_rate": 3.399149166267635e-05, + "loss": 0.5214, + "step": 98300 + }, + { + "epoch": 0.32051804054032046, + "grad_norm": 0.5614081621170044, + "learning_rate": 3.397520464636002e-05, + "loss": 0.5023, + "step": 98400 + }, + { + "epoch": 0.3208437702563167, + "grad_norm": 0.7719017863273621, + "learning_rate": 3.395891763004368e-05, + "loss": 0.4919, + "step": 98500 + }, + { + "epoch": 0.321169499972313, + "grad_norm": 0.8100476264953613, + "learning_rate": 3.394263061372735e-05, + "loss": 0.4607, + "step": 98600 + }, + { + "epoch": 0.32149522968830924, + "grad_norm": 0.6814531087875366, + "learning_rate": 3.392634359741102e-05, + "loss": 0.5457, + "step": 98700 + }, + { + "epoch": 0.3218209594043055, + "grad_norm": 1.0356829166412354, + "learning_rate": 3.391005658109469e-05, + "loss": 0.4844, + "step": 98800 + }, + { + "epoch": 0.32214668912030175, + "grad_norm": 0.8719603419303894, + "learning_rate": 3.3893769564778346e-05, + "loss": 0.5182, + "step": 98900 + }, + { + "epoch": 0.322472418836298, + "grad_norm": 0.6145396828651428, + "learning_rate": 3.387748254846202e-05, + "loss": 0.4732, + "step": 99000 + }, + { + "epoch": 0.3227981485522943, + "grad_norm": 1.005679726600647, + "learning_rate": 3.3861195532145684e-05, + "loss": 0.5182, + "step": 99100 + }, + { + "epoch": 0.32312387826829053, + "grad_norm": 0.29751360416412354, + "learning_rate": 3.384490851582936e-05, + "loss": 0.4823, + "step": 99200 + }, + { + "epoch": 0.3234496079842868, + "grad_norm": 0.7968891263008118, + "learning_rate": 3.3828621499513016e-05, + "loss": 0.5235, + "step": 99300 + }, + { + "epoch": 0.32377533770028305, + "grad_norm": 0.7049364447593689, + "learning_rate": 3.381233448319669e-05, + "loss": 0.5392, + "step": 99400 + }, + { + "epoch": 0.3241010674162793, + "grad_norm": 0.6265050172805786, + "learning_rate": 3.3796047466880354e-05, + "loss": 0.5119, + "step": 99500 + }, + { + "epoch": 0.32442679713227557, + "grad_norm": 0.6732152104377747, + "learning_rate": 3.377976045056402e-05, + "loss": 0.4837, + "step": 99600 + }, + { + "epoch": 0.3247525268482718, + "grad_norm": 0.25657424330711365, + "learning_rate": 3.3763473434247686e-05, + "loss": 0.5199, + "step": 99700 + }, + { + "epoch": 0.3250782565642681, + "grad_norm": 0.4994146227836609, + "learning_rate": 3.374718641793135e-05, + "loss": 0.4894, + "step": 99800 + }, + { + "epoch": 0.32540398628026435, + "grad_norm": 0.7468940615653992, + "learning_rate": 3.3730899401615024e-05, + "loss": 0.5409, + "step": 99900 + }, + { + "epoch": 0.3257297159962606, + "grad_norm": 0.17829063534736633, + "learning_rate": 3.371461238529869e-05, + "loss": 0.5111, + "step": 100000 + }, + { + "epoch": 0.32605544571225686, + "grad_norm": 0.6492403745651245, + "learning_rate": 3.369832536898236e-05, + "loss": 0.5085, + "step": 100100 + }, + { + "epoch": 0.3263811754282531, + "grad_norm": 0.41203296184539795, + "learning_rate": 3.368203835266602e-05, + "loss": 0.4674, + "step": 100200 + }, + { + "epoch": 0.3267069051442494, + "grad_norm": 0.6258901953697205, + "learning_rate": 3.366575133634969e-05, + "loss": 0.4797, + "step": 100300 + }, + { + "epoch": 0.32703263486024564, + "grad_norm": 0.5243533849716187, + "learning_rate": 3.364946432003336e-05, + "loss": 0.4851, + "step": 100400 + }, + { + "epoch": 0.3273583645762419, + "grad_norm": 0.7344015836715698, + "learning_rate": 3.3633177303717025e-05, + "loss": 0.4964, + "step": 100500 + }, + { + "epoch": 0.32768409429223816, + "grad_norm": 1.1914827823638916, + "learning_rate": 3.361689028740069e-05, + "loss": 0.4923, + "step": 100600 + }, + { + "epoch": 0.3280098240082345, + "grad_norm": 0.7036446928977966, + "learning_rate": 3.3600603271084356e-05, + "loss": 0.5234, + "step": 100700 + }, + { + "epoch": 0.32833555372423073, + "grad_norm": 0.8239650726318359, + "learning_rate": 3.358431625476803e-05, + "loss": 0.4715, + "step": 100800 + }, + { + "epoch": 0.328661283440227, + "grad_norm": 0.6158246397972107, + "learning_rate": 3.3568029238451695e-05, + "loss": 0.488, + "step": 100900 + }, + { + "epoch": 0.32898701315622325, + "grad_norm": 0.708604633808136, + "learning_rate": 3.355174222213536e-05, + "loss": 0.4674, + "step": 101000 + }, + { + "epoch": 0.3293127428722195, + "grad_norm": 0.5420898199081421, + "learning_rate": 3.3535455205819026e-05, + "loss": 0.4741, + "step": 101100 + }, + { + "epoch": 0.32963847258821577, + "grad_norm": 0.49769943952560425, + "learning_rate": 3.351916818950269e-05, + "loss": 0.4638, + "step": 101200 + }, + { + "epoch": 0.32996420230421203, + "grad_norm": 0.7099531888961792, + "learning_rate": 3.3502881173186364e-05, + "loss": 0.5236, + "step": 101300 + }, + { + "epoch": 0.3302899320202083, + "grad_norm": 0.712815523147583, + "learning_rate": 3.348659415687003e-05, + "loss": 0.5268, + "step": 101400 + }, + { + "epoch": 0.33061566173620455, + "grad_norm": 0.8762120008468628, + "learning_rate": 3.3470307140553696e-05, + "loss": 0.5045, + "step": 101500 + }, + { + "epoch": 0.3309413914522008, + "grad_norm": 0.7411269545555115, + "learning_rate": 3.345402012423736e-05, + "loss": 0.5017, + "step": 101600 + }, + { + "epoch": 0.33126712116819707, + "grad_norm": 0.7993664145469666, + "learning_rate": 3.343773310792103e-05, + "loss": 0.4866, + "step": 101700 + }, + { + "epoch": 0.3315928508841933, + "grad_norm": 0.9997897148132324, + "learning_rate": 3.34214460916047e-05, + "loss": 0.5033, + "step": 101800 + }, + { + "epoch": 0.3319185806001896, + "grad_norm": 0.3995771110057831, + "learning_rate": 3.340515907528836e-05, + "loss": 0.5037, + "step": 101900 + }, + { + "epoch": 0.33224431031618584, + "grad_norm": 0.4990951418876648, + "learning_rate": 3.338887205897203e-05, + "loss": 0.5353, + "step": 102000 + }, + { + "epoch": 0.3325700400321821, + "grad_norm": 0.4299832880496979, + "learning_rate": 3.33725850426557e-05, + "loss": 0.5121, + "step": 102100 + }, + { + "epoch": 0.33289576974817836, + "grad_norm": 0.9922016263008118, + "learning_rate": 3.335629802633937e-05, + "loss": 0.4948, + "step": 102200 + }, + { + "epoch": 0.3332214994641746, + "grad_norm": 0.547074556350708, + "learning_rate": 3.334001101002303e-05, + "loss": 0.5031, + "step": 102300 + }, + { + "epoch": 0.3335472291801709, + "grad_norm": 0.799204409122467, + "learning_rate": 3.3323723993706694e-05, + "loss": 0.4683, + "step": 102400 + }, + { + "epoch": 0.33387295889616714, + "grad_norm": 0.8631702065467834, + "learning_rate": 3.3307436977390367e-05, + "loss": 0.4813, + "step": 102500 + }, + { + "epoch": 0.3341986886121634, + "grad_norm": 1.0079576969146729, + "learning_rate": 3.329114996107403e-05, + "loss": 0.48, + "step": 102600 + }, + { + "epoch": 0.33452441832815966, + "grad_norm": 0.6884191036224365, + "learning_rate": 3.32748629447577e-05, + "loss": 0.5356, + "step": 102700 + }, + { + "epoch": 0.3348501480441559, + "grad_norm": 0.9845031499862671, + "learning_rate": 3.3258575928441364e-05, + "loss": 0.5276, + "step": 102800 + }, + { + "epoch": 0.3351758777601522, + "grad_norm": 0.5960990786552429, + "learning_rate": 3.3242288912125036e-05, + "loss": 0.4858, + "step": 102900 + }, + { + "epoch": 0.33550160747614843, + "grad_norm": 0.5453081727027893, + "learning_rate": 3.32260018958087e-05, + "loss": 0.5118, + "step": 103000 + }, + { + "epoch": 0.3358273371921447, + "grad_norm": 0.5795672535896301, + "learning_rate": 3.320971487949237e-05, + "loss": 0.4631, + "step": 103100 + }, + { + "epoch": 0.33615306690814095, + "grad_norm": 1.148959994316101, + "learning_rate": 3.3193427863176033e-05, + "loss": 0.4791, + "step": 103200 + }, + { + "epoch": 0.3364787966241372, + "grad_norm": 0.5743905901908875, + "learning_rate": 3.31771408468597e-05, + "loss": 0.4983, + "step": 103300 + }, + { + "epoch": 0.33680452634013347, + "grad_norm": 1.2373428344726562, + "learning_rate": 3.316085383054337e-05, + "loss": 0.4886, + "step": 103400 + }, + { + "epoch": 0.33713025605612973, + "grad_norm": 0.6242794990539551, + "learning_rate": 3.314456681422704e-05, + "loss": 0.4817, + "step": 103500 + }, + { + "epoch": 0.337455985772126, + "grad_norm": 0.3083389103412628, + "learning_rate": 3.31282797979107e-05, + "loss": 0.4843, + "step": 103600 + }, + { + "epoch": 0.33778171548812225, + "grad_norm": 0.4972945749759674, + "learning_rate": 3.311199278159437e-05, + "loss": 0.4806, + "step": 103700 + }, + { + "epoch": 0.3381074452041185, + "grad_norm": 0.7972423434257507, + "learning_rate": 3.309570576527804e-05, + "loss": 0.4699, + "step": 103800 + }, + { + "epoch": 0.33843317492011477, + "grad_norm": 0.5987827777862549, + "learning_rate": 3.307941874896171e-05, + "loss": 0.4969, + "step": 103900 + }, + { + "epoch": 0.338758904636111, + "grad_norm": 0.7832911014556885, + "learning_rate": 3.3063131732645366e-05, + "loss": 0.4627, + "step": 104000 + }, + { + "epoch": 0.3390846343521073, + "grad_norm": 0.4860471189022064, + "learning_rate": 3.304684471632904e-05, + "loss": 0.4596, + "step": 104100 + }, + { + "epoch": 0.33941036406810354, + "grad_norm": 0.3446727693080902, + "learning_rate": 3.3030557700012704e-05, + "loss": 0.4668, + "step": 104200 + }, + { + "epoch": 0.33973609378409986, + "grad_norm": 0.5124432444572449, + "learning_rate": 3.301427068369638e-05, + "loss": 0.5025, + "step": 104300 + }, + { + "epoch": 0.3400618235000961, + "grad_norm": 0.6023364663124084, + "learning_rate": 3.2997983667380036e-05, + "loss": 0.4446, + "step": 104400 + }, + { + "epoch": 0.3403875532160924, + "grad_norm": 0.7395136952400208, + "learning_rate": 3.298169665106371e-05, + "loss": 0.4543, + "step": 104500 + }, + { + "epoch": 0.34071328293208863, + "grad_norm": 0.8566365838050842, + "learning_rate": 3.2965409634747374e-05, + "loss": 0.5162, + "step": 104600 + }, + { + "epoch": 0.3410390126480849, + "grad_norm": 0.5422640442848206, + "learning_rate": 3.294912261843104e-05, + "loss": 0.4841, + "step": 104700 + }, + { + "epoch": 0.34136474236408115, + "grad_norm": 1.2125647068023682, + "learning_rate": 3.293283560211471e-05, + "loss": 0.5119, + "step": 104800 + }, + { + "epoch": 0.3416904720800774, + "grad_norm": 0.7454204559326172, + "learning_rate": 3.291654858579837e-05, + "loss": 0.4564, + "step": 104900 + }, + { + "epoch": 0.34201620179607367, + "grad_norm": 0.4049842953681946, + "learning_rate": 3.2900261569482044e-05, + "loss": 0.512, + "step": 105000 + }, + { + "epoch": 0.34234193151206993, + "grad_norm": 0.4401283264160156, + "learning_rate": 3.288397455316571e-05, + "loss": 0.515, + "step": 105100 + }, + { + "epoch": 0.3426676612280662, + "grad_norm": 1.0636835098266602, + "learning_rate": 3.286768753684938e-05, + "loss": 0.5331, + "step": 105200 + }, + { + "epoch": 0.34299339094406245, + "grad_norm": 0.5115429759025574, + "learning_rate": 3.285140052053304e-05, + "loss": 0.4552, + "step": 105300 + }, + { + "epoch": 0.3433191206600587, + "grad_norm": 0.5709575414657593, + "learning_rate": 3.2835113504216707e-05, + "loss": 0.4783, + "step": 105400 + }, + { + "epoch": 0.34364485037605497, + "grad_norm": 0.3476814329624176, + "learning_rate": 3.281882648790038e-05, + "loss": 0.4824, + "step": 105500 + }, + { + "epoch": 0.3439705800920512, + "grad_norm": 0.5530911684036255, + "learning_rate": 3.2802539471584045e-05, + "loss": 0.4672, + "step": 105600 + }, + { + "epoch": 0.3442963098080475, + "grad_norm": 0.7868565320968628, + "learning_rate": 3.278625245526771e-05, + "loss": 0.4574, + "step": 105700 + }, + { + "epoch": 0.34462203952404374, + "grad_norm": 0.9544464945793152, + "learning_rate": 3.2769965438951376e-05, + "loss": 0.4785, + "step": 105800 + }, + { + "epoch": 0.34494776924004, + "grad_norm": 0.6327000856399536, + "learning_rate": 3.275367842263505e-05, + "loss": 0.4899, + "step": 105900 + }, + { + "epoch": 0.34527349895603626, + "grad_norm": 0.5785555839538574, + "learning_rate": 3.2737391406318714e-05, + "loss": 0.5283, + "step": 106000 + }, + { + "epoch": 0.3455992286720325, + "grad_norm": 1.1979222297668457, + "learning_rate": 3.272110439000238e-05, + "loss": 0.4504, + "step": 106100 + }, + { + "epoch": 0.3459249583880288, + "grad_norm": 0.64732426404953, + "learning_rate": 3.2704817373686046e-05, + "loss": 0.4814, + "step": 106200 + }, + { + "epoch": 0.34625068810402504, + "grad_norm": 0.46975287795066833, + "learning_rate": 3.268853035736971e-05, + "loss": 0.453, + "step": 106300 + }, + { + "epoch": 0.3465764178200213, + "grad_norm": 0.3508839011192322, + "learning_rate": 3.2672243341053384e-05, + "loss": 0.5106, + "step": 106400 + }, + { + "epoch": 0.34690214753601756, + "grad_norm": 0.6801757216453552, + "learning_rate": 3.265595632473705e-05, + "loss": 0.4647, + "step": 106500 + }, + { + "epoch": 0.3472278772520138, + "grad_norm": 0.9168288111686707, + "learning_rate": 3.2639669308420716e-05, + "loss": 0.5184, + "step": 106600 + }, + { + "epoch": 0.3475536069680101, + "grad_norm": 0.7734511494636536, + "learning_rate": 3.262338229210438e-05, + "loss": 0.5379, + "step": 106700 + }, + { + "epoch": 0.34787933668400634, + "grad_norm": 0.4107971489429474, + "learning_rate": 3.260709527578805e-05, + "loss": 0.4379, + "step": 106800 + }, + { + "epoch": 0.3482050664000026, + "grad_norm": 0.7145285606384277, + "learning_rate": 3.259080825947172e-05, + "loss": 0.4784, + "step": 106900 + }, + { + "epoch": 0.34853079611599885, + "grad_norm": 0.6061236262321472, + "learning_rate": 3.257452124315538e-05, + "loss": 0.5022, + "step": 107000 + }, + { + "epoch": 0.3488565258319951, + "grad_norm": 0.49363043904304504, + "learning_rate": 3.255823422683905e-05, + "loss": 0.5555, + "step": 107100 + }, + { + "epoch": 0.34918225554799137, + "grad_norm": 0.9029503464698792, + "learning_rate": 3.254194721052272e-05, + "loss": 0.4924, + "step": 107200 + }, + { + "epoch": 0.34950798526398763, + "grad_norm": 0.914335310459137, + "learning_rate": 3.252566019420639e-05, + "loss": 0.4352, + "step": 107300 + }, + { + "epoch": 0.3498337149799839, + "grad_norm": 0.8748767375946045, + "learning_rate": 3.250937317789005e-05, + "loss": 0.4577, + "step": 107400 + }, + { + "epoch": 0.35015944469598015, + "grad_norm": 0.6719549298286438, + "learning_rate": 3.2493086161573714e-05, + "loss": 0.4886, + "step": 107500 + }, + { + "epoch": 0.3504851744119764, + "grad_norm": 0.7287290096282959, + "learning_rate": 3.2476799145257386e-05, + "loss": 0.5013, + "step": 107600 + }, + { + "epoch": 0.35081090412797267, + "grad_norm": 0.5061945915222168, + "learning_rate": 3.246051212894105e-05, + "loss": 0.4945, + "step": 107700 + }, + { + "epoch": 0.3511366338439689, + "grad_norm": 0.4315279722213745, + "learning_rate": 3.244422511262472e-05, + "loss": 0.4889, + "step": 107800 + }, + { + "epoch": 0.3514623635599652, + "grad_norm": 0.7010080814361572, + "learning_rate": 3.2427938096308384e-05, + "loss": 0.4927, + "step": 107900 + }, + { + "epoch": 0.3517880932759615, + "grad_norm": 0.5063943862915039, + "learning_rate": 3.2411651079992056e-05, + "loss": 0.485, + "step": 108000 + }, + { + "epoch": 0.35211382299195776, + "grad_norm": 0.4876722991466522, + "learning_rate": 3.239536406367572e-05, + "loss": 0.4779, + "step": 108100 + }, + { + "epoch": 0.352439552707954, + "grad_norm": 0.5269170999526978, + "learning_rate": 3.237907704735939e-05, + "loss": 0.4836, + "step": 108200 + }, + { + "epoch": 0.3527652824239503, + "grad_norm": 0.49777817726135254, + "learning_rate": 3.236279003104305e-05, + "loss": 0.5038, + "step": 108300 + }, + { + "epoch": 0.35309101213994654, + "grad_norm": 0.6626752018928528, + "learning_rate": 3.234650301472672e-05, + "loss": 0.434, + "step": 108400 + }, + { + "epoch": 0.3534167418559428, + "grad_norm": 0.5564941167831421, + "learning_rate": 3.233021599841039e-05, + "loss": 0.4893, + "step": 108500 + }, + { + "epoch": 0.35374247157193905, + "grad_norm": 0.9265629649162292, + "learning_rate": 3.231392898209406e-05, + "loss": 0.4447, + "step": 108600 + }, + { + "epoch": 0.3540682012879353, + "grad_norm": 0.3788335919380188, + "learning_rate": 3.229764196577772e-05, + "loss": 0.4792, + "step": 108700 + }, + { + "epoch": 0.3543939310039316, + "grad_norm": 0.7376036643981934, + "learning_rate": 3.228135494946139e-05, + "loss": 0.4611, + "step": 108800 + }, + { + "epoch": 0.35471966071992783, + "grad_norm": 0.6144190430641174, + "learning_rate": 3.2265067933145054e-05, + "loss": 0.5023, + "step": 108900 + }, + { + "epoch": 0.3550453904359241, + "grad_norm": 0.8389730453491211, + "learning_rate": 3.224878091682873e-05, + "loss": 0.4752, + "step": 109000 + }, + { + "epoch": 0.35537112015192035, + "grad_norm": 0.6739189624786377, + "learning_rate": 3.223249390051239e-05, + "loss": 0.5024, + "step": 109100 + }, + { + "epoch": 0.3556968498679166, + "grad_norm": 0.6198161840438843, + "learning_rate": 3.221620688419606e-05, + "loss": 0.4733, + "step": 109200 + }, + { + "epoch": 0.35602257958391287, + "grad_norm": 0.6034826636314392, + "learning_rate": 3.2199919867879724e-05, + "loss": 0.4766, + "step": 109300 + }, + { + "epoch": 0.3563483092999091, + "grad_norm": 0.38500547409057617, + "learning_rate": 3.21836328515634e-05, + "loss": 0.4524, + "step": 109400 + }, + { + "epoch": 0.3566740390159054, + "grad_norm": 0.8445745706558228, + "learning_rate": 3.216734583524706e-05, + "loss": 0.4634, + "step": 109500 + }, + { + "epoch": 0.35699976873190165, + "grad_norm": 0.6940500140190125, + "learning_rate": 3.215105881893072e-05, + "loss": 0.4598, + "step": 109600 + }, + { + "epoch": 0.3573254984478979, + "grad_norm": 0.7966079115867615, + "learning_rate": 3.2134771802614394e-05, + "loss": 0.5149, + "step": 109700 + }, + { + "epoch": 0.35765122816389416, + "grad_norm": 0.47482743859291077, + "learning_rate": 3.211848478629806e-05, + "loss": 0.4576, + "step": 109800 + }, + { + "epoch": 0.3579769578798904, + "grad_norm": 0.6817350387573242, + "learning_rate": 3.210219776998173e-05, + "loss": 0.4921, + "step": 109900 + }, + { + "epoch": 0.3583026875958867, + "grad_norm": 0.7756426930427551, + "learning_rate": 3.208591075366539e-05, + "loss": 0.4892, + "step": 110000 + }, + { + "epoch": 0.35862841731188294, + "grad_norm": 0.5921733975410461, + "learning_rate": 3.2069623737349064e-05, + "loss": 0.5321, + "step": 110100 + }, + { + "epoch": 0.3589541470278792, + "grad_norm": 0.9905286431312561, + "learning_rate": 3.205333672103273e-05, + "loss": 0.4957, + "step": 110200 + }, + { + "epoch": 0.35927987674387546, + "grad_norm": 0.5868031978607178, + "learning_rate": 3.2037049704716395e-05, + "loss": 0.5048, + "step": 110300 + }, + { + "epoch": 0.3596056064598717, + "grad_norm": 0.8284581899642944, + "learning_rate": 3.202076268840006e-05, + "loss": 0.5218, + "step": 110400 + }, + { + "epoch": 0.359931336175868, + "grad_norm": 0.7068589925765991, + "learning_rate": 3.2004475672083726e-05, + "loss": 0.5366, + "step": 110500 + }, + { + "epoch": 0.36025706589186424, + "grad_norm": 0.5528571009635925, + "learning_rate": 3.19881886557674e-05, + "loss": 0.5041, + "step": 110600 + }, + { + "epoch": 0.3605827956078605, + "grad_norm": 0.39369356632232666, + "learning_rate": 3.1971901639451065e-05, + "loss": 0.4997, + "step": 110700 + }, + { + "epoch": 0.36090852532385675, + "grad_norm": 0.9514594674110413, + "learning_rate": 3.195561462313473e-05, + "loss": 0.4835, + "step": 110800 + }, + { + "epoch": 0.361234255039853, + "grad_norm": 0.7980431318283081, + "learning_rate": 3.1939327606818396e-05, + "loss": 0.4503, + "step": 110900 + }, + { + "epoch": 0.3615599847558493, + "grad_norm": 0.6403480172157288, + "learning_rate": 3.192304059050206e-05, + "loss": 0.4764, + "step": 111000 + }, + { + "epoch": 0.36188571447184553, + "grad_norm": 0.8618559837341309, + "learning_rate": 3.1906753574185734e-05, + "loss": 0.4899, + "step": 111100 + }, + { + "epoch": 0.3622114441878418, + "grad_norm": 0.6167279481887817, + "learning_rate": 3.18904665578694e-05, + "loss": 0.4384, + "step": 111200 + }, + { + "epoch": 0.36253717390383805, + "grad_norm": 1.0622551441192627, + "learning_rate": 3.1874179541553066e-05, + "loss": 0.4984, + "step": 111300 + }, + { + "epoch": 0.3628629036198343, + "grad_norm": 0.5838492512702942, + "learning_rate": 3.185789252523673e-05, + "loss": 0.4627, + "step": 111400 + }, + { + "epoch": 0.36318863333583057, + "grad_norm": 0.9143506288528442, + "learning_rate": 3.1841605508920404e-05, + "loss": 0.4615, + "step": 111500 + }, + { + "epoch": 0.3635143630518268, + "grad_norm": 0.7809180021286011, + "learning_rate": 3.182531849260407e-05, + "loss": 0.4696, + "step": 111600 + }, + { + "epoch": 0.36384009276782314, + "grad_norm": 0.5757014751434326, + "learning_rate": 3.180903147628773e-05, + "loss": 0.4414, + "step": 111700 + }, + { + "epoch": 0.3641658224838194, + "grad_norm": 0.24343223869800568, + "learning_rate": 3.17927444599714e-05, + "loss": 0.4851, + "step": 111800 + }, + { + "epoch": 0.36449155219981566, + "grad_norm": 1.036160945892334, + "learning_rate": 3.177645744365507e-05, + "loss": 0.4464, + "step": 111900 + }, + { + "epoch": 0.3648172819158119, + "grad_norm": 0.6821613311767578, + "learning_rate": 3.176017042733874e-05, + "loss": 0.5128, + "step": 112000 + }, + { + "epoch": 0.3651430116318082, + "grad_norm": 0.9862882494926453, + "learning_rate": 3.17438834110224e-05, + "loss": 0.4883, + "step": 112100 + }, + { + "epoch": 0.36546874134780444, + "grad_norm": 0.38645848631858826, + "learning_rate": 3.172759639470607e-05, + "loss": 0.4441, + "step": 112200 + }, + { + "epoch": 0.3657944710638007, + "grad_norm": 0.786322832107544, + "learning_rate": 3.171130937838974e-05, + "loss": 0.4965, + "step": 112300 + }, + { + "epoch": 0.36612020077979696, + "grad_norm": 0.8714563846588135, + "learning_rate": 3.169502236207341e-05, + "loss": 0.503, + "step": 112400 + }, + { + "epoch": 0.3664459304957932, + "grad_norm": 0.688576877117157, + "learning_rate": 3.167873534575707e-05, + "loss": 0.5003, + "step": 112500 + }, + { + "epoch": 0.3667716602117895, + "grad_norm": 0.6371302008628845, + "learning_rate": 3.1662448329440734e-05, + "loss": 0.4432, + "step": 112600 + }, + { + "epoch": 0.36709738992778573, + "grad_norm": 0.6222009658813477, + "learning_rate": 3.1646161313124406e-05, + "loss": 0.5025, + "step": 112700 + }, + { + "epoch": 0.367423119643782, + "grad_norm": 1.225849986076355, + "learning_rate": 3.162987429680807e-05, + "loss": 0.4406, + "step": 112800 + }, + { + "epoch": 0.36774884935977825, + "grad_norm": 0.321907639503479, + "learning_rate": 3.161358728049174e-05, + "loss": 0.4577, + "step": 112900 + }, + { + "epoch": 0.3680745790757745, + "grad_norm": 0.7436826229095459, + "learning_rate": 3.1597300264175404e-05, + "loss": 0.516, + "step": 113000 + }, + { + "epoch": 0.36840030879177077, + "grad_norm": 0.2427530735731125, + "learning_rate": 3.1581013247859076e-05, + "loss": 0.4627, + "step": 113100 + }, + { + "epoch": 0.36872603850776703, + "grad_norm": 0.5064482688903809, + "learning_rate": 3.156472623154274e-05, + "loss": 0.4448, + "step": 113200 + }, + { + "epoch": 0.3690517682237633, + "grad_norm": 0.7813482284545898, + "learning_rate": 3.154843921522641e-05, + "loss": 0.5084, + "step": 113300 + }, + { + "epoch": 0.36937749793975955, + "grad_norm": 0.2408231645822525, + "learning_rate": 3.153215219891007e-05, + "loss": 0.4684, + "step": 113400 + }, + { + "epoch": 0.3697032276557558, + "grad_norm": 0.9477859735488892, + "learning_rate": 3.151586518259374e-05, + "loss": 0.4893, + "step": 113500 + }, + { + "epoch": 0.37002895737175207, + "grad_norm": 0.5946941375732422, + "learning_rate": 3.149957816627741e-05, + "loss": 0.4421, + "step": 113600 + }, + { + "epoch": 0.3703546870877483, + "grad_norm": 0.5360888838768005, + "learning_rate": 3.148329114996108e-05, + "loss": 0.4807, + "step": 113700 + }, + { + "epoch": 0.3706804168037446, + "grad_norm": 0.544630229473114, + "learning_rate": 3.146700413364474e-05, + "loss": 0.4875, + "step": 113800 + }, + { + "epoch": 0.37100614651974084, + "grad_norm": 0.5878973007202148, + "learning_rate": 3.145071711732841e-05, + "loss": 0.4582, + "step": 113900 + }, + { + "epoch": 0.3713318762357371, + "grad_norm": 0.4286665916442871, + "learning_rate": 3.1434430101012074e-05, + "loss": 0.4684, + "step": 114000 + }, + { + "epoch": 0.37165760595173336, + "grad_norm": 0.49592384696006775, + "learning_rate": 3.141814308469575e-05, + "loss": 0.466, + "step": 114100 + }, + { + "epoch": 0.3719833356677296, + "grad_norm": 0.6512838006019592, + "learning_rate": 3.140185606837941e-05, + "loss": 0.5022, + "step": 114200 + }, + { + "epoch": 0.3723090653837259, + "grad_norm": 0.528073787689209, + "learning_rate": 3.138556905206308e-05, + "loss": 0.4804, + "step": 114300 + }, + { + "epoch": 0.37263479509972214, + "grad_norm": 0.8002887964248657, + "learning_rate": 3.1369282035746744e-05, + "loss": 0.5072, + "step": 114400 + }, + { + "epoch": 0.3729605248157184, + "grad_norm": 0.854848325252533, + "learning_rate": 3.1352995019430417e-05, + "loss": 0.4935, + "step": 114500 + }, + { + "epoch": 0.37328625453171466, + "grad_norm": 0.7511128783226013, + "learning_rate": 3.133670800311408e-05, + "loss": 0.4781, + "step": 114600 + }, + { + "epoch": 0.3736119842477109, + "grad_norm": 0.7878793478012085, + "learning_rate": 3.132042098679774e-05, + "loss": 0.4901, + "step": 114700 + }, + { + "epoch": 0.3739377139637072, + "grad_norm": 0.2934127449989319, + "learning_rate": 3.1304133970481414e-05, + "loss": 0.4662, + "step": 114800 + }, + { + "epoch": 0.37426344367970343, + "grad_norm": 0.8116273283958435, + "learning_rate": 3.128784695416508e-05, + "loss": 0.51, + "step": 114900 + }, + { + "epoch": 0.3745891733956997, + "grad_norm": 0.8469391465187073, + "learning_rate": 3.127155993784875e-05, + "loss": 0.463, + "step": 115000 + }, + { + "epoch": 0.37491490311169595, + "grad_norm": 0.533585250377655, + "learning_rate": 3.125527292153241e-05, + "loss": 0.4684, + "step": 115100 + }, + { + "epoch": 0.3752406328276922, + "grad_norm": 0.42549699544906616, + "learning_rate": 3.1238985905216083e-05, + "loss": 0.4559, + "step": 115200 + }, + { + "epoch": 0.3755663625436885, + "grad_norm": 0.49844828248023987, + "learning_rate": 3.122269888889975e-05, + "loss": 0.51, + "step": 115300 + }, + { + "epoch": 0.3758920922596848, + "grad_norm": 0.7514449954032898, + "learning_rate": 3.1206411872583415e-05, + "loss": 0.5036, + "step": 115400 + }, + { + "epoch": 0.37621782197568104, + "grad_norm": 0.4272593557834625, + "learning_rate": 3.119012485626708e-05, + "loss": 0.5142, + "step": 115500 + }, + { + "epoch": 0.3765435516916773, + "grad_norm": 0.7942133545875549, + "learning_rate": 3.1173837839950746e-05, + "loss": 0.4972, + "step": 115600 + }, + { + "epoch": 0.37686928140767356, + "grad_norm": 0.8216513395309448, + "learning_rate": 3.115755082363442e-05, + "loss": 0.4347, + "step": 115700 + }, + { + "epoch": 0.3771950111236698, + "grad_norm": 0.7874899506568909, + "learning_rate": 3.1141263807318085e-05, + "loss": 0.5038, + "step": 115800 + }, + { + "epoch": 0.3775207408396661, + "grad_norm": 0.6654713749885559, + "learning_rate": 3.112497679100175e-05, + "loss": 0.469, + "step": 115900 + }, + { + "epoch": 0.37784647055566234, + "grad_norm": 0.4107855260372162, + "learning_rate": 3.1108689774685416e-05, + "loss": 0.4415, + "step": 116000 + }, + { + "epoch": 0.3781722002716586, + "grad_norm": 1.1100335121154785, + "learning_rate": 3.109240275836908e-05, + "loss": 0.476, + "step": 116100 + }, + { + "epoch": 0.37849792998765486, + "grad_norm": 0.6517056226730347, + "learning_rate": 3.1076115742052754e-05, + "loss": 0.532, + "step": 116200 + }, + { + "epoch": 0.3788236597036511, + "grad_norm": 0.3325548470020294, + "learning_rate": 3.105982872573642e-05, + "loss": 0.5195, + "step": 116300 + }, + { + "epoch": 0.3791493894196474, + "grad_norm": 0.7559306025505066, + "learning_rate": 3.1043541709420086e-05, + "loss": 0.4983, + "step": 116400 + }, + { + "epoch": 0.37947511913564363, + "grad_norm": 1.0251268148422241, + "learning_rate": 3.102725469310375e-05, + "loss": 0.5022, + "step": 116500 + }, + { + "epoch": 0.3798008488516399, + "grad_norm": 0.5389842391014099, + "learning_rate": 3.1010967676787424e-05, + "loss": 0.4775, + "step": 116600 + }, + { + "epoch": 0.38012657856763615, + "grad_norm": 0.31104978919029236, + "learning_rate": 3.099468066047109e-05, + "loss": 0.5069, + "step": 116700 + }, + { + "epoch": 0.3804523082836324, + "grad_norm": 0.4198776185512543, + "learning_rate": 3.097839364415475e-05, + "loss": 0.4936, + "step": 116800 + }, + { + "epoch": 0.38077803799962867, + "grad_norm": 0.702704131603241, + "learning_rate": 3.096210662783842e-05, + "loss": 0.4625, + "step": 116900 + }, + { + "epoch": 0.38110376771562493, + "grad_norm": 0.3091365396976471, + "learning_rate": 3.094581961152209e-05, + "loss": 0.4817, + "step": 117000 + }, + { + "epoch": 0.3814294974316212, + "grad_norm": 0.5650610327720642, + "learning_rate": 3.092953259520576e-05, + "loss": 0.5097, + "step": 117100 + }, + { + "epoch": 0.38175522714761745, + "grad_norm": 0.5446620583534241, + "learning_rate": 3.091324557888942e-05, + "loss": 0.5007, + "step": 117200 + }, + { + "epoch": 0.3820809568636137, + "grad_norm": 0.6179624795913696, + "learning_rate": 3.089695856257309e-05, + "loss": 0.5086, + "step": 117300 + }, + { + "epoch": 0.38240668657960997, + "grad_norm": 0.9222896099090576, + "learning_rate": 3.0880671546256757e-05, + "loss": 0.4844, + "step": 117400 + }, + { + "epoch": 0.3827324162956062, + "grad_norm": 0.6701821088790894, + "learning_rate": 3.086438452994042e-05, + "loss": 0.487, + "step": 117500 + }, + { + "epoch": 0.3830581460116025, + "grad_norm": 0.7625011205673218, + "learning_rate": 3.084809751362409e-05, + "loss": 0.512, + "step": 117600 + }, + { + "epoch": 0.38338387572759874, + "grad_norm": 0.40110349655151367, + "learning_rate": 3.0831810497307754e-05, + "loss": 0.486, + "step": 117700 + }, + { + "epoch": 0.383709605443595, + "grad_norm": 0.6916444897651672, + "learning_rate": 3.0815523480991426e-05, + "loss": 0.498, + "step": 117800 + }, + { + "epoch": 0.38403533515959126, + "grad_norm": 0.6750262975692749, + "learning_rate": 3.079923646467509e-05, + "loss": 0.4448, + "step": 117900 + }, + { + "epoch": 0.3843610648755875, + "grad_norm": 0.7050309777259827, + "learning_rate": 3.078294944835876e-05, + "loss": 0.4719, + "step": 118000 + }, + { + "epoch": 0.3846867945915838, + "grad_norm": 0.5331671833992004, + "learning_rate": 3.0766662432042423e-05, + "loss": 0.4755, + "step": 118100 + }, + { + "epoch": 0.38501252430758004, + "grad_norm": 0.7006567716598511, + "learning_rate": 3.075037541572609e-05, + "loss": 0.516, + "step": 118200 + }, + { + "epoch": 0.3853382540235763, + "grad_norm": 0.7282729744911194, + "learning_rate": 3.073408839940976e-05, + "loss": 0.4683, + "step": 118300 + }, + { + "epoch": 0.38566398373957256, + "grad_norm": 0.47892555594444275, + "learning_rate": 3.071780138309343e-05, + "loss": 0.4497, + "step": 118400 + }, + { + "epoch": 0.3859897134555688, + "grad_norm": 0.7894031405448914, + "learning_rate": 3.070151436677709e-05, + "loss": 0.4539, + "step": 118500 + } + ], + "logging_steps": 100, + "max_steps": 307003, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.760864231227392e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}