diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,29277 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 20862, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001438021282714984, + "grad_norm": 0.8895956262211892, + "learning_rate": 1.9999999886614413e-05, + "loss": 0.7503, + "step": 1 + }, + { + "epoch": 0.0007190106413574921, + "grad_norm": 0.7538850969485658, + "learning_rate": 1.9999997165360364e-05, + "loss": 0.7445, + "step": 5 + }, + { + "epoch": 0.0014380212827149843, + "grad_norm": 0.3052316332485418, + "learning_rate": 1.9999988661443057e-05, + "loss": 0.6786, + "step": 10 + }, + { + "epoch": 0.0021570319240724763, + "grad_norm": 0.2899600972103938, + "learning_rate": 1.9999974488252902e-05, + "loss": 0.6686, + "step": 15 + }, + { + "epoch": 0.0028760425654299686, + "grad_norm": 0.26674080063712186, + "learning_rate": 1.9999954645797935e-05, + "loss": 0.6502, + "step": 20 + }, + { + "epoch": 0.0035950532067874604, + "grad_norm": 0.2571378355868916, + "learning_rate": 1.9999929134089406e-05, + "loss": 0.6425, + "step": 25 + }, + { + "epoch": 0.004314063848144953, + "grad_norm": 0.2509279861113437, + "learning_rate": 1.9999897953141777e-05, + "loss": 0.6354, + "step": 30 + }, + { + "epoch": 0.005033074489502445, + "grad_norm": 0.23412598127898154, + "learning_rate": 1.9999861102972723e-05, + "loss": 0.6232, + "step": 35 + }, + { + "epoch": 0.005752085130859937, + "grad_norm": 0.23324654117368399, + "learning_rate": 1.999981858360314e-05, + "loss": 0.6105, + "step": 40 + }, + { + "epoch": 0.0064710957722174285, + "grad_norm": 0.24845353197583306, + "learning_rate": 1.999977039505713e-05, + "loss": 0.6146, + "step": 45 + }, + { + "epoch": 0.007190106413574921, + "grad_norm": 0.24546704448200354, + "learning_rate": 1.9999716537362013e-05, + "loss": 0.6156, + "step": 50 + }, + { + "epoch": 0.007909117054932413, + "grad_norm": 0.2418689562299013, + "learning_rate": 1.9999657010548325e-05, + "loss": 0.616, + "step": 55 + }, + { + "epoch": 0.008628127696289905, + "grad_norm": 0.23965859222142355, + "learning_rate": 1.999959181464981e-05, + "loss": 0.5937, + "step": 60 + }, + { + "epoch": 0.009347138337647398, + "grad_norm": 0.23669570942953966, + "learning_rate": 1.9999520949703432e-05, + "loss": 0.6027, + "step": 65 + }, + { + "epoch": 0.01006614897900489, + "grad_norm": 0.24176708430886543, + "learning_rate": 1.9999444415749365e-05, + "loss": 0.5905, + "step": 70 + }, + { + "epoch": 0.010785159620362382, + "grad_norm": 0.24723478742464217, + "learning_rate": 1.9999362212831e-05, + "loss": 0.6016, + "step": 75 + }, + { + "epoch": 0.011504170261719874, + "grad_norm": 0.2428143716928379, + "learning_rate": 1.9999274340994935e-05, + "loss": 0.5978, + "step": 80 + }, + { + "epoch": 0.012223180903077365, + "grad_norm": 0.2350929588489868, + "learning_rate": 1.999918080029099e-05, + "loss": 0.5669, + "step": 85 + }, + { + "epoch": 0.012942191544434857, + "grad_norm": 0.2280484038192313, + "learning_rate": 1.99990815907722e-05, + "loss": 0.6127, + "step": 90 + }, + { + "epoch": 0.01366120218579235, + "grad_norm": 0.23131386557697947, + "learning_rate": 1.9998976712494805e-05, + "loss": 0.6004, + "step": 95 + }, + { + "epoch": 0.014380212827149842, + "grad_norm": 0.2547551645700165, + "learning_rate": 1.9998866165518264e-05, + "loss": 0.5946, + "step": 100 + }, + { + "epoch": 0.015099223468507334, + "grad_norm": 0.23194950565240102, + "learning_rate": 1.999874994990525e-05, + "loss": 0.5896, + "step": 105 + }, + { + "epoch": 0.015818234109864826, + "grad_norm": 0.2675421206218923, + "learning_rate": 1.9998628065721647e-05, + "loss": 0.6009, + "step": 110 + }, + { + "epoch": 0.016537244751222317, + "grad_norm": 0.2564973354360148, + "learning_rate": 1.999850051303656e-05, + "loss": 0.6146, + "step": 115 + }, + { + "epoch": 0.01725625539257981, + "grad_norm": 0.2702995046995952, + "learning_rate": 1.9998367291922293e-05, + "loss": 0.592, + "step": 120 + }, + { + "epoch": 0.0179752660339373, + "grad_norm": 0.23744671504568096, + "learning_rate": 1.9998228402454384e-05, + "loss": 0.5881, + "step": 125 + }, + { + "epoch": 0.018694276675294795, + "grad_norm": 0.25400484853443966, + "learning_rate": 1.9998083844711563e-05, + "loss": 0.5995, + "step": 130 + }, + { + "epoch": 0.019413287316652286, + "grad_norm": 0.2289330994293863, + "learning_rate": 1.9997933618775787e-05, + "loss": 0.5831, + "step": 135 + }, + { + "epoch": 0.02013229795800978, + "grad_norm": 0.23314947417175733, + "learning_rate": 1.999777772473223e-05, + "loss": 0.588, + "step": 140 + }, + { + "epoch": 0.02085130859936727, + "grad_norm": 0.24609630390061227, + "learning_rate": 1.999761616266926e-05, + "loss": 0.6057, + "step": 145 + }, + { + "epoch": 0.021570319240724764, + "grad_norm": 0.25060806808962244, + "learning_rate": 1.9997448932678482e-05, + "loss": 0.6062, + "step": 150 + }, + { + "epoch": 0.022289329882082255, + "grad_norm": 0.23305588376025257, + "learning_rate": 1.9997276034854698e-05, + "loss": 0.5625, + "step": 155 + }, + { + "epoch": 0.02300834052343975, + "grad_norm": 0.2425323796334993, + "learning_rate": 1.999709746929593e-05, + "loss": 0.5981, + "step": 160 + }, + { + "epoch": 0.02372735116479724, + "grad_norm": 0.2393332584757854, + "learning_rate": 1.9996913236103418e-05, + "loss": 0.5676, + "step": 165 + }, + { + "epoch": 0.02444636180615473, + "grad_norm": 0.25948576809270496, + "learning_rate": 1.9996723335381595e-05, + "loss": 0.5843, + "step": 170 + }, + { + "epoch": 0.025165372447512224, + "grad_norm": 0.24575999197763174, + "learning_rate": 1.9996527767238132e-05, + "loss": 0.5873, + "step": 175 + }, + { + "epoch": 0.025884383088869714, + "grad_norm": 0.25781746644112463, + "learning_rate": 1.9996326531783898e-05, + "loss": 0.6042, + "step": 180 + }, + { + "epoch": 0.026603393730227208, + "grad_norm": 0.23786701467089164, + "learning_rate": 1.999611962913298e-05, + "loss": 0.5777, + "step": 185 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 0.26102003785829764, + "learning_rate": 1.999590705940268e-05, + "loss": 0.5968, + "step": 190 + }, + { + "epoch": 0.028041415012942193, + "grad_norm": 0.24249586062086007, + "learning_rate": 1.9995688822713503e-05, + "loss": 0.6061, + "step": 195 + }, + { + "epoch": 0.028760425654299683, + "grad_norm": 0.23976276635942745, + "learning_rate": 1.9995464919189177e-05, + "loss": 0.5998, + "step": 200 + }, + { + "epoch": 0.029479436295657177, + "grad_norm": 0.252632644612971, + "learning_rate": 1.9995235348956643e-05, + "loss": 0.5811, + "step": 205 + }, + { + "epoch": 0.030198446937014668, + "grad_norm": 0.2278841743646487, + "learning_rate": 1.9995000112146045e-05, + "loss": 0.5829, + "step": 210 + }, + { + "epoch": 0.03091745757837216, + "grad_norm": 0.2357301218659176, + "learning_rate": 1.9994759208890744e-05, + "loss": 0.5936, + "step": 215 + }, + { + "epoch": 0.03163646821972965, + "grad_norm": 0.2504255718005453, + "learning_rate": 1.999451263932732e-05, + "loss": 0.6065, + "step": 220 + }, + { + "epoch": 0.032355478861087146, + "grad_norm": 0.2642191490569926, + "learning_rate": 1.999426040359556e-05, + "loss": 0.5857, + "step": 225 + }, + { + "epoch": 0.03307448950244463, + "grad_norm": 0.24247603256650757, + "learning_rate": 1.999400250183846e-05, + "loss": 0.5933, + "step": 230 + }, + { + "epoch": 0.03379350014380213, + "grad_norm": 0.31604085503458695, + "learning_rate": 1.9993738934202235e-05, + "loss": 0.567, + "step": 235 + }, + { + "epoch": 0.03451251078515962, + "grad_norm": 0.23670027365405064, + "learning_rate": 1.9993469700836307e-05, + "loss": 0.5642, + "step": 240 + }, + { + "epoch": 0.035231521426517115, + "grad_norm": 0.23669625898006572, + "learning_rate": 1.999319480189331e-05, + "loss": 0.5789, + "step": 245 + }, + { + "epoch": 0.0359505320678746, + "grad_norm": 0.24845579465645523, + "learning_rate": 1.9992914237529094e-05, + "loss": 0.5847, + "step": 250 + }, + { + "epoch": 0.036669542709232096, + "grad_norm": 0.23450615631614805, + "learning_rate": 1.9992628007902718e-05, + "loss": 0.5849, + "step": 255 + }, + { + "epoch": 0.03738855335058959, + "grad_norm": 0.30173595777351025, + "learning_rate": 1.999233611317646e-05, + "loss": 0.5802, + "step": 260 + }, + { + "epoch": 0.038107563991947084, + "grad_norm": 0.2414617541078757, + "learning_rate": 1.9992038553515792e-05, + "loss": 0.5791, + "step": 265 + }, + { + "epoch": 0.03882657463330457, + "grad_norm": 0.24955545431976164, + "learning_rate": 1.9991735329089416e-05, + "loss": 0.5781, + "step": 270 + }, + { + "epoch": 0.039545585274662065, + "grad_norm": 0.2681517501448067, + "learning_rate": 1.999142644006924e-05, + "loss": 0.5738, + "step": 275 + }, + { + "epoch": 0.04026459591601956, + "grad_norm": 0.24569712912592853, + "learning_rate": 1.9991111886630375e-05, + "loss": 0.5719, + "step": 280 + }, + { + "epoch": 0.040983606557377046, + "grad_norm": 0.25324852483277277, + "learning_rate": 1.9990791668951155e-05, + "loss": 0.5783, + "step": 285 + }, + { + "epoch": 0.04170261719873454, + "grad_norm": 0.2353776930936098, + "learning_rate": 1.9990465787213118e-05, + "loss": 0.5749, + "step": 290 + }, + { + "epoch": 0.042421627840092034, + "grad_norm": 0.2672442244356293, + "learning_rate": 1.999013424160102e-05, + "loss": 0.5844, + "step": 295 + }, + { + "epoch": 0.04314063848144953, + "grad_norm": 0.2547192640704089, + "learning_rate": 1.998979703230282e-05, + "loss": 0.5901, + "step": 300 + }, + { + "epoch": 0.043859649122807015, + "grad_norm": 0.24689548800613587, + "learning_rate": 1.998945415950969e-05, + "loss": 0.5637, + "step": 305 + }, + { + "epoch": 0.04457865976416451, + "grad_norm": 0.24720149594694088, + "learning_rate": 1.9989105623416014e-05, + "loss": 0.5692, + "step": 310 + }, + { + "epoch": 0.045297670405522, + "grad_norm": 0.24942201632003652, + "learning_rate": 1.998875142421939e-05, + "loss": 0.5877, + "step": 315 + }, + { + "epoch": 0.0460166810468795, + "grad_norm": 0.24826746598996405, + "learning_rate": 1.998839156212062e-05, + "loss": 0.567, + "step": 320 + }, + { + "epoch": 0.046735691688236984, + "grad_norm": 0.23321079253402682, + "learning_rate": 1.9988026037323728e-05, + "loss": 0.5837, + "step": 325 + }, + { + "epoch": 0.04745470232959448, + "grad_norm": 0.23600090147065284, + "learning_rate": 1.9987654850035926e-05, + "loss": 0.5706, + "step": 330 + }, + { + "epoch": 0.04817371297095197, + "grad_norm": 0.23188518123042903, + "learning_rate": 1.9987278000467665e-05, + "loss": 0.5693, + "step": 335 + }, + { + "epoch": 0.04889272361230946, + "grad_norm": 0.2299076677891295, + "learning_rate": 1.998689548883258e-05, + "loss": 0.5649, + "step": 340 + }, + { + "epoch": 0.04961173425366695, + "grad_norm": 0.24272821845012257, + "learning_rate": 1.9986507315347535e-05, + "loss": 0.5731, + "step": 345 + }, + { + "epoch": 0.05033074489502445, + "grad_norm": 0.24883260991609574, + "learning_rate": 1.9986113480232598e-05, + "loss": 0.5684, + "step": 350 + }, + { + "epoch": 0.05104975553638194, + "grad_norm": 0.24361728388598086, + "learning_rate": 1.9985713983711034e-05, + "loss": 0.5703, + "step": 355 + }, + { + "epoch": 0.05176876617773943, + "grad_norm": 0.24082816776797009, + "learning_rate": 1.998530882600934e-05, + "loss": 0.5698, + "step": 360 + }, + { + "epoch": 0.05248777681909692, + "grad_norm": 0.28034003180068806, + "learning_rate": 1.9984898007357203e-05, + "loss": 0.5792, + "step": 365 + }, + { + "epoch": 0.053206787460454416, + "grad_norm": 0.2389126953535011, + "learning_rate": 1.9984481527987535e-05, + "loss": 0.585, + "step": 370 + }, + { + "epoch": 0.05392579810181191, + "grad_norm": 0.2604223693815005, + "learning_rate": 1.9984059388136448e-05, + "loss": 0.5841, + "step": 375 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 0.2241690714850129, + "learning_rate": 1.998363158804326e-05, + "loss": 0.5703, + "step": 380 + }, + { + "epoch": 0.05536381938452689, + "grad_norm": 0.2337180098567318, + "learning_rate": 1.9983198127950507e-05, + "loss": 0.5629, + "step": 385 + }, + { + "epoch": 0.056082830025884385, + "grad_norm": 0.2571830946809361, + "learning_rate": 1.9982759008103926e-05, + "loss": 0.5528, + "step": 390 + }, + { + "epoch": 0.05680184066724187, + "grad_norm": 0.23443323460830098, + "learning_rate": 1.9982314228752474e-05, + "loss": 0.5518, + "step": 395 + }, + { + "epoch": 0.057520851308599366, + "grad_norm": 0.2373171025532724, + "learning_rate": 1.9981863790148303e-05, + "loss": 0.5646, + "step": 400 + }, + { + "epoch": 0.05823986194995686, + "grad_norm": 0.23501490363911096, + "learning_rate": 1.9981407692546776e-05, + "loss": 0.5798, + "step": 405 + }, + { + "epoch": 0.058958872591314354, + "grad_norm": 0.2277225203925688, + "learning_rate": 1.9980945936206475e-05, + "loss": 0.5549, + "step": 410 + }, + { + "epoch": 0.05967788323267184, + "grad_norm": 0.228725824660234, + "learning_rate": 1.998047852138918e-05, + "loss": 0.5702, + "step": 415 + }, + { + "epoch": 0.060396893874029335, + "grad_norm": 0.24468329140247444, + "learning_rate": 1.9980005448359878e-05, + "loss": 0.5802, + "step": 420 + }, + { + "epoch": 0.06111590451538683, + "grad_norm": 0.24887287731815272, + "learning_rate": 1.997952671738677e-05, + "loss": 0.5541, + "step": 425 + }, + { + "epoch": 0.06183491515674432, + "grad_norm": 0.2345470777925773, + "learning_rate": 1.9979042328741264e-05, + "loss": 0.5751, + "step": 430 + }, + { + "epoch": 0.06255392579810182, + "grad_norm": 0.22675626648422811, + "learning_rate": 1.997855228269797e-05, + "loss": 0.5645, + "step": 435 + }, + { + "epoch": 0.0632729364394593, + "grad_norm": 0.22969452073158558, + "learning_rate": 1.997805657953471e-05, + "loss": 0.5576, + "step": 440 + }, + { + "epoch": 0.06399194708081679, + "grad_norm": 0.23819343640060633, + "learning_rate": 1.9977555219532512e-05, + "loss": 0.5614, + "step": 445 + }, + { + "epoch": 0.06471095772217429, + "grad_norm": 0.22449524684154257, + "learning_rate": 1.997704820297561e-05, + "loss": 0.5632, + "step": 450 + }, + { + "epoch": 0.06542996836353178, + "grad_norm": 0.2281428133225678, + "learning_rate": 1.9976535530151447e-05, + "loss": 0.5668, + "step": 455 + }, + { + "epoch": 0.06614897900488927, + "grad_norm": 0.22733576739594663, + "learning_rate": 1.997601720135067e-05, + "loss": 0.5559, + "step": 460 + }, + { + "epoch": 0.06686798964624677, + "grad_norm": 0.25848007315376675, + "learning_rate": 1.9975493216867143e-05, + "loss": 0.561, + "step": 465 + }, + { + "epoch": 0.06758700028760425, + "grad_norm": 0.2589607216199232, + "learning_rate": 1.9974963576997912e-05, + "loss": 0.556, + "step": 470 + }, + { + "epoch": 0.06830601092896176, + "grad_norm": 0.25234683129298624, + "learning_rate": 1.9974428282043255e-05, + "loss": 0.5596, + "step": 475 + }, + { + "epoch": 0.06902502157031924, + "grad_norm": 0.23087115529102087, + "learning_rate": 1.9973887332306648e-05, + "loss": 0.5668, + "step": 480 + }, + { + "epoch": 0.06974403221167673, + "grad_norm": 0.23159311014909575, + "learning_rate": 1.997334072809476e-05, + "loss": 0.5483, + "step": 485 + }, + { + "epoch": 0.07046304285303423, + "grad_norm": 0.2380679120871314, + "learning_rate": 1.9972788469717483e-05, + "loss": 0.5506, + "step": 490 + }, + { + "epoch": 0.07118205349439172, + "grad_norm": 0.23474547391570408, + "learning_rate": 1.9972230557487908e-05, + "loss": 0.5647, + "step": 495 + }, + { + "epoch": 0.0719010641357492, + "grad_norm": 0.2369243370447207, + "learning_rate": 1.997166699172233e-05, + "loss": 0.5837, + "step": 500 + }, + { + "epoch": 0.0726200747771067, + "grad_norm": 0.2314228917745866, + "learning_rate": 1.9971097772740248e-05, + "loss": 0.5685, + "step": 505 + }, + { + "epoch": 0.07333908541846419, + "grad_norm": 0.22604831031414893, + "learning_rate": 1.997052290086437e-05, + "loss": 0.553, + "step": 510 + }, + { + "epoch": 0.07405809605982168, + "grad_norm": 0.23479155122370488, + "learning_rate": 1.9969942376420606e-05, + "loss": 0.5693, + "step": 515 + }, + { + "epoch": 0.07477710670117918, + "grad_norm": 0.24154952916904426, + "learning_rate": 1.9969356199738076e-05, + "loss": 0.5559, + "step": 520 + }, + { + "epoch": 0.07549611734253667, + "grad_norm": 0.24234210645824733, + "learning_rate": 1.9968764371149098e-05, + "loss": 0.5763, + "step": 525 + }, + { + "epoch": 0.07621512798389417, + "grad_norm": 0.23512860396843185, + "learning_rate": 1.996816689098919e-05, + "loss": 0.5623, + "step": 530 + }, + { + "epoch": 0.07693413862525166, + "grad_norm": 0.23778492447875255, + "learning_rate": 1.9967563759597084e-05, + "loss": 0.5546, + "step": 535 + }, + { + "epoch": 0.07765314926660914, + "grad_norm": 0.2306104882528985, + "learning_rate": 1.9966954977314717e-05, + "loss": 0.5613, + "step": 540 + }, + { + "epoch": 0.07837215990796664, + "grad_norm": 0.25470531407410457, + "learning_rate": 1.9966340544487214e-05, + "loss": 0.5678, + "step": 545 + }, + { + "epoch": 0.07909117054932413, + "grad_norm": 0.2549311232751504, + "learning_rate": 1.996572046146293e-05, + "loss": 0.5641, + "step": 550 + }, + { + "epoch": 0.07981018119068162, + "grad_norm": 0.23736262691577187, + "learning_rate": 1.996509472859339e-05, + "loss": 0.5708, + "step": 555 + }, + { + "epoch": 0.08052919183203912, + "grad_norm": 0.23789179184218126, + "learning_rate": 1.996446334623335e-05, + "loss": 0.5747, + "step": 560 + }, + { + "epoch": 0.0812482024733966, + "grad_norm": 0.24658441392917815, + "learning_rate": 1.9963826314740755e-05, + "loss": 0.5715, + "step": 565 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.23122288100315114, + "learning_rate": 1.9963183634476757e-05, + "loss": 0.5596, + "step": 570 + }, + { + "epoch": 0.0826862237561116, + "grad_norm": 0.24086550214425853, + "learning_rate": 1.996253530580571e-05, + "loss": 0.5711, + "step": 575 + }, + { + "epoch": 0.08340523439746908, + "grad_norm": 0.24735019428776434, + "learning_rate": 1.9961881329095167e-05, + "loss": 0.5787, + "step": 580 + }, + { + "epoch": 0.08412424503882658, + "grad_norm": 0.24048575173417583, + "learning_rate": 1.9961221704715886e-05, + "loss": 0.569, + "step": 585 + }, + { + "epoch": 0.08484325568018407, + "grad_norm": 0.23036818476348792, + "learning_rate": 1.996055643304183e-05, + "loss": 0.5725, + "step": 590 + }, + { + "epoch": 0.08556226632154156, + "grad_norm": 0.23658089750158737, + "learning_rate": 1.995988551445016e-05, + "loss": 0.5526, + "step": 595 + }, + { + "epoch": 0.08628127696289906, + "grad_norm": 0.24266345921678414, + "learning_rate": 1.9959208949321234e-05, + "loss": 0.5695, + "step": 600 + }, + { + "epoch": 0.08700028760425654, + "grad_norm": 0.22811106526417912, + "learning_rate": 1.9958526738038618e-05, + "loss": 0.5651, + "step": 605 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 0.2421343124759253, + "learning_rate": 1.9957838880989076e-05, + "loss": 0.5651, + "step": 610 + }, + { + "epoch": 0.08843830888697153, + "grad_norm": 0.24224078684978484, + "learning_rate": 1.9957145378562574e-05, + "loss": 0.5565, + "step": 615 + }, + { + "epoch": 0.08915731952832902, + "grad_norm": 0.23449828939088413, + "learning_rate": 1.995644623115228e-05, + "loss": 0.557, + "step": 620 + }, + { + "epoch": 0.0898763301696865, + "grad_norm": 0.2354090594428972, + "learning_rate": 1.9955741439154557e-05, + "loss": 0.5601, + "step": 625 + }, + { + "epoch": 0.090595340811044, + "grad_norm": 0.23429370403590513, + "learning_rate": 1.9955031002968972e-05, + "loss": 0.5436, + "step": 630 + }, + { + "epoch": 0.0913143514524015, + "grad_norm": 0.23909034721910113, + "learning_rate": 1.995431492299829e-05, + "loss": 0.5438, + "step": 635 + }, + { + "epoch": 0.092033362093759, + "grad_norm": 0.2546003626324708, + "learning_rate": 1.9953593199648484e-05, + "loss": 0.552, + "step": 640 + }, + { + "epoch": 0.09275237273511648, + "grad_norm": 0.24340505218858643, + "learning_rate": 1.9952865833328707e-05, + "loss": 0.545, + "step": 645 + }, + { + "epoch": 0.09347138337647397, + "grad_norm": 0.235160724181661, + "learning_rate": 1.9952132824451333e-05, + "loss": 0.5443, + "step": 650 + }, + { + "epoch": 0.09419039401783147, + "grad_norm": 0.2304357738930148, + "learning_rate": 1.995139417343192e-05, + "loss": 0.5588, + "step": 655 + }, + { + "epoch": 0.09490940465918896, + "grad_norm": 0.24348777956804377, + "learning_rate": 1.995064988068923e-05, + "loss": 0.5734, + "step": 660 + }, + { + "epoch": 0.09562841530054644, + "grad_norm": 0.25544009282187286, + "learning_rate": 1.994989994664523e-05, + "loss": 0.5562, + "step": 665 + }, + { + "epoch": 0.09634742594190394, + "grad_norm": 0.23348945321713513, + "learning_rate": 1.994914437172507e-05, + "loss": 0.5546, + "step": 670 + }, + { + "epoch": 0.09706643658326143, + "grad_norm": 0.23369100591763928, + "learning_rate": 1.9948383156357112e-05, + "loss": 0.5609, + "step": 675 + }, + { + "epoch": 0.09778544722461892, + "grad_norm": 0.2506151417862584, + "learning_rate": 1.9947616300972906e-05, + "loss": 0.5782, + "step": 680 + }, + { + "epoch": 0.09850445786597642, + "grad_norm": 0.22590797440006433, + "learning_rate": 1.994684380600721e-05, + "loss": 0.5466, + "step": 685 + }, + { + "epoch": 0.0992234685073339, + "grad_norm": 0.23440225124281835, + "learning_rate": 1.9946065671897965e-05, + "loss": 0.546, + "step": 690 + }, + { + "epoch": 0.09994247914869141, + "grad_norm": 0.2322606378315528, + "learning_rate": 1.9945281899086325e-05, + "loss": 0.5614, + "step": 695 + }, + { + "epoch": 0.1006614897900489, + "grad_norm": 0.22932823503652058, + "learning_rate": 1.9944492488016623e-05, + "loss": 0.5709, + "step": 700 + }, + { + "epoch": 0.10138050043140638, + "grad_norm": 0.22956801014845277, + "learning_rate": 1.994369743913641e-05, + "loss": 0.5546, + "step": 705 + }, + { + "epoch": 0.10209951107276388, + "grad_norm": 0.2452051181302563, + "learning_rate": 1.9942896752896413e-05, + "loss": 0.5503, + "step": 710 + }, + { + "epoch": 0.10281852171412137, + "grad_norm": 0.2431334061890164, + "learning_rate": 1.9942090429750564e-05, + "loss": 0.5677, + "step": 715 + }, + { + "epoch": 0.10353753235547886, + "grad_norm": 0.23416786364574083, + "learning_rate": 1.9941278470155993e-05, + "loss": 0.5493, + "step": 720 + }, + { + "epoch": 0.10425654299683636, + "grad_norm": 0.2338646746203413, + "learning_rate": 1.9940460874573025e-05, + "loss": 0.5537, + "step": 725 + }, + { + "epoch": 0.10497555363819384, + "grad_norm": 0.24660941049936622, + "learning_rate": 1.993963764346517e-05, + "loss": 0.5545, + "step": 730 + }, + { + "epoch": 0.10569456427955133, + "grad_norm": 0.224186981201213, + "learning_rate": 1.9938808777299145e-05, + "loss": 0.546, + "step": 735 + }, + { + "epoch": 0.10641357492090883, + "grad_norm": 0.2416482602683721, + "learning_rate": 1.993797427654486e-05, + "loss": 0.5444, + "step": 740 + }, + { + "epoch": 0.10713258556226632, + "grad_norm": 0.23866160212304743, + "learning_rate": 1.993713414167541e-05, + "loss": 0.5566, + "step": 745 + }, + { + "epoch": 0.10785159620362382, + "grad_norm": 0.23920491517081066, + "learning_rate": 1.9936288373167095e-05, + "loss": 0.5541, + "step": 750 + }, + { + "epoch": 0.10857060684498131, + "grad_norm": 0.23148987601367907, + "learning_rate": 1.9935436971499408e-05, + "loss": 0.5532, + "step": 755 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.2709530628970777, + "learning_rate": 1.993457993715503e-05, + "loss": 0.5511, + "step": 760 + }, + { + "epoch": 0.1100086281276963, + "grad_norm": 0.23234198844872012, + "learning_rate": 1.9933717270619835e-05, + "loss": 0.5515, + "step": 765 + }, + { + "epoch": 0.11072763876905378, + "grad_norm": 0.23109806096717964, + "learning_rate": 1.9932848972382895e-05, + "loss": 0.5386, + "step": 770 + }, + { + "epoch": 0.11144664941041127, + "grad_norm": 0.2449388533996965, + "learning_rate": 1.9931975042936473e-05, + "loss": 0.5444, + "step": 775 + }, + { + "epoch": 0.11216566005176877, + "grad_norm": 0.22958192290078638, + "learning_rate": 1.993109548277602e-05, + "loss": 0.5538, + "step": 780 + }, + { + "epoch": 0.11288467069312626, + "grad_norm": 0.2358876835459476, + "learning_rate": 1.9930210292400186e-05, + "loss": 0.559, + "step": 785 + }, + { + "epoch": 0.11360368133448374, + "grad_norm": 0.2403799019112171, + "learning_rate": 1.9929319472310814e-05, + "loss": 0.5472, + "step": 790 + }, + { + "epoch": 0.11432269197584125, + "grad_norm": 0.23364592975387316, + "learning_rate": 1.992842302301293e-05, + "loss": 0.5514, + "step": 795 + }, + { + "epoch": 0.11504170261719873, + "grad_norm": 0.23389566476752166, + "learning_rate": 1.9927520945014757e-05, + "loss": 0.5539, + "step": 800 + }, + { + "epoch": 0.11576071325855623, + "grad_norm": 0.40418569681544375, + "learning_rate": 1.992661323882771e-05, + "loss": 0.5548, + "step": 805 + }, + { + "epoch": 0.11647972389991372, + "grad_norm": 0.24462456666343171, + "learning_rate": 1.992569990496639e-05, + "loss": 0.5468, + "step": 810 + }, + { + "epoch": 0.11719873454127121, + "grad_norm": 0.23888748035425905, + "learning_rate": 1.9924780943948595e-05, + "loss": 0.5727, + "step": 815 + }, + { + "epoch": 0.11791774518262871, + "grad_norm": 0.231850528615198, + "learning_rate": 1.9923856356295306e-05, + "loss": 0.5473, + "step": 820 + }, + { + "epoch": 0.1186367558239862, + "grad_norm": 0.2396817797554619, + "learning_rate": 1.9922926142530698e-05, + "loss": 0.5605, + "step": 825 + }, + { + "epoch": 0.11935576646534368, + "grad_norm": 0.22363201399811253, + "learning_rate": 1.9921990303182138e-05, + "loss": 0.5558, + "step": 830 + }, + { + "epoch": 0.12007477710670118, + "grad_norm": 0.23021616719333593, + "learning_rate": 1.992104883878018e-05, + "loss": 0.5767, + "step": 835 + }, + { + "epoch": 0.12079378774805867, + "grad_norm": 0.23380857983669595, + "learning_rate": 1.992010174985856e-05, + "loss": 0.5521, + "step": 840 + }, + { + "epoch": 0.12151279838941616, + "grad_norm": 0.22729003181704024, + "learning_rate": 1.9919149036954216e-05, + "loss": 0.5472, + "step": 845 + }, + { + "epoch": 0.12223180903077366, + "grad_norm": 0.23937887856660198, + "learning_rate": 1.9918190700607267e-05, + "loss": 0.5495, + "step": 850 + }, + { + "epoch": 0.12295081967213115, + "grad_norm": 0.23791753701672147, + "learning_rate": 1.9917226741361014e-05, + "loss": 0.5538, + "step": 855 + }, + { + "epoch": 0.12366983031348865, + "grad_norm": 0.23694673340103434, + "learning_rate": 1.9916257159761964e-05, + "loss": 0.5468, + "step": 860 + }, + { + "epoch": 0.12438884095484613, + "grad_norm": 0.2333089213103549, + "learning_rate": 1.9915281956359788e-05, + "loss": 0.5447, + "step": 865 + }, + { + "epoch": 0.12510785159620363, + "grad_norm": 0.227723221504762, + "learning_rate": 1.991430113170736e-05, + "loss": 0.5579, + "step": 870 + }, + { + "epoch": 0.12582686223756112, + "grad_norm": 0.23086865156496933, + "learning_rate": 1.9913314686360744e-05, + "loss": 0.5625, + "step": 875 + }, + { + "epoch": 0.1265458728789186, + "grad_norm": 0.25490421224303816, + "learning_rate": 1.991232262087917e-05, + "loss": 0.5498, + "step": 880 + }, + { + "epoch": 0.1272648835202761, + "grad_norm": 0.25322058897047683, + "learning_rate": 1.9911324935825083e-05, + "loss": 0.5467, + "step": 885 + }, + { + "epoch": 0.12798389416163358, + "grad_norm": 0.24209252391434005, + "learning_rate": 1.9910321631764083e-05, + "loss": 0.5554, + "step": 890 + }, + { + "epoch": 0.1287029048029911, + "grad_norm": 0.23150880614620745, + "learning_rate": 1.9909312709264982e-05, + "loss": 0.5522, + "step": 895 + }, + { + "epoch": 0.12942191544434858, + "grad_norm": 0.22637748606237124, + "learning_rate": 1.9908298168899764e-05, + "loss": 0.5605, + "step": 900 + }, + { + "epoch": 0.13014092608570607, + "grad_norm": 0.23657431868503107, + "learning_rate": 1.9907278011243598e-05, + "loss": 0.5469, + "step": 905 + }, + { + "epoch": 0.13085993672706356, + "grad_norm": 0.23323393650672994, + "learning_rate": 1.9906252236874842e-05, + "loss": 0.5574, + "step": 910 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 0.2242261540344663, + "learning_rate": 1.990522084637503e-05, + "loss": 0.5435, + "step": 915 + }, + { + "epoch": 0.13229795800977853, + "grad_norm": 0.2399896547903275, + "learning_rate": 1.99041838403289e-05, + "loss": 0.5497, + "step": 920 + }, + { + "epoch": 0.13301696865113605, + "grad_norm": 0.22799645707237384, + "learning_rate": 1.9903141219324346e-05, + "loss": 0.5344, + "step": 925 + }, + { + "epoch": 0.13373597929249353, + "grad_norm": 0.24695130258598189, + "learning_rate": 1.9902092983952464e-05, + "loss": 0.5608, + "step": 930 + }, + { + "epoch": 0.13445498993385102, + "grad_norm": 0.2340756439929401, + "learning_rate": 1.9901039134807528e-05, + "loss": 0.5381, + "step": 935 + }, + { + "epoch": 0.1351740005752085, + "grad_norm": 0.2312762255986081, + "learning_rate": 1.9899979672486997e-05, + "loss": 0.556, + "step": 940 + }, + { + "epoch": 0.135893011216566, + "grad_norm": 0.2310236580618276, + "learning_rate": 1.9898914597591504e-05, + "loss": 0.5327, + "step": 945 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.24017163230048633, + "learning_rate": 1.9897843910724877e-05, + "loss": 0.5608, + "step": 950 + }, + { + "epoch": 0.137331032499281, + "grad_norm": 0.23318572231200663, + "learning_rate": 1.989676761249411e-05, + "loss": 0.541, + "step": 955 + }, + { + "epoch": 0.13805004314063848, + "grad_norm": 0.21727740276615842, + "learning_rate": 1.9895685703509393e-05, + "loss": 0.542, + "step": 960 + }, + { + "epoch": 0.13876905378199597, + "grad_norm": 0.24073882046706868, + "learning_rate": 1.989459818438409e-05, + "loss": 0.5704, + "step": 965 + }, + { + "epoch": 0.13948806442335346, + "grad_norm": 0.2331323920025414, + "learning_rate": 1.989350505573474e-05, + "loss": 0.5622, + "step": 970 + }, + { + "epoch": 0.14020707506471095, + "grad_norm": 0.2339887752455901, + "learning_rate": 1.9892406318181075e-05, + "loss": 0.5253, + "step": 975 + }, + { + "epoch": 0.14092608570606846, + "grad_norm": 0.22730026395412942, + "learning_rate": 1.9891301972345993e-05, + "loss": 0.5663, + "step": 980 + }, + { + "epoch": 0.14164509634742595, + "grad_norm": 0.24588319237645848, + "learning_rate": 1.9890192018855587e-05, + "loss": 0.563, + "step": 985 + }, + { + "epoch": 0.14236410698878343, + "grad_norm": 0.23378060948352938, + "learning_rate": 1.9889076458339116e-05, + "loss": 0.5579, + "step": 990 + }, + { + "epoch": 0.14308311763014092, + "grad_norm": 0.23722178398720728, + "learning_rate": 1.988795529142902e-05, + "loss": 0.5408, + "step": 995 + }, + { + "epoch": 0.1438021282714984, + "grad_norm": 0.23988344715061594, + "learning_rate": 1.9886828518760925e-05, + "loss": 0.5265, + "step": 1000 + }, + { + "epoch": 0.14452113891285592, + "grad_norm": 0.24275883896253647, + "learning_rate": 1.9885696140973625e-05, + "loss": 0.5414, + "step": 1005 + }, + { + "epoch": 0.1452401495542134, + "grad_norm": 0.2346568620768657, + "learning_rate": 1.9884558158709103e-05, + "loss": 0.5407, + "step": 1010 + }, + { + "epoch": 0.1459591601955709, + "grad_norm": 0.2263203624484671, + "learning_rate": 1.9883414572612506e-05, + "loss": 0.5391, + "step": 1015 + }, + { + "epoch": 0.14667817083692838, + "grad_norm": 0.23593016127036032, + "learning_rate": 1.988226538333217e-05, + "loss": 0.5333, + "step": 1020 + }, + { + "epoch": 0.14739718147828587, + "grad_norm": 0.23969237664732787, + "learning_rate": 1.98811105915196e-05, + "loss": 0.5421, + "step": 1025 + }, + { + "epoch": 0.14811619211964336, + "grad_norm": 0.22835038314625386, + "learning_rate": 1.9879950197829477e-05, + "loss": 0.5538, + "step": 1030 + }, + { + "epoch": 0.14883520276100087, + "grad_norm": 0.24277098351604232, + "learning_rate": 1.9878784202919668e-05, + "loss": 0.5496, + "step": 1035 + }, + { + "epoch": 0.14955421340235836, + "grad_norm": 0.24541139573862844, + "learning_rate": 1.9877612607451203e-05, + "loss": 0.5493, + "step": 1040 + }, + { + "epoch": 0.15027322404371585, + "grad_norm": 0.23602266976589184, + "learning_rate": 1.9876435412088292e-05, + "loss": 0.5392, + "step": 1045 + }, + { + "epoch": 0.15099223468507333, + "grad_norm": 0.23870568698917677, + "learning_rate": 1.987525261749832e-05, + "loss": 0.5433, + "step": 1050 + }, + { + "epoch": 0.15171124532643082, + "grad_norm": 0.23740753927088906, + "learning_rate": 1.9874064224351846e-05, + "loss": 0.5467, + "step": 1055 + }, + { + "epoch": 0.15243025596778834, + "grad_norm": 0.2374693346992944, + "learning_rate": 1.987287023332261e-05, + "loss": 0.5541, + "step": 1060 + }, + { + "epoch": 0.15314926660914582, + "grad_norm": 0.23345027765310092, + "learning_rate": 1.987167064508751e-05, + "loss": 0.5535, + "step": 1065 + }, + { + "epoch": 0.1538682772505033, + "grad_norm": 0.22219741109666344, + "learning_rate": 1.9870465460326628e-05, + "loss": 0.5441, + "step": 1070 + }, + { + "epoch": 0.1545872878918608, + "grad_norm": 0.2359136000366983, + "learning_rate": 1.9869254679723222e-05, + "loss": 0.5513, + "step": 1075 + }, + { + "epoch": 0.15530629853321828, + "grad_norm": 0.23894326584217548, + "learning_rate": 1.986803830396371e-05, + "loss": 0.5478, + "step": 1080 + }, + { + "epoch": 0.15602530917457577, + "grad_norm": 0.23669074673568327, + "learning_rate": 1.9866816333737694e-05, + "loss": 0.5463, + "step": 1085 + }, + { + "epoch": 0.1567443198159333, + "grad_norm": 0.2294896030560247, + "learning_rate": 1.9865588769737944e-05, + "loss": 0.548, + "step": 1090 + }, + { + "epoch": 0.15746333045729077, + "grad_norm": 0.22698285930341772, + "learning_rate": 1.9864355612660397e-05, + "loss": 0.5567, + "step": 1095 + }, + { + "epoch": 0.15818234109864826, + "grad_norm": 0.2195602706946871, + "learning_rate": 1.9863116863204165e-05, + "loss": 0.5371, + "step": 1100 + }, + { + "epoch": 0.15890135174000575, + "grad_norm": 0.23219751677955974, + "learning_rate": 1.9861872522071532e-05, + "loss": 0.5308, + "step": 1105 + }, + { + "epoch": 0.15962036238136323, + "grad_norm": 0.22608054174949835, + "learning_rate": 1.9860622589967946e-05, + "loss": 0.5327, + "step": 1110 + }, + { + "epoch": 0.16033937302272075, + "grad_norm": 0.2295650963912051, + "learning_rate": 1.985936706760203e-05, + "loss": 0.5443, + "step": 1115 + }, + { + "epoch": 0.16105838366407824, + "grad_norm": 0.2291500016959589, + "learning_rate": 1.985810595568558e-05, + "loss": 0.5317, + "step": 1120 + }, + { + "epoch": 0.16177739430543572, + "grad_norm": 0.22975864734060938, + "learning_rate": 1.9856839254933545e-05, + "loss": 0.5206, + "step": 1125 + }, + { + "epoch": 0.1624964049467932, + "grad_norm": 0.22403915883013656, + "learning_rate": 1.9855566966064062e-05, + "loss": 0.5432, + "step": 1130 + }, + { + "epoch": 0.1632154155881507, + "grad_norm": 0.23054931238613238, + "learning_rate": 1.9854289089798422e-05, + "loss": 0.5497, + "step": 1135 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.23378976237458074, + "learning_rate": 1.985300562686109e-05, + "loss": 0.5382, + "step": 1140 + }, + { + "epoch": 0.1646534368708657, + "grad_norm": 0.2453434720853162, + "learning_rate": 1.98517165779797e-05, + "loss": 0.5522, + "step": 1145 + }, + { + "epoch": 0.1653724475122232, + "grad_norm": 0.23126619905706874, + "learning_rate": 1.9850421943885045e-05, + "loss": 0.5256, + "step": 1150 + }, + { + "epoch": 0.16609145815358067, + "grad_norm": 0.23933710626023538, + "learning_rate": 1.9849121725311094e-05, + "loss": 0.5363, + "step": 1155 + }, + { + "epoch": 0.16681046879493816, + "grad_norm": 0.23609953921386437, + "learning_rate": 1.984781592299497e-05, + "loss": 0.5338, + "step": 1160 + }, + { + "epoch": 0.16752947943629565, + "grad_norm": 0.22988073789336907, + "learning_rate": 1.984650453767698e-05, + "loss": 0.5213, + "step": 1165 + }, + { + "epoch": 0.16824849007765316, + "grad_norm": 0.22991077323241457, + "learning_rate": 1.9845187570100576e-05, + "loss": 0.5415, + "step": 1170 + }, + { + "epoch": 0.16896750071901065, + "grad_norm": 0.22750061988091566, + "learning_rate": 1.9843865021012386e-05, + "loss": 0.5498, + "step": 1175 + }, + { + "epoch": 0.16968651136036814, + "grad_norm": 0.237248966845271, + "learning_rate": 1.9842536891162202e-05, + "loss": 0.5599, + "step": 1180 + }, + { + "epoch": 0.17040552200172562, + "grad_norm": 0.23234132221437664, + "learning_rate": 1.984120318130297e-05, + "loss": 0.5475, + "step": 1185 + }, + { + "epoch": 0.1711245326430831, + "grad_norm": 0.2419059605223682, + "learning_rate": 1.983986389219082e-05, + "loss": 0.5428, + "step": 1190 + }, + { + "epoch": 0.1718435432844406, + "grad_norm": 0.22907754436174932, + "learning_rate": 1.9838519024585025e-05, + "loss": 0.552, + "step": 1195 + }, + { + "epoch": 0.1725625539257981, + "grad_norm": 0.21402770173233282, + "learning_rate": 1.9837168579248027e-05, + "loss": 0.5276, + "step": 1200 + }, + { + "epoch": 0.1732815645671556, + "grad_norm": 0.227098258804778, + "learning_rate": 1.983581255694543e-05, + "loss": 0.5415, + "step": 1205 + }, + { + "epoch": 0.1740005752085131, + "grad_norm": 0.2491045684374358, + "learning_rate": 1.983445095844601e-05, + "loss": 0.5439, + "step": 1210 + }, + { + "epoch": 0.17471958584987057, + "grad_norm": 0.22501977101506793, + "learning_rate": 1.9833083784521687e-05, + "loss": 0.5392, + "step": 1215 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 0.22291022506238115, + "learning_rate": 1.9831711035947552e-05, + "loss": 0.5256, + "step": 1220 + }, + { + "epoch": 0.17615760713258558, + "grad_norm": 0.22811394434642848, + "learning_rate": 1.9830332713501855e-05, + "loss": 0.5374, + "step": 1225 + }, + { + "epoch": 0.17687661777394306, + "grad_norm": 0.2353804592934281, + "learning_rate": 1.9828948817966006e-05, + "loss": 0.5486, + "step": 1230 + }, + { + "epoch": 0.17759562841530055, + "grad_norm": 0.23615171001248797, + "learning_rate": 1.9827559350124573e-05, + "loss": 0.5414, + "step": 1235 + }, + { + "epoch": 0.17831463905665804, + "grad_norm": 0.23404800644559273, + "learning_rate": 1.9826164310765284e-05, + "loss": 0.5478, + "step": 1240 + }, + { + "epoch": 0.17903364969801552, + "grad_norm": 0.23147222875411733, + "learning_rate": 1.9824763700679026e-05, + "loss": 0.5643, + "step": 1245 + }, + { + "epoch": 0.179752660339373, + "grad_norm": 0.22886466551947476, + "learning_rate": 1.9823357520659843e-05, + "loss": 0.5534, + "step": 1250 + }, + { + "epoch": 0.18047167098073053, + "grad_norm": 0.23299717065916334, + "learning_rate": 1.982194577150494e-05, + "loss": 0.5497, + "step": 1255 + }, + { + "epoch": 0.181190681622088, + "grad_norm": 0.21174447690771017, + "learning_rate": 1.982052845401468e-05, + "loss": 0.5229, + "step": 1260 + }, + { + "epoch": 0.1819096922634455, + "grad_norm": 0.2288204806983009, + "learning_rate": 1.981910556899257e-05, + "loss": 0.5507, + "step": 1265 + }, + { + "epoch": 0.182628702904803, + "grad_norm": 0.22209899205461645, + "learning_rate": 1.9817677117245293e-05, + "loss": 0.5541, + "step": 1270 + }, + { + "epoch": 0.18334771354616047, + "grad_norm": 0.22373001284221763, + "learning_rate": 1.981624309958267e-05, + "loss": 0.5362, + "step": 1275 + }, + { + "epoch": 0.184066724187518, + "grad_norm": 0.26037079338235714, + "learning_rate": 1.9814803516817695e-05, + "loss": 0.5305, + "step": 1280 + }, + { + "epoch": 0.18478573482887548, + "grad_norm": 0.23351357170898626, + "learning_rate": 1.98133583697665e-05, + "loss": 0.5241, + "step": 1285 + }, + { + "epoch": 0.18550474547023296, + "grad_norm": 0.22572536057908968, + "learning_rate": 1.981190765924838e-05, + "loss": 0.5414, + "step": 1290 + }, + { + "epoch": 0.18622375611159045, + "grad_norm": 0.24302606733378837, + "learning_rate": 1.9810451386085788e-05, + "loss": 0.5206, + "step": 1295 + }, + { + "epoch": 0.18694276675294794, + "grad_norm": 0.22806542556817114, + "learning_rate": 1.9808989551104324e-05, + "loss": 0.5478, + "step": 1300 + }, + { + "epoch": 0.18766177739430542, + "grad_norm": 0.23299659727388808, + "learning_rate": 1.980752215513274e-05, + "loss": 0.5214, + "step": 1305 + }, + { + "epoch": 0.18838078803566294, + "grad_norm": 0.2388535868127206, + "learning_rate": 1.9806049199002944e-05, + "loss": 0.5404, + "step": 1310 + }, + { + "epoch": 0.18909979867702043, + "grad_norm": 0.2469828609274157, + "learning_rate": 1.980457068355e-05, + "loss": 0.547, + "step": 1315 + }, + { + "epoch": 0.1898188093183779, + "grad_norm": 0.22257039601510287, + "learning_rate": 1.9803086609612118e-05, + "loss": 0.5374, + "step": 1320 + }, + { + "epoch": 0.1905378199597354, + "grad_norm": 0.2413030333997793, + "learning_rate": 1.980159697803066e-05, + "loss": 0.5271, + "step": 1325 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.23034141755393386, + "learning_rate": 1.980010178965014e-05, + "loss": 0.5401, + "step": 1330 + }, + { + "epoch": 0.1919758412424504, + "grad_norm": 0.2344180021730793, + "learning_rate": 1.9798601045318224e-05, + "loss": 0.5143, + "step": 1335 + }, + { + "epoch": 0.1926948518838079, + "grad_norm": 0.2300905493052747, + "learning_rate": 1.979709474588572e-05, + "loss": 0.5347, + "step": 1340 + }, + { + "epoch": 0.19341386252516538, + "grad_norm": 0.24252955790420352, + "learning_rate": 1.9795582892206598e-05, + "loss": 0.5587, + "step": 1345 + }, + { + "epoch": 0.19413287316652286, + "grad_norm": 0.22426323090474662, + "learning_rate": 1.9794065485137973e-05, + "loss": 0.5442, + "step": 1350 + }, + { + "epoch": 0.19485188380788035, + "grad_norm": 0.22660906301904027, + "learning_rate": 1.9792542525540093e-05, + "loss": 0.5578, + "step": 1355 + }, + { + "epoch": 0.19557089444923784, + "grad_norm": 0.22840321357737117, + "learning_rate": 1.9791014014276377e-05, + "loss": 0.5298, + "step": 1360 + }, + { + "epoch": 0.19628990509059535, + "grad_norm": 0.22583338327840055, + "learning_rate": 1.9789479952213372e-05, + "loss": 0.5156, + "step": 1365 + }, + { + "epoch": 0.19700891573195284, + "grad_norm": 0.2283477520153283, + "learning_rate": 1.978794034022079e-05, + "loss": 0.5349, + "step": 1370 + }, + { + "epoch": 0.19772792637331033, + "grad_norm": 0.22295892877749496, + "learning_rate": 1.9786395179171474e-05, + "loss": 0.5446, + "step": 1375 + }, + { + "epoch": 0.1984469370146678, + "grad_norm": 0.23247464793333245, + "learning_rate": 1.978484446994142e-05, + "loss": 0.5578, + "step": 1380 + }, + { + "epoch": 0.1991659476560253, + "grad_norm": 0.25651838953898104, + "learning_rate": 1.978328821340977e-05, + "loss": 0.534, + "step": 1385 + }, + { + "epoch": 0.19988495829738281, + "grad_norm": 0.24940461896639685, + "learning_rate": 1.978172641045881e-05, + "loss": 0.5368, + "step": 1390 + }, + { + "epoch": 0.2006039689387403, + "grad_norm": 0.23819459349466615, + "learning_rate": 1.9780159061973964e-05, + "loss": 0.5488, + "step": 1395 + }, + { + "epoch": 0.2013229795800978, + "grad_norm": 0.24600076890112443, + "learning_rate": 1.977858616884381e-05, + "loss": 0.5451, + "step": 1400 + }, + { + "epoch": 0.20204199022145528, + "grad_norm": 0.22966108206971866, + "learning_rate": 1.977700773196007e-05, + "loss": 0.5245, + "step": 1405 + }, + { + "epoch": 0.20276100086281276, + "grad_norm": 0.22368217165519352, + "learning_rate": 1.9775423752217594e-05, + "loss": 0.5399, + "step": 1410 + }, + { + "epoch": 0.20348001150417025, + "grad_norm": 0.2439537542208693, + "learning_rate": 1.9773834230514386e-05, + "loss": 0.5245, + "step": 1415 + }, + { + "epoch": 0.20419902214552776, + "grad_norm": 0.23360094996444933, + "learning_rate": 1.97722391677516e-05, + "loss": 0.5287, + "step": 1420 + }, + { + "epoch": 0.20491803278688525, + "grad_norm": 0.24149617489789105, + "learning_rate": 1.977063856483351e-05, + "loss": 0.5501, + "step": 1425 + }, + { + "epoch": 0.20563704342824274, + "grad_norm": 0.23400826927180796, + "learning_rate": 1.9769032422667548e-05, + "loss": 0.5381, + "step": 1430 + }, + { + "epoch": 0.20635605406960023, + "grad_norm": 0.2313057655360691, + "learning_rate": 1.976742074216428e-05, + "loss": 0.5181, + "step": 1435 + }, + { + "epoch": 0.2070750647109577, + "grad_norm": 0.245508577754147, + "learning_rate": 1.9765803524237417e-05, + "loss": 0.5362, + "step": 1440 + }, + { + "epoch": 0.20779407535231523, + "grad_norm": 0.22858703382328208, + "learning_rate": 1.9764180769803795e-05, + "loss": 0.5339, + "step": 1445 + }, + { + "epoch": 0.20851308599367271, + "grad_norm": 0.23956741295965744, + "learning_rate": 1.9762552479783407e-05, + "loss": 0.5522, + "step": 1450 + }, + { + "epoch": 0.2092320966350302, + "grad_norm": 0.2316826087844408, + "learning_rate": 1.9760918655099376e-05, + "loss": 0.5484, + "step": 1455 + }, + { + "epoch": 0.2099511072763877, + "grad_norm": 0.23046848800785244, + "learning_rate": 1.9759279296677957e-05, + "loss": 0.5528, + "step": 1460 + }, + { + "epoch": 0.21067011791774518, + "grad_norm": 0.2313691713412135, + "learning_rate": 1.9757634405448554e-05, + "loss": 0.5378, + "step": 1465 + }, + { + "epoch": 0.21138912855910266, + "grad_norm": 0.23265144139370875, + "learning_rate": 1.9755983982343698e-05, + "loss": 0.5287, + "step": 1470 + }, + { + "epoch": 0.21210813920046018, + "grad_norm": 0.24235250103699738, + "learning_rate": 1.9754328028299064e-05, + "loss": 0.5568, + "step": 1475 + }, + { + "epoch": 0.21282714984181766, + "grad_norm": 0.25300425841033486, + "learning_rate": 1.9752666544253453e-05, + "loss": 0.528, + "step": 1480 + }, + { + "epoch": 0.21354616048317515, + "grad_norm": 0.25582352827044114, + "learning_rate": 1.975099953114881e-05, + "loss": 0.5372, + "step": 1485 + }, + { + "epoch": 0.21426517112453264, + "grad_norm": 0.2393144672938059, + "learning_rate": 1.9749326989930213e-05, + "loss": 0.5557, + "step": 1490 + }, + { + "epoch": 0.21498418176589013, + "grad_norm": 0.2338384244722033, + "learning_rate": 1.974764892154587e-05, + "loss": 0.5278, + "step": 1495 + }, + { + "epoch": 0.21570319240724764, + "grad_norm": 0.22718114309745505, + "learning_rate": 1.9745965326947126e-05, + "loss": 0.5292, + "step": 1500 + }, + { + "epoch": 0.21642220304860513, + "grad_norm": 0.23738809850294332, + "learning_rate": 1.9744276207088454e-05, + "loss": 0.5381, + "step": 1505 + }, + { + "epoch": 0.21714121368996261, + "grad_norm": 0.2350559821128524, + "learning_rate": 1.974258156292747e-05, + "loss": 0.5271, + "step": 1510 + }, + { + "epoch": 0.2178602243313201, + "grad_norm": 0.22301098485626983, + "learning_rate": 1.9740881395424904e-05, + "loss": 0.523, + "step": 1515 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.22973786300474133, + "learning_rate": 1.973917570554464e-05, + "loss": 0.5225, + "step": 1520 + }, + { + "epoch": 0.21929824561403508, + "grad_norm": 0.22820444656957772, + "learning_rate": 1.973746449425368e-05, + "loss": 0.5199, + "step": 1525 + }, + { + "epoch": 0.2200172562553926, + "grad_norm": 0.24294662043501544, + "learning_rate": 1.973574776252215e-05, + "loss": 0.5336, + "step": 1530 + }, + { + "epoch": 0.22073626689675008, + "grad_norm": 0.2253611699849513, + "learning_rate": 1.9734025511323317e-05, + "loss": 0.5079, + "step": 1535 + }, + { + "epoch": 0.22145527753810756, + "grad_norm": 0.22870243521742797, + "learning_rate": 1.9732297741633577e-05, + "loss": 0.5228, + "step": 1540 + }, + { + "epoch": 0.22217428817946505, + "grad_norm": 0.23223776960809672, + "learning_rate": 1.973056445443245e-05, + "loss": 0.5333, + "step": 1545 + }, + { + "epoch": 0.22289329882082254, + "grad_norm": 0.22960419047645408, + "learning_rate": 1.9728825650702577e-05, + "loss": 0.5314, + "step": 1550 + }, + { + "epoch": 0.22361230946218005, + "grad_norm": 0.22057428146654934, + "learning_rate": 1.972708133142974e-05, + "loss": 0.5352, + "step": 1555 + }, + { + "epoch": 0.22433132010353754, + "grad_norm": 0.2260456985994222, + "learning_rate": 1.9725331497602848e-05, + "loss": 0.5338, + "step": 1560 + }, + { + "epoch": 0.22505033074489503, + "grad_norm": 0.23375258498113377, + "learning_rate": 1.972357615021392e-05, + "loss": 0.5282, + "step": 1565 + }, + { + "epoch": 0.22576934138625251, + "grad_norm": 0.24090124346949987, + "learning_rate": 1.972181529025812e-05, + "loss": 0.5192, + "step": 1570 + }, + { + "epoch": 0.22648835202761, + "grad_norm": 0.23821272932675513, + "learning_rate": 1.9720048918733723e-05, + "loss": 0.5203, + "step": 1575 + }, + { + "epoch": 0.2272073626689675, + "grad_norm": 0.22696455116323191, + "learning_rate": 1.9718277036642135e-05, + "loss": 0.5478, + "step": 1580 + }, + { + "epoch": 0.227926373310325, + "grad_norm": 0.22304866738956428, + "learning_rate": 1.971649964498789e-05, + "loss": 0.5137, + "step": 1585 + }, + { + "epoch": 0.2286453839516825, + "grad_norm": 0.22051335274667153, + "learning_rate": 1.971471674477864e-05, + "loss": 0.5222, + "step": 1590 + }, + { + "epoch": 0.22936439459303998, + "grad_norm": 0.23818798516644749, + "learning_rate": 1.9712928337025152e-05, + "loss": 0.5297, + "step": 1595 + }, + { + "epoch": 0.23008340523439746, + "grad_norm": 0.23680065043886803, + "learning_rate": 1.9711134422741335e-05, + "loss": 0.5384, + "step": 1600 + }, + { + "epoch": 0.23080241587575495, + "grad_norm": 0.22339621581722777, + "learning_rate": 1.9709335002944205e-05, + "loss": 0.5047, + "step": 1605 + }, + { + "epoch": 0.23152142651711247, + "grad_norm": 0.23067104286841825, + "learning_rate": 1.9707530078653903e-05, + "loss": 0.529, + "step": 1610 + }, + { + "epoch": 0.23224043715846995, + "grad_norm": 0.2205878433893513, + "learning_rate": 1.9705719650893692e-05, + "loss": 0.5245, + "step": 1615 + }, + { + "epoch": 0.23295944779982744, + "grad_norm": 0.23060983274023172, + "learning_rate": 1.9703903720689954e-05, + "loss": 0.5321, + "step": 1620 + }, + { + "epoch": 0.23367845844118493, + "grad_norm": 0.2322122812506246, + "learning_rate": 1.9702082289072192e-05, + "loss": 0.5389, + "step": 1625 + }, + { + "epoch": 0.23439746908254241, + "grad_norm": 0.22567259557106886, + "learning_rate": 1.9700255357073023e-05, + "loss": 0.5273, + "step": 1630 + }, + { + "epoch": 0.2351164797238999, + "grad_norm": 0.22286904843242045, + "learning_rate": 1.9698422925728184e-05, + "loss": 0.5247, + "step": 1635 + }, + { + "epoch": 0.23583549036525742, + "grad_norm": 0.24293209876127186, + "learning_rate": 1.969658499607654e-05, + "loss": 0.5055, + "step": 1640 + }, + { + "epoch": 0.2365545010066149, + "grad_norm": 0.23539146522743076, + "learning_rate": 1.9694741569160057e-05, + "loss": 0.5403, + "step": 1645 + }, + { + "epoch": 0.2372735116479724, + "grad_norm": 0.23425506379258582, + "learning_rate": 1.969289264602383e-05, + "loss": 0.5494, + "step": 1650 + }, + { + "epoch": 0.23799252228932988, + "grad_norm": 0.23818614892816958, + "learning_rate": 1.9691038227716062e-05, + "loss": 0.5355, + "step": 1655 + }, + { + "epoch": 0.23871153293068736, + "grad_norm": 0.22586868983651667, + "learning_rate": 1.9689178315288073e-05, + "loss": 0.5285, + "step": 1660 + }, + { + "epoch": 0.23943054357204488, + "grad_norm": 0.24169875014394682, + "learning_rate": 1.9687312909794304e-05, + "loss": 0.5434, + "step": 1665 + }, + { + "epoch": 0.24014955421340237, + "grad_norm": 0.2619831502991906, + "learning_rate": 1.9685442012292303e-05, + "loss": 0.5262, + "step": 1670 + }, + { + "epoch": 0.24086856485475985, + "grad_norm": 0.2457796865562363, + "learning_rate": 1.9683565623842734e-05, + "loss": 0.5305, + "step": 1675 + }, + { + "epoch": 0.24158757549611734, + "grad_norm": 0.24695986543118498, + "learning_rate": 1.9681683745509376e-05, + "loss": 0.5431, + "step": 1680 + }, + { + "epoch": 0.24230658613747483, + "grad_norm": 0.2262775958024951, + "learning_rate": 1.9679796378359114e-05, + "loss": 0.5288, + "step": 1685 + }, + { + "epoch": 0.24302559677883231, + "grad_norm": 0.2167246493798092, + "learning_rate": 1.967790352346195e-05, + "loss": 0.5347, + "step": 1690 + }, + { + "epoch": 0.24374460742018983, + "grad_norm": 0.22096827906452082, + "learning_rate": 1.9676005181891e-05, + "loss": 0.5202, + "step": 1695 + }, + { + "epoch": 0.24446361806154732, + "grad_norm": 0.23135900618997918, + "learning_rate": 1.967410135472249e-05, + "loss": 0.5259, + "step": 1700 + }, + { + "epoch": 0.2451826287029048, + "grad_norm": 0.23070941887649535, + "learning_rate": 1.9672192043035744e-05, + "loss": 0.5194, + "step": 1705 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 0.23834592003390928, + "learning_rate": 1.9670277247913205e-05, + "loss": 0.5476, + "step": 1710 + }, + { + "epoch": 0.24662064998561978, + "grad_norm": 0.23438083518633582, + "learning_rate": 1.966835697044043e-05, + "loss": 0.5208, + "step": 1715 + }, + { + "epoch": 0.2473396606269773, + "grad_norm": 0.23226909980735055, + "learning_rate": 1.9666431211706073e-05, + "loss": 0.5221, + "step": 1720 + }, + { + "epoch": 0.24805867126833478, + "grad_norm": 0.2466818291192998, + "learning_rate": 1.9664499972801902e-05, + "loss": 0.5382, + "step": 1725 + }, + { + "epoch": 0.24877768190969227, + "grad_norm": 0.22066837565634534, + "learning_rate": 1.966256325482279e-05, + "loss": 0.5127, + "step": 1730 + }, + { + "epoch": 0.24949669255104975, + "grad_norm": 0.23376402195778545, + "learning_rate": 1.966062105886672e-05, + "loss": 0.5252, + "step": 1735 + }, + { + "epoch": 0.25021570319240727, + "grad_norm": 0.22849185736104233, + "learning_rate": 1.9658673386034773e-05, + "loss": 0.5453, + "step": 1740 + }, + { + "epoch": 0.25093471383376476, + "grad_norm": 0.24879707626081934, + "learning_rate": 1.965672023743114e-05, + "loss": 0.5344, + "step": 1745 + }, + { + "epoch": 0.25165372447512224, + "grad_norm": 0.2253533058308538, + "learning_rate": 1.9654761614163112e-05, + "loss": 0.5202, + "step": 1750 + }, + { + "epoch": 0.25237273511647973, + "grad_norm": 0.24628121730237923, + "learning_rate": 1.9652797517341095e-05, + "loss": 0.519, + "step": 1755 + }, + { + "epoch": 0.2530917457578372, + "grad_norm": 0.21993950116541036, + "learning_rate": 1.9650827948078586e-05, + "loss": 0.5181, + "step": 1760 + }, + { + "epoch": 0.2538107563991947, + "grad_norm": 0.2216964533979465, + "learning_rate": 1.9648852907492187e-05, + "loss": 0.535, + "step": 1765 + }, + { + "epoch": 0.2545297670405522, + "grad_norm": 0.22733449106986603, + "learning_rate": 1.9646872396701603e-05, + "loss": 0.5341, + "step": 1770 + }, + { + "epoch": 0.2552487776819097, + "grad_norm": 0.23300459174017882, + "learning_rate": 1.964488641682965e-05, + "loss": 0.5457, + "step": 1775 + }, + { + "epoch": 0.25596778832326716, + "grad_norm": 0.2260882801630965, + "learning_rate": 1.9642894969002224e-05, + "loss": 0.5302, + "step": 1780 + }, + { + "epoch": 0.25668679896462465, + "grad_norm": 0.21935871604787338, + "learning_rate": 1.964089805434834e-05, + "loss": 0.5213, + "step": 1785 + }, + { + "epoch": 0.2574058096059822, + "grad_norm": 0.22969789518771314, + "learning_rate": 1.96388956740001e-05, + "loss": 0.5127, + "step": 1790 + }, + { + "epoch": 0.2581248202473397, + "grad_norm": 0.2217372653926415, + "learning_rate": 1.963688782909271e-05, + "loss": 0.5504, + "step": 1795 + }, + { + "epoch": 0.25884383088869717, + "grad_norm": 0.22767228659738542, + "learning_rate": 1.9634874520764478e-05, + "loss": 0.5119, + "step": 1800 + }, + { + "epoch": 0.25956284153005466, + "grad_norm": 0.2254639647009183, + "learning_rate": 1.96328557501568e-05, + "loss": 0.5207, + "step": 1805 + }, + { + "epoch": 0.26028185217141214, + "grad_norm": 0.23627855092627786, + "learning_rate": 1.9630831518414176e-05, + "loss": 0.5335, + "step": 1810 + }, + { + "epoch": 0.26100086281276963, + "grad_norm": 0.2281833795806712, + "learning_rate": 1.9628801826684197e-05, + "loss": 0.5279, + "step": 1815 + }, + { + "epoch": 0.2617198734541271, + "grad_norm": 0.22218430509158774, + "learning_rate": 1.9626766676117555e-05, + "loss": 0.5228, + "step": 1820 + }, + { + "epoch": 0.2624388840954846, + "grad_norm": 0.21712174411589044, + "learning_rate": 1.962472606786803e-05, + "loss": 0.525, + "step": 1825 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.23417579156196858, + "learning_rate": 1.9622680003092503e-05, + "loss": 0.5211, + "step": 1830 + }, + { + "epoch": 0.2638769053781996, + "grad_norm": 0.2240231565672544, + "learning_rate": 1.962062848295095e-05, + "loss": 0.5371, + "step": 1835 + }, + { + "epoch": 0.26459591601955706, + "grad_norm": 0.2222473795124956, + "learning_rate": 1.961857150860642e-05, + "loss": 0.5434, + "step": 1840 + }, + { + "epoch": 0.2653149266609146, + "grad_norm": 0.23076834678829988, + "learning_rate": 1.961650908122508e-05, + "loss": 0.522, + "step": 1845 + }, + { + "epoch": 0.2660339373022721, + "grad_norm": 0.23132830942202995, + "learning_rate": 1.961444120197618e-05, + "loss": 0.5141, + "step": 1850 + }, + { + "epoch": 0.2667529479436296, + "grad_norm": 0.2262698238973961, + "learning_rate": 1.961236787203205e-05, + "loss": 0.5175, + "step": 1855 + }, + { + "epoch": 0.26747195858498707, + "grad_norm": 0.2501956106882812, + "learning_rate": 1.9610289092568125e-05, + "loss": 0.5211, + "step": 1860 + }, + { + "epoch": 0.26819096922634456, + "grad_norm": 0.23101958311553186, + "learning_rate": 1.9608204864762923e-05, + "loss": 0.5388, + "step": 1865 + }, + { + "epoch": 0.26890997986770204, + "grad_norm": 0.22903904556030297, + "learning_rate": 1.9606115189798047e-05, + "loss": 0.513, + "step": 1870 + }, + { + "epoch": 0.26962899050905953, + "grad_norm": 0.2360316924178287, + "learning_rate": 1.9604020068858197e-05, + "loss": 0.5215, + "step": 1875 + }, + { + "epoch": 0.270348001150417, + "grad_norm": 0.2323414670928527, + "learning_rate": 1.960191950313115e-05, + "loss": 0.5197, + "step": 1880 + }, + { + "epoch": 0.2710670117917745, + "grad_norm": 0.23097225391963927, + "learning_rate": 1.9599813493807778e-05, + "loss": 0.5132, + "step": 1885 + }, + { + "epoch": 0.271786022433132, + "grad_norm": 0.22586506109921145, + "learning_rate": 1.959770204208204e-05, + "loss": 0.5217, + "step": 1890 + }, + { + "epoch": 0.2725050330744895, + "grad_norm": 0.2362053442728093, + "learning_rate": 1.959558514915097e-05, + "loss": 0.5328, + "step": 1895 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.2326560206023545, + "learning_rate": 1.9593462816214698e-05, + "loss": 0.543, + "step": 1900 + }, + { + "epoch": 0.2739430543572045, + "grad_norm": 0.23463535723170362, + "learning_rate": 1.959133504447644e-05, + "loss": 0.5328, + "step": 1905 + }, + { + "epoch": 0.274662064998562, + "grad_norm": 0.2304443090931523, + "learning_rate": 1.9589201835142476e-05, + "loss": 0.5095, + "step": 1910 + }, + { + "epoch": 0.2753810756399195, + "grad_norm": 0.21985857057475458, + "learning_rate": 1.9587063189422188e-05, + "loss": 0.5194, + "step": 1915 + }, + { + "epoch": 0.27610008628127697, + "grad_norm": 0.2321461057505647, + "learning_rate": 1.9584919108528036e-05, + "loss": 0.5232, + "step": 1920 + }, + { + "epoch": 0.27681909692263446, + "grad_norm": 0.23450598290151903, + "learning_rate": 1.9582769593675557e-05, + "loss": 0.5148, + "step": 1925 + }, + { + "epoch": 0.27753810756399194, + "grad_norm": 0.23719067147262055, + "learning_rate": 1.958061464608337e-05, + "loss": 0.5241, + "step": 1930 + }, + { + "epoch": 0.27825711820534943, + "grad_norm": 0.22914212741212578, + "learning_rate": 1.9578454266973184e-05, + "loss": 0.5292, + "step": 1935 + }, + { + "epoch": 0.2789761288467069, + "grad_norm": 0.2428143124876925, + "learning_rate": 1.9576288457569764e-05, + "loss": 0.5394, + "step": 1940 + }, + { + "epoch": 0.2796951394880644, + "grad_norm": 0.24019828487603107, + "learning_rate": 1.9574117219100975e-05, + "loss": 0.5314, + "step": 1945 + }, + { + "epoch": 0.2804141501294219, + "grad_norm": 0.23020307675607476, + "learning_rate": 1.9571940552797758e-05, + "loss": 0.5514, + "step": 1950 + }, + { + "epoch": 0.28113316077077943, + "grad_norm": 0.23117631707107703, + "learning_rate": 1.9569758459894118e-05, + "loss": 0.5207, + "step": 1955 + }, + { + "epoch": 0.2818521714121369, + "grad_norm": 0.23188555518556003, + "learning_rate": 1.9567570941627144e-05, + "loss": 0.5106, + "step": 1960 + }, + { + "epoch": 0.2825711820534944, + "grad_norm": 0.22719305269705242, + "learning_rate": 1.9565377999237007e-05, + "loss": 0.5397, + "step": 1965 + }, + { + "epoch": 0.2832901926948519, + "grad_norm": 0.2425886923256403, + "learning_rate": 1.9563179633966944e-05, + "loss": 0.5389, + "step": 1970 + }, + { + "epoch": 0.2840092033362094, + "grad_norm": 0.2248447291879169, + "learning_rate": 1.9560975847063267e-05, + "loss": 0.5314, + "step": 1975 + }, + { + "epoch": 0.28472821397756687, + "grad_norm": 0.22291943170416983, + "learning_rate": 1.955876663977537e-05, + "loss": 0.5234, + "step": 1980 + }, + { + "epoch": 0.28544722461892436, + "grad_norm": 0.23294609267878633, + "learning_rate": 1.955655201335571e-05, + "loss": 0.5245, + "step": 1985 + }, + { + "epoch": 0.28616623526028184, + "grad_norm": 0.2467537340412599, + "learning_rate": 1.9554331969059825e-05, + "loss": 0.5185, + "step": 1990 + }, + { + "epoch": 0.28688524590163933, + "grad_norm": 0.24172071683786844, + "learning_rate": 1.955210650814632e-05, + "loss": 0.5443, + "step": 1995 + }, + { + "epoch": 0.2876042565429968, + "grad_norm": 0.22605868965849668, + "learning_rate": 1.9549875631876864e-05, + "loss": 0.5121, + "step": 2000 + }, + { + "epoch": 0.2883232671843543, + "grad_norm": 0.22942794996365853, + "learning_rate": 1.9547639341516206e-05, + "loss": 0.5095, + "step": 2005 + }, + { + "epoch": 0.28904227782571185, + "grad_norm": 0.2298099151783695, + "learning_rate": 1.9545397638332163e-05, + "loss": 0.5286, + "step": 2010 + }, + { + "epoch": 0.28976128846706933, + "grad_norm": 0.24778629700549126, + "learning_rate": 1.9543150523595625e-05, + "loss": 0.537, + "step": 2015 + }, + { + "epoch": 0.2904802991084268, + "grad_norm": 0.2322170927230343, + "learning_rate": 1.954089799858053e-05, + "loss": 0.5294, + "step": 2020 + }, + { + "epoch": 0.2911993097497843, + "grad_norm": 0.2238734266550472, + "learning_rate": 1.953864006456391e-05, + "loss": 0.5146, + "step": 2025 + }, + { + "epoch": 0.2919183203911418, + "grad_norm": 0.21981906060948142, + "learning_rate": 1.9536376722825844e-05, + "loss": 0.5077, + "step": 2030 + }, + { + "epoch": 0.2926373310324993, + "grad_norm": 0.22954693685234034, + "learning_rate": 1.953410797464949e-05, + "loss": 0.5335, + "step": 2035 + }, + { + "epoch": 0.29335634167385677, + "grad_norm": 0.23985169357884734, + "learning_rate": 1.9531833821321057e-05, + "loss": 0.5376, + "step": 2040 + }, + { + "epoch": 0.29407535231521426, + "grad_norm": 0.2207958326221015, + "learning_rate": 1.952955426412983e-05, + "loss": 0.52, + "step": 2045 + }, + { + "epoch": 0.29479436295657174, + "grad_norm": 0.23780493129732794, + "learning_rate": 1.9527269304368154e-05, + "loss": 0.4906, + "step": 2050 + }, + { + "epoch": 0.29551337359792923, + "grad_norm": 0.2330149879325887, + "learning_rate": 1.9524978943331435e-05, + "loss": 0.5194, + "step": 2055 + }, + { + "epoch": 0.2962323842392867, + "grad_norm": 0.2386438942499068, + "learning_rate": 1.9522683182318145e-05, + "loss": 0.5346, + "step": 2060 + }, + { + "epoch": 0.29695139488064426, + "grad_norm": 0.23293713570114105, + "learning_rate": 1.9520382022629814e-05, + "loss": 0.5459, + "step": 2065 + }, + { + "epoch": 0.29767040552200175, + "grad_norm": 0.21852751340739718, + "learning_rate": 1.951807546557103e-05, + "loss": 0.5164, + "step": 2070 + }, + { + "epoch": 0.29838941616335923, + "grad_norm": 0.23863393320891652, + "learning_rate": 1.951576351244945e-05, + "loss": 0.5379, + "step": 2075 + }, + { + "epoch": 0.2991084268047167, + "grad_norm": 0.22760314810091892, + "learning_rate": 1.9513446164575782e-05, + "loss": 0.5227, + "step": 2080 + }, + { + "epoch": 0.2998274374460742, + "grad_norm": 0.22785151169541662, + "learning_rate": 1.9511123423263797e-05, + "loss": 0.5279, + "step": 2085 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 0.22331529138091233, + "learning_rate": 1.950879528983032e-05, + "loss": 0.5168, + "step": 2090 + }, + { + "epoch": 0.3012654587287892, + "grad_norm": 0.2265341653861222, + "learning_rate": 1.9506461765595233e-05, + "loss": 0.5129, + "step": 2095 + }, + { + "epoch": 0.30198446937014667, + "grad_norm": 0.224803847238097, + "learning_rate": 1.950412285188148e-05, + "loss": 0.5113, + "step": 2100 + }, + { + "epoch": 0.30270348001150416, + "grad_norm": 0.22826292672125245, + "learning_rate": 1.9501778550015057e-05, + "loss": 0.5172, + "step": 2105 + }, + { + "epoch": 0.30342249065286164, + "grad_norm": 0.23941019012024453, + "learning_rate": 1.949942886132501e-05, + "loss": 0.5364, + "step": 2110 + }, + { + "epoch": 0.30414150129421913, + "grad_norm": 0.23098899764233535, + "learning_rate": 1.9497073787143445e-05, + "loss": 0.5198, + "step": 2115 + }, + { + "epoch": 0.3048605119355767, + "grad_norm": 0.2235225210957512, + "learning_rate": 1.9494713328805522e-05, + "loss": 0.5105, + "step": 2120 + }, + { + "epoch": 0.30557952257693416, + "grad_norm": 0.24219249616083108, + "learning_rate": 1.949234748764945e-05, + "loss": 0.5178, + "step": 2125 + }, + { + "epoch": 0.30629853321829165, + "grad_norm": 0.2294038434256185, + "learning_rate": 1.9489976265016483e-05, + "loss": 0.5236, + "step": 2130 + }, + { + "epoch": 0.30701754385964913, + "grad_norm": 0.22860251975225562, + "learning_rate": 1.9487599662250945e-05, + "loss": 0.5151, + "step": 2135 + }, + { + "epoch": 0.3077365545010066, + "grad_norm": 0.22768766298971088, + "learning_rate": 1.948521768070019e-05, + "loss": 0.5157, + "step": 2140 + }, + { + "epoch": 0.3084555651423641, + "grad_norm": 0.2262280341082414, + "learning_rate": 1.9482830321714634e-05, + "loss": 0.5179, + "step": 2145 + }, + { + "epoch": 0.3091745757837216, + "grad_norm": 0.21765278037284172, + "learning_rate": 1.9480437586647737e-05, + "loss": 0.5249, + "step": 2150 + }, + { + "epoch": 0.3098935864250791, + "grad_norm": 0.21991607271397515, + "learning_rate": 1.9478039476856004e-05, + "loss": 0.5151, + "step": 2155 + }, + { + "epoch": 0.31061259706643657, + "grad_norm": 0.22731893412220183, + "learning_rate": 1.9475635993698995e-05, + "loss": 0.5135, + "step": 2160 + }, + { + "epoch": 0.31133160770779406, + "grad_norm": 0.22518016603767735, + "learning_rate": 1.9473227138539305e-05, + "loss": 0.5062, + "step": 2165 + }, + { + "epoch": 0.31205061834915154, + "grad_norm": 0.22989447620543818, + "learning_rate": 1.9470812912742588e-05, + "loss": 0.5097, + "step": 2170 + }, + { + "epoch": 0.3127696289905091, + "grad_norm": 0.22642057674043994, + "learning_rate": 1.9468393317677537e-05, + "loss": 0.5136, + "step": 2175 + }, + { + "epoch": 0.3134886396318666, + "grad_norm": 0.2243558671870111, + "learning_rate": 1.9465968354715882e-05, + "loss": 0.5109, + "step": 2180 + }, + { + "epoch": 0.31420765027322406, + "grad_norm": 0.23756228609410254, + "learning_rate": 1.946353802523241e-05, + "loss": 0.5187, + "step": 2185 + }, + { + "epoch": 0.31492666091458155, + "grad_norm": 0.22829309083723986, + "learning_rate": 1.946110233060493e-05, + "loss": 0.5119, + "step": 2190 + }, + { + "epoch": 0.31564567155593903, + "grad_norm": 0.2258910659924151, + "learning_rate": 1.945866127221432e-05, + "loss": 0.5269, + "step": 2195 + }, + { + "epoch": 0.3163646821972965, + "grad_norm": 0.24851677104917747, + "learning_rate": 1.945621485144447e-05, + "loss": 0.5211, + "step": 2200 + }, + { + "epoch": 0.317083692838654, + "grad_norm": 0.2252194695451131, + "learning_rate": 1.9453763069682336e-05, + "loss": 0.5154, + "step": 2205 + }, + { + "epoch": 0.3178027034800115, + "grad_norm": 0.2567628615443648, + "learning_rate": 1.94513059283179e-05, + "loss": 0.5224, + "step": 2210 + }, + { + "epoch": 0.318521714121369, + "grad_norm": 0.22849096152898013, + "learning_rate": 1.9448843428744175e-05, + "loss": 0.4982, + "step": 2215 + }, + { + "epoch": 0.31924072476272647, + "grad_norm": 0.21917519968492866, + "learning_rate": 1.944637557235723e-05, + "loss": 0.5091, + "step": 2220 + }, + { + "epoch": 0.31995973540408396, + "grad_norm": 0.22313052392410496, + "learning_rate": 1.944390236055616e-05, + "loss": 0.536, + "step": 2225 + }, + { + "epoch": 0.3206787460454415, + "grad_norm": 0.25969930780411865, + "learning_rate": 1.9441423794743092e-05, + "loss": 0.5357, + "step": 2230 + }, + { + "epoch": 0.321397756686799, + "grad_norm": 0.2489657558014886, + "learning_rate": 1.9438939876323202e-05, + "loss": 0.5148, + "step": 2235 + }, + { + "epoch": 0.3221167673281565, + "grad_norm": 0.24222728781124633, + "learning_rate": 1.9436450606704688e-05, + "loss": 0.5291, + "step": 2240 + }, + { + "epoch": 0.32283577796951396, + "grad_norm": 0.2258550445944719, + "learning_rate": 1.943395598729879e-05, + "loss": 0.5101, + "step": 2245 + }, + { + "epoch": 0.32355478861087145, + "grad_norm": 0.22598842350418805, + "learning_rate": 1.9431456019519774e-05, + "loss": 0.5107, + "step": 2250 + }, + { + "epoch": 0.32427379925222893, + "grad_norm": 0.23349735743483102, + "learning_rate": 1.9428950704784944e-05, + "loss": 0.5078, + "step": 2255 + }, + { + "epoch": 0.3249928098935864, + "grad_norm": 0.2271548994925287, + "learning_rate": 1.942644004451463e-05, + "loss": 0.5317, + "step": 2260 + }, + { + "epoch": 0.3257118205349439, + "grad_norm": 0.2149592225481257, + "learning_rate": 1.94239240401322e-05, + "loss": 0.4978, + "step": 2265 + }, + { + "epoch": 0.3264308311763014, + "grad_norm": 0.2331575054290221, + "learning_rate": 1.9421402693064037e-05, + "loss": 0.5117, + "step": 2270 + }, + { + "epoch": 0.3271498418176589, + "grad_norm": 0.25608468064653417, + "learning_rate": 1.941887600473958e-05, + "loss": 0.5102, + "step": 2275 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.2298492196508499, + "learning_rate": 1.941634397659126e-05, + "loss": 0.5161, + "step": 2280 + }, + { + "epoch": 0.3285878631003739, + "grad_norm": 0.2316441559609167, + "learning_rate": 1.941380661005457e-05, + "loss": 0.527, + "step": 2285 + }, + { + "epoch": 0.3293068737417314, + "grad_norm": 0.2342418485461944, + "learning_rate": 1.9411263906568007e-05, + "loss": 0.5153, + "step": 2290 + }, + { + "epoch": 0.3300258843830889, + "grad_norm": 0.21500183625653949, + "learning_rate": 1.94087158675731e-05, + "loss": 0.5227, + "step": 2295 + }, + { + "epoch": 0.3307448950244464, + "grad_norm": 0.21606183098344467, + "learning_rate": 1.9406162494514406e-05, + "loss": 0.5151, + "step": 2300 + }, + { + "epoch": 0.33146390566580386, + "grad_norm": 0.21936796559465915, + "learning_rate": 1.9403603788839503e-05, + "loss": 0.5342, + "step": 2305 + }, + { + "epoch": 0.33218291630716135, + "grad_norm": 0.22373896227411189, + "learning_rate": 1.940103975199899e-05, + "loss": 0.5176, + "step": 2310 + }, + { + "epoch": 0.33290192694851883, + "grad_norm": 0.23290539221639253, + "learning_rate": 1.93984703854465e-05, + "loss": 0.5263, + "step": 2315 + }, + { + "epoch": 0.3336209375898763, + "grad_norm": 0.2144772210153438, + "learning_rate": 1.9395895690638662e-05, + "loss": 0.504, + "step": 2320 + }, + { + "epoch": 0.3343399482312338, + "grad_norm": 0.2238835015898237, + "learning_rate": 1.9393315669035157e-05, + "loss": 0.522, + "step": 2325 + }, + { + "epoch": 0.3350589588725913, + "grad_norm": 0.23661097593806635, + "learning_rate": 1.9390730322098667e-05, + "loss": 0.5149, + "step": 2330 + }, + { + "epoch": 0.3357779695139488, + "grad_norm": 0.2261318626286406, + "learning_rate": 1.9388139651294897e-05, + "loss": 0.5251, + "step": 2335 + }, + { + "epoch": 0.3364969801553063, + "grad_norm": 0.23000865916981403, + "learning_rate": 1.9385543658092572e-05, + "loss": 0.5302, + "step": 2340 + }, + { + "epoch": 0.3372159907966638, + "grad_norm": 0.22875455612471773, + "learning_rate": 1.938294234396343e-05, + "loss": 0.5211, + "step": 2345 + }, + { + "epoch": 0.3379350014380213, + "grad_norm": 0.2219866431765329, + "learning_rate": 1.938033571038223e-05, + "loss": 0.528, + "step": 2350 + }, + { + "epoch": 0.3386540120793788, + "grad_norm": 0.21715874352702125, + "learning_rate": 1.9377723758826746e-05, + "loss": 0.49, + "step": 2355 + }, + { + "epoch": 0.3393730227207363, + "grad_norm": 0.22382192120897512, + "learning_rate": 1.9375106490777768e-05, + "loss": 0.5129, + "step": 2360 + }, + { + "epoch": 0.34009203336209376, + "grad_norm": 0.2961426979518339, + "learning_rate": 1.9372483907719092e-05, + "loss": 0.4934, + "step": 2365 + }, + { + "epoch": 0.34081104400345125, + "grad_norm": 0.23631695058807112, + "learning_rate": 1.936985601113754e-05, + "loss": 0.5105, + "step": 2370 + }, + { + "epoch": 0.34153005464480873, + "grad_norm": 0.2294202832007626, + "learning_rate": 1.936722280252294e-05, + "loss": 0.5203, + "step": 2375 + }, + { + "epoch": 0.3422490652861662, + "grad_norm": 0.2184499380948413, + "learning_rate": 1.9364584283368127e-05, + "loss": 0.4972, + "step": 2380 + }, + { + "epoch": 0.3429680759275237, + "grad_norm": 0.3176172772787008, + "learning_rate": 1.9361940455168954e-05, + "loss": 0.5156, + "step": 2385 + }, + { + "epoch": 0.3436870865688812, + "grad_norm": 0.22609068671139035, + "learning_rate": 1.935929131942428e-05, + "loss": 0.5182, + "step": 2390 + }, + { + "epoch": 0.34440609721023874, + "grad_norm": 0.23496652718953012, + "learning_rate": 1.9356636877635975e-05, + "loss": 0.5247, + "step": 2395 + }, + { + "epoch": 0.3451251078515962, + "grad_norm": 0.2410937708363873, + "learning_rate": 1.935397713130892e-05, + "loss": 0.5155, + "step": 2400 + }, + { + "epoch": 0.3458441184929537, + "grad_norm": 0.24457505196862847, + "learning_rate": 1.935131208195099e-05, + "loss": 0.5234, + "step": 2405 + }, + { + "epoch": 0.3465631291343112, + "grad_norm": 0.2314728776928187, + "learning_rate": 1.9348641731073085e-05, + "loss": 0.5004, + "step": 2410 + }, + { + "epoch": 0.3472821397756687, + "grad_norm": 0.24582034077579354, + "learning_rate": 1.9345966080189095e-05, + "loss": 0.5425, + "step": 2415 + }, + { + "epoch": 0.3480011504170262, + "grad_norm": 0.22554765028784285, + "learning_rate": 1.934328513081592e-05, + "loss": 0.5265, + "step": 2420 + }, + { + "epoch": 0.34872016105838366, + "grad_norm": 0.2216814016503243, + "learning_rate": 1.9340598884473478e-05, + "loss": 0.5137, + "step": 2425 + }, + { + "epoch": 0.34943917169974115, + "grad_norm": 0.22611290789235558, + "learning_rate": 1.9337907342684664e-05, + "loss": 0.4992, + "step": 2430 + }, + { + "epoch": 0.35015818234109863, + "grad_norm": 0.24118982095464295, + "learning_rate": 1.933521050697539e-05, + "loss": 0.5046, + "step": 2435 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 0.23009247877401107, + "learning_rate": 1.933250837887457e-05, + "loss": 0.533, + "step": 2440 + }, + { + "epoch": 0.3515962036238136, + "grad_norm": 0.22314852251039957, + "learning_rate": 1.932980095991412e-05, + "loss": 0.5123, + "step": 2445 + }, + { + "epoch": 0.35231521426517115, + "grad_norm": 0.232637861133316, + "learning_rate": 1.9327088251628946e-05, + "loss": 0.5195, + "step": 2450 + }, + { + "epoch": 0.35303422490652864, + "grad_norm": 0.2250538273016955, + "learning_rate": 1.9324370255556957e-05, + "loss": 0.5237, + "step": 2455 + }, + { + "epoch": 0.3537532355478861, + "grad_norm": 0.2340025948600299, + "learning_rate": 1.932164697323906e-05, + "loss": 0.5081, + "step": 2460 + }, + { + "epoch": 0.3544722461892436, + "grad_norm": 0.23692479665974087, + "learning_rate": 1.9318918406219168e-05, + "loss": 0.5218, + "step": 2465 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.5252858739768315, + "learning_rate": 1.9316184556044176e-05, + "loss": 0.5291, + "step": 2470 + }, + { + "epoch": 0.3559102674719586, + "grad_norm": 0.22584081496056052, + "learning_rate": 1.931344542426398e-05, + "loss": 0.5115, + "step": 2475 + }, + { + "epoch": 0.3566292781133161, + "grad_norm": 0.22181076897628466, + "learning_rate": 1.931070101243147e-05, + "loss": 0.5236, + "step": 2480 + }, + { + "epoch": 0.35734828875467356, + "grad_norm": 0.2368592260778137, + "learning_rate": 1.930795132210253e-05, + "loss": 0.5196, + "step": 2485 + }, + { + "epoch": 0.35806729939603105, + "grad_norm": 0.25285566239619084, + "learning_rate": 1.930519635483604e-05, + "loss": 0.5348, + "step": 2490 + }, + { + "epoch": 0.35878631003738853, + "grad_norm": 0.2172012713603105, + "learning_rate": 1.9302436112193863e-05, + "loss": 0.5133, + "step": 2495 + }, + { + "epoch": 0.359505320678746, + "grad_norm": 0.22253220913655272, + "learning_rate": 1.929967059574086e-05, + "loss": 0.5195, + "step": 2500 + }, + { + "epoch": 0.36022433132010356, + "grad_norm": 0.2318823497847268, + "learning_rate": 1.9296899807044876e-05, + "loss": 0.5013, + "step": 2505 + }, + { + "epoch": 0.36094334196146105, + "grad_norm": 0.22474959745467496, + "learning_rate": 1.9294123747676757e-05, + "loss": 0.51, + "step": 2510 + }, + { + "epoch": 0.36166235260281854, + "grad_norm": 0.22499764413272827, + "learning_rate": 1.929134241921032e-05, + "loss": 0.5196, + "step": 2515 + }, + { + "epoch": 0.362381363244176, + "grad_norm": 0.22408515127502746, + "learning_rate": 1.928855582322238e-05, + "loss": 0.5061, + "step": 2520 + }, + { + "epoch": 0.3631003738855335, + "grad_norm": 0.2218720986538149, + "learning_rate": 1.9285763961292738e-05, + "loss": 0.4987, + "step": 2525 + }, + { + "epoch": 0.363819384526891, + "grad_norm": 0.23440751496432688, + "learning_rate": 1.9282966835004177e-05, + "loss": 0.4959, + "step": 2530 + }, + { + "epoch": 0.3645383951682485, + "grad_norm": 0.23791499533194543, + "learning_rate": 1.9280164445942467e-05, + "loss": 0.5045, + "step": 2535 + }, + { + "epoch": 0.365257405809606, + "grad_norm": 0.23397274808648272, + "learning_rate": 1.927735679569636e-05, + "loss": 0.51, + "step": 2540 + }, + { + "epoch": 0.36597641645096346, + "grad_norm": 0.22441000781632436, + "learning_rate": 1.9274543885857594e-05, + "loss": 0.5246, + "step": 2545 + }, + { + "epoch": 0.36669542709232095, + "grad_norm": 0.22439109575711147, + "learning_rate": 1.9271725718020877e-05, + "loss": 0.5163, + "step": 2550 + }, + { + "epoch": 0.36741443773367843, + "grad_norm": 0.23923944832721677, + "learning_rate": 1.9268902293783918e-05, + "loss": 0.4949, + "step": 2555 + }, + { + "epoch": 0.368133448375036, + "grad_norm": 0.22120021514337773, + "learning_rate": 1.926607361474739e-05, + "loss": 0.5122, + "step": 2560 + }, + { + "epoch": 0.36885245901639346, + "grad_norm": 0.2371975422289003, + "learning_rate": 1.9263239682514953e-05, + "loss": 0.5214, + "step": 2565 + }, + { + "epoch": 0.36957146965775095, + "grad_norm": 0.232241975255212, + "learning_rate": 1.9260400498693236e-05, + "loss": 0.5031, + "step": 2570 + }, + { + "epoch": 0.37029048029910844, + "grad_norm": 0.22581754155054365, + "learning_rate": 1.9257556064891858e-05, + "loss": 0.5011, + "step": 2575 + }, + { + "epoch": 0.3710094909404659, + "grad_norm": 0.24102315491721474, + "learning_rate": 1.9254706382723404e-05, + "loss": 0.518, + "step": 2580 + }, + { + "epoch": 0.3717285015818234, + "grad_norm": 0.2224274974729962, + "learning_rate": 1.925185145380344e-05, + "loss": 0.4986, + "step": 2585 + }, + { + "epoch": 0.3724475122231809, + "grad_norm": 0.2352443275901719, + "learning_rate": 1.9248991279750507e-05, + "loss": 0.5067, + "step": 2590 + }, + { + "epoch": 0.3731665228645384, + "grad_norm": 0.21567561516996372, + "learning_rate": 1.9246125862186116e-05, + "loss": 0.5139, + "step": 2595 + }, + { + "epoch": 0.3738855335058959, + "grad_norm": 0.222006321644596, + "learning_rate": 1.924325520273475e-05, + "loss": 0.5028, + "step": 2600 + }, + { + "epoch": 0.37460454414725336, + "grad_norm": 0.22928775820841665, + "learning_rate": 1.924037930302387e-05, + "loss": 0.5028, + "step": 2605 + }, + { + "epoch": 0.37532355478861085, + "grad_norm": 0.2316016899827689, + "learning_rate": 1.9237498164683898e-05, + "loss": 0.5161, + "step": 2610 + }, + { + "epoch": 0.3760425654299684, + "grad_norm": 0.22536272794248402, + "learning_rate": 1.9234611789348242e-05, + "loss": 0.5109, + "step": 2615 + }, + { + "epoch": 0.3767615760713259, + "grad_norm": 0.23014273480588587, + "learning_rate": 1.9231720178653254e-05, + "loss": 0.5029, + "step": 2620 + }, + { + "epoch": 0.37748058671268336, + "grad_norm": 0.22814428980830126, + "learning_rate": 1.9228823334238284e-05, + "loss": 0.5022, + "step": 2625 + }, + { + "epoch": 0.37819959735404085, + "grad_norm": 0.2167038042325131, + "learning_rate": 1.9225921257745623e-05, + "loss": 0.5108, + "step": 2630 + }, + { + "epoch": 0.37891860799539834, + "grad_norm": 0.23434021986166953, + "learning_rate": 1.9223013950820542e-05, + "loss": 0.5064, + "step": 2635 + }, + { + "epoch": 0.3796376186367558, + "grad_norm": 0.225965873014395, + "learning_rate": 1.922010141511128e-05, + "loss": 0.514, + "step": 2640 + }, + { + "epoch": 0.3803566292781133, + "grad_norm": 0.22919798332492977, + "learning_rate": 1.921718365226903e-05, + "loss": 0.4962, + "step": 2645 + }, + { + "epoch": 0.3810756399194708, + "grad_norm": 0.23126416170567896, + "learning_rate": 1.921426066394795e-05, + "loss": 0.521, + "step": 2650 + }, + { + "epoch": 0.3817946505608283, + "grad_norm": 0.25582988216177793, + "learning_rate": 1.9211332451805173e-05, + "loss": 0.5261, + "step": 2655 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.2281178044654486, + "learning_rate": 1.9208399017500773e-05, + "loss": 0.503, + "step": 2660 + }, + { + "epoch": 0.38323267184354326, + "grad_norm": 0.2631327587490055, + "learning_rate": 1.920546036269781e-05, + "loss": 0.5046, + "step": 2665 + }, + { + "epoch": 0.3839516824849008, + "grad_norm": 0.23088230748866914, + "learning_rate": 1.9202516489062273e-05, + "loss": 0.5008, + "step": 2670 + }, + { + "epoch": 0.3846706931262583, + "grad_norm": 0.23527118660089594, + "learning_rate": 1.9199567398263136e-05, + "loss": 0.5154, + "step": 2675 + }, + { + "epoch": 0.3853897037676158, + "grad_norm": 0.293795071835734, + "learning_rate": 1.919661309197232e-05, + "loss": 0.5095, + "step": 2680 + }, + { + "epoch": 0.38610871440897326, + "grad_norm": 0.239008766521943, + "learning_rate": 1.9193653571864706e-05, + "loss": 0.5361, + "step": 2685 + }, + { + "epoch": 0.38682772505033075, + "grad_norm": 0.2317785167460494, + "learning_rate": 1.9190688839618122e-05, + "loss": 0.5263, + "step": 2690 + }, + { + "epoch": 0.38754673569168824, + "grad_norm": 0.2247915886082916, + "learning_rate": 1.9187718896913364e-05, + "loss": 0.5206, + "step": 2695 + }, + { + "epoch": 0.3882657463330457, + "grad_norm": 0.2342002775002291, + "learning_rate": 1.918474374543417e-05, + "loss": 0.5148, + "step": 2700 + }, + { + "epoch": 0.3889847569744032, + "grad_norm": 0.24713293310816917, + "learning_rate": 1.918176338686724e-05, + "loss": 0.5291, + "step": 2705 + }, + { + "epoch": 0.3897037676157607, + "grad_norm": 0.2283808207676213, + "learning_rate": 1.9178777822902223e-05, + "loss": 0.5187, + "step": 2710 + }, + { + "epoch": 0.3904227782571182, + "grad_norm": 0.23504214119034128, + "learning_rate": 1.9175787055231713e-05, + "loss": 0.5146, + "step": 2715 + }, + { + "epoch": 0.3911417888984757, + "grad_norm": 0.2278359350821238, + "learning_rate": 1.917279108555127e-05, + "loss": 0.5052, + "step": 2720 + }, + { + "epoch": 0.3918607995398332, + "grad_norm": 0.2163025374806738, + "learning_rate": 1.9169789915559384e-05, + "loss": 0.508, + "step": 2725 + }, + { + "epoch": 0.3925798101811907, + "grad_norm": 0.22661359197059017, + "learning_rate": 1.91667835469575e-05, + "loss": 0.5054, + "step": 2730 + }, + { + "epoch": 0.3932988208225482, + "grad_norm": 0.2189191212011496, + "learning_rate": 1.916377198145002e-05, + "loss": 0.5049, + "step": 2735 + }, + { + "epoch": 0.3940178314639057, + "grad_norm": 0.22112339418211252, + "learning_rate": 1.9160755220744285e-05, + "loss": 0.507, + "step": 2740 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 0.24023378549204696, + "learning_rate": 1.9157733266550577e-05, + "loss": 0.5001, + "step": 2745 + }, + { + "epoch": 0.39545585274662065, + "grad_norm": 0.23085735744919814, + "learning_rate": 1.9154706120582124e-05, + "loss": 0.4964, + "step": 2750 + }, + { + "epoch": 0.39617486338797814, + "grad_norm": 0.22201978435599948, + "learning_rate": 1.9151673784555104e-05, + "loss": 0.5106, + "step": 2755 + }, + { + "epoch": 0.3968938740293356, + "grad_norm": 0.22146871558017686, + "learning_rate": 1.914863626018863e-05, + "loss": 0.521, + "step": 2760 + }, + { + "epoch": 0.3976128846706931, + "grad_norm": 0.261735920771169, + "learning_rate": 1.9145593549204765e-05, + "loss": 0.5158, + "step": 2765 + }, + { + "epoch": 0.3983318953120506, + "grad_norm": 0.23628237933283103, + "learning_rate": 1.9142545653328498e-05, + "loss": 0.5125, + "step": 2770 + }, + { + "epoch": 0.3990509059534081, + "grad_norm": 0.2244030842478789, + "learning_rate": 1.9139492574287773e-05, + "loss": 0.5065, + "step": 2775 + }, + { + "epoch": 0.39976991659476563, + "grad_norm": 0.22124683679006732, + "learning_rate": 1.9136434313813464e-05, + "loss": 0.5148, + "step": 2780 + }, + { + "epoch": 0.4004889272361231, + "grad_norm": 0.22194477521719427, + "learning_rate": 1.9133370873639384e-05, + "loss": 0.5187, + "step": 2785 + }, + { + "epoch": 0.4012079378774806, + "grad_norm": 0.2575630583603113, + "learning_rate": 1.913030225550228e-05, + "loss": 0.5218, + "step": 2790 + }, + { + "epoch": 0.4019269485188381, + "grad_norm": 0.22569098013577954, + "learning_rate": 1.9127228461141842e-05, + "loss": 0.4918, + "step": 2795 + }, + { + "epoch": 0.4026459591601956, + "grad_norm": 0.2310480906609719, + "learning_rate": 1.9124149492300688e-05, + "loss": 0.5119, + "step": 2800 + }, + { + "epoch": 0.40336496980155306, + "grad_norm": 0.23554005972607317, + "learning_rate": 1.9121065350724373e-05, + "loss": 0.5052, + "step": 2805 + }, + { + "epoch": 0.40408398044291055, + "grad_norm": 0.22735551853092875, + "learning_rate": 1.9117976038161382e-05, + "loss": 0.5191, + "step": 2810 + }, + { + "epoch": 0.40480299108426804, + "grad_norm": 0.22047183159407308, + "learning_rate": 1.911488155636313e-05, + "loss": 0.5108, + "step": 2815 + }, + { + "epoch": 0.4055220017256255, + "grad_norm": 0.21509880016454425, + "learning_rate": 1.9111781907083965e-05, + "loss": 0.5306, + "step": 2820 + }, + { + "epoch": 0.406241012366983, + "grad_norm": 0.22716977879847486, + "learning_rate": 1.9108677092081168e-05, + "loss": 0.5072, + "step": 2825 + }, + { + "epoch": 0.4069600230083405, + "grad_norm": 0.24450214167605047, + "learning_rate": 1.910556711311495e-05, + "loss": 0.505, + "step": 2830 + }, + { + "epoch": 0.40767903364969804, + "grad_norm": 0.2327700820207908, + "learning_rate": 1.910245197194843e-05, + "loss": 0.5205, + "step": 2835 + }, + { + "epoch": 0.40839804429105553, + "grad_norm": 0.21825242144579582, + "learning_rate": 1.9099331670347685e-05, + "loss": 0.5101, + "step": 2840 + }, + { + "epoch": 0.409117054932413, + "grad_norm": 0.23767483509194245, + "learning_rate": 1.909620621008169e-05, + "loss": 0.5218, + "step": 2845 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.22889357601500054, + "learning_rate": 1.909307559292236e-05, + "loss": 0.5169, + "step": 2850 + }, + { + "epoch": 0.410555076215128, + "grad_norm": 0.22625666452282742, + "learning_rate": 1.908993982064453e-05, + "loss": 0.5072, + "step": 2855 + }, + { + "epoch": 0.4112740868564855, + "grad_norm": 0.21580683260119565, + "learning_rate": 1.9086798895025955e-05, + "loss": 0.5069, + "step": 2860 + }, + { + "epoch": 0.41199309749784296, + "grad_norm": 0.22956520306395545, + "learning_rate": 1.9083652817847313e-05, + "loss": 0.5215, + "step": 2865 + }, + { + "epoch": 0.41271210813920045, + "grad_norm": 0.23021825019034187, + "learning_rate": 1.9080501590892204e-05, + "loss": 0.5184, + "step": 2870 + }, + { + "epoch": 0.41343111878055794, + "grad_norm": 0.22176100614050664, + "learning_rate": 1.9077345215947148e-05, + "loss": 0.4997, + "step": 2875 + }, + { + "epoch": 0.4141501294219154, + "grad_norm": 0.27256463340242076, + "learning_rate": 1.9074183694801582e-05, + "loss": 0.5064, + "step": 2880 + }, + { + "epoch": 0.4148691400632729, + "grad_norm": 0.2262004936961111, + "learning_rate": 1.9071017029247855e-05, + "loss": 0.5125, + "step": 2885 + }, + { + "epoch": 0.41558815070463045, + "grad_norm": 0.24563263769390858, + "learning_rate": 1.9067845221081244e-05, + "loss": 0.5152, + "step": 2890 + }, + { + "epoch": 0.41630716134598794, + "grad_norm": 0.22076043331812095, + "learning_rate": 1.906466827209994e-05, + "loss": 0.5109, + "step": 2895 + }, + { + "epoch": 0.41702617198734543, + "grad_norm": 0.23198323940026291, + "learning_rate": 1.9061486184105032e-05, + "loss": 0.5149, + "step": 2900 + }, + { + "epoch": 0.4177451826287029, + "grad_norm": 0.23598266909227508, + "learning_rate": 1.905829895890054e-05, + "loss": 0.5223, + "step": 2905 + }, + { + "epoch": 0.4184641932700604, + "grad_norm": 0.228514639309264, + "learning_rate": 1.9055106598293397e-05, + "loss": 0.5058, + "step": 2910 + }, + { + "epoch": 0.4191832039114179, + "grad_norm": 0.23800299382683535, + "learning_rate": 1.9051909104093435e-05, + "loss": 0.5058, + "step": 2915 + }, + { + "epoch": 0.4199022145527754, + "grad_norm": 0.23133785515445354, + "learning_rate": 1.90487064781134e-05, + "loss": 0.5213, + "step": 2920 + }, + { + "epoch": 0.42062122519413286, + "grad_norm": 0.22342554440272905, + "learning_rate": 1.9045498722168955e-05, + "loss": 0.4991, + "step": 2925 + }, + { + "epoch": 0.42134023583549035, + "grad_norm": 0.22853945252588564, + "learning_rate": 1.904228583807867e-05, + "loss": 0.5006, + "step": 2930 + }, + { + "epoch": 0.42205924647684784, + "grad_norm": 0.22268903457409447, + "learning_rate": 1.903906782766401e-05, + "loss": 0.5138, + "step": 2935 + }, + { + "epoch": 0.4227782571182053, + "grad_norm": 0.23470747813012946, + "learning_rate": 1.903584469274936e-05, + "loss": 0.507, + "step": 2940 + }, + { + "epoch": 0.42349726775956287, + "grad_norm": 0.23158290190173897, + "learning_rate": 1.9032616435162006e-05, + "loss": 0.494, + "step": 2945 + }, + { + "epoch": 0.42421627840092035, + "grad_norm": 0.23651030424701674, + "learning_rate": 1.9029383056732137e-05, + "loss": 0.5192, + "step": 2950 + }, + { + "epoch": 0.42493528904227784, + "grad_norm": 0.22640594430508912, + "learning_rate": 1.902614455929284e-05, + "loss": 0.5259, + "step": 2955 + }, + { + "epoch": 0.42565429968363533, + "grad_norm": 0.22262895192874665, + "learning_rate": 1.9022900944680115e-05, + "loss": 0.5067, + "step": 2960 + }, + { + "epoch": 0.4263733103249928, + "grad_norm": 0.21766140475746906, + "learning_rate": 1.9019652214732856e-05, + "loss": 0.4988, + "step": 2965 + }, + { + "epoch": 0.4270923209663503, + "grad_norm": 0.23215571807401733, + "learning_rate": 1.9016398371292865e-05, + "loss": 0.5053, + "step": 2970 + }, + { + "epoch": 0.4278113316077078, + "grad_norm": 0.22387635197718406, + "learning_rate": 1.9013139416204827e-05, + "loss": 0.5277, + "step": 2975 + }, + { + "epoch": 0.4285303422490653, + "grad_norm": 0.21890148025509146, + "learning_rate": 1.9009875351316338e-05, + "loss": 0.5085, + "step": 2980 + }, + { + "epoch": 0.42924935289042276, + "grad_norm": 0.2253750179966219, + "learning_rate": 1.9006606178477887e-05, + "loss": 0.5131, + "step": 2985 + }, + { + "epoch": 0.42996836353178025, + "grad_norm": 0.22408290204012185, + "learning_rate": 1.9003331899542864e-05, + "loss": 0.5223, + "step": 2990 + }, + { + "epoch": 0.43068737417313774, + "grad_norm": 0.24372803124516482, + "learning_rate": 1.9000052516367548e-05, + "loss": 0.5124, + "step": 2995 + }, + { + "epoch": 0.4314063848144953, + "grad_norm": 0.21808115918337018, + "learning_rate": 1.8996768030811105e-05, + "loss": 0.5102, + "step": 3000 + }, + { + "epoch": 0.43212539545585277, + "grad_norm": 0.23243284851206658, + "learning_rate": 1.899347844473561e-05, + "loss": 0.517, + "step": 3005 + }, + { + "epoch": 0.43284440609721025, + "grad_norm": 0.22815958327074795, + "learning_rate": 1.899018376000602e-05, + "loss": 0.522, + "step": 3010 + }, + { + "epoch": 0.43356341673856774, + "grad_norm": 0.22171076721553623, + "learning_rate": 1.8986883978490183e-05, + "loss": 0.5072, + "step": 3015 + }, + { + "epoch": 0.43428242737992523, + "grad_norm": 0.23723540529297746, + "learning_rate": 1.8983579102058832e-05, + "loss": 0.5176, + "step": 3020 + }, + { + "epoch": 0.4350014380212827, + "grad_norm": 0.22060421243897868, + "learning_rate": 1.8980269132585603e-05, + "loss": 0.4943, + "step": 3025 + }, + { + "epoch": 0.4357204486626402, + "grad_norm": 0.26456992213454594, + "learning_rate": 1.8976954071947e-05, + "loss": 0.5068, + "step": 3030 + }, + { + "epoch": 0.4364394593039977, + "grad_norm": 0.24097182160658487, + "learning_rate": 1.8973633922022435e-05, + "loss": 0.51, + "step": 3035 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.23089530329520278, + "learning_rate": 1.8970308684694186e-05, + "loss": 0.5073, + "step": 3040 + }, + { + "epoch": 0.43787748058671266, + "grad_norm": 0.22460564208903933, + "learning_rate": 1.8966978361847426e-05, + "loss": 0.4963, + "step": 3045 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 0.23903022133946736, + "learning_rate": 1.8963642955370203e-05, + "loss": 0.5141, + "step": 3050 + }, + { + "epoch": 0.4393155018694277, + "grad_norm": 0.23200855596333272, + "learning_rate": 1.8960302467153457e-05, + "loss": 0.5134, + "step": 3055 + }, + { + "epoch": 0.4400345125107852, + "grad_norm": 0.2438151089386712, + "learning_rate": 1.8956956899091004e-05, + "loss": 0.4802, + "step": 3060 + }, + { + "epoch": 0.44075352315214267, + "grad_norm": 0.23012560648822744, + "learning_rate": 1.8953606253079537e-05, + "loss": 0.5116, + "step": 3065 + }, + { + "epoch": 0.44147253379350015, + "grad_norm": 0.22946741307925678, + "learning_rate": 1.8950250531018636e-05, + "loss": 0.5165, + "step": 3070 + }, + { + "epoch": 0.44219154443485764, + "grad_norm": 0.22590133613817706, + "learning_rate": 1.8946889734810744e-05, + "loss": 0.5089, + "step": 3075 + }, + { + "epoch": 0.44291055507621513, + "grad_norm": 0.23305060264263988, + "learning_rate": 1.89435238663612e-05, + "loss": 0.5143, + "step": 3080 + }, + { + "epoch": 0.4436295657175726, + "grad_norm": 0.23846789632242757, + "learning_rate": 1.894015292757821e-05, + "loss": 0.5098, + "step": 3085 + }, + { + "epoch": 0.4443485763589301, + "grad_norm": 0.23320831247194246, + "learning_rate": 1.893677692037284e-05, + "loss": 0.5181, + "step": 3090 + }, + { + "epoch": 0.4450675870002876, + "grad_norm": 0.22608827407258242, + "learning_rate": 1.8933395846659057e-05, + "loss": 0.5183, + "step": 3095 + }, + { + "epoch": 0.4457865976416451, + "grad_norm": 0.2284432845740079, + "learning_rate": 1.8930009708353675e-05, + "loss": 0.5116, + "step": 3100 + }, + { + "epoch": 0.44650560828300256, + "grad_norm": 0.24099811110806968, + "learning_rate": 1.89266185073764e-05, + "loss": 0.5091, + "step": 3105 + }, + { + "epoch": 0.4472246189243601, + "grad_norm": 0.2320885348377546, + "learning_rate": 1.8923222245649796e-05, + "loss": 0.5211, + "step": 3110 + }, + { + "epoch": 0.4479436295657176, + "grad_norm": 0.2225444619602451, + "learning_rate": 1.891982092509929e-05, + "loss": 0.5132, + "step": 3115 + }, + { + "epoch": 0.4486626402070751, + "grad_norm": 0.21887758463857643, + "learning_rate": 1.89164145476532e-05, + "loss": 0.5082, + "step": 3120 + }, + { + "epoch": 0.44938165084843257, + "grad_norm": 0.21821324265710812, + "learning_rate": 1.8913003115242686e-05, + "loss": 0.4948, + "step": 3125 + }, + { + "epoch": 0.45010066148979005, + "grad_norm": 0.22546647289935937, + "learning_rate": 1.8909586629801788e-05, + "loss": 0.4875, + "step": 3130 + }, + { + "epoch": 0.45081967213114754, + "grad_norm": 0.2388867270357045, + "learning_rate": 1.8906165093267407e-05, + "loss": 0.5105, + "step": 3135 + }, + { + "epoch": 0.45153868277250503, + "grad_norm": 0.22303569790676106, + "learning_rate": 1.8902738507579305e-05, + "loss": 0.5039, + "step": 3140 + }, + { + "epoch": 0.4522576934138625, + "grad_norm": 0.227972234263652, + "learning_rate": 1.8899306874680113e-05, + "loss": 0.4885, + "step": 3145 + }, + { + "epoch": 0.45297670405522, + "grad_norm": 0.2267767566599487, + "learning_rate": 1.8895870196515314e-05, + "loss": 0.5049, + "step": 3150 + }, + { + "epoch": 0.4536957146965775, + "grad_norm": 0.21945728338555323, + "learning_rate": 1.8892428475033264e-05, + "loss": 0.5137, + "step": 3155 + }, + { + "epoch": 0.454414725337935, + "grad_norm": 0.22083748984649187, + "learning_rate": 1.8888981712185166e-05, + "loss": 0.5106, + "step": 3160 + }, + { + "epoch": 0.4551337359792925, + "grad_norm": 0.2401040695033316, + "learning_rate": 1.888552990992509e-05, + "loss": 0.5157, + "step": 3165 + }, + { + "epoch": 0.45585274662065, + "grad_norm": 0.24329074291054098, + "learning_rate": 1.888207307020995e-05, + "loss": 0.5124, + "step": 3170 + }, + { + "epoch": 0.4565717572620075, + "grad_norm": 0.22721196193725088, + "learning_rate": 1.887861119499954e-05, + "loss": 0.5184, + "step": 3175 + }, + { + "epoch": 0.457290767903365, + "grad_norm": 0.22097197219531742, + "learning_rate": 1.887514428625648e-05, + "loss": 0.5118, + "step": 3180 + }, + { + "epoch": 0.45800977854472247, + "grad_norm": 0.22942161994403518, + "learning_rate": 1.8871672345946265e-05, + "loss": 0.5002, + "step": 3185 + }, + { + "epoch": 0.45872878918607995, + "grad_norm": 0.23294479900892548, + "learning_rate": 1.8868195376037234e-05, + "loss": 0.5106, + "step": 3190 + }, + { + "epoch": 0.45944779982743744, + "grad_norm": 0.220153166817927, + "learning_rate": 1.8864713378500574e-05, + "loss": 0.5046, + "step": 3195 + }, + { + "epoch": 0.46016681046879493, + "grad_norm": 0.23782734580650305, + "learning_rate": 1.886122635531033e-05, + "loss": 0.5083, + "step": 3200 + }, + { + "epoch": 0.4608858211101524, + "grad_norm": 0.22201878015890575, + "learning_rate": 1.8857734308443392e-05, + "loss": 0.4996, + "step": 3205 + }, + { + "epoch": 0.4616048317515099, + "grad_norm": 0.25951882547960176, + "learning_rate": 1.8854237239879505e-05, + "loss": 0.5186, + "step": 3210 + }, + { + "epoch": 0.4623238423928674, + "grad_norm": 0.23372725500667854, + "learning_rate": 1.8850735151601243e-05, + "loss": 0.5137, + "step": 3215 + }, + { + "epoch": 0.46304285303422493, + "grad_norm": 0.22203328341904643, + "learning_rate": 1.8847228045594047e-05, + "loss": 0.5058, + "step": 3220 + }, + { + "epoch": 0.4637618636755824, + "grad_norm": 0.22777680675837877, + "learning_rate": 1.884371592384619e-05, + "loss": 0.514, + "step": 3225 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.2535600439046393, + "learning_rate": 1.8840198788348795e-05, + "loss": 0.521, + "step": 3230 + }, + { + "epoch": 0.4651998849582974, + "grad_norm": 0.23929596753578028, + "learning_rate": 1.8836676641095815e-05, + "loss": 0.5041, + "step": 3235 + }, + { + "epoch": 0.4659188955996549, + "grad_norm": 0.22737419484986415, + "learning_rate": 1.8833149484084064e-05, + "loss": 0.4928, + "step": 3240 + }, + { + "epoch": 0.46663790624101237, + "grad_norm": 0.22747158371404952, + "learning_rate": 1.8829617319313183e-05, + "loss": 0.5176, + "step": 3245 + }, + { + "epoch": 0.46735691688236985, + "grad_norm": 0.2350098884737649, + "learning_rate": 1.882608014878565e-05, + "loss": 0.5063, + "step": 3250 + }, + { + "epoch": 0.46807592752372734, + "grad_norm": 0.24683317655013465, + "learning_rate": 1.8822537974506794e-05, + "loss": 0.5138, + "step": 3255 + }, + { + "epoch": 0.46879493816508483, + "grad_norm": 0.2321129798855353, + "learning_rate": 1.8818990798484766e-05, + "loss": 0.5237, + "step": 3260 + }, + { + "epoch": 0.4695139488064423, + "grad_norm": 0.2313513696629806, + "learning_rate": 1.8815438622730563e-05, + "loss": 0.5094, + "step": 3265 + }, + { + "epoch": 0.4702329594477998, + "grad_norm": 0.22221489438951242, + "learning_rate": 1.8811881449258008e-05, + "loss": 0.5257, + "step": 3270 + }, + { + "epoch": 0.47095197008915735, + "grad_norm": 0.2309891648111513, + "learning_rate": 1.8808319280083766e-05, + "loss": 0.4929, + "step": 3275 + }, + { + "epoch": 0.47167098073051483, + "grad_norm": 0.21935255772772466, + "learning_rate": 1.880475211722733e-05, + "loss": 0.5007, + "step": 3280 + }, + { + "epoch": 0.4723899913718723, + "grad_norm": 0.21443691347510438, + "learning_rate": 1.8801179962711022e-05, + "loss": 0.5071, + "step": 3285 + }, + { + "epoch": 0.4731090020132298, + "grad_norm": 0.22881942399765773, + "learning_rate": 1.8797602818559996e-05, + "loss": 0.5073, + "step": 3290 + }, + { + "epoch": 0.4738280126545873, + "grad_norm": 0.22744544291360294, + "learning_rate": 1.879402068680224e-05, + "loss": 0.5131, + "step": 3295 + }, + { + "epoch": 0.4745470232959448, + "grad_norm": 0.22692909860000035, + "learning_rate": 1.879043356946856e-05, + "loss": 0.5133, + "step": 3300 + }, + { + "epoch": 0.47526603393730227, + "grad_norm": 0.2258584203109247, + "learning_rate": 1.8786841468592592e-05, + "loss": 0.4988, + "step": 3305 + }, + { + "epoch": 0.47598504457865976, + "grad_norm": 0.2329578824209415, + "learning_rate": 1.8783244386210802e-05, + "loss": 0.5066, + "step": 3310 + }, + { + "epoch": 0.47670405522001724, + "grad_norm": 0.2178009959841328, + "learning_rate": 1.8779642324362475e-05, + "loss": 0.5135, + "step": 3315 + }, + { + "epoch": 0.47742306586137473, + "grad_norm": 0.22999756735795288, + "learning_rate": 1.877603528508972e-05, + "loss": 0.5033, + "step": 3320 + }, + { + "epoch": 0.4781420765027322, + "grad_norm": 0.23474329467975602, + "learning_rate": 1.8772423270437467e-05, + "loss": 0.5043, + "step": 3325 + }, + { + "epoch": 0.47886108714408976, + "grad_norm": 0.227373395841068, + "learning_rate": 1.876880628245347e-05, + "loss": 0.5365, + "step": 3330 + }, + { + "epoch": 0.47958009778544725, + "grad_norm": 0.22867563621139628, + "learning_rate": 1.87651843231883e-05, + "loss": 0.4967, + "step": 3335 + }, + { + "epoch": 0.48029910842680473, + "grad_norm": 0.2556750962454127, + "learning_rate": 1.8761557394695347e-05, + "loss": 0.4932, + "step": 3340 + }, + { + "epoch": 0.4810181190681622, + "grad_norm": 0.23099567532789703, + "learning_rate": 1.8757925499030817e-05, + "loss": 0.5051, + "step": 3345 + }, + { + "epoch": 0.4817371297095197, + "grad_norm": 0.23416258925845912, + "learning_rate": 1.8754288638253734e-05, + "loss": 0.5, + "step": 3350 + }, + { + "epoch": 0.4824561403508772, + "grad_norm": 0.23674712863753772, + "learning_rate": 1.875064681442594e-05, + "loss": 0.4995, + "step": 3355 + }, + { + "epoch": 0.4831751509922347, + "grad_norm": 0.2361268981891666, + "learning_rate": 1.8747000029612077e-05, + "loss": 0.5046, + "step": 3360 + }, + { + "epoch": 0.48389416163359217, + "grad_norm": 0.22590461729069614, + "learning_rate": 1.8743348285879615e-05, + "loss": 0.505, + "step": 3365 + }, + { + "epoch": 0.48461317227494966, + "grad_norm": 0.2318060822698632, + "learning_rate": 1.8739691585298833e-05, + "loss": 0.5107, + "step": 3370 + }, + { + "epoch": 0.48533218291630714, + "grad_norm": 0.24037948201072387, + "learning_rate": 1.8736029929942813e-05, + "loss": 0.5119, + "step": 3375 + }, + { + "epoch": 0.48605119355766463, + "grad_norm": 0.2319768646318957, + "learning_rate": 1.8732363321887447e-05, + "loss": 0.5179, + "step": 3380 + }, + { + "epoch": 0.48677020419902217, + "grad_norm": 0.23122665039326531, + "learning_rate": 1.872869176321144e-05, + "loss": 0.5049, + "step": 3385 + }, + { + "epoch": 0.48748921484037966, + "grad_norm": 0.22506661120445953, + "learning_rate": 1.87250152559963e-05, + "loss": 0.506, + "step": 3390 + }, + { + "epoch": 0.48820822548173715, + "grad_norm": 0.22671157330761432, + "learning_rate": 1.8721333802326345e-05, + "loss": 0.5124, + "step": 3395 + }, + { + "epoch": 0.48892723612309463, + "grad_norm": 0.2262392387165888, + "learning_rate": 1.871764740428869e-05, + "loss": 0.5075, + "step": 3400 + }, + { + "epoch": 0.4896462467644521, + "grad_norm": 0.23953668318855156, + "learning_rate": 1.871395606397326e-05, + "loss": 0.5035, + "step": 3405 + }, + { + "epoch": 0.4903652574058096, + "grad_norm": 0.22816597207508776, + "learning_rate": 1.8710259783472778e-05, + "loss": 0.5217, + "step": 3410 + }, + { + "epoch": 0.4910842680471671, + "grad_norm": 0.22589418481821869, + "learning_rate": 1.8706558564882766e-05, + "loss": 0.5225, + "step": 3415 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.23084269685354364, + "learning_rate": 1.8702852410301556e-05, + "loss": 0.4966, + "step": 3420 + }, + { + "epoch": 0.49252228932988207, + "grad_norm": 0.22922196106101597, + "learning_rate": 1.8699141321830257e-05, + "loss": 0.4897, + "step": 3425 + }, + { + "epoch": 0.49324129997123956, + "grad_norm": 0.22904199398424144, + "learning_rate": 1.8695425301572802e-05, + "loss": 0.4981, + "step": 3430 + }, + { + "epoch": 0.49396031061259704, + "grad_norm": 0.23056356839049091, + "learning_rate": 1.8691704351635903e-05, + "loss": 0.4904, + "step": 3435 + }, + { + "epoch": 0.4946793212539546, + "grad_norm": 0.27576248579574547, + "learning_rate": 1.8687978474129065e-05, + "loss": 0.5119, + "step": 3440 + }, + { + "epoch": 0.49539833189531207, + "grad_norm": 0.22644457374568444, + "learning_rate": 1.8684247671164596e-05, + "loss": 0.5015, + "step": 3445 + }, + { + "epoch": 0.49611734253666956, + "grad_norm": 0.22092616693572895, + "learning_rate": 1.868051194485759e-05, + "loss": 0.4963, + "step": 3450 + }, + { + "epoch": 0.49683635317802705, + "grad_norm": 0.2279874573282857, + "learning_rate": 1.8676771297325943e-05, + "loss": 0.4986, + "step": 3455 + }, + { + "epoch": 0.49755536381938453, + "grad_norm": 0.22574757462624237, + "learning_rate": 1.8673025730690323e-05, + "loss": 0.5125, + "step": 3460 + }, + { + "epoch": 0.498274374460742, + "grad_norm": 0.24253004022010075, + "learning_rate": 1.8669275247074197e-05, + "loss": 0.5042, + "step": 3465 + }, + { + "epoch": 0.4989933851020995, + "grad_norm": 0.22875289480420072, + "learning_rate": 1.8665519848603825e-05, + "loss": 0.513, + "step": 3470 + }, + { + "epoch": 0.499712395743457, + "grad_norm": 0.22613927853567778, + "learning_rate": 1.8661759537408245e-05, + "loss": 0.5026, + "step": 3475 + }, + { + "epoch": 0.5004314063848145, + "grad_norm": 0.23799705443593844, + "learning_rate": 1.865799431561928e-05, + "loss": 0.5166, + "step": 3480 + }, + { + "epoch": 0.501150417026172, + "grad_norm": 0.234919497465417, + "learning_rate": 1.865422418537154e-05, + "loss": 0.5125, + "step": 3485 + }, + { + "epoch": 0.5018694276675295, + "grad_norm": 0.2186332262355146, + "learning_rate": 1.8650449148802416e-05, + "loss": 0.506, + "step": 3490 + }, + { + "epoch": 0.502588438308887, + "grad_norm": 0.22872419867177446, + "learning_rate": 1.8646669208052086e-05, + "loss": 0.4887, + "step": 3495 + }, + { + "epoch": 0.5033074489502445, + "grad_norm": 0.2367736854157651, + "learning_rate": 1.86428843652635e-05, + "loss": 0.5178, + "step": 3500 + }, + { + "epoch": 0.504026459591602, + "grad_norm": 0.22439560833364647, + "learning_rate": 1.8639094622582395e-05, + "loss": 0.5116, + "step": 3505 + }, + { + "epoch": 0.5047454702329595, + "grad_norm": 0.22908813891521232, + "learning_rate": 1.8635299982157272e-05, + "loss": 0.4907, + "step": 3510 + }, + { + "epoch": 0.505464480874317, + "grad_norm": 0.23232159031491972, + "learning_rate": 1.8631500446139436e-05, + "loss": 0.5194, + "step": 3515 + }, + { + "epoch": 0.5061834915156744, + "grad_norm": 0.2283220143978447, + "learning_rate": 1.8627696016682934e-05, + "loss": 0.5001, + "step": 3520 + }, + { + "epoch": 0.5069025021570319, + "grad_norm": 0.22980408461545354, + "learning_rate": 1.8623886695944612e-05, + "loss": 0.5107, + "step": 3525 + }, + { + "epoch": 0.5076215127983894, + "grad_norm": 0.23722746787328844, + "learning_rate": 1.8620072486084075e-05, + "loss": 0.5066, + "step": 3530 + }, + { + "epoch": 0.5083405234397469, + "grad_norm": 0.23287381111820576, + "learning_rate": 1.8616253389263713e-05, + "loss": 0.5078, + "step": 3535 + }, + { + "epoch": 0.5090595340811044, + "grad_norm": 0.23693781129127364, + "learning_rate": 1.8612429407648668e-05, + "loss": 0.5255, + "step": 3540 + }, + { + "epoch": 0.5097785447224619, + "grad_norm": 0.2698671806909946, + "learning_rate": 1.860860054340687e-05, + "loss": 0.5131, + "step": 3545 + }, + { + "epoch": 0.5104975553638194, + "grad_norm": 0.22671582879050173, + "learning_rate": 1.8604766798709005e-05, + "loss": 0.5018, + "step": 3550 + }, + { + "epoch": 0.5112165660051768, + "grad_norm": 0.22718775482090045, + "learning_rate": 1.8600928175728535e-05, + "loss": 0.4973, + "step": 3555 + }, + { + "epoch": 0.5119355766465343, + "grad_norm": 0.21840372466561936, + "learning_rate": 1.8597084676641677e-05, + "loss": 0.4842, + "step": 3560 + }, + { + "epoch": 0.5126545872878918, + "grad_norm": 0.21210979676494143, + "learning_rate": 1.859323630362742e-05, + "loss": 0.4945, + "step": 3565 + }, + { + "epoch": 0.5133735979292493, + "grad_norm": 0.21804206661910921, + "learning_rate": 1.8589383058867515e-05, + "loss": 0.4896, + "step": 3570 + }, + { + "epoch": 0.5140926085706068, + "grad_norm": 0.23110253293609673, + "learning_rate": 1.8585524944546473e-05, + "loss": 0.5223, + "step": 3575 + }, + { + "epoch": 0.5148116192119644, + "grad_norm": 0.23613500534537313, + "learning_rate": 1.8581661962851566e-05, + "loss": 0.4987, + "step": 3580 + }, + { + "epoch": 0.5155306298533219, + "grad_norm": 0.22778446097339797, + "learning_rate": 1.8577794115972824e-05, + "loss": 0.5083, + "step": 3585 + }, + { + "epoch": 0.5162496404946794, + "grad_norm": 0.23767354447655717, + "learning_rate": 1.8573921406103048e-05, + "loss": 0.5087, + "step": 3590 + }, + { + "epoch": 0.5169686511360368, + "grad_norm": 0.21981817666742454, + "learning_rate": 1.8570043835437772e-05, + "loss": 0.499, + "step": 3595 + }, + { + "epoch": 0.5176876617773943, + "grad_norm": 0.2274951101541769, + "learning_rate": 1.8566161406175306e-05, + "loss": 0.5144, + "step": 3600 + }, + { + "epoch": 0.5184066724187518, + "grad_norm": 0.23572189148125483, + "learning_rate": 1.856227412051671e-05, + "loss": 0.4995, + "step": 3605 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.21629735180634516, + "learning_rate": 1.855838198066579e-05, + "loss": 0.4963, + "step": 3610 + }, + { + "epoch": 0.5198446937014668, + "grad_norm": 0.22363878924914682, + "learning_rate": 1.8554484988829108e-05, + "loss": 0.5063, + "step": 3615 + }, + { + "epoch": 0.5205637043428243, + "grad_norm": 0.2316899394396873, + "learning_rate": 1.8550583147215985e-05, + "loss": 0.4905, + "step": 3620 + }, + { + "epoch": 0.5212827149841818, + "grad_norm": 0.221354141997133, + "learning_rate": 1.854667645803847e-05, + "loss": 0.4988, + "step": 3625 + }, + { + "epoch": 0.5220017256255393, + "grad_norm": 0.2253239099128233, + "learning_rate": 1.8542764923511392e-05, + "loss": 0.5033, + "step": 3630 + }, + { + "epoch": 0.5227207362668967, + "grad_norm": 0.22954386049162062, + "learning_rate": 1.8538848545852294e-05, + "loss": 0.4878, + "step": 3635 + }, + { + "epoch": 0.5234397469082542, + "grad_norm": 0.22715952879859952, + "learning_rate": 1.8534927327281488e-05, + "loss": 0.499, + "step": 3640 + }, + { + "epoch": 0.5241587575496117, + "grad_norm": 0.22229217452640895, + "learning_rate": 1.8531001270022024e-05, + "loss": 0.4884, + "step": 3645 + }, + { + "epoch": 0.5248777681909692, + "grad_norm": 0.23891821519257553, + "learning_rate": 1.852707037629968e-05, + "loss": 0.5108, + "step": 3650 + }, + { + "epoch": 0.5255967788323267, + "grad_norm": 0.23675197935926118, + "learning_rate": 1.852313464834301e-05, + "loss": 0.4957, + "step": 3655 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.22253486079417645, + "learning_rate": 1.851919408838327e-05, + "loss": 0.4775, + "step": 3660 + }, + { + "epoch": 0.5270348001150417, + "grad_norm": 0.22333070985380785, + "learning_rate": 1.8515248698654486e-05, + "loss": 0.4983, + "step": 3665 + }, + { + "epoch": 0.5277538107563992, + "grad_norm": 0.223274460335613, + "learning_rate": 1.8511298481393403e-05, + "loss": 0.4982, + "step": 3670 + }, + { + "epoch": 0.5284728213977566, + "grad_norm": 0.22355882969418756, + "learning_rate": 1.850734343883951e-05, + "loss": 0.5084, + "step": 3675 + }, + { + "epoch": 0.5291918320391141, + "grad_norm": 0.22137972544339088, + "learning_rate": 1.8503383573235032e-05, + "loss": 0.5012, + "step": 3680 + }, + { + "epoch": 0.5299108426804716, + "grad_norm": 0.22426702815827018, + "learning_rate": 1.8499418886824926e-05, + "loss": 0.5014, + "step": 3685 + }, + { + "epoch": 0.5306298533218292, + "grad_norm": 0.2384895998266707, + "learning_rate": 1.8495449381856886e-05, + "loss": 0.4985, + "step": 3690 + }, + { + "epoch": 0.5313488639631867, + "grad_norm": 0.23328619696763794, + "learning_rate": 1.8491475060581337e-05, + "loss": 0.4892, + "step": 3695 + }, + { + "epoch": 0.5320678746045442, + "grad_norm": 0.2208450387758745, + "learning_rate": 1.8487495925251427e-05, + "loss": 0.4839, + "step": 3700 + }, + { + "epoch": 0.5327868852459017, + "grad_norm": 0.22952401879385564, + "learning_rate": 1.848351197812304e-05, + "loss": 0.5041, + "step": 3705 + }, + { + "epoch": 0.5335058958872592, + "grad_norm": 0.22545010734962523, + "learning_rate": 1.847952322145479e-05, + "loss": 0.5189, + "step": 3710 + }, + { + "epoch": 0.5342249065286166, + "grad_norm": 0.23198247491965804, + "learning_rate": 1.8475529657508016e-05, + "loss": 0.5041, + "step": 3715 + }, + { + "epoch": 0.5349439171699741, + "grad_norm": 0.2321580786261051, + "learning_rate": 1.8471531288546773e-05, + "loss": 0.5108, + "step": 3720 + }, + { + "epoch": 0.5356629278113316, + "grad_norm": 0.23247829012931276, + "learning_rate": 1.8467528116837857e-05, + "loss": 0.5238, + "step": 3725 + }, + { + "epoch": 0.5363819384526891, + "grad_norm": 0.23385610902788734, + "learning_rate": 1.8463520144650773e-05, + "loss": 0.4964, + "step": 3730 + }, + { + "epoch": 0.5371009490940466, + "grad_norm": 0.22626064932274298, + "learning_rate": 1.8459507374257755e-05, + "loss": 0.5097, + "step": 3735 + }, + { + "epoch": 0.5378199597354041, + "grad_norm": 0.22079798279561869, + "learning_rate": 1.845548980793375e-05, + "loss": 0.4997, + "step": 3740 + }, + { + "epoch": 0.5385389703767616, + "grad_norm": 0.2345884377445552, + "learning_rate": 1.845146744795643e-05, + "loss": 0.4853, + "step": 3745 + }, + { + "epoch": 0.5392579810181191, + "grad_norm": 0.22831199879883093, + "learning_rate": 1.8447440296606193e-05, + "loss": 0.5012, + "step": 3750 + }, + { + "epoch": 0.5399769916594765, + "grad_norm": 0.24900114363683074, + "learning_rate": 1.8443408356166128e-05, + "loss": 0.521, + "step": 3755 + }, + { + "epoch": 0.540696002300834, + "grad_norm": 0.22163999971406523, + "learning_rate": 1.8439371628922064e-05, + "loss": 0.5045, + "step": 3760 + }, + { + "epoch": 0.5414150129421915, + "grad_norm": 0.22760154174757039, + "learning_rate": 1.8435330117162534e-05, + "loss": 0.501, + "step": 3765 + }, + { + "epoch": 0.542134023583549, + "grad_norm": 0.22523822958018522, + "learning_rate": 1.843128382317878e-05, + "loss": 0.5133, + "step": 3770 + }, + { + "epoch": 0.5428530342249065, + "grad_norm": 0.22600992748430698, + "learning_rate": 1.8427232749264762e-05, + "loss": 0.499, + "step": 3775 + }, + { + "epoch": 0.543572044866264, + "grad_norm": 0.22389972269539865, + "learning_rate": 1.8423176897717143e-05, + "loss": 0.5015, + "step": 3780 + }, + { + "epoch": 0.5442910555076215, + "grad_norm": 0.23666072422166584, + "learning_rate": 1.8419116270835307e-05, + "loss": 0.522, + "step": 3785 + }, + { + "epoch": 0.545010066148979, + "grad_norm": 0.23444635492141347, + "learning_rate": 1.841505087092133e-05, + "loss": 0.4916, + "step": 3790 + }, + { + "epoch": 0.5457290767903364, + "grad_norm": 0.23417088378508938, + "learning_rate": 1.841098070028e-05, + "loss": 0.5131, + "step": 3795 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.2398297332015248, + "learning_rate": 1.8406905761218815e-05, + "loss": 0.4969, + "step": 3800 + }, + { + "epoch": 0.5471670980730515, + "grad_norm": 0.23646633183918445, + "learning_rate": 1.8402826056047964e-05, + "loss": 0.5148, + "step": 3805 + }, + { + "epoch": 0.547886108714409, + "grad_norm": 0.23304665042616649, + "learning_rate": 1.8398741587080358e-05, + "loss": 0.506, + "step": 3810 + }, + { + "epoch": 0.5486051193557665, + "grad_norm": 0.21709816862440437, + "learning_rate": 1.8394652356631585e-05, + "loss": 0.5089, + "step": 3815 + }, + { + "epoch": 0.549324129997124, + "grad_norm": 0.23021854763291302, + "learning_rate": 1.8390558367019954e-05, + "loss": 0.4946, + "step": 3820 + }, + { + "epoch": 0.5500431406384815, + "grad_norm": 0.2206971983996814, + "learning_rate": 1.8386459620566453e-05, + "loss": 0.4745, + "step": 3825 + }, + { + "epoch": 0.550762151279839, + "grad_norm": 0.21574112788117586, + "learning_rate": 1.838235611959478e-05, + "loss": 0.5086, + "step": 3830 + }, + { + "epoch": 0.5514811619211964, + "grad_norm": 0.2261136289668569, + "learning_rate": 1.8378247866431325e-05, + "loss": 0.4966, + "step": 3835 + }, + { + "epoch": 0.5522001725625539, + "grad_norm": 0.22972972838837386, + "learning_rate": 1.837413486340517e-05, + "loss": 0.4906, + "step": 3840 + }, + { + "epoch": 0.5529191832039114, + "grad_norm": 0.2330925028758894, + "learning_rate": 1.837001711284809e-05, + "loss": 0.5098, + "step": 3845 + }, + { + "epoch": 0.5536381938452689, + "grad_norm": 0.2270327849370844, + "learning_rate": 1.8365894617094558e-05, + "loss": 0.4926, + "step": 3850 + }, + { + "epoch": 0.5543572044866264, + "grad_norm": 0.23660944623443908, + "learning_rate": 1.8361767378481725e-05, + "loss": 0.5044, + "step": 3855 + }, + { + "epoch": 0.5550762151279839, + "grad_norm": 0.29863962341515, + "learning_rate": 1.8357635399349442e-05, + "loss": 0.5173, + "step": 3860 + }, + { + "epoch": 0.5557952257693414, + "grad_norm": 0.21782436220827342, + "learning_rate": 1.8353498682040244e-05, + "loss": 0.499, + "step": 3865 + }, + { + "epoch": 0.5565142364106989, + "grad_norm": 0.2210253870313405, + "learning_rate": 1.8349357228899348e-05, + "loss": 0.4892, + "step": 3870 + }, + { + "epoch": 0.5572332470520563, + "grad_norm": 0.21922597978664715, + "learning_rate": 1.834521104227466e-05, + "loss": 0.4924, + "step": 3875 + }, + { + "epoch": 0.5579522576934138, + "grad_norm": 0.22792159860332095, + "learning_rate": 1.8341060124516774e-05, + "loss": 0.52, + "step": 3880 + }, + { + "epoch": 0.5586712683347713, + "grad_norm": 0.21526668904279522, + "learning_rate": 1.833690447797896e-05, + "loss": 0.4981, + "step": 3885 + }, + { + "epoch": 0.5593902789761288, + "grad_norm": 0.29103804275461576, + "learning_rate": 1.8332744105017163e-05, + "loss": 0.4928, + "step": 3890 + }, + { + "epoch": 0.5601092896174863, + "grad_norm": 0.2357301459907596, + "learning_rate": 1.832857900799002e-05, + "loss": 0.4984, + "step": 3895 + }, + { + "epoch": 0.5608283002588438, + "grad_norm": 0.22478118848509318, + "learning_rate": 1.832440918925884e-05, + "loss": 0.4948, + "step": 3900 + }, + { + "epoch": 0.5615473109002013, + "grad_norm": 0.23300933444107352, + "learning_rate": 1.8320234651187614e-05, + "loss": 0.4909, + "step": 3905 + }, + { + "epoch": 0.5622663215415589, + "grad_norm": 0.230128753020351, + "learning_rate": 1.8316055396142997e-05, + "loss": 0.5244, + "step": 3910 + }, + { + "epoch": 0.5629853321829164, + "grad_norm": 0.2304505782146836, + "learning_rate": 1.831187142649433e-05, + "loss": 0.5158, + "step": 3915 + }, + { + "epoch": 0.5637043428242738, + "grad_norm": 0.22014125911826737, + "learning_rate": 1.830768274461362e-05, + "loss": 0.482, + "step": 3920 + }, + { + "epoch": 0.5644233534656313, + "grad_norm": 0.2278820436024887, + "learning_rate": 1.830348935287555e-05, + "loss": 0.5112, + "step": 3925 + }, + { + "epoch": 0.5651423641069888, + "grad_norm": 0.2156583838047392, + "learning_rate": 1.829929125365747e-05, + "loss": 0.496, + "step": 3930 + }, + { + "epoch": 0.5658613747483463, + "grad_norm": 0.2315411777675097, + "learning_rate": 1.8295088449339395e-05, + "loss": 0.5031, + "step": 3935 + }, + { + "epoch": 0.5665803853897038, + "grad_norm": 0.2191930539045992, + "learning_rate": 1.8290880942304018e-05, + "loss": 0.5017, + "step": 3940 + }, + { + "epoch": 0.5672993960310613, + "grad_norm": 0.23045445869070985, + "learning_rate": 1.8286668734936693e-05, + "loss": 0.5047, + "step": 3945 + }, + { + "epoch": 0.5680184066724188, + "grad_norm": 0.23068453263665495, + "learning_rate": 1.8282451829625433e-05, + "loss": 0.4884, + "step": 3950 + }, + { + "epoch": 0.5687374173137763, + "grad_norm": 0.22441184191853916, + "learning_rate": 1.827823022876092e-05, + "loss": 0.4925, + "step": 3955 + }, + { + "epoch": 0.5694564279551337, + "grad_norm": 0.2274613215901173, + "learning_rate": 1.8274003934736507e-05, + "loss": 0.4948, + "step": 3960 + }, + { + "epoch": 0.5701754385964912, + "grad_norm": 0.2154163185531681, + "learning_rate": 1.8269772949948185e-05, + "loss": 0.4804, + "step": 3965 + }, + { + "epoch": 0.5708944492378487, + "grad_norm": 0.2265136281130171, + "learning_rate": 1.8265537276794624e-05, + "loss": 0.5021, + "step": 3970 + }, + { + "epoch": 0.5716134598792062, + "grad_norm": 0.23548231058339367, + "learning_rate": 1.826129691767714e-05, + "loss": 0.4969, + "step": 3975 + }, + { + "epoch": 0.5723324705205637, + "grad_norm": 0.24709148621521423, + "learning_rate": 1.8257051874999723e-05, + "loss": 0.4947, + "step": 3980 + }, + { + "epoch": 0.5730514811619212, + "grad_norm": 0.22658966766078337, + "learning_rate": 1.8252802151168992e-05, + "loss": 0.4806, + "step": 3985 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.23535080257238622, + "learning_rate": 1.8248547748594246e-05, + "loss": 0.496, + "step": 3990 + }, + { + "epoch": 0.5744895024446361, + "grad_norm": 0.23300185528621603, + "learning_rate": 1.8244288669687414e-05, + "loss": 0.5161, + "step": 3995 + }, + { + "epoch": 0.5752085130859936, + "grad_norm": 0.2195927252226493, + "learning_rate": 1.8240024916863096e-05, + "loss": 0.5145, + "step": 4000 + }, + { + "epoch": 0.5759275237273511, + "grad_norm": 0.22707087879411403, + "learning_rate": 1.823575649253853e-05, + "loss": 0.5027, + "step": 4005 + }, + { + "epoch": 0.5766465343687086, + "grad_norm": 0.23258755169070053, + "learning_rate": 1.82314833991336e-05, + "loss": 0.4822, + "step": 4010 + }, + { + "epoch": 0.5773655450100661, + "grad_norm": 0.21245372046944946, + "learning_rate": 1.8227205639070845e-05, + "loss": 0.4841, + "step": 4015 + }, + { + "epoch": 0.5780845556514237, + "grad_norm": 0.227555450329633, + "learning_rate": 1.822292321477545e-05, + "loss": 0.5137, + "step": 4020 + }, + { + "epoch": 0.5788035662927812, + "grad_norm": 0.22238301849030442, + "learning_rate": 1.821863612867524e-05, + "loss": 0.5018, + "step": 4025 + }, + { + "epoch": 0.5795225769341387, + "grad_norm": 0.23818689276016308, + "learning_rate": 1.821434438320068e-05, + "loss": 0.4984, + "step": 4030 + }, + { + "epoch": 0.5802415875754962, + "grad_norm": 0.22243316105430688, + "learning_rate": 1.821004798078488e-05, + "loss": 0.4791, + "step": 4035 + }, + { + "epoch": 0.5809605982168536, + "grad_norm": 0.2235013488087155, + "learning_rate": 1.8205746923863596e-05, + "loss": 0.5031, + "step": 4040 + }, + { + "epoch": 0.5816796088582111, + "grad_norm": 0.22378546532385274, + "learning_rate": 1.820144121487521e-05, + "loss": 0.4869, + "step": 4045 + }, + { + "epoch": 0.5823986194995686, + "grad_norm": 0.22608188106946078, + "learning_rate": 1.819713085626076e-05, + "loss": 0.5, + "step": 4050 + }, + { + "epoch": 0.5831176301409261, + "grad_norm": 0.23120069866094303, + "learning_rate": 1.8192815850463896e-05, + "loss": 0.5043, + "step": 4055 + }, + { + "epoch": 0.5838366407822836, + "grad_norm": 0.22373861994189723, + "learning_rate": 1.8188496199930922e-05, + "loss": 0.5069, + "step": 4060 + }, + { + "epoch": 0.5845556514236411, + "grad_norm": 0.22542422851046623, + "learning_rate": 1.8184171907110767e-05, + "loss": 0.4898, + "step": 4065 + }, + { + "epoch": 0.5852746620649986, + "grad_norm": 0.22992707671161222, + "learning_rate": 1.8179842974454997e-05, + "loss": 0.5058, + "step": 4070 + }, + { + "epoch": 0.585993672706356, + "grad_norm": 0.23512992551531928, + "learning_rate": 1.8175509404417795e-05, + "loss": 0.5131, + "step": 4075 + }, + { + "epoch": 0.5867126833477135, + "grad_norm": 0.23047258195395515, + "learning_rate": 1.8171171199455995e-05, + "loss": 0.4866, + "step": 4080 + }, + { + "epoch": 0.587431693989071, + "grad_norm": 0.22879948545941575, + "learning_rate": 1.8166828362029038e-05, + "loss": 0.4984, + "step": 4085 + }, + { + "epoch": 0.5881507046304285, + "grad_norm": 0.2318143159711983, + "learning_rate": 1.8162480894599007e-05, + "loss": 0.5046, + "step": 4090 + }, + { + "epoch": 0.588869715271786, + "grad_norm": 0.22044473281174515, + "learning_rate": 1.8158128799630593e-05, + "loss": 0.4972, + "step": 4095 + }, + { + "epoch": 0.5895887259131435, + "grad_norm": 0.21967188839528212, + "learning_rate": 1.815377207959113e-05, + "loss": 0.5114, + "step": 4100 + }, + { + "epoch": 0.590307736554501, + "grad_norm": 0.22481352334786187, + "learning_rate": 1.8149410736950557e-05, + "loss": 0.4804, + "step": 4105 + }, + { + "epoch": 0.5910267471958585, + "grad_norm": 0.22907040470257112, + "learning_rate": 1.8145044774181446e-05, + "loss": 0.5133, + "step": 4110 + }, + { + "epoch": 0.591745757837216, + "grad_norm": 0.22599774025751448, + "learning_rate": 1.814067419375898e-05, + "loss": 0.5127, + "step": 4115 + }, + { + "epoch": 0.5924647684785734, + "grad_norm": 0.22849166460658857, + "learning_rate": 1.8136298998160964e-05, + "loss": 0.4885, + "step": 4120 + }, + { + "epoch": 0.5931837791199309, + "grad_norm": 0.23707425413994465, + "learning_rate": 1.8131919189867823e-05, + "loss": 0.5023, + "step": 4125 + }, + { + "epoch": 0.5939027897612885, + "grad_norm": 0.218741855347216, + "learning_rate": 1.8127534771362583e-05, + "loss": 0.5053, + "step": 4130 + }, + { + "epoch": 0.594621800402646, + "grad_norm": 0.21990224755715207, + "learning_rate": 1.81231457451309e-05, + "loss": 0.5011, + "step": 4135 + }, + { + "epoch": 0.5953408110440035, + "grad_norm": 0.22519334707137623, + "learning_rate": 1.8118752113661036e-05, + "loss": 0.4929, + "step": 4140 + }, + { + "epoch": 0.596059821685361, + "grad_norm": 0.2251407174797879, + "learning_rate": 1.811435387944386e-05, + "loss": 0.4897, + "step": 4145 + }, + { + "epoch": 0.5967788323267185, + "grad_norm": 0.22081155450327186, + "learning_rate": 1.8109951044972852e-05, + "loss": 0.5096, + "step": 4150 + }, + { + "epoch": 0.597497842968076, + "grad_norm": 0.21913049190140144, + "learning_rate": 1.810554361274411e-05, + "loss": 0.4994, + "step": 4155 + }, + { + "epoch": 0.5982168536094334, + "grad_norm": 0.2280104778800277, + "learning_rate": 1.8101131585256327e-05, + "loss": 0.5088, + "step": 4160 + }, + { + "epoch": 0.5989358642507909, + "grad_norm": 0.23121277159859566, + "learning_rate": 1.80967149650108e-05, + "loss": 0.4977, + "step": 4165 + }, + { + "epoch": 0.5996548748921484, + "grad_norm": 0.23167148679803448, + "learning_rate": 1.8092293754511437e-05, + "loss": 0.4928, + "step": 4170 + }, + { + "epoch": 0.6003738855335059, + "grad_norm": 0.22355387075840977, + "learning_rate": 1.808786795626475e-05, + "loss": 0.4905, + "step": 4175 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 0.2145020922572475, + "learning_rate": 1.8083437572779842e-05, + "loss": 0.4835, + "step": 4180 + }, + { + "epoch": 0.6018119068162209, + "grad_norm": 0.2502122824779311, + "learning_rate": 1.8079002606568425e-05, + "loss": 0.4885, + "step": 4185 + }, + { + "epoch": 0.6025309174575784, + "grad_norm": 0.25428888688428436, + "learning_rate": 1.8074563060144804e-05, + "loss": 0.5103, + "step": 4190 + }, + { + "epoch": 0.6032499280989359, + "grad_norm": 0.22496649178253486, + "learning_rate": 1.807011893602588e-05, + "loss": 0.4787, + "step": 4195 + }, + { + "epoch": 0.6039689387402933, + "grad_norm": 0.22504966410722568, + "learning_rate": 1.8065670236731147e-05, + "loss": 0.4946, + "step": 4200 + }, + { + "epoch": 0.6046879493816508, + "grad_norm": 0.2256429602495479, + "learning_rate": 1.8061216964782707e-05, + "loss": 0.4919, + "step": 4205 + }, + { + "epoch": 0.6054069600230083, + "grad_norm": 0.24545480436663297, + "learning_rate": 1.805675912270524e-05, + "loss": 0.5098, + "step": 4210 + }, + { + "epoch": 0.6061259706643658, + "grad_norm": 0.23138472437464977, + "learning_rate": 1.805229671302602e-05, + "loss": 0.49, + "step": 4215 + }, + { + "epoch": 0.6068449813057233, + "grad_norm": 0.22420294449430914, + "learning_rate": 1.8047829738274912e-05, + "loss": 0.5135, + "step": 4220 + }, + { + "epoch": 0.6075639919470808, + "grad_norm": 0.23477514688466256, + "learning_rate": 1.8043358200984367e-05, + "loss": 0.5091, + "step": 4225 + }, + { + "epoch": 0.6082830025884383, + "grad_norm": 0.22664370647917517, + "learning_rate": 1.8038882103689425e-05, + "loss": 0.5009, + "step": 4230 + }, + { + "epoch": 0.6090020132297957, + "grad_norm": 0.22665190958697912, + "learning_rate": 1.8034401448927717e-05, + "loss": 0.5039, + "step": 4235 + }, + { + "epoch": 0.6097210238711533, + "grad_norm": 0.22347929069368033, + "learning_rate": 1.8029916239239444e-05, + "loss": 0.4995, + "step": 4240 + }, + { + "epoch": 0.6104400345125108, + "grad_norm": 0.22840687464864576, + "learning_rate": 1.8025426477167398e-05, + "loss": 0.4906, + "step": 4245 + }, + { + "epoch": 0.6111590451538683, + "grad_norm": 0.21865290043810331, + "learning_rate": 1.802093216525695e-05, + "loss": 0.5002, + "step": 4250 + }, + { + "epoch": 0.6118780557952258, + "grad_norm": 0.2257697870345122, + "learning_rate": 1.8016433306056056e-05, + "loss": 0.4974, + "step": 4255 + }, + { + "epoch": 0.6125970664365833, + "grad_norm": 0.23554978601033177, + "learning_rate": 1.801192990211524e-05, + "loss": 0.5076, + "step": 4260 + }, + { + "epoch": 0.6133160770779408, + "grad_norm": 0.21763035761289187, + "learning_rate": 1.800742195598761e-05, + "loss": 0.5022, + "step": 4265 + }, + { + "epoch": 0.6140350877192983, + "grad_norm": 0.22845763433585353, + "learning_rate": 1.800290947022884e-05, + "loss": 0.5012, + "step": 4270 + }, + { + "epoch": 0.6147540983606558, + "grad_norm": 0.23660622765117761, + "learning_rate": 1.7998392447397197e-05, + "loss": 0.4953, + "step": 4275 + }, + { + "epoch": 0.6154731090020132, + "grad_norm": 0.23512957287176042, + "learning_rate": 1.799387089005349e-05, + "loss": 0.5081, + "step": 4280 + }, + { + "epoch": 0.6161921196433707, + "grad_norm": 0.23139366949545287, + "learning_rate": 1.7989344800761138e-05, + "loss": 0.4785, + "step": 4285 + }, + { + "epoch": 0.6169111302847282, + "grad_norm": 0.21543568576088704, + "learning_rate": 1.798481418208609e-05, + "loss": 0.4889, + "step": 4290 + }, + { + "epoch": 0.6176301409260857, + "grad_norm": 0.24103936247130817, + "learning_rate": 1.7980279036596882e-05, + "loss": 0.4887, + "step": 4295 + }, + { + "epoch": 0.6183491515674432, + "grad_norm": 0.22480459818866064, + "learning_rate": 1.797573936686462e-05, + "loss": 0.4998, + "step": 4300 + }, + { + "epoch": 0.6190681622088007, + "grad_norm": 0.2302806647932163, + "learning_rate": 1.797119517546297e-05, + "loss": 0.4823, + "step": 4305 + }, + { + "epoch": 0.6197871728501582, + "grad_norm": 0.2195093911650767, + "learning_rate": 1.7966646464968156e-05, + "loss": 0.4874, + "step": 4310 + }, + { + "epoch": 0.6205061834915157, + "grad_norm": 0.23515790120245852, + "learning_rate": 1.7962093237958975e-05, + "loss": 0.5017, + "step": 4315 + }, + { + "epoch": 0.6212251941328731, + "grad_norm": 0.22558657641284738, + "learning_rate": 1.7957535497016773e-05, + "loss": 0.4836, + "step": 4320 + }, + { + "epoch": 0.6219442047742306, + "grad_norm": 0.21514923208427525, + "learning_rate": 1.7952973244725466e-05, + "loss": 0.503, + "step": 4325 + }, + { + "epoch": 0.6226632154155881, + "grad_norm": 0.2240849714125103, + "learning_rate": 1.7948406483671516e-05, + "loss": 0.5095, + "step": 4330 + }, + { + "epoch": 0.6233822260569456, + "grad_norm": 0.23011998238604692, + "learning_rate": 1.7943835216443954e-05, + "loss": 0.4978, + "step": 4335 + }, + { + "epoch": 0.6241012366983031, + "grad_norm": 0.2226633367382515, + "learning_rate": 1.793925944563435e-05, + "loss": 0.4978, + "step": 4340 + }, + { + "epoch": 0.6248202473396606, + "grad_norm": 0.22271625540264856, + "learning_rate": 1.7934679173836845e-05, + "loss": 0.4793, + "step": 4345 + }, + { + "epoch": 0.6255392579810182, + "grad_norm": 0.22168826941628614, + "learning_rate": 1.7930094403648123e-05, + "loss": 0.485, + "step": 4350 + }, + { + "epoch": 0.6262582686223757, + "grad_norm": 0.2194434339487712, + "learning_rate": 1.792550513766741e-05, + "loss": 0.5179, + "step": 4355 + }, + { + "epoch": 0.6269772792637331, + "grad_norm": 0.2241703162853489, + "learning_rate": 1.79209113784965e-05, + "loss": 0.4924, + "step": 4360 + }, + { + "epoch": 0.6276962899050906, + "grad_norm": 0.22907476566053597, + "learning_rate": 1.7916313128739713e-05, + "loss": 0.5165, + "step": 4365 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.2287805893738131, + "learning_rate": 1.791171039100393e-05, + "loss": 0.504, + "step": 4370 + }, + { + "epoch": 0.6291343111878056, + "grad_norm": 0.22889669531847387, + "learning_rate": 1.7907103167898574e-05, + "loss": 0.4875, + "step": 4375 + }, + { + "epoch": 0.6298533218291631, + "grad_norm": 0.22708288689043107, + "learning_rate": 1.7902491462035604e-05, + "loss": 0.4997, + "step": 4380 + }, + { + "epoch": 0.6305723324705206, + "grad_norm": 0.22524635262428871, + "learning_rate": 1.789787527602953e-05, + "loss": 0.5019, + "step": 4385 + }, + { + "epoch": 0.6312913431118781, + "grad_norm": 0.22737838008003655, + "learning_rate": 1.789325461249739e-05, + "loss": 0.5035, + "step": 4390 + }, + { + "epoch": 0.6320103537532356, + "grad_norm": 0.22814314146043482, + "learning_rate": 1.788862947405877e-05, + "loss": 0.5136, + "step": 4395 + }, + { + "epoch": 0.632729364394593, + "grad_norm": 0.23190044903178156, + "learning_rate": 1.7883999863335795e-05, + "loss": 0.4727, + "step": 4400 + }, + { + "epoch": 0.6334483750359505, + "grad_norm": 0.23035525240649793, + "learning_rate": 1.787936578295311e-05, + "loss": 0.4864, + "step": 4405 + }, + { + "epoch": 0.634167385677308, + "grad_norm": 0.2172364867230587, + "learning_rate": 1.787472723553792e-05, + "loss": 0.4932, + "step": 4410 + }, + { + "epoch": 0.6348863963186655, + "grad_norm": 0.22456558012536526, + "learning_rate": 1.7870084223719927e-05, + "loss": 0.4963, + "step": 4415 + }, + { + "epoch": 0.635605406960023, + "grad_norm": 0.22782599669782636, + "learning_rate": 1.7865436750131404e-05, + "loss": 0.485, + "step": 4420 + }, + { + "epoch": 0.6363244176013805, + "grad_norm": 0.22293627410505634, + "learning_rate": 1.7860784817407123e-05, + "loss": 0.5025, + "step": 4425 + }, + { + "epoch": 0.637043428242738, + "grad_norm": 0.2290342354101169, + "learning_rate": 1.7856128428184394e-05, + "loss": 0.5034, + "step": 4430 + }, + { + "epoch": 0.6377624388840955, + "grad_norm": 0.2123244352372745, + "learning_rate": 1.7851467585103058e-05, + "loss": 0.4789, + "step": 4435 + }, + { + "epoch": 0.6384814495254529, + "grad_norm": 0.2363554364033453, + "learning_rate": 1.7846802290805475e-05, + "loss": 0.4993, + "step": 4440 + }, + { + "epoch": 0.6392004601668104, + "grad_norm": 0.2169137466116251, + "learning_rate": 1.784213254793653e-05, + "loss": 0.5046, + "step": 4445 + }, + { + "epoch": 0.6399194708081679, + "grad_norm": 0.22732219414522734, + "learning_rate": 1.7837458359143635e-05, + "loss": 0.4898, + "step": 4450 + }, + { + "epoch": 0.6406384814495254, + "grad_norm": 0.23308360464567718, + "learning_rate": 1.783277972707671e-05, + "loss": 0.5037, + "step": 4455 + }, + { + "epoch": 0.641357492090883, + "grad_norm": 0.21797166379499391, + "learning_rate": 1.782809665438821e-05, + "loss": 0.4836, + "step": 4460 + }, + { + "epoch": 0.6420765027322405, + "grad_norm": 0.21670714333069238, + "learning_rate": 1.7823409143733096e-05, + "loss": 0.492, + "step": 4465 + }, + { + "epoch": 0.642795513373598, + "grad_norm": 0.22289980619650276, + "learning_rate": 1.7818717197768853e-05, + "loss": 0.488, + "step": 4470 + }, + { + "epoch": 0.6435145240149555, + "grad_norm": 0.21492990992266275, + "learning_rate": 1.7814020819155467e-05, + "loss": 0.485, + "step": 4475 + }, + { + "epoch": 0.644233534656313, + "grad_norm": 0.22565582075395257, + "learning_rate": 1.7809320010555457e-05, + "loss": 0.504, + "step": 4480 + }, + { + "epoch": 0.6449525452976704, + "grad_norm": 0.22953046603808086, + "learning_rate": 1.7804614774633837e-05, + "loss": 0.4942, + "step": 4485 + }, + { + "epoch": 0.6456715559390279, + "grad_norm": 0.22714500679627633, + "learning_rate": 1.7799905114058135e-05, + "loss": 0.4929, + "step": 4490 + }, + { + "epoch": 0.6463905665803854, + "grad_norm": 0.22402856864894585, + "learning_rate": 1.779519103149839e-05, + "loss": 0.5052, + "step": 4495 + }, + { + "epoch": 0.6471095772217429, + "grad_norm": 0.21690244310309423, + "learning_rate": 1.7790472529627152e-05, + "loss": 0.4773, + "step": 4500 + }, + { + "epoch": 0.6478285878631004, + "grad_norm": 0.22885853032295295, + "learning_rate": 1.7785749611119468e-05, + "loss": 0.5014, + "step": 4505 + }, + { + "epoch": 0.6485475985044579, + "grad_norm": 0.21772258605601158, + "learning_rate": 1.7781022278652892e-05, + "loss": 0.4843, + "step": 4510 + }, + { + "epoch": 0.6492666091458154, + "grad_norm": 0.22695485689332384, + "learning_rate": 1.777629053490748e-05, + "loss": 0.5005, + "step": 4515 + }, + { + "epoch": 0.6499856197871728, + "grad_norm": 0.2301755389479362, + "learning_rate": 1.777155438256579e-05, + "loss": 0.4863, + "step": 4520 + }, + { + "epoch": 0.6507046304285303, + "grad_norm": 0.2303790735813269, + "learning_rate": 1.776681382431288e-05, + "loss": 0.5158, + "step": 4525 + }, + { + "epoch": 0.6514236410698878, + "grad_norm": 0.22708261008915043, + "learning_rate": 1.7762068862836305e-05, + "loss": 0.4928, + "step": 4530 + }, + { + "epoch": 0.6521426517112453, + "grad_norm": 0.21978609691147083, + "learning_rate": 1.7757319500826118e-05, + "loss": 0.4821, + "step": 4535 + }, + { + "epoch": 0.6528616623526028, + "grad_norm": 0.2280064491413487, + "learning_rate": 1.775256574097486e-05, + "loss": 0.4944, + "step": 4540 + }, + { + "epoch": 0.6535806729939603, + "grad_norm": 0.23140586976106473, + "learning_rate": 1.7747807585977575e-05, + "loss": 0.4982, + "step": 4545 + }, + { + "epoch": 0.6542996836353178, + "grad_norm": 0.2249761869967712, + "learning_rate": 1.774304503853179e-05, + "loss": 0.503, + "step": 4550 + }, + { + "epoch": 0.6550186942766753, + "grad_norm": 0.22383737618616106, + "learning_rate": 1.773827810133753e-05, + "loss": 0.4845, + "step": 4555 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.23868560906760042, + "learning_rate": 1.77335067770973e-05, + "loss": 0.4915, + "step": 4560 + }, + { + "epoch": 0.6564567155593902, + "grad_norm": 0.22975311873958806, + "learning_rate": 1.7728731068516102e-05, + "loss": 0.4972, + "step": 4565 + }, + { + "epoch": 0.6571757262007478, + "grad_norm": 0.22989215659951942, + "learning_rate": 1.772395097830142e-05, + "loss": 0.4817, + "step": 4570 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.21496590035678406, + "learning_rate": 1.771916650916321e-05, + "loss": 0.4658, + "step": 4575 + }, + { + "epoch": 0.6586137474834628, + "grad_norm": 0.2242487964584687, + "learning_rate": 1.7714377663813932e-05, + "loss": 0.4948, + "step": 4580 + }, + { + "epoch": 0.6593327581248203, + "grad_norm": 0.2259566340870872, + "learning_rate": 1.770958444496851e-05, + "loss": 0.4884, + "step": 4585 + }, + { + "epoch": 0.6600517687661778, + "grad_norm": 0.22184505787317513, + "learning_rate": 1.7704786855344362e-05, + "loss": 0.4933, + "step": 4590 + }, + { + "epoch": 0.6607707794075353, + "grad_norm": 0.22147846460463083, + "learning_rate": 1.7699984897661366e-05, + "loss": 0.5163, + "step": 4595 + }, + { + "epoch": 0.6614897900488927, + "grad_norm": 0.22380466269010332, + "learning_rate": 1.769517857464189e-05, + "loss": 0.4924, + "step": 4600 + }, + { + "epoch": 0.6622088006902502, + "grad_norm": 0.2183939165932946, + "learning_rate": 1.769036788901077e-05, + "loss": 0.497, + "step": 4605 + }, + { + "epoch": 0.6629278113316077, + "grad_norm": 0.2299963801247869, + "learning_rate": 1.7685552843495325e-05, + "loss": 0.4888, + "step": 4610 + }, + { + "epoch": 0.6636468219729652, + "grad_norm": 0.22186769796460507, + "learning_rate": 1.768073344082533e-05, + "loss": 0.4946, + "step": 4615 + }, + { + "epoch": 0.6643658326143227, + "grad_norm": 0.22516670993138563, + "learning_rate": 1.7675909683733044e-05, + "loss": 0.489, + "step": 4620 + }, + { + "epoch": 0.6650848432556802, + "grad_norm": 0.2220975827121235, + "learning_rate": 1.767108157495319e-05, + "loss": 0.5141, + "step": 4625 + }, + { + "epoch": 0.6658038538970377, + "grad_norm": 0.22391628586117562, + "learning_rate": 1.7666249117222954e-05, + "loss": 0.5046, + "step": 4630 + }, + { + "epoch": 0.6665228645383952, + "grad_norm": 0.21878055532902985, + "learning_rate": 1.7661412313281996e-05, + "loss": 0.4827, + "step": 4635 + }, + { + "epoch": 0.6672418751797526, + "grad_norm": 0.21455190269812402, + "learning_rate": 1.7656571165872433e-05, + "loss": 0.4904, + "step": 4640 + }, + { + "epoch": 0.6679608858211101, + "grad_norm": 0.23229713610117478, + "learning_rate": 1.7651725677738848e-05, + "loss": 0.4944, + "step": 4645 + }, + { + "epoch": 0.6686798964624676, + "grad_norm": 0.2194496579507639, + "learning_rate": 1.764687585162828e-05, + "loss": 0.4945, + "step": 4650 + }, + { + "epoch": 0.6693989071038251, + "grad_norm": 0.24046731351470918, + "learning_rate": 1.764202169029023e-05, + "loss": 0.4985, + "step": 4655 + }, + { + "epoch": 0.6701179177451826, + "grad_norm": 0.24250442873954306, + "learning_rate": 1.7637163196476665e-05, + "loss": 0.4857, + "step": 4660 + }, + { + "epoch": 0.6708369283865401, + "grad_norm": 0.22781641303717975, + "learning_rate": 1.7632300372941994e-05, + "loss": 0.495, + "step": 4665 + }, + { + "epoch": 0.6715559390278976, + "grad_norm": 0.22177424874769494, + "learning_rate": 1.762743322244309e-05, + "loss": 0.4952, + "step": 4670 + }, + { + "epoch": 0.672274949669255, + "grad_norm": 0.2228721918137761, + "learning_rate": 1.762256174773928e-05, + "loss": 0.478, + "step": 4675 + }, + { + "epoch": 0.6729939603106126, + "grad_norm": 0.2199598987395226, + "learning_rate": 1.7617685951592332e-05, + "loss": 0.4921, + "step": 4680 + }, + { + "epoch": 0.6737129709519701, + "grad_norm": 0.21649943361141435, + "learning_rate": 1.7612805836766473e-05, + "loss": 0.4919, + "step": 4685 + }, + { + "epoch": 0.6744319815933276, + "grad_norm": 0.22575564833456532, + "learning_rate": 1.7607921406028383e-05, + "loss": 0.4804, + "step": 4690 + }, + { + "epoch": 0.6751509922346851, + "grad_norm": 0.2208911041009505, + "learning_rate": 1.7603032662147174e-05, + "loss": 0.4827, + "step": 4695 + }, + { + "epoch": 0.6758700028760426, + "grad_norm": 0.2318713562772489, + "learning_rate": 1.7598139607894415e-05, + "loss": 0.4916, + "step": 4700 + }, + { + "epoch": 0.6765890135174001, + "grad_norm": 0.22042625401063254, + "learning_rate": 1.7593242246044112e-05, + "loss": 0.4994, + "step": 4705 + }, + { + "epoch": 0.6773080241587576, + "grad_norm": 0.23336914586818205, + "learning_rate": 1.7588340579372723e-05, + "loss": 0.4876, + "step": 4710 + }, + { + "epoch": 0.6780270348001151, + "grad_norm": 0.23040143177334688, + "learning_rate": 1.7583434610659135e-05, + "loss": 0.4896, + "step": 4715 + }, + { + "epoch": 0.6787460454414725, + "grad_norm": 0.222450062203108, + "learning_rate": 1.757852434268468e-05, + "loss": 0.4977, + "step": 4720 + }, + { + "epoch": 0.67946505608283, + "grad_norm": 0.22334129871305886, + "learning_rate": 1.757360977823312e-05, + "loss": 0.4843, + "step": 4725 + }, + { + "epoch": 0.6801840667241875, + "grad_norm": 0.2249846211526674, + "learning_rate": 1.7568690920090667e-05, + "loss": 0.508, + "step": 4730 + }, + { + "epoch": 0.680903077365545, + "grad_norm": 0.22385876087181392, + "learning_rate": 1.756376777104596e-05, + "loss": 0.4792, + "step": 4735 + }, + { + "epoch": 0.6816220880069025, + "grad_norm": 0.21810768344183779, + "learning_rate": 1.755884033389006e-05, + "loss": 0.4959, + "step": 4740 + }, + { + "epoch": 0.68234109864826, + "grad_norm": 0.22889663535319346, + "learning_rate": 1.7553908611416476e-05, + "loss": 0.4921, + "step": 4745 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.22836005091803305, + "learning_rate": 1.754897260642114e-05, + "loss": 0.5041, + "step": 4750 + }, + { + "epoch": 0.683779119930975, + "grad_norm": 0.2337522649597847, + "learning_rate": 1.754403232170241e-05, + "loss": 0.4983, + "step": 4755 + }, + { + "epoch": 0.6844981305723324, + "grad_norm": 0.22427854619228013, + "learning_rate": 1.7539087760061065e-05, + "loss": 0.4788, + "step": 4760 + }, + { + "epoch": 0.6852171412136899, + "grad_norm": 0.22340482122931785, + "learning_rate": 1.7534138924300322e-05, + "loss": 0.4871, + "step": 4765 + }, + { + "epoch": 0.6859361518550474, + "grad_norm": 0.21209483637838136, + "learning_rate": 1.7529185817225814e-05, + "loss": 0.4843, + "step": 4770 + }, + { + "epoch": 0.6866551624964049, + "grad_norm": 0.2614114941386408, + "learning_rate": 1.7524228441645595e-05, + "loss": 0.4889, + "step": 4775 + }, + { + "epoch": 0.6873741731377624, + "grad_norm": 0.2759337878952069, + "learning_rate": 1.751926680037014e-05, + "loss": 0.4924, + "step": 4780 + }, + { + "epoch": 0.6880931837791199, + "grad_norm": 0.2395374492522584, + "learning_rate": 1.7514300896212337e-05, + "loss": 0.5061, + "step": 4785 + }, + { + "epoch": 0.6888121944204775, + "grad_norm": 0.2596268828245961, + "learning_rate": 1.7509330731987503e-05, + "loss": 0.5152, + "step": 4790 + }, + { + "epoch": 0.689531205061835, + "grad_norm": 0.22117993803359795, + "learning_rate": 1.750435631051336e-05, + "loss": 0.4988, + "step": 4795 + }, + { + "epoch": 0.6902502157031924, + "grad_norm": 0.22858075750882878, + "learning_rate": 1.7499377634610045e-05, + "loss": 0.5127, + "step": 4800 + }, + { + "epoch": 0.6909692263445499, + "grad_norm": 0.22143843885058742, + "learning_rate": 1.7494394707100106e-05, + "loss": 0.4877, + "step": 4805 + }, + { + "epoch": 0.6916882369859074, + "grad_norm": 0.22707771227565943, + "learning_rate": 1.748940753080851e-05, + "loss": 0.4958, + "step": 4810 + }, + { + "epoch": 0.6924072476272649, + "grad_norm": 0.27308742041361583, + "learning_rate": 1.7484416108562622e-05, + "loss": 0.4825, + "step": 4815 + }, + { + "epoch": 0.6931262582686224, + "grad_norm": 0.21424028907678153, + "learning_rate": 1.7479420443192224e-05, + "loss": 0.4854, + "step": 4820 + }, + { + "epoch": 0.6938452689099799, + "grad_norm": 0.22823633517798755, + "learning_rate": 1.747442053752949e-05, + "loss": 0.5075, + "step": 4825 + }, + { + "epoch": 0.6945642795513374, + "grad_norm": 0.21523415768614815, + "learning_rate": 1.746941639440902e-05, + "loss": 0.4939, + "step": 4830 + }, + { + "epoch": 0.6952832901926949, + "grad_norm": 0.2294486883785128, + "learning_rate": 1.7464408016667782e-05, + "loss": 0.4798, + "step": 4835 + }, + { + "epoch": 0.6960023008340523, + "grad_norm": 0.22961433155302852, + "learning_rate": 1.7459395407145184e-05, + "loss": 0.5036, + "step": 4840 + }, + { + "epoch": 0.6967213114754098, + "grad_norm": 0.2316012619634393, + "learning_rate": 1.7454378568683003e-05, + "loss": 0.4768, + "step": 4845 + }, + { + "epoch": 0.6974403221167673, + "grad_norm": 0.22749278143875307, + "learning_rate": 1.744935750412543e-05, + "loss": 0.488, + "step": 4850 + }, + { + "epoch": 0.6981593327581248, + "grad_norm": 0.22330719621266287, + "learning_rate": 1.7444332216319044e-05, + "loss": 0.4905, + "step": 4855 + }, + { + "epoch": 0.6988783433994823, + "grad_norm": 0.23288889389670006, + "learning_rate": 1.7439302708112825e-05, + "loss": 0.4975, + "step": 4860 + }, + { + "epoch": 0.6995973540408398, + "grad_norm": 0.2179924907854225, + "learning_rate": 1.743426898235814e-05, + "loss": 0.4972, + "step": 4865 + }, + { + "epoch": 0.7003163646821973, + "grad_norm": 0.22453081468092548, + "learning_rate": 1.7429231041908745e-05, + "loss": 0.4885, + "step": 4870 + }, + { + "epoch": 0.7010353753235548, + "grad_norm": 0.22653760101343884, + "learning_rate": 1.742418888962079e-05, + "loss": 0.4772, + "step": 4875 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 0.2412243876517065, + "learning_rate": 1.7419142528352815e-05, + "loss": 0.5079, + "step": 4880 + }, + { + "epoch": 0.7024733966062697, + "grad_norm": 0.2292565589968856, + "learning_rate": 1.7414091960965745e-05, + "loss": 0.4601, + "step": 4885 + }, + { + "epoch": 0.7031924072476272, + "grad_norm": 0.2409486748433275, + "learning_rate": 1.7409037190322882e-05, + "loss": 0.4947, + "step": 4890 + }, + { + "epoch": 0.7039114178889847, + "grad_norm": 0.22951050066480008, + "learning_rate": 1.740397821928992e-05, + "loss": 0.4942, + "step": 4895 + }, + { + "epoch": 0.7046304285303423, + "grad_norm": 0.22113426876234893, + "learning_rate": 1.7398915050734934e-05, + "loss": 0.4954, + "step": 4900 + }, + { + "epoch": 0.7053494391716998, + "grad_norm": 0.22645148562176193, + "learning_rate": 1.7393847687528367e-05, + "loss": 0.4824, + "step": 4905 + }, + { + "epoch": 0.7060684498130573, + "grad_norm": 0.22216456536462917, + "learning_rate": 1.7388776132543055e-05, + "loss": 0.4627, + "step": 4910 + }, + { + "epoch": 0.7067874604544148, + "grad_norm": 0.23986355860805847, + "learning_rate": 1.73837003886542e-05, + "loss": 0.511, + "step": 4915 + }, + { + "epoch": 0.7075064710957722, + "grad_norm": 0.24141481260266942, + "learning_rate": 1.737862045873939e-05, + "loss": 0.4904, + "step": 4920 + }, + { + "epoch": 0.7082254817371297, + "grad_norm": 0.2247741598774793, + "learning_rate": 1.7373536345678578e-05, + "loss": 0.5114, + "step": 4925 + }, + { + "epoch": 0.7089444923784872, + "grad_norm": 0.22360516425594493, + "learning_rate": 1.736844805235408e-05, + "loss": 0.5022, + "step": 4930 + }, + { + "epoch": 0.7096635030198447, + "grad_norm": 0.22136089523441504, + "learning_rate": 1.73633555816506e-05, + "loss": 0.4964, + "step": 4935 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.21994560345932826, + "learning_rate": 1.7358258936455203e-05, + "loss": 0.4985, + "step": 4940 + }, + { + "epoch": 0.7111015243025597, + "grad_norm": 0.22481508730322386, + "learning_rate": 1.7353158119657312e-05, + "loss": 0.4924, + "step": 4945 + }, + { + "epoch": 0.7118205349439172, + "grad_norm": 0.23240499727299993, + "learning_rate": 1.7348053134148727e-05, + "loss": 0.4896, + "step": 4950 + }, + { + "epoch": 0.7125395455852747, + "grad_norm": 0.2283444105591223, + "learning_rate": 1.7342943982823612e-05, + "loss": 0.4912, + "step": 4955 + }, + { + "epoch": 0.7132585562266321, + "grad_norm": 0.22351511189067214, + "learning_rate": 1.7337830668578478e-05, + "loss": 0.5084, + "step": 4960 + }, + { + "epoch": 0.7139775668679896, + "grad_norm": 0.21938042798122012, + "learning_rate": 1.733271319431221e-05, + "loss": 0.4814, + "step": 4965 + }, + { + "epoch": 0.7146965775093471, + "grad_norm": 0.23337124553132274, + "learning_rate": 1.732759156292605e-05, + "loss": 0.4892, + "step": 4970 + }, + { + "epoch": 0.7154155881507046, + "grad_norm": 0.2273338423301612, + "learning_rate": 1.732246577732359e-05, + "loss": 0.4862, + "step": 4975 + }, + { + "epoch": 0.7161345987920621, + "grad_norm": 0.22692402136683593, + "learning_rate": 1.731733584041078e-05, + "loss": 0.4781, + "step": 4980 + }, + { + "epoch": 0.7168536094334196, + "grad_norm": 0.22180526870888811, + "learning_rate": 1.731220175509593e-05, + "loss": 0.4937, + "step": 4985 + }, + { + "epoch": 0.7175726200747771, + "grad_norm": 0.23380812277896126, + "learning_rate": 1.7307063524289692e-05, + "loss": 0.4911, + "step": 4990 + }, + { + "epoch": 0.7182916307161346, + "grad_norm": 0.22035178227881136, + "learning_rate": 1.730192115090507e-05, + "loss": 0.4816, + "step": 4995 + }, + { + "epoch": 0.719010641357492, + "grad_norm": 0.22970889099981737, + "learning_rate": 1.7296774637857428e-05, + "loss": 0.5036, + "step": 5000 + }, + { + "epoch": 0.7197296519988495, + "grad_norm": 0.22578161192062568, + "learning_rate": 1.729162398806446e-05, + "loss": 0.493, + "step": 5005 + }, + { + "epoch": 0.7204486626402071, + "grad_norm": 0.2196631750733422, + "learning_rate": 1.7286469204446215e-05, + "loss": 0.4768, + "step": 5010 + }, + { + "epoch": 0.7211676732815646, + "grad_norm": 0.2357705200771042, + "learning_rate": 1.7281310289925087e-05, + "loss": 0.4955, + "step": 5015 + }, + { + "epoch": 0.7218866839229221, + "grad_norm": 0.22215462652234658, + "learning_rate": 1.7276147247425802e-05, + "loss": 0.485, + "step": 5020 + }, + { + "epoch": 0.7226056945642796, + "grad_norm": 0.22752699079874833, + "learning_rate": 1.7270980079875444e-05, + "loss": 0.4918, + "step": 5025 + }, + { + "epoch": 0.7233247052056371, + "grad_norm": 0.23611477914863882, + "learning_rate": 1.726580879020341e-05, + "loss": 0.5021, + "step": 5030 + }, + { + "epoch": 0.7240437158469946, + "grad_norm": 0.2182898832045658, + "learning_rate": 1.7260633381341462e-05, + "loss": 0.4753, + "step": 5035 + }, + { + "epoch": 0.724762726488352, + "grad_norm": 0.2527928852955433, + "learning_rate": 1.7255453856223674e-05, + "loss": 0.4885, + "step": 5040 + }, + { + "epoch": 0.7254817371297095, + "grad_norm": 0.22254442434874563, + "learning_rate": 1.7250270217786473e-05, + "loss": 0.4986, + "step": 5045 + }, + { + "epoch": 0.726200747771067, + "grad_norm": 0.2301174649411321, + "learning_rate": 1.7245082468968596e-05, + "loss": 0.4904, + "step": 5050 + }, + { + "epoch": 0.7269197584124245, + "grad_norm": 0.213616955709942, + "learning_rate": 1.7239890612711135e-05, + "loss": 0.4967, + "step": 5055 + }, + { + "epoch": 0.727638769053782, + "grad_norm": 0.21870487684081705, + "learning_rate": 1.723469465195749e-05, + "loss": 0.496, + "step": 5060 + }, + { + "epoch": 0.7283577796951395, + "grad_norm": 0.21985442116131565, + "learning_rate": 1.7229494589653403e-05, + "loss": 0.4709, + "step": 5065 + }, + { + "epoch": 0.729076790336497, + "grad_norm": 0.24043341432046253, + "learning_rate": 1.722429042874693e-05, + "loss": 0.4944, + "step": 5070 + }, + { + "epoch": 0.7297958009778545, + "grad_norm": 0.22259041116703665, + "learning_rate": 1.7219082172188452e-05, + "loss": 0.4974, + "step": 5075 + }, + { + "epoch": 0.730514811619212, + "grad_norm": 0.2227993175937651, + "learning_rate": 1.7213869822930686e-05, + "loss": 0.4906, + "step": 5080 + }, + { + "epoch": 0.7312338222605694, + "grad_norm": 0.23570180181268807, + "learning_rate": 1.7208653383928645e-05, + "loss": 0.4769, + "step": 5085 + }, + { + "epoch": 0.7319528329019269, + "grad_norm": 0.22680006014496892, + "learning_rate": 1.7203432858139683e-05, + "loss": 0.5028, + "step": 5090 + }, + { + "epoch": 0.7326718435432844, + "grad_norm": 0.22184121654666847, + "learning_rate": 1.719820824852346e-05, + "loss": 0.4839, + "step": 5095 + }, + { + "epoch": 0.7333908541846419, + "grad_norm": 0.22870656998660482, + "learning_rate": 1.719297955804195e-05, + "loss": 0.4995, + "step": 5100 + }, + { + "epoch": 0.7341098648259994, + "grad_norm": 0.2192476673370964, + "learning_rate": 1.718774678965945e-05, + "loss": 0.48, + "step": 5105 + }, + { + "epoch": 0.7348288754673569, + "grad_norm": 0.2276749968421666, + "learning_rate": 1.7182509946342554e-05, + "loss": 0.5092, + "step": 5110 + }, + { + "epoch": 0.7355478861087144, + "grad_norm": 0.2323405461686891, + "learning_rate": 1.717726903106018e-05, + "loss": 0.4908, + "step": 5115 + }, + { + "epoch": 0.736266896750072, + "grad_norm": 0.2276263889083126, + "learning_rate": 1.717202404678355e-05, + "loss": 0.4824, + "step": 5120 + }, + { + "epoch": 0.7369859073914294, + "grad_norm": 0.23767352851696075, + "learning_rate": 1.716677499648619e-05, + "loss": 0.498, + "step": 5125 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.23840569740317488, + "learning_rate": 1.7161521883143936e-05, + "loss": 0.491, + "step": 5130 + }, + { + "epoch": 0.7384239286741444, + "grad_norm": 0.23215210545943304, + "learning_rate": 1.715626470973492e-05, + "loss": 0.4882, + "step": 5135 + }, + { + "epoch": 0.7391429393155019, + "grad_norm": 0.21936893104681401, + "learning_rate": 1.7151003479239583e-05, + "loss": 0.5061, + "step": 5140 + }, + { + "epoch": 0.7398619499568594, + "grad_norm": 0.21293031412925917, + "learning_rate": 1.7145738194640665e-05, + "loss": 0.4774, + "step": 5145 + }, + { + "epoch": 0.7405809605982169, + "grad_norm": 0.21941363355229476, + "learning_rate": 1.7140468858923198e-05, + "loss": 0.4902, + "step": 5150 + }, + { + "epoch": 0.7412999712395744, + "grad_norm": 0.2266668243692637, + "learning_rate": 1.7135195475074523e-05, + "loss": 0.4869, + "step": 5155 + }, + { + "epoch": 0.7420189818809318, + "grad_norm": 0.2209708864201447, + "learning_rate": 1.7129918046084263e-05, + "loss": 0.4758, + "step": 5160 + }, + { + "epoch": 0.7427379925222893, + "grad_norm": 0.22661270643043085, + "learning_rate": 1.712463657494434e-05, + "loss": 0.4973, + "step": 5165 + }, + { + "epoch": 0.7434570031636468, + "grad_norm": 0.23214286702988027, + "learning_rate": 1.711935106464897e-05, + "loss": 0.4996, + "step": 5170 + }, + { + "epoch": 0.7441760138050043, + "grad_norm": 0.2187655580344453, + "learning_rate": 1.7114061518194655e-05, + "loss": 0.4873, + "step": 5175 + }, + { + "epoch": 0.7448950244463618, + "grad_norm": 0.2242583712510688, + "learning_rate": 1.7108767938580184e-05, + "loss": 0.48, + "step": 5180 + }, + { + "epoch": 0.7456140350877193, + "grad_norm": 0.2152231764672806, + "learning_rate": 1.710347032880664e-05, + "loss": 0.4861, + "step": 5185 + }, + { + "epoch": 0.7463330457290768, + "grad_norm": 0.23531880208372852, + "learning_rate": 1.7098168691877386e-05, + "loss": 0.473, + "step": 5190 + }, + { + "epoch": 0.7470520563704343, + "grad_norm": 0.22192408281986586, + "learning_rate": 1.7092863030798063e-05, + "loss": 0.493, + "step": 5195 + }, + { + "epoch": 0.7477710670117917, + "grad_norm": 0.22200491012741358, + "learning_rate": 1.7087553348576603e-05, + "loss": 0.4864, + "step": 5200 + }, + { + "epoch": 0.7484900776531492, + "grad_norm": 0.22153294696718556, + "learning_rate": 1.7082239648223212e-05, + "loss": 0.4989, + "step": 5205 + }, + { + "epoch": 0.7492090882945067, + "grad_norm": 0.21469480556358042, + "learning_rate": 1.7076921932750374e-05, + "loss": 0.515, + "step": 5210 + }, + { + "epoch": 0.7499280989358642, + "grad_norm": 0.22916542377243984, + "learning_rate": 1.7071600205172848e-05, + "loss": 0.498, + "step": 5215 + }, + { + "epoch": 0.7506471095772217, + "grad_norm": 0.22666436482873567, + "learning_rate": 1.7066274468507677e-05, + "loss": 0.4987, + "step": 5220 + }, + { + "epoch": 0.7513661202185792, + "grad_norm": 0.2399650963726253, + "learning_rate": 1.7060944725774165e-05, + "loss": 0.4897, + "step": 5225 + }, + { + "epoch": 0.7520851308599368, + "grad_norm": 0.2112528215844886, + "learning_rate": 1.7055610979993895e-05, + "loss": 0.4886, + "step": 5230 + }, + { + "epoch": 0.7528041415012943, + "grad_norm": 0.21680533409841338, + "learning_rate": 1.705027323419071e-05, + "loss": 0.5032, + "step": 5235 + }, + { + "epoch": 0.7535231521426518, + "grad_norm": 0.2334385250291673, + "learning_rate": 1.7044931491390736e-05, + "loss": 0.4986, + "step": 5240 + }, + { + "epoch": 0.7542421627840092, + "grad_norm": 0.2167997638685853, + "learning_rate": 1.7039585754622345e-05, + "loss": 0.5036, + "step": 5245 + }, + { + "epoch": 0.7549611734253667, + "grad_norm": 0.22161454188712626, + "learning_rate": 1.7034236026916195e-05, + "loss": 0.4845, + "step": 5250 + }, + { + "epoch": 0.7556801840667242, + "grad_norm": 0.2300865929908801, + "learning_rate": 1.7028882311305194e-05, + "loss": 0.4831, + "step": 5255 + }, + { + "epoch": 0.7563991947080817, + "grad_norm": 0.2347650948886215, + "learning_rate": 1.7023524610824508e-05, + "loss": 0.4781, + "step": 5260 + }, + { + "epoch": 0.7571182053494392, + "grad_norm": 0.21672676426437748, + "learning_rate": 1.7018162928511572e-05, + "loss": 0.4866, + "step": 5265 + }, + { + "epoch": 0.7578372159907967, + "grad_norm": 0.2277082506824969, + "learning_rate": 1.7012797267406068e-05, + "loss": 0.4863, + "step": 5270 + }, + { + "epoch": 0.7585562266321542, + "grad_norm": 0.2249837586716329, + "learning_rate": 1.700742763054995e-05, + "loss": 0.4941, + "step": 5275 + }, + { + "epoch": 0.7592752372735116, + "grad_norm": 0.22592325114041079, + "learning_rate": 1.70020540209874e-05, + "loss": 0.4996, + "step": 5280 + }, + { + "epoch": 0.7599942479148691, + "grad_norm": 0.23233670566592116, + "learning_rate": 1.6996676441764877e-05, + "loss": 0.4909, + "step": 5285 + }, + { + "epoch": 0.7607132585562266, + "grad_norm": 0.21942890788454864, + "learning_rate": 1.6991294895931083e-05, + "loss": 0.4811, + "step": 5290 + }, + { + "epoch": 0.7614322691975841, + "grad_norm": 0.22043734089709277, + "learning_rate": 1.6985909386536957e-05, + "loss": 0.5007, + "step": 5295 + }, + { + "epoch": 0.7621512798389416, + "grad_norm": 0.228478118346707, + "learning_rate": 1.6980519916635704e-05, + "loss": 0.4662, + "step": 5300 + }, + { + "epoch": 0.7628702904802991, + "grad_norm": 0.22449985491532506, + "learning_rate": 1.6975126489282762e-05, + "loss": 0.4757, + "step": 5305 + }, + { + "epoch": 0.7635893011216566, + "grad_norm": 0.22059926031579813, + "learning_rate": 1.6969729107535814e-05, + "loss": 0.4754, + "step": 5310 + }, + { + "epoch": 0.7643083117630141, + "grad_norm": 0.2273525810445935, + "learning_rate": 1.6964327774454784e-05, + "loss": 0.4756, + "step": 5315 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 0.2309888426177309, + "learning_rate": 1.6958922493101844e-05, + "loss": 0.4972, + "step": 5320 + }, + { + "epoch": 0.765746333045729, + "grad_norm": 0.21694684669431855, + "learning_rate": 1.6953513266541396e-05, + "loss": 0.4875, + "step": 5325 + }, + { + "epoch": 0.7664653436870865, + "grad_norm": 0.22695318127330225, + "learning_rate": 1.6948100097840082e-05, + "loss": 0.4916, + "step": 5330 + }, + { + "epoch": 0.767184354328444, + "grad_norm": 0.23443902534856237, + "learning_rate": 1.694268299006678e-05, + "loss": 0.4918, + "step": 5335 + }, + { + "epoch": 0.7679033649698016, + "grad_norm": 0.22364296541891465, + "learning_rate": 1.6937261946292603e-05, + "loss": 0.4949, + "step": 5340 + }, + { + "epoch": 0.7686223756111591, + "grad_norm": 0.24012071225481527, + "learning_rate": 1.693183696959088e-05, + "loss": 0.4987, + "step": 5345 + }, + { + "epoch": 0.7693413862525166, + "grad_norm": 0.23501285078960005, + "learning_rate": 1.6926408063037194e-05, + "loss": 0.4734, + "step": 5350 + }, + { + "epoch": 0.7700603968938741, + "grad_norm": 0.22817716513346892, + "learning_rate": 1.692097522970934e-05, + "loss": 0.4697, + "step": 5355 + }, + { + "epoch": 0.7707794075352316, + "grad_norm": 0.2273677168182446, + "learning_rate": 1.6915538472687337e-05, + "loss": 0.483, + "step": 5360 + }, + { + "epoch": 0.771498418176589, + "grad_norm": 0.24145708448332248, + "learning_rate": 1.6910097795053443e-05, + "loss": 0.495, + "step": 5365 + }, + { + "epoch": 0.7722174288179465, + "grad_norm": 0.21989503645557912, + "learning_rate": 1.6904653199892128e-05, + "loss": 0.4928, + "step": 5370 + }, + { + "epoch": 0.772936439459304, + "grad_norm": 0.2308138520043049, + "learning_rate": 1.689920469029008e-05, + "loss": 0.4869, + "step": 5375 + }, + { + "epoch": 0.7736554501006615, + "grad_norm": 0.2190293279349582, + "learning_rate": 1.689375226933622e-05, + "loss": 0.4697, + "step": 5380 + }, + { + "epoch": 0.774374460742019, + "grad_norm": 0.22367853404329643, + "learning_rate": 1.6888295940121667e-05, + "loss": 0.4875, + "step": 5385 + }, + { + "epoch": 0.7750934713833765, + "grad_norm": 0.2248775477308476, + "learning_rate": 1.6882835705739777e-05, + "loss": 0.4838, + "step": 5390 + }, + { + "epoch": 0.775812482024734, + "grad_norm": 0.22960776143545394, + "learning_rate": 1.6877371569286103e-05, + "loss": 0.5037, + "step": 5395 + }, + { + "epoch": 0.7765314926660914, + "grad_norm": 0.22781522889996447, + "learning_rate": 1.6871903533858417e-05, + "loss": 0.4959, + "step": 5400 + }, + { + "epoch": 0.7772505033074489, + "grad_norm": 0.22083050011075891, + "learning_rate": 1.6866431602556704e-05, + "loss": 0.4885, + "step": 5405 + }, + { + "epoch": 0.7779695139488064, + "grad_norm": 0.21468440259119084, + "learning_rate": 1.686095577848315e-05, + "loss": 0.4975, + "step": 5410 + }, + { + "epoch": 0.7786885245901639, + "grad_norm": 0.22226370104089818, + "learning_rate": 1.6855476064742156e-05, + "loss": 0.4887, + "step": 5415 + }, + { + "epoch": 0.7794075352315214, + "grad_norm": 0.22494368750807195, + "learning_rate": 1.6849992464440323e-05, + "loss": 0.4968, + "step": 5420 + }, + { + "epoch": 0.7801265458728789, + "grad_norm": 0.22237263202529223, + "learning_rate": 1.684450498068646e-05, + "loss": 0.4835, + "step": 5425 + }, + { + "epoch": 0.7808455565142364, + "grad_norm": 0.22938231525634556, + "learning_rate": 1.6839013616591574e-05, + "loss": 0.4905, + "step": 5430 + }, + { + "epoch": 0.7815645671555939, + "grad_norm": 0.2144686219903849, + "learning_rate": 1.683351837526887e-05, + "loss": 0.5035, + "step": 5435 + }, + { + "epoch": 0.7822835777969513, + "grad_norm": 0.21730574169202727, + "learning_rate": 1.6828019259833758e-05, + "loss": 0.4762, + "step": 5440 + }, + { + "epoch": 0.7830025884383088, + "grad_norm": 0.22346888724944097, + "learning_rate": 1.6822516273403832e-05, + "loss": 0.463, + "step": 5445 + }, + { + "epoch": 0.7837215990796664, + "grad_norm": 0.22238996145588832, + "learning_rate": 1.68170094190989e-05, + "loss": 0.4952, + "step": 5450 + }, + { + "epoch": 0.7844406097210239, + "grad_norm": 0.21586108868278092, + "learning_rate": 1.681149870004094e-05, + "loss": 0.4896, + "step": 5455 + }, + { + "epoch": 0.7851596203623814, + "grad_norm": 0.23240105717982454, + "learning_rate": 1.6805984119354146e-05, + "loss": 0.4818, + "step": 5460 + }, + { + "epoch": 0.7858786310037389, + "grad_norm": 0.22390902508487515, + "learning_rate": 1.6800465680164875e-05, + "loss": 0.499, + "step": 5465 + }, + { + "epoch": 0.7865976416450964, + "grad_norm": 0.2281975071709816, + "learning_rate": 1.6794943385601688e-05, + "loss": 0.4914, + "step": 5470 + }, + { + "epoch": 0.7873166522864539, + "grad_norm": 0.22488441842585513, + "learning_rate": 1.6789417238795334e-05, + "loss": 0.4635, + "step": 5475 + }, + { + "epoch": 0.7880356629278114, + "grad_norm": 0.22598444041192908, + "learning_rate": 1.678388724287873e-05, + "loss": 0.4772, + "step": 5480 + }, + { + "epoch": 0.7887546735691688, + "grad_norm": 0.2199128816848034, + "learning_rate": 1.6778353400986996e-05, + "loss": 0.4797, + "step": 5485 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.22362822017432454, + "learning_rate": 1.6772815716257414e-05, + "loss": 0.4847, + "step": 5490 + }, + { + "epoch": 0.7901926948518838, + "grad_norm": 0.2243910018533074, + "learning_rate": 1.676727419182945e-05, + "loss": 0.4868, + "step": 5495 + }, + { + "epoch": 0.7909117054932413, + "grad_norm": 0.23119935601280228, + "learning_rate": 1.6761728830844758e-05, + "loss": 0.4804, + "step": 5500 + }, + { + "epoch": 0.7916307161345988, + "grad_norm": 0.22939329313957305, + "learning_rate": 1.6756179636447153e-05, + "loss": 0.483, + "step": 5505 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.223164242974818, + "learning_rate": 1.6750626611782624e-05, + "loss": 0.4934, + "step": 5510 + }, + { + "epoch": 0.7930687374173138, + "grad_norm": 0.21955055597425524, + "learning_rate": 1.6745069759999345e-05, + "loss": 0.4863, + "step": 5515 + }, + { + "epoch": 0.7937877480586712, + "grad_norm": 0.22108710690359845, + "learning_rate": 1.673950908424764e-05, + "loss": 0.4933, + "step": 5520 + }, + { + "epoch": 0.7945067587000287, + "grad_norm": 0.21896612206901434, + "learning_rate": 1.6733944587680024e-05, + "loss": 0.4842, + "step": 5525 + }, + { + "epoch": 0.7952257693413862, + "grad_norm": 0.2250445157258839, + "learning_rate": 1.6728376273451155e-05, + "loss": 0.4802, + "step": 5530 + }, + { + "epoch": 0.7959447799827437, + "grad_norm": 0.22614610498599702, + "learning_rate": 1.6722804144717866e-05, + "loss": 0.4867, + "step": 5535 + }, + { + "epoch": 0.7966637906241012, + "grad_norm": 0.21945982018024862, + "learning_rate": 1.671722820463916e-05, + "loss": 0.4837, + "step": 5540 + }, + { + "epoch": 0.7973828012654587, + "grad_norm": 0.2235783071481693, + "learning_rate": 1.6711648456376187e-05, + "loss": 0.4847, + "step": 5545 + }, + { + "epoch": 0.7981018119068162, + "grad_norm": 0.22503519784393333, + "learning_rate": 1.6706064903092265e-05, + "loss": 0.4824, + "step": 5550 + }, + { + "epoch": 0.7988208225481737, + "grad_norm": 0.23765125443432644, + "learning_rate": 1.670047754795287e-05, + "loss": 0.502, + "step": 5555 + }, + { + "epoch": 0.7995398331895313, + "grad_norm": 0.24539570004485733, + "learning_rate": 1.6694886394125616e-05, + "loss": 0.4853, + "step": 5560 + }, + { + "epoch": 0.8002588438308887, + "grad_norm": 0.22839361425841867, + "learning_rate": 1.6689291444780296e-05, + "loss": 0.4843, + "step": 5565 + }, + { + "epoch": 0.8009778544722462, + "grad_norm": 0.2220318852698405, + "learning_rate": 1.668369270308884e-05, + "loss": 0.4761, + "step": 5570 + }, + { + "epoch": 0.8016968651136037, + "grad_norm": 0.22278535824455947, + "learning_rate": 1.6678090172225334e-05, + "loss": 0.4724, + "step": 5575 + }, + { + "epoch": 0.8024158757549612, + "grad_norm": 0.22065163329260376, + "learning_rate": 1.6672483855366003e-05, + "loss": 0.4823, + "step": 5580 + }, + { + "epoch": 0.8031348863963187, + "grad_norm": 0.22705093047998076, + "learning_rate": 1.6666873755689233e-05, + "loss": 0.473, + "step": 5585 + }, + { + "epoch": 0.8038538970376762, + "grad_norm": 0.22183215111313642, + "learning_rate": 1.6661259876375538e-05, + "loss": 0.4858, + "step": 5590 + }, + { + "epoch": 0.8045729076790337, + "grad_norm": 0.2287693632580429, + "learning_rate": 1.6655642220607585e-05, + "loss": 0.4841, + "step": 5595 + }, + { + "epoch": 0.8052919183203912, + "grad_norm": 0.2160365599554167, + "learning_rate": 1.665002079157018e-05, + "loss": 0.4812, + "step": 5600 + }, + { + "epoch": 0.8060109289617486, + "grad_norm": 0.23398640973486998, + "learning_rate": 1.6644395592450275e-05, + "loss": 0.4978, + "step": 5605 + }, + { + "epoch": 0.8067299396031061, + "grad_norm": 0.2246444511777012, + "learning_rate": 1.6638766626436942e-05, + "loss": 0.4949, + "step": 5610 + }, + { + "epoch": 0.8074489502444636, + "grad_norm": 0.23046563410664012, + "learning_rate": 1.663313389672141e-05, + "loss": 0.4782, + "step": 5615 + }, + { + "epoch": 0.8081679608858211, + "grad_norm": 0.2236054981949869, + "learning_rate": 1.662749740649702e-05, + "loss": 0.5058, + "step": 5620 + }, + { + "epoch": 0.8088869715271786, + "grad_norm": 0.22204436891909438, + "learning_rate": 1.662185715895926e-05, + "loss": 0.4795, + "step": 5625 + }, + { + "epoch": 0.8096059821685361, + "grad_norm": 0.22077332371837027, + "learning_rate": 1.6616213157305742e-05, + "loss": 0.4898, + "step": 5630 + }, + { + "epoch": 0.8103249928098936, + "grad_norm": 0.23330823438995923, + "learning_rate": 1.6610565404736216e-05, + "loss": 0.4825, + "step": 5635 + }, + { + "epoch": 0.811044003451251, + "grad_norm": 0.2189476109187264, + "learning_rate": 1.660491390445254e-05, + "loss": 0.4748, + "step": 5640 + }, + { + "epoch": 0.8117630140926085, + "grad_norm": 0.2216225341020065, + "learning_rate": 1.6599258659658716e-05, + "loss": 0.4804, + "step": 5645 + }, + { + "epoch": 0.812482024733966, + "grad_norm": 0.23714374623134352, + "learning_rate": 1.6593599673560854e-05, + "loss": 0.5001, + "step": 5650 + }, + { + "epoch": 0.8132010353753235, + "grad_norm": 0.23710103986620198, + "learning_rate": 1.6587936949367195e-05, + "loss": 0.4703, + "step": 5655 + }, + { + "epoch": 0.813920046016681, + "grad_norm": 0.2386050339880569, + "learning_rate": 1.658227049028809e-05, + "loss": 0.4987, + "step": 5660 + }, + { + "epoch": 0.8146390566580385, + "grad_norm": 0.21525085942313, + "learning_rate": 1.6576600299536024e-05, + "loss": 0.4944, + "step": 5665 + }, + { + "epoch": 0.8153580672993961, + "grad_norm": 0.23108927827783027, + "learning_rate": 1.6570926380325574e-05, + "loss": 0.4817, + "step": 5670 + }, + { + "epoch": 0.8160770779407536, + "grad_norm": 0.2300507584194407, + "learning_rate": 1.6565248735873452e-05, + "loss": 0.4992, + "step": 5675 + }, + { + "epoch": 0.8167960885821111, + "grad_norm": 0.21965071462741484, + "learning_rate": 1.6559567369398468e-05, + "loss": 0.4658, + "step": 5680 + }, + { + "epoch": 0.8175150992234685, + "grad_norm": 0.2229343886670813, + "learning_rate": 1.6553882284121554e-05, + "loss": 0.4758, + "step": 5685 + }, + { + "epoch": 0.818234109864826, + "grad_norm": 0.23703192379223698, + "learning_rate": 1.6548193483265737e-05, + "loss": 0.4663, + "step": 5690 + }, + { + "epoch": 0.8189531205061835, + "grad_norm": 0.23582047078488924, + "learning_rate": 1.6542500970056154e-05, + "loss": 0.4941, + "step": 5695 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.21828699915638525, + "learning_rate": 1.653680474772006e-05, + "loss": 0.4629, + "step": 5700 + }, + { + "epoch": 0.8203911417888985, + "grad_norm": 0.24568154916375534, + "learning_rate": 1.6531104819486795e-05, + "loss": 0.481, + "step": 5705 + }, + { + "epoch": 0.821110152430256, + "grad_norm": 0.22389567629138732, + "learning_rate": 1.6525401188587812e-05, + "loss": 0.4887, + "step": 5710 + }, + { + "epoch": 0.8218291630716135, + "grad_norm": 0.2168816136348257, + "learning_rate": 1.6519693858256657e-05, + "loss": 0.5099, + "step": 5715 + }, + { + "epoch": 0.822548173712971, + "grad_norm": 0.23615713732079055, + "learning_rate": 1.6513982831728975e-05, + "loss": 0.4799, + "step": 5720 + }, + { + "epoch": 0.8232671843543284, + "grad_norm": 0.21988093075155635, + "learning_rate": 1.6508268112242502e-05, + "loss": 0.4759, + "step": 5725 + }, + { + "epoch": 0.8239861949956859, + "grad_norm": 0.22800000584062294, + "learning_rate": 1.650254970303708e-05, + "loss": 0.4814, + "step": 5730 + }, + { + "epoch": 0.8247052056370434, + "grad_norm": 0.21678615221681993, + "learning_rate": 1.6496827607354626e-05, + "loss": 0.4847, + "step": 5735 + }, + { + "epoch": 0.8254242162784009, + "grad_norm": 0.23408400973634177, + "learning_rate": 1.6491101828439166e-05, + "loss": 0.4881, + "step": 5740 + }, + { + "epoch": 0.8261432269197584, + "grad_norm": 0.2186056862208376, + "learning_rate": 1.6485372369536795e-05, + "loss": 0.4924, + "step": 5745 + }, + { + "epoch": 0.8268622375611159, + "grad_norm": 0.2159187104796151, + "learning_rate": 1.647963923389571e-05, + "loss": 0.4825, + "step": 5750 + }, + { + "epoch": 0.8275812482024734, + "grad_norm": 0.24496266550950746, + "learning_rate": 1.6473902424766183e-05, + "loss": 0.494, + "step": 5755 + }, + { + "epoch": 0.8283002588438308, + "grad_norm": 0.2216005575843833, + "learning_rate": 1.6468161945400563e-05, + "loss": 0.4986, + "step": 5760 + }, + { + "epoch": 0.8290192694851883, + "grad_norm": 0.22620718671453902, + "learning_rate": 1.6462417799053305e-05, + "loss": 0.4852, + "step": 5765 + }, + { + "epoch": 0.8297382801265458, + "grad_norm": 0.22835005662918015, + "learning_rate": 1.6456669988980914e-05, + "loss": 0.4908, + "step": 5770 + }, + { + "epoch": 0.8304572907679033, + "grad_norm": 0.22567869815441383, + "learning_rate": 1.6450918518441987e-05, + "loss": 0.4833, + "step": 5775 + }, + { + "epoch": 0.8311763014092609, + "grad_norm": 0.2315328450963394, + "learning_rate": 1.6445163390697195e-05, + "loss": 0.5077, + "step": 5780 + }, + { + "epoch": 0.8318953120506184, + "grad_norm": 0.22117832252886435, + "learning_rate": 1.6439404609009274e-05, + "loss": 0.4814, + "step": 5785 + }, + { + "epoch": 0.8326143226919759, + "grad_norm": 0.2341708532379826, + "learning_rate": 1.643364217664305e-05, + "loss": 0.4788, + "step": 5790 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.22387820306203415, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.4817, + "step": 5795 + }, + { + "epoch": 0.8340523439746909, + "grad_norm": 0.22556092688610593, + "learning_rate": 1.642210637294527e-05, + "loss": 0.4717, + "step": 5800 + }, + { + "epoch": 0.8347713546160483, + "grad_norm": 0.23744905375208034, + "learning_rate": 1.6416333008153686e-05, + "loss": 0.4758, + "step": 5805 + }, + { + "epoch": 0.8354903652574058, + "grad_norm": 0.23711306495500717, + "learning_rate": 1.6410556005763722e-05, + "loss": 0.4779, + "step": 5810 + }, + { + "epoch": 0.8362093758987633, + "grad_norm": 0.22411311826775682, + "learning_rate": 1.640477536905053e-05, + "loss": 0.4859, + "step": 5815 + }, + { + "epoch": 0.8369283865401208, + "grad_norm": 0.2428542391128517, + "learning_rate": 1.6398991101291316e-05, + "loss": 0.471, + "step": 5820 + }, + { + "epoch": 0.8376473971814783, + "grad_norm": 0.22509448770506563, + "learning_rate": 1.6393203205765335e-05, + "loss": 0.4782, + "step": 5825 + }, + { + "epoch": 0.8383664078228358, + "grad_norm": 0.22559902539830068, + "learning_rate": 1.6387411685753912e-05, + "loss": 0.468, + "step": 5830 + }, + { + "epoch": 0.8390854184641933, + "grad_norm": 0.24306805583011284, + "learning_rate": 1.6381616544540415e-05, + "loss": 0.4905, + "step": 5835 + }, + { + "epoch": 0.8398044291055508, + "grad_norm": 0.23787097633193682, + "learning_rate": 1.637581778541028e-05, + "loss": 0.4898, + "step": 5840 + }, + { + "epoch": 0.8405234397469082, + "grad_norm": 0.21778768663408346, + "learning_rate": 1.637001541165098e-05, + "loss": 0.4726, + "step": 5845 + }, + { + "epoch": 0.8412424503882657, + "grad_norm": 0.23070983780461993, + "learning_rate": 1.6364209426552046e-05, + "loss": 0.4758, + "step": 5850 + }, + { + "epoch": 0.8419614610296232, + "grad_norm": 0.2835667407348202, + "learning_rate": 1.6358399833405044e-05, + "loss": 0.4956, + "step": 5855 + }, + { + "epoch": 0.8426804716709807, + "grad_norm": 0.2381888745289789, + "learning_rate": 1.6352586635503608e-05, + "loss": 0.4841, + "step": 5860 + }, + { + "epoch": 0.8433994823123382, + "grad_norm": 0.22569894243887517, + "learning_rate": 1.6346769836143393e-05, + "loss": 0.4857, + "step": 5865 + }, + { + "epoch": 0.8441184929536957, + "grad_norm": 0.2302954548936336, + "learning_rate": 1.6340949438622112e-05, + "loss": 0.5063, + "step": 5870 + }, + { + "epoch": 0.8448375035950532, + "grad_norm": 0.21531230746504856, + "learning_rate": 1.6335125446239505e-05, + "loss": 0.4786, + "step": 5875 + }, + { + "epoch": 0.8455565142364107, + "grad_norm": 0.23073398174636456, + "learning_rate": 1.6329297862297357e-05, + "loss": 0.4808, + "step": 5880 + }, + { + "epoch": 0.8462755248777681, + "grad_norm": 0.22976229717518745, + "learning_rate": 1.632346669009949e-05, + "loss": 0.4716, + "step": 5885 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.22232867606276158, + "learning_rate": 1.6317631932951754e-05, + "loss": 0.4775, + "step": 5890 + }, + { + "epoch": 0.8477135461604832, + "grad_norm": 0.21947980567589506, + "learning_rate": 1.631179359416204e-05, + "loss": 0.4893, + "step": 5895 + }, + { + "epoch": 0.8484325568018407, + "grad_norm": 0.2110588984550264, + "learning_rate": 1.6305951677040267e-05, + "loss": 0.471, + "step": 5900 + }, + { + "epoch": 0.8491515674431982, + "grad_norm": 0.23131721067942174, + "learning_rate": 1.6300106184898378e-05, + "loss": 0.4965, + "step": 5905 + }, + { + "epoch": 0.8498705780845557, + "grad_norm": 0.23118088988577612, + "learning_rate": 1.6294257121050346e-05, + "loss": 0.4725, + "step": 5910 + }, + { + "epoch": 0.8505895887259132, + "grad_norm": 0.22190058206659963, + "learning_rate": 1.6288404488812166e-05, + "loss": 0.5111, + "step": 5915 + }, + { + "epoch": 0.8513085993672707, + "grad_norm": 0.21259685560742111, + "learning_rate": 1.6282548291501862e-05, + "loss": 0.4737, + "step": 5920 + }, + { + "epoch": 0.8520276100086281, + "grad_norm": 0.2205808860614832, + "learning_rate": 1.6276688532439476e-05, + "loss": 0.4773, + "step": 5925 + }, + { + "epoch": 0.8527466206499856, + "grad_norm": 0.22014342140272772, + "learning_rate": 1.6270825214947067e-05, + "loss": 0.477, + "step": 5930 + }, + { + "epoch": 0.8534656312913431, + "grad_norm": 0.22028914254683596, + "learning_rate": 1.626495834234872e-05, + "loss": 0.4976, + "step": 5935 + }, + { + "epoch": 0.8541846419327006, + "grad_norm": 0.22540771119256428, + "learning_rate": 1.625908791797052e-05, + "loss": 0.49, + "step": 5940 + }, + { + "epoch": 0.8549036525740581, + "grad_norm": 0.2243457513999384, + "learning_rate": 1.6253213945140577e-05, + "loss": 0.4708, + "step": 5945 + }, + { + "epoch": 0.8556226632154156, + "grad_norm": 0.22179506056212145, + "learning_rate": 1.6247336427189013e-05, + "loss": 0.4612, + "step": 5950 + }, + { + "epoch": 0.8563416738567731, + "grad_norm": 0.22499318240500388, + "learning_rate": 1.6241455367447955e-05, + "loss": 0.4799, + "step": 5955 + }, + { + "epoch": 0.8570606844981306, + "grad_norm": 0.23238220072024518, + "learning_rate": 1.623557076925154e-05, + "loss": 0.4944, + "step": 5960 + }, + { + "epoch": 0.857779695139488, + "grad_norm": 0.2328029055532124, + "learning_rate": 1.6229682635935913e-05, + "loss": 0.4896, + "step": 5965 + }, + { + "epoch": 0.8584987057808455, + "grad_norm": 0.23127132014501922, + "learning_rate": 1.6223790970839214e-05, + "loss": 0.48, + "step": 5970 + }, + { + "epoch": 0.859217716422203, + "grad_norm": 0.2231009382922071, + "learning_rate": 1.6217895777301606e-05, + "loss": 0.4787, + "step": 5975 + }, + { + "epoch": 0.8599367270635605, + "grad_norm": 0.2317042007787483, + "learning_rate": 1.6211997058665226e-05, + "loss": 0.4766, + "step": 5980 + }, + { + "epoch": 0.860655737704918, + "grad_norm": 0.22457281308081492, + "learning_rate": 1.6206094818274228e-05, + "loss": 0.5016, + "step": 5985 + }, + { + "epoch": 0.8613747483462755, + "grad_norm": 0.22725088886882755, + "learning_rate": 1.6200189059474758e-05, + "loss": 0.4776, + "step": 5990 + }, + { + "epoch": 0.862093758987633, + "grad_norm": 0.2357506938091471, + "learning_rate": 1.6194279785614955e-05, + "loss": 0.4896, + "step": 5995 + }, + { + "epoch": 0.8628127696289906, + "grad_norm": 0.23057941226085277, + "learning_rate": 1.618836700004495e-05, + "loss": 0.487, + "step": 6000 + }, + { + "epoch": 0.863531780270348, + "grad_norm": 0.22756225548351794, + "learning_rate": 1.6182450706116863e-05, + "loss": 0.4989, + "step": 6005 + }, + { + "epoch": 0.8642507909117055, + "grad_norm": 0.22167997938476994, + "learning_rate": 1.617653090718481e-05, + "loss": 0.481, + "step": 6010 + }, + { + "epoch": 0.864969801553063, + "grad_norm": 0.2316491758487818, + "learning_rate": 1.6170607606604895e-05, + "loss": 0.4638, + "step": 6015 + }, + { + "epoch": 0.8656888121944205, + "grad_norm": 0.23171481513913114, + "learning_rate": 1.6164680807735192e-05, + "loss": 0.4881, + "step": 6020 + }, + { + "epoch": 0.866407822835778, + "grad_norm": 0.23045109254133472, + "learning_rate": 1.615875051393578e-05, + "loss": 0.4797, + "step": 6025 + }, + { + "epoch": 0.8671268334771355, + "grad_norm": 0.21834412170756476, + "learning_rate": 1.6152816728568697e-05, + "loss": 0.5082, + "step": 6030 + }, + { + "epoch": 0.867845844118493, + "grad_norm": 0.20756527996156474, + "learning_rate": 1.614687945499798e-05, + "loss": 0.4718, + "step": 6035 + }, + { + "epoch": 0.8685648547598505, + "grad_norm": 0.22570883487100146, + "learning_rate": 1.6140938696589634e-05, + "loss": 0.4769, + "step": 6040 + }, + { + "epoch": 0.869283865401208, + "grad_norm": 0.2242382985650034, + "learning_rate": 1.6134994456711638e-05, + "loss": 0.4707, + "step": 6045 + }, + { + "epoch": 0.8700028760425654, + "grad_norm": 0.2199373810957077, + "learning_rate": 1.6129046738733947e-05, + "loss": 0.4822, + "step": 6050 + }, + { + "epoch": 0.8707218866839229, + "grad_norm": 0.2230809670907246, + "learning_rate": 1.6123095546028495e-05, + "loss": 0.4898, + "step": 6055 + }, + { + "epoch": 0.8714408973252804, + "grad_norm": 0.22136148728649427, + "learning_rate": 1.611714088196917e-05, + "loss": 0.482, + "step": 6060 + }, + { + "epoch": 0.8721599079666379, + "grad_norm": 0.23195219942814263, + "learning_rate": 1.6111182749931845e-05, + "loss": 0.4687, + "step": 6065 + }, + { + "epoch": 0.8728789186079954, + "grad_norm": 0.22535271756104616, + "learning_rate": 1.610522115329435e-05, + "loss": 0.4797, + "step": 6070 + }, + { + "epoch": 0.8735979292493529, + "grad_norm": 0.23272576269146622, + "learning_rate": 1.6099256095436476e-05, + "loss": 0.4873, + "step": 6075 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.23328064687107508, + "learning_rate": 1.6093287579739983e-05, + "loss": 0.495, + "step": 6080 + }, + { + "epoch": 0.8750359505320678, + "grad_norm": 0.23241446465065863, + "learning_rate": 1.608731560958859e-05, + "loss": 0.4958, + "step": 6085 + }, + { + "epoch": 0.8757549611734253, + "grad_norm": 0.2165342273687269, + "learning_rate": 1.608134018836798e-05, + "loss": 0.4872, + "step": 6090 + }, + { + "epoch": 0.8764739718147828, + "grad_norm": 0.21036060325924805, + "learning_rate": 1.6075361319465773e-05, + "loss": 0.4892, + "step": 6095 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 0.22465055999089578, + "learning_rate": 1.606937900627157e-05, + "loss": 0.4693, + "step": 6100 + }, + { + "epoch": 0.8779119930974978, + "grad_norm": 0.21896389536444244, + "learning_rate": 1.60633932521769e-05, + "loss": 0.4781, + "step": 6105 + }, + { + "epoch": 0.8786310037388554, + "grad_norm": 0.2221483004455103, + "learning_rate": 1.6057404060575264e-05, + "loss": 0.4857, + "step": 6110 + }, + { + "epoch": 0.8793500143802129, + "grad_norm": 0.22614345726496707, + "learning_rate": 1.6051411434862094e-05, + "loss": 0.4763, + "step": 6115 + }, + { + "epoch": 0.8800690250215704, + "grad_norm": 0.22634152148536316, + "learning_rate": 1.604541537843478e-05, + "loss": 0.4911, + "step": 6120 + }, + { + "epoch": 0.8807880356629278, + "grad_norm": 0.22412631243319087, + "learning_rate": 1.6039415894692657e-05, + "loss": 0.4606, + "step": 6125 + }, + { + "epoch": 0.8815070463042853, + "grad_norm": 0.22735655451709952, + "learning_rate": 1.6033412987036994e-05, + "loss": 0.4785, + "step": 6130 + }, + { + "epoch": 0.8822260569456428, + "grad_norm": 0.23678519265006817, + "learning_rate": 1.6027406658871014e-05, + "loss": 0.4825, + "step": 6135 + }, + { + "epoch": 0.8829450675870003, + "grad_norm": 0.22413195344276224, + "learning_rate": 1.6021396913599865e-05, + "loss": 0.4792, + "step": 6140 + }, + { + "epoch": 0.8836640782283578, + "grad_norm": 0.23892307294965057, + "learning_rate": 1.601538375463064e-05, + "loss": 0.4825, + "step": 6145 + }, + { + "epoch": 0.8843830888697153, + "grad_norm": 0.21516185870148855, + "learning_rate": 1.6009367185372377e-05, + "loss": 0.4757, + "step": 6150 + }, + { + "epoch": 0.8851020995110728, + "grad_norm": 0.2170681075937456, + "learning_rate": 1.6003347209236025e-05, + "loss": 0.4799, + "step": 6155 + }, + { + "epoch": 0.8858211101524303, + "grad_norm": 0.2294809317302986, + "learning_rate": 1.599732382963448e-05, + "loss": 0.4611, + "step": 6160 + }, + { + "epoch": 0.8865401207937877, + "grad_norm": 0.22283423190183876, + "learning_rate": 1.599129704998257e-05, + "loss": 0.4864, + "step": 6165 + }, + { + "epoch": 0.8872591314351452, + "grad_norm": 0.2175656604739115, + "learning_rate": 1.598526687369703e-05, + "loss": 0.4869, + "step": 6170 + }, + { + "epoch": 0.8879781420765027, + "grad_norm": 0.23105587197199792, + "learning_rate": 1.5979233304196556e-05, + "loss": 0.4873, + "step": 6175 + }, + { + "epoch": 0.8886971527178602, + "grad_norm": 0.21272643771628316, + "learning_rate": 1.597319634490173e-05, + "loss": 0.4688, + "step": 6180 + }, + { + "epoch": 0.8894161633592177, + "grad_norm": 0.21893173889511963, + "learning_rate": 1.5967155999235083e-05, + "loss": 0.4778, + "step": 6185 + }, + { + "epoch": 0.8901351740005752, + "grad_norm": 0.2272209278510632, + "learning_rate": 1.5961112270621048e-05, + "loss": 0.4664, + "step": 6190 + }, + { + "epoch": 0.8908541846419327, + "grad_norm": 0.22082710241141104, + "learning_rate": 1.595506516248599e-05, + "loss": 0.4981, + "step": 6195 + }, + { + "epoch": 0.8915731952832902, + "grad_norm": 0.2217593697200759, + "learning_rate": 1.594901467825818e-05, + "loss": 0.4796, + "step": 6200 + }, + { + "epoch": 0.8922922059246476, + "grad_norm": 0.22488503585245775, + "learning_rate": 1.594296082136781e-05, + "loss": 0.4865, + "step": 6205 + }, + { + "epoch": 0.8930112165660051, + "grad_norm": 0.22575245374790812, + "learning_rate": 1.5936903595246974e-05, + "loss": 0.4875, + "step": 6210 + }, + { + "epoch": 0.8937302272073626, + "grad_norm": 0.22818628540292302, + "learning_rate": 1.593084300332969e-05, + "loss": 0.4985, + "step": 6215 + }, + { + "epoch": 0.8944492378487202, + "grad_norm": 0.2302092002355399, + "learning_rate": 1.592477904905187e-05, + "loss": 0.5042, + "step": 6220 + }, + { + "epoch": 0.8951682484900777, + "grad_norm": 0.21943538633345835, + "learning_rate": 1.5918711735851342e-05, + "loss": 0.4778, + "step": 6225 + }, + { + "epoch": 0.8958872591314352, + "grad_norm": 0.2242531512662913, + "learning_rate": 1.591264106716784e-05, + "loss": 0.4807, + "step": 6230 + }, + { + "epoch": 0.8966062697727927, + "grad_norm": 0.21745708652231507, + "learning_rate": 1.5906567046442987e-05, + "loss": 0.476, + "step": 6235 + }, + { + "epoch": 0.8973252804141502, + "grad_norm": 0.22501323800463438, + "learning_rate": 1.5900489677120318e-05, + "loss": 0.4858, + "step": 6240 + }, + { + "epoch": 0.8980442910555076, + "grad_norm": 0.22790024126820935, + "learning_rate": 1.589440896264527e-05, + "loss": 0.4697, + "step": 6245 + }, + { + "epoch": 0.8987633016968651, + "grad_norm": 0.21995145936225782, + "learning_rate": 1.5888324906465164e-05, + "loss": 0.4641, + "step": 6250 + }, + { + "epoch": 0.8994823123382226, + "grad_norm": 0.23120518687662991, + "learning_rate": 1.5882237512029217e-05, + "loss": 0.4863, + "step": 6255 + }, + { + "epoch": 0.9002013229795801, + "grad_norm": 0.23010807888424378, + "learning_rate": 1.5876146782788552e-05, + "loss": 0.4968, + "step": 6260 + }, + { + "epoch": 0.9009203336209376, + "grad_norm": 0.231826121768125, + "learning_rate": 1.587005272219617e-05, + "loss": 0.4952, + "step": 6265 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.22390104604646177, + "learning_rate": 1.586395533370696e-05, + "loss": 0.4692, + "step": 6270 + }, + { + "epoch": 0.9023583549036526, + "grad_norm": 0.2184873369261828, + "learning_rate": 1.5857854620777705e-05, + "loss": 0.4874, + "step": 6275 + }, + { + "epoch": 0.9030773655450101, + "grad_norm": 0.21982204158725013, + "learning_rate": 1.5851750586867072e-05, + "loss": 0.4907, + "step": 6280 + }, + { + "epoch": 0.9037963761863675, + "grad_norm": 0.23976510415423974, + "learning_rate": 1.5845643235435603e-05, + "loss": 0.4985, + "step": 6285 + }, + { + "epoch": 0.904515386827725, + "grad_norm": 0.23374421323926556, + "learning_rate": 1.5839532569945733e-05, + "loss": 0.4908, + "step": 6290 + }, + { + "epoch": 0.9052343974690825, + "grad_norm": 0.21948693325100235, + "learning_rate": 1.5833418593861764e-05, + "loss": 0.4747, + "step": 6295 + }, + { + "epoch": 0.90595340811044, + "grad_norm": 0.22071951471152612, + "learning_rate": 1.5827301310649882e-05, + "loss": 0.4778, + "step": 6300 + }, + { + "epoch": 0.9066724187517975, + "grad_norm": 0.21646960390367734, + "learning_rate": 1.582118072377814e-05, + "loss": 0.4866, + "step": 6305 + }, + { + "epoch": 0.907391429393155, + "grad_norm": 0.21923777159118818, + "learning_rate": 1.581505683671648e-05, + "loss": 0.4817, + "step": 6310 + }, + { + "epoch": 0.9081104400345125, + "grad_norm": 0.21454771867122877, + "learning_rate": 1.5808929652936696e-05, + "loss": 0.4738, + "step": 6315 + }, + { + "epoch": 0.90882945067587, + "grad_norm": 0.22792992248683422, + "learning_rate": 1.580279917591246e-05, + "loss": 0.503, + "step": 6320 + }, + { + "epoch": 0.9095484613172274, + "grad_norm": 0.2538238320439887, + "learning_rate": 1.5796665409119314e-05, + "loss": 0.4775, + "step": 6325 + }, + { + "epoch": 0.910267471958585, + "grad_norm": 0.22588015715472973, + "learning_rate": 1.5790528356034664e-05, + "loss": 0.4903, + "step": 6330 + }, + { + "epoch": 0.9109864825999425, + "grad_norm": 0.22418936739523307, + "learning_rate": 1.578438802013777e-05, + "loss": 0.4867, + "step": 6335 + }, + { + "epoch": 0.9117054932413, + "grad_norm": 0.22751983453106048, + "learning_rate": 1.5778244404909766e-05, + "loss": 0.4754, + "step": 6340 + }, + { + "epoch": 0.9124245038826575, + "grad_norm": 0.21899944187479214, + "learning_rate": 1.5772097513833638e-05, + "loss": 0.4678, + "step": 6345 + }, + { + "epoch": 0.913143514524015, + "grad_norm": 0.2417862929099873, + "learning_rate": 1.5765947350394223e-05, + "loss": 0.4857, + "step": 6350 + }, + { + "epoch": 0.9138625251653725, + "grad_norm": 0.24055106261970338, + "learning_rate": 1.575979391807823e-05, + "loss": 0.4781, + "step": 6355 + }, + { + "epoch": 0.91458153580673, + "grad_norm": 0.22946156855092792, + "learning_rate": 1.5753637220374207e-05, + "loss": 0.4904, + "step": 6360 + }, + { + "epoch": 0.9153005464480874, + "grad_norm": 0.22385014546780724, + "learning_rate": 1.574747726077256e-05, + "loss": 0.4604, + "step": 6365 + }, + { + "epoch": 0.9160195570894449, + "grad_norm": 0.22505092536829716, + "learning_rate": 1.5741314042765538e-05, + "loss": 0.4759, + "step": 6370 + }, + { + "epoch": 0.9167385677308024, + "grad_norm": 0.22073380209287308, + "learning_rate": 1.5735147569847246e-05, + "loss": 0.4827, + "step": 6375 + }, + { + "epoch": 0.9174575783721599, + "grad_norm": 0.22423700907272914, + "learning_rate": 1.572897784551363e-05, + "loss": 0.4858, + "step": 6380 + }, + { + "epoch": 0.9181765890135174, + "grad_norm": 0.21893309238006559, + "learning_rate": 1.572280487326247e-05, + "loss": 0.4682, + "step": 6385 + }, + { + "epoch": 0.9188955996548749, + "grad_norm": 0.22147352269071435, + "learning_rate": 1.571662865659341e-05, + "loss": 0.4659, + "step": 6390 + }, + { + "epoch": 0.9196146102962324, + "grad_norm": 0.21802176576616122, + "learning_rate": 1.571044919900791e-05, + "loss": 0.4596, + "step": 6395 + }, + { + "epoch": 0.9203336209375899, + "grad_norm": 0.2158313716245226, + "learning_rate": 1.570426650400928e-05, + "loss": 0.4789, + "step": 6400 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.24928740250212056, + "learning_rate": 1.5698080575102662e-05, + "loss": 0.4854, + "step": 6405 + }, + { + "epoch": 0.9217716422203048, + "grad_norm": 0.22848767277634377, + "learning_rate": 1.5691891415795036e-05, + "loss": 0.488, + "step": 6410 + }, + { + "epoch": 0.9224906528616623, + "grad_norm": 0.2256690554653882, + "learning_rate": 1.5685699029595204e-05, + "loss": 0.4961, + "step": 6415 + }, + { + "epoch": 0.9232096635030198, + "grad_norm": 0.2282178032512667, + "learning_rate": 1.5679503420013802e-05, + "loss": 0.4801, + "step": 6420 + }, + { + "epoch": 0.9239286741443773, + "grad_norm": 0.2369155203213744, + "learning_rate": 1.5673304590563296e-05, + "loss": 0.4826, + "step": 6425 + }, + { + "epoch": 0.9246476847857348, + "grad_norm": 0.23112462486864008, + "learning_rate": 1.5667102544757978e-05, + "loss": 0.5034, + "step": 6430 + }, + { + "epoch": 0.9253666954270923, + "grad_norm": 0.22463678559485087, + "learning_rate": 1.566089728611396e-05, + "loss": 0.4746, + "step": 6435 + }, + { + "epoch": 0.9260857060684499, + "grad_norm": 0.22740379538540387, + "learning_rate": 1.5654688818149173e-05, + "loss": 0.4775, + "step": 6440 + }, + { + "epoch": 0.9268047167098074, + "grad_norm": 0.22671726134834985, + "learning_rate": 1.5648477144383374e-05, + "loss": 0.4722, + "step": 6445 + }, + { + "epoch": 0.9275237273511648, + "grad_norm": 0.22361203094445511, + "learning_rate": 1.5642262268338134e-05, + "loss": 0.4875, + "step": 6450 + }, + { + "epoch": 0.9282427379925223, + "grad_norm": 0.22042860812674375, + "learning_rate": 1.5636044193536838e-05, + "loss": 0.5021, + "step": 6455 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.21880500542128759, + "learning_rate": 1.5629822923504692e-05, + "loss": 0.4901, + "step": 6460 + }, + { + "epoch": 0.9296807592752373, + "grad_norm": 0.22161424076526037, + "learning_rate": 1.56235984617687e-05, + "loss": 0.476, + "step": 6465 + }, + { + "epoch": 0.9303997699165948, + "grad_norm": 0.2295143312197024, + "learning_rate": 1.5617370811857683e-05, + "loss": 0.4692, + "step": 6470 + }, + { + "epoch": 0.9311187805579523, + "grad_norm": 0.23064816205367686, + "learning_rate": 1.5611139977302278e-05, + "loss": 0.4845, + "step": 6475 + }, + { + "epoch": 0.9318377911993098, + "grad_norm": 0.2067989481827472, + "learning_rate": 1.5604905961634913e-05, + "loss": 0.4955, + "step": 6480 + }, + { + "epoch": 0.9325568018406672, + "grad_norm": 0.22781771912016957, + "learning_rate": 1.5598668768389827e-05, + "loss": 0.4752, + "step": 6485 + }, + { + "epoch": 0.9332758124820247, + "grad_norm": 0.22430664209549425, + "learning_rate": 1.5592428401103057e-05, + "loss": 0.4749, + "step": 6490 + }, + { + "epoch": 0.9339948231233822, + "grad_norm": 0.22702362842749063, + "learning_rate": 1.558618486331245e-05, + "loss": 0.4735, + "step": 6495 + }, + { + "epoch": 0.9347138337647397, + "grad_norm": 0.23188979467673745, + "learning_rate": 1.557993815855763e-05, + "loss": 0.471, + "step": 6500 + }, + { + "epoch": 0.9354328444060972, + "grad_norm": 0.2316035087138115, + "learning_rate": 1.557368829038003e-05, + "loss": 0.4914, + "step": 6505 + }, + { + "epoch": 0.9361518550474547, + "grad_norm": 0.22225177951148622, + "learning_rate": 1.5567435262322887e-05, + "loss": 0.4999, + "step": 6510 + }, + { + "epoch": 0.9368708656888122, + "grad_norm": 0.22102749529702834, + "learning_rate": 1.5561179077931204e-05, + "loss": 0.473, + "step": 6515 + }, + { + "epoch": 0.9375898763301697, + "grad_norm": 0.22225748257578634, + "learning_rate": 1.5554919740751794e-05, + "loss": 0.4871, + "step": 6520 + }, + { + "epoch": 0.9383088869715271, + "grad_norm": 0.2263304052281884, + "learning_rate": 1.554865725433324e-05, + "loss": 0.4627, + "step": 6525 + }, + { + "epoch": 0.9390278976128846, + "grad_norm": 0.2255865820044724, + "learning_rate": 1.5542391622225935e-05, + "loss": 0.4796, + "step": 6530 + }, + { + "epoch": 0.9397469082542421, + "grad_norm": 0.21548022907518974, + "learning_rate": 1.5536122847982033e-05, + "loss": 0.4794, + "step": 6535 + }, + { + "epoch": 0.9404659188955996, + "grad_norm": 0.2192863785121772, + "learning_rate": 1.552985093515548e-05, + "loss": 0.5112, + "step": 6540 + }, + { + "epoch": 0.9411849295369571, + "grad_norm": 0.23321238623231505, + "learning_rate": 1.552357588730199e-05, + "loss": 0.4774, + "step": 6545 + }, + { + "epoch": 0.9419039401783147, + "grad_norm": 0.22865602059154158, + "learning_rate": 1.5517297707979075e-05, + "loss": 0.4846, + "step": 6550 + }, + { + "epoch": 0.9426229508196722, + "grad_norm": 0.2166415062591689, + "learning_rate": 1.5511016400746e-05, + "loss": 0.4676, + "step": 6555 + }, + { + "epoch": 0.9433419614610297, + "grad_norm": 0.22574702698545204, + "learning_rate": 1.5504731969163825e-05, + "loss": 0.4897, + "step": 6560 + }, + { + "epoch": 0.9440609721023872, + "grad_norm": 0.23374801842488893, + "learning_rate": 1.5498444416795356e-05, + "loss": 0.4686, + "step": 6565 + }, + { + "epoch": 0.9447799827437446, + "grad_norm": 0.2306137475480127, + "learning_rate": 1.5492153747205193e-05, + "loss": 0.4808, + "step": 6570 + }, + { + "epoch": 0.9454989933851021, + "grad_norm": 0.21537193186412396, + "learning_rate": 1.5485859963959687e-05, + "loss": 0.4882, + "step": 6575 + }, + { + "epoch": 0.9462180040264596, + "grad_norm": 0.21738851906271006, + "learning_rate": 1.547956307062696e-05, + "loss": 0.4789, + "step": 6580 + }, + { + "epoch": 0.9469370146678171, + "grad_norm": 0.2207644500562652, + "learning_rate": 1.5473263070776896e-05, + "loss": 0.4796, + "step": 6585 + }, + { + "epoch": 0.9476560253091746, + "grad_norm": 0.21677716829427052, + "learning_rate": 1.5466959967981145e-05, + "loss": 0.4829, + "step": 6590 + }, + { + "epoch": 0.9483750359505321, + "grad_norm": 0.23097534373286382, + "learning_rate": 1.5460653765813107e-05, + "loss": 0.4812, + "step": 6595 + }, + { + "epoch": 0.9490940465918896, + "grad_norm": 0.2093786853394847, + "learning_rate": 1.5454344467847948e-05, + "loss": 0.4896, + "step": 6600 + }, + { + "epoch": 0.949813057233247, + "grad_norm": 0.21999645225575495, + "learning_rate": 1.5448032077662583e-05, + "loss": 0.4851, + "step": 6605 + }, + { + "epoch": 0.9505320678746045, + "grad_norm": 0.22477641153328398, + "learning_rate": 1.5441716598835684e-05, + "loss": 0.4951, + "step": 6610 + }, + { + "epoch": 0.951251078515962, + "grad_norm": 0.23135266269374266, + "learning_rate": 1.5435398034947667e-05, + "loss": 0.4702, + "step": 6615 + }, + { + "epoch": 0.9519700891573195, + "grad_norm": 0.22462726289244378, + "learning_rate": 1.542907638958071e-05, + "loss": 0.4744, + "step": 6620 + }, + { + "epoch": 0.952689099798677, + "grad_norm": 0.22612302893339267, + "learning_rate": 1.542275166631873e-05, + "loss": 0.4834, + "step": 6625 + }, + { + "epoch": 0.9534081104400345, + "grad_norm": 0.22340595977068603, + "learning_rate": 1.541642386874738e-05, + "loss": 0.4948, + "step": 6630 + }, + { + "epoch": 0.954127121081392, + "grad_norm": 0.21601642477661376, + "learning_rate": 1.541009300045407e-05, + "loss": 0.4858, + "step": 6635 + }, + { + "epoch": 0.9548461317227495, + "grad_norm": 0.22499946084889408, + "learning_rate": 1.5403759065027954e-05, + "loss": 0.4856, + "step": 6640 + }, + { + "epoch": 0.955565142364107, + "grad_norm": 0.2197889154098527, + "learning_rate": 1.5397422066059906e-05, + "loss": 0.4725, + "step": 6645 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.23875758339220973, + "learning_rate": 1.539108200714255e-05, + "loss": 0.4762, + "step": 6650 + }, + { + "epoch": 0.9570031636468219, + "grad_norm": 0.2227905479215225, + "learning_rate": 1.538473889187025e-05, + "loss": 0.4739, + "step": 6655 + }, + { + "epoch": 0.9577221742881795, + "grad_norm": 0.23357580858983001, + "learning_rate": 1.5378392723839086e-05, + "loss": 0.4796, + "step": 6660 + }, + { + "epoch": 0.958441184929537, + "grad_norm": 0.24057115172831314, + "learning_rate": 1.537204350664688e-05, + "loss": 0.4808, + "step": 6665 + }, + { + "epoch": 0.9591601955708945, + "grad_norm": 0.22333693699108556, + "learning_rate": 1.5365691243893186e-05, + "loss": 0.4797, + "step": 6670 + }, + { + "epoch": 0.959879206212252, + "grad_norm": 0.22164298046976957, + "learning_rate": 1.535933593917927e-05, + "loss": 0.4775, + "step": 6675 + }, + { + "epoch": 0.9605982168536095, + "grad_norm": 0.2108983044458471, + "learning_rate": 1.5352977596108138e-05, + "loss": 0.4838, + "step": 6680 + }, + { + "epoch": 0.961317227494967, + "grad_norm": 0.22778761172616302, + "learning_rate": 1.5346616218284514e-05, + "loss": 0.4695, + "step": 6685 + }, + { + "epoch": 0.9620362381363244, + "grad_norm": 0.21909715873778327, + "learning_rate": 1.5340251809314833e-05, + "loss": 0.4734, + "step": 6690 + }, + { + "epoch": 0.9627552487776819, + "grad_norm": 0.2148002346061404, + "learning_rate": 1.533388437280727e-05, + "loss": 0.4568, + "step": 6695 + }, + { + "epoch": 0.9634742594190394, + "grad_norm": 0.22905164253993948, + "learning_rate": 1.5327513912371684e-05, + "loss": 0.4878, + "step": 6700 + }, + { + "epoch": 0.9641932700603969, + "grad_norm": 0.2185782489667153, + "learning_rate": 1.532114043161968e-05, + "loss": 0.4834, + "step": 6705 + }, + { + "epoch": 0.9649122807017544, + "grad_norm": 0.21833720077174126, + "learning_rate": 1.531476393416456e-05, + "loss": 0.4749, + "step": 6710 + }, + { + "epoch": 0.9656312913431119, + "grad_norm": 0.2279245541018691, + "learning_rate": 1.530838442362134e-05, + "loss": 0.4982, + "step": 6715 + }, + { + "epoch": 0.9663503019844694, + "grad_norm": 0.2293060476544168, + "learning_rate": 1.5302001903606735e-05, + "loss": 0.4741, + "step": 6720 + }, + { + "epoch": 0.9670693126258268, + "grad_norm": 0.22032989655105037, + "learning_rate": 1.5295616377739178e-05, + "loss": 0.4726, + "step": 6725 + }, + { + "epoch": 0.9677883232671843, + "grad_norm": 0.2295613322786625, + "learning_rate": 1.5289227849638803e-05, + "loss": 0.4769, + "step": 6730 + }, + { + "epoch": 0.9685073339085418, + "grad_norm": 0.2247777045129754, + "learning_rate": 1.5282836322927446e-05, + "loss": 0.4835, + "step": 6735 + }, + { + "epoch": 0.9692263445498993, + "grad_norm": 0.225786697144287, + "learning_rate": 1.527644180122864e-05, + "loss": 0.4929, + "step": 6740 + }, + { + "epoch": 0.9699453551912568, + "grad_norm": 0.21678269771980435, + "learning_rate": 1.527004428816762e-05, + "loss": 0.4798, + "step": 6745 + }, + { + "epoch": 0.9706643658326143, + "grad_norm": 0.22366425391870273, + "learning_rate": 1.5263643787371313e-05, + "loss": 0.4809, + "step": 6750 + }, + { + "epoch": 0.9713833764739718, + "grad_norm": 0.22144782348953934, + "learning_rate": 1.5257240302468343e-05, + "loss": 0.4796, + "step": 6755 + }, + { + "epoch": 0.9721023871153293, + "grad_norm": 0.2156443814782539, + "learning_rate": 1.5250833837089024e-05, + "loss": 0.4684, + "step": 6760 + }, + { + "epoch": 0.9728213977566867, + "grad_norm": 0.22117236753267636, + "learning_rate": 1.5244424394865359e-05, + "loss": 0.4832, + "step": 6765 + }, + { + "epoch": 0.9735404083980443, + "grad_norm": 0.22364864691530395, + "learning_rate": 1.523801197943104e-05, + "loss": 0.4863, + "step": 6770 + }, + { + "epoch": 0.9742594190394018, + "grad_norm": 0.2353866035164657, + "learning_rate": 1.5231596594421443e-05, + "loss": 0.463, + "step": 6775 + }, + { + "epoch": 0.9749784296807593, + "grad_norm": 0.23914032451055825, + "learning_rate": 1.5225178243473633e-05, + "loss": 0.4799, + "step": 6780 + }, + { + "epoch": 0.9756974403221168, + "grad_norm": 0.22286084045601381, + "learning_rate": 1.521875693022635e-05, + "loss": 0.4845, + "step": 6785 + }, + { + "epoch": 0.9764164509634743, + "grad_norm": 0.23539655426452574, + "learning_rate": 1.5212332658320016e-05, + "loss": 0.484, + "step": 6790 + }, + { + "epoch": 0.9771354616048318, + "grad_norm": 0.21981752748550457, + "learning_rate": 1.5205905431396728e-05, + "loss": 0.4751, + "step": 6795 + }, + { + "epoch": 0.9778544722461893, + "grad_norm": 0.2201025870941446, + "learning_rate": 1.5199475253100264e-05, + "loss": 0.4721, + "step": 6800 + }, + { + "epoch": 0.9785734828875468, + "grad_norm": 0.2185983176885855, + "learning_rate": 1.5193042127076072e-05, + "loss": 0.4698, + "step": 6805 + }, + { + "epoch": 0.9792924935289042, + "grad_norm": 0.2209307444365703, + "learning_rate": 1.518660605697127e-05, + "loss": 0.4816, + "step": 6810 + }, + { + "epoch": 0.9800115041702617, + "grad_norm": 0.23385081037322303, + "learning_rate": 1.518016704643464e-05, + "loss": 0.4632, + "step": 6815 + }, + { + "epoch": 0.9807305148116192, + "grad_norm": 0.21479626312740918, + "learning_rate": 1.5173725099116645e-05, + "loss": 0.4665, + "step": 6820 + }, + { + "epoch": 0.9814495254529767, + "grad_norm": 0.22238392229375717, + "learning_rate": 1.51672802186694e-05, + "loss": 0.4719, + "step": 6825 + }, + { + "epoch": 0.9821685360943342, + "grad_norm": 0.23404902107369124, + "learning_rate": 1.5160832408746692e-05, + "loss": 0.5035, + "step": 6830 + }, + { + "epoch": 0.9828875467356917, + "grad_norm": 0.21819905193403144, + "learning_rate": 1.515438167300396e-05, + "loss": 0.4794, + "step": 6835 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.2227124367940369, + "learning_rate": 1.5147928015098309e-05, + "loss": 0.4683, + "step": 6840 + }, + { + "epoch": 0.9843255680184066, + "grad_norm": 0.23175024743121764, + "learning_rate": 1.5141471438688497e-05, + "loss": 0.5067, + "step": 6845 + }, + { + "epoch": 0.9850445786597641, + "grad_norm": 0.23035042801996858, + "learning_rate": 1.5135011947434937e-05, + "loss": 0.4856, + "step": 6850 + }, + { + "epoch": 0.9857635893011216, + "grad_norm": 0.22697895961676684, + "learning_rate": 1.5128549544999694e-05, + "loss": 0.482, + "step": 6855 + }, + { + "epoch": 0.9864825999424791, + "grad_norm": 0.21609441944646846, + "learning_rate": 1.512208423504649e-05, + "loss": 0.4746, + "step": 6860 + }, + { + "epoch": 0.9872016105838366, + "grad_norm": 0.22593709410358026, + "learning_rate": 1.5115616021240685e-05, + "loss": 0.4933, + "step": 6865 + }, + { + "epoch": 0.9879206212251941, + "grad_norm": 0.22718404114173152, + "learning_rate": 1.510914490724929e-05, + "loss": 0.4751, + "step": 6870 + }, + { + "epoch": 0.9886396318665516, + "grad_norm": 0.22797193686181583, + "learning_rate": 1.5102670896740957e-05, + "loss": 0.4747, + "step": 6875 + }, + { + "epoch": 0.9893586425079092, + "grad_norm": 0.22768531436338516, + "learning_rate": 1.509619399338599e-05, + "loss": 0.4706, + "step": 6880 + }, + { + "epoch": 0.9900776531492667, + "grad_norm": 0.2305537352512049, + "learning_rate": 1.5089714200856325e-05, + "loss": 0.497, + "step": 6885 + }, + { + "epoch": 0.9907966637906241, + "grad_norm": 0.2366616945336574, + "learning_rate": 1.5083231522825537e-05, + "loss": 0.4912, + "step": 6890 + }, + { + "epoch": 0.9915156744319816, + "grad_norm": 0.2217756285681072, + "learning_rate": 1.5076745962968833e-05, + "loss": 0.4676, + "step": 6895 + }, + { + "epoch": 0.9922346850733391, + "grad_norm": 0.22771473147282423, + "learning_rate": 1.5070257524963063e-05, + "loss": 0.4756, + "step": 6900 + }, + { + "epoch": 0.9929536957146966, + "grad_norm": 0.23294760309698267, + "learning_rate": 1.5063766212486704e-05, + "loss": 0.4928, + "step": 6905 + }, + { + "epoch": 0.9936727063560541, + "grad_norm": 0.21411969868887565, + "learning_rate": 1.5057272029219857e-05, + "loss": 0.4753, + "step": 6910 + }, + { + "epoch": 0.9943917169974116, + "grad_norm": 0.2212546197891626, + "learning_rate": 1.5050774978844263e-05, + "loss": 0.478, + "step": 6915 + }, + { + "epoch": 0.9951107276387691, + "grad_norm": 0.22268275460533818, + "learning_rate": 1.5044275065043273e-05, + "loss": 0.4833, + "step": 6920 + }, + { + "epoch": 0.9958297382801266, + "grad_norm": 0.2449174453256786, + "learning_rate": 1.503777229150188e-05, + "loss": 0.4853, + "step": 6925 + }, + { + "epoch": 0.996548748921484, + "grad_norm": 0.21985579467908328, + "learning_rate": 1.5031266661906678e-05, + "loss": 0.4657, + "step": 6930 + }, + { + "epoch": 0.9972677595628415, + "grad_norm": 0.22165507711757962, + "learning_rate": 1.5024758179945896e-05, + "loss": 0.4934, + "step": 6935 + }, + { + "epoch": 0.997986770204199, + "grad_norm": 0.21902956888212533, + "learning_rate": 1.501824684930937e-05, + "loss": 0.4816, + "step": 6940 + }, + { + "epoch": 0.9987057808455565, + "grad_norm": 0.2313696800475175, + "learning_rate": 1.501173267368856e-05, + "loss": 0.4866, + "step": 6945 + }, + { + "epoch": 0.999424791486914, + "grad_norm": 0.23660863539279678, + "learning_rate": 1.5005215656776531e-05, + "loss": 0.4649, + "step": 6950 + }, + { + "epoch": 1.0, + "eval_loss": 0.45176830887794495, + "eval_runtime": 0.6251, + "eval_samples_per_second": 39.991, + "eval_steps_per_second": 1.6, + "step": 6954 + }, + { + "epoch": 1.0001438021282716, + "grad_norm": 0.2609984891577565, + "learning_rate": 1.4998695802267965e-05, + "loss": 0.4361, + "step": 6955 + }, + { + "epoch": 1.000862812769629, + "grad_norm": 0.2516451509161594, + "learning_rate": 1.4992173113859143e-05, + "loss": 0.427, + "step": 6960 + }, + { + "epoch": 1.0015818234109866, + "grad_norm": 0.24733817071518213, + "learning_rate": 1.4985647595247965e-05, + "loss": 0.4212, + "step": 6965 + }, + { + "epoch": 1.002300834052344, + "grad_norm": 0.24389227857970142, + "learning_rate": 1.4979119250133929e-05, + "loss": 0.4249, + "step": 6970 + }, + { + "epoch": 1.0030198446937015, + "grad_norm": 0.24506075363170043, + "learning_rate": 1.4972588082218136e-05, + "loss": 0.4265, + "step": 6975 + }, + { + "epoch": 1.003738855335059, + "grad_norm": 0.24593725853045245, + "learning_rate": 1.4966054095203284e-05, + "loss": 0.4166, + "step": 6980 + }, + { + "epoch": 1.0044578659764165, + "grad_norm": 0.23807359079460177, + "learning_rate": 1.4959517292793677e-05, + "loss": 0.423, + "step": 6985 + }, + { + "epoch": 1.005176876617774, + "grad_norm": 0.24105601006448701, + "learning_rate": 1.4952977678695211e-05, + "loss": 0.4143, + "step": 6990 + }, + { + "epoch": 1.0058958872591315, + "grad_norm": 0.2447137201531492, + "learning_rate": 1.4946435256615373e-05, + "loss": 0.4199, + "step": 6995 + }, + { + "epoch": 1.006614897900489, + "grad_norm": 0.23351901468885872, + "learning_rate": 1.4939890030263244e-05, + "loss": 0.4224, + "step": 7000 + }, + { + "epoch": 1.0073339085418465, + "grad_norm": 0.24343901453464484, + "learning_rate": 1.4933342003349502e-05, + "loss": 0.4256, + "step": 7005 + }, + { + "epoch": 1.008052919183204, + "grad_norm": 0.24632246255177054, + "learning_rate": 1.49267911795864e-05, + "loss": 0.4083, + "step": 7010 + }, + { + "epoch": 1.0087719298245614, + "grad_norm": 0.24576841044321335, + "learning_rate": 1.4920237562687784e-05, + "loss": 0.4139, + "step": 7015 + }, + { + "epoch": 1.009490940465919, + "grad_norm": 0.2285242809366901, + "learning_rate": 1.4913681156369083e-05, + "loss": 0.4254, + "step": 7020 + }, + { + "epoch": 1.0102099511072764, + "grad_norm": 0.24698769719760663, + "learning_rate": 1.490712196434731e-05, + "loss": 0.4201, + "step": 7025 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.24177562139259248, + "learning_rate": 1.4900559990341048e-05, + "loss": 0.405, + "step": 7030 + }, + { + "epoch": 1.0116479723899914, + "grad_norm": 0.2284256513112865, + "learning_rate": 1.489399523807047e-05, + "loss": 0.4153, + "step": 7035 + }, + { + "epoch": 1.0123669830313489, + "grad_norm": 0.260722596094778, + "learning_rate": 1.488742771125731e-05, + "loss": 0.4192, + "step": 7040 + }, + { + "epoch": 1.0130859936727064, + "grad_norm": 0.2318339583881091, + "learning_rate": 1.4880857413624888e-05, + "loss": 0.4311, + "step": 7045 + }, + { + "epoch": 1.0138050043140638, + "grad_norm": 0.2556005671407621, + "learning_rate": 1.4874284348898089e-05, + "loss": 0.4289, + "step": 7050 + }, + { + "epoch": 1.0145240149554213, + "grad_norm": 0.23971405460324446, + "learning_rate": 1.4867708520803366e-05, + "loss": 0.4112, + "step": 7055 + }, + { + "epoch": 1.0152430255967788, + "grad_norm": 0.2595593731931635, + "learning_rate": 1.4861129933068738e-05, + "loss": 0.4248, + "step": 7060 + }, + { + "epoch": 1.0159620362381363, + "grad_norm": 0.23810363881907462, + "learning_rate": 1.4854548589423792e-05, + "loss": 0.4102, + "step": 7065 + }, + { + "epoch": 1.0166810468794938, + "grad_norm": 0.2534738889359647, + "learning_rate": 1.4847964493599674e-05, + "loss": 0.4294, + "step": 7070 + }, + { + "epoch": 1.0174000575208513, + "grad_norm": 0.23848085398163255, + "learning_rate": 1.4841377649329095e-05, + "loss": 0.4266, + "step": 7075 + }, + { + "epoch": 1.0181190681622088, + "grad_norm": 0.2455758594331574, + "learning_rate": 1.4834788060346315e-05, + "loss": 0.4158, + "step": 7080 + }, + { + "epoch": 1.0188380788035662, + "grad_norm": 0.2499471679265644, + "learning_rate": 1.4828195730387162e-05, + "loss": 0.4242, + "step": 7085 + }, + { + "epoch": 1.0195570894449237, + "grad_norm": 0.24622402422531225, + "learning_rate": 1.4821600663189009e-05, + "loss": 0.4097, + "step": 7090 + }, + { + "epoch": 1.0202761000862812, + "grad_norm": 0.255688979450824, + "learning_rate": 1.4815002862490784e-05, + "loss": 0.4359, + "step": 7095 + }, + { + "epoch": 1.0209951107276387, + "grad_norm": 0.2359064077389048, + "learning_rate": 1.4808402332032966e-05, + "loss": 0.4215, + "step": 7100 + }, + { + "epoch": 1.0217141213689962, + "grad_norm": 0.26117531866755295, + "learning_rate": 1.4801799075557579e-05, + "loss": 0.4281, + "step": 7105 + }, + { + "epoch": 1.0224331320103537, + "grad_norm": 0.2351672925246976, + "learning_rate": 1.4795193096808191e-05, + "loss": 0.4252, + "step": 7110 + }, + { + "epoch": 1.0231521426517112, + "grad_norm": 0.24530936090046365, + "learning_rate": 1.4788584399529919e-05, + "loss": 0.4129, + "step": 7115 + }, + { + "epoch": 1.0238711532930687, + "grad_norm": 0.23829577092670978, + "learning_rate": 1.4781972987469421e-05, + "loss": 0.4134, + "step": 7120 + }, + { + "epoch": 1.0245901639344261, + "grad_norm": 0.2373278401919916, + "learning_rate": 1.4775358864374884e-05, + "loss": 0.4262, + "step": 7125 + }, + { + "epoch": 1.0253091745757836, + "grad_norm": 0.2386215543659765, + "learning_rate": 1.4768742033996045e-05, + "loss": 0.4292, + "step": 7130 + }, + { + "epoch": 1.0260281852171411, + "grad_norm": 0.245165207310462, + "learning_rate": 1.4762122500084163e-05, + "loss": 0.4111, + "step": 7135 + }, + { + "epoch": 1.0267471958584986, + "grad_norm": 0.24704393239415223, + "learning_rate": 1.4755500266392044e-05, + "loss": 0.4282, + "step": 7140 + }, + { + "epoch": 1.027466206499856, + "grad_norm": 0.24281873003447055, + "learning_rate": 1.4748875336674016e-05, + "loss": 0.4265, + "step": 7145 + }, + { + "epoch": 1.0281852171412136, + "grad_norm": 0.2469815238178739, + "learning_rate": 1.474224771468593e-05, + "loss": 0.4258, + "step": 7150 + }, + { + "epoch": 1.0289042277825713, + "grad_norm": 0.23933515470857958, + "learning_rate": 1.4735617404185183e-05, + "loss": 0.414, + "step": 7155 + }, + { + "epoch": 1.0296232384239288, + "grad_norm": 0.2480531863542624, + "learning_rate": 1.4728984408930668e-05, + "loss": 0.4155, + "step": 7160 + }, + { + "epoch": 1.0303422490652863, + "grad_norm": 0.22928500215568193, + "learning_rate": 1.4722348732682824e-05, + "loss": 0.41, + "step": 7165 + }, + { + "epoch": 1.0310612597066438, + "grad_norm": 0.23058079356797653, + "learning_rate": 1.4715710379203602e-05, + "loss": 0.4222, + "step": 7170 + }, + { + "epoch": 1.0317802703480012, + "grad_norm": 0.23989774674583075, + "learning_rate": 1.4709069352256467e-05, + "loss": 0.3973, + "step": 7175 + }, + { + "epoch": 1.0324992809893587, + "grad_norm": 0.23468182850625818, + "learning_rate": 1.4702425655606403e-05, + "loss": 0.4261, + "step": 7180 + }, + { + "epoch": 1.0332182916307162, + "grad_norm": 0.24877747571837835, + "learning_rate": 1.4695779293019908e-05, + "loss": 0.4382, + "step": 7185 + }, + { + "epoch": 1.0339373022720737, + "grad_norm": 0.23754062310392648, + "learning_rate": 1.4689130268264989e-05, + "loss": 0.4272, + "step": 7190 + }, + { + "epoch": 1.0346563129134312, + "grad_norm": 0.24175160857979996, + "learning_rate": 1.4682478585111165e-05, + "loss": 0.417, + "step": 7195 + }, + { + "epoch": 1.0353753235547887, + "grad_norm": 0.7431584047698181, + "learning_rate": 1.467582424732946e-05, + "loss": 0.4253, + "step": 7200 + }, + { + "epoch": 1.0360943341961462, + "grad_norm": 0.24693534297845374, + "learning_rate": 1.4669167258692407e-05, + "loss": 0.4176, + "step": 7205 + }, + { + "epoch": 1.0368133448375036, + "grad_norm": 0.23950832330092756, + "learning_rate": 1.4662507622974037e-05, + "loss": 0.4123, + "step": 7210 + }, + { + "epoch": 1.0375323554788611, + "grad_norm": 0.2442138232459913, + "learning_rate": 1.4655845343949877e-05, + "loss": 0.4211, + "step": 7215 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.2470450242567347, + "learning_rate": 1.4649180425396972e-05, + "loss": 0.4199, + "step": 7220 + }, + { + "epoch": 1.038970376761576, + "grad_norm": 0.25475772569037297, + "learning_rate": 1.4642512871093838e-05, + "loss": 0.4228, + "step": 7225 + }, + { + "epoch": 1.0396893874029336, + "grad_norm": 0.23975823704149188, + "learning_rate": 1.4635842684820506e-05, + "loss": 0.4335, + "step": 7230 + }, + { + "epoch": 1.040408398044291, + "grad_norm": 0.2434879248343545, + "learning_rate": 1.462916987035849e-05, + "loss": 0.4183, + "step": 7235 + }, + { + "epoch": 1.0411274086856486, + "grad_norm": 0.23917965588817724, + "learning_rate": 1.462249443149079e-05, + "loss": 0.4338, + "step": 7240 + }, + { + "epoch": 1.041846419327006, + "grad_norm": 0.2413379742131455, + "learning_rate": 1.4615816372001904e-05, + "loss": 0.4226, + "step": 7245 + }, + { + "epoch": 1.0425654299683635, + "grad_norm": 0.24650543649743945, + "learning_rate": 1.4609135695677805e-05, + "loss": 0.4268, + "step": 7250 + }, + { + "epoch": 1.043284440609721, + "grad_norm": 0.2369181897502444, + "learning_rate": 1.4602452406305962e-05, + "loss": 0.4108, + "step": 7255 + }, + { + "epoch": 1.0440034512510785, + "grad_norm": 0.25203791980105866, + "learning_rate": 1.4595766507675313e-05, + "loss": 0.4186, + "step": 7260 + }, + { + "epoch": 1.044722461892436, + "grad_norm": 0.2870260934077793, + "learning_rate": 1.4589078003576279e-05, + "loss": 0.4158, + "step": 7265 + }, + { + "epoch": 1.0454414725337935, + "grad_norm": 0.24269081963109623, + "learning_rate": 1.4582386897800766e-05, + "loss": 0.4172, + "step": 7270 + }, + { + "epoch": 1.046160483175151, + "grad_norm": 0.25226442413606864, + "learning_rate": 1.4575693194142146e-05, + "loss": 0.429, + "step": 7275 + }, + { + "epoch": 1.0468794938165085, + "grad_norm": 0.2425898262686411, + "learning_rate": 1.4568996896395264e-05, + "loss": 0.4266, + "step": 7280 + }, + { + "epoch": 1.047598504457866, + "grad_norm": 0.23733252857410786, + "learning_rate": 1.4562298008356441e-05, + "loss": 0.4147, + "step": 7285 + }, + { + "epoch": 1.0483175150992234, + "grad_norm": 0.24222783715783344, + "learning_rate": 1.4555596533823466e-05, + "loss": 0.4325, + "step": 7290 + }, + { + "epoch": 1.049036525740581, + "grad_norm": 0.24329552527037612, + "learning_rate": 1.4548892476595587e-05, + "loss": 0.4243, + "step": 7295 + }, + { + "epoch": 1.0497555363819384, + "grad_norm": 0.24202286281496768, + "learning_rate": 1.4542185840473523e-05, + "loss": 0.4178, + "step": 7300 + }, + { + "epoch": 1.050474547023296, + "grad_norm": 0.27213465537018944, + "learning_rate": 1.4535476629259454e-05, + "loss": 0.4237, + "step": 7305 + }, + { + "epoch": 1.0511935576646534, + "grad_norm": 0.2495004385772135, + "learning_rate": 1.4528764846757018e-05, + "loss": 0.423, + "step": 7310 + }, + { + "epoch": 1.0519125683060109, + "grad_norm": 0.23320958886525425, + "learning_rate": 1.4522050496771314e-05, + "loss": 0.4282, + "step": 7315 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.24698409693908785, + "learning_rate": 1.4515333583108896e-05, + "loss": 0.4256, + "step": 7320 + }, + { + "epoch": 1.0533505895887258, + "grad_norm": 0.2517814174177234, + "learning_rate": 1.4508614109577766e-05, + "loss": 0.4267, + "step": 7325 + }, + { + "epoch": 1.0540696002300833, + "grad_norm": 0.2483119203371741, + "learning_rate": 1.4501892079987378e-05, + "loss": 0.4152, + "step": 7330 + }, + { + "epoch": 1.0547886108714408, + "grad_norm": 0.23969019890865684, + "learning_rate": 1.4495167498148648e-05, + "loss": 0.4156, + "step": 7335 + }, + { + "epoch": 1.0555076215127983, + "grad_norm": 0.2449967426733617, + "learning_rate": 1.4488440367873922e-05, + "loss": 0.4277, + "step": 7340 + }, + { + "epoch": 1.0562266321541558, + "grad_norm": 0.2387446871052847, + "learning_rate": 1.4481710692977e-05, + "loss": 0.4093, + "step": 7345 + }, + { + "epoch": 1.0569456427955133, + "grad_norm": 0.258326074049477, + "learning_rate": 1.4474978477273124e-05, + "loss": 0.4226, + "step": 7350 + }, + { + "epoch": 1.0576646534368708, + "grad_norm": 0.2524216813764755, + "learning_rate": 1.4468243724578977e-05, + "loss": 0.4385, + "step": 7355 + }, + { + "epoch": 1.0583836640782283, + "grad_norm": 0.24424360507027318, + "learning_rate": 1.4461506438712668e-05, + "loss": 0.4321, + "step": 7360 + }, + { + "epoch": 1.0591026747195857, + "grad_norm": 0.2547646496981046, + "learning_rate": 1.4454766623493766e-05, + "loss": 0.4145, + "step": 7365 + }, + { + "epoch": 1.0598216853609435, + "grad_norm": 0.24839584461899775, + "learning_rate": 1.4448024282743252e-05, + "loss": 0.4205, + "step": 7370 + }, + { + "epoch": 1.0605406960023007, + "grad_norm": 0.26600263394387463, + "learning_rate": 1.444127942028355e-05, + "loss": 0.4193, + "step": 7375 + }, + { + "epoch": 1.0612597066436584, + "grad_norm": 0.24940257938808494, + "learning_rate": 1.443453203993851e-05, + "loss": 0.4392, + "step": 7380 + }, + { + "epoch": 1.061978717285016, + "grad_norm": 0.2335680813250201, + "learning_rate": 1.4427782145533411e-05, + "loss": 0.4258, + "step": 7385 + }, + { + "epoch": 1.0626977279263734, + "grad_norm": 0.24488926419935933, + "learning_rate": 1.4421029740894956e-05, + "loss": 0.4304, + "step": 7390 + }, + { + "epoch": 1.063416738567731, + "grad_norm": 0.23669795423595294, + "learning_rate": 1.4414274829851271e-05, + "loss": 0.416, + "step": 7395 + }, + { + "epoch": 1.0641357492090884, + "grad_norm": 0.25050783762739626, + "learning_rate": 1.4407517416231906e-05, + "loss": 0.4153, + "step": 7400 + }, + { + "epoch": 1.0648547598504459, + "grad_norm": 0.24802843987771775, + "learning_rate": 1.4400757503867828e-05, + "loss": 0.4158, + "step": 7405 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.24647940912208477, + "learning_rate": 1.4393995096591415e-05, + "loss": 0.427, + "step": 7410 + }, + { + "epoch": 1.0662927811331608, + "grad_norm": 0.2520899190375955, + "learning_rate": 1.4387230198236473e-05, + "loss": 0.4063, + "step": 7415 + }, + { + "epoch": 1.0670117917745183, + "grad_norm": 0.24659949302618547, + "learning_rate": 1.4380462812638205e-05, + "loss": 0.4252, + "step": 7420 + }, + { + "epoch": 1.0677308024158758, + "grad_norm": 0.23804217507019756, + "learning_rate": 1.437369294363323e-05, + "loss": 0.4279, + "step": 7425 + }, + { + "epoch": 1.0684498130572333, + "grad_norm": 0.24609340691898351, + "learning_rate": 1.4366920595059584e-05, + "loss": 0.4413, + "step": 7430 + }, + { + "epoch": 1.0691688236985908, + "grad_norm": 0.2706893944507149, + "learning_rate": 1.436014577075669e-05, + "loss": 0.4235, + "step": 7435 + }, + { + "epoch": 1.0698878343399483, + "grad_norm": 0.2444436725172846, + "learning_rate": 1.4353368474565392e-05, + "loss": 0.4264, + "step": 7440 + }, + { + "epoch": 1.0706068449813058, + "grad_norm": 0.257931501325315, + "learning_rate": 1.4346588710327926e-05, + "loss": 0.4154, + "step": 7445 + }, + { + "epoch": 1.0713258556226632, + "grad_norm": 0.23427955361087918, + "learning_rate": 1.4339806481887934e-05, + "loss": 0.4118, + "step": 7450 + }, + { + "epoch": 1.0720448662640207, + "grad_norm": 0.24248707511932324, + "learning_rate": 1.4333021793090444e-05, + "loss": 0.4159, + "step": 7455 + }, + { + "epoch": 1.0727638769053782, + "grad_norm": 0.2506867628026418, + "learning_rate": 1.4326234647781887e-05, + "loss": 0.4229, + "step": 7460 + }, + { + "epoch": 1.0734828875467357, + "grad_norm": 0.2397145327599731, + "learning_rate": 1.4319445049810088e-05, + "loss": 0.4176, + "step": 7465 + }, + { + "epoch": 1.0742018981880932, + "grad_norm": 0.24258713393340053, + "learning_rate": 1.431265300302426e-05, + "loss": 0.4271, + "step": 7470 + }, + { + "epoch": 1.0749209088294507, + "grad_norm": 0.2551377420219885, + "learning_rate": 1.4305858511275004e-05, + "loss": 0.4188, + "step": 7475 + }, + { + "epoch": 1.0756399194708082, + "grad_norm": 0.24540457752189151, + "learning_rate": 1.4299061578414303e-05, + "loss": 0.4244, + "step": 7480 + }, + { + "epoch": 1.0763589301121657, + "grad_norm": 0.25298614154712434, + "learning_rate": 1.4292262208295534e-05, + "loss": 0.4296, + "step": 7485 + }, + { + "epoch": 1.0770779407535231, + "grad_norm": 0.2523828578504143, + "learning_rate": 1.4285460404773442e-05, + "loss": 0.4225, + "step": 7490 + }, + { + "epoch": 1.0777969513948806, + "grad_norm": 0.2579441208781946, + "learning_rate": 1.4278656171704165e-05, + "loss": 0.4258, + "step": 7495 + }, + { + "epoch": 1.0785159620362381, + "grad_norm": 0.24706189517949953, + "learning_rate": 1.4271849512945218e-05, + "loss": 0.423, + "step": 7500 + }, + { + "epoch": 1.0792349726775956, + "grad_norm": 0.2410591304489331, + "learning_rate": 1.426504043235547e-05, + "loss": 0.4194, + "step": 7505 + }, + { + "epoch": 1.079953983318953, + "grad_norm": 0.2474535942584965, + "learning_rate": 1.4258228933795194e-05, + "loss": 0.4322, + "step": 7510 + }, + { + "epoch": 1.0806729939603106, + "grad_norm": 0.2670023141239513, + "learning_rate": 1.4251415021126015e-05, + "loss": 0.4187, + "step": 7515 + }, + { + "epoch": 1.081392004601668, + "grad_norm": 0.24125451781166807, + "learning_rate": 1.4244598698210927e-05, + "loss": 0.4195, + "step": 7520 + }, + { + "epoch": 1.0821110152430256, + "grad_norm": 0.2541324517150731, + "learning_rate": 1.4237779968914294e-05, + "loss": 0.43, + "step": 7525 + }, + { + "epoch": 1.082830025884383, + "grad_norm": 0.24984127697393865, + "learning_rate": 1.4230958837101847e-05, + "loss": 0.4303, + "step": 7530 + }, + { + "epoch": 1.0835490365257405, + "grad_norm": 0.23895121972780423, + "learning_rate": 1.4224135306640674e-05, + "loss": 0.4256, + "step": 7535 + }, + { + "epoch": 1.084268047167098, + "grad_norm": 0.22815843129758234, + "learning_rate": 1.4217309381399227e-05, + "loss": 0.4165, + "step": 7540 + }, + { + "epoch": 1.0849870578084555, + "grad_norm": 0.24987427549234392, + "learning_rate": 1.4210481065247312e-05, + "loss": 0.4062, + "step": 7545 + }, + { + "epoch": 1.085706068449813, + "grad_norm": 0.2440521322641604, + "learning_rate": 1.4203650362056094e-05, + "loss": 0.4218, + "step": 7550 + }, + { + "epoch": 1.0864250790911705, + "grad_norm": 0.24695188285324185, + "learning_rate": 1.4196817275698085e-05, + "loss": 0.4327, + "step": 7555 + }, + { + "epoch": 1.087144089732528, + "grad_norm": 0.24914710534389795, + "learning_rate": 1.4189981810047155e-05, + "loss": 0.4136, + "step": 7560 + }, + { + "epoch": 1.0878631003738854, + "grad_norm": 0.24358389545644626, + "learning_rate": 1.4183143968978523e-05, + "loss": 0.4264, + "step": 7565 + }, + { + "epoch": 1.088582111015243, + "grad_norm": 0.2459376500553283, + "learning_rate": 1.4176303756368753e-05, + "loss": 0.4148, + "step": 7570 + }, + { + "epoch": 1.0893011216566004, + "grad_norm": 0.26083867482554024, + "learning_rate": 1.4169461176095745e-05, + "loss": 0.4351, + "step": 7575 + }, + { + "epoch": 1.090020132297958, + "grad_norm": 0.24715188774379035, + "learning_rate": 1.4162616232038754e-05, + "loss": 0.4199, + "step": 7580 + }, + { + "epoch": 1.0907391429393154, + "grad_norm": 0.24491240798280314, + "learning_rate": 1.4155768928078371e-05, + "loss": 0.418, + "step": 7585 + }, + { + "epoch": 1.0914581535806729, + "grad_norm": 0.25181985639995874, + "learning_rate": 1.4148919268096519e-05, + "loss": 0.4232, + "step": 7590 + }, + { + "epoch": 1.0921771642220306, + "grad_norm": 0.24005994198180422, + "learning_rate": 1.4142067255976466e-05, + "loss": 0.4309, + "step": 7595 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.2447397337880326, + "learning_rate": 1.413521289560281e-05, + "loss": 0.4143, + "step": 7600 + }, + { + "epoch": 1.0936151855047456, + "grad_norm": 0.2425734376554363, + "learning_rate": 1.4128356190861471e-05, + "loss": 0.4184, + "step": 7605 + }, + { + "epoch": 1.094334196146103, + "grad_norm": 0.2530970688941995, + "learning_rate": 1.412149714563972e-05, + "loss": 0.4127, + "step": 7610 + }, + { + "epoch": 1.0950532067874605, + "grad_norm": 0.2448959328514985, + "learning_rate": 1.411463576382613e-05, + "loss": 0.4205, + "step": 7615 + }, + { + "epoch": 1.095772217428818, + "grad_norm": 0.24869817141051245, + "learning_rate": 1.4107772049310615e-05, + "loss": 0.4193, + "step": 7620 + }, + { + "epoch": 1.0964912280701755, + "grad_norm": 0.2717766488293654, + "learning_rate": 1.4100906005984404e-05, + "loss": 0.4325, + "step": 7625 + }, + { + "epoch": 1.097210238711533, + "grad_norm": 0.2539575468369953, + "learning_rate": 1.4094037637740048e-05, + "loss": 0.4185, + "step": 7630 + }, + { + "epoch": 1.0979292493528905, + "grad_norm": 0.25130113289646006, + "learning_rate": 1.408716694847142e-05, + "loss": 0.4169, + "step": 7635 + }, + { + "epoch": 1.098648259994248, + "grad_norm": 0.24884985415016497, + "learning_rate": 1.4080293942073704e-05, + "loss": 0.42, + "step": 7640 + }, + { + "epoch": 1.0993672706356055, + "grad_norm": 0.24074102770601502, + "learning_rate": 1.4073418622443402e-05, + "loss": 0.4127, + "step": 7645 + }, + { + "epoch": 1.100086281276963, + "grad_norm": 0.25304182275332077, + "learning_rate": 1.4066540993478321e-05, + "loss": 0.4241, + "step": 7650 + }, + { + "epoch": 1.1008052919183204, + "grad_norm": 0.29644282230759933, + "learning_rate": 1.405966105907758e-05, + "loss": 0.4305, + "step": 7655 + }, + { + "epoch": 1.101524302559678, + "grad_norm": 0.2436353807120827, + "learning_rate": 1.4052778823141609e-05, + "loss": 0.416, + "step": 7660 + }, + { + "epoch": 1.1022433132010354, + "grad_norm": 0.24484423851073356, + "learning_rate": 1.4045894289572142e-05, + "loss": 0.4346, + "step": 7665 + }, + { + "epoch": 1.102962323842393, + "grad_norm": 0.24391951324397565, + "learning_rate": 1.4039007462272207e-05, + "loss": 0.4121, + "step": 7670 + }, + { + "epoch": 1.1036813344837504, + "grad_norm": 0.24224843292543935, + "learning_rate": 1.4032118345146141e-05, + "loss": 0.4423, + "step": 7675 + }, + { + "epoch": 1.1044003451251079, + "grad_norm": 0.24151880011846152, + "learning_rate": 1.4025226942099579e-05, + "loss": 0.4315, + "step": 7680 + }, + { + "epoch": 1.1051193557664654, + "grad_norm": 0.24674435136342923, + "learning_rate": 1.4018333257039449e-05, + "loss": 0.4258, + "step": 7685 + }, + { + "epoch": 1.1058383664078228, + "grad_norm": 0.23572907486634379, + "learning_rate": 1.4011437293873975e-05, + "loss": 0.4065, + "step": 7690 + }, + { + "epoch": 1.1065573770491803, + "grad_norm": 0.2563816055424987, + "learning_rate": 1.4004539056512667e-05, + "loss": 0.4355, + "step": 7695 + }, + { + "epoch": 1.1072763876905378, + "grad_norm": 0.2566698703429824, + "learning_rate": 1.399763854886633e-05, + "loss": 0.4252, + "step": 7700 + }, + { + "epoch": 1.1079953983318953, + "grad_norm": 0.2439159913441445, + "learning_rate": 1.3990735774847057e-05, + "loss": 0.4252, + "step": 7705 + }, + { + "epoch": 1.1087144089732528, + "grad_norm": 0.24823739748302368, + "learning_rate": 1.398383073836822e-05, + "loss": 0.4256, + "step": 7710 + }, + { + "epoch": 1.1094334196146103, + "grad_norm": 0.27016829076384286, + "learning_rate": 1.3976923443344483e-05, + "loss": 0.4257, + "step": 7715 + }, + { + "epoch": 1.1101524302559678, + "grad_norm": 0.24301415679475427, + "learning_rate": 1.3970013893691776e-05, + "loss": 0.4163, + "step": 7720 + }, + { + "epoch": 1.1108714408973253, + "grad_norm": 0.25527015767883365, + "learning_rate": 1.396310209332732e-05, + "loss": 0.4145, + "step": 7725 + }, + { + "epoch": 1.1115904515386827, + "grad_norm": 0.23980610885688808, + "learning_rate": 1.3956188046169607e-05, + "loss": 0.4145, + "step": 7730 + }, + { + "epoch": 1.1123094621800402, + "grad_norm": 0.2468743079078507, + "learning_rate": 1.3949271756138407e-05, + "loss": 0.4256, + "step": 7735 + }, + { + "epoch": 1.1130284728213977, + "grad_norm": 0.2633663801233096, + "learning_rate": 1.3942353227154755e-05, + "loss": 0.4226, + "step": 7740 + }, + { + "epoch": 1.1137474834627552, + "grad_norm": 0.25326954886349157, + "learning_rate": 1.3935432463140954e-05, + "loss": 0.4004, + "step": 7745 + }, + { + "epoch": 1.1144664941041127, + "grad_norm": 0.252813511679432, + "learning_rate": 1.3928509468020586e-05, + "loss": 0.4142, + "step": 7750 + }, + { + "epoch": 1.1151855047454702, + "grad_norm": 0.24948974834562754, + "learning_rate": 1.3921584245718485e-05, + "loss": 0.4275, + "step": 7755 + }, + { + "epoch": 1.1159045153868277, + "grad_norm": 0.2490774493997426, + "learning_rate": 1.3914656800160755e-05, + "loss": 0.4401, + "step": 7760 + }, + { + "epoch": 1.1166235260281852, + "grad_norm": 0.24731238961604693, + "learning_rate": 1.390772713527476e-05, + "loss": 0.413, + "step": 7765 + }, + { + "epoch": 1.1173425366695426, + "grad_norm": 0.24009966652121423, + "learning_rate": 1.3900795254989117e-05, + "loss": 0.4326, + "step": 7770 + }, + { + "epoch": 1.1180615473109001, + "grad_norm": 0.241251844552185, + "learning_rate": 1.3893861163233704e-05, + "loss": 0.4046, + "step": 7775 + }, + { + "epoch": 1.1187805579522576, + "grad_norm": 0.24711082983975394, + "learning_rate": 1.388692486393965e-05, + "loss": 0.4046, + "step": 7780 + }, + { + "epoch": 1.119499568593615, + "grad_norm": 0.24611081640811172, + "learning_rate": 1.3879986361039341e-05, + "loss": 0.4254, + "step": 7785 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.24883474377488835, + "learning_rate": 1.3873045658466404e-05, + "loss": 0.4179, + "step": 7790 + }, + { + "epoch": 1.12093758987633, + "grad_norm": 0.24887871632954492, + "learning_rate": 1.386610276015572e-05, + "loss": 0.4092, + "step": 7795 + }, + { + "epoch": 1.1216566005176876, + "grad_norm": 0.2339971482090034, + "learning_rate": 1.3859157670043409e-05, + "loss": 0.4139, + "step": 7800 + }, + { + "epoch": 1.122375611159045, + "grad_norm": 0.25291445076907043, + "learning_rate": 1.3852210392066837e-05, + "loss": 0.4435, + "step": 7805 + }, + { + "epoch": 1.1230946218004028, + "grad_norm": 0.23886337976308974, + "learning_rate": 1.384526093016461e-05, + "loss": 0.3929, + "step": 7810 + }, + { + "epoch": 1.12381363244176, + "grad_norm": 0.24867184978452428, + "learning_rate": 1.3838309288276577e-05, + "loss": 0.4214, + "step": 7815 + }, + { + "epoch": 1.1245326430831177, + "grad_norm": 0.2416953267844077, + "learning_rate": 1.383135547034381e-05, + "loss": 0.4268, + "step": 7820 + }, + { + "epoch": 1.125251653724475, + "grad_norm": 0.24714780517863277, + "learning_rate": 1.3824399480308625e-05, + "loss": 0.4255, + "step": 7825 + }, + { + "epoch": 1.1259706643658327, + "grad_norm": 0.24697736669814996, + "learning_rate": 1.3817441322114573e-05, + "loss": 0.4217, + "step": 7830 + }, + { + "epoch": 1.1266896750071902, + "grad_norm": 0.2446263674892036, + "learning_rate": 1.3810480999706424e-05, + "loss": 0.4333, + "step": 7835 + }, + { + "epoch": 1.1274086856485477, + "grad_norm": 0.24137930683222925, + "learning_rate": 1.3803518517030175e-05, + "loss": 0.4387, + "step": 7840 + }, + { + "epoch": 1.1281276962899052, + "grad_norm": 0.25949236377089124, + "learning_rate": 1.3796553878033056e-05, + "loss": 0.4309, + "step": 7845 + }, + { + "epoch": 1.1288467069312627, + "grad_norm": 0.25804771630659423, + "learning_rate": 1.3789587086663516e-05, + "loss": 0.4334, + "step": 7850 + }, + { + "epoch": 1.1295657175726201, + "grad_norm": 0.24467407111888867, + "learning_rate": 1.3782618146871222e-05, + "loss": 0.4189, + "step": 7855 + }, + { + "epoch": 1.1302847282139776, + "grad_norm": 0.24540869178112273, + "learning_rate": 1.3775647062607062e-05, + "loss": 0.426, + "step": 7860 + }, + { + "epoch": 1.1310037388553351, + "grad_norm": 0.2581233546593125, + "learning_rate": 1.3768673837823138e-05, + "loss": 0.4295, + "step": 7865 + }, + { + "epoch": 1.1317227494966926, + "grad_norm": 0.23733084790193076, + "learning_rate": 1.3761698476472767e-05, + "loss": 0.4099, + "step": 7870 + }, + { + "epoch": 1.13244176013805, + "grad_norm": 0.26599088742069377, + "learning_rate": 1.375472098251047e-05, + "loss": 0.4011, + "step": 7875 + }, + { + "epoch": 1.1331607707794076, + "grad_norm": 0.253357189978077, + "learning_rate": 1.3747741359891991e-05, + "loss": 0.4217, + "step": 7880 + }, + { + "epoch": 1.133879781420765, + "grad_norm": 0.25256913460661307, + "learning_rate": 1.3740759612574268e-05, + "loss": 0.4187, + "step": 7885 + }, + { + "epoch": 1.1345987920621226, + "grad_norm": 0.2434106525342183, + "learning_rate": 1.3733775744515452e-05, + "loss": 0.4259, + "step": 7890 + }, + { + "epoch": 1.13531780270348, + "grad_norm": 0.25358779229776013, + "learning_rate": 1.372678975967489e-05, + "loss": 0.4361, + "step": 7895 + }, + { + "epoch": 1.1360368133448375, + "grad_norm": 0.24904816611960595, + "learning_rate": 1.3719801662013133e-05, + "loss": 0.4202, + "step": 7900 + }, + { + "epoch": 1.136755823986195, + "grad_norm": 0.24093806308462346, + "learning_rate": 1.3712811455491927e-05, + "loss": 0.4176, + "step": 7905 + }, + { + "epoch": 1.1374748346275525, + "grad_norm": 0.24362253736347905, + "learning_rate": 1.370581914407422e-05, + "loss": 0.4065, + "step": 7910 + }, + { + "epoch": 1.13819384526891, + "grad_norm": 0.24153784261120567, + "learning_rate": 1.3698824731724147e-05, + "loss": 0.4263, + "step": 7915 + }, + { + "epoch": 1.1389128559102675, + "grad_norm": 0.26957486232644123, + "learning_rate": 1.3691828222407032e-05, + "loss": 0.4149, + "step": 7920 + }, + { + "epoch": 1.139631866551625, + "grad_norm": 0.25905751939209765, + "learning_rate": 1.3684829620089391e-05, + "loss": 0.421, + "step": 7925 + }, + { + "epoch": 1.1403508771929824, + "grad_norm": 0.2510604462676102, + "learning_rate": 1.3677828928738934e-05, + "loss": 0.4231, + "step": 7930 + }, + { + "epoch": 1.14106988783434, + "grad_norm": 0.24835283662485474, + "learning_rate": 1.3670826152324543e-05, + "loss": 0.4211, + "step": 7935 + }, + { + "epoch": 1.1417888984756974, + "grad_norm": 0.24748031095299838, + "learning_rate": 1.3663821294816289e-05, + "loss": 0.4218, + "step": 7940 + }, + { + "epoch": 1.142507909117055, + "grad_norm": 0.24418549656367056, + "learning_rate": 1.3656814360185422e-05, + "loss": 0.4239, + "step": 7945 + }, + { + "epoch": 1.1432269197584124, + "grad_norm": 0.24667828916811363, + "learning_rate": 1.3649805352404366e-05, + "loss": 0.4132, + "step": 7950 + }, + { + "epoch": 1.1439459303997699, + "grad_norm": 0.24542712661129085, + "learning_rate": 1.3642794275446728e-05, + "loss": 0.4138, + "step": 7955 + }, + { + "epoch": 1.1446649410411274, + "grad_norm": 0.25146817243635045, + "learning_rate": 1.363578113328728e-05, + "loss": 0.4319, + "step": 7960 + }, + { + "epoch": 1.1453839516824849, + "grad_norm": 0.25652758801455494, + "learning_rate": 1.362876592990197e-05, + "loss": 0.4197, + "step": 7965 + }, + { + "epoch": 1.1461029623238423, + "grad_norm": 0.2527655011860801, + "learning_rate": 1.3621748669267911e-05, + "loss": 0.4148, + "step": 7970 + }, + { + "epoch": 1.1468219729651998, + "grad_norm": 0.2437231962419539, + "learning_rate": 1.3614729355363382e-05, + "loss": 0.4087, + "step": 7975 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.2519247554626112, + "learning_rate": 1.3607707992167836e-05, + "loss": 0.4205, + "step": 7980 + }, + { + "epoch": 1.1482599942479148, + "grad_norm": 0.255776034441485, + "learning_rate": 1.3600684583661872e-05, + "loss": 0.415, + "step": 7985 + }, + { + "epoch": 1.1489790048892723, + "grad_norm": 0.25702160420203973, + "learning_rate": 1.3593659133827258e-05, + "loss": 0.4285, + "step": 7990 + }, + { + "epoch": 1.1496980155306298, + "grad_norm": 0.251075718116356, + "learning_rate": 1.358663164664692e-05, + "loss": 0.4233, + "step": 7995 + }, + { + "epoch": 1.1504170261719873, + "grad_norm": 0.24391957218362018, + "learning_rate": 1.3579602126104935e-05, + "loss": 0.4321, + "step": 8000 + }, + { + "epoch": 1.1511360368133448, + "grad_norm": 0.2542217038989035, + "learning_rate": 1.3572570576186535e-05, + "loss": 0.4246, + "step": 8005 + }, + { + "epoch": 1.1518550474547022, + "grad_norm": 0.24617547563080916, + "learning_rate": 1.3565537000878102e-05, + "loss": 0.4195, + "step": 8010 + }, + { + "epoch": 1.1525740580960597, + "grad_norm": 0.2406743128878967, + "learning_rate": 1.3558501404167168e-05, + "loss": 0.4211, + "step": 8015 + }, + { + "epoch": 1.1532930687374172, + "grad_norm": 0.2492803217604934, + "learning_rate": 1.3551463790042405e-05, + "loss": 0.4483, + "step": 8020 + }, + { + "epoch": 1.154012079378775, + "grad_norm": 0.2687431622773391, + "learning_rate": 1.3544424162493636e-05, + "loss": 0.4034, + "step": 8025 + }, + { + "epoch": 1.1547310900201322, + "grad_norm": 0.2474863901410814, + "learning_rate": 1.3537382525511827e-05, + "loss": 0.4248, + "step": 8030 + }, + { + "epoch": 1.15545010066149, + "grad_norm": 0.24277793893103647, + "learning_rate": 1.3530338883089068e-05, + "loss": 0.4138, + "step": 8035 + }, + { + "epoch": 1.1561691113028472, + "grad_norm": 0.2376746683514846, + "learning_rate": 1.3523293239218607e-05, + "loss": 0.405, + "step": 8040 + }, + { + "epoch": 1.1568881219442049, + "grad_norm": 0.2586039263754343, + "learning_rate": 1.3516245597894809e-05, + "loss": 0.4151, + "step": 8045 + }, + { + "epoch": 1.1576071325855624, + "grad_norm": 0.2573459661290238, + "learning_rate": 1.3509195963113179e-05, + "loss": 0.4208, + "step": 8050 + }, + { + "epoch": 1.1583261432269198, + "grad_norm": 0.26358292190965493, + "learning_rate": 1.3502144338870358e-05, + "loss": 0.4281, + "step": 8055 + }, + { + "epoch": 1.1590451538682773, + "grad_norm": 0.25201890612946326, + "learning_rate": 1.3495090729164103e-05, + "loss": 0.4108, + "step": 8060 + }, + { + "epoch": 1.1597641645096348, + "grad_norm": 0.25152231329952435, + "learning_rate": 1.3488035137993305e-05, + "loss": 0.4331, + "step": 8065 + }, + { + "epoch": 1.1604831751509923, + "grad_norm": 0.26500500442079494, + "learning_rate": 1.3480977569357974e-05, + "loss": 0.4222, + "step": 8070 + }, + { + "epoch": 1.1612021857923498, + "grad_norm": 0.25151093241524297, + "learning_rate": 1.3473918027259242e-05, + "loss": 0.4245, + "step": 8075 + }, + { + "epoch": 1.1619211964337073, + "grad_norm": 0.24952523717373598, + "learning_rate": 1.3466856515699367e-05, + "loss": 0.4213, + "step": 8080 + }, + { + "epoch": 1.1626402070750648, + "grad_norm": 0.24856168521182317, + "learning_rate": 1.345979303868171e-05, + "loss": 0.4099, + "step": 8085 + }, + { + "epoch": 1.1633592177164223, + "grad_norm": 0.26059949588767317, + "learning_rate": 1.3452727600210755e-05, + "loss": 0.4207, + "step": 8090 + }, + { + "epoch": 1.1640782283577797, + "grad_norm": 0.2610866688103912, + "learning_rate": 1.3445660204292098e-05, + "loss": 0.4105, + "step": 8095 + }, + { + "epoch": 1.1647972389991372, + "grad_norm": 0.24596872245945312, + "learning_rate": 1.3438590854932442e-05, + "loss": 0.427, + "step": 8100 + }, + { + "epoch": 1.1655162496404947, + "grad_norm": 0.250687613838101, + "learning_rate": 1.3431519556139599e-05, + "loss": 0.4031, + "step": 8105 + }, + { + "epoch": 1.1662352602818522, + "grad_norm": 0.24182193223557452, + "learning_rate": 1.3424446311922486e-05, + "loss": 0.4363, + "step": 8110 + }, + { + "epoch": 1.1669542709232097, + "grad_norm": 0.24363623767786466, + "learning_rate": 1.341737112629112e-05, + "loss": 0.4205, + "step": 8115 + }, + { + "epoch": 1.1676732815645672, + "grad_norm": 0.23567571845844587, + "learning_rate": 1.3410294003256623e-05, + "loss": 0.4273, + "step": 8120 + }, + { + "epoch": 1.1683922922059247, + "grad_norm": 0.2481386696407478, + "learning_rate": 1.3403214946831218e-05, + "loss": 0.4242, + "step": 8125 + }, + { + "epoch": 1.1691113028472822, + "grad_norm": 0.24984866659117091, + "learning_rate": 1.3396133961028214e-05, + "loss": 0.4151, + "step": 8130 + }, + { + "epoch": 1.1698303134886396, + "grad_norm": 0.2587341281267679, + "learning_rate": 1.3389051049862024e-05, + "loss": 0.4324, + "step": 8135 + }, + { + "epoch": 1.1705493241299971, + "grad_norm": 0.24480606706358338, + "learning_rate": 1.3381966217348143e-05, + "loss": 0.417, + "step": 8140 + }, + { + "epoch": 1.1712683347713546, + "grad_norm": 0.2467942565642489, + "learning_rate": 1.3374879467503163e-05, + "loss": 0.4279, + "step": 8145 + }, + { + "epoch": 1.171987345412712, + "grad_norm": 0.2554295736313284, + "learning_rate": 1.3367790804344762e-05, + "loss": 0.4398, + "step": 8150 + }, + { + "epoch": 1.1727063560540696, + "grad_norm": 0.24686156456775635, + "learning_rate": 1.33607002318917e-05, + "loss": 0.4292, + "step": 8155 + }, + { + "epoch": 1.173425366695427, + "grad_norm": 0.25350234857874493, + "learning_rate": 1.3353607754163822e-05, + "loss": 0.4171, + "step": 8160 + }, + { + "epoch": 1.1741443773367846, + "grad_norm": 0.2675757948436432, + "learning_rate": 1.3346513375182049e-05, + "loss": 0.425, + "step": 8165 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.24840026964787834, + "learning_rate": 1.333941709896838e-05, + "loss": 0.4243, + "step": 8170 + }, + { + "epoch": 1.1755823986194995, + "grad_norm": 0.24658512060060236, + "learning_rate": 1.3332318929545898e-05, + "loss": 0.4429, + "step": 8175 + }, + { + "epoch": 1.176301409260857, + "grad_norm": 0.24447367376785173, + "learning_rate": 1.3325218870938751e-05, + "loss": 0.4117, + "step": 8180 + }, + { + "epoch": 1.1770204199022145, + "grad_norm": 0.24290491986399293, + "learning_rate": 1.3318116927172162e-05, + "loss": 0.4111, + "step": 8185 + }, + { + "epoch": 1.177739430543572, + "grad_norm": 0.25434081056777064, + "learning_rate": 1.331101310227242e-05, + "loss": 0.4295, + "step": 8190 + }, + { + "epoch": 1.1784584411849295, + "grad_norm": 0.24918566963063316, + "learning_rate": 1.330390740026688e-05, + "loss": 0.4169, + "step": 8195 + }, + { + "epoch": 1.179177451826287, + "grad_norm": 0.2466669288347175, + "learning_rate": 1.3296799825183966e-05, + "loss": 0.4318, + "step": 8200 + }, + { + "epoch": 1.1798964624676445, + "grad_norm": 0.25137643483645633, + "learning_rate": 1.328969038105316e-05, + "loss": 0.4304, + "step": 8205 + }, + { + "epoch": 1.180615473109002, + "grad_norm": 0.25921147868275496, + "learning_rate": 1.3282579071905004e-05, + "loss": 0.4238, + "step": 8210 + }, + { + "epoch": 1.1813344837503594, + "grad_norm": 0.24375862567652218, + "learning_rate": 1.3275465901771094e-05, + "loss": 0.4086, + "step": 8215 + }, + { + "epoch": 1.182053494391717, + "grad_norm": 0.24452988663026934, + "learning_rate": 1.326835087468409e-05, + "loss": 0.4138, + "step": 8220 + }, + { + "epoch": 1.1827725050330744, + "grad_norm": 0.25794940392873345, + "learning_rate": 1.32612339946777e-05, + "loss": 0.4271, + "step": 8225 + }, + { + "epoch": 1.183491515674432, + "grad_norm": 0.24798941451520337, + "learning_rate": 1.3254115265786682e-05, + "loss": 0.405, + "step": 8230 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.2475664814793249, + "learning_rate": 1.3246994692046837e-05, + "loss": 0.4374, + "step": 8235 + }, + { + "epoch": 1.184929536957147, + "grad_norm": 0.2545691836978971, + "learning_rate": 1.323987227749502e-05, + "loss": 0.4209, + "step": 8240 + }, + { + "epoch": 1.1856485475985044, + "grad_norm": 0.26838769791648476, + "learning_rate": 1.323274802616913e-05, + "loss": 0.4113, + "step": 8245 + }, + { + "epoch": 1.186367558239862, + "grad_norm": 0.24614739433809657, + "learning_rate": 1.3225621942108098e-05, + "loss": 0.4301, + "step": 8250 + }, + { + "epoch": 1.1870865688812193, + "grad_norm": 0.2365370918897361, + "learning_rate": 1.3218494029351903e-05, + "loss": 0.4308, + "step": 8255 + }, + { + "epoch": 1.187805579522577, + "grad_norm": 0.262541605445906, + "learning_rate": 1.3211364291941562e-05, + "loss": 0.418, + "step": 8260 + }, + { + "epoch": 1.1885245901639343, + "grad_norm": 0.23740679928960856, + "learning_rate": 1.3204232733919113e-05, + "loss": 0.4251, + "step": 8265 + }, + { + "epoch": 1.189243600805292, + "grad_norm": 0.24576537383506467, + "learning_rate": 1.3197099359327643e-05, + "loss": 0.4216, + "step": 8270 + }, + { + "epoch": 1.1899626114466495, + "grad_norm": 0.2545486272631958, + "learning_rate": 1.318996417221126e-05, + "loss": 0.4294, + "step": 8275 + }, + { + "epoch": 1.190681622088007, + "grad_norm": 0.25064130034355747, + "learning_rate": 1.3182827176615098e-05, + "loss": 0.412, + "step": 8280 + }, + { + "epoch": 1.1914006327293645, + "grad_norm": 0.23643340314959274, + "learning_rate": 1.3175688376585323e-05, + "loss": 0.4399, + "step": 8285 + }, + { + "epoch": 1.192119643370722, + "grad_norm": 0.2537867803665056, + "learning_rate": 1.3168547776169117e-05, + "loss": 0.435, + "step": 8290 + }, + { + "epoch": 1.1928386540120794, + "grad_norm": 0.25676163776653566, + "learning_rate": 1.3161405379414686e-05, + "loss": 0.4288, + "step": 8295 + }, + { + "epoch": 1.193557664653437, + "grad_norm": 0.24692052623095706, + "learning_rate": 1.3154261190371255e-05, + "loss": 0.4169, + "step": 8300 + }, + { + "epoch": 1.1942766752947944, + "grad_norm": 0.24644854980084216, + "learning_rate": 1.3147115213089065e-05, + "loss": 0.4209, + "step": 8305 + }, + { + "epoch": 1.194995685936152, + "grad_norm": 0.2626283251769723, + "learning_rate": 1.3139967451619371e-05, + "loss": 0.4239, + "step": 8310 + }, + { + "epoch": 1.1957146965775094, + "grad_norm": 0.25140453305076255, + "learning_rate": 1.3132817910014435e-05, + "loss": 0.4325, + "step": 8315 + }, + { + "epoch": 1.1964337072188669, + "grad_norm": 0.24253654316024326, + "learning_rate": 1.3125666592327534e-05, + "loss": 0.4091, + "step": 8320 + }, + { + "epoch": 1.1971527178602244, + "grad_norm": 0.2523257882774095, + "learning_rate": 1.3118513502612951e-05, + "loss": 0.4269, + "step": 8325 + }, + { + "epoch": 1.1978717285015819, + "grad_norm": 0.2541110638728842, + "learning_rate": 1.311135864492597e-05, + "loss": 0.419, + "step": 8330 + }, + { + "epoch": 1.1985907391429393, + "grad_norm": 0.25649704405579965, + "learning_rate": 1.3104202023322879e-05, + "loss": 0.4154, + "step": 8335 + }, + { + "epoch": 1.1993097497842968, + "grad_norm": 0.2533668272093602, + "learning_rate": 1.3097043641860965e-05, + "loss": 0.4337, + "step": 8340 + }, + { + "epoch": 1.2000287604256543, + "grad_norm": 0.25450629122005425, + "learning_rate": 1.3089883504598525e-05, + "loss": 0.42, + "step": 8345 + }, + { + "epoch": 1.2007477710670118, + "grad_norm": 0.2420736074725916, + "learning_rate": 1.3082721615594828e-05, + "loss": 0.418, + "step": 8350 + }, + { + "epoch": 1.2014667817083693, + "grad_norm": 0.25432487846975754, + "learning_rate": 1.3075557978910156e-05, + "loss": 0.4233, + "step": 8355 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.24677056731156752, + "learning_rate": 1.3068392598605775e-05, + "loss": 0.4086, + "step": 8360 + }, + { + "epoch": 1.2029048029910843, + "grad_norm": 0.24647318234129603, + "learning_rate": 1.3061225478743933e-05, + "loss": 0.4071, + "step": 8365 + }, + { + "epoch": 1.2036238136324418, + "grad_norm": 0.2538408007703241, + "learning_rate": 1.3054056623387876e-05, + "loss": 0.4259, + "step": 8370 + }, + { + "epoch": 1.2043428242737992, + "grad_norm": 0.24974410344574643, + "learning_rate": 1.3046886036601829e-05, + "loss": 0.4127, + "step": 8375 + }, + { + "epoch": 1.2050618349151567, + "grad_norm": 0.24230198275162937, + "learning_rate": 1.3039713722450995e-05, + "loss": 0.4125, + "step": 8380 + }, + { + "epoch": 1.2057808455565142, + "grad_norm": 0.24913475757853068, + "learning_rate": 1.3032539685001558e-05, + "loss": 0.423, + "step": 8385 + }, + { + "epoch": 1.2064998561978717, + "grad_norm": 0.27232861147614684, + "learning_rate": 1.302536392832068e-05, + "loss": 0.415, + "step": 8390 + }, + { + "epoch": 1.2072188668392292, + "grad_norm": 0.24370010266072922, + "learning_rate": 1.3018186456476504e-05, + "loss": 0.4228, + "step": 8395 + }, + { + "epoch": 1.2079378774805867, + "grad_norm": 0.24506828275307008, + "learning_rate": 1.3011007273538134e-05, + "loss": 0.424, + "step": 8400 + }, + { + "epoch": 1.2086568881219442, + "grad_norm": 0.2628538262349111, + "learning_rate": 1.300382638357565e-05, + "loss": 0.4275, + "step": 8405 + }, + { + "epoch": 1.2093758987633016, + "grad_norm": 0.25852077220155995, + "learning_rate": 1.2996643790660102e-05, + "loss": 0.42, + "step": 8410 + }, + { + "epoch": 1.2100949094046591, + "grad_norm": 0.2555973034880567, + "learning_rate": 1.2989459498863498e-05, + "loss": 0.4266, + "step": 8415 + }, + { + "epoch": 1.2108139200460166, + "grad_norm": 0.37250123766470045, + "learning_rate": 1.2982273512258813e-05, + "loss": 0.3953, + "step": 8420 + }, + { + "epoch": 1.211532930687374, + "grad_norm": 0.2485536945534741, + "learning_rate": 1.2975085834919991e-05, + "loss": 0.4312, + "step": 8425 + }, + { + "epoch": 1.2122519413287316, + "grad_norm": 0.24032018458794435, + "learning_rate": 1.2967896470921922e-05, + "loss": 0.4168, + "step": 8430 + }, + { + "epoch": 1.212970951970089, + "grad_norm": 0.2526115365005817, + "learning_rate": 1.2960705424340453e-05, + "loss": 0.4139, + "step": 8435 + }, + { + "epoch": 1.2136899626114466, + "grad_norm": 0.26723808958337547, + "learning_rate": 1.2953512699252398e-05, + "loss": 0.4321, + "step": 8440 + }, + { + "epoch": 1.214408973252804, + "grad_norm": 0.23980364599778922, + "learning_rate": 1.2946318299735508e-05, + "loss": 0.4231, + "step": 8445 + }, + { + "epoch": 1.2151279838941615, + "grad_norm": 0.23168784713537172, + "learning_rate": 1.2939122229868489e-05, + "loss": 0.418, + "step": 8450 + }, + { + "epoch": 1.215846994535519, + "grad_norm": 0.25321859227375604, + "learning_rate": 1.2931924493730997e-05, + "loss": 0.431, + "step": 8455 + }, + { + "epoch": 1.2165660051768765, + "grad_norm": 0.25390809268714226, + "learning_rate": 1.2924725095403625e-05, + "loss": 0.436, + "step": 8460 + }, + { + "epoch": 1.2172850158182342, + "grad_norm": 0.25004190514605684, + "learning_rate": 1.2917524038967919e-05, + "loss": 0.4098, + "step": 8465 + }, + { + "epoch": 1.2180040264595915, + "grad_norm": 0.24400882190317358, + "learning_rate": 1.2910321328506355e-05, + "loss": 0.4175, + "step": 8470 + }, + { + "epoch": 1.2187230371009492, + "grad_norm": 0.25849678176050406, + "learning_rate": 1.2903116968102354e-05, + "loss": 0.4239, + "step": 8475 + }, + { + "epoch": 1.2194420477423065, + "grad_norm": 0.2511130706595977, + "learning_rate": 1.2895910961840263e-05, + "loss": 0.4092, + "step": 8480 + }, + { + "epoch": 1.2201610583836642, + "grad_norm": 0.2662565814750477, + "learning_rate": 1.2888703313805375e-05, + "loss": 0.4321, + "step": 8485 + }, + { + "epoch": 1.2208800690250217, + "grad_norm": 0.2610809882106791, + "learning_rate": 1.2881494028083901e-05, + "loss": 0.4385, + "step": 8490 + }, + { + "epoch": 1.2215990796663792, + "grad_norm": 0.2562432167176942, + "learning_rate": 1.2874283108762991e-05, + "loss": 0.4253, + "step": 8495 + }, + { + "epoch": 1.2223180903077366, + "grad_norm": 0.23942443774657562, + "learning_rate": 1.2867070559930715e-05, + "loss": 0.4174, + "step": 8500 + }, + { + "epoch": 1.2230371009490941, + "grad_norm": 0.2504392672198876, + "learning_rate": 1.2859856385676066e-05, + "loss": 0.4186, + "step": 8505 + }, + { + "epoch": 1.2237561115904516, + "grad_norm": 0.2413481143352121, + "learning_rate": 1.2852640590088964e-05, + "loss": 0.4273, + "step": 8510 + }, + { + "epoch": 1.224475122231809, + "grad_norm": 0.24855331184524024, + "learning_rate": 1.2845423177260245e-05, + "loss": 0.4249, + "step": 8515 + }, + { + "epoch": 1.2251941328731666, + "grad_norm": 0.2539988946799221, + "learning_rate": 1.2838204151281661e-05, + "loss": 0.4339, + "step": 8520 + }, + { + "epoch": 1.225913143514524, + "grad_norm": 0.26062461026941763, + "learning_rate": 1.2830983516245883e-05, + "loss": 0.4232, + "step": 8525 + }, + { + "epoch": 1.2266321541558816, + "grad_norm": 0.2405582501284004, + "learning_rate": 1.2823761276246483e-05, + "loss": 0.4208, + "step": 8530 + }, + { + "epoch": 1.227351164797239, + "grad_norm": 0.2560490797414399, + "learning_rate": 1.2816537435377953e-05, + "loss": 0.412, + "step": 8535 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 0.2490737915145687, + "learning_rate": 1.2809311997735697e-05, + "loss": 0.4406, + "step": 8540 + }, + { + "epoch": 1.228789186079954, + "grad_norm": 0.23857478709388596, + "learning_rate": 1.280208496741601e-05, + "loss": 0.4183, + "step": 8545 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.24339980366638034, + "learning_rate": 1.2794856348516095e-05, + "loss": 0.423, + "step": 8550 + }, + { + "epoch": 1.230227207362669, + "grad_norm": 0.23505042392329964, + "learning_rate": 1.2787626145134066e-05, + "loss": 0.4199, + "step": 8555 + }, + { + "epoch": 1.2309462180040265, + "grad_norm": 0.24505729095145254, + "learning_rate": 1.2780394361368923e-05, + "loss": 0.4306, + "step": 8560 + }, + { + "epoch": 1.231665228645384, + "grad_norm": 0.24991699271782725, + "learning_rate": 1.2773161001320568e-05, + "loss": 0.4174, + "step": 8565 + }, + { + "epoch": 1.2323842392867415, + "grad_norm": 0.2613446094274954, + "learning_rate": 1.2765926069089796e-05, + "loss": 0.4085, + "step": 8570 + }, + { + "epoch": 1.233103249928099, + "grad_norm": 0.24970744977753517, + "learning_rate": 1.2758689568778286e-05, + "loss": 0.4203, + "step": 8575 + }, + { + "epoch": 1.2338222605694564, + "grad_norm": 0.24192710879802426, + "learning_rate": 1.275145150448862e-05, + "loss": 0.414, + "step": 8580 + }, + { + "epoch": 1.234541271210814, + "grad_norm": 0.26464114170276903, + "learning_rate": 1.2744211880324248e-05, + "loss": 0.4177, + "step": 8585 + }, + { + "epoch": 1.2352602818521714, + "grad_norm": 0.24058953049530782, + "learning_rate": 1.2736970700389528e-05, + "loss": 0.4291, + "step": 8590 + }, + { + "epoch": 1.235979292493529, + "grad_norm": 0.26170215003131153, + "learning_rate": 1.2729727968789678e-05, + "loss": 0.4143, + "step": 8595 + }, + { + "epoch": 1.2366983031348864, + "grad_norm": 0.24147063478240458, + "learning_rate": 1.272248368963081e-05, + "loss": 0.4165, + "step": 8600 + }, + { + "epoch": 1.2374173137762439, + "grad_norm": 0.26028828499534745, + "learning_rate": 1.2715237867019904e-05, + "loss": 0.4268, + "step": 8605 + }, + { + "epoch": 1.2381363244176014, + "grad_norm": 0.26999382144154427, + "learning_rate": 1.270799050506482e-05, + "loss": 0.4277, + "step": 8610 + }, + { + "epoch": 1.2388553350589588, + "grad_norm": 0.24594834114587194, + "learning_rate": 1.2700741607874295e-05, + "loss": 0.429, + "step": 8615 + }, + { + "epoch": 1.2395743457003163, + "grad_norm": 0.24907555641346754, + "learning_rate": 1.2693491179557922e-05, + "loss": 0.4289, + "step": 8620 + }, + { + "epoch": 1.2402933563416738, + "grad_norm": 0.2527414030085228, + "learning_rate": 1.2686239224226183e-05, + "loss": 0.418, + "step": 8625 + }, + { + "epoch": 1.2410123669830313, + "grad_norm": 0.24952338659009965, + "learning_rate": 1.2678985745990401e-05, + "loss": 0.4277, + "step": 8630 + }, + { + "epoch": 1.2417313776243888, + "grad_norm": 0.2576321199932301, + "learning_rate": 1.2671730748962785e-05, + "loss": 0.4309, + "step": 8635 + }, + { + "epoch": 1.2424503882657463, + "grad_norm": 0.25162676433589115, + "learning_rate": 1.2664474237256394e-05, + "loss": 0.4221, + "step": 8640 + }, + { + "epoch": 1.2431693989071038, + "grad_norm": 0.25372190105004244, + "learning_rate": 1.2657216214985144e-05, + "loss": 0.4164, + "step": 8645 + }, + { + "epoch": 1.2438884095484612, + "grad_norm": 0.2513779098353839, + "learning_rate": 1.2649956686263814e-05, + "loss": 0.4243, + "step": 8650 + }, + { + "epoch": 1.2446074201898187, + "grad_norm": 0.26709377522559713, + "learning_rate": 1.2642695655208028e-05, + "loss": 0.4215, + "step": 8655 + }, + { + "epoch": 1.2453264308311762, + "grad_norm": 0.2701441209301692, + "learning_rate": 1.2635433125934273e-05, + "loss": 0.4209, + "step": 8660 + }, + { + "epoch": 1.2460454414725337, + "grad_norm": 0.24942264858926272, + "learning_rate": 1.2628169102559878e-05, + "loss": 0.4115, + "step": 8665 + }, + { + "epoch": 1.2467644521138912, + "grad_norm": 0.24566825676531506, + "learning_rate": 1.262090358920302e-05, + "loss": 0.4194, + "step": 8670 + }, + { + "epoch": 1.2474834627552487, + "grad_norm": 0.2475653256105879, + "learning_rate": 1.2613636589982723e-05, + "loss": 0.4166, + "step": 8675 + }, + { + "epoch": 1.2482024733966064, + "grad_norm": 0.25544820127515167, + "learning_rate": 1.260636810901885e-05, + "loss": 0.4039, + "step": 8680 + }, + { + "epoch": 1.2489214840379637, + "grad_norm": 0.2427882896810874, + "learning_rate": 1.2599098150432103e-05, + "loss": 0.4381, + "step": 8685 + }, + { + "epoch": 1.2496404946793214, + "grad_norm": 0.2558819267236487, + "learning_rate": 1.2591826718344034e-05, + "loss": 0.4282, + "step": 8690 + }, + { + "epoch": 1.2503595053206786, + "grad_norm": 0.2632854056788182, + "learning_rate": 1.2584553816877012e-05, + "loss": 0.4185, + "step": 8695 + }, + { + "epoch": 1.2510785159620363, + "grad_norm": 0.25350775902291917, + "learning_rate": 1.257727945015425e-05, + "loss": 0.4245, + "step": 8700 + }, + { + "epoch": 1.2517975266033936, + "grad_norm": 0.24278316129715752, + "learning_rate": 1.2570003622299792e-05, + "loss": 0.4011, + "step": 8705 + }, + { + "epoch": 1.2525165372447513, + "grad_norm": 0.25336770292287847, + "learning_rate": 1.2562726337438504e-05, + "loss": 0.4172, + "step": 8710 + }, + { + "epoch": 1.2532355478861086, + "grad_norm": 0.25741926259404213, + "learning_rate": 1.2555447599696086e-05, + "loss": 0.4356, + "step": 8715 + }, + { + "epoch": 1.2539545585274663, + "grad_norm": 0.24037401587557164, + "learning_rate": 1.254816741319906e-05, + "loss": 0.418, + "step": 8720 + }, + { + "epoch": 1.2546735691688238, + "grad_norm": 0.2383795745878674, + "learning_rate": 1.2540885782074756e-05, + "loss": 0.4341, + "step": 8725 + }, + { + "epoch": 1.2553925798101813, + "grad_norm": 0.28056766382702697, + "learning_rate": 1.2533602710451345e-05, + "loss": 0.4255, + "step": 8730 + }, + { + "epoch": 1.2561115904515388, + "grad_norm": 0.25016980018947965, + "learning_rate": 1.25263182024578e-05, + "loss": 0.4309, + "step": 8735 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.25477712843950795, + "learning_rate": 1.2519032262223913e-05, + "loss": 0.4081, + "step": 8740 + }, + { + "epoch": 1.2575496117342537, + "grad_norm": 0.2512275325163774, + "learning_rate": 1.2511744893880286e-05, + "loss": 0.4297, + "step": 8745 + }, + { + "epoch": 1.2582686223756112, + "grad_norm": 0.25050196308698536, + "learning_rate": 1.250445610155833e-05, + "loss": 0.4396, + "step": 8750 + }, + { + "epoch": 1.2589876330169687, + "grad_norm": 0.24522131997693686, + "learning_rate": 1.2497165889390269e-05, + "loss": 0.4147, + "step": 8755 + }, + { + "epoch": 1.2597066436583262, + "grad_norm": 0.25364950567029165, + "learning_rate": 1.2489874261509123e-05, + "loss": 0.4313, + "step": 8760 + }, + { + "epoch": 1.2604256542996837, + "grad_norm": 0.24474088408856176, + "learning_rate": 1.2482581222048724e-05, + "loss": 0.4146, + "step": 8765 + }, + { + "epoch": 1.2611446649410412, + "grad_norm": 0.24892207109833964, + "learning_rate": 1.2475286775143698e-05, + "loss": 0.4079, + "step": 8770 + }, + { + "epoch": 1.2618636755823986, + "grad_norm": 0.24562891750634785, + "learning_rate": 1.246799092492947e-05, + "loss": 0.4153, + "step": 8775 + }, + { + "epoch": 1.2625826862237561, + "grad_norm": 0.2676145885530362, + "learning_rate": 1.2460693675542257e-05, + "loss": 0.4134, + "step": 8780 + }, + { + "epoch": 1.2633016968651136, + "grad_norm": 0.24387436135045176, + "learning_rate": 1.2453395031119082e-05, + "loss": 0.4097, + "step": 8785 + }, + { + "epoch": 1.264020707506471, + "grad_norm": 0.2544788927491767, + "learning_rate": 1.2446094995797748e-05, + "loss": 0.4206, + "step": 8790 + }, + { + "epoch": 1.2647397181478286, + "grad_norm": 0.24259194034592435, + "learning_rate": 1.2438793573716848e-05, + "loss": 0.4172, + "step": 8795 + }, + { + "epoch": 1.265458728789186, + "grad_norm": 0.2536800428499078, + "learning_rate": 1.2431490769015757e-05, + "loss": 0.428, + "step": 8800 + }, + { + "epoch": 1.2661777394305436, + "grad_norm": 0.2536542632015627, + "learning_rate": 1.2424186585834646e-05, + "loss": 0.4207, + "step": 8805 + }, + { + "epoch": 1.266896750071901, + "grad_norm": 0.2535409529020453, + "learning_rate": 1.2416881028314457e-05, + "loss": 0.4292, + "step": 8810 + }, + { + "epoch": 1.2676157607132585, + "grad_norm": 0.2760392008221144, + "learning_rate": 1.2409574100596917e-05, + "loss": 0.4266, + "step": 8815 + }, + { + "epoch": 1.268334771354616, + "grad_norm": 0.26927077491976653, + "learning_rate": 1.2402265806824528e-05, + "loss": 0.4254, + "step": 8820 + }, + { + "epoch": 1.2690537819959735, + "grad_norm": 0.24132731029927765, + "learning_rate": 1.2394956151140558e-05, + "loss": 0.4287, + "step": 8825 + }, + { + "epoch": 1.269772792637331, + "grad_norm": 0.2506050738859499, + "learning_rate": 1.238764513768906e-05, + "loss": 0.4217, + "step": 8830 + }, + { + "epoch": 1.2704918032786885, + "grad_norm": 0.24099722920341918, + "learning_rate": 1.2380332770614856e-05, + "loss": 0.4156, + "step": 8835 + }, + { + "epoch": 1.271210813920046, + "grad_norm": 0.25846590141325754, + "learning_rate": 1.2373019054063528e-05, + "loss": 0.3999, + "step": 8840 + }, + { + "epoch": 1.2719298245614035, + "grad_norm": 0.2537504710160106, + "learning_rate": 1.2365703992181425e-05, + "loss": 0.42, + "step": 8845 + }, + { + "epoch": 1.272648835202761, + "grad_norm": 0.2500198765457716, + "learning_rate": 1.235838758911566e-05, + "loss": 0.4135, + "step": 8850 + }, + { + "epoch": 1.2733678458441184, + "grad_norm": 0.25212979662710866, + "learning_rate": 1.2351069849014106e-05, + "loss": 0.4144, + "step": 8855 + }, + { + "epoch": 1.274086856485476, + "grad_norm": 0.24726969745223198, + "learning_rate": 1.2343750776025396e-05, + "loss": 0.4327, + "step": 8860 + }, + { + "epoch": 1.2748058671268334, + "grad_norm": 0.2418920714048062, + "learning_rate": 1.2336430374298914e-05, + "loss": 0.4329, + "step": 8865 + }, + { + "epoch": 1.275524877768191, + "grad_norm": 0.25764744072883017, + "learning_rate": 1.2329108647984805e-05, + "loss": 0.4427, + "step": 8870 + }, + { + "epoch": 1.2762438884095484, + "grad_norm": 0.241722982585825, + "learning_rate": 1.2321785601233956e-05, + "loss": 0.4207, + "step": 8875 + }, + { + "epoch": 1.2769628990509059, + "grad_norm": 0.2496516652588619, + "learning_rate": 1.2314461238198003e-05, + "loss": 0.4136, + "step": 8880 + }, + { + "epoch": 1.2776819096922634, + "grad_norm": 0.25612736017110765, + "learning_rate": 1.2307135563029343e-05, + "loss": 0.4077, + "step": 8885 + }, + { + "epoch": 1.2784009203336208, + "grad_norm": 0.2587883532969212, + "learning_rate": 1.2299808579881096e-05, + "loss": 0.4061, + "step": 8890 + }, + { + "epoch": 1.2791199309749786, + "grad_norm": 0.26266354817234644, + "learning_rate": 1.2292480292907139e-05, + "loss": 0.4194, + "step": 8895 + }, + { + "epoch": 1.2798389416163358, + "grad_norm": 0.26595637586806237, + "learning_rate": 1.2285150706262079e-05, + "loss": 0.4165, + "step": 8900 + }, + { + "epoch": 1.2805579522576935, + "grad_norm": 0.24296353166495893, + "learning_rate": 1.2277819824101267e-05, + "loss": 0.4156, + "step": 8905 + }, + { + "epoch": 1.2812769628990508, + "grad_norm": 0.24802595972323246, + "learning_rate": 1.227048765058078e-05, + "loss": 0.4358, + "step": 8910 + }, + { + "epoch": 1.2819959735404085, + "grad_norm": 0.24939904196648624, + "learning_rate": 1.2263154189857437e-05, + "loss": 0.4202, + "step": 8915 + }, + { + "epoch": 1.2827149841817658, + "grad_norm": 0.2631457107121235, + "learning_rate": 1.225581944608878e-05, + "loss": 0.4221, + "step": 8920 + }, + { + "epoch": 1.2834339948231235, + "grad_norm": 0.25265520731043345, + "learning_rate": 1.2248483423433075e-05, + "loss": 0.4254, + "step": 8925 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.2571500562558599, + "learning_rate": 1.2241146126049326e-05, + "loss": 0.4205, + "step": 8930 + }, + { + "epoch": 1.2848720161058385, + "grad_norm": 0.261232092036823, + "learning_rate": 1.2233807558097248e-05, + "loss": 0.4057, + "step": 8935 + }, + { + "epoch": 1.2855910267471957, + "grad_norm": 0.2701298075039673, + "learning_rate": 1.2226467723737282e-05, + "loss": 0.4073, + "step": 8940 + }, + { + "epoch": 1.2863100373885534, + "grad_norm": 0.23623260788931721, + "learning_rate": 1.221912662713058e-05, + "loss": 0.4246, + "step": 8945 + }, + { + "epoch": 1.287029048029911, + "grad_norm": 0.23770205432246938, + "learning_rate": 1.221178427243902e-05, + "loss": 0.4206, + "step": 8950 + }, + { + "epoch": 1.2877480586712684, + "grad_norm": 0.23938848354815995, + "learning_rate": 1.2204440663825185e-05, + "loss": 0.4264, + "step": 8955 + }, + { + "epoch": 1.288467069312626, + "grad_norm": 0.24699020765493063, + "learning_rate": 1.2197095805452374e-05, + "loss": 0.4169, + "step": 8960 + }, + { + "epoch": 1.2891860799539834, + "grad_norm": 0.2534254886038366, + "learning_rate": 1.2189749701484593e-05, + "loss": 0.4155, + "step": 8965 + }, + { + "epoch": 1.2899050905953409, + "grad_norm": 0.2589522794130376, + "learning_rate": 1.2182402356086552e-05, + "loss": 0.4152, + "step": 8970 + }, + { + "epoch": 1.2906241012366984, + "grad_norm": 0.24692218182129114, + "learning_rate": 1.2175053773423663e-05, + "loss": 0.4281, + "step": 8975 + }, + { + "epoch": 1.2913431118780558, + "grad_norm": 0.24207100979568935, + "learning_rate": 1.2167703957662047e-05, + "loss": 0.4139, + "step": 8980 + }, + { + "epoch": 1.2920621225194133, + "grad_norm": 0.25062807272082144, + "learning_rate": 1.2160352912968521e-05, + "loss": 0.4171, + "step": 8985 + }, + { + "epoch": 1.2927811331607708, + "grad_norm": 0.24926449357321184, + "learning_rate": 1.2153000643510593e-05, + "loss": 0.4205, + "step": 8990 + }, + { + "epoch": 1.2935001438021283, + "grad_norm": 0.2453477272661754, + "learning_rate": 1.214564715345647e-05, + "loss": 0.4141, + "step": 8995 + }, + { + "epoch": 1.2942191544434858, + "grad_norm": 0.25636505060068915, + "learning_rate": 1.2138292446975055e-05, + "loss": 0.4308, + "step": 9000 + }, + { + "epoch": 1.2949381650848433, + "grad_norm": 0.26972091191831327, + "learning_rate": 1.2130936528235936e-05, + "loss": 0.4046, + "step": 9005 + }, + { + "epoch": 1.2956571757262008, + "grad_norm": 0.25918071989917435, + "learning_rate": 1.2123579401409384e-05, + "loss": 0.4279, + "step": 9010 + }, + { + "epoch": 1.2963761863675582, + "grad_norm": 0.2630262093217878, + "learning_rate": 1.2116221070666365e-05, + "loss": 0.4175, + "step": 9015 + }, + { + "epoch": 1.2970951970089157, + "grad_norm": 0.24243825902734034, + "learning_rate": 1.2108861540178523e-05, + "loss": 0.4122, + "step": 9020 + }, + { + "epoch": 1.2978142076502732, + "grad_norm": 0.25506959698223824, + "learning_rate": 1.2101500814118173e-05, + "loss": 0.4152, + "step": 9025 + }, + { + "epoch": 1.2985332182916307, + "grad_norm": 0.2631526036818052, + "learning_rate": 1.2094138896658323e-05, + "loss": 0.4216, + "step": 9030 + }, + { + "epoch": 1.2992522289329882, + "grad_norm": 0.24600693980020885, + "learning_rate": 1.2086775791972652e-05, + "loss": 0.419, + "step": 9035 + }, + { + "epoch": 1.2999712395743457, + "grad_norm": 0.25090266996787813, + "learning_rate": 1.2079411504235503e-05, + "loss": 0.4295, + "step": 9040 + }, + { + "epoch": 1.3006902502157032, + "grad_norm": 0.24291223275973534, + "learning_rate": 1.2072046037621898e-05, + "loss": 0.4222, + "step": 9045 + }, + { + "epoch": 1.3014092608570607, + "grad_norm": 0.23813746309860986, + "learning_rate": 1.206467939630753e-05, + "loss": 0.4103, + "step": 9050 + }, + { + "epoch": 1.3021282714984181, + "grad_norm": 0.24884896644308455, + "learning_rate": 1.205731158446875e-05, + "loss": 0.4172, + "step": 9055 + }, + { + "epoch": 1.3028472821397756, + "grad_norm": 0.2555252116292196, + "learning_rate": 1.2049942606282575e-05, + "loss": 0.4252, + "step": 9060 + }, + { + "epoch": 1.3035662927811331, + "grad_norm": 0.24401614065301408, + "learning_rate": 1.2042572465926687e-05, + "loss": 0.427, + "step": 9065 + }, + { + "epoch": 1.3042853034224906, + "grad_norm": 0.24417830806495353, + "learning_rate": 1.2035201167579427e-05, + "loss": 0.4256, + "step": 9070 + }, + { + "epoch": 1.305004314063848, + "grad_norm": 0.2715529724400531, + "learning_rate": 1.2027828715419782e-05, + "loss": 0.4265, + "step": 9075 + }, + { + "epoch": 1.3057233247052056, + "grad_norm": 0.25038790481816453, + "learning_rate": 1.202045511362741e-05, + "loss": 0.4268, + "step": 9080 + }, + { + "epoch": 1.306442335346563, + "grad_norm": 0.2634032672442618, + "learning_rate": 1.2013080366382608e-05, + "loss": 0.4201, + "step": 9085 + }, + { + "epoch": 1.3071613459879206, + "grad_norm": 0.253186940890943, + "learning_rate": 1.2005704477866324e-05, + "loss": 0.4158, + "step": 9090 + }, + { + "epoch": 1.307880356629278, + "grad_norm": 0.2336901935095103, + "learning_rate": 1.1998327452260156e-05, + "loss": 0.4315, + "step": 9095 + }, + { + "epoch": 1.3085993672706355, + "grad_norm": 0.25825144487765334, + "learning_rate": 1.1990949293746348e-05, + "loss": 0.4061, + "step": 9100 + }, + { + "epoch": 1.309318377911993, + "grad_norm": 0.2444218307835328, + "learning_rate": 1.1983570006507782e-05, + "loss": 0.4368, + "step": 9105 + }, + { + "epoch": 1.3100373885533507, + "grad_norm": 0.24922025162474865, + "learning_rate": 1.1976189594727984e-05, + "loss": 0.4103, + "step": 9110 + }, + { + "epoch": 1.310756399194708, + "grad_norm": 0.2522836576143024, + "learning_rate": 1.1968808062591115e-05, + "loss": 0.4157, + "step": 9115 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.25283861678020636, + "learning_rate": 1.196142541428197e-05, + "loss": 0.4139, + "step": 9120 + }, + { + "epoch": 1.312194420477423, + "grad_norm": 0.24988364344354413, + "learning_rate": 1.1954041653985982e-05, + "loss": 0.4301, + "step": 9125 + }, + { + "epoch": 1.3129134311187807, + "grad_norm": 0.2553856076148972, + "learning_rate": 1.1946656785889206e-05, + "loss": 0.418, + "step": 9130 + }, + { + "epoch": 1.313632441760138, + "grad_norm": 0.2550773085985317, + "learning_rate": 1.1939270814178337e-05, + "loss": 0.4185, + "step": 9135 + }, + { + "epoch": 1.3143514524014956, + "grad_norm": 0.25301840877820897, + "learning_rate": 1.193188374304068e-05, + "loss": 0.4046, + "step": 9140 + }, + { + "epoch": 1.315070463042853, + "grad_norm": 0.24418782329920338, + "learning_rate": 1.1924495576664176e-05, + "loss": 0.4207, + "step": 9145 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.2570383060224066, + "learning_rate": 1.1917106319237386e-05, + "loss": 0.4334, + "step": 9150 + }, + { + "epoch": 1.3165084843255679, + "grad_norm": 0.2538138864816419, + "learning_rate": 1.1909715974949481e-05, + "loss": 0.429, + "step": 9155 + }, + { + "epoch": 1.3172274949669256, + "grad_norm": 0.24631562920038322, + "learning_rate": 1.1902324547990257e-05, + "loss": 0.4197, + "step": 9160 + }, + { + "epoch": 1.317946505608283, + "grad_norm": 0.2596974676338609, + "learning_rate": 1.189493204255012e-05, + "loss": 0.4207, + "step": 9165 + }, + { + "epoch": 1.3186655162496406, + "grad_norm": 0.2676532626426566, + "learning_rate": 1.1887538462820088e-05, + "loss": 0.4163, + "step": 9170 + }, + { + "epoch": 1.319384526890998, + "grad_norm": 0.2515849338266774, + "learning_rate": 1.1880143812991785e-05, + "loss": 0.4227, + "step": 9175 + }, + { + "epoch": 1.3201035375323555, + "grad_norm": 0.24876902146122476, + "learning_rate": 1.1872748097257446e-05, + "loss": 0.4217, + "step": 9180 + }, + { + "epoch": 1.320822548173713, + "grad_norm": 0.24178909751029976, + "learning_rate": 1.1865351319809913e-05, + "loss": 0.4027, + "step": 9185 + }, + { + "epoch": 1.3215415588150705, + "grad_norm": 0.2743929624580491, + "learning_rate": 1.185795348484262e-05, + "loss": 0.4103, + "step": 9190 + }, + { + "epoch": 1.322260569456428, + "grad_norm": 0.24909239087389742, + "learning_rate": 1.1850554596549606e-05, + "loss": 0.4062, + "step": 9195 + }, + { + "epoch": 1.3229795800977855, + "grad_norm": 0.26268882213569555, + "learning_rate": 1.1843154659125513e-05, + "loss": 0.4198, + "step": 9200 + }, + { + "epoch": 1.323698590739143, + "grad_norm": 0.2593561140192512, + "learning_rate": 1.1835753676765567e-05, + "loss": 0.4401, + "step": 9205 + }, + { + "epoch": 1.3244176013805005, + "grad_norm": 0.23760104891675088, + "learning_rate": 1.1828351653665596e-05, + "loss": 0.4125, + "step": 9210 + }, + { + "epoch": 1.325136612021858, + "grad_norm": 0.2579797719848589, + "learning_rate": 1.1820948594022009e-05, + "loss": 0.4312, + "step": 9215 + }, + { + "epoch": 1.3258556226632154, + "grad_norm": 0.2549569947724984, + "learning_rate": 1.1813544502031808e-05, + "loss": 0.4266, + "step": 9220 + }, + { + "epoch": 1.326574633304573, + "grad_norm": 0.27832142998017334, + "learning_rate": 1.180613938189258e-05, + "loss": 0.4066, + "step": 9225 + }, + { + "epoch": 1.3272936439459304, + "grad_norm": 0.24598302504979594, + "learning_rate": 1.17987332378025e-05, + "loss": 0.4133, + "step": 9230 + }, + { + "epoch": 1.328012654587288, + "grad_norm": 0.23934405812207502, + "learning_rate": 1.1791326073960313e-05, + "loss": 0.4147, + "step": 9235 + }, + { + "epoch": 1.3287316652286454, + "grad_norm": 0.2470972899882699, + "learning_rate": 1.1783917894565344e-05, + "loss": 0.4194, + "step": 9240 + }, + { + "epoch": 1.3294506758700029, + "grad_norm": 0.25432228325449185, + "learning_rate": 1.1776508703817503e-05, + "loss": 0.4259, + "step": 9245 + }, + { + "epoch": 1.3301696865113604, + "grad_norm": 0.2509491368439348, + "learning_rate": 1.176909850591726e-05, + "loss": 0.4315, + "step": 9250 + }, + { + "epoch": 1.3308886971527178, + "grad_norm": 0.24480475137300584, + "learning_rate": 1.176168730506567e-05, + "loss": 0.4211, + "step": 9255 + }, + { + "epoch": 1.3316077077940753, + "grad_norm": 0.2465616123435715, + "learning_rate": 1.1754275105464349e-05, + "loss": 0.4132, + "step": 9260 + }, + { + "epoch": 1.3323267184354328, + "grad_norm": 0.24228887418427103, + "learning_rate": 1.1746861911315476e-05, + "loss": 0.4167, + "step": 9265 + }, + { + "epoch": 1.3330457290767903, + "grad_norm": 0.24133922804214428, + "learning_rate": 1.1739447726821798e-05, + "loss": 0.4313, + "step": 9270 + }, + { + "epoch": 1.3337647397181478, + "grad_norm": 0.2577785421881333, + "learning_rate": 1.1732032556186626e-05, + "loss": 0.415, + "step": 9275 + }, + { + "epoch": 1.3344837503595053, + "grad_norm": 0.2628027543411037, + "learning_rate": 1.1724616403613827e-05, + "loss": 0.4035, + "step": 9280 + }, + { + "epoch": 1.3352027610008628, + "grad_norm": 0.23816228620413699, + "learning_rate": 1.1717199273307826e-05, + "loss": 0.4208, + "step": 9285 + }, + { + "epoch": 1.3359217716422203, + "grad_norm": 0.263877912817765, + "learning_rate": 1.1709781169473599e-05, + "loss": 0.4236, + "step": 9290 + }, + { + "epoch": 1.3366407822835777, + "grad_norm": 0.2438005305187124, + "learning_rate": 1.1702362096316675e-05, + "loss": 0.4227, + "step": 9295 + }, + { + "epoch": 1.3373597929249352, + "grad_norm": 0.2621549663696597, + "learning_rate": 1.169494205804314e-05, + "loss": 0.4241, + "step": 9300 + }, + { + "epoch": 1.3380788035662927, + "grad_norm": 0.24567814302505314, + "learning_rate": 1.1687521058859612e-05, + "loss": 0.4281, + "step": 9305 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.26479997475978473, + "learning_rate": 1.1680099102973271e-05, + "loss": 0.4254, + "step": 9310 + }, + { + "epoch": 1.3395168248490077, + "grad_norm": 0.23805591644841448, + "learning_rate": 1.1672676194591825e-05, + "loss": 0.4172, + "step": 9315 + }, + { + "epoch": 1.3402358354903652, + "grad_norm": 0.25929710952180146, + "learning_rate": 1.1665252337923529e-05, + "loss": 0.42, + "step": 9320 + }, + { + "epoch": 1.3409548461317227, + "grad_norm": 0.24398070016346668, + "learning_rate": 1.165782753717718e-05, + "loss": 0.4106, + "step": 9325 + }, + { + "epoch": 1.3416738567730802, + "grad_norm": 0.25164634515915246, + "learning_rate": 1.1650401796562098e-05, + "loss": 0.4204, + "step": 9330 + }, + { + "epoch": 1.3423928674144379, + "grad_norm": 0.2594687930418735, + "learning_rate": 1.1642975120288148e-05, + "loss": 0.4362, + "step": 9335 + }, + { + "epoch": 1.3431118780557951, + "grad_norm": 0.25432427841688693, + "learning_rate": 1.1635547512565719e-05, + "loss": 0.401, + "step": 9340 + }, + { + "epoch": 1.3438308886971528, + "grad_norm": 0.2658648488496538, + "learning_rate": 1.1628118977605724e-05, + "loss": 0.4117, + "step": 9345 + }, + { + "epoch": 1.34454989933851, + "grad_norm": 0.27131429608303353, + "learning_rate": 1.1620689519619614e-05, + "loss": 0.4269, + "step": 9350 + }, + { + "epoch": 1.3452689099798678, + "grad_norm": 0.24804847320340484, + "learning_rate": 1.1613259142819352e-05, + "loss": 0.4191, + "step": 9355 + }, + { + "epoch": 1.345987920621225, + "grad_norm": 0.2567138794450249, + "learning_rate": 1.160582785141743e-05, + "loss": 0.4088, + "step": 9360 + }, + { + "epoch": 1.3467069312625828, + "grad_norm": 0.2544226922118282, + "learning_rate": 1.159839564962685e-05, + "loss": 0.4294, + "step": 9365 + }, + { + "epoch": 1.34742594190394, + "grad_norm": 0.25368336566165745, + "learning_rate": 1.159096254166114e-05, + "loss": 0.4211, + "step": 9370 + }, + { + "epoch": 1.3481449525452978, + "grad_norm": 0.2533635232451979, + "learning_rate": 1.158352853173433e-05, + "loss": 0.4158, + "step": 9375 + }, + { + "epoch": 1.348863963186655, + "grad_norm": 0.24139131564856997, + "learning_rate": 1.1576093624060973e-05, + "loss": 0.4182, + "step": 9380 + }, + { + "epoch": 1.3495829738280127, + "grad_norm": 0.25838832899405517, + "learning_rate": 1.1568657822856124e-05, + "loss": 0.4118, + "step": 9385 + }, + { + "epoch": 1.3503019844693702, + "grad_norm": 0.24150676562955783, + "learning_rate": 1.1561221132335345e-05, + "loss": 0.4262, + "step": 9390 + }, + { + "epoch": 1.3510209951107277, + "grad_norm": 0.24973426047606528, + "learning_rate": 1.1553783556714705e-05, + "loss": 0.4327, + "step": 9395 + }, + { + "epoch": 1.3517400057520852, + "grad_norm": 0.2552059385133282, + "learning_rate": 1.1546345100210774e-05, + "loss": 0.4231, + "step": 9400 + }, + { + "epoch": 1.3524590163934427, + "grad_norm": 0.24879979757131104, + "learning_rate": 1.153890576704062e-05, + "loss": 0.4212, + "step": 9405 + }, + { + "epoch": 1.3531780270348002, + "grad_norm": 0.2510385449145052, + "learning_rate": 1.1531465561421808e-05, + "loss": 0.4266, + "step": 9410 + }, + { + "epoch": 1.3538970376761577, + "grad_norm": 0.2458257142853843, + "learning_rate": 1.1524024487572399e-05, + "loss": 0.4166, + "step": 9415 + }, + { + "epoch": 1.3546160483175151, + "grad_norm": 0.24926383458245444, + "learning_rate": 1.1516582549710947e-05, + "loss": 0.4169, + "step": 9420 + }, + { + "epoch": 1.3553350589588726, + "grad_norm": 0.26251012756652353, + "learning_rate": 1.1509139752056493e-05, + "loss": 0.4139, + "step": 9425 + }, + { + "epoch": 1.3560540696002301, + "grad_norm": 0.25493395903499944, + "learning_rate": 1.1501696098828568e-05, + "loss": 0.4397, + "step": 9430 + }, + { + "epoch": 1.3567730802415876, + "grad_norm": 0.2484301244686449, + "learning_rate": 1.1494251594247183e-05, + "loss": 0.4132, + "step": 9435 + }, + { + "epoch": 1.357492090882945, + "grad_norm": 0.25680415561428516, + "learning_rate": 1.1486806242532839e-05, + "loss": 0.4157, + "step": 9440 + }, + { + "epoch": 1.3582111015243026, + "grad_norm": 0.25511888621037243, + "learning_rate": 1.1479360047906511e-05, + "loss": 0.4248, + "step": 9445 + }, + { + "epoch": 1.35893011216566, + "grad_norm": 0.24114788267993964, + "learning_rate": 1.1471913014589665e-05, + "loss": 0.4089, + "step": 9450 + }, + { + "epoch": 1.3596491228070176, + "grad_norm": 0.24653045174658883, + "learning_rate": 1.1464465146804218e-05, + "loss": 0.4121, + "step": 9455 + }, + { + "epoch": 1.360368133448375, + "grad_norm": 0.2535711076643114, + "learning_rate": 1.145701644877258e-05, + "loss": 0.4175, + "step": 9460 + }, + { + "epoch": 1.3610871440897325, + "grad_norm": 0.23796423442406128, + "learning_rate": 1.1449566924717627e-05, + "loss": 0.4115, + "step": 9465 + }, + { + "epoch": 1.36180615473109, + "grad_norm": 0.25045376780440926, + "learning_rate": 1.1442116578862701e-05, + "loss": 0.4182, + "step": 9470 + }, + { + "epoch": 1.3625251653724475, + "grad_norm": 0.24208080670495713, + "learning_rate": 1.1434665415431614e-05, + "loss": 0.4127, + "step": 9475 + }, + { + "epoch": 1.363244176013805, + "grad_norm": 0.2559483678996027, + "learning_rate": 1.1427213438648636e-05, + "loss": 0.4128, + "step": 9480 + }, + { + "epoch": 1.3639631866551625, + "grad_norm": 0.2509038564055273, + "learning_rate": 1.1419760652738498e-05, + "loss": 0.4253, + "step": 9485 + }, + { + "epoch": 1.36468219729652, + "grad_norm": 0.26408296167827605, + "learning_rate": 1.1412307061926396e-05, + "loss": 0.4242, + "step": 9490 + }, + { + "epoch": 1.3654012079378774, + "grad_norm": 0.24820214493663295, + "learning_rate": 1.140485267043798e-05, + "loss": 0.4198, + "step": 9495 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.24623099519803363, + "learning_rate": 1.1397397482499352e-05, + "loss": 0.4192, + "step": 9500 + }, + { + "epoch": 1.3668392292205924, + "grad_norm": 0.2541389919088227, + "learning_rate": 1.1389941502337063e-05, + "loss": 0.4114, + "step": 9505 + }, + { + "epoch": 1.36755823986195, + "grad_norm": 0.25804970955333406, + "learning_rate": 1.138248473417812e-05, + "loss": 0.4182, + "step": 9510 + }, + { + "epoch": 1.3682772505033074, + "grad_norm": 0.2417742930782717, + "learning_rate": 1.1375027182249971e-05, + "loss": 0.4231, + "step": 9515 + }, + { + "epoch": 1.3689962611446649, + "grad_norm": 0.2690508189851924, + "learning_rate": 1.1367568850780511e-05, + "loss": 0.4412, + "step": 9520 + }, + { + "epoch": 1.3697152717860224, + "grad_norm": 0.26211289218773653, + "learning_rate": 1.1360109743998075e-05, + "loss": 0.4319, + "step": 9525 + }, + { + "epoch": 1.3704342824273799, + "grad_norm": 0.25732312239806404, + "learning_rate": 1.1352649866131447e-05, + "loss": 0.4102, + "step": 9530 + }, + { + "epoch": 1.3711532930687373, + "grad_norm": 0.25182004898998794, + "learning_rate": 1.1345189221409828e-05, + "loss": 0.4109, + "step": 9535 + }, + { + "epoch": 1.3718723037100948, + "grad_norm": 0.24814394052739985, + "learning_rate": 1.133772781406287e-05, + "loss": 0.4182, + "step": 9540 + }, + { + "epoch": 1.3725913143514523, + "grad_norm": 0.25001805051958936, + "learning_rate": 1.133026564832066e-05, + "loss": 0.4153, + "step": 9545 + }, + { + "epoch": 1.37331032499281, + "grad_norm": 0.24659292069576322, + "learning_rate": 1.13228027284137e-05, + "loss": 0.4141, + "step": 9550 + }, + { + "epoch": 1.3740293356341673, + "grad_norm": 0.24357609150858484, + "learning_rate": 1.131533905857293e-05, + "loss": 0.4301, + "step": 9555 + }, + { + "epoch": 1.374748346275525, + "grad_norm": 0.2575243150984694, + "learning_rate": 1.1307874643029715e-05, + "loss": 0.4189, + "step": 9560 + }, + { + "epoch": 1.3754673569168823, + "grad_norm": 0.24769625836215414, + "learning_rate": 1.1300409486015837e-05, + "loss": 0.4251, + "step": 9565 + }, + { + "epoch": 1.37618636755824, + "grad_norm": 0.2522589458200581, + "learning_rate": 1.1292943591763506e-05, + "loss": 0.4152, + "step": 9570 + }, + { + "epoch": 1.3769053781995972, + "grad_norm": 0.2533970724785977, + "learning_rate": 1.1285476964505341e-05, + "loss": 0.4109, + "step": 9575 + }, + { + "epoch": 1.377624388840955, + "grad_norm": 0.24809731784067882, + "learning_rate": 1.1278009608474389e-05, + "loss": 0.426, + "step": 9580 + }, + { + "epoch": 1.3783433994823122, + "grad_norm": 0.24430656121325212, + "learning_rate": 1.1270541527904098e-05, + "loss": 0.4233, + "step": 9585 + }, + { + "epoch": 1.37906241012367, + "grad_norm": 0.24883279197937416, + "learning_rate": 1.1263072727028325e-05, + "loss": 0.4131, + "step": 9590 + }, + { + "epoch": 1.3797814207650272, + "grad_norm": 0.24854922226819207, + "learning_rate": 1.1255603210081358e-05, + "loss": 0.4103, + "step": 9595 + }, + { + "epoch": 1.380500431406385, + "grad_norm": 0.25070734498380887, + "learning_rate": 1.1248132981297858e-05, + "loss": 0.4332, + "step": 9600 + }, + { + "epoch": 1.3812194420477424, + "grad_norm": 0.24205854629117363, + "learning_rate": 1.1240662044912917e-05, + "loss": 0.4062, + "step": 9605 + }, + { + "epoch": 1.3819384526890999, + "grad_norm": 0.24975559484913917, + "learning_rate": 1.1233190405162014e-05, + "loss": 0.4234, + "step": 9610 + }, + { + "epoch": 1.3826574633304574, + "grad_norm": 0.255512768099968, + "learning_rate": 1.1225718066281029e-05, + "loss": 0.437, + "step": 9615 + }, + { + "epoch": 1.3833764739718148, + "grad_norm": 0.25492180126484526, + "learning_rate": 1.1218245032506241e-05, + "loss": 0.4227, + "step": 9620 + }, + { + "epoch": 1.3840954846131723, + "grad_norm": 0.25878749808087337, + "learning_rate": 1.1210771308074321e-05, + "loss": 0.4181, + "step": 9625 + }, + { + "epoch": 1.3848144952545298, + "grad_norm": 0.2516230476695916, + "learning_rate": 1.1203296897222335e-05, + "loss": 0.4273, + "step": 9630 + }, + { + "epoch": 1.3855335058958873, + "grad_norm": 0.24514289255287167, + "learning_rate": 1.119582180418773e-05, + "loss": 0.4333, + "step": 9635 + }, + { + "epoch": 1.3862525165372448, + "grad_norm": 0.2556132386527134, + "learning_rate": 1.1188346033208349e-05, + "loss": 0.4116, + "step": 9640 + }, + { + "epoch": 1.3869715271786023, + "grad_norm": 0.25596622811491904, + "learning_rate": 1.1180869588522415e-05, + "loss": 0.4357, + "step": 9645 + }, + { + "epoch": 1.3876905378199598, + "grad_norm": 0.24824004793430757, + "learning_rate": 1.1173392474368532e-05, + "loss": 0.4123, + "step": 9650 + }, + { + "epoch": 1.3884095484613173, + "grad_norm": 0.24390818336880687, + "learning_rate": 1.1165914694985684e-05, + "loss": 0.4114, + "step": 9655 + }, + { + "epoch": 1.3891285591026747, + "grad_norm": 0.24472961261740755, + "learning_rate": 1.1158436254613237e-05, + "loss": 0.4106, + "step": 9660 + }, + { + "epoch": 1.3898475697440322, + "grad_norm": 0.2562298705368656, + "learning_rate": 1.1150957157490922e-05, + "loss": 0.4194, + "step": 9665 + }, + { + "epoch": 1.3905665803853897, + "grad_norm": 0.25020129785017103, + "learning_rate": 1.114347740785885e-05, + "loss": 0.4197, + "step": 9670 + }, + { + "epoch": 1.3912855910267472, + "grad_norm": 0.2669599095249636, + "learning_rate": 1.1135997009957504e-05, + "loss": 0.4206, + "step": 9675 + }, + { + "epoch": 1.3920046016681047, + "grad_norm": 0.26084835531794537, + "learning_rate": 1.1128515968027729e-05, + "loss": 0.4285, + "step": 9680 + }, + { + "epoch": 1.3927236123094622, + "grad_norm": 0.24661487482958494, + "learning_rate": 1.112103428631073e-05, + "loss": 0.4266, + "step": 9685 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.25431214598260804, + "learning_rate": 1.1113551969048088e-05, + "loss": 0.4391, + "step": 9690 + }, + { + "epoch": 1.3941616335921772, + "grad_norm": 0.26744876168382925, + "learning_rate": 1.1106069020481738e-05, + "loss": 0.4286, + "step": 9695 + }, + { + "epoch": 1.3948806442335346, + "grad_norm": 0.2536205533902165, + "learning_rate": 1.1098585444853969e-05, + "loss": 0.4091, + "step": 9700 + }, + { + "epoch": 1.3955996548748921, + "grad_norm": 0.2526351650046927, + "learning_rate": 1.1091101246407431e-05, + "loss": 0.4234, + "step": 9705 + }, + { + "epoch": 1.3963186655162496, + "grad_norm": 0.24987142557874195, + "learning_rate": 1.1083616429385125e-05, + "loss": 0.395, + "step": 9710 + }, + { + "epoch": 1.397037676157607, + "grad_norm": 0.2500600739779468, + "learning_rate": 1.1076130998030401e-05, + "loss": 0.4114, + "step": 9715 + }, + { + "epoch": 1.3977566867989646, + "grad_norm": 0.25971817692692917, + "learning_rate": 1.106864495658696e-05, + "loss": 0.4156, + "step": 9720 + }, + { + "epoch": 1.398475697440322, + "grad_norm": 0.24390586391176858, + "learning_rate": 1.106115830929885e-05, + "loss": 0.4215, + "step": 9725 + }, + { + "epoch": 1.3991947080816796, + "grad_norm": 0.2572365424296446, + "learning_rate": 1.105367106041046e-05, + "loss": 0.4264, + "step": 9730 + }, + { + "epoch": 1.399913718723037, + "grad_norm": 0.2531752093277882, + "learning_rate": 1.1046183214166515e-05, + "loss": 0.4308, + "step": 9735 + }, + { + "epoch": 1.4006327293643945, + "grad_norm": 0.310930690806812, + "learning_rate": 1.1038694774812091e-05, + "loss": 0.4036, + "step": 9740 + }, + { + "epoch": 1.401351740005752, + "grad_norm": 0.24453563822550906, + "learning_rate": 1.1031205746592593e-05, + "loss": 0.4019, + "step": 9745 + }, + { + "epoch": 1.4020707506471095, + "grad_norm": 0.24514692054203863, + "learning_rate": 1.1023716133753758e-05, + "loss": 0.417, + "step": 9750 + }, + { + "epoch": 1.402789761288467, + "grad_norm": 0.267653217070886, + "learning_rate": 1.1016225940541654e-05, + "loss": 0.4208, + "step": 9755 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 0.24405169547485742, + "learning_rate": 1.1008735171202685e-05, + "loss": 0.4348, + "step": 9760 + }, + { + "epoch": 1.404227782571182, + "grad_norm": 0.2592343425587176, + "learning_rate": 1.1001243829983575e-05, + "loss": 0.4284, + "step": 9765 + }, + { + "epoch": 1.4049467932125395, + "grad_norm": 0.26906403680949453, + "learning_rate": 1.0993751921131375e-05, + "loss": 0.4178, + "step": 9770 + }, + { + "epoch": 1.4056658038538972, + "grad_norm": 0.24745255746857836, + "learning_rate": 1.098625944889346e-05, + "loss": 0.4232, + "step": 9775 + }, + { + "epoch": 1.4063848144952544, + "grad_norm": 0.24632817658584125, + "learning_rate": 1.097876641751752e-05, + "loss": 0.4149, + "step": 9780 + }, + { + "epoch": 1.4071038251366121, + "grad_norm": 0.2950557010074856, + "learning_rate": 1.0971272831251557e-05, + "loss": 0.4328, + "step": 9785 + }, + { + "epoch": 1.4078228357779694, + "grad_norm": 0.25024781565923276, + "learning_rate": 1.0963778694343908e-05, + "loss": 0.4059, + "step": 9790 + }, + { + "epoch": 1.4085418464193271, + "grad_norm": 0.2597517594562987, + "learning_rate": 1.0956284011043199e-05, + "loss": 0.4194, + "step": 9795 + }, + { + "epoch": 1.4092608570606844, + "grad_norm": 0.26603567927424965, + "learning_rate": 1.094878878559838e-05, + "loss": 0.4193, + "step": 9800 + }, + { + "epoch": 1.409979867702042, + "grad_norm": 0.24432102690114063, + "learning_rate": 1.0941293022258697e-05, + "loss": 0.397, + "step": 9805 + }, + { + "epoch": 1.4106988783433994, + "grad_norm": 0.25657527895947746, + "learning_rate": 1.093379672527371e-05, + "loss": 0.4076, + "step": 9810 + }, + { + "epoch": 1.411417888984757, + "grad_norm": 0.2604569361478616, + "learning_rate": 1.0926299898893284e-05, + "loss": 0.4036, + "step": 9815 + }, + { + "epoch": 1.4121368996261143, + "grad_norm": 0.24733722138877323, + "learning_rate": 1.0918802547367575e-05, + "loss": 0.4177, + "step": 9820 + }, + { + "epoch": 1.412855910267472, + "grad_norm": 0.2443676689862657, + "learning_rate": 1.0911304674947043e-05, + "loss": 0.429, + "step": 9825 + }, + { + "epoch": 1.4135749209088295, + "grad_norm": 0.2498505339062968, + "learning_rate": 1.0903806285882441e-05, + "loss": 0.4248, + "step": 9830 + }, + { + "epoch": 1.414293931550187, + "grad_norm": 0.2673976560861366, + "learning_rate": 1.089630738442481e-05, + "loss": 0.414, + "step": 9835 + }, + { + "epoch": 1.4150129421915445, + "grad_norm": 0.24990001920378563, + "learning_rate": 1.0888807974825496e-05, + "loss": 0.4138, + "step": 9840 + }, + { + "epoch": 1.415731952832902, + "grad_norm": 0.2539166543872164, + "learning_rate": 1.088130806133612e-05, + "loss": 0.4048, + "step": 9845 + }, + { + "epoch": 1.4164509634742595, + "grad_norm": 0.2440319611051078, + "learning_rate": 1.0873807648208587e-05, + "loss": 0.4264, + "step": 9850 + }, + { + "epoch": 1.417169974115617, + "grad_norm": 0.24993266032512565, + "learning_rate": 1.0866306739695097e-05, + "loss": 0.4138, + "step": 9855 + }, + { + "epoch": 1.4178889847569744, + "grad_norm": 0.24901540914858042, + "learning_rate": 1.0858805340048121e-05, + "loss": 0.4342, + "step": 9860 + }, + { + "epoch": 1.418607995398332, + "grad_norm": 0.2476975113243009, + "learning_rate": 1.0851303453520414e-05, + "loss": 0.4061, + "step": 9865 + }, + { + "epoch": 1.4193270060396894, + "grad_norm": 0.2581664703794402, + "learning_rate": 1.0843801084365004e-05, + "loss": 0.4074, + "step": 9870 + }, + { + "epoch": 1.420046016681047, + "grad_norm": 0.252494581026853, + "learning_rate": 1.0836298236835197e-05, + "loss": 0.4163, + "step": 9875 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 0.2508404738384896, + "learning_rate": 1.0828794915184556e-05, + "loss": 0.4096, + "step": 9880 + }, + { + "epoch": 1.4214840379637619, + "grad_norm": 0.24030733519590325, + "learning_rate": 1.0821291123666939e-05, + "loss": 0.4192, + "step": 9885 + }, + { + "epoch": 1.4222030486051194, + "grad_norm": 0.2526576940274779, + "learning_rate": 1.0813786866536445e-05, + "loss": 0.4081, + "step": 9890 + }, + { + "epoch": 1.4229220592464769, + "grad_norm": 0.24985206673472138, + "learning_rate": 1.0806282148047448e-05, + "loss": 0.4172, + "step": 9895 + }, + { + "epoch": 1.4236410698878343, + "grad_norm": 0.24422182429307604, + "learning_rate": 1.0798776972454586e-05, + "loss": 0.4007, + "step": 9900 + }, + { + "epoch": 1.4243600805291918, + "grad_norm": 0.2570413067331986, + "learning_rate": 1.0791271344012748e-05, + "loss": 0.4173, + "step": 9905 + }, + { + "epoch": 1.4250790911705493, + "grad_norm": 0.2507723594504243, + "learning_rate": 1.0783765266977088e-05, + "loss": 0.4073, + "step": 9910 + }, + { + "epoch": 1.4257981018119068, + "grad_norm": 0.2524376425793629, + "learning_rate": 1.077625874560301e-05, + "loss": 0.4324, + "step": 9915 + }, + { + "epoch": 1.4265171124532643, + "grad_norm": 0.26484532697084445, + "learning_rate": 1.076875178414617e-05, + "loss": 0.4131, + "step": 9920 + }, + { + "epoch": 1.4272361230946218, + "grad_norm": 0.25445991101378906, + "learning_rate": 1.0761244386862475e-05, + "loss": 0.3948, + "step": 9925 + }, + { + "epoch": 1.4279551337359793, + "grad_norm": 0.2721341341876498, + "learning_rate": 1.0753736558008074e-05, + "loss": 0.4077, + "step": 9930 + }, + { + "epoch": 1.4286741443773368, + "grad_norm": 0.2527863317405902, + "learning_rate": 1.074622830183937e-05, + "loss": 0.4266, + "step": 9935 + }, + { + "epoch": 1.4293931550186942, + "grad_norm": 0.25451031488199266, + "learning_rate": 1.0738719622613e-05, + "loss": 0.4238, + "step": 9940 + }, + { + "epoch": 1.4301121656600517, + "grad_norm": 0.2552071960367096, + "learning_rate": 1.0731210524585852e-05, + "loss": 0.4155, + "step": 9945 + }, + { + "epoch": 1.4308311763014092, + "grad_norm": 0.24824533306246047, + "learning_rate": 1.0723701012015032e-05, + "loss": 0.4094, + "step": 9950 + }, + { + "epoch": 1.4315501869427667, + "grad_norm": 0.2498459830748846, + "learning_rate": 1.0716191089157895e-05, + "loss": 0.4224, + "step": 9955 + }, + { + "epoch": 1.4322691975841242, + "grad_norm": 0.2500368139104141, + "learning_rate": 1.070868076027203e-05, + "loss": 0.4138, + "step": 9960 + }, + { + "epoch": 1.4329882082254817, + "grad_norm": 0.2657392868538405, + "learning_rate": 1.0701170029615248e-05, + "loss": 0.4229, + "step": 9965 + }, + { + "epoch": 1.4337072188668392, + "grad_norm": 0.2451600737041558, + "learning_rate": 1.0693658901445596e-05, + "loss": 0.4054, + "step": 9970 + }, + { + "epoch": 1.4344262295081966, + "grad_norm": 0.27267755056042187, + "learning_rate": 1.0686147380021343e-05, + "loss": 0.4148, + "step": 9975 + }, + { + "epoch": 1.4351452401495541, + "grad_norm": 0.2565989948882745, + "learning_rate": 1.0678635469600974e-05, + "loss": 0.4042, + "step": 9980 + }, + { + "epoch": 1.4358642507909116, + "grad_norm": 0.2579820399597762, + "learning_rate": 1.0671123174443205e-05, + "loss": 0.4265, + "step": 9985 + }, + { + "epoch": 1.4365832614322693, + "grad_norm": 0.2551052561872372, + "learning_rate": 1.0663610498806967e-05, + "loss": 0.4129, + "step": 9990 + }, + { + "epoch": 1.4373022720736266, + "grad_norm": 0.26506244582250904, + "learning_rate": 1.0656097446951405e-05, + "loss": 0.4019, + "step": 9995 + }, + { + "epoch": 1.4380212827149843, + "grad_norm": 0.2563926253213933, + "learning_rate": 1.0648584023135878e-05, + "loss": 0.4259, + "step": 10000 + }, + { + "epoch": 1.4387402933563416, + "grad_norm": 0.23996561192295213, + "learning_rate": 1.064107023161995e-05, + "loss": 0.402, + "step": 10005 + }, + { + "epoch": 1.4394593039976993, + "grad_norm": 0.2609602746289666, + "learning_rate": 1.063355607666341e-05, + "loss": 0.433, + "step": 10010 + }, + { + "epoch": 1.4401783146390565, + "grad_norm": 0.2572423732120433, + "learning_rate": 1.0626041562526232e-05, + "loss": 0.4144, + "step": 10015 + }, + { + "epoch": 1.4408973252804143, + "grad_norm": 0.2532322139448099, + "learning_rate": 1.0618526693468611e-05, + "loss": 0.4104, + "step": 10020 + }, + { + "epoch": 1.4416163359217715, + "grad_norm": 0.256513588435813, + "learning_rate": 1.0611011473750932e-05, + "loss": 0.4284, + "step": 10025 + }, + { + "epoch": 1.4423353465631292, + "grad_norm": 0.2512556503761663, + "learning_rate": 1.0603495907633785e-05, + "loss": 0.4167, + "step": 10030 + }, + { + "epoch": 1.4430543572044865, + "grad_norm": 0.2612955230091824, + "learning_rate": 1.0595979999377953e-05, + "loss": 0.4303, + "step": 10035 + }, + { + "epoch": 1.4437733678458442, + "grad_norm": 0.2389982500909746, + "learning_rate": 1.0588463753244419e-05, + "loss": 0.4081, + "step": 10040 + }, + { + "epoch": 1.4444923784872017, + "grad_norm": 0.24034615966848993, + "learning_rate": 1.0580947173494344e-05, + "loss": 0.4168, + "step": 10045 + }, + { + "epoch": 1.4452113891285592, + "grad_norm": 0.25396503366625184, + "learning_rate": 1.0573430264389095e-05, + "loss": 0.4172, + "step": 10050 + }, + { + "epoch": 1.4459303997699167, + "grad_norm": 0.25299540098124423, + "learning_rate": 1.056591303019021e-05, + "loss": 0.4226, + "step": 10055 + }, + { + "epoch": 1.4466494104112742, + "grad_norm": 0.2538181787663044, + "learning_rate": 1.0558395475159429e-05, + "loss": 0.4181, + "step": 10060 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.2583341815420175, + "learning_rate": 1.0550877603558656e-05, + "loss": 0.4178, + "step": 10065 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.24865034683634282, + "learning_rate": 1.0543359419649986e-05, + "loss": 0.402, + "step": 10070 + }, + { + "epoch": 1.4488064423353466, + "grad_norm": 0.266590676113421, + "learning_rate": 1.0535840927695684e-05, + "loss": 0.4358, + "step": 10075 + }, + { + "epoch": 1.449525452976704, + "grad_norm": 0.25905004609181675, + "learning_rate": 1.0528322131958198e-05, + "loss": 0.4041, + "step": 10080 + }, + { + "epoch": 1.4502444636180616, + "grad_norm": 0.25026534058827304, + "learning_rate": 1.0520803036700138e-05, + "loss": 0.4233, + "step": 10085 + }, + { + "epoch": 1.450963474259419, + "grad_norm": 0.25318916934109487, + "learning_rate": 1.0513283646184297e-05, + "loss": 0.4269, + "step": 10090 + }, + { + "epoch": 1.4516824849007766, + "grad_norm": 0.2644612965612271, + "learning_rate": 1.0505763964673617e-05, + "loss": 0.4169, + "step": 10095 + }, + { + "epoch": 1.452401495542134, + "grad_norm": 0.2461679414258703, + "learning_rate": 1.049824399643122e-05, + "loss": 0.4077, + "step": 10100 + }, + { + "epoch": 1.4531205061834915, + "grad_norm": 0.2531626796998085, + "learning_rate": 1.0490723745720387e-05, + "loss": 0.4112, + "step": 10105 + }, + { + "epoch": 1.453839516824849, + "grad_norm": 0.2503388208313446, + "learning_rate": 1.0483203216804562e-05, + "loss": 0.417, + "step": 10110 + }, + { + "epoch": 1.4545585274662065, + "grad_norm": 0.24133257517578935, + "learning_rate": 1.0475682413947337e-05, + "loss": 0.4283, + "step": 10115 + }, + { + "epoch": 1.455277538107564, + "grad_norm": 0.2447893414574943, + "learning_rate": 1.0468161341412466e-05, + "loss": 0.4137, + "step": 10120 + }, + { + "epoch": 1.4559965487489215, + "grad_norm": 0.2540357859067168, + "learning_rate": 1.0460640003463855e-05, + "loss": 0.4349, + "step": 10125 + }, + { + "epoch": 1.456715559390279, + "grad_norm": 0.237597538775147, + "learning_rate": 1.0453118404365563e-05, + "loss": 0.4034, + "step": 10130 + }, + { + "epoch": 1.4574345700316365, + "grad_norm": 0.2496607379914245, + "learning_rate": 1.0445596548381793e-05, + "loss": 0.4168, + "step": 10135 + }, + { + "epoch": 1.458153580672994, + "grad_norm": 0.2701412787204999, + "learning_rate": 1.0438074439776895e-05, + "loss": 0.4158, + "step": 10140 + }, + { + "epoch": 1.4588725913143514, + "grad_norm": 0.25006441637973104, + "learning_rate": 1.0430552082815363e-05, + "loss": 0.4039, + "step": 10145 + }, + { + "epoch": 1.459591601955709, + "grad_norm": 0.2633613283531685, + "learning_rate": 1.0423029481761831e-05, + "loss": 0.4235, + "step": 10150 + }, + { + "epoch": 1.4603106125970664, + "grad_norm": 0.24826147447761096, + "learning_rate": 1.0415506640881068e-05, + "loss": 0.4246, + "step": 10155 + }, + { + "epoch": 1.461029623238424, + "grad_norm": 0.2613503948237542, + "learning_rate": 1.0407983564437992e-05, + "loss": 0.4144, + "step": 10160 + }, + { + "epoch": 1.4617486338797814, + "grad_norm": 0.24539329475298266, + "learning_rate": 1.0400460256697638e-05, + "loss": 0.4282, + "step": 10165 + }, + { + "epoch": 1.4624676445211389, + "grad_norm": 0.2505553555477828, + "learning_rate": 1.0392936721925178e-05, + "loss": 0.4341, + "step": 10170 + }, + { + "epoch": 1.4631866551624964, + "grad_norm": 0.2515603381742112, + "learning_rate": 1.0385412964385916e-05, + "loss": 0.4321, + "step": 10175 + }, + { + "epoch": 1.4639056658038538, + "grad_norm": 0.2501549781754998, + "learning_rate": 1.0377888988345283e-05, + "loss": 0.4056, + "step": 10180 + }, + { + "epoch": 1.4646246764452113, + "grad_norm": 0.24808066699770787, + "learning_rate": 1.037036479806883e-05, + "loss": 0.4278, + "step": 10185 + }, + { + "epoch": 1.4653436870865688, + "grad_norm": 0.2554829719204167, + "learning_rate": 1.0362840397822228e-05, + "loss": 0.4249, + "step": 10190 + }, + { + "epoch": 1.4660626977279263, + "grad_norm": 0.25503395067806245, + "learning_rate": 1.0355315791871275e-05, + "loss": 0.425, + "step": 10195 + }, + { + "epoch": 1.4667817083692838, + "grad_norm": 0.2516936218004914, + "learning_rate": 1.0347790984481868e-05, + "loss": 0.4165, + "step": 10200 + }, + { + "epoch": 1.4675007190106413, + "grad_norm": 0.25736666923114426, + "learning_rate": 1.0340265979920047e-05, + "loss": 0.4205, + "step": 10205 + }, + { + "epoch": 1.4682197296519988, + "grad_norm": 0.2370907375593564, + "learning_rate": 1.0332740782451936e-05, + "loss": 0.3983, + "step": 10210 + }, + { + "epoch": 1.4689387402933565, + "grad_norm": 0.26807117481941795, + "learning_rate": 1.0325215396343782e-05, + "loss": 0.4176, + "step": 10215 + }, + { + "epoch": 1.4696577509347137, + "grad_norm": 0.25537168710013947, + "learning_rate": 1.031768982586194e-05, + "loss": 0.4271, + "step": 10220 + }, + { + "epoch": 1.4703767615760714, + "grad_norm": 0.2498004049578793, + "learning_rate": 1.031016407527286e-05, + "loss": 0.416, + "step": 10225 + }, + { + "epoch": 1.4710957722174287, + "grad_norm": 0.2518779383607219, + "learning_rate": 1.0302638148843105e-05, + "loss": 0.4288, + "step": 10230 + }, + { + "epoch": 1.4718147828587864, + "grad_norm": 0.25224022010410896, + "learning_rate": 1.0295112050839331e-05, + "loss": 0.4137, + "step": 10235 + }, + { + "epoch": 1.4725337935001437, + "grad_norm": 0.2652334016292149, + "learning_rate": 1.0287585785528298e-05, + "loss": 0.4168, + "step": 10240 + }, + { + "epoch": 1.4732528041415014, + "grad_norm": 0.2592847480057149, + "learning_rate": 1.0280059357176846e-05, + "loss": 0.4346, + "step": 10245 + }, + { + "epoch": 1.4739718147828587, + "grad_norm": 0.26403942909123734, + "learning_rate": 1.0272532770051924e-05, + "loss": 0.4163, + "step": 10250 + }, + { + "epoch": 1.4746908254242164, + "grad_norm": 0.2533517614216728, + "learning_rate": 1.0265006028420565e-05, + "loss": 0.403, + "step": 10255 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.24750296558877358, + "learning_rate": 1.0257479136549889e-05, + "loss": 0.4081, + "step": 10260 + }, + { + "epoch": 1.4761288467069313, + "grad_norm": 0.244400346489122, + "learning_rate": 1.0249952098707096e-05, + "loss": 0.4179, + "step": 10265 + }, + { + "epoch": 1.4768478573482888, + "grad_norm": 0.24817124170891883, + "learning_rate": 1.024242491915948e-05, + "loss": 0.3997, + "step": 10270 + }, + { + "epoch": 1.4775668679896463, + "grad_norm": 0.25013198917785917, + "learning_rate": 1.0234897602174405e-05, + "loss": 0.4209, + "step": 10275 + }, + { + "epoch": 1.4782858786310038, + "grad_norm": 0.25576260336241796, + "learning_rate": 1.022737015201932e-05, + "loss": 0.4061, + "step": 10280 + }, + { + "epoch": 1.4790048892723613, + "grad_norm": 0.2584500349591197, + "learning_rate": 1.0219842572961747e-05, + "loss": 0.4246, + "step": 10285 + }, + { + "epoch": 1.4797238999137188, + "grad_norm": 0.24933293916123048, + "learning_rate": 1.0212314869269282e-05, + "loss": 0.4147, + "step": 10290 + }, + { + "epoch": 1.4804429105550763, + "grad_norm": 0.24627965296883847, + "learning_rate": 1.0204787045209583e-05, + "loss": 0.4077, + "step": 10295 + }, + { + "epoch": 1.4811619211964338, + "grad_norm": 0.2535039331883902, + "learning_rate": 1.019725910505039e-05, + "loss": 0.4324, + "step": 10300 + }, + { + "epoch": 1.4818809318377912, + "grad_norm": 0.2597546595277921, + "learning_rate": 1.0189731053059504e-05, + "loss": 0.427, + "step": 10305 + }, + { + "epoch": 1.4825999424791487, + "grad_norm": 0.26682658026370226, + "learning_rate": 1.0182202893504784e-05, + "loss": 0.4114, + "step": 10310 + }, + { + "epoch": 1.4833189531205062, + "grad_norm": 0.25604836937005826, + "learning_rate": 1.0174674630654156e-05, + "loss": 0.3984, + "step": 10315 + }, + { + "epoch": 1.4840379637618637, + "grad_norm": 0.2702117005184134, + "learning_rate": 1.0167146268775601e-05, + "loss": 0.4182, + "step": 10320 + }, + { + "epoch": 1.4847569744032212, + "grad_norm": 0.2640341573786068, + "learning_rate": 1.0159617812137157e-05, + "loss": 0.414, + "step": 10325 + }, + { + "epoch": 1.4854759850445787, + "grad_norm": 0.26282420196086725, + "learning_rate": 1.0152089265006916e-05, + "loss": 0.4285, + "step": 10330 + }, + { + "epoch": 1.4861949956859362, + "grad_norm": 0.2553042564800194, + "learning_rate": 1.0144560631653026e-05, + "loss": 0.4222, + "step": 10335 + }, + { + "epoch": 1.4869140063272936, + "grad_norm": 0.25678271676292025, + "learning_rate": 1.0137031916343681e-05, + "loss": 0.422, + "step": 10340 + }, + { + "epoch": 1.4876330169686511, + "grad_norm": 0.2405527517769915, + "learning_rate": 1.0129503123347108e-05, + "loss": 0.4296, + "step": 10345 + }, + { + "epoch": 1.4883520276100086, + "grad_norm": 0.24107070336536038, + "learning_rate": 1.01219742569316e-05, + "loss": 0.4115, + "step": 10350 + }, + { + "epoch": 1.489071038251366, + "grad_norm": 0.26737435669796555, + "learning_rate": 1.0114445321365483e-05, + "loss": 0.4293, + "step": 10355 + }, + { + "epoch": 1.4897900488927236, + "grad_norm": 0.2448869318476463, + "learning_rate": 1.0106916320917113e-05, + "loss": 0.4269, + "step": 10360 + }, + { + "epoch": 1.490509059534081, + "grad_norm": 0.2512691007866713, + "learning_rate": 1.0099387259854897e-05, + "loss": 0.4234, + "step": 10365 + }, + { + "epoch": 1.4912280701754386, + "grad_norm": 0.25139839806292674, + "learning_rate": 1.0091858142447266e-05, + "loss": 0.418, + "step": 10370 + }, + { + "epoch": 1.491947080816796, + "grad_norm": 0.25818314693975347, + "learning_rate": 1.008432897296269e-05, + "loss": 0.4091, + "step": 10375 + }, + { + "epoch": 1.4926660914581535, + "grad_norm": 0.24946522475818825, + "learning_rate": 1.0076799755669662e-05, + "loss": 0.4191, + "step": 10380 + }, + { + "epoch": 1.493385102099511, + "grad_norm": 0.24475456689761105, + "learning_rate": 1.0069270494836709e-05, + "loss": 0.4108, + "step": 10385 + }, + { + "epoch": 1.4941041127408685, + "grad_norm": 0.24666501837527802, + "learning_rate": 1.006174119473238e-05, + "loss": 0.4177, + "step": 10390 + }, + { + "epoch": 1.494823123382226, + "grad_norm": 0.24476633350362695, + "learning_rate": 1.0054211859625238e-05, + "loss": 0.4188, + "step": 10395 + }, + { + "epoch": 1.4955421340235835, + "grad_norm": 0.24979308411343795, + "learning_rate": 1.0046682493783881e-05, + "loss": 0.406, + "step": 10400 + }, + { + "epoch": 1.496261144664941, + "grad_norm": 0.26279919228449894, + "learning_rate": 1.0039153101476919e-05, + "loss": 0.4297, + "step": 10405 + }, + { + "epoch": 1.4969801553062985, + "grad_norm": 0.24398339297174304, + "learning_rate": 1.0031623686972967e-05, + "loss": 0.4114, + "step": 10410 + }, + { + "epoch": 1.497699165947656, + "grad_norm": 0.27099933558193773, + "learning_rate": 1.0024094254540665e-05, + "loss": 0.4303, + "step": 10415 + }, + { + "epoch": 1.4984181765890134, + "grad_norm": 0.2601120818746926, + "learning_rate": 1.0016564808448655e-05, + "loss": 0.4263, + "step": 10420 + }, + { + "epoch": 1.499137187230371, + "grad_norm": 0.2576834197103154, + "learning_rate": 1.0009035352965593e-05, + "loss": 0.4166, + "step": 10425 + }, + { + "epoch": 1.4998561978717286, + "grad_norm": 0.2434421252623038, + "learning_rate": 1.0001505892360138e-05, + "loss": 0.4131, + "step": 10430 + }, + { + "epoch": 1.500575208513086, + "grad_norm": 0.2427517789901842, + "learning_rate": 9.993976430900951e-06, + "loss": 0.4303, + "step": 10435 + }, + { + "epoch": 1.5012942191544436, + "grad_norm": 0.2593244896531974, + "learning_rate": 9.98644697285669e-06, + "loss": 0.4238, + "step": 10440 + }, + { + "epoch": 1.5020132297958009, + "grad_norm": 0.25119219381398794, + "learning_rate": 9.978917522496021e-06, + "loss": 0.4257, + "step": 10445 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 0.25106940031364017, + "learning_rate": 9.9713880840876e-06, + "loss": 0.4167, + "step": 10450 + }, + { + "epoch": 1.5034512510785158, + "grad_norm": 0.26566863198644425, + "learning_rate": 9.96385866190007e-06, + "loss": 0.4269, + "step": 10455 + }, + { + "epoch": 1.5041702617198736, + "grad_norm": 0.25510783732412645, + "learning_rate": 9.956329260202076e-06, + "loss": 0.425, + "step": 10460 + }, + { + "epoch": 1.5048892723612308, + "grad_norm": 0.24346425747421713, + "learning_rate": 9.948799883262241e-06, + "loss": 0.4273, + "step": 10465 + }, + { + "epoch": 1.5056082830025885, + "grad_norm": 0.24811318518506437, + "learning_rate": 9.941270535349184e-06, + "loss": 0.4271, + "step": 10470 + }, + { + "epoch": 1.5063272936439458, + "grad_norm": 0.24991918010004216, + "learning_rate": 9.9337412207315e-06, + "loss": 0.412, + "step": 10475 + }, + { + "epoch": 1.5070463042853035, + "grad_norm": 0.2505908963935783, + "learning_rate": 9.926211943677772e-06, + "loss": 0.404, + "step": 10480 + }, + { + "epoch": 1.5077653149266608, + "grad_norm": 0.253336551462878, + "learning_rate": 9.918682708456547e-06, + "loss": 0.3912, + "step": 10485 + }, + { + "epoch": 1.5084843255680185, + "grad_norm": 0.25830329953524983, + "learning_rate": 9.911153519336372e-06, + "loss": 0.4183, + "step": 10490 + }, + { + "epoch": 1.5092033362093757, + "grad_norm": 0.30529482298154637, + "learning_rate": 9.903624380585744e-06, + "loss": 0.4076, + "step": 10495 + }, + { + "epoch": 1.5099223468507335, + "grad_norm": 0.2583172817729028, + "learning_rate": 9.896095296473146e-06, + "loss": 0.4211, + "step": 10500 + }, + { + "epoch": 1.5106413574920907, + "grad_norm": 0.2379268205746049, + "learning_rate": 9.888566271267029e-06, + "loss": 0.4076, + "step": 10505 + }, + { + "epoch": 1.5113603681334484, + "grad_norm": 0.25206595841422097, + "learning_rate": 9.881037309235802e-06, + "loss": 0.4195, + "step": 10510 + }, + { + "epoch": 1.512079378774806, + "grad_norm": 0.25510472525985534, + "learning_rate": 9.87350841464785e-06, + "loss": 0.42, + "step": 10515 + }, + { + "epoch": 1.5127983894161634, + "grad_norm": 0.24854147718329755, + "learning_rate": 9.86597959177151e-06, + "loss": 0.4211, + "step": 10520 + }, + { + "epoch": 1.513517400057521, + "grad_norm": 0.25067813066623357, + "learning_rate": 9.858450844875077e-06, + "loss": 0.435, + "step": 10525 + }, + { + "epoch": 1.5142364106988784, + "grad_norm": 0.24763221009061093, + "learning_rate": 9.850922178226819e-06, + "loss": 0.406, + "step": 10530 + }, + { + "epoch": 1.5149554213402359, + "grad_norm": 0.2517774937888186, + "learning_rate": 9.843393596094943e-06, + "loss": 0.398, + "step": 10535 + }, + { + "epoch": 1.5156744319815934, + "grad_norm": 0.2581762643671444, + "learning_rate": 9.835865102747605e-06, + "loss": 0.4389, + "step": 10540 + }, + { + "epoch": 1.5163934426229508, + "grad_norm": 0.26229897737197194, + "learning_rate": 9.828336702452926e-06, + "loss": 0.4245, + "step": 10545 + }, + { + "epoch": 1.5171124532643083, + "grad_norm": 0.2478466974122416, + "learning_rate": 9.820808399478969e-06, + "loss": 0.4413, + "step": 10550 + }, + { + "epoch": 1.5178314639056658, + "grad_norm": 0.2613714789547285, + "learning_rate": 9.813280198093727e-06, + "loss": 0.4103, + "step": 10555 + }, + { + "epoch": 1.5185504745470233, + "grad_norm": 0.245430331715001, + "learning_rate": 9.805752102565162e-06, + "loss": 0.4106, + "step": 10560 + }, + { + "epoch": 1.5192694851883808, + "grad_norm": 0.251148558174525, + "learning_rate": 9.798224117161153e-06, + "loss": 0.4189, + "step": 10565 + }, + { + "epoch": 1.5199884958297383, + "grad_norm": 0.25693318875295806, + "learning_rate": 9.790696246149524e-06, + "loss": 0.4209, + "step": 10570 + }, + { + "epoch": 1.5207075064710958, + "grad_norm": 0.2591621988908957, + "learning_rate": 9.783168493798044e-06, + "loss": 0.4231, + "step": 10575 + }, + { + "epoch": 1.5214265171124532, + "grad_norm": 0.25604656716274005, + "learning_rate": 9.775640864374398e-06, + "loss": 0.4026, + "step": 10580 + }, + { + "epoch": 1.5221455277538107, + "grad_norm": 0.24739468808956666, + "learning_rate": 9.768113362146209e-06, + "loss": 0.4154, + "step": 10585 + }, + { + "epoch": 1.5228645383951682, + "grad_norm": 0.2593488710273111, + "learning_rate": 9.760585991381033e-06, + "loss": 0.4176, + "step": 10590 + }, + { + "epoch": 1.5235835490365257, + "grad_norm": 0.23504431821814106, + "learning_rate": 9.753058756346346e-06, + "loss": 0.4181, + "step": 10595 + }, + { + "epoch": 1.5243025596778832, + "grad_norm": 0.245640721324355, + "learning_rate": 9.745531661309544e-06, + "loss": 0.4423, + "step": 10600 + }, + { + "epoch": 1.5250215703192407, + "grad_norm": 0.25215427075046754, + "learning_rate": 9.738004710537953e-06, + "loss": 0.4388, + "step": 10605 + }, + { + "epoch": 1.5257405809605982, + "grad_norm": 0.24913812385524794, + "learning_rate": 9.730477908298806e-06, + "loss": 0.4136, + "step": 10610 + }, + { + "epoch": 1.5264595916019557, + "grad_norm": 0.26318609212029886, + "learning_rate": 9.722951258859261e-06, + "loss": 0.4229, + "step": 10615 + }, + { + "epoch": 1.5271786022433131, + "grad_norm": 0.24957565565496473, + "learning_rate": 9.715424766486385e-06, + "loss": 0.4183, + "step": 10620 + }, + { + "epoch": 1.5278976128846709, + "grad_norm": 0.2674208605810236, + "learning_rate": 9.707898435447153e-06, + "loss": 0.4159, + "step": 10625 + }, + { + "epoch": 1.5286166235260281, + "grad_norm": 0.26280131334403317, + "learning_rate": 9.70037227000846e-06, + "loss": 0.4257, + "step": 10630 + }, + { + "epoch": 1.5293356341673858, + "grad_norm": 0.24938256962028485, + "learning_rate": 9.692846274437095e-06, + "loss": 0.4181, + "step": 10635 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.25764954483156466, + "learning_rate": 9.68532045299975e-06, + "loss": 0.4291, + "step": 10640 + }, + { + "epoch": 1.5307736554501008, + "grad_norm": 0.2578862842527871, + "learning_rate": 9.677794809963034e-06, + "loss": 0.4169, + "step": 10645 + }, + { + "epoch": 1.531492666091458, + "grad_norm": 0.2476802687012682, + "learning_rate": 9.670269349593438e-06, + "loss": 0.4151, + "step": 10650 + }, + { + "epoch": 1.5322116767328158, + "grad_norm": 0.250049738063665, + "learning_rate": 9.662744076157353e-06, + "loss": 0.418, + "step": 10655 + }, + { + "epoch": 1.532930687374173, + "grad_norm": 0.24953016919389576, + "learning_rate": 9.655218993921072e-06, + "loss": 0.4181, + "step": 10660 + }, + { + "epoch": 1.5336496980155307, + "grad_norm": 0.25677559512208753, + "learning_rate": 9.647694107150773e-06, + "loss": 0.4138, + "step": 10665 + }, + { + "epoch": 1.534368708656888, + "grad_norm": 0.2518384392576241, + "learning_rate": 9.64016942011252e-06, + "loss": 0.4064, + "step": 10670 + }, + { + "epoch": 1.5350877192982457, + "grad_norm": 0.2626226969249233, + "learning_rate": 9.632644937072277e-06, + "loss": 0.417, + "step": 10675 + }, + { + "epoch": 1.535806729939603, + "grad_norm": 0.25414243414168186, + "learning_rate": 9.625120662295878e-06, + "loss": 0.4221, + "step": 10680 + }, + { + "epoch": 1.5365257405809607, + "grad_norm": 0.2598807696270767, + "learning_rate": 9.617596600049041e-06, + "loss": 0.4364, + "step": 10685 + }, + { + "epoch": 1.537244751222318, + "grad_norm": 0.25844805472902665, + "learning_rate": 9.610072754597373e-06, + "loss": 0.4351, + "step": 10690 + }, + { + "epoch": 1.5379637618636757, + "grad_norm": 0.2524970535348807, + "learning_rate": 9.602549130206353e-06, + "loss": 0.4059, + "step": 10695 + }, + { + "epoch": 1.538682772505033, + "grad_norm": 0.26275208611273376, + "learning_rate": 9.595025731141326e-06, + "loss": 0.4408, + "step": 10700 + }, + { + "epoch": 1.5394017831463906, + "grad_norm": 0.25292419698311214, + "learning_rate": 9.587502561667525e-06, + "loss": 0.4088, + "step": 10705 + }, + { + "epoch": 1.540120793787748, + "grad_norm": 0.24708841066490864, + "learning_rate": 9.579979626050043e-06, + "loss": 0.4069, + "step": 10710 + }, + { + "epoch": 1.5408398044291056, + "grad_norm": 0.2487555695805377, + "learning_rate": 9.572456928553836e-06, + "loss": 0.4065, + "step": 10715 + }, + { + "epoch": 1.5415588150704629, + "grad_norm": 0.2617493095764514, + "learning_rate": 9.564934473443742e-06, + "loss": 0.4093, + "step": 10720 + }, + { + "epoch": 1.5422778257118206, + "grad_norm": 0.24836706110965545, + "learning_rate": 9.557412264984444e-06, + "loss": 0.4065, + "step": 10725 + }, + { + "epoch": 1.542996836353178, + "grad_norm": 0.24844848518424625, + "learning_rate": 9.54989030744049e-06, + "loss": 0.4263, + "step": 10730 + }, + { + "epoch": 1.5437158469945356, + "grad_norm": 0.2525521654434998, + "learning_rate": 9.542368605076296e-06, + "loss": 0.4075, + "step": 10735 + }, + { + "epoch": 1.544434857635893, + "grad_norm": 0.24642249580899397, + "learning_rate": 9.534847162156115e-06, + "loss": 0.3918, + "step": 10740 + }, + { + "epoch": 1.5451538682772505, + "grad_norm": 0.24486119995221942, + "learning_rate": 9.52732598294407e-06, + "loss": 0.4073, + "step": 10745 + }, + { + "epoch": 1.545872878918608, + "grad_norm": 0.2549947650456555, + "learning_rate": 9.519805071704131e-06, + "loss": 0.4091, + "step": 10750 + }, + { + "epoch": 1.5465918895599655, + "grad_norm": 0.2502840945830212, + "learning_rate": 9.512284432700101e-06, + "loss": 0.4066, + "step": 10755 + }, + { + "epoch": 1.547310900201323, + "grad_norm": 0.25620127482812816, + "learning_rate": 9.504764070195652e-06, + "loss": 0.4026, + "step": 10760 + }, + { + "epoch": 1.5480299108426805, + "grad_norm": 0.2548526285927597, + "learning_rate": 9.49724398845428e-06, + "loss": 0.416, + "step": 10765 + }, + { + "epoch": 1.548748921484038, + "grad_norm": 0.25445359889395053, + "learning_rate": 9.489724191739329e-06, + "loss": 0.4165, + "step": 10770 + }, + { + "epoch": 1.5494679321253955, + "grad_norm": 0.2511165733452332, + "learning_rate": 9.48220468431399e-06, + "loss": 0.4067, + "step": 10775 + }, + { + "epoch": 1.550186942766753, + "grad_norm": 0.2464055911624285, + "learning_rate": 9.474685470441274e-06, + "loss": 0.4088, + "step": 10780 + }, + { + "epoch": 1.5509059534081104, + "grad_norm": 0.24343283618036587, + "learning_rate": 9.467166554384033e-06, + "loss": 0.417, + "step": 10785 + }, + { + "epoch": 1.551624964049468, + "grad_norm": 0.2929003871990144, + "learning_rate": 9.459647940404955e-06, + "loss": 0.4051, + "step": 10790 + }, + { + "epoch": 1.5523439746908254, + "grad_norm": 0.26335455862311163, + "learning_rate": 9.452129632766553e-06, + "loss": 0.4133, + "step": 10795 + }, + { + "epoch": 1.553062985332183, + "grad_norm": 0.26867986196228943, + "learning_rate": 9.444611635731157e-06, + "loss": 0.4039, + "step": 10800 + }, + { + "epoch": 1.5537819959735404, + "grad_norm": 0.2527610281819871, + "learning_rate": 9.437093953560941e-06, + "loss": 0.4369, + "step": 10805 + }, + { + "epoch": 1.5545010066148979, + "grad_norm": 0.26359228700844456, + "learning_rate": 9.429576590517879e-06, + "loss": 0.4075, + "step": 10810 + }, + { + "epoch": 1.5552200172562554, + "grad_norm": 0.24934557309261163, + "learning_rate": 9.42205955086378e-06, + "loss": 0.4181, + "step": 10815 + }, + { + "epoch": 1.5559390278976128, + "grad_norm": 0.28417752458187967, + "learning_rate": 9.414542838860263e-06, + "loss": 0.4101, + "step": 10820 + }, + { + "epoch": 1.5566580385389703, + "grad_norm": 0.2687027110590275, + "learning_rate": 9.407026458768763e-06, + "loss": 0.4275, + "step": 10825 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.24624048707976195, + "learning_rate": 9.399510414850518e-06, + "loss": 0.412, + "step": 10830 + }, + { + "epoch": 1.5580960598216853, + "grad_norm": 0.2591235754039305, + "learning_rate": 9.391994711366592e-06, + "loss": 0.4276, + "step": 10835 + }, + { + "epoch": 1.558815070463043, + "grad_norm": 0.2531149607445782, + "learning_rate": 9.384479352577844e-06, + "loss": 0.4055, + "step": 10840 + }, + { + "epoch": 1.5595340811044003, + "grad_norm": 0.25099823671722576, + "learning_rate": 9.376964342744942e-06, + "loss": 0.4126, + "step": 10845 + }, + { + "epoch": 1.560253091745758, + "grad_norm": 0.2621789703583561, + "learning_rate": 9.369449686128356e-06, + "loss": 0.4204, + "step": 10850 + }, + { + "epoch": 1.5609721023871153, + "grad_norm": 0.24146616536060794, + "learning_rate": 9.361935386988347e-06, + "loss": 0.4246, + "step": 10855 + }, + { + "epoch": 1.561691113028473, + "grad_norm": 0.25053774015064967, + "learning_rate": 9.354421449584992e-06, + "loss": 0.4083, + "step": 10860 + }, + { + "epoch": 1.5624101236698302, + "grad_norm": 0.24254978561460552, + "learning_rate": 9.346907878178145e-06, + "loss": 0.4195, + "step": 10865 + }, + { + "epoch": 1.563129134311188, + "grad_norm": 0.2573324547012047, + "learning_rate": 9.339394677027457e-06, + "loss": 0.4288, + "step": 10870 + }, + { + "epoch": 1.5638481449525452, + "grad_norm": 0.25421395718926176, + "learning_rate": 9.331881850392382e-06, + "loss": 0.413, + "step": 10875 + }, + { + "epoch": 1.564567155593903, + "grad_norm": 0.24639179255443322, + "learning_rate": 9.324369402532146e-06, + "loss": 0.4064, + "step": 10880 + }, + { + "epoch": 1.5652861662352602, + "grad_norm": 0.2490898649657323, + "learning_rate": 9.316857337705757e-06, + "loss": 0.4018, + "step": 10885 + }, + { + "epoch": 1.5660051768766179, + "grad_norm": 0.3612706971331954, + "learning_rate": 9.309345660172025e-06, + "loss": 0.4214, + "step": 10890 + }, + { + "epoch": 1.5667241875179752, + "grad_norm": 0.26504703274984404, + "learning_rate": 9.30183437418953e-06, + "loss": 0.4239, + "step": 10895 + }, + { + "epoch": 1.5674431981593329, + "grad_norm": 0.244324764458801, + "learning_rate": 9.294323484016621e-06, + "loss": 0.3935, + "step": 10900 + }, + { + "epoch": 1.5681622088006901, + "grad_norm": 0.2528967431828394, + "learning_rate": 9.28681299391144e-06, + "loss": 0.418, + "step": 10905 + }, + { + "epoch": 1.5688812194420478, + "grad_norm": 0.244293237365086, + "learning_rate": 9.27930290813189e-06, + "loss": 0.4123, + "step": 10910 + }, + { + "epoch": 1.569600230083405, + "grad_norm": 0.26940252175040885, + "learning_rate": 9.271793230935646e-06, + "loss": 0.4166, + "step": 10915 + }, + { + "epoch": 1.5703192407247628, + "grad_norm": 0.24979780787360428, + "learning_rate": 9.264283966580161e-06, + "loss": 0.4292, + "step": 10920 + }, + { + "epoch": 1.57103825136612, + "grad_norm": 0.26004584564041544, + "learning_rate": 9.256775119322642e-06, + "loss": 0.4252, + "step": 10925 + }, + { + "epoch": 1.5717572620074778, + "grad_norm": 0.2535980461972811, + "learning_rate": 9.24926669342006e-06, + "loss": 0.4037, + "step": 10930 + }, + { + "epoch": 1.572476272648835, + "grad_norm": 0.2597973742778775, + "learning_rate": 9.241758693129157e-06, + "loss": 0.3816, + "step": 10935 + }, + { + "epoch": 1.5731952832901928, + "grad_norm": 0.26000746055154517, + "learning_rate": 9.234251122706429e-06, + "loss": 0.4076, + "step": 10940 + }, + { + "epoch": 1.57391429393155, + "grad_norm": 0.26088456069640015, + "learning_rate": 9.226743986408123e-06, + "loss": 0.416, + "step": 10945 + }, + { + "epoch": 1.5746333045729077, + "grad_norm": 0.24785788574271747, + "learning_rate": 9.219237288490248e-06, + "loss": 0.4222, + "step": 10950 + }, + { + "epoch": 1.5753523152142652, + "grad_norm": 0.2578127016481043, + "learning_rate": 9.211731033208555e-06, + "loss": 0.414, + "step": 10955 + }, + { + "epoch": 1.5760713258556227, + "grad_norm": 0.25311249447487005, + "learning_rate": 9.204225224818556e-06, + "loss": 0.4179, + "step": 10960 + }, + { + "epoch": 1.5767903364969802, + "grad_norm": 0.2522348815673237, + "learning_rate": 9.196719867575504e-06, + "loss": 0.4071, + "step": 10965 + }, + { + "epoch": 1.5775093471383377, + "grad_norm": 0.24787911038300595, + "learning_rate": 9.189214965734388e-06, + "loss": 0.4014, + "step": 10970 + }, + { + "epoch": 1.5782283577796952, + "grad_norm": 0.2560334016849389, + "learning_rate": 9.181710523549956e-06, + "loss": 0.4409, + "step": 10975 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.2583820636877086, + "learning_rate": 9.174206545276678e-06, + "loss": 0.4184, + "step": 10980 + }, + { + "epoch": 1.5796663790624101, + "grad_norm": 0.26098999460444455, + "learning_rate": 9.166703035168772e-06, + "loss": 0.4192, + "step": 10985 + }, + { + "epoch": 1.5803853897037676, + "grad_norm": 0.2536017389758038, + "learning_rate": 9.159199997480187e-06, + "loss": 0.4179, + "step": 10990 + }, + { + "epoch": 1.5811044003451251, + "grad_norm": 0.24919709422780187, + "learning_rate": 9.151697436464608e-06, + "loss": 0.4135, + "step": 10995 + }, + { + "epoch": 1.5818234109864826, + "grad_norm": 0.359916355739022, + "learning_rate": 9.144195356375439e-06, + "loss": 0.4179, + "step": 11000 + }, + { + "epoch": 1.58254242162784, + "grad_norm": 0.25759112404931467, + "learning_rate": 9.136693761465827e-06, + "loss": 0.4165, + "step": 11005 + }, + { + "epoch": 1.5832614322691976, + "grad_norm": 0.24704507666893488, + "learning_rate": 9.12919265598863e-06, + "loss": 0.4115, + "step": 11010 + }, + { + "epoch": 1.583980442910555, + "grad_norm": 0.26040498404898454, + "learning_rate": 9.121692044196433e-06, + "loss": 0.403, + "step": 11015 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.2572213722815523, + "learning_rate": 9.11419193034155e-06, + "loss": 0.4265, + "step": 11020 + }, + { + "epoch": 1.58541846419327, + "grad_norm": 0.26562420815981674, + "learning_rate": 9.106692318676e-06, + "loss": 0.4163, + "step": 11025 + }, + { + "epoch": 1.5861374748346275, + "grad_norm": 0.2545731156371461, + "learning_rate": 9.099193213451518e-06, + "loss": 0.418, + "step": 11030 + }, + { + "epoch": 1.586856485475985, + "grad_norm": 0.25251780679210656, + "learning_rate": 9.091694618919563e-06, + "loss": 0.4177, + "step": 11035 + }, + { + "epoch": 1.5875754961173425, + "grad_norm": 0.25318760884805, + "learning_rate": 9.084196539331298e-06, + "loss": 0.416, + "step": 11040 + }, + { + "epoch": 1.5882945067587, + "grad_norm": 0.26462756901637724, + "learning_rate": 9.076698978937585e-06, + "loss": 0.412, + "step": 11045 + }, + { + "epoch": 1.5890135174000575, + "grad_norm": 0.25503799261850707, + "learning_rate": 9.069201941989012e-06, + "loss": 0.4233, + "step": 11050 + }, + { + "epoch": 1.589732528041415, + "grad_norm": 0.26225264720562946, + "learning_rate": 9.061705432735852e-06, + "loss": 0.4253, + "step": 11055 + }, + { + "epoch": 1.5904515386827724, + "grad_norm": 0.24447655868276502, + "learning_rate": 9.054209455428083e-06, + "loss": 0.4164, + "step": 11060 + }, + { + "epoch": 1.5911705493241302, + "grad_norm": 0.2547140130287968, + "learning_rate": 9.046714014315391e-06, + "loss": 0.4249, + "step": 11065 + }, + { + "epoch": 1.5918895599654874, + "grad_norm": 0.26102607906573094, + "learning_rate": 9.039219113647144e-06, + "loss": 0.4304, + "step": 11070 + }, + { + "epoch": 1.5926085706068451, + "grad_norm": 0.25631456582076034, + "learning_rate": 9.031724757672417e-06, + "loss": 0.4072, + "step": 11075 + }, + { + "epoch": 1.5933275812482024, + "grad_norm": 0.2638138418681577, + "learning_rate": 9.024230950639965e-06, + "loss": 0.4306, + "step": 11080 + }, + { + "epoch": 1.59404659188956, + "grad_norm": 0.25661768187519934, + "learning_rate": 9.016737696798236e-06, + "loss": 0.4124, + "step": 11085 + }, + { + "epoch": 1.5947656025309174, + "grad_norm": 0.24843426147942013, + "learning_rate": 9.009245000395371e-06, + "loss": 0.429, + "step": 11090 + }, + { + "epoch": 1.595484613172275, + "grad_norm": 0.2606728470226574, + "learning_rate": 9.001752865679184e-06, + "loss": 0.4037, + "step": 11095 + }, + { + "epoch": 1.5962036238136323, + "grad_norm": 0.25306716381995065, + "learning_rate": 8.994261296897174e-06, + "loss": 0.4072, + "step": 11100 + }, + { + "epoch": 1.59692263445499, + "grad_norm": 0.251463255993275, + "learning_rate": 8.986770298296521e-06, + "loss": 0.4295, + "step": 11105 + }, + { + "epoch": 1.5976416450963473, + "grad_norm": 0.24957252123112794, + "learning_rate": 8.979279874124088e-06, + "loss": 0.4135, + "step": 11110 + }, + { + "epoch": 1.598360655737705, + "grad_norm": 0.25487316069456933, + "learning_rate": 8.971790028626395e-06, + "loss": 0.4236, + "step": 11115 + }, + { + "epoch": 1.5990796663790623, + "grad_norm": 0.25062693498899585, + "learning_rate": 8.964300766049657e-06, + "loss": 0.4158, + "step": 11120 + }, + { + "epoch": 1.59979867702042, + "grad_norm": 0.25586841864722226, + "learning_rate": 8.956812090639733e-06, + "loss": 0.4192, + "step": 11125 + }, + { + "epoch": 1.6005176876617773, + "grad_norm": 0.2523516405591824, + "learning_rate": 8.949324006642171e-06, + "loss": 0.4163, + "step": 11130 + }, + { + "epoch": 1.601236698303135, + "grad_norm": 0.2547821939348679, + "learning_rate": 8.941836518302172e-06, + "loss": 0.4057, + "step": 11135 + }, + { + "epoch": 1.6019557089444922, + "grad_norm": 0.25350007716673045, + "learning_rate": 8.934349629864605e-06, + "loss": 0.4075, + "step": 11140 + }, + { + "epoch": 1.60267471958585, + "grad_norm": 0.2545362292573287, + "learning_rate": 8.92686334557399e-06, + "loss": 0.41, + "step": 11145 + }, + { + "epoch": 1.6033937302272072, + "grad_norm": 0.2500310953377668, + "learning_rate": 8.91937766967452e-06, + "loss": 0.408, + "step": 11150 + }, + { + "epoch": 1.604112740868565, + "grad_norm": 0.26014949108035895, + "learning_rate": 8.911892606410025e-06, + "loss": 0.4183, + "step": 11155 + }, + { + "epoch": 1.6048317515099222, + "grad_norm": 0.2579606481931132, + "learning_rate": 8.904408160023995e-06, + "loss": 0.4096, + "step": 11160 + }, + { + "epoch": 1.60555076215128, + "grad_norm": 0.2512133091754519, + "learning_rate": 8.896924334759584e-06, + "loss": 0.4082, + "step": 11165 + }, + { + "epoch": 1.6062697727926374, + "grad_norm": 0.26226985742683234, + "learning_rate": 8.889441134859569e-06, + "loss": 0.4228, + "step": 11170 + }, + { + "epoch": 1.6069887834339949, + "grad_norm": 0.2570180142160302, + "learning_rate": 8.881958564566391e-06, + "loss": 0.4275, + "step": 11175 + }, + { + "epoch": 1.6077077940753524, + "grad_norm": 0.2561715585777985, + "learning_rate": 8.874476628122128e-06, + "loss": 0.4238, + "step": 11180 + }, + { + "epoch": 1.6084268047167098, + "grad_norm": 0.2550125475833642, + "learning_rate": 8.866995329768495e-06, + "loss": 0.4192, + "step": 11185 + }, + { + "epoch": 1.6091458153580673, + "grad_norm": 0.2699936703979793, + "learning_rate": 8.859514673746856e-06, + "loss": 0.4196, + "step": 11190 + }, + { + "epoch": 1.6098648259994248, + "grad_norm": 0.2505338294862466, + "learning_rate": 8.852034664298198e-06, + "loss": 0.4153, + "step": 11195 + }, + { + "epoch": 1.6105838366407823, + "grad_norm": 0.26502315769999407, + "learning_rate": 8.844555305663145e-06, + "loss": 0.4209, + "step": 11200 + }, + { + "epoch": 1.6113028472821398, + "grad_norm": 0.2500014115346305, + "learning_rate": 8.83707660208196e-06, + "loss": 0.4151, + "step": 11205 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.25469064057360297, + "learning_rate": 8.82959855779453e-06, + "loss": 0.4202, + "step": 11210 + }, + { + "epoch": 1.6127408685648548, + "grad_norm": 0.25981083449373843, + "learning_rate": 8.822121177040361e-06, + "loss": 0.402, + "step": 11215 + }, + { + "epoch": 1.6134598792062123, + "grad_norm": 0.2646924764535613, + "learning_rate": 8.814644464058593e-06, + "loss": 0.4172, + "step": 11220 + }, + { + "epoch": 1.6141788898475697, + "grad_norm": 0.25471272472143974, + "learning_rate": 8.807168423087983e-06, + "loss": 0.4239, + "step": 11225 + }, + { + "epoch": 1.6148979004889272, + "grad_norm": 0.2601487142133797, + "learning_rate": 8.799693058366907e-06, + "loss": 0.3952, + "step": 11230 + }, + { + "epoch": 1.6156169111302847, + "grad_norm": 0.2613849714941667, + "learning_rate": 8.792218374133356e-06, + "loss": 0.3974, + "step": 11235 + }, + { + "epoch": 1.6163359217716422, + "grad_norm": 0.2593664689454349, + "learning_rate": 8.784744374624942e-06, + "loss": 0.3999, + "step": 11240 + }, + { + "epoch": 1.6170549324129997, + "grad_norm": 0.25484361159747676, + "learning_rate": 8.777271064078876e-06, + "loss": 0.4157, + "step": 11245 + }, + { + "epoch": 1.6177739430543572, + "grad_norm": 0.2523734311430229, + "learning_rate": 8.769798446731998e-06, + "loss": 0.3991, + "step": 11250 + }, + { + "epoch": 1.6184929536957147, + "grad_norm": 0.26994175615265537, + "learning_rate": 8.762326526820732e-06, + "loss": 0.4286, + "step": 11255 + }, + { + "epoch": 1.6192119643370722, + "grad_norm": 0.2632910638122419, + "learning_rate": 8.754855308581125e-06, + "loss": 0.4229, + "step": 11260 + }, + { + "epoch": 1.6199309749784296, + "grad_norm": 0.25915962081022337, + "learning_rate": 8.747384796248819e-06, + "loss": 0.4139, + "step": 11265 + }, + { + "epoch": 1.6206499856197871, + "grad_norm": 0.25097847169720805, + "learning_rate": 8.739914994059055e-06, + "loss": 0.4272, + "step": 11270 + }, + { + "epoch": 1.6213689962611446, + "grad_norm": 0.24985927157515658, + "learning_rate": 8.732445906246667e-06, + "loss": 0.4112, + "step": 11275 + }, + { + "epoch": 1.6220880069025023, + "grad_norm": 0.256598628829339, + "learning_rate": 8.724977537046098e-06, + "loss": 0.4083, + "step": 11280 + }, + { + "epoch": 1.6228070175438596, + "grad_norm": 0.2450641360853421, + "learning_rate": 8.717509890691369e-06, + "loss": 0.4311, + "step": 11285 + }, + { + "epoch": 1.6235260281852173, + "grad_norm": 0.2671795024941469, + "learning_rate": 8.710042971416103e-06, + "loss": 0.4121, + "step": 11290 + }, + { + "epoch": 1.6242450388265746, + "grad_norm": 0.2587107726625021, + "learning_rate": 8.702576783453502e-06, + "loss": 0.4135, + "step": 11295 + }, + { + "epoch": 1.6249640494679323, + "grad_norm": 0.2622689960782433, + "learning_rate": 8.695111331036355e-06, + "loss": 0.4201, + "step": 11300 + }, + { + "epoch": 1.6256830601092895, + "grad_norm": 0.2513570762063957, + "learning_rate": 8.687646618397036e-06, + "loss": 0.416, + "step": 11305 + }, + { + "epoch": 1.6264020707506472, + "grad_norm": 0.24671811263841703, + "learning_rate": 8.680182649767503e-06, + "loss": 0.4045, + "step": 11310 + }, + { + "epoch": 1.6271210813920045, + "grad_norm": 0.2623165897224748, + "learning_rate": 8.672719429379281e-06, + "loss": 0.4088, + "step": 11315 + }, + { + "epoch": 1.6278400920333622, + "grad_norm": 0.26592779005926837, + "learning_rate": 8.665256961463484e-06, + "loss": 0.4234, + "step": 11320 + }, + { + "epoch": 1.6285591026747195, + "grad_norm": 0.25144319821086275, + "learning_rate": 8.657795250250794e-06, + "loss": 0.4378, + "step": 11325 + }, + { + "epoch": 1.6292781133160772, + "grad_norm": 0.253085995785813, + "learning_rate": 8.650334299971455e-06, + "loss": 0.418, + "step": 11330 + }, + { + "epoch": 1.6299971239574345, + "grad_norm": 0.252589440886837, + "learning_rate": 8.642874114855301e-06, + "loss": 0.4168, + "step": 11335 + }, + { + "epoch": 1.6307161345987922, + "grad_norm": 0.2547259874503826, + "learning_rate": 8.635414699131712e-06, + "loss": 0.4214, + "step": 11340 + }, + { + "epoch": 1.6314351452401494, + "grad_norm": 0.24450373380712287, + "learning_rate": 8.627956057029635e-06, + "loss": 0.4123, + "step": 11345 + }, + { + "epoch": 1.6321541558815071, + "grad_norm": 0.24479091576118742, + "learning_rate": 8.62049819277759e-06, + "loss": 0.4095, + "step": 11350 + }, + { + "epoch": 1.6328731665228644, + "grad_norm": 0.2636491979983794, + "learning_rate": 8.613041110603647e-06, + "loss": 0.4156, + "step": 11355 + }, + { + "epoch": 1.6335921771642221, + "grad_norm": 0.2590163047833479, + "learning_rate": 8.605584814735427e-06, + "loss": 0.4384, + "step": 11360 + }, + { + "epoch": 1.6343111878055794, + "grad_norm": 0.25448186761597574, + "learning_rate": 8.598129309400127e-06, + "loss": 0.4151, + "step": 11365 + }, + { + "epoch": 1.635030198446937, + "grad_norm": 0.252721234303155, + "learning_rate": 8.590674598824466e-06, + "loss": 0.4155, + "step": 11370 + }, + { + "epoch": 1.6357492090882944, + "grad_norm": 0.37533618952671777, + "learning_rate": 8.583220687234736e-06, + "loss": 0.42, + "step": 11375 + }, + { + "epoch": 1.636468219729652, + "grad_norm": 0.2449704487827158, + "learning_rate": 8.575767578856765e-06, + "loss": 0.3945, + "step": 11380 + }, + { + "epoch": 1.6371872303710093, + "grad_norm": 0.2702485491460619, + "learning_rate": 8.568315277915931e-06, + "loss": 0.4058, + "step": 11385 + }, + { + "epoch": 1.637906241012367, + "grad_norm": 0.26175005707515614, + "learning_rate": 8.560863788637144e-06, + "loss": 0.4115, + "step": 11390 + }, + { + "epoch": 1.6386252516537245, + "grad_norm": 0.24693775467819265, + "learning_rate": 8.553413115244873e-06, + "loss": 0.3991, + "step": 11395 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.25829277088746205, + "learning_rate": 8.545963261963102e-06, + "loss": 0.4201, + "step": 11400 + }, + { + "epoch": 1.6400632729364395, + "grad_norm": 0.2535140416799969, + "learning_rate": 8.538514233015367e-06, + "loss": 0.4217, + "step": 11405 + }, + { + "epoch": 1.640782283577797, + "grad_norm": 0.25550572212281636, + "learning_rate": 8.531066032624732e-06, + "loss": 0.4111, + "step": 11410 + }, + { + "epoch": 1.6415012942191545, + "grad_norm": 0.2586339585288042, + "learning_rate": 8.523618665013782e-06, + "loss": 0.4289, + "step": 11415 + }, + { + "epoch": 1.642220304860512, + "grad_norm": 0.25344775465833025, + "learning_rate": 8.516172134404647e-06, + "loss": 0.4272, + "step": 11420 + }, + { + "epoch": 1.6429393155018694, + "grad_norm": 0.25587785003191293, + "learning_rate": 8.508726445018967e-06, + "loss": 0.42, + "step": 11425 + }, + { + "epoch": 1.643658326143227, + "grad_norm": 0.2716660468365486, + "learning_rate": 8.50128160107791e-06, + "loss": 0.4277, + "step": 11430 + }, + { + "epoch": 1.6443773367845844, + "grad_norm": 0.2574628146609621, + "learning_rate": 8.493837606802173e-06, + "loss": 0.4096, + "step": 11435 + }, + { + "epoch": 1.645096347425942, + "grad_norm": 0.251306677542674, + "learning_rate": 8.486394466411963e-06, + "loss": 0.4173, + "step": 11440 + }, + { + "epoch": 1.6458153580672994, + "grad_norm": 0.2623028187732833, + "learning_rate": 8.478952184126994e-06, + "loss": 0.4132, + "step": 11445 + }, + { + "epoch": 1.6465343687086569, + "grad_norm": 0.2424419146128029, + "learning_rate": 8.471510764166514e-06, + "loss": 0.4139, + "step": 11450 + }, + { + "epoch": 1.6472533793500144, + "grad_norm": 0.2531680948215564, + "learning_rate": 8.464070210749272e-06, + "loss": 0.4108, + "step": 11455 + }, + { + "epoch": 1.6479723899913719, + "grad_norm": 0.24696367691684704, + "learning_rate": 8.456630528093516e-06, + "loss": 0.3996, + "step": 11460 + }, + { + "epoch": 1.6486914006327293, + "grad_norm": 0.24706444705139494, + "learning_rate": 8.449191720417021e-06, + "loss": 0.4093, + "step": 11465 + }, + { + "epoch": 1.6494104112740868, + "grad_norm": 0.24554094568955817, + "learning_rate": 8.441753791937048e-06, + "loss": 0.4091, + "step": 11470 + }, + { + "epoch": 1.6501294219154443, + "grad_norm": 0.25029044608014944, + "learning_rate": 8.434316746870366e-06, + "loss": 0.4209, + "step": 11475 + }, + { + "epoch": 1.6508484325568018, + "grad_norm": 0.264859220220412, + "learning_rate": 8.426880589433251e-06, + "loss": 0.3988, + "step": 11480 + }, + { + "epoch": 1.6515674431981593, + "grad_norm": 0.24597444882032374, + "learning_rate": 8.419445323841464e-06, + "loss": 0.4182, + "step": 11485 + }, + { + "epoch": 1.6522864538395168, + "grad_norm": 0.2588401531342179, + "learning_rate": 8.412010954310259e-06, + "loss": 0.3916, + "step": 11490 + }, + { + "epoch": 1.6530054644808743, + "grad_norm": 0.25570847615940656, + "learning_rate": 8.404577485054394e-06, + "loss": 0.4031, + "step": 11495 + }, + { + "epoch": 1.6537244751222318, + "grad_norm": 0.2579101209032299, + "learning_rate": 8.39714492028811e-06, + "loss": 0.4161, + "step": 11500 + }, + { + "epoch": 1.6544434857635895, + "grad_norm": 0.2447172516290584, + "learning_rate": 8.389713264225134e-06, + "loss": 0.4217, + "step": 11505 + }, + { + "epoch": 1.6551624964049467, + "grad_norm": 0.24872364074589265, + "learning_rate": 8.382282521078682e-06, + "loss": 0.4129, + "step": 11510 + }, + { + "epoch": 1.6558815070463044, + "grad_norm": 0.2718399601622027, + "learning_rate": 8.374852695061444e-06, + "loss": 0.416, + "step": 11515 + }, + { + "epoch": 1.6566005176876617, + "grad_norm": 0.2683049774924081, + "learning_rate": 8.367423790385605e-06, + "loss": 0.4034, + "step": 11520 + }, + { + "epoch": 1.6573195283290194, + "grad_norm": 0.2573042976025013, + "learning_rate": 8.35999581126281e-06, + "loss": 0.4144, + "step": 11525 + }, + { + "epoch": 1.6580385389703767, + "grad_norm": 0.2582272922447818, + "learning_rate": 8.352568761904187e-06, + "loss": 0.4143, + "step": 11530 + }, + { + "epoch": 1.6587575496117344, + "grad_norm": 0.25288542470974607, + "learning_rate": 8.345142646520347e-06, + "loss": 0.4215, + "step": 11535 + }, + { + "epoch": 1.6594765602530916, + "grad_norm": 0.2827850044449669, + "learning_rate": 8.337717469321359e-06, + "loss": 0.418, + "step": 11540 + }, + { + "epoch": 1.6601955708944494, + "grad_norm": 0.2469343296900172, + "learning_rate": 8.330293234516753e-06, + "loss": 0.4245, + "step": 11545 + }, + { + "epoch": 1.6609145815358066, + "grad_norm": 0.24968847563271604, + "learning_rate": 8.322869946315549e-06, + "loss": 0.4147, + "step": 11550 + }, + { + "epoch": 1.6616335921771643, + "grad_norm": 0.25565355098064496, + "learning_rate": 8.315447608926211e-06, + "loss": 0.4174, + "step": 11555 + }, + { + "epoch": 1.6623526028185216, + "grad_norm": 0.25859426197349605, + "learning_rate": 8.308026226556665e-06, + "loss": 0.4029, + "step": 11560 + }, + { + "epoch": 1.6630716134598793, + "grad_norm": 0.2681683238557652, + "learning_rate": 8.300605803414308e-06, + "loss": 0.4045, + "step": 11565 + }, + { + "epoch": 1.6637906241012366, + "grad_norm": 0.2633484725649643, + "learning_rate": 8.293186343705979e-06, + "loss": 0.4057, + "step": 11570 + }, + { + "epoch": 1.6645096347425943, + "grad_norm": 0.2638725208834779, + "learning_rate": 8.285767851637977e-06, + "loss": 0.4159, + "step": 11575 + }, + { + "epoch": 1.6652286453839515, + "grad_norm": 0.2553500904157079, + "learning_rate": 8.278350331416057e-06, + "loss": 0.4241, + "step": 11580 + }, + { + "epoch": 1.6659476560253093, + "grad_norm": 0.25723705364445204, + "learning_rate": 8.270933787245417e-06, + "loss": 0.4017, + "step": 11585 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.2727500616792483, + "learning_rate": 8.263518223330698e-06, + "loss": 0.4053, + "step": 11590 + }, + { + "epoch": 1.6673856773080242, + "grad_norm": 0.26941061846366393, + "learning_rate": 8.256103643875995e-06, + "loss": 0.4165, + "step": 11595 + }, + { + "epoch": 1.6681046879493815, + "grad_norm": 0.2444646366791917, + "learning_rate": 8.248690053084841e-06, + "loss": 0.4072, + "step": 11600 + }, + { + "epoch": 1.6688236985907392, + "grad_norm": 0.25602855863749197, + "learning_rate": 8.241277455160202e-06, + "loss": 0.4113, + "step": 11605 + }, + { + "epoch": 1.6695427092320967, + "grad_norm": 0.25080661137297516, + "learning_rate": 8.233865854304497e-06, + "loss": 0.4107, + "step": 11610 + }, + { + "epoch": 1.6702617198734542, + "grad_norm": 0.2601363858469565, + "learning_rate": 8.226455254719555e-06, + "loss": 0.432, + "step": 11615 + }, + { + "epoch": 1.6709807305148117, + "grad_norm": 0.28065750563168684, + "learning_rate": 8.219045660606664e-06, + "loss": 0.4159, + "step": 11620 + }, + { + "epoch": 1.6716997411561692, + "grad_norm": 0.2548133076568602, + "learning_rate": 8.211637076166528e-06, + "loss": 0.4208, + "step": 11625 + }, + { + "epoch": 1.6724187517975266, + "grad_norm": 0.25883680184634433, + "learning_rate": 8.204229505599273e-06, + "loss": 0.4372, + "step": 11630 + }, + { + "epoch": 1.6731377624388841, + "grad_norm": 0.2535910290217827, + "learning_rate": 8.196822953104467e-06, + "loss": 0.4242, + "step": 11635 + }, + { + "epoch": 1.6738567730802416, + "grad_norm": 0.256481935859069, + "learning_rate": 8.189417422881089e-06, + "loss": 0.4179, + "step": 11640 + }, + { + "epoch": 1.674575783721599, + "grad_norm": 0.25562450892249666, + "learning_rate": 8.182012919127533e-06, + "loss": 0.4157, + "step": 11645 + }, + { + "epoch": 1.6752947943629566, + "grad_norm": 0.25399721604854614, + "learning_rate": 8.174609446041629e-06, + "loss": 0.4128, + "step": 11650 + }, + { + "epoch": 1.676013805004314, + "grad_norm": 0.2573526645625574, + "learning_rate": 8.167207007820609e-06, + "loss": 0.3922, + "step": 11655 + }, + { + "epoch": 1.6767328156456716, + "grad_norm": 0.2546319232775869, + "learning_rate": 8.159805608661118e-06, + "loss": 0.3997, + "step": 11660 + }, + { + "epoch": 1.677451826287029, + "grad_norm": 0.24976033067638184, + "learning_rate": 8.152405252759224e-06, + "loss": 0.4041, + "step": 11665 + }, + { + "epoch": 1.6781708369283865, + "grad_norm": 0.246033218802702, + "learning_rate": 8.14500594431039e-06, + "loss": 0.4219, + "step": 11670 + }, + { + "epoch": 1.678889847569744, + "grad_norm": 0.25489234934871835, + "learning_rate": 8.137607687509488e-06, + "loss": 0.4253, + "step": 11675 + }, + { + "epoch": 1.6796088582111015, + "grad_norm": 0.2599339826732556, + "learning_rate": 8.130210486550805e-06, + "loss": 0.4092, + "step": 11680 + }, + { + "epoch": 1.680327868852459, + "grad_norm": 0.24877726767549682, + "learning_rate": 8.122814345628016e-06, + "loss": 0.3958, + "step": 11685 + }, + { + "epoch": 1.6810468794938165, + "grad_norm": 0.2571497954450008, + "learning_rate": 8.115419268934196e-06, + "loss": 0.4288, + "step": 11690 + }, + { + "epoch": 1.681765890135174, + "grad_norm": 0.26365956839645893, + "learning_rate": 8.108025260661826e-06, + "loss": 0.414, + "step": 11695 + }, + { + "epoch": 1.6824849007765315, + "grad_norm": 0.2583046571555305, + "learning_rate": 8.100632325002775e-06, + "loss": 0.4095, + "step": 11700 + }, + { + "epoch": 1.683203911417889, + "grad_norm": 0.25333167054802613, + "learning_rate": 8.0932404661483e-06, + "loss": 0.4184, + "step": 11705 + }, + { + "epoch": 1.6839229220592464, + "grad_norm": 0.2545174291939654, + "learning_rate": 8.08584968828906e-06, + "loss": 0.4286, + "step": 11710 + }, + { + "epoch": 1.684641932700604, + "grad_norm": 0.24189809389835804, + "learning_rate": 8.07845999561509e-06, + "loss": 0.4244, + "step": 11715 + }, + { + "epoch": 1.6853609433419616, + "grad_norm": 0.25475398318370074, + "learning_rate": 8.071071392315807e-06, + "loss": 0.4025, + "step": 11720 + }, + { + "epoch": 1.686079953983319, + "grad_norm": 0.24862476692330737, + "learning_rate": 8.063683882580027e-06, + "loss": 0.4017, + "step": 11725 + }, + { + "epoch": 1.6867989646246766, + "grad_norm": 0.2514570552491363, + "learning_rate": 8.056297470595926e-06, + "loss": 0.4033, + "step": 11730 + }, + { + "epoch": 1.6875179752660339, + "grad_norm": 0.2570073890835067, + "learning_rate": 8.048912160551076e-06, + "loss": 0.4174, + "step": 11735 + }, + { + "epoch": 1.6882369859073916, + "grad_norm": 0.25748440151309115, + "learning_rate": 8.041527956632412e-06, + "loss": 0.4304, + "step": 11740 + }, + { + "epoch": 1.6889559965487488, + "grad_norm": 0.25199098359637623, + "learning_rate": 8.03414486302624e-06, + "loss": 0.4077, + "step": 11745 + }, + { + "epoch": 1.6896750071901065, + "grad_norm": 0.25570662131789734, + "learning_rate": 8.02676288391825e-06, + "loss": 0.4037, + "step": 11750 + }, + { + "epoch": 1.6903940178314638, + "grad_norm": 0.24959302860918878, + "learning_rate": 8.019382023493491e-06, + "loss": 0.4185, + "step": 11755 + }, + { + "epoch": 1.6911130284728215, + "grad_norm": 0.2585340435554818, + "learning_rate": 8.012002285936372e-06, + "loss": 0.4192, + "step": 11760 + }, + { + "epoch": 1.6918320391141788, + "grad_norm": 0.2512367989560759, + "learning_rate": 8.00462367543068e-06, + "loss": 0.4029, + "step": 11765 + }, + { + "epoch": 1.6925510497555365, + "grad_norm": 0.2557612581821425, + "learning_rate": 7.997246196159552e-06, + "loss": 0.4161, + "step": 11770 + }, + { + "epoch": 1.6932700603968938, + "grad_norm": 0.25333856532359916, + "learning_rate": 7.989869852305485e-06, + "loss": 0.4188, + "step": 11775 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.26739129697120595, + "learning_rate": 7.982494648050341e-06, + "loss": 0.4093, + "step": 11780 + }, + { + "epoch": 1.6947080816796087, + "grad_norm": 0.25048961243751644, + "learning_rate": 7.975120587575325e-06, + "loss": 0.4203, + "step": 11785 + }, + { + "epoch": 1.6954270923209664, + "grad_norm": 0.2613477012494576, + "learning_rate": 7.967747675060993e-06, + "loss": 0.431, + "step": 11790 + }, + { + "epoch": 1.6961461029623237, + "grad_norm": 0.27191929486403327, + "learning_rate": 7.960375914687264e-06, + "loss": 0.4323, + "step": 11795 + }, + { + "epoch": 1.6968651136036814, + "grad_norm": 0.24900545939168423, + "learning_rate": 7.95300531063339e-06, + "loss": 0.4048, + "step": 11800 + }, + { + "epoch": 1.6975841242450387, + "grad_norm": 0.2618696749267013, + "learning_rate": 7.945635867077971e-06, + "loss": 0.405, + "step": 11805 + }, + { + "epoch": 1.6983031348863964, + "grad_norm": 0.258639069476796, + "learning_rate": 7.938267588198955e-06, + "loss": 0.4081, + "step": 11810 + }, + { + "epoch": 1.6990221455277537, + "grad_norm": 0.26244337690348685, + "learning_rate": 7.930900478173621e-06, + "loss": 0.4121, + "step": 11815 + }, + { + "epoch": 1.6997411561691114, + "grad_norm": 0.2504237314387894, + "learning_rate": 7.92353454117859e-06, + "loss": 0.417, + "step": 11820 + }, + { + "epoch": 1.7004601668104686, + "grad_norm": 0.24874053964898726, + "learning_rate": 7.91616978138982e-06, + "loss": 0.4058, + "step": 11825 + }, + { + "epoch": 1.7011791774518263, + "grad_norm": 0.24678653994922276, + "learning_rate": 7.908806202982595e-06, + "loss": 0.4127, + "step": 11830 + }, + { + "epoch": 1.7018981880931838, + "grad_norm": 0.2543663868155727, + "learning_rate": 7.90144381013154e-06, + "loss": 0.4246, + "step": 11835 + }, + { + "epoch": 1.7026171987345413, + "grad_norm": 0.2493487529962913, + "learning_rate": 7.894082607010593e-06, + "loss": 0.411, + "step": 11840 + }, + { + "epoch": 1.7033362093758988, + "grad_norm": 0.25970051800487864, + "learning_rate": 7.886722597793029e-06, + "loss": 0.4186, + "step": 11845 + }, + { + "epoch": 1.7040552200172563, + "grad_norm": 0.25806108757156404, + "learning_rate": 7.879363786651445e-06, + "loss": 0.4187, + "step": 11850 + }, + { + "epoch": 1.7047742306586138, + "grad_norm": 0.26438051545829483, + "learning_rate": 7.872006177757757e-06, + "loss": 0.3951, + "step": 11855 + }, + { + "epoch": 1.7054932412999713, + "grad_norm": 0.2729109214158478, + "learning_rate": 7.86464977528319e-06, + "loss": 0.4217, + "step": 11860 + }, + { + "epoch": 1.7062122519413288, + "grad_norm": 0.24814552432716896, + "learning_rate": 7.857294583398303e-06, + "loss": 0.4196, + "step": 11865 + }, + { + "epoch": 1.7069312625826862, + "grad_norm": 0.25027211561214646, + "learning_rate": 7.849940606272962e-06, + "loss": 0.4087, + "step": 11870 + }, + { + "epoch": 1.7076502732240437, + "grad_norm": 0.2566008836069061, + "learning_rate": 7.842587848076329e-06, + "loss": 0.4077, + "step": 11875 + }, + { + "epoch": 1.7083692838654012, + "grad_norm": 0.2601929542751101, + "learning_rate": 7.835236312976903e-06, + "loss": 0.4126, + "step": 11880 + }, + { + "epoch": 1.7090882945067587, + "grad_norm": 0.2580914302200825, + "learning_rate": 7.827886005142466e-06, + "loss": 0.4194, + "step": 11885 + }, + { + "epoch": 1.7098073051481162, + "grad_norm": 0.24538322336023752, + "learning_rate": 7.820536928740113e-06, + "loss": 0.4136, + "step": 11890 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.2564642684564163, + "learning_rate": 7.813189087936243e-06, + "loss": 0.413, + "step": 11895 + }, + { + "epoch": 1.7112453264308312, + "grad_norm": 0.24768429163281047, + "learning_rate": 7.805842486896553e-06, + "loss": 0.4135, + "step": 11900 + }, + { + "epoch": 1.7119643370721886, + "grad_norm": 0.2557952248722331, + "learning_rate": 7.79849712978603e-06, + "loss": 0.4206, + "step": 11905 + }, + { + "epoch": 1.7126833477135461, + "grad_norm": 0.2549016847496704, + "learning_rate": 7.791153020768974e-06, + "loss": 0.415, + "step": 11910 + }, + { + "epoch": 1.7134023583549036, + "grad_norm": 0.24628783851295163, + "learning_rate": 7.783810164008954e-06, + "loss": 0.4039, + "step": 11915 + }, + { + "epoch": 1.714121368996261, + "grad_norm": 0.27296354639667286, + "learning_rate": 7.776468563668842e-06, + "loss": 0.4066, + "step": 11920 + }, + { + "epoch": 1.7148403796376186, + "grad_norm": 0.2601042254906317, + "learning_rate": 7.769128223910805e-06, + "loss": 0.4246, + "step": 11925 + }, + { + "epoch": 1.715559390278976, + "grad_norm": 0.24466200646047612, + "learning_rate": 7.761789148896279e-06, + "loss": 0.3994, + "step": 11930 + }, + { + "epoch": 1.7162784009203336, + "grad_norm": 0.2508836356329874, + "learning_rate": 7.75445134278599e-06, + "loss": 0.4112, + "step": 11935 + }, + { + "epoch": 1.716997411561691, + "grad_norm": 0.2500487508873629, + "learning_rate": 7.747114809739949e-06, + "loss": 0.4105, + "step": 11940 + }, + { + "epoch": 1.7177164222030488, + "grad_norm": 0.25512578741818087, + "learning_rate": 7.739779553917437e-06, + "loss": 0.4133, + "step": 11945 + }, + { + "epoch": 1.718435432844406, + "grad_norm": 0.2506313982709355, + "learning_rate": 7.732445579477022e-06, + "loss": 0.4169, + "step": 11950 + }, + { + "epoch": 1.7191544434857637, + "grad_norm": 0.25695235121598614, + "learning_rate": 7.725112890576537e-06, + "loss": 0.409, + "step": 11955 + }, + { + "epoch": 1.719873454127121, + "grad_norm": 0.2673624331694823, + "learning_rate": 7.717781491373082e-06, + "loss": 0.4036, + "step": 11960 + }, + { + "epoch": 1.7205924647684787, + "grad_norm": 0.25013590803678853, + "learning_rate": 7.710451386023037e-06, + "loss": 0.4097, + "step": 11965 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.25866063365324493, + "learning_rate": 7.703122578682047e-06, + "loss": 0.4118, + "step": 11970 + }, + { + "epoch": 1.7220304860511937, + "grad_norm": 0.2433560125133353, + "learning_rate": 7.695795073505007e-06, + "loss": 0.4116, + "step": 11975 + }, + { + "epoch": 1.722749496692551, + "grad_norm": 0.2572523400000175, + "learning_rate": 7.688468874646096e-06, + "loss": 0.409, + "step": 11980 + }, + { + "epoch": 1.7234685073339087, + "grad_norm": 0.254914406228328, + "learning_rate": 7.681143986258734e-06, + "loss": 0.4055, + "step": 11985 + }, + { + "epoch": 1.724187517975266, + "grad_norm": 0.25364391121639074, + "learning_rate": 7.673820412495603e-06, + "loss": 0.4232, + "step": 11990 + }, + { + "epoch": 1.7249065286166236, + "grad_norm": 0.2646047450444079, + "learning_rate": 7.666498157508651e-06, + "loss": 0.4396, + "step": 11995 + }, + { + "epoch": 1.725625539257981, + "grad_norm": 0.25353357082340133, + "learning_rate": 7.65917722544906e-06, + "loss": 0.4138, + "step": 12000 + }, + { + "epoch": 1.7263445498993386, + "grad_norm": 0.2530485936086203, + "learning_rate": 7.65185762046727e-06, + "loss": 0.4064, + "step": 12005 + }, + { + "epoch": 1.7270635605406959, + "grad_norm": 0.2610894328689711, + "learning_rate": 7.644539346712975e-06, + "loss": 0.4091, + "step": 12010 + }, + { + "epoch": 1.7277825711820536, + "grad_norm": 0.2531467817667373, + "learning_rate": 7.63722240833511e-06, + "loss": 0.4184, + "step": 12015 + }, + { + "epoch": 1.7285015818234108, + "grad_norm": 0.24719229058431075, + "learning_rate": 7.629906809481843e-06, + "loss": 0.4289, + "step": 12020 + }, + { + "epoch": 1.7292205924647686, + "grad_norm": 0.2561062301934565, + "learning_rate": 7.6225925543006005e-06, + "loss": 0.4188, + "step": 12025 + }, + { + "epoch": 1.7299396031061258, + "grad_norm": 0.2568015067914993, + "learning_rate": 7.6152796469380354e-06, + "loss": 0.4148, + "step": 12030 + }, + { + "epoch": 1.7306586137474835, + "grad_norm": 0.26593094171482756, + "learning_rate": 7.607968091540032e-06, + "loss": 0.4022, + "step": 12035 + }, + { + "epoch": 1.7313776243888408, + "grad_norm": 0.2623399010130668, + "learning_rate": 7.600657892251725e-06, + "loss": 0.4152, + "step": 12040 + }, + { + "epoch": 1.7320966350301985, + "grad_norm": 0.2550964119530657, + "learning_rate": 7.593349053217468e-06, + "loss": 0.4099, + "step": 12045 + }, + { + "epoch": 1.732815645671556, + "grad_norm": 0.25329097455045146, + "learning_rate": 7.586041578580841e-06, + "loss": 0.4087, + "step": 12050 + }, + { + "epoch": 1.7335346563129135, + "grad_norm": 0.25088733417566467, + "learning_rate": 7.578735472484663e-06, + "loss": 0.4135, + "step": 12055 + }, + { + "epoch": 1.734253666954271, + "grad_norm": 0.26380431521754316, + "learning_rate": 7.571430739070962e-06, + "loss": 0.4145, + "step": 12060 + }, + { + "epoch": 1.7349726775956285, + "grad_norm": 0.25931018794731403, + "learning_rate": 7.564127382481e-06, + "loss": 0.4185, + "step": 12065 + }, + { + "epoch": 1.735691688236986, + "grad_norm": 0.25817381474927426, + "learning_rate": 7.556825406855256e-06, + "loss": 0.4144, + "step": 12070 + }, + { + "epoch": 1.7364106988783434, + "grad_norm": 0.25889864425988607, + "learning_rate": 7.549524816333416e-06, + "loss": 0.4348, + "step": 12075 + }, + { + "epoch": 1.737129709519701, + "grad_norm": 0.25930933779186754, + "learning_rate": 7.542225615054397e-06, + "loss": 0.4058, + "step": 12080 + }, + { + "epoch": 1.7378487201610584, + "grad_norm": 0.2504175659441515, + "learning_rate": 7.534927807156316e-06, + "loss": 0.4047, + "step": 12085 + }, + { + "epoch": 1.738567730802416, + "grad_norm": 0.26844272991946744, + "learning_rate": 7.527631396776503e-06, + "loss": 0.4173, + "step": 12090 + }, + { + "epoch": 1.7392867414437734, + "grad_norm": 0.2445372194665716, + "learning_rate": 7.5203363880515005e-06, + "loss": 0.4035, + "step": 12095 + }, + { + "epoch": 1.7400057520851309, + "grad_norm": 0.26059488273974246, + "learning_rate": 7.513042785117052e-06, + "loss": 0.4278, + "step": 12100 + }, + { + "epoch": 1.7407247627264884, + "grad_norm": 0.2543288007640774, + "learning_rate": 7.505750592108099e-06, + "loss": 0.4237, + "step": 12105 + }, + { + "epoch": 1.7414437733678458, + "grad_norm": 0.2528374529354086, + "learning_rate": 7.498459813158795e-06, + "loss": 0.4122, + "step": 12110 + }, + { + "epoch": 1.7421627840092033, + "grad_norm": 0.24673863195302134, + "learning_rate": 7.4911704524024875e-06, + "loss": 0.3958, + "step": 12115 + }, + { + "epoch": 1.7428817946505608, + "grad_norm": 0.26716661370758454, + "learning_rate": 7.483882513971712e-06, + "loss": 0.4197, + "step": 12120 + }, + { + "epoch": 1.7436008052919183, + "grad_norm": 0.26342608501436565, + "learning_rate": 7.476596001998212e-06, + "loss": 0.4071, + "step": 12125 + }, + { + "epoch": 1.7443198159332758, + "grad_norm": 0.24654088237466748, + "learning_rate": 7.469310920612909e-06, + "loss": 0.3981, + "step": 12130 + }, + { + "epoch": 1.7450388265746333, + "grad_norm": 0.24286792380956676, + "learning_rate": 7.462027273945922e-06, + "loss": 0.4047, + "step": 12135 + }, + { + "epoch": 1.7457578372159908, + "grad_norm": 0.26085705339506143, + "learning_rate": 7.4547450661265516e-06, + "loss": 0.4265, + "step": 12140 + }, + { + "epoch": 1.7464768478573482, + "grad_norm": 0.24945305437436543, + "learning_rate": 7.44746430128329e-06, + "loss": 0.4044, + "step": 12145 + }, + { + "epoch": 1.7471958584987057, + "grad_norm": 0.24713572243066181, + "learning_rate": 7.440184983543797e-06, + "loss": 0.3991, + "step": 12150 + }, + { + "epoch": 1.7479148691400632, + "grad_norm": 0.25381336439566454, + "learning_rate": 7.43290711703493e-06, + "loss": 0.401, + "step": 12155 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.2578191736856875, + "learning_rate": 7.425630705882707e-06, + "loss": 0.4056, + "step": 12160 + }, + { + "epoch": 1.7493528904227782, + "grad_norm": 0.26202925689193507, + "learning_rate": 7.4183557542123344e-06, + "loss": 0.3927, + "step": 12165 + }, + { + "epoch": 1.750071901064136, + "grad_norm": 0.26943617048489577, + "learning_rate": 7.4110822661481875e-06, + "loss": 0.443, + "step": 12170 + }, + { + "epoch": 1.7507909117054932, + "grad_norm": 0.2561860768336747, + "learning_rate": 7.4038102458138e-06, + "loss": 0.4143, + "step": 12175 + }, + { + "epoch": 1.7515099223468509, + "grad_norm": 0.25232020768058827, + "learning_rate": 7.396539697331895e-06, + "loss": 0.4213, + "step": 12180 + }, + { + "epoch": 1.7522289329882081, + "grad_norm": 0.25036255007581687, + "learning_rate": 7.389270624824342e-06, + "loss": 0.4313, + "step": 12185 + }, + { + "epoch": 1.7529479436295659, + "grad_norm": 0.2522764405837337, + "learning_rate": 7.3820030324121796e-06, + "loss": 0.4085, + "step": 12190 + }, + { + "epoch": 1.7536669542709231, + "grad_norm": 0.32132127532992866, + "learning_rate": 7.374736924215618e-06, + "loss": 0.4203, + "step": 12195 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.2611388242956791, + "learning_rate": 7.367472304354011e-06, + "loss": 0.4051, + "step": 12200 + }, + { + "epoch": 1.755104975553638, + "grad_norm": 0.2643832666228025, + "learning_rate": 7.3602091769458695e-06, + "loss": 0.4204, + "step": 12205 + }, + { + "epoch": 1.7558239861949958, + "grad_norm": 0.2617194604985132, + "learning_rate": 7.352947546108873e-06, + "loss": 0.4099, + "step": 12210 + }, + { + "epoch": 1.756542996836353, + "grad_norm": 0.256218391720951, + "learning_rate": 7.345687415959839e-06, + "loss": 0.431, + "step": 12215 + }, + { + "epoch": 1.7572620074777108, + "grad_norm": 0.26285248982787435, + "learning_rate": 7.338428790614732e-06, + "loss": 0.4197, + "step": 12220 + }, + { + "epoch": 1.757981018119068, + "grad_norm": 0.25901669856983645, + "learning_rate": 7.3311716741886806e-06, + "loss": 0.417, + "step": 12225 + }, + { + "epoch": 1.7587000287604257, + "grad_norm": 0.2513594012736961, + "learning_rate": 7.323916070795939e-06, + "loss": 0.4025, + "step": 12230 + }, + { + "epoch": 1.759419039401783, + "grad_norm": 0.25998980539805544, + "learning_rate": 7.316661984549911e-06, + "loss": 0.4233, + "step": 12235 + }, + { + "epoch": 1.7601380500431407, + "grad_norm": 0.2591071066572429, + "learning_rate": 7.309409419563147e-06, + "loss": 0.4014, + "step": 12240 + }, + { + "epoch": 1.760857060684498, + "grad_norm": 0.2558272045675344, + "learning_rate": 7.302158379947325e-06, + "loss": 0.4089, + "step": 12245 + }, + { + "epoch": 1.7615760713258557, + "grad_norm": 0.25425164833366926, + "learning_rate": 7.294908869813258e-06, + "loss": 0.3968, + "step": 12250 + }, + { + "epoch": 1.762295081967213, + "grad_norm": 0.2506035785957442, + "learning_rate": 7.287660893270901e-06, + "loss": 0.4223, + "step": 12255 + }, + { + "epoch": 1.7630140926085707, + "grad_norm": 0.258303350313874, + "learning_rate": 7.280414454429335e-06, + "loss": 0.4134, + "step": 12260 + }, + { + "epoch": 1.763733103249928, + "grad_norm": 0.24989677588921191, + "learning_rate": 7.27316955739676e-06, + "loss": 0.4064, + "step": 12265 + }, + { + "epoch": 1.7644521138912856, + "grad_norm": 0.24880185248679065, + "learning_rate": 7.265926206280523e-06, + "loss": 0.4064, + "step": 12270 + }, + { + "epoch": 1.7651711245326431, + "grad_norm": 0.24900137931539176, + "learning_rate": 7.258684405187071e-06, + "loss": 0.4228, + "step": 12275 + }, + { + "epoch": 1.7658901351740006, + "grad_norm": 0.25847035549259967, + "learning_rate": 7.251444158221992e-06, + "loss": 0.4308, + "step": 12280 + }, + { + "epoch": 1.766609145815358, + "grad_norm": 0.2537305545308351, + "learning_rate": 7.244205469489979e-06, + "loss": 0.4046, + "step": 12285 + }, + { + "epoch": 1.7673281564567156, + "grad_norm": 0.26388046085735983, + "learning_rate": 7.236968343094846e-06, + "loss": 0.4141, + "step": 12290 + }, + { + "epoch": 1.768047167098073, + "grad_norm": 0.26114087770670197, + "learning_rate": 7.229732783139527e-06, + "loss": 0.4033, + "step": 12295 + }, + { + "epoch": 1.7687661777394306, + "grad_norm": 0.24486941677588542, + "learning_rate": 7.222498793726061e-06, + "loss": 0.414, + "step": 12300 + }, + { + "epoch": 1.769485188380788, + "grad_norm": 0.2589314180901157, + "learning_rate": 7.215266378955592e-06, + "loss": 0.4209, + "step": 12305 + }, + { + "epoch": 1.7702041990221455, + "grad_norm": 0.26416909532177263, + "learning_rate": 7.208035542928388e-06, + "loss": 0.4019, + "step": 12310 + }, + { + "epoch": 1.770923209663503, + "grad_norm": 0.2615146702373038, + "learning_rate": 7.2008062897438084e-06, + "loss": 0.4177, + "step": 12315 + }, + { + "epoch": 1.7716422203048605, + "grad_norm": 0.2558786053060417, + "learning_rate": 7.193578623500314e-06, + "loss": 0.3994, + "step": 12320 + }, + { + "epoch": 1.772361230946218, + "grad_norm": 0.2591167653155897, + "learning_rate": 7.186352548295479e-06, + "loss": 0.4176, + "step": 12325 + }, + { + "epoch": 1.7730802415875755, + "grad_norm": 0.2565385002090668, + "learning_rate": 7.179128068225959e-06, + "loss": 0.417, + "step": 12330 + }, + { + "epoch": 1.773799252228933, + "grad_norm": 0.2513995475419433, + "learning_rate": 7.171905187387517e-06, + "loss": 0.4261, + "step": 12335 + }, + { + "epoch": 1.7745182628702905, + "grad_norm": 0.2540054014535628, + "learning_rate": 7.16468390987501e-06, + "loss": 0.4056, + "step": 12340 + }, + { + "epoch": 1.775237273511648, + "grad_norm": 0.26390233707389416, + "learning_rate": 7.1574642397823764e-06, + "loss": 0.4284, + "step": 12345 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.251247535422529, + "learning_rate": 7.150246181202648e-06, + "loss": 0.4165, + "step": 12350 + }, + { + "epoch": 1.776675294794363, + "grad_norm": 0.25335251899602307, + "learning_rate": 7.143029738227948e-06, + "loss": 0.3999, + "step": 12355 + }, + { + "epoch": 1.7773943054357204, + "grad_norm": 0.254603817611765, + "learning_rate": 7.135814914949479e-06, + "loss": 0.4183, + "step": 12360 + }, + { + "epoch": 1.778113316077078, + "grad_norm": 0.2627901777993984, + "learning_rate": 7.128601715457522e-06, + "loss": 0.4123, + "step": 12365 + }, + { + "epoch": 1.7788323267184354, + "grad_norm": 0.267622124641138, + "learning_rate": 7.1213901438414455e-06, + "loss": 0.4159, + "step": 12370 + }, + { + "epoch": 1.7795513373597929, + "grad_norm": 0.25978798992318286, + "learning_rate": 7.114180204189689e-06, + "loss": 0.4229, + "step": 12375 + }, + { + "epoch": 1.7802703480011504, + "grad_norm": 0.24647432047225887, + "learning_rate": 7.106971900589765e-06, + "loss": 0.4039, + "step": 12380 + }, + { + "epoch": 1.780989358642508, + "grad_norm": 0.2570864161270665, + "learning_rate": 7.099765237128271e-06, + "loss": 0.4201, + "step": 12385 + }, + { + "epoch": 1.7817083692838653, + "grad_norm": 0.24494223052253658, + "learning_rate": 7.0925602178908555e-06, + "loss": 0.4069, + "step": 12390 + }, + { + "epoch": 1.782427379925223, + "grad_norm": 0.24786662275056098, + "learning_rate": 7.085356846962256e-06, + "loss": 0.4088, + "step": 12395 + }, + { + "epoch": 1.7831463905665803, + "grad_norm": 0.27211048051117026, + "learning_rate": 7.078155128426256e-06, + "loss": 0.4086, + "step": 12400 + }, + { + "epoch": 1.783865401207938, + "grad_norm": 0.24763847807165335, + "learning_rate": 7.070955066365714e-06, + "loss": 0.4066, + "step": 12405 + }, + { + "epoch": 1.7845844118492953, + "grad_norm": 0.28744376769244007, + "learning_rate": 7.063756664862546e-06, + "loss": 0.4223, + "step": 12410 + }, + { + "epoch": 1.785303422490653, + "grad_norm": 0.25607430562276395, + "learning_rate": 7.056559927997728e-06, + "loss": 0.4165, + "step": 12415 + }, + { + "epoch": 1.7860224331320103, + "grad_norm": 0.25376526296389273, + "learning_rate": 7.049364859851286e-06, + "loss": 0.3973, + "step": 12420 + }, + { + "epoch": 1.786741443773368, + "grad_norm": 0.25481894320851795, + "learning_rate": 7.042171464502314e-06, + "loss": 0.4037, + "step": 12425 + }, + { + "epoch": 1.7874604544147252, + "grad_norm": 0.24849470990176253, + "learning_rate": 7.034979746028942e-06, + "loss": 0.4206, + "step": 12430 + }, + { + "epoch": 1.788179465056083, + "grad_norm": 0.2500859118547924, + "learning_rate": 7.027789708508355e-06, + "loss": 0.4141, + "step": 12435 + }, + { + "epoch": 1.7888984756974402, + "grad_norm": 0.2442725442196124, + "learning_rate": 7.020601356016793e-06, + "loss": 0.4161, + "step": 12440 + }, + { + "epoch": 1.789617486338798, + "grad_norm": 0.2689372064593602, + "learning_rate": 7.01341469262953e-06, + "loss": 0.412, + "step": 12445 + }, + { + "epoch": 1.7903364969801552, + "grad_norm": 0.2652584184287822, + "learning_rate": 7.0062297224208805e-06, + "loss": 0.4188, + "step": 12450 + }, + { + "epoch": 1.7910555076215129, + "grad_norm": 0.2548079457132161, + "learning_rate": 6.999046449464214e-06, + "loss": 0.4087, + "step": 12455 + }, + { + "epoch": 1.7917745182628702, + "grad_norm": 0.24320376487368286, + "learning_rate": 6.9918648778319264e-06, + "loss": 0.412, + "step": 12460 + }, + { + "epoch": 1.7924935289042279, + "grad_norm": 0.24889708442612957, + "learning_rate": 6.984685011595445e-06, + "loss": 0.4282, + "step": 12465 + }, + { + "epoch": 1.7932125395455851, + "grad_norm": 0.2533004872723184, + "learning_rate": 6.977506854825244e-06, + "loss": 0.4197, + "step": 12470 + }, + { + "epoch": 1.7939315501869428, + "grad_norm": 0.2480345197472715, + "learning_rate": 6.970330411590818e-06, + "loss": 0.4078, + "step": 12475 + }, + { + "epoch": 1.7946505608283, + "grad_norm": 0.2641405088587578, + "learning_rate": 6.963155685960689e-06, + "loss": 0.4037, + "step": 12480 + }, + { + "epoch": 1.7953695714696578, + "grad_norm": 0.27067691777708486, + "learning_rate": 6.955982682002419e-06, + "loss": 0.4337, + "step": 12485 + }, + { + "epoch": 1.7960885821110153, + "grad_norm": 0.25504037126841456, + "learning_rate": 6.948811403782574e-06, + "loss": 0.4285, + "step": 12490 + }, + { + "epoch": 1.7968075927523728, + "grad_norm": 0.2626109496981662, + "learning_rate": 6.941641855366761e-06, + "loss": 0.4136, + "step": 12495 + }, + { + "epoch": 1.7975266033937303, + "grad_norm": 0.2597384672585987, + "learning_rate": 6.93447404081959e-06, + "loss": 0.4144, + "step": 12500 + }, + { + "epoch": 1.7982456140350878, + "grad_norm": 0.25519689958709324, + "learning_rate": 6.927307964204695e-06, + "loss": 0.42, + "step": 12505 + }, + { + "epoch": 1.7989646246764452, + "grad_norm": 0.24724204883741235, + "learning_rate": 6.920143629584734e-06, + "loss": 0.4168, + "step": 12510 + }, + { + "epoch": 1.7996836353178027, + "grad_norm": 0.2617317305762162, + "learning_rate": 6.91298104102136e-06, + "loss": 0.415, + "step": 12515 + }, + { + "epoch": 1.8004026459591602, + "grad_norm": 0.2874116408502503, + "learning_rate": 6.905820202575245e-06, + "loss": 0.4172, + "step": 12520 + }, + { + "epoch": 1.8011216566005177, + "grad_norm": 0.2373847142610924, + "learning_rate": 6.898661118306074e-06, + "loss": 0.4065, + "step": 12525 + }, + { + "epoch": 1.8018406672418752, + "grad_norm": 0.24865915584559958, + "learning_rate": 6.891503792272525e-06, + "loss": 0.4202, + "step": 12530 + }, + { + "epoch": 1.8025596778832327, + "grad_norm": 0.24895704753808048, + "learning_rate": 6.884348228532287e-06, + "loss": 0.4181, + "step": 12535 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.26065825694480377, + "learning_rate": 6.877194431142055e-06, + "loss": 0.4141, + "step": 12540 + }, + { + "epoch": 1.8039976991659477, + "grad_norm": 0.26054346482482493, + "learning_rate": 6.870042404157513e-06, + "loss": 0.4122, + "step": 12545 + }, + { + "epoch": 1.8047167098073051, + "grad_norm": 0.2601813629726882, + "learning_rate": 6.862892151633339e-06, + "loss": 0.4271, + "step": 12550 + }, + { + "epoch": 1.8054357204486626, + "grad_norm": 0.25007485946893376, + "learning_rate": 6.855743677623219e-06, + "loss": 0.3967, + "step": 12555 + }, + { + "epoch": 1.8061547310900201, + "grad_norm": 0.26287949884764095, + "learning_rate": 6.848596986179821e-06, + "loss": 0.4113, + "step": 12560 + }, + { + "epoch": 1.8068737417313776, + "grad_norm": 0.2580383417830257, + "learning_rate": 6.841452081354799e-06, + "loss": 0.4142, + "step": 12565 + }, + { + "epoch": 1.807592752372735, + "grad_norm": 0.2544987361957738, + "learning_rate": 6.834308967198806e-06, + "loss": 0.4228, + "step": 12570 + }, + { + "epoch": 1.8083117630140926, + "grad_norm": 0.2463911431032656, + "learning_rate": 6.827167647761469e-06, + "loss": 0.408, + "step": 12575 + }, + { + "epoch": 1.80903077365545, + "grad_norm": 0.2647094856586929, + "learning_rate": 6.820028127091398e-06, + "loss": 0.4177, + "step": 12580 + }, + { + "epoch": 1.8097497842968076, + "grad_norm": 0.2552792364570339, + "learning_rate": 6.812890409236197e-06, + "loss": 0.4222, + "step": 12585 + }, + { + "epoch": 1.810468794938165, + "grad_norm": 0.2654823728887575, + "learning_rate": 6.805754498242429e-06, + "loss": 0.4217, + "step": 12590 + }, + { + "epoch": 1.8111878055795225, + "grad_norm": 0.24467511945177642, + "learning_rate": 6.798620398155642e-06, + "loss": 0.4107, + "step": 12595 + }, + { + "epoch": 1.8119068162208802, + "grad_norm": 0.24596016975854293, + "learning_rate": 6.791488113020359e-06, + "loss": 0.407, + "step": 12600 + }, + { + "epoch": 1.8126258268622375, + "grad_norm": 0.2638435716871941, + "learning_rate": 6.784357646880069e-06, + "loss": 0.4177, + "step": 12605 + }, + { + "epoch": 1.8133448375035952, + "grad_norm": 0.2527538982365996, + "learning_rate": 6.777229003777237e-06, + "loss": 0.4088, + "step": 12610 + }, + { + "epoch": 1.8140638481449525, + "grad_norm": 0.25342996535741125, + "learning_rate": 6.770102187753287e-06, + "loss": 0.4328, + "step": 12615 + }, + { + "epoch": 1.8147828587863102, + "grad_norm": 0.25121476066061776, + "learning_rate": 6.762977202848606e-06, + "loss": 0.3992, + "step": 12620 + }, + { + "epoch": 1.8155018694276674, + "grad_norm": 0.25413098892527763, + "learning_rate": 6.755854053102554e-06, + "loss": 0.4026, + "step": 12625 + }, + { + "epoch": 1.8162208800690252, + "grad_norm": 0.2554934683514058, + "learning_rate": 6.748732742553441e-06, + "loss": 0.4162, + "step": 12630 + }, + { + "epoch": 1.8169398907103824, + "grad_norm": 0.26198073463994964, + "learning_rate": 6.741613275238535e-06, + "loss": 0.4146, + "step": 12635 + }, + { + "epoch": 1.8176589013517401, + "grad_norm": 0.2725434061704402, + "learning_rate": 6.734495655194063e-06, + "loss": 0.4285, + "step": 12640 + }, + { + "epoch": 1.8183779119930974, + "grad_norm": 0.26161207219094385, + "learning_rate": 6.727379886455201e-06, + "loss": 0.4179, + "step": 12645 + }, + { + "epoch": 1.819096922634455, + "grad_norm": 0.25113077663083744, + "learning_rate": 6.720265973056077e-06, + "loss": 0.4136, + "step": 12650 + }, + { + "epoch": 1.8198159332758124, + "grad_norm": 0.25588530370887597, + "learning_rate": 6.713153919029769e-06, + "loss": 0.4226, + "step": 12655 + }, + { + "epoch": 1.82053494391717, + "grad_norm": 0.2524262779441796, + "learning_rate": 6.7060437284083004e-06, + "loss": 0.4025, + "step": 12660 + }, + { + "epoch": 1.8212539545585273, + "grad_norm": 0.25099598105760423, + "learning_rate": 6.698935405222628e-06, + "loss": 0.4086, + "step": 12665 + }, + { + "epoch": 1.821972965199885, + "grad_norm": 0.25154432322773645, + "learning_rate": 6.691828953502673e-06, + "loss": 0.4042, + "step": 12670 + }, + { + "epoch": 1.8226919758412423, + "grad_norm": 0.26731049015276837, + "learning_rate": 6.684724377277267e-06, + "loss": 0.4309, + "step": 12675 + }, + { + "epoch": 1.8234109864826, + "grad_norm": 0.25097642688246163, + "learning_rate": 6.6776216805742e-06, + "loss": 0.4071, + "step": 12680 + }, + { + "epoch": 1.8241299971239573, + "grad_norm": 0.2542332473785001, + "learning_rate": 6.670520867420191e-06, + "loss": 0.4313, + "step": 12685 + }, + { + "epoch": 1.824849007765315, + "grad_norm": 0.2545845953821712, + "learning_rate": 6.663421941840889e-06, + "loss": 0.4106, + "step": 12690 + }, + { + "epoch": 1.8255680184066723, + "grad_norm": 0.27993315805553826, + "learning_rate": 6.656324907860864e-06, + "loss": 0.4117, + "step": 12695 + }, + { + "epoch": 1.82628702904803, + "grad_norm": 0.25662187192675506, + "learning_rate": 6.649229769503632e-06, + "loss": 0.3998, + "step": 12700 + }, + { + "epoch": 1.8270060396893872, + "grad_norm": 0.26107627516777776, + "learning_rate": 6.642136530791626e-06, + "loss": 0.4114, + "step": 12705 + }, + { + "epoch": 1.827725050330745, + "grad_norm": 0.2529386238008444, + "learning_rate": 6.635045195746192e-06, + "loss": 0.4183, + "step": 12710 + }, + { + "epoch": 1.8284440609721024, + "grad_norm": 0.2605371133189607, + "learning_rate": 6.627955768387616e-06, + "loss": 0.4251, + "step": 12715 + }, + { + "epoch": 1.82916307161346, + "grad_norm": 0.2847902105830355, + "learning_rate": 6.620868252735084e-06, + "loss": 0.4048, + "step": 12720 + }, + { + "epoch": 1.8298820822548174, + "grad_norm": 0.25081838850235133, + "learning_rate": 6.613782652806713e-06, + "loss": 0.4115, + "step": 12725 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 0.2542820360088698, + "learning_rate": 6.6066989726195265e-06, + "loss": 0.3999, + "step": 12730 + }, + { + "epoch": 1.8313201035375324, + "grad_norm": 0.28428395658288935, + "learning_rate": 6.599617216189456e-06, + "loss": 0.4176, + "step": 12735 + }, + { + "epoch": 1.8320391141788899, + "grad_norm": 0.2727531478489274, + "learning_rate": 6.5925373875313524e-06, + "loss": 0.3978, + "step": 12740 + }, + { + "epoch": 1.8327581248202474, + "grad_norm": 0.2635639047940844, + "learning_rate": 6.5854594906589655e-06, + "loss": 0.4236, + "step": 12745 + }, + { + "epoch": 1.8334771354616048, + "grad_norm": 0.26708872883780405, + "learning_rate": 6.578383529584949e-06, + "loss": 0.4161, + "step": 12750 + }, + { + "epoch": 1.8341961461029623, + "grad_norm": 0.26688572699946095, + "learning_rate": 6.571309508320873e-06, + "loss": 0.4233, + "step": 12755 + }, + { + "epoch": 1.8349151567443198, + "grad_norm": 0.26384569268835023, + "learning_rate": 6.564237430877192e-06, + "loss": 0.4087, + "step": 12760 + }, + { + "epoch": 1.8356341673856773, + "grad_norm": 0.27821558281067704, + "learning_rate": 6.557167301263258e-06, + "loss": 0.4052, + "step": 12765 + }, + { + "epoch": 1.8363531780270348, + "grad_norm": 0.259505941454427, + "learning_rate": 6.550099123487336e-06, + "loss": 0.4102, + "step": 12770 + }, + { + "epoch": 1.8370721886683923, + "grad_norm": 0.25155415594346603, + "learning_rate": 6.543032901556569e-06, + "loss": 0.4187, + "step": 12775 + }, + { + "epoch": 1.8377911993097498, + "grad_norm": 0.2634295028970137, + "learning_rate": 6.5359686394769905e-06, + "loss": 0.4074, + "step": 12780 + }, + { + "epoch": 1.8385102099511073, + "grad_norm": 0.25269820552671673, + "learning_rate": 6.528906341253536e-06, + "loss": 0.4201, + "step": 12785 + }, + { + "epoch": 1.8392292205924647, + "grad_norm": 0.2520139078540608, + "learning_rate": 6.521846010890014e-06, + "loss": 0.4208, + "step": 12790 + }, + { + "epoch": 1.8399482312338222, + "grad_norm": 0.25448265431325007, + "learning_rate": 6.514787652389125e-06, + "loss": 0.4127, + "step": 12795 + }, + { + "epoch": 1.8406672418751797, + "grad_norm": 0.25435034988332555, + "learning_rate": 6.507731269752448e-06, + "loss": 0.433, + "step": 12800 + }, + { + "epoch": 1.8413862525165372, + "grad_norm": 0.25312210692886666, + "learning_rate": 6.500676866980449e-06, + "loss": 0.4073, + "step": 12805 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.25850193244283504, + "learning_rate": 6.4936244480724575e-06, + "loss": 0.3943, + "step": 12810 + }, + { + "epoch": 1.8428242737992522, + "grad_norm": 0.2654030793300329, + "learning_rate": 6.486574017026694e-06, + "loss": 0.4157, + "step": 12815 + }, + { + "epoch": 1.8435432844406097, + "grad_norm": 0.2584387715247146, + "learning_rate": 6.4795255778402375e-06, + "loss": 0.4032, + "step": 12820 + }, + { + "epoch": 1.8442622950819674, + "grad_norm": 0.2654331773866595, + "learning_rate": 6.472479134509052e-06, + "loss": 0.4061, + "step": 12825 + }, + { + "epoch": 1.8449813057233246, + "grad_norm": 0.25224831243599477, + "learning_rate": 6.465434691027963e-06, + "loss": 0.4144, + "step": 12830 + }, + { + "epoch": 1.8457003163646823, + "grad_norm": 0.24964713143355033, + "learning_rate": 6.458392251390654e-06, + "loss": 0.4234, + "step": 12835 + }, + { + "epoch": 1.8464193270060396, + "grad_norm": 0.2729379241895573, + "learning_rate": 6.45135181958969e-06, + "loss": 0.435, + "step": 12840 + }, + { + "epoch": 1.8471383376473973, + "grad_norm": 0.26302708975701156, + "learning_rate": 6.4443133996164844e-06, + "loss": 0.4125, + "step": 12845 + }, + { + "epoch": 1.8478573482887546, + "grad_norm": 0.2555072174920062, + "learning_rate": 6.437276995461311e-06, + "loss": 0.4058, + "step": 12850 + }, + { + "epoch": 1.8485763589301123, + "grad_norm": 0.25696777543860977, + "learning_rate": 6.430242611113312e-06, + "loss": 0.4202, + "step": 12855 + }, + { + "epoch": 1.8492953695714696, + "grad_norm": 0.2774263836915849, + "learning_rate": 6.423210250560471e-06, + "loss": 0.414, + "step": 12860 + }, + { + "epoch": 1.8500143802128273, + "grad_norm": 0.25292831806005195, + "learning_rate": 6.4161799177896265e-06, + "loss": 0.4246, + "step": 12865 + }, + { + "epoch": 1.8507333908541845, + "grad_norm": 0.2523045428415083, + "learning_rate": 6.409151616786475e-06, + "loss": 0.4077, + "step": 12870 + }, + { + "epoch": 1.8514524014955422, + "grad_norm": 0.260183429075673, + "learning_rate": 6.402125351535557e-06, + "loss": 0.4137, + "step": 12875 + }, + { + "epoch": 1.8521714121368995, + "grad_norm": 0.25793382343739896, + "learning_rate": 6.395101126020256e-06, + "loss": 0.4201, + "step": 12880 + }, + { + "epoch": 1.8528904227782572, + "grad_norm": 0.2534260150257859, + "learning_rate": 6.388078944222804e-06, + "loss": 0.4015, + "step": 12885 + }, + { + "epoch": 1.8536094334196145, + "grad_norm": 0.24802590654137802, + "learning_rate": 6.38105881012427e-06, + "loss": 0.4148, + "step": 12890 + }, + { + "epoch": 1.8543284440609722, + "grad_norm": 0.25789258495597844, + "learning_rate": 6.374040727704562e-06, + "loss": 0.4012, + "step": 12895 + }, + { + "epoch": 1.8550474547023295, + "grad_norm": 0.26345993239303045, + "learning_rate": 6.367024700942435e-06, + "loss": 0.4096, + "step": 12900 + }, + { + "epoch": 1.8557664653436872, + "grad_norm": 0.2668160591818745, + "learning_rate": 6.360010733815465e-06, + "loss": 0.4047, + "step": 12905 + }, + { + "epoch": 1.8564854759850444, + "grad_norm": 0.2566605802151127, + "learning_rate": 6.352998830300061e-06, + "loss": 0.4265, + "step": 12910 + }, + { + "epoch": 1.8572044866264021, + "grad_norm": 0.2759657295529972, + "learning_rate": 6.345988994371477e-06, + "loss": 0.4189, + "step": 12915 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.2552560819729121, + "learning_rate": 6.3389812300037774e-06, + "loss": 0.4065, + "step": 12920 + }, + { + "epoch": 1.8586425079091171, + "grad_norm": 0.24802835502600706, + "learning_rate": 6.33197554116986e-06, + "loss": 0.4004, + "step": 12925 + }, + { + "epoch": 1.8593615185504746, + "grad_norm": 0.3281056503270654, + "learning_rate": 6.324971931841453e-06, + "loss": 0.4188, + "step": 12930 + }, + { + "epoch": 1.860080529191832, + "grad_norm": 0.25313620800312736, + "learning_rate": 6.317970405989086e-06, + "loss": 0.4176, + "step": 12935 + }, + { + "epoch": 1.8607995398331896, + "grad_norm": 0.25855560585825904, + "learning_rate": 6.310970967582131e-06, + "loss": 0.4116, + "step": 12940 + }, + { + "epoch": 1.861518550474547, + "grad_norm": 0.26802055162752714, + "learning_rate": 6.303973620588757e-06, + "loss": 0.4169, + "step": 12945 + }, + { + "epoch": 1.8622375611159045, + "grad_norm": 0.2578356282861538, + "learning_rate": 6.296978368975958e-06, + "loss": 0.4217, + "step": 12950 + }, + { + "epoch": 1.862956571757262, + "grad_norm": 0.26944809309820655, + "learning_rate": 6.289985216709542e-06, + "loss": 0.4283, + "step": 12955 + }, + { + "epoch": 1.8636755823986195, + "grad_norm": 0.25758698256009627, + "learning_rate": 6.282994167754117e-06, + "loss": 0.4156, + "step": 12960 + }, + { + "epoch": 1.864394593039977, + "grad_norm": 0.2481740645545019, + "learning_rate": 6.276005226073103e-06, + "loss": 0.412, + "step": 12965 + }, + { + "epoch": 1.8651136036813345, + "grad_norm": 0.258105239791881, + "learning_rate": 6.26901839562873e-06, + "loss": 0.3994, + "step": 12970 + }, + { + "epoch": 1.865832614322692, + "grad_norm": 0.2614869674290543, + "learning_rate": 6.262033680382027e-06, + "loss": 0.4363, + "step": 12975 + }, + { + "epoch": 1.8665516249640495, + "grad_norm": 0.24276092784851724, + "learning_rate": 6.255051084292821e-06, + "loss": 0.4002, + "step": 12980 + }, + { + "epoch": 1.867270635605407, + "grad_norm": 0.25774303470890625, + "learning_rate": 6.2480706113197445e-06, + "loss": 0.4014, + "step": 12985 + }, + { + "epoch": 1.8679896462467644, + "grad_norm": 0.26867975206439915, + "learning_rate": 6.241092265420219e-06, + "loss": 0.409, + "step": 12990 + }, + { + "epoch": 1.868708656888122, + "grad_norm": 0.25712796179182074, + "learning_rate": 6.2341160505504636e-06, + "loss": 0.422, + "step": 12995 + }, + { + "epoch": 1.8694276675294794, + "grad_norm": 0.26283971628716823, + "learning_rate": 6.227141970665496e-06, + "loss": 0.4163, + "step": 13000 + }, + { + "epoch": 1.870146678170837, + "grad_norm": 0.26169927239092655, + "learning_rate": 6.220170029719111e-06, + "loss": 0.4106, + "step": 13005 + }, + { + "epoch": 1.8708656888121944, + "grad_norm": 0.24783098567903028, + "learning_rate": 6.213200231663894e-06, + "loss": 0.4216, + "step": 13010 + }, + { + "epoch": 1.8715846994535519, + "grad_norm": 0.2640476765001785, + "learning_rate": 6.206232580451225e-06, + "loss": 0.4137, + "step": 13015 + }, + { + "epoch": 1.8723037100949094, + "grad_norm": 0.24416363628731877, + "learning_rate": 6.199267080031257e-06, + "loss": 0.3997, + "step": 13020 + }, + { + "epoch": 1.8730227207362669, + "grad_norm": 0.25814773161479015, + "learning_rate": 6.192303734352925e-06, + "loss": 0.4153, + "step": 13025 + }, + { + "epoch": 1.8737417313776243, + "grad_norm": 0.2569713415930947, + "learning_rate": 6.185342547363947e-06, + "loss": 0.412, + "step": 13030 + }, + { + "epoch": 1.8744607420189818, + "grad_norm": 0.2516973791241243, + "learning_rate": 6.178383523010813e-06, + "loss": 0.4111, + "step": 13035 + }, + { + "epoch": 1.8751797526603395, + "grad_norm": 0.26717730576239873, + "learning_rate": 6.171426665238787e-06, + "loss": 0.4258, + "step": 13040 + }, + { + "epoch": 1.8758987633016968, + "grad_norm": 0.2525000043033776, + "learning_rate": 6.164471977991908e-06, + "loss": 0.4084, + "step": 13045 + }, + { + "epoch": 1.8766177739430545, + "grad_norm": 0.2568542615635378, + "learning_rate": 6.15751946521298e-06, + "loss": 0.4228, + "step": 13050 + }, + { + "epoch": 1.8773367845844118, + "grad_norm": 0.25254186094659625, + "learning_rate": 6.150569130843582e-06, + "loss": 0.411, + "step": 13055 + }, + { + "epoch": 1.8780557952257695, + "grad_norm": 0.26287766874238017, + "learning_rate": 6.143620978824048e-06, + "loss": 0.4057, + "step": 13060 + }, + { + "epoch": 1.8787748058671268, + "grad_norm": 0.2691898986952207, + "learning_rate": 6.1366750130934785e-06, + "loss": 0.4093, + "step": 13065 + }, + { + "epoch": 1.8794938165084845, + "grad_norm": 0.24589000962412044, + "learning_rate": 6.129731237589738e-06, + "loss": 0.3976, + "step": 13070 + }, + { + "epoch": 1.8802128271498417, + "grad_norm": 0.24933464122253188, + "learning_rate": 6.1227896562494485e-06, + "loss": 0.3975, + "step": 13075 + }, + { + "epoch": 1.8809318377911994, + "grad_norm": 0.2551105905003046, + "learning_rate": 6.11585027300798e-06, + "loss": 0.4197, + "step": 13080 + }, + { + "epoch": 1.8816508484325567, + "grad_norm": 0.2548425666015531, + "learning_rate": 6.10891309179947e-06, + "loss": 0.4134, + "step": 13085 + }, + { + "epoch": 1.8823698590739144, + "grad_norm": 0.2519283793583434, + "learning_rate": 6.1019781165567946e-06, + "loss": 0.4058, + "step": 13090 + }, + { + "epoch": 1.8830888697152717, + "grad_norm": 0.258597723985932, + "learning_rate": 6.095045351211586e-06, + "loss": 0.4083, + "step": 13095 + }, + { + "epoch": 1.8838078803566294, + "grad_norm": 0.2571978705208484, + "learning_rate": 6.088114799694229e-06, + "loss": 0.4177, + "step": 13100 + }, + { + "epoch": 1.8845268909979866, + "grad_norm": 0.25178796849206486, + "learning_rate": 6.081186465933839e-06, + "loss": 0.4056, + "step": 13105 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.2474711452655329, + "learning_rate": 6.074260353858283e-06, + "loss": 0.4215, + "step": 13110 + }, + { + "epoch": 1.8859649122807016, + "grad_norm": 0.2570837114079029, + "learning_rate": 6.067336467394169e-06, + "loss": 0.395, + "step": 13115 + }, + { + "epoch": 1.8866839229220593, + "grad_norm": 0.26398409651585353, + "learning_rate": 6.060414810466844e-06, + "loss": 0.4118, + "step": 13120 + }, + { + "epoch": 1.8874029335634166, + "grad_norm": 0.24110724622419008, + "learning_rate": 6.053495387000382e-06, + "loss": 0.3981, + "step": 13125 + }, + { + "epoch": 1.8881219442047743, + "grad_norm": 0.25341520879124657, + "learning_rate": 6.0465782009176056e-06, + "loss": 0.4209, + "step": 13130 + }, + { + "epoch": 1.8888409548461316, + "grad_norm": 0.25898627368433486, + "learning_rate": 6.039663256140055e-06, + "loss": 0.4053, + "step": 13135 + }, + { + "epoch": 1.8895599654874893, + "grad_norm": 0.26413833142848836, + "learning_rate": 6.032750556588004e-06, + "loss": 0.4044, + "step": 13140 + }, + { + "epoch": 1.8902789761288465, + "grad_norm": 0.26163899057288426, + "learning_rate": 6.0258401061804625e-06, + "loss": 0.4061, + "step": 13145 + }, + { + "epoch": 1.8909979867702043, + "grad_norm": 0.2626566632795502, + "learning_rate": 6.01893190883515e-06, + "loss": 0.4197, + "step": 13150 + }, + { + "epoch": 1.8917169974115617, + "grad_norm": 0.24307479703105075, + "learning_rate": 6.012025968468525e-06, + "loss": 0.4182, + "step": 13155 + }, + { + "epoch": 1.8924360080529192, + "grad_norm": 0.2566826095216113, + "learning_rate": 6.005122288995748e-06, + "loss": 0.4163, + "step": 13160 + }, + { + "epoch": 1.8931550186942767, + "grad_norm": 0.2560152561936091, + "learning_rate": 5.998220874330714e-06, + "loss": 0.4284, + "step": 13165 + }, + { + "epoch": 1.8938740293356342, + "grad_norm": 0.26048476462091, + "learning_rate": 5.991321728386028e-06, + "loss": 0.4049, + "step": 13170 + }, + { + "epoch": 1.8945930399769917, + "grad_norm": 0.2585226989827463, + "learning_rate": 5.984424855073007e-06, + "loss": 0.431, + "step": 13175 + }, + { + "epoch": 1.8953120506183492, + "grad_norm": 0.26135587087437545, + "learning_rate": 5.977530258301678e-06, + "loss": 0.4132, + "step": 13180 + }, + { + "epoch": 1.8960310612597067, + "grad_norm": 0.2523667592703136, + "learning_rate": 5.970637941980786e-06, + "loss": 0.3932, + "step": 13185 + }, + { + "epoch": 1.8967500719010641, + "grad_norm": 0.24588638369790913, + "learning_rate": 5.963747910017774e-06, + "loss": 0.4186, + "step": 13190 + }, + { + "epoch": 1.8974690825424216, + "grad_norm": 0.2537110075846357, + "learning_rate": 5.956860166318792e-06, + "loss": 0.4132, + "step": 13195 + }, + { + "epoch": 1.8981880931837791, + "grad_norm": 0.24279395102296025, + "learning_rate": 5.949974714788702e-06, + "loss": 0.4037, + "step": 13200 + }, + { + "epoch": 1.8989071038251366, + "grad_norm": 0.2658080544254367, + "learning_rate": 5.943091559331054e-06, + "loss": 0.3998, + "step": 13205 + }, + { + "epoch": 1.899626114466494, + "grad_norm": 0.2710711751352035, + "learning_rate": 5.936210703848095e-06, + "loss": 0.4138, + "step": 13210 + }, + { + "epoch": 1.9003451251078516, + "grad_norm": 0.2500168805986903, + "learning_rate": 5.929332152240782e-06, + "loss": 0.4035, + "step": 13215 + }, + { + "epoch": 1.901064135749209, + "grad_norm": 0.2586364723380644, + "learning_rate": 5.922455908408757e-06, + "loss": 0.4062, + "step": 13220 + }, + { + "epoch": 1.9017831463905666, + "grad_norm": 0.2602658295039629, + "learning_rate": 5.915581976250351e-06, + "loss": 0.4154, + "step": 13225 + }, + { + "epoch": 1.902502157031924, + "grad_norm": 0.2646875529076759, + "learning_rate": 5.908710359662595e-06, + "loss": 0.4235, + "step": 13230 + }, + { + "epoch": 1.9032211676732815, + "grad_norm": 0.25362401233014675, + "learning_rate": 5.901841062541192e-06, + "loss": 0.4195, + "step": 13235 + }, + { + "epoch": 1.903940178314639, + "grad_norm": 0.2522932738809794, + "learning_rate": 5.894974088780543e-06, + "loss": 0.4002, + "step": 13240 + }, + { + "epoch": 1.9046591889559965, + "grad_norm": 0.2641327392718366, + "learning_rate": 5.888109442273729e-06, + "loss": 0.4084, + "step": 13245 + }, + { + "epoch": 1.905378199597354, + "grad_norm": 0.24629504499371987, + "learning_rate": 5.881247126912506e-06, + "loss": 0.4099, + "step": 13250 + }, + { + "epoch": 1.9060972102387115, + "grad_norm": 0.2542730372298661, + "learning_rate": 5.874387146587311e-06, + "loss": 0.4094, + "step": 13255 + }, + { + "epoch": 1.906816220880069, + "grad_norm": 0.2550950761308887, + "learning_rate": 5.867529505187264e-06, + "loss": 0.4031, + "step": 13260 + }, + { + "epoch": 1.9075352315214267, + "grad_norm": 0.2625217431254056, + "learning_rate": 5.860674206600145e-06, + "loss": 0.4129, + "step": 13265 + }, + { + "epoch": 1.908254242162784, + "grad_norm": 0.2557313481219586, + "learning_rate": 5.853821254712426e-06, + "loss": 0.3976, + "step": 13270 + }, + { + "epoch": 1.9089732528041417, + "grad_norm": 0.25342454097146244, + "learning_rate": 5.8469706534092315e-06, + "loss": 0.3964, + "step": 13275 + }, + { + "epoch": 1.909692263445499, + "grad_norm": 0.2478590136487242, + "learning_rate": 5.840122406574352e-06, + "loss": 0.402, + "step": 13280 + }, + { + "epoch": 1.9104112740868566, + "grad_norm": 0.25092233391931035, + "learning_rate": 5.833276518090261e-06, + "loss": 0.413, + "step": 13285 + }, + { + "epoch": 1.911130284728214, + "grad_norm": 0.25427748407834705, + "learning_rate": 5.826432991838077e-06, + "loss": 0.4184, + "step": 13290 + }, + { + "epoch": 1.9118492953695716, + "grad_norm": 0.26016680851467244, + "learning_rate": 5.819591831697584e-06, + "loss": 0.4262, + "step": 13295 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.2753524866938533, + "learning_rate": 5.81275304154723e-06, + "loss": 0.421, + "step": 13300 + }, + { + "epoch": 1.9132873166522866, + "grad_norm": 0.2614118633604326, + "learning_rate": 5.805916625264121e-06, + "loss": 0.4089, + "step": 13305 + }, + { + "epoch": 1.9140063272936438, + "grad_norm": 0.2551883246804781, + "learning_rate": 5.799082586724003e-06, + "loss": 0.4057, + "step": 13310 + }, + { + "epoch": 1.9147253379350015, + "grad_norm": 0.3150339976324254, + "learning_rate": 5.792250929801292e-06, + "loss": 0.4191, + "step": 13315 + }, + { + "epoch": 1.9154443485763588, + "grad_norm": 0.2567802822715765, + "learning_rate": 5.785421658369041e-06, + "loss": 0.4276, + "step": 13320 + }, + { + "epoch": 1.9161633592177165, + "grad_norm": 0.25215346030617053, + "learning_rate": 5.7785947762989515e-06, + "loss": 0.4148, + "step": 13325 + }, + { + "epoch": 1.9168823698590738, + "grad_norm": 0.26247158778480834, + "learning_rate": 5.771770287461381e-06, + "loss": 0.4112, + "step": 13330 + }, + { + "epoch": 1.9176013805004315, + "grad_norm": 0.2593393629670231, + "learning_rate": 5.7649481957253195e-06, + "loss": 0.4107, + "step": 13335 + }, + { + "epoch": 1.9183203911417888, + "grad_norm": 0.26044429642699995, + "learning_rate": 5.758128504958396e-06, + "loss": 0.417, + "step": 13340 + }, + { + "epoch": 1.9190394017831465, + "grad_norm": 0.26474646616105685, + "learning_rate": 5.751311219026887e-06, + "loss": 0.419, + "step": 13345 + }, + { + "epoch": 1.9197584124245037, + "grad_norm": 0.2557340922976566, + "learning_rate": 5.744496341795709e-06, + "loss": 0.4199, + "step": 13350 + }, + { + "epoch": 1.9204774230658614, + "grad_norm": 0.25120018293705426, + "learning_rate": 5.737683877128396e-06, + "loss": 0.4138, + "step": 13355 + }, + { + "epoch": 1.9211964337072187, + "grad_norm": 0.25802447772756554, + "learning_rate": 5.730873828887133e-06, + "loss": 0.4358, + "step": 13360 + }, + { + "epoch": 1.9219154443485764, + "grad_norm": 0.268303239943271, + "learning_rate": 5.724066200932724e-06, + "loss": 0.391, + "step": 13365 + }, + { + "epoch": 1.922634454989934, + "grad_norm": 0.266224674709648, + "learning_rate": 5.717260997124597e-06, + "loss": 0.4182, + "step": 13370 + }, + { + "epoch": 1.9233534656312914, + "grad_norm": 0.25210315862281496, + "learning_rate": 5.710458221320823e-06, + "loss": 0.4069, + "step": 13375 + }, + { + "epoch": 1.9240724762726489, + "grad_norm": 0.258824638291291, + "learning_rate": 5.703657877378074e-06, + "loss": 0.4149, + "step": 13380 + }, + { + "epoch": 1.9247914869140064, + "grad_norm": 0.2581183392991827, + "learning_rate": 5.696859969151664e-06, + "loss": 0.3981, + "step": 13385 + }, + { + "epoch": 1.9255104975553639, + "grad_norm": 0.24773348494876463, + "learning_rate": 5.6900645004955155e-06, + "loss": 0.4234, + "step": 13390 + }, + { + "epoch": 1.9262295081967213, + "grad_norm": 0.24356325001336865, + "learning_rate": 5.683271475262165e-06, + "loss": 0.4102, + "step": 13395 + }, + { + "epoch": 1.9269485188380788, + "grad_norm": 0.25903420847328895, + "learning_rate": 5.676480897302767e-06, + "loss": 0.4045, + "step": 13400 + }, + { + "epoch": 1.9276675294794363, + "grad_norm": 0.2598026496554222, + "learning_rate": 5.669692770467101e-06, + "loss": 0.4305, + "step": 13405 + }, + { + "epoch": 1.9283865401207938, + "grad_norm": 0.25495982306280357, + "learning_rate": 5.6629070986035336e-06, + "loss": 0.4123, + "step": 13410 + }, + { + "epoch": 1.9291055507621513, + "grad_norm": 0.25261257640552676, + "learning_rate": 5.6561238855590605e-06, + "loss": 0.3984, + "step": 13415 + }, + { + "epoch": 1.9298245614035088, + "grad_norm": 0.2549778638324147, + "learning_rate": 5.649343135179271e-06, + "loss": 0.4176, + "step": 13420 + }, + { + "epoch": 1.9305435720448663, + "grad_norm": 0.2630585276751543, + "learning_rate": 5.642564851308356e-06, + "loss": 0.413, + "step": 13425 + }, + { + "epoch": 1.9312625826862237, + "grad_norm": 0.2611508569793235, + "learning_rate": 5.635789037789126e-06, + "loss": 0.4117, + "step": 13430 + }, + { + "epoch": 1.9319815933275812, + "grad_norm": 0.2622970798730447, + "learning_rate": 5.629015698462969e-06, + "loss": 0.4215, + "step": 13435 + }, + { + "epoch": 1.9327006039689387, + "grad_norm": 0.2704859040010164, + "learning_rate": 5.622244837169881e-06, + "loss": 0.4196, + "step": 13440 + }, + { + "epoch": 1.9334196146102962, + "grad_norm": 0.25127827927080826, + "learning_rate": 5.615476457748456e-06, + "loss": 0.4311, + "step": 13445 + }, + { + "epoch": 1.9341386252516537, + "grad_norm": 0.2596218554377162, + "learning_rate": 5.6087105640358794e-06, + "loss": 0.412, + "step": 13450 + }, + { + "epoch": 1.9348576358930112, + "grad_norm": 0.26981195401964797, + "learning_rate": 5.6019471598679176e-06, + "loss": 0.4086, + "step": 13455 + }, + { + "epoch": 1.9355766465343687, + "grad_norm": 0.2645329644303195, + "learning_rate": 5.595186249078943e-06, + "loss": 0.4126, + "step": 13460 + }, + { + "epoch": 1.9362956571757262, + "grad_norm": 0.25139715586511135, + "learning_rate": 5.588427835501899e-06, + "loss": 0.4078, + "step": 13465 + }, + { + "epoch": 1.9370146678170836, + "grad_norm": 0.26008467818228065, + "learning_rate": 5.581671922968316e-06, + "loss": 0.4313, + "step": 13470 + }, + { + "epoch": 1.9377336784584411, + "grad_norm": 0.2598127792714966, + "learning_rate": 5.574918515308316e-06, + "loss": 0.4104, + "step": 13475 + }, + { + "epoch": 1.9384526890997988, + "grad_norm": 0.26726922409397147, + "learning_rate": 5.568167616350588e-06, + "loss": 0.4097, + "step": 13480 + }, + { + "epoch": 1.939171699741156, + "grad_norm": 0.2586683894957867, + "learning_rate": 5.561419229922414e-06, + "loss": 0.3944, + "step": 13485 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 0.2545620194841944, + "learning_rate": 5.554673359849632e-06, + "loss": 0.4045, + "step": 13490 + }, + { + "epoch": 1.940609721023871, + "grad_norm": 0.255270974055844, + "learning_rate": 5.5479300099566735e-06, + "loss": 0.4124, + "step": 13495 + }, + { + "epoch": 1.9413287316652288, + "grad_norm": 0.2824456607568959, + "learning_rate": 5.541189184066524e-06, + "loss": 0.4144, + "step": 13500 + }, + { + "epoch": 1.942047742306586, + "grad_norm": 0.26608110665659906, + "learning_rate": 5.534450886000754e-06, + "loss": 0.3896, + "step": 13505 + }, + { + "epoch": 1.9427667529479438, + "grad_norm": 0.2630987060749629, + "learning_rate": 5.527715119579484e-06, + "loss": 0.4041, + "step": 13510 + }, + { + "epoch": 1.943485763589301, + "grad_norm": 0.25227145550355407, + "learning_rate": 5.520981888621419e-06, + "loss": 0.399, + "step": 13515 + }, + { + "epoch": 1.9442047742306587, + "grad_norm": 0.2568201628350947, + "learning_rate": 5.514251196943808e-06, + "loss": 0.4043, + "step": 13520 + }, + { + "epoch": 1.944923784872016, + "grad_norm": 0.24854524537821318, + "learning_rate": 5.507523048362464e-06, + "loss": 0.4037, + "step": 13525 + }, + { + "epoch": 1.9456427955133737, + "grad_norm": 0.24892934206827358, + "learning_rate": 5.5007974466917745e-06, + "loss": 0.4061, + "step": 13530 + }, + { + "epoch": 1.946361806154731, + "grad_norm": 0.25452226933530475, + "learning_rate": 5.494074395744663e-06, + "loss": 0.4195, + "step": 13535 + }, + { + "epoch": 1.9470808167960887, + "grad_norm": 0.2640790007497816, + "learning_rate": 5.487353899332613e-06, + "loss": 0.4066, + "step": 13540 + }, + { + "epoch": 1.947799827437446, + "grad_norm": 0.2571805148580205, + "learning_rate": 5.480635961265663e-06, + "loss": 0.4171, + "step": 13545 + }, + { + "epoch": 1.9485188380788037, + "grad_norm": 0.2596407932136947, + "learning_rate": 5.473920585352408e-06, + "loss": 0.4178, + "step": 13550 + }, + { + "epoch": 1.949237848720161, + "grad_norm": 0.2508756708764493, + "learning_rate": 5.46720777539997e-06, + "loss": 0.4195, + "step": 13555 + }, + { + "epoch": 1.9499568593615186, + "grad_norm": 0.25792489723424716, + "learning_rate": 5.460497535214037e-06, + "loss": 0.4141, + "step": 13560 + }, + { + "epoch": 1.950675870002876, + "grad_norm": 0.25915427113602196, + "learning_rate": 5.453789868598831e-06, + "loss": 0.3975, + "step": 13565 + }, + { + "epoch": 1.9513948806442336, + "grad_norm": 0.2615882332919308, + "learning_rate": 5.447084779357108e-06, + "loss": 0.403, + "step": 13570 + }, + { + "epoch": 1.9521138912855909, + "grad_norm": 0.2764317227668451, + "learning_rate": 5.4403822712901784e-06, + "loss": 0.4106, + "step": 13575 + }, + { + "epoch": 1.9528329019269486, + "grad_norm": 0.26583620020318643, + "learning_rate": 5.43368234819788e-06, + "loss": 0.4073, + "step": 13580 + }, + { + "epoch": 1.9535519125683058, + "grad_norm": 0.2510693373522846, + "learning_rate": 5.42698501387858e-06, + "loss": 0.4073, + "step": 13585 + }, + { + "epoch": 1.9542709232096636, + "grad_norm": 0.2754723365853278, + "learning_rate": 5.420290272129189e-06, + "loss": 0.417, + "step": 13590 + }, + { + "epoch": 1.954989933851021, + "grad_norm": 0.26119369021393657, + "learning_rate": 5.413598126745143e-06, + "loss": 0.4086, + "step": 13595 + }, + { + "epoch": 1.9557089444923785, + "grad_norm": 0.264200670292894, + "learning_rate": 5.406908581520411e-06, + "loss": 0.4234, + "step": 13600 + }, + { + "epoch": 1.956427955133736, + "grad_norm": 0.25025560341334857, + "learning_rate": 5.400221640247476e-06, + "loss": 0.4014, + "step": 13605 + }, + { + "epoch": 1.9571469657750935, + "grad_norm": 0.24933665275190425, + "learning_rate": 5.393537306717351e-06, + "loss": 0.4167, + "step": 13610 + }, + { + "epoch": 1.957865976416451, + "grad_norm": 0.25356414864778404, + "learning_rate": 5.386855584719578e-06, + "loss": 0.4021, + "step": 13615 + }, + { + "epoch": 1.9585849870578085, + "grad_norm": 0.2589269061483978, + "learning_rate": 5.380176478042207e-06, + "loss": 0.4125, + "step": 13620 + }, + { + "epoch": 1.959303997699166, + "grad_norm": 0.25559736392321186, + "learning_rate": 5.373499990471809e-06, + "loss": 0.4209, + "step": 13625 + }, + { + "epoch": 1.9600230083405235, + "grad_norm": 0.2597373727126001, + "learning_rate": 5.3668261257934766e-06, + "loss": 0.4205, + "step": 13630 + }, + { + "epoch": 1.960742018981881, + "grad_norm": 0.2549162950845531, + "learning_rate": 5.360154887790806e-06, + "loss": 0.4124, + "step": 13635 + }, + { + "epoch": 1.9614610296232384, + "grad_norm": 0.2606798442782481, + "learning_rate": 5.353486280245905e-06, + "loss": 0.4163, + "step": 13640 + }, + { + "epoch": 1.962180040264596, + "grad_norm": 0.2608700722504959, + "learning_rate": 5.3468203069394e-06, + "loss": 0.4171, + "step": 13645 + }, + { + "epoch": 1.9628990509059534, + "grad_norm": 0.2556515194260987, + "learning_rate": 5.340156971650416e-06, + "loss": 0.4026, + "step": 13650 + }, + { + "epoch": 1.963618061547311, + "grad_norm": 0.24822280007935313, + "learning_rate": 5.333496278156581e-06, + "loss": 0.3912, + "step": 13655 + }, + { + "epoch": 1.9643370721886684, + "grad_norm": 0.2632693931318459, + "learning_rate": 5.326838230234034e-06, + "loss": 0.4155, + "step": 13660 + }, + { + "epoch": 1.9650560828300259, + "grad_norm": 0.25566472312596306, + "learning_rate": 5.320182831657403e-06, + "loss": 0.4087, + "step": 13665 + }, + { + "epoch": 1.9657750934713834, + "grad_norm": 0.2593060159688528, + "learning_rate": 5.3135300861998186e-06, + "loss": 0.4148, + "step": 13670 + }, + { + "epoch": 1.9664941041127408, + "grad_norm": 0.25357384343389155, + "learning_rate": 5.3068799976329125e-06, + "loss": 0.4112, + "step": 13675 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.2669274390452347, + "learning_rate": 5.300232569726805e-06, + "loss": 0.41, + "step": 13680 + }, + { + "epoch": 1.9679321253954558, + "grad_norm": 0.26011483232030275, + "learning_rate": 5.2935878062501e-06, + "loss": 0.4083, + "step": 13685 + }, + { + "epoch": 1.9686511360368133, + "grad_norm": 0.25724239983605096, + "learning_rate": 5.286945710969909e-06, + "loss": 0.4197, + "step": 13690 + }, + { + "epoch": 1.9693701466781708, + "grad_norm": 0.26454569242107173, + "learning_rate": 5.28030628765182e-06, + "loss": 0.4011, + "step": 13695 + }, + { + "epoch": 1.9700891573195283, + "grad_norm": 0.25623339763113145, + "learning_rate": 5.273669540059905e-06, + "loss": 0.4101, + "step": 13700 + }, + { + "epoch": 1.970808167960886, + "grad_norm": 0.26380601542253146, + "learning_rate": 5.2670354719567256e-06, + "loss": 0.4012, + "step": 13705 + }, + { + "epoch": 1.9715271786022432, + "grad_norm": 0.2534213399237968, + "learning_rate": 5.260404087103312e-06, + "loss": 0.4069, + "step": 13710 + }, + { + "epoch": 1.972246189243601, + "grad_norm": 0.25076119035156, + "learning_rate": 5.253775389259193e-06, + "loss": 0.4086, + "step": 13715 + }, + { + "epoch": 1.9729651998849582, + "grad_norm": 0.2798231328631839, + "learning_rate": 5.247149382182355e-06, + "loss": 0.4035, + "step": 13720 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.24860145144078252, + "learning_rate": 5.240526069629265e-06, + "loss": 0.3852, + "step": 13725 + }, + { + "epoch": 1.9744032211676732, + "grad_norm": 0.24961360405881555, + "learning_rate": 5.23390545535487e-06, + "loss": 0.4147, + "step": 13730 + }, + { + "epoch": 1.975122231809031, + "grad_norm": 0.26259295202760446, + "learning_rate": 5.227287543112573e-06, + "loss": 0.41, + "step": 13735 + }, + { + "epoch": 1.9758412424503882, + "grad_norm": 0.27428978408513577, + "learning_rate": 5.220672336654265e-06, + "loss": 0.4079, + "step": 13740 + }, + { + "epoch": 1.9765602530917459, + "grad_norm": 0.2635671472080491, + "learning_rate": 5.214059839730277e-06, + "loss": 0.4091, + "step": 13745 + }, + { + "epoch": 1.9772792637331031, + "grad_norm": 0.2513180411967393, + "learning_rate": 5.207450056089431e-06, + "loss": 0.4079, + "step": 13750 + }, + { + "epoch": 1.9779982743744609, + "grad_norm": 0.25223460265034453, + "learning_rate": 5.200842989478989e-06, + "loss": 0.4059, + "step": 13755 + }, + { + "epoch": 1.9787172850158181, + "grad_norm": 0.24391249457708836, + "learning_rate": 5.194238643644689e-06, + "loss": 0.3982, + "step": 13760 + }, + { + "epoch": 1.9794362956571758, + "grad_norm": 0.2531494228581974, + "learning_rate": 5.187637022330715e-06, + "loss": 0.4156, + "step": 13765 + }, + { + "epoch": 1.980155306298533, + "grad_norm": 0.2550019649565652, + "learning_rate": 5.181038129279708e-06, + "loss": 0.4174, + "step": 13770 + }, + { + "epoch": 1.9808743169398908, + "grad_norm": 0.27942094830551606, + "learning_rate": 5.174441968232769e-06, + "loss": 0.4106, + "step": 13775 + }, + { + "epoch": 1.981593327581248, + "grad_norm": 0.25451593113355353, + "learning_rate": 5.167848542929446e-06, + "loss": 0.4094, + "step": 13780 + }, + { + "epoch": 1.9823123382226058, + "grad_norm": 0.26568260174618225, + "learning_rate": 5.161257857107729e-06, + "loss": 0.4137, + "step": 13785 + }, + { + "epoch": 1.983031348863963, + "grad_norm": 0.2603853156579003, + "learning_rate": 5.154669914504068e-06, + "loss": 0.4055, + "step": 13790 + }, + { + "epoch": 1.9837503595053207, + "grad_norm": 0.2620176232894427, + "learning_rate": 5.148084718853354e-06, + "loss": 0.4127, + "step": 13795 + }, + { + "epoch": 1.984469370146678, + "grad_norm": 0.2530362930596608, + "learning_rate": 5.141502273888912e-06, + "loss": 0.4214, + "step": 13800 + }, + { + "epoch": 1.9851883807880357, + "grad_norm": 0.25447724836362867, + "learning_rate": 5.134922583342521e-06, + "loss": 0.4001, + "step": 13805 + }, + { + "epoch": 1.9859073914293932, + "grad_norm": 0.2540296468666251, + "learning_rate": 5.128345650944384e-06, + "loss": 0.4042, + "step": 13810 + }, + { + "epoch": 1.9866264020707507, + "grad_norm": 0.2577371870378256, + "learning_rate": 5.1217714804231545e-06, + "loss": 0.4191, + "step": 13815 + }, + { + "epoch": 1.9873454127121082, + "grad_norm": 0.25990723537924326, + "learning_rate": 5.115200075505908e-06, + "loss": 0.409, + "step": 13820 + }, + { + "epoch": 1.9880644233534657, + "grad_norm": 0.25299497569163515, + "learning_rate": 5.108631439918158e-06, + "loss": 0.4048, + "step": 13825 + }, + { + "epoch": 1.9887834339948232, + "grad_norm": 0.2588824820181239, + "learning_rate": 5.102065577383852e-06, + "loss": 0.4205, + "step": 13830 + }, + { + "epoch": 1.9895024446361806, + "grad_norm": 0.26804413155093115, + "learning_rate": 5.095502491625353e-06, + "loss": 0.4301, + "step": 13835 + }, + { + "epoch": 1.9902214552775381, + "grad_norm": 0.25278955155155564, + "learning_rate": 5.0889421863634636e-06, + "loss": 0.399, + "step": 13840 + }, + { + "epoch": 1.9909404659188956, + "grad_norm": 0.26550430618529297, + "learning_rate": 5.082384665317406e-06, + "loss": 0.4136, + "step": 13845 + }, + { + "epoch": 1.991659476560253, + "grad_norm": 0.25015117720339847, + "learning_rate": 5.075829932204818e-06, + "loss": 0.3849, + "step": 13850 + }, + { + "epoch": 1.9923784872016106, + "grad_norm": 0.2645179055223207, + "learning_rate": 5.069277990741758e-06, + "loss": 0.385, + "step": 13855 + }, + { + "epoch": 1.993097497842968, + "grad_norm": 0.25131298311416017, + "learning_rate": 5.062728844642712e-06, + "loss": 0.4058, + "step": 13860 + }, + { + "epoch": 1.9938165084843256, + "grad_norm": 0.25839485709157983, + "learning_rate": 5.05618249762057e-06, + "loss": 0.412, + "step": 13865 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 0.2579182912611581, + "learning_rate": 5.049638953386635e-06, + "loss": 0.4018, + "step": 13870 + }, + { + "epoch": 1.9952545297670405, + "grad_norm": 0.2580154354036394, + "learning_rate": 5.043098215650634e-06, + "loss": 0.4002, + "step": 13875 + }, + { + "epoch": 1.995973540408398, + "grad_norm": 0.25201844554372577, + "learning_rate": 5.0365602881206845e-06, + "loss": 0.4069, + "step": 13880 + }, + { + "epoch": 1.9966925510497555, + "grad_norm": 0.25214547421292893, + "learning_rate": 5.030025174503327e-06, + "loss": 0.4029, + "step": 13885 + }, + { + "epoch": 1.997411561691113, + "grad_norm": 0.26499122387478796, + "learning_rate": 5.023492878503495e-06, + "loss": 0.4104, + "step": 13890 + }, + { + "epoch": 1.9981305723324705, + "grad_norm": 0.26369627402307694, + "learning_rate": 5.016963403824535e-06, + "loss": 0.4221, + "step": 13895 + }, + { + "epoch": 1.998849582973828, + "grad_norm": 0.2969355284033133, + "learning_rate": 5.010436754168182e-06, + "loss": 0.4133, + "step": 13900 + }, + { + "epoch": 1.9995685936151855, + "grad_norm": 0.2662741266239133, + "learning_rate": 5.003912933234584e-06, + "loss": 0.4026, + "step": 13905 + }, + { + "epoch": 2.0, + "eval_loss": 0.44030478596687317, + "eval_runtime": 0.6026, + "eval_samples_per_second": 41.485, + "eval_steps_per_second": 1.659, + "step": 13908 + }, + { + "epoch": 2.000287604256543, + "grad_norm": 0.318210591858962, + "learning_rate": 4.997391944722272e-06, + "loss": 0.3762, + "step": 13910 + }, + { + "epoch": 2.0010066148979004, + "grad_norm": 0.2962005622488439, + "learning_rate": 4.990873792328173e-06, + "loss": 0.3654, + "step": 13915 + }, + { + "epoch": 2.001725625539258, + "grad_norm": 0.30623782183632015, + "learning_rate": 4.984358479747618e-06, + "loss": 0.3534, + "step": 13920 + }, + { + "epoch": 2.0024446361806154, + "grad_norm": 0.30036561935160455, + "learning_rate": 4.9778460106743134e-06, + "loss": 0.3678, + "step": 13925 + }, + { + "epoch": 2.003163646821973, + "grad_norm": 0.3307656806538744, + "learning_rate": 4.971336388800364e-06, + "loss": 0.3447, + "step": 13930 + }, + { + "epoch": 2.0038826574633304, + "grad_norm": 0.2771791786177321, + "learning_rate": 4.9648296178162506e-06, + "loss": 0.3676, + "step": 13935 + }, + { + "epoch": 2.004601668104688, + "grad_norm": 0.28213964927100427, + "learning_rate": 4.958325701410848e-06, + "loss": 0.3631, + "step": 13940 + }, + { + "epoch": 2.0053206787460454, + "grad_norm": 0.2861346785121185, + "learning_rate": 4.951824643271409e-06, + "loss": 0.3606, + "step": 13945 + }, + { + "epoch": 2.006039689387403, + "grad_norm": 0.2769230735597401, + "learning_rate": 4.945326447083565e-06, + "loss": 0.3546, + "step": 13950 + }, + { + "epoch": 2.0067587000287603, + "grad_norm": 0.3057122437215269, + "learning_rate": 4.938831116531317e-06, + "loss": 0.3666, + "step": 13955 + }, + { + "epoch": 2.007477710670118, + "grad_norm": 0.2875326612135141, + "learning_rate": 4.932338655297061e-06, + "loss": 0.3474, + "step": 13960 + }, + { + "epoch": 2.0081967213114753, + "grad_norm": 0.28108990760304803, + "learning_rate": 4.925849067061548e-06, + "loss": 0.3575, + "step": 13965 + }, + { + "epoch": 2.008915731952833, + "grad_norm": 0.2915498411636721, + "learning_rate": 4.919362355503904e-06, + "loss": 0.3641, + "step": 13970 + }, + { + "epoch": 2.0096347425941903, + "grad_norm": 0.2987753594184248, + "learning_rate": 4.912878524301634e-06, + "loss": 0.3468, + "step": 13975 + }, + { + "epoch": 2.010353753235548, + "grad_norm": 0.2824354278114593, + "learning_rate": 4.906397577130597e-06, + "loss": 0.3572, + "step": 13980 + }, + { + "epoch": 2.0110727638769053, + "grad_norm": 0.27434107221527915, + "learning_rate": 4.899919517665024e-06, + "loss": 0.3617, + "step": 13985 + }, + { + "epoch": 2.011791774518263, + "grad_norm": 0.2885017590335467, + "learning_rate": 4.893444349577514e-06, + "loss": 0.3597, + "step": 13990 + }, + { + "epoch": 2.0125107851596202, + "grad_norm": 0.301491921813707, + "learning_rate": 4.886972076539016e-06, + "loss": 0.3466, + "step": 13995 + }, + { + "epoch": 2.013229795800978, + "grad_norm": 0.2843171690037388, + "learning_rate": 4.880502702218838e-06, + "loss": 0.3601, + "step": 14000 + }, + { + "epoch": 2.013948806442335, + "grad_norm": 0.2853742848739993, + "learning_rate": 4.874036230284658e-06, + "loss": 0.3503, + "step": 14005 + }, + { + "epoch": 2.014667817083693, + "grad_norm": 0.30382693759446466, + "learning_rate": 4.867572664402494e-06, + "loss": 0.3474, + "step": 14010 + }, + { + "epoch": 2.01538682772505, + "grad_norm": 0.28593200867510593, + "learning_rate": 4.861112008236719e-06, + "loss": 0.35, + "step": 14015 + }, + { + "epoch": 2.016105838366408, + "grad_norm": 0.27405861148051325, + "learning_rate": 4.8546542654500674e-06, + "loss": 0.3506, + "step": 14020 + }, + { + "epoch": 2.016824849007765, + "grad_norm": 0.30547019029543415, + "learning_rate": 4.848199439703609e-06, + "loss": 0.3532, + "step": 14025 + }, + { + "epoch": 2.017543859649123, + "grad_norm": 0.32579122125534155, + "learning_rate": 4.8417475346567635e-06, + "loss": 0.369, + "step": 14030 + }, + { + "epoch": 2.01826287029048, + "grad_norm": 0.291301403012313, + "learning_rate": 4.835298553967296e-06, + "loss": 0.3353, + "step": 14035 + }, + { + "epoch": 2.018981880931838, + "grad_norm": 0.2941231633496769, + "learning_rate": 4.828852501291317e-06, + "loss": 0.3484, + "step": 14040 + }, + { + "epoch": 2.019700891573195, + "grad_norm": 0.2862438775719036, + "learning_rate": 4.822409380283276e-06, + "loss": 0.3509, + "step": 14045 + }, + { + "epoch": 2.020419902214553, + "grad_norm": 0.3126513367597776, + "learning_rate": 4.8159691945959554e-06, + "loss": 0.3577, + "step": 14050 + }, + { + "epoch": 2.02113891285591, + "grad_norm": 0.28619669680342014, + "learning_rate": 4.809531947880472e-06, + "loss": 0.3538, + "step": 14055 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 0.28770586287425526, + "learning_rate": 4.803097643786289e-06, + "loss": 0.3591, + "step": 14060 + }, + { + "epoch": 2.022576934138625, + "grad_norm": 0.2940569243641126, + "learning_rate": 4.7966662859611865e-06, + "loss": 0.3534, + "step": 14065 + }, + { + "epoch": 2.0232959447799828, + "grad_norm": 0.29374459433661443, + "learning_rate": 4.790237878051282e-06, + "loss": 0.3507, + "step": 14070 + }, + { + "epoch": 2.02401495542134, + "grad_norm": 0.2907103302826579, + "learning_rate": 4.783812423701022e-06, + "loss": 0.3537, + "step": 14075 + }, + { + "epoch": 2.0247339660626977, + "grad_norm": 0.2837728280937334, + "learning_rate": 4.777389926553172e-06, + "loss": 0.3628, + "step": 14080 + }, + { + "epoch": 2.025452976704055, + "grad_norm": 0.30257042493792674, + "learning_rate": 4.770970390248827e-06, + "loss": 0.3585, + "step": 14085 + }, + { + "epoch": 2.0261719873454127, + "grad_norm": 0.29397252530146495, + "learning_rate": 4.764553818427405e-06, + "loss": 0.3473, + "step": 14090 + }, + { + "epoch": 2.0268909979867704, + "grad_norm": 0.29913070232198746, + "learning_rate": 4.758140214726637e-06, + "loss": 0.3527, + "step": 14095 + }, + { + "epoch": 2.0276100086281277, + "grad_norm": 0.3033281960114325, + "learning_rate": 4.751729582782572e-06, + "loss": 0.3589, + "step": 14100 + }, + { + "epoch": 2.0283290192694854, + "grad_norm": 0.29961537340269007, + "learning_rate": 4.745321926229579e-06, + "loss": 0.3611, + "step": 14105 + }, + { + "epoch": 2.0290480299108427, + "grad_norm": 0.29372344469519207, + "learning_rate": 4.738917248700337e-06, + "loss": 0.3498, + "step": 14110 + }, + { + "epoch": 2.0297670405522004, + "grad_norm": 0.29942224504578746, + "learning_rate": 4.732515553825834e-06, + "loss": 0.3653, + "step": 14115 + }, + { + "epoch": 2.0304860511935576, + "grad_norm": 0.3108499644186701, + "learning_rate": 4.726116845235375e-06, + "loss": 0.3535, + "step": 14120 + }, + { + "epoch": 2.0312050618349153, + "grad_norm": 0.3057455159859912, + "learning_rate": 4.719721126556558e-06, + "loss": 0.3534, + "step": 14125 + }, + { + "epoch": 2.0319240724762726, + "grad_norm": 0.28673110439469285, + "learning_rate": 4.713328401415305e-06, + "loss": 0.3445, + "step": 14130 + }, + { + "epoch": 2.0326430831176303, + "grad_norm": 0.28869406746532883, + "learning_rate": 4.70693867343582e-06, + "loss": 0.3503, + "step": 14135 + }, + { + "epoch": 2.0333620937589876, + "grad_norm": 0.33039922667587013, + "learning_rate": 4.700551946240625e-06, + "loss": 0.3435, + "step": 14140 + }, + { + "epoch": 2.0340811044003453, + "grad_norm": 0.284841054056291, + "learning_rate": 4.694168223450535e-06, + "loss": 0.3636, + "step": 14145 + }, + { + "epoch": 2.0348001150417026, + "grad_norm": 0.2941154139075771, + "learning_rate": 4.687787508684658e-06, + "loss": 0.3637, + "step": 14150 + }, + { + "epoch": 2.0355191256830603, + "grad_norm": 0.30787130118377565, + "learning_rate": 4.681409805560397e-06, + "loss": 0.3624, + "step": 14155 + }, + { + "epoch": 2.0362381363244175, + "grad_norm": 0.2877335384595125, + "learning_rate": 4.675035117693455e-06, + "loss": 0.3499, + "step": 14160 + }, + { + "epoch": 2.0369571469657752, + "grad_norm": 0.29341964401922843, + "learning_rate": 4.668663448697819e-06, + "loss": 0.3517, + "step": 14165 + }, + { + "epoch": 2.0376761576071325, + "grad_norm": 0.28623660995216765, + "learning_rate": 4.662294802185762e-06, + "loss": 0.3475, + "step": 14170 + }, + { + "epoch": 2.03839516824849, + "grad_norm": 0.29546272646756755, + "learning_rate": 4.655929181767853e-06, + "loss": 0.3516, + "step": 14175 + }, + { + "epoch": 2.0391141788898475, + "grad_norm": 0.2938453129065855, + "learning_rate": 4.649566591052935e-06, + "loss": 0.3601, + "step": 14180 + }, + { + "epoch": 2.039833189531205, + "grad_norm": 0.2991841102600859, + "learning_rate": 4.643207033648141e-06, + "loss": 0.3501, + "step": 14185 + }, + { + "epoch": 2.0405522001725624, + "grad_norm": 0.297570848958521, + "learning_rate": 4.6368505131588856e-06, + "loss": 0.357, + "step": 14190 + }, + { + "epoch": 2.04127121081392, + "grad_norm": 0.3045653759481441, + "learning_rate": 4.630497033188856e-06, + "loss": 0.3714, + "step": 14195 + }, + { + "epoch": 2.0419902214552774, + "grad_norm": 0.28008392242760666, + "learning_rate": 4.624146597340009e-06, + "loss": 0.3458, + "step": 14200 + }, + { + "epoch": 2.042709232096635, + "grad_norm": 0.28888393828368586, + "learning_rate": 4.617799209212596e-06, + "loss": 0.3708, + "step": 14205 + }, + { + "epoch": 2.0434282427379924, + "grad_norm": 0.298326296344726, + "learning_rate": 4.611454872405122e-06, + "loss": 0.3479, + "step": 14210 + }, + { + "epoch": 2.04414725337935, + "grad_norm": 0.28692582057259947, + "learning_rate": 4.605113590514366e-06, + "loss": 0.3582, + "step": 14215 + }, + { + "epoch": 2.0448662640207074, + "grad_norm": 0.30045309214713506, + "learning_rate": 4.598775367135386e-06, + "loss": 0.3522, + "step": 14220 + }, + { + "epoch": 2.045585274662065, + "grad_norm": 0.3058044871128482, + "learning_rate": 4.5924402058614904e-06, + "loss": 0.3705, + "step": 14225 + }, + { + "epoch": 2.0463042853034223, + "grad_norm": 0.2946397549664444, + "learning_rate": 4.586108110284262e-06, + "loss": 0.3601, + "step": 14230 + }, + { + "epoch": 2.04702329594478, + "grad_norm": 0.29475974329126486, + "learning_rate": 4.579779083993546e-06, + "loss": 0.3521, + "step": 14235 + }, + { + "epoch": 2.0477423065861373, + "grad_norm": 0.34681742545231375, + "learning_rate": 4.573453130577441e-06, + "loss": 0.3386, + "step": 14240 + }, + { + "epoch": 2.048461317227495, + "grad_norm": 0.3030014599463751, + "learning_rate": 4.567130253622303e-06, + "loss": 0.3586, + "step": 14245 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 0.29317145892468344, + "learning_rate": 4.560810456712754e-06, + "loss": 0.3435, + "step": 14250 + }, + { + "epoch": 2.04989933851021, + "grad_norm": 0.29355434870329244, + "learning_rate": 4.554493743431658e-06, + "loss": 0.3485, + "step": 14255 + }, + { + "epoch": 2.0506183491515673, + "grad_norm": 0.3317194647189456, + "learning_rate": 4.548180117360143e-06, + "loss": 0.378, + "step": 14260 + }, + { + "epoch": 2.051337359792925, + "grad_norm": 0.30572393220881305, + "learning_rate": 4.5418695820775735e-06, + "loss": 0.3664, + "step": 14265 + }, + { + "epoch": 2.0520563704342822, + "grad_norm": 0.28519918939779276, + "learning_rate": 4.535562141161568e-06, + "loss": 0.3592, + "step": 14270 + }, + { + "epoch": 2.05277538107564, + "grad_norm": 0.2914991729304061, + "learning_rate": 4.529257798187996e-06, + "loss": 0.3603, + "step": 14275 + }, + { + "epoch": 2.053494391716997, + "grad_norm": 0.29364357649864903, + "learning_rate": 4.52295655673096e-06, + "loss": 0.3512, + "step": 14280 + }, + { + "epoch": 2.054213402358355, + "grad_norm": 0.2882934807059941, + "learning_rate": 4.516658420362812e-06, + "loss": 0.3576, + "step": 14285 + }, + { + "epoch": 2.054932412999712, + "grad_norm": 0.3043270892198163, + "learning_rate": 4.510363392654146e-06, + "loss": 0.3726, + "step": 14290 + }, + { + "epoch": 2.05565142364107, + "grad_norm": 0.28633599973268004, + "learning_rate": 4.5040714771737845e-06, + "loss": 0.3654, + "step": 14295 + }, + { + "epoch": 2.056370434282427, + "grad_norm": 0.2965342297286475, + "learning_rate": 4.497782677488786e-06, + "loss": 0.3442, + "step": 14300 + }, + { + "epoch": 2.057089444923785, + "grad_norm": 0.3014694456691445, + "learning_rate": 4.4914969971644575e-06, + "loss": 0.357, + "step": 14305 + }, + { + "epoch": 2.0578084555651426, + "grad_norm": 0.3011763462966578, + "learning_rate": 4.4852144397643196e-06, + "loss": 0.3382, + "step": 14310 + }, + { + "epoch": 2.0585274662065, + "grad_norm": 0.308543485656549, + "learning_rate": 4.478935008850126e-06, + "loss": 0.3506, + "step": 14315 + }, + { + "epoch": 2.0592464768478576, + "grad_norm": 0.29654145322937825, + "learning_rate": 4.472658707981869e-06, + "loss": 0.3429, + "step": 14320 + }, + { + "epoch": 2.059965487489215, + "grad_norm": 0.2946816514144125, + "learning_rate": 4.4663855407177535e-06, + "loss": 0.3456, + "step": 14325 + }, + { + "epoch": 2.0606844981305725, + "grad_norm": 0.30452616080885697, + "learning_rate": 4.4601155106142145e-06, + "loss": 0.3597, + "step": 14330 + }, + { + "epoch": 2.06140350877193, + "grad_norm": 0.2990826005623624, + "learning_rate": 4.453848621225913e-06, + "loss": 0.3456, + "step": 14335 + }, + { + "epoch": 2.0621225194132875, + "grad_norm": 0.2880488234980847, + "learning_rate": 4.4475848761057175e-06, + "loss": 0.3513, + "step": 14340 + }, + { + "epoch": 2.0628415300546448, + "grad_norm": 0.3021201223017955, + "learning_rate": 4.441324278804717e-06, + "loss": 0.3606, + "step": 14345 + }, + { + "epoch": 2.0635605406960025, + "grad_norm": 0.3113479744007447, + "learning_rate": 4.435066832872228e-06, + "loss": 0.3709, + "step": 14350 + }, + { + "epoch": 2.0642795513373597, + "grad_norm": 0.3044471482844953, + "learning_rate": 4.428812541855766e-06, + "loss": 0.3567, + "step": 14355 + }, + { + "epoch": 2.0649985619787175, + "grad_norm": 0.2996535042717931, + "learning_rate": 4.422561409301061e-06, + "loss": 0.353, + "step": 14360 + }, + { + "epoch": 2.0657175726200747, + "grad_norm": 0.29841095505179394, + "learning_rate": 4.4163134387520604e-06, + "loss": 0.3646, + "step": 14365 + }, + { + "epoch": 2.0664365832614324, + "grad_norm": 0.2932029120444435, + "learning_rate": 4.410068633750906e-06, + "loss": 0.3817, + "step": 14370 + }, + { + "epoch": 2.0671555939027897, + "grad_norm": 0.31253609693634077, + "learning_rate": 4.4038269978379575e-06, + "loss": 0.3668, + "step": 14375 + }, + { + "epoch": 2.0678746045441474, + "grad_norm": 0.3116402212010058, + "learning_rate": 4.397588534551774e-06, + "loss": 0.3606, + "step": 14380 + }, + { + "epoch": 2.0685936151855047, + "grad_norm": 0.3024380986750069, + "learning_rate": 4.39135324742911e-06, + "loss": 0.3509, + "step": 14385 + }, + { + "epoch": 2.0693126258268624, + "grad_norm": 0.2845116612221238, + "learning_rate": 4.385121140004929e-06, + "loss": 0.3379, + "step": 14390 + }, + { + "epoch": 2.0700316364682196, + "grad_norm": 0.2835837243213674, + "learning_rate": 4.3788922158123825e-06, + "loss": 0.3399, + "step": 14395 + }, + { + "epoch": 2.0707506471095773, + "grad_norm": 0.29576413680103925, + "learning_rate": 4.372666478382821e-06, + "loss": 0.3609, + "step": 14400 + }, + { + "epoch": 2.0714696577509346, + "grad_norm": 0.29558308439934255, + "learning_rate": 4.366443931245793e-06, + "loss": 0.3576, + "step": 14405 + }, + { + "epoch": 2.0721886683922923, + "grad_norm": 0.3369409665924106, + "learning_rate": 4.360224577929032e-06, + "loss": 0.3564, + "step": 14410 + }, + { + "epoch": 2.0729076790336496, + "grad_norm": 0.2968084345430545, + "learning_rate": 4.35400842195846e-06, + "loss": 0.3653, + "step": 14415 + }, + { + "epoch": 2.0736266896750073, + "grad_norm": 0.2985475993720737, + "learning_rate": 4.347795466858196e-06, + "loss": 0.3455, + "step": 14420 + }, + { + "epoch": 2.0743457003163646, + "grad_norm": 0.30199060510993814, + "learning_rate": 4.34158571615053e-06, + "loss": 0.368, + "step": 14425 + }, + { + "epoch": 2.0750647109577223, + "grad_norm": 0.30041301544820775, + "learning_rate": 4.335379173355949e-06, + "loss": 0.3577, + "step": 14430 + }, + { + "epoch": 2.0757837215990795, + "grad_norm": 0.3011286542550627, + "learning_rate": 4.329175841993116e-06, + "loss": 0.3486, + "step": 14435 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 0.3016219566506042, + "learning_rate": 4.322975725578871e-06, + "loss": 0.354, + "step": 14440 + }, + { + "epoch": 2.0772217428817945, + "grad_norm": 0.29783170544799753, + "learning_rate": 4.3167788276282285e-06, + "loss": 0.3576, + "step": 14445 + }, + { + "epoch": 2.077940753523152, + "grad_norm": 0.31733612657339716, + "learning_rate": 4.310585151654392e-06, + "loss": 0.361, + "step": 14450 + }, + { + "epoch": 2.0786597641645095, + "grad_norm": 0.2911711281064737, + "learning_rate": 4.304394701168724e-06, + "loss": 0.3508, + "step": 14455 + }, + { + "epoch": 2.079378774805867, + "grad_norm": 0.297068438784082, + "learning_rate": 4.298207479680761e-06, + "loss": 0.351, + "step": 14460 + }, + { + "epoch": 2.0800977854472245, + "grad_norm": 0.2955112135325009, + "learning_rate": 4.292023490698219e-06, + "loss": 0.332, + "step": 14465 + }, + { + "epoch": 2.080816796088582, + "grad_norm": 0.3010730238291027, + "learning_rate": 4.285842737726965e-06, + "loss": 0.356, + "step": 14470 + }, + { + "epoch": 2.0815358067299394, + "grad_norm": 0.3038348805009081, + "learning_rate": 4.279665224271045e-06, + "loss": 0.3527, + "step": 14475 + }, + { + "epoch": 2.082254817371297, + "grad_norm": 0.318280624520725, + "learning_rate": 4.273490953832671e-06, + "loss": 0.3626, + "step": 14480 + }, + { + "epoch": 2.0829738280126544, + "grad_norm": 0.3171020198243272, + "learning_rate": 4.267319929912197e-06, + "loss": 0.3533, + "step": 14485 + }, + { + "epoch": 2.083692838654012, + "grad_norm": 0.28888046653161686, + "learning_rate": 4.261152156008159e-06, + "loss": 0.3408, + "step": 14490 + }, + { + "epoch": 2.0844118492953694, + "grad_norm": 0.3065065717283249, + "learning_rate": 4.2549876356172355e-06, + "loss": 0.3683, + "step": 14495 + }, + { + "epoch": 2.085130859936727, + "grad_norm": 0.3094188588362977, + "learning_rate": 4.2488263722342625e-06, + "loss": 0.3582, + "step": 14500 + }, + { + "epoch": 2.0858498705780844, + "grad_norm": 0.3052973971809332, + "learning_rate": 4.2426683693522395e-06, + "loss": 0.3642, + "step": 14505 + }, + { + "epoch": 2.086568881219442, + "grad_norm": 0.2962269533273906, + "learning_rate": 4.236513630462305e-06, + "loss": 0.3619, + "step": 14510 + }, + { + "epoch": 2.0872878918607993, + "grad_norm": 0.2944235945817763, + "learning_rate": 4.230362159053752e-06, + "loss": 0.348, + "step": 14515 + }, + { + "epoch": 2.088006902502157, + "grad_norm": 0.2997896341259781, + "learning_rate": 4.224213958614025e-06, + "loss": 0.3448, + "step": 14520 + }, + { + "epoch": 2.0887259131435147, + "grad_norm": 0.29245152361115756, + "learning_rate": 4.218069032628706e-06, + "loss": 0.3564, + "step": 14525 + }, + { + "epoch": 2.089444923784872, + "grad_norm": 0.31088657601953423, + "learning_rate": 4.211927384581527e-06, + "loss": 0.3567, + "step": 14530 + }, + { + "epoch": 2.0901639344262297, + "grad_norm": 0.3059053867853581, + "learning_rate": 4.205789017954364e-06, + "loss": 0.3594, + "step": 14535 + }, + { + "epoch": 2.090882945067587, + "grad_norm": 0.29479970664099747, + "learning_rate": 4.199653936227225e-06, + "loss": 0.3666, + "step": 14540 + }, + { + "epoch": 2.0916019557089447, + "grad_norm": 0.3027974345567834, + "learning_rate": 4.193522142878256e-06, + "loss": 0.3589, + "step": 14545 + }, + { + "epoch": 2.092320966350302, + "grad_norm": 0.2984127166650263, + "learning_rate": 4.187393641383748e-06, + "loss": 0.3445, + "step": 14550 + }, + { + "epoch": 2.0930399769916597, + "grad_norm": 0.29434023230091527, + "learning_rate": 4.181268435218118e-06, + "loss": 0.367, + "step": 14555 + }, + { + "epoch": 2.093758987633017, + "grad_norm": 0.30990202774075715, + "learning_rate": 4.175146527853911e-06, + "loss": 0.3638, + "step": 14560 + }, + { + "epoch": 2.0944779982743746, + "grad_norm": 0.2984330732391715, + "learning_rate": 4.169027922761814e-06, + "loss": 0.3647, + "step": 14565 + }, + { + "epoch": 2.095197008915732, + "grad_norm": 0.29561284011499295, + "learning_rate": 4.16291262341063e-06, + "loss": 0.3689, + "step": 14570 + }, + { + "epoch": 2.0959160195570896, + "grad_norm": 0.3447341085031207, + "learning_rate": 4.156800633267295e-06, + "loss": 0.3627, + "step": 14575 + }, + { + "epoch": 2.096635030198447, + "grad_norm": 0.30914718629127896, + "learning_rate": 4.150691955796871e-06, + "loss": 0.3701, + "step": 14580 + }, + { + "epoch": 2.0973540408398046, + "grad_norm": 0.3055974026554414, + "learning_rate": 4.144586594462532e-06, + "loss": 0.3643, + "step": 14585 + }, + { + "epoch": 2.098073051481162, + "grad_norm": 0.316488731474431, + "learning_rate": 4.138484552725582e-06, + "loss": 0.358, + "step": 14590 + }, + { + "epoch": 2.0987920621225196, + "grad_norm": 0.3181537442868115, + "learning_rate": 4.132385834045438e-06, + "loss": 0.3598, + "step": 14595 + }, + { + "epoch": 2.099511072763877, + "grad_norm": 0.2990267629620605, + "learning_rate": 4.126290441879629e-06, + "loss": 0.3653, + "step": 14600 + }, + { + "epoch": 2.1002300834052345, + "grad_norm": 0.3031855089515431, + "learning_rate": 4.120198379683811e-06, + "loss": 0.365, + "step": 14605 + }, + { + "epoch": 2.100949094046592, + "grad_norm": 0.3139042925193933, + "learning_rate": 4.11410965091174e-06, + "loss": 0.3534, + "step": 14610 + }, + { + "epoch": 2.1016681046879495, + "grad_norm": 0.31907498432839176, + "learning_rate": 4.108024259015283e-06, + "loss": 0.3484, + "step": 14615 + }, + { + "epoch": 2.1023871153293068, + "grad_norm": 0.29760108656159406, + "learning_rate": 4.101942207444421e-06, + "loss": 0.3489, + "step": 14620 + }, + { + "epoch": 2.1031061259706645, + "grad_norm": 0.30405200853613723, + "learning_rate": 4.095863499647246e-06, + "loss": 0.3599, + "step": 14625 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 0.30393682775695036, + "learning_rate": 4.089788139069936e-06, + "loss": 0.363, + "step": 14630 + }, + { + "epoch": 2.1045441472533795, + "grad_norm": 0.2967264561430701, + "learning_rate": 4.083716129156792e-06, + "loss": 0.349, + "step": 14635 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.2938902261935861, + "learning_rate": 4.077647473350201e-06, + "loss": 0.3725, + "step": 14640 + }, + { + "epoch": 2.1059821685360944, + "grad_norm": 0.3156046257036086, + "learning_rate": 4.071582175090652e-06, + "loss": 0.3704, + "step": 14645 + }, + { + "epoch": 2.1067011791774517, + "grad_norm": 0.31885314663699743, + "learning_rate": 4.065520237816738e-06, + "loss": 0.355, + "step": 14650 + }, + { + "epoch": 2.1074201898188094, + "grad_norm": 0.2976767542993816, + "learning_rate": 4.059461664965136e-06, + "loss": 0.3471, + "step": 14655 + }, + { + "epoch": 2.1081392004601667, + "grad_norm": 0.2998513718549436, + "learning_rate": 4.053406459970618e-06, + "loss": 0.3646, + "step": 14660 + }, + { + "epoch": 2.1088582111015244, + "grad_norm": 0.29277607344471285, + "learning_rate": 4.047354626266055e-06, + "loss": 0.3431, + "step": 14665 + }, + { + "epoch": 2.1095772217428816, + "grad_norm": 0.29228324476185435, + "learning_rate": 4.041306167282394e-06, + "loss": 0.3725, + "step": 14670 + }, + { + "epoch": 2.1102962323842394, + "grad_norm": 0.3073195069040935, + "learning_rate": 4.035261086448678e-06, + "loss": 0.3471, + "step": 14675 + }, + { + "epoch": 2.1110152430255966, + "grad_norm": 0.29651468102486633, + "learning_rate": 4.029219387192037e-06, + "loss": 0.3643, + "step": 14680 + }, + { + "epoch": 2.1117342536669543, + "grad_norm": 0.29495521394621027, + "learning_rate": 4.0231810729376755e-06, + "loss": 0.3535, + "step": 14685 + }, + { + "epoch": 2.1124532643083116, + "grad_norm": 0.30215728429801914, + "learning_rate": 4.017146147108877e-06, + "loss": 0.371, + "step": 14690 + }, + { + "epoch": 2.1131722749496693, + "grad_norm": 0.30810009456278137, + "learning_rate": 4.0111146131270185e-06, + "loss": 0.348, + "step": 14695 + }, + { + "epoch": 2.1138912855910266, + "grad_norm": 0.3038910693861325, + "learning_rate": 4.005086474411537e-06, + "loss": 0.3666, + "step": 14700 + }, + { + "epoch": 2.1146102962323843, + "grad_norm": 0.3179492455943479, + "learning_rate": 3.999061734379961e-06, + "loss": 0.3573, + "step": 14705 + }, + { + "epoch": 2.1153293068737415, + "grad_norm": 0.30833578681381413, + "learning_rate": 3.993040396447878e-06, + "loss": 0.341, + "step": 14710 + }, + { + "epoch": 2.1160483175150993, + "grad_norm": 0.3060367891311736, + "learning_rate": 3.987022464028953e-06, + "loss": 0.3599, + "step": 14715 + }, + { + "epoch": 2.1167673281564565, + "grad_norm": 0.3029894917005249, + "learning_rate": 3.981007940534919e-06, + "loss": 0.3666, + "step": 14720 + }, + { + "epoch": 2.1174863387978142, + "grad_norm": 0.3307925019991336, + "learning_rate": 3.974996829375584e-06, + "loss": 0.3573, + "step": 14725 + }, + { + "epoch": 2.1182053494391715, + "grad_norm": 0.3030445334315976, + "learning_rate": 3.968989133958805e-06, + "loss": 0.3623, + "step": 14730 + }, + { + "epoch": 2.118924360080529, + "grad_norm": 0.2998221367827612, + "learning_rate": 3.962984857690523e-06, + "loss": 0.3618, + "step": 14735 + }, + { + "epoch": 2.119643370721887, + "grad_norm": 0.29657577366382637, + "learning_rate": 3.956984003974723e-06, + "loss": 0.3661, + "step": 14740 + }, + { + "epoch": 2.120362381363244, + "grad_norm": 0.2942364740462061, + "learning_rate": 3.950986576213454e-06, + "loss": 0.3297, + "step": 14745 + }, + { + "epoch": 2.1210813920046014, + "grad_norm": 0.3086184084415184, + "learning_rate": 3.9449925778068345e-06, + "loss": 0.3472, + "step": 14750 + }, + { + "epoch": 2.121800402645959, + "grad_norm": 0.31743366653990324, + "learning_rate": 3.939002012153023e-06, + "loss": 0.3694, + "step": 14755 + }, + { + "epoch": 2.122519413287317, + "grad_norm": 0.2989229645333128, + "learning_rate": 3.9330148826482376e-06, + "loss": 0.3554, + "step": 14760 + }, + { + "epoch": 2.123238423928674, + "grad_norm": 0.30918978741841063, + "learning_rate": 3.927031192686751e-06, + "loss": 0.3583, + "step": 14765 + }, + { + "epoch": 2.123957434570032, + "grad_norm": 0.3118035443895809, + "learning_rate": 3.921050945660888e-06, + "loss": 0.3618, + "step": 14770 + }, + { + "epoch": 2.124676445211389, + "grad_norm": 0.3118575800553098, + "learning_rate": 3.91507414496101e-06, + "loss": 0.3509, + "step": 14775 + }, + { + "epoch": 2.125395455852747, + "grad_norm": 0.28782970273266467, + "learning_rate": 3.909100793975541e-06, + "loss": 0.3492, + "step": 14780 + }, + { + "epoch": 2.126114466494104, + "grad_norm": 0.30464380032983196, + "learning_rate": 3.903130896090935e-06, + "loss": 0.3559, + "step": 14785 + }, + { + "epoch": 2.126833477135462, + "grad_norm": 0.3192987940414797, + "learning_rate": 3.897164454691692e-06, + "loss": 0.344, + "step": 14790 + }, + { + "epoch": 2.127552487776819, + "grad_norm": 0.30121843819306726, + "learning_rate": 3.891201473160361e-06, + "loss": 0.3627, + "step": 14795 + }, + { + "epoch": 2.1282714984181768, + "grad_norm": 0.31249465666523135, + "learning_rate": 3.885241954877514e-06, + "loss": 0.3754, + "step": 14800 + }, + { + "epoch": 2.128990509059534, + "grad_norm": 0.33257311622890157, + "learning_rate": 3.8792859032217774e-06, + "loss": 0.3499, + "step": 14805 + }, + { + "epoch": 2.1297095197008917, + "grad_norm": 0.306534039307065, + "learning_rate": 3.8733333215698e-06, + "loss": 0.3537, + "step": 14810 + }, + { + "epoch": 2.130428530342249, + "grad_norm": 0.30919711460875726, + "learning_rate": 3.867384213296261e-06, + "loss": 0.3685, + "step": 14815 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 0.2949428765145592, + "learning_rate": 3.86143858177388e-06, + "loss": 0.3556, + "step": 14820 + }, + { + "epoch": 2.131866551624964, + "grad_norm": 0.29868068292926775, + "learning_rate": 3.855496430373407e-06, + "loss": 0.348, + "step": 14825 + }, + { + "epoch": 2.1325855622663217, + "grad_norm": 0.29091085369573366, + "learning_rate": 3.849557762463603e-06, + "loss": 0.3626, + "step": 14830 + }, + { + "epoch": 2.133304572907679, + "grad_norm": 0.29997048165748763, + "learning_rate": 3.843622581411277e-06, + "loss": 0.3633, + "step": 14835 + }, + { + "epoch": 2.1340235835490367, + "grad_norm": 0.30839851301814, + "learning_rate": 3.83769089058124e-06, + "loss": 0.3665, + "step": 14840 + }, + { + "epoch": 2.134742594190394, + "grad_norm": 0.3178213348617244, + "learning_rate": 3.8317626933363335e-06, + "loss": 0.3536, + "step": 14845 + }, + { + "epoch": 2.1354616048317516, + "grad_norm": 0.2957764219336339, + "learning_rate": 3.8258379930374235e-06, + "loss": 0.3481, + "step": 14850 + }, + { + "epoch": 2.136180615473109, + "grad_norm": 0.3063908238021033, + "learning_rate": 3.819916793043383e-06, + "loss": 0.3556, + "step": 14855 + }, + { + "epoch": 2.1368996261144666, + "grad_norm": 0.2913578247940509, + "learning_rate": 3.8139990967111053e-06, + "loss": 0.3487, + "step": 14860 + }, + { + "epoch": 2.137618636755824, + "grad_norm": 0.3177639829875217, + "learning_rate": 3.8080849073954996e-06, + "loss": 0.3534, + "step": 14865 + }, + { + "epoch": 2.1383376473971816, + "grad_norm": 0.30605057135730174, + "learning_rate": 3.802174228449489e-06, + "loss": 0.3646, + "step": 14870 + }, + { + "epoch": 2.139056658038539, + "grad_norm": 0.2980466608826346, + "learning_rate": 3.796267063223994e-06, + "loss": 0.3584, + "step": 14875 + }, + { + "epoch": 2.1397756686798965, + "grad_norm": 0.3083635667925451, + "learning_rate": 3.79036341506796e-06, + "loss": 0.3482, + "step": 14880 + }, + { + "epoch": 2.140494679321254, + "grad_norm": 0.31459440458454185, + "learning_rate": 3.784463287328326e-06, + "loss": 0.3458, + "step": 14885 + }, + { + "epoch": 2.1412136899626115, + "grad_norm": 0.29919560929347505, + "learning_rate": 3.7785666833500356e-06, + "loss": 0.3446, + "step": 14890 + }, + { + "epoch": 2.141932700603969, + "grad_norm": 0.29062309267369785, + "learning_rate": 3.772673606476046e-06, + "loss": 0.353, + "step": 14895 + }, + { + "epoch": 2.1426517112453265, + "grad_norm": 0.3150948575899284, + "learning_rate": 3.766784060047303e-06, + "loss": 0.3547, + "step": 14900 + }, + { + "epoch": 2.1433707218866838, + "grad_norm": 0.30156362208356413, + "learning_rate": 3.760898047402751e-06, + "loss": 0.367, + "step": 14905 + }, + { + "epoch": 2.1440897325280415, + "grad_norm": 0.2961650363272026, + "learning_rate": 3.7550155718793433e-06, + "loss": 0.3611, + "step": 14910 + }, + { + "epoch": 2.1448087431693987, + "grad_norm": 0.32527527776782844, + "learning_rate": 3.749136636812011e-06, + "loss": 0.3617, + "step": 14915 + }, + { + "epoch": 2.1455277538107564, + "grad_norm": 0.3018389075600328, + "learning_rate": 3.7432612455336915e-06, + "loss": 0.335, + "step": 14920 + }, + { + "epoch": 2.1462467644521137, + "grad_norm": 0.30937969642349905, + "learning_rate": 3.737389401375311e-06, + "loss": 0.3686, + "step": 14925 + }, + { + "epoch": 2.1469657750934714, + "grad_norm": 0.2985516260167024, + "learning_rate": 3.7315211076657745e-06, + "loss": 0.3426, + "step": 14930 + }, + { + "epoch": 2.1476847857348287, + "grad_norm": 0.31397886253424284, + "learning_rate": 3.725656367731988e-06, + "loss": 0.368, + "step": 14935 + }, + { + "epoch": 2.1484037963761864, + "grad_norm": 0.30606097677604527, + "learning_rate": 3.7197951848988356e-06, + "loss": 0.3717, + "step": 14940 + }, + { + "epoch": 2.1491228070175437, + "grad_norm": 0.2972460298848214, + "learning_rate": 3.7139375624891795e-06, + "loss": 0.3447, + "step": 14945 + }, + { + "epoch": 2.1498418176589014, + "grad_norm": 0.2941505932629645, + "learning_rate": 3.7080835038238773e-06, + "loss": 0.3392, + "step": 14950 + }, + { + "epoch": 2.150560828300259, + "grad_norm": 0.31366021885026046, + "learning_rate": 3.7022330122217543e-06, + "loss": 0.3614, + "step": 14955 + }, + { + "epoch": 2.1512798389416163, + "grad_norm": 0.3013875352411097, + "learning_rate": 3.6963860909996154e-06, + "loss": 0.3624, + "step": 14960 + }, + { + "epoch": 2.1519988495829736, + "grad_norm": 0.30679656936391514, + "learning_rate": 3.6905427434722452e-06, + "loss": 0.363, + "step": 14965 + }, + { + "epoch": 2.1527178602243313, + "grad_norm": 0.29533974380722106, + "learning_rate": 3.6847029729524062e-06, + "loss": 0.3579, + "step": 14970 + }, + { + "epoch": 2.153436870865689, + "grad_norm": 0.30993853814479577, + "learning_rate": 3.6788667827508185e-06, + "loss": 0.3546, + "step": 14975 + }, + { + "epoch": 2.1541558815070463, + "grad_norm": 0.3082180112335835, + "learning_rate": 3.67303417617619e-06, + "loss": 0.3604, + "step": 14980 + }, + { + "epoch": 2.154874892148404, + "grad_norm": 0.2993173008034473, + "learning_rate": 3.667205156535183e-06, + "loss": 0.354, + "step": 14985 + }, + { + "epoch": 2.1555939027897613, + "grad_norm": 0.30863406552666395, + "learning_rate": 3.661379727132429e-06, + "loss": 0.3631, + "step": 14990 + }, + { + "epoch": 2.156312913431119, + "grad_norm": 0.2988879862647366, + "learning_rate": 3.6555578912705335e-06, + "loss": 0.3539, + "step": 14995 + }, + { + "epoch": 2.1570319240724762, + "grad_norm": 0.30342094389011454, + "learning_rate": 3.649739652250055e-06, + "loss": 0.3498, + "step": 15000 + }, + { + "epoch": 2.157750934713834, + "grad_norm": 0.29900908254499947, + "learning_rate": 3.6439250133695113e-06, + "loss": 0.3627, + "step": 15005 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 0.30994355898884496, + "learning_rate": 3.638113977925387e-06, + "loss": 0.3578, + "step": 15010 + }, + { + "epoch": 2.159188955996549, + "grad_norm": 0.3014829152149826, + "learning_rate": 3.6323065492121244e-06, + "loss": 0.3485, + "step": 15015 + }, + { + "epoch": 2.159907966637906, + "grad_norm": 0.302328397977993, + "learning_rate": 3.62650273052211e-06, + "loss": 0.3579, + "step": 15020 + }, + { + "epoch": 2.160626977279264, + "grad_norm": 0.3058435300497303, + "learning_rate": 3.6207025251456974e-06, + "loss": 0.3447, + "step": 15025 + }, + { + "epoch": 2.161345987920621, + "grad_norm": 0.30457986940937143, + "learning_rate": 3.614905936371178e-06, + "loss": 0.335, + "step": 15030 + }, + { + "epoch": 2.162064998561979, + "grad_norm": 0.30658271927475794, + "learning_rate": 3.609112967484807e-06, + "loss": 0.3717, + "step": 15035 + }, + { + "epoch": 2.162784009203336, + "grad_norm": 0.3029350646367455, + "learning_rate": 3.6033236217707766e-06, + "loss": 0.361, + "step": 15040 + }, + { + "epoch": 2.163503019844694, + "grad_norm": 0.3031235787806439, + "learning_rate": 3.5975379025112254e-06, + "loss": 0.3405, + "step": 15045 + }, + { + "epoch": 2.164222030486051, + "grad_norm": 0.30775906932936425, + "learning_rate": 3.591755812986246e-06, + "loss": 0.3687, + "step": 15050 + }, + { + "epoch": 2.164941041127409, + "grad_norm": 0.30566859413667363, + "learning_rate": 3.5859773564738633e-06, + "loss": 0.3591, + "step": 15055 + }, + { + "epoch": 2.165660051768766, + "grad_norm": 0.300167154605826, + "learning_rate": 3.5802025362500415e-06, + "loss": 0.3496, + "step": 15060 + }, + { + "epoch": 2.166379062410124, + "grad_norm": 0.29809112797987314, + "learning_rate": 3.5744313555886912e-06, + "loss": 0.357, + "step": 15065 + }, + { + "epoch": 2.167098073051481, + "grad_norm": 0.3125754657967547, + "learning_rate": 3.5686638177616594e-06, + "loss": 0.3596, + "step": 15070 + }, + { + "epoch": 2.1678170836928388, + "grad_norm": 0.29974889207961936, + "learning_rate": 3.5628999260387176e-06, + "loss": 0.3312, + "step": 15075 + }, + { + "epoch": 2.168536094334196, + "grad_norm": 0.2876712343810564, + "learning_rate": 3.5571396836875848e-06, + "loss": 0.3464, + "step": 15080 + }, + { + "epoch": 2.1692551049755537, + "grad_norm": 0.2932707741588461, + "learning_rate": 3.551383093973898e-06, + "loss": 0.3505, + "step": 15085 + }, + { + "epoch": 2.169974115616911, + "grad_norm": 0.31004210796868886, + "learning_rate": 3.5456301601612252e-06, + "loss": 0.3476, + "step": 15090 + }, + { + "epoch": 2.1706931262582687, + "grad_norm": 0.2980460736902264, + "learning_rate": 3.5398808855110745e-06, + "loss": 0.3269, + "step": 15095 + }, + { + "epoch": 2.171412136899626, + "grad_norm": 0.3078827121154138, + "learning_rate": 3.534135273282865e-06, + "loss": 0.3567, + "step": 15100 + }, + { + "epoch": 2.1721311475409837, + "grad_norm": 0.31095557656155537, + "learning_rate": 3.528393326733941e-06, + "loss": 0.3629, + "step": 15105 + }, + { + "epoch": 2.172850158182341, + "grad_norm": 0.3042862356254187, + "learning_rate": 3.5226550491195765e-06, + "loss": 0.3579, + "step": 15110 + }, + { + "epoch": 2.1735691688236987, + "grad_norm": 0.30292291186327597, + "learning_rate": 3.5169204436929647e-06, + "loss": 0.3557, + "step": 15115 + }, + { + "epoch": 2.174288179465056, + "grad_norm": 0.3001117215317004, + "learning_rate": 3.5111895137052065e-06, + "loss": 0.3484, + "step": 15120 + }, + { + "epoch": 2.1750071901064136, + "grad_norm": 0.30233367047550447, + "learning_rate": 3.5054622624053335e-06, + "loss": 0.3542, + "step": 15125 + }, + { + "epoch": 2.175726200747771, + "grad_norm": 0.3106028320426427, + "learning_rate": 3.499738693040278e-06, + "loss": 0.3666, + "step": 15130 + }, + { + "epoch": 2.1764452113891286, + "grad_norm": 0.3014691795399771, + "learning_rate": 3.4940188088548963e-06, + "loss": 0.3425, + "step": 15135 + }, + { + "epoch": 2.177164222030486, + "grad_norm": 0.31040705000513114, + "learning_rate": 3.4883026130919486e-06, + "loss": 0.3456, + "step": 15140 + }, + { + "epoch": 2.1778832326718436, + "grad_norm": 0.30381473669958653, + "learning_rate": 3.482590108992101e-06, + "loss": 0.362, + "step": 15145 + }, + { + "epoch": 2.178602243313201, + "grad_norm": 0.30480112115160074, + "learning_rate": 3.4768812997939406e-06, + "loss": 0.3449, + "step": 15150 + }, + { + "epoch": 2.1793212539545586, + "grad_norm": 0.31175874268061876, + "learning_rate": 3.4711761887339434e-06, + "loss": 0.3608, + "step": 15155 + }, + { + "epoch": 2.180040264595916, + "grad_norm": 0.3143375703331615, + "learning_rate": 3.4654747790465015e-06, + "loss": 0.3425, + "step": 15160 + }, + { + "epoch": 2.1807592752372735, + "grad_norm": 0.29956531504798056, + "learning_rate": 3.459777073963898e-06, + "loss": 0.374, + "step": 15165 + }, + { + "epoch": 2.181478285878631, + "grad_norm": 0.3078129825099616, + "learning_rate": 3.454083076716327e-06, + "loss": 0.3512, + "step": 15170 + }, + { + "epoch": 2.1821972965199885, + "grad_norm": 0.29806482217760777, + "learning_rate": 3.4483927905318683e-06, + "loss": 0.3518, + "step": 15175 + }, + { + "epoch": 2.1829163071613458, + "grad_norm": 0.3002561520340104, + "learning_rate": 3.44270621863651e-06, + "loss": 0.3609, + "step": 15180 + }, + { + "epoch": 2.1836353178027035, + "grad_norm": 0.30917529484807416, + "learning_rate": 3.4370233642541263e-06, + "loss": 0.3765, + "step": 15185 + }, + { + "epoch": 2.184354328444061, + "grad_norm": 0.31772980432478815, + "learning_rate": 3.4313442306064813e-06, + "loss": 0.3667, + "step": 15190 + }, + { + "epoch": 2.1850733390854185, + "grad_norm": 0.30465170420030224, + "learning_rate": 3.4256688209132426e-06, + "loss": 0.3599, + "step": 15195 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 0.31717957291545246, + "learning_rate": 3.4199971383919538e-06, + "loss": 0.3721, + "step": 15200 + }, + { + "epoch": 2.1865113603681334, + "grad_norm": 0.29953324558392497, + "learning_rate": 3.4143291862580484e-06, + "loss": 0.3537, + "step": 15205 + }, + { + "epoch": 2.187230371009491, + "grad_norm": 0.29164252095032084, + "learning_rate": 3.4086649677248494e-06, + "loss": 0.3499, + "step": 15210 + }, + { + "epoch": 2.1879493816508484, + "grad_norm": 0.3119406011184142, + "learning_rate": 3.403004486003563e-06, + "loss": 0.3568, + "step": 15215 + }, + { + "epoch": 2.188668392292206, + "grad_norm": 0.3144362138159923, + "learning_rate": 3.3973477443032675e-06, + "loss": 0.3604, + "step": 15220 + }, + { + "epoch": 2.1893874029335634, + "grad_norm": 0.30049624753036813, + "learning_rate": 3.3916947458309367e-06, + "loss": 0.3306, + "step": 15225 + }, + { + "epoch": 2.190106413574921, + "grad_norm": 0.2907555205434776, + "learning_rate": 3.386045493791408e-06, + "loss": 0.3418, + "step": 15230 + }, + { + "epoch": 2.1908254242162783, + "grad_norm": 0.32286636944068536, + "learning_rate": 3.3803999913873964e-06, + "loss": 0.3554, + "step": 15235 + }, + { + "epoch": 2.191544434857636, + "grad_norm": 0.31197806907480435, + "learning_rate": 3.3747582418195034e-06, + "loss": 0.3356, + "step": 15240 + }, + { + "epoch": 2.1922634454989933, + "grad_norm": 0.3074070732635455, + "learning_rate": 3.3691202482861864e-06, + "loss": 0.3589, + "step": 15245 + }, + { + "epoch": 2.192982456140351, + "grad_norm": 0.30666247444748174, + "learning_rate": 3.3634860139837877e-06, + "loss": 0.3561, + "step": 15250 + }, + { + "epoch": 2.1937014667817083, + "grad_norm": 0.29794705043688, + "learning_rate": 3.357855542106507e-06, + "loss": 0.3734, + "step": 15255 + }, + { + "epoch": 2.194420477423066, + "grad_norm": 0.3248391107950035, + "learning_rate": 3.3522288358464184e-06, + "loss": 0.3379, + "step": 15260 + }, + { + "epoch": 2.1951394880644233, + "grad_norm": 0.29350187702529457, + "learning_rate": 3.3466058983934623e-06, + "loss": 0.3693, + "step": 15265 + }, + { + "epoch": 2.195858498705781, + "grad_norm": 0.30924967362422306, + "learning_rate": 3.3409867329354352e-06, + "loss": 0.3621, + "step": 15270 + }, + { + "epoch": 2.1965775093471382, + "grad_norm": 0.30868716963529236, + "learning_rate": 3.335371342657996e-06, + "loss": 0.3539, + "step": 15275 + }, + { + "epoch": 2.197296519988496, + "grad_norm": 0.32089606318561353, + "learning_rate": 3.3297597307446738e-06, + "loss": 0.3598, + "step": 15280 + }, + { + "epoch": 2.198015530629853, + "grad_norm": 0.29621403088366444, + "learning_rate": 3.324151900376843e-06, + "loss": 0.3573, + "step": 15285 + }, + { + "epoch": 2.198734541271211, + "grad_norm": 0.30789761677291216, + "learning_rate": 3.318547854733737e-06, + "loss": 0.3513, + "step": 15290 + }, + { + "epoch": 2.199453551912568, + "grad_norm": 0.29817485223243784, + "learning_rate": 3.3129475969924528e-06, + "loss": 0.3505, + "step": 15295 + }, + { + "epoch": 2.200172562553926, + "grad_norm": 0.3153210416440025, + "learning_rate": 3.3073511303279282e-06, + "loss": 0.3578, + "step": 15300 + }, + { + "epoch": 2.200891573195283, + "grad_norm": 0.3105491649370774, + "learning_rate": 3.301758457912955e-06, + "loss": 0.3538, + "step": 15305 + }, + { + "epoch": 2.201610583836641, + "grad_norm": 0.3074164397101723, + "learning_rate": 3.2961695829181772e-06, + "loss": 0.3417, + "step": 15310 + }, + { + "epoch": 2.202329594477998, + "grad_norm": 0.31478703453870654, + "learning_rate": 3.290584508512088e-06, + "loss": 0.3649, + "step": 15315 + }, + { + "epoch": 2.203048605119356, + "grad_norm": 0.308567424709636, + "learning_rate": 3.2850032378610154e-06, + "loss": 0.3508, + "step": 15320 + }, + { + "epoch": 2.203767615760713, + "grad_norm": 0.3117897029814399, + "learning_rate": 3.2794257741291437e-06, + "loss": 0.3534, + "step": 15325 + }, + { + "epoch": 2.204486626402071, + "grad_norm": 0.31285403661336864, + "learning_rate": 3.2738521204784903e-06, + "loss": 0.3508, + "step": 15330 + }, + { + "epoch": 2.205205637043428, + "grad_norm": 0.32383182492607826, + "learning_rate": 3.268282280068912e-06, + "loss": 0.3551, + "step": 15335 + }, + { + "epoch": 2.205924647684786, + "grad_norm": 0.3022420535325948, + "learning_rate": 3.2627162560581118e-06, + "loss": 0.3589, + "step": 15340 + }, + { + "epoch": 2.206643658326143, + "grad_norm": 0.3116761418818082, + "learning_rate": 3.257154051601623e-06, + "loss": 0.3607, + "step": 15345 + }, + { + "epoch": 2.2073626689675008, + "grad_norm": 0.3072452853157531, + "learning_rate": 3.2515956698528108e-06, + "loss": 0.3716, + "step": 15350 + }, + { + "epoch": 2.208081679608858, + "grad_norm": 0.3402342532426353, + "learning_rate": 3.246041113962879e-06, + "loss": 0.3693, + "step": 15355 + }, + { + "epoch": 2.2088006902502157, + "grad_norm": 0.30767073922808924, + "learning_rate": 3.2404903870808625e-06, + "loss": 0.3742, + "step": 15360 + }, + { + "epoch": 2.209519700891573, + "grad_norm": 0.30262327184958626, + "learning_rate": 3.2349434923536248e-06, + "loss": 0.3321, + "step": 15365 + }, + { + "epoch": 2.2102387115329307, + "grad_norm": 0.31283562479069693, + "learning_rate": 3.2294004329258534e-06, + "loss": 0.3582, + "step": 15370 + }, + { + "epoch": 2.210957722174288, + "grad_norm": 0.4953642571396755, + "learning_rate": 3.2238612119400594e-06, + "loss": 0.3545, + "step": 15375 + }, + { + "epoch": 2.2116767328156457, + "grad_norm": 0.3076785429212798, + "learning_rate": 3.2183258325365885e-06, + "loss": 0.3547, + "step": 15380 + }, + { + "epoch": 2.212395743457003, + "grad_norm": 0.3133405325342391, + "learning_rate": 3.2127942978535987e-06, + "loss": 0.3419, + "step": 15385 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 0.30801589506070054, + "learning_rate": 3.207266611027069e-06, + "loss": 0.3545, + "step": 15390 + }, + { + "epoch": 2.213833764739718, + "grad_norm": 0.31141767311731, + "learning_rate": 3.201742775190806e-06, + "loss": 0.3564, + "step": 15395 + }, + { + "epoch": 2.2145527753810756, + "grad_norm": 0.30431280084831613, + "learning_rate": 3.1962227934764187e-06, + "loss": 0.3576, + "step": 15400 + }, + { + "epoch": 2.2152717860224334, + "grad_norm": 0.3126341809804547, + "learning_rate": 3.190706669013346e-06, + "loss": 0.3522, + "step": 15405 + }, + { + "epoch": 2.2159907966637906, + "grad_norm": 0.3049806229743361, + "learning_rate": 3.1851944049288263e-06, + "loss": 0.361, + "step": 15410 + }, + { + "epoch": 2.216709807305148, + "grad_norm": 0.3040650333890194, + "learning_rate": 3.179686004347923e-06, + "loss": 0.346, + "step": 15415 + }, + { + "epoch": 2.2174288179465056, + "grad_norm": 0.30646538919734423, + "learning_rate": 3.174181470393496e-06, + "loss": 0.3431, + "step": 15420 + }, + { + "epoch": 2.2181478285878633, + "grad_norm": 0.29807202663145027, + "learning_rate": 3.168680806186224e-06, + "loss": 0.3546, + "step": 15425 + }, + { + "epoch": 2.2188668392292206, + "grad_norm": 0.3002164885802567, + "learning_rate": 3.1631840148445857e-06, + "loss": 0.3467, + "step": 15430 + }, + { + "epoch": 2.2195858498705783, + "grad_norm": 0.33499720918803877, + "learning_rate": 3.157691099484863e-06, + "loss": 0.343, + "step": 15435 + }, + { + "epoch": 2.2203048605119355, + "grad_norm": 0.3028194034436359, + "learning_rate": 3.152202063221147e-06, + "loss": 0.3671, + "step": 15440 + }, + { + "epoch": 2.2210238711532932, + "grad_norm": 0.3075807304788357, + "learning_rate": 3.1467169091653236e-06, + "loss": 0.356, + "step": 15445 + }, + { + "epoch": 2.2217428817946505, + "grad_norm": 0.30956393385332825, + "learning_rate": 3.1412356404270785e-06, + "loss": 0.3665, + "step": 15450 + }, + { + "epoch": 2.2224618924360082, + "grad_norm": 0.3009663731038914, + "learning_rate": 3.1357582601138958e-06, + "loss": 0.3484, + "step": 15455 + }, + { + "epoch": 2.2231809030773655, + "grad_norm": 0.3012810747112443, + "learning_rate": 3.130284771331058e-06, + "loss": 0.3661, + "step": 15460 + }, + { + "epoch": 2.223899913718723, + "grad_norm": 0.3192250756440365, + "learning_rate": 3.1248151771816416e-06, + "loss": 0.3518, + "step": 15465 + }, + { + "epoch": 2.2246189243600805, + "grad_norm": 0.2957842413439877, + "learning_rate": 3.119349480766507e-06, + "loss": 0.3601, + "step": 15470 + }, + { + "epoch": 2.225337935001438, + "grad_norm": 0.3130242606501442, + "learning_rate": 3.1138876851843094e-06, + "loss": 0.3599, + "step": 15475 + }, + { + "epoch": 2.2260569456427954, + "grad_norm": 0.3001981940997634, + "learning_rate": 3.108429793531499e-06, + "loss": 0.3697, + "step": 15480 + }, + { + "epoch": 2.226775956284153, + "grad_norm": 0.32044826136790355, + "learning_rate": 3.1029758089023032e-06, + "loss": 0.3452, + "step": 15485 + }, + { + "epoch": 2.2274949669255104, + "grad_norm": 0.2904558932561433, + "learning_rate": 3.0975257343887343e-06, + "loss": 0.3755, + "step": 15490 + }, + { + "epoch": 2.228213977566868, + "grad_norm": 0.31093571792372343, + "learning_rate": 3.0920795730806006e-06, + "loss": 0.3555, + "step": 15495 + }, + { + "epoch": 2.2289329882082254, + "grad_norm": 0.34203500612032955, + "learning_rate": 3.086637328065475e-06, + "loss": 0.3441, + "step": 15500 + }, + { + "epoch": 2.229651998849583, + "grad_norm": 0.3181575619683084, + "learning_rate": 3.081199002428721e-06, + "loss": 0.3432, + "step": 15505 + }, + { + "epoch": 2.2303710094909404, + "grad_norm": 0.30280951337401874, + "learning_rate": 3.0757645992534812e-06, + "loss": 0.3656, + "step": 15510 + }, + { + "epoch": 2.231090020132298, + "grad_norm": 0.29228748952483163, + "learning_rate": 3.0703341216206685e-06, + "loss": 0.3572, + "step": 15515 + }, + { + "epoch": 2.2318090307736553, + "grad_norm": 0.3150188564033169, + "learning_rate": 3.064907572608966e-06, + "loss": 0.3506, + "step": 15520 + }, + { + "epoch": 2.232528041415013, + "grad_norm": 0.2863452623300753, + "learning_rate": 3.059484955294845e-06, + "loss": 0.3384, + "step": 15525 + }, + { + "epoch": 2.2332470520563703, + "grad_norm": 0.3136187865723598, + "learning_rate": 3.054066272752535e-06, + "loss": 0.3443, + "step": 15530 + }, + { + "epoch": 2.233966062697728, + "grad_norm": 0.29462193365920486, + "learning_rate": 3.048651528054034e-06, + "loss": 0.3467, + "step": 15535 + }, + { + "epoch": 2.2346850733390853, + "grad_norm": 0.3073501301884894, + "learning_rate": 3.0432407242691196e-06, + "loss": 0.3612, + "step": 15540 + }, + { + "epoch": 2.235404083980443, + "grad_norm": 0.33020786822391146, + "learning_rate": 3.0378338644653218e-06, + "loss": 0.3764, + "step": 15545 + }, + { + "epoch": 2.2361230946218003, + "grad_norm": 0.30585435518707055, + "learning_rate": 3.032430951707945e-06, + "loss": 0.3499, + "step": 15550 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.3155861870757336, + "learning_rate": 3.0270319890600465e-06, + "loss": 0.3541, + "step": 15555 + }, + { + "epoch": 2.2375611159045152, + "grad_norm": 0.31654698321161245, + "learning_rate": 3.021636979582454e-06, + "loss": 0.3503, + "step": 15560 + }, + { + "epoch": 2.238280126545873, + "grad_norm": 0.31198070231838565, + "learning_rate": 3.016245926333743e-06, + "loss": 0.3712, + "step": 15565 + }, + { + "epoch": 2.23899913718723, + "grad_norm": 0.3044694033535308, + "learning_rate": 3.01085883237026e-06, + "loss": 0.3605, + "step": 15570 + }, + { + "epoch": 2.239718147828588, + "grad_norm": 0.31146232468825474, + "learning_rate": 3.005475700746091e-06, + "loss": 0.3519, + "step": 15575 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 0.2973543686887364, + "learning_rate": 3.0000965345130904e-06, + "loss": 0.3437, + "step": 15580 + }, + { + "epoch": 2.241156169111303, + "grad_norm": 0.32080700171427373, + "learning_rate": 2.994721336720855e-06, + "loss": 0.3603, + "step": 15585 + }, + { + "epoch": 2.24187517975266, + "grad_norm": 0.3062482822760849, + "learning_rate": 2.989350110416731e-06, + "loss": 0.3551, + "step": 15590 + }, + { + "epoch": 2.242594190394018, + "grad_norm": 0.3208660404332907, + "learning_rate": 2.9839828586458232e-06, + "loss": 0.3614, + "step": 15595 + }, + { + "epoch": 2.243313201035375, + "grad_norm": 0.30901275585164917, + "learning_rate": 2.97861958445097e-06, + "loss": 0.3689, + "step": 15600 + }, + { + "epoch": 2.244032211676733, + "grad_norm": 0.31725814824936704, + "learning_rate": 2.9732602908727647e-06, + "loss": 0.3602, + "step": 15605 + }, + { + "epoch": 2.24475122231809, + "grad_norm": 0.2993168036916425, + "learning_rate": 2.967904980949543e-06, + "loss": 0.3639, + "step": 15610 + }, + { + "epoch": 2.245470232959448, + "grad_norm": 0.2995763336344399, + "learning_rate": 2.9625536577173773e-06, + "loss": 0.3465, + "step": 15615 + }, + { + "epoch": 2.2461892436008055, + "grad_norm": 0.30855529017030303, + "learning_rate": 2.957206324210079e-06, + "loss": 0.3457, + "step": 15620 + }, + { + "epoch": 2.246908254242163, + "grad_norm": 0.30477098341901515, + "learning_rate": 2.951862983459207e-06, + "loss": 0.3674, + "step": 15625 + }, + { + "epoch": 2.24762726488352, + "grad_norm": 0.31246333173314134, + "learning_rate": 2.9465236384940464e-06, + "loss": 0.3512, + "step": 15630 + }, + { + "epoch": 2.2483462755248778, + "grad_norm": 0.3062742648222082, + "learning_rate": 2.941188292341619e-06, + "loss": 0.357, + "step": 15635 + }, + { + "epoch": 2.2490652861662355, + "grad_norm": 0.2916864562044178, + "learning_rate": 2.9358569480266873e-06, + "loss": 0.3647, + "step": 15640 + }, + { + "epoch": 2.2497842968075927, + "grad_norm": 0.32087937470394745, + "learning_rate": 2.930529608571733e-06, + "loss": 0.3577, + "step": 15645 + }, + { + "epoch": 2.25050330744895, + "grad_norm": 0.3083378561850482, + "learning_rate": 2.9252062769969767e-06, + "loss": 0.3493, + "step": 15650 + }, + { + "epoch": 2.2512223180903077, + "grad_norm": 0.312189750363743, + "learning_rate": 2.919886956320367e-06, + "loss": 0.3476, + "step": 15655 + }, + { + "epoch": 2.2519413287316654, + "grad_norm": 0.31107989062754704, + "learning_rate": 2.9145716495575725e-06, + "loss": 0.3646, + "step": 15660 + }, + { + "epoch": 2.2526603393730227, + "grad_norm": 0.2981106045175469, + "learning_rate": 2.9092603597219848e-06, + "loss": 0.3496, + "step": 15665 + }, + { + "epoch": 2.2533793500143804, + "grad_norm": 0.32624578665001563, + "learning_rate": 2.90395308982473e-06, + "loss": 0.3802, + "step": 15670 + }, + { + "epoch": 2.2540983606557377, + "grad_norm": 0.3056872661221696, + "learning_rate": 2.8986498428746448e-06, + "loss": 0.3561, + "step": 15675 + }, + { + "epoch": 2.2548173712970954, + "grad_norm": 0.2964373136455243, + "learning_rate": 2.8933506218782826e-06, + "loss": 0.3598, + "step": 15680 + }, + { + "epoch": 2.2555363819384526, + "grad_norm": 0.31765984610242426, + "learning_rate": 2.888055429839929e-06, + "loss": 0.3462, + "step": 15685 + }, + { + "epoch": 2.2562553925798103, + "grad_norm": 0.298625914399591, + "learning_rate": 2.8827642697615665e-06, + "loss": 0.3648, + "step": 15690 + }, + { + "epoch": 2.2569744032211676, + "grad_norm": 0.3252226755907827, + "learning_rate": 2.8774771446429116e-06, + "loss": 0.3643, + "step": 15695 + }, + { + "epoch": 2.2576934138625253, + "grad_norm": 0.30512926093764386, + "learning_rate": 2.8721940574813745e-06, + "loss": 0.3655, + "step": 15700 + }, + { + "epoch": 2.2584124245038826, + "grad_norm": 0.2957418253130133, + "learning_rate": 2.866915011272089e-06, + "loss": 0.3501, + "step": 15705 + }, + { + "epoch": 2.2591314351452403, + "grad_norm": 0.2984240975142514, + "learning_rate": 2.8616400090078956e-06, + "loss": 0.3529, + "step": 15710 + }, + { + "epoch": 2.2598504457865976, + "grad_norm": 0.29953165879659893, + "learning_rate": 2.856369053679339e-06, + "loss": 0.3594, + "step": 15715 + }, + { + "epoch": 2.2605694564279553, + "grad_norm": 0.30859784356444436, + "learning_rate": 2.8511021482746672e-06, + "loss": 0.357, + "step": 15720 + }, + { + "epoch": 2.2612884670693125, + "grad_norm": 0.30757631486619363, + "learning_rate": 2.845839295779841e-06, + "loss": 0.352, + "step": 15725 + }, + { + "epoch": 2.2620074777106702, + "grad_norm": 0.3191302133672286, + "learning_rate": 2.840580499178517e-06, + "loss": 0.3626, + "step": 15730 + }, + { + "epoch": 2.2627264883520275, + "grad_norm": 0.33964528675054473, + "learning_rate": 2.83532576145205e-06, + "loss": 0.3512, + "step": 15735 + }, + { + "epoch": 2.263445498993385, + "grad_norm": 0.30911237389364155, + "learning_rate": 2.8300750855795043e-06, + "loss": 0.346, + "step": 15740 + }, + { + "epoch": 2.2641645096347425, + "grad_norm": 0.30446224483523926, + "learning_rate": 2.8248284745376285e-06, + "loss": 0.3764, + "step": 15745 + }, + { + "epoch": 2.2648835202761, + "grad_norm": 0.2940806440072444, + "learning_rate": 2.8195859313008754e-06, + "loss": 0.3457, + "step": 15750 + }, + { + "epoch": 2.2656025309174574, + "grad_norm": 0.3072868701821893, + "learning_rate": 2.814347458841392e-06, + "loss": 0.3718, + "step": 15755 + }, + { + "epoch": 2.266321541558815, + "grad_norm": 0.30242544877386035, + "learning_rate": 2.8091130601290127e-06, + "loss": 0.3369, + "step": 15760 + }, + { + "epoch": 2.2670405522001724, + "grad_norm": 0.31504463543639355, + "learning_rate": 2.8038827381312607e-06, + "loss": 0.3418, + "step": 15765 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 0.31615831738207584, + "learning_rate": 2.7986564958133564e-06, + "loss": 0.3514, + "step": 15770 + }, + { + "epoch": 2.2684785734828874, + "grad_norm": 0.3126755495552674, + "learning_rate": 2.793434336138202e-06, + "loss": 0.367, + "step": 15775 + }, + { + "epoch": 2.269197584124245, + "grad_norm": 0.3038635159784138, + "learning_rate": 2.788216262066381e-06, + "loss": 0.3635, + "step": 15780 + }, + { + "epoch": 2.2699165947656024, + "grad_norm": 0.3786827203869844, + "learning_rate": 2.7830022765561725e-06, + "loss": 0.3481, + "step": 15785 + }, + { + "epoch": 2.27063560540696, + "grad_norm": 0.3118278632641194, + "learning_rate": 2.777792382563522e-06, + "loss": 0.332, + "step": 15790 + }, + { + "epoch": 2.2713546160483173, + "grad_norm": 0.32160839287608234, + "learning_rate": 2.7725865830420697e-06, + "loss": 0.3598, + "step": 15795 + }, + { + "epoch": 2.272073626689675, + "grad_norm": 0.2983078542384667, + "learning_rate": 2.7673848809431316e-06, + "loss": 0.3637, + "step": 15800 + }, + { + "epoch": 2.2727926373310323, + "grad_norm": 0.3088423052329731, + "learning_rate": 2.762187279215689e-06, + "loss": 0.3374, + "step": 15805 + }, + { + "epoch": 2.27351164797239, + "grad_norm": 0.3086333495661721, + "learning_rate": 2.7569937808064164e-06, + "loss": 0.3526, + "step": 15810 + }, + { + "epoch": 2.2742306586137473, + "grad_norm": 0.31900651905313676, + "learning_rate": 2.7518043886596492e-06, + "loss": 0.3626, + "step": 15815 + }, + { + "epoch": 2.274949669255105, + "grad_norm": 0.3486672770395252, + "learning_rate": 2.7466191057173952e-06, + "loss": 0.3376, + "step": 15820 + }, + { + "epoch": 2.2756686798964623, + "grad_norm": 0.30635695085728376, + "learning_rate": 2.741437934919342e-06, + "loss": 0.3482, + "step": 15825 + }, + { + "epoch": 2.27638769053782, + "grad_norm": 0.29498996360939916, + "learning_rate": 2.736260879202839e-06, + "loss": 0.3521, + "step": 15830 + }, + { + "epoch": 2.2771067011791777, + "grad_norm": 0.30356297104944324, + "learning_rate": 2.731087941502898e-06, + "loss": 0.3576, + "step": 15835 + }, + { + "epoch": 2.277825711820535, + "grad_norm": 0.3006613726046492, + "learning_rate": 2.72591912475221e-06, + "loss": 0.3725, + "step": 15840 + }, + { + "epoch": 2.278544722461892, + "grad_norm": 0.30407371669199124, + "learning_rate": 2.720754431881114e-06, + "loss": 0.3512, + "step": 15845 + }, + { + "epoch": 2.27926373310325, + "grad_norm": 0.30929772831522084, + "learning_rate": 2.7155938658176227e-06, + "loss": 0.3447, + "step": 15850 + }, + { + "epoch": 2.2799827437446076, + "grad_norm": 0.31130611419720466, + "learning_rate": 2.7104374294874082e-06, + "loss": 0.356, + "step": 15855 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 0.32375382544412157, + "learning_rate": 2.7052851258137936e-06, + "loss": 0.3484, + "step": 15860 + }, + { + "epoch": 2.281420765027322, + "grad_norm": 0.32767196685216654, + "learning_rate": 2.700136957717763e-06, + "loss": 0.364, + "step": 15865 + }, + { + "epoch": 2.28213977566868, + "grad_norm": 0.30275935601370696, + "learning_rate": 2.694992928117961e-06, + "loss": 0.3548, + "step": 15870 + }, + { + "epoch": 2.2828587863100376, + "grad_norm": 0.2999566528493883, + "learning_rate": 2.689853039930679e-06, + "loss": 0.352, + "step": 15875 + }, + { + "epoch": 2.283577796951395, + "grad_norm": 0.3028370068555194, + "learning_rate": 2.6847172960698607e-06, + "loss": 0.3567, + "step": 15880 + }, + { + "epoch": 2.2842968075927526, + "grad_norm": 0.30491958972868694, + "learning_rate": 2.679585699447108e-06, + "loss": 0.3518, + "step": 15885 + }, + { + "epoch": 2.28501581823411, + "grad_norm": 0.31053388049647473, + "learning_rate": 2.6744582529716613e-06, + "loss": 0.3428, + "step": 15890 + }, + { + "epoch": 2.2857348288754675, + "grad_norm": 0.32522931033478, + "learning_rate": 2.6693349595504146e-06, + "loss": 0.3738, + "step": 15895 + }, + { + "epoch": 2.286453839516825, + "grad_norm": 0.3166723767856955, + "learning_rate": 2.664215822087912e-06, + "loss": 0.3699, + "step": 15900 + }, + { + "epoch": 2.2871728501581825, + "grad_norm": 0.3064335102460339, + "learning_rate": 2.6591008434863264e-06, + "loss": 0.3493, + "step": 15905 + }, + { + "epoch": 2.2878918607995398, + "grad_norm": 0.31630254631437876, + "learning_rate": 2.6539900266454886e-06, + "loss": 0.3456, + "step": 15910 + }, + { + "epoch": 2.2886108714408975, + "grad_norm": 0.3347713777428592, + "learning_rate": 2.6488833744628618e-06, + "loss": 0.357, + "step": 15915 + }, + { + "epoch": 2.2893298820822547, + "grad_norm": 0.3158742778367817, + "learning_rate": 2.643780889833546e-06, + "loss": 0.3688, + "step": 15920 + }, + { + "epoch": 2.2900488927236125, + "grad_norm": 0.32102594487563607, + "learning_rate": 2.6386825756502878e-06, + "loss": 0.3661, + "step": 15925 + }, + { + "epoch": 2.2907679033649697, + "grad_norm": 0.30611574690282606, + "learning_rate": 2.6335884348034614e-06, + "loss": 0.3366, + "step": 15930 + }, + { + "epoch": 2.2914869140063274, + "grad_norm": 0.2944612402777182, + "learning_rate": 2.6284984701810745e-06, + "loss": 0.3516, + "step": 15935 + }, + { + "epoch": 2.2922059246476847, + "grad_norm": 0.35762278792529484, + "learning_rate": 2.6234126846687757e-06, + "loss": 0.3505, + "step": 15940 + }, + { + "epoch": 2.2929249352890424, + "grad_norm": 0.3135762338755491, + "learning_rate": 2.618331081149833e-06, + "loss": 0.3687, + "step": 15945 + }, + { + "epoch": 2.2936439459303997, + "grad_norm": 0.30807275833208, + "learning_rate": 2.613253662505153e-06, + "loss": 0.3523, + "step": 15950 + }, + { + "epoch": 2.2943629565717574, + "grad_norm": 0.2997473598425893, + "learning_rate": 2.6081804316132685e-06, + "loss": 0.3363, + "step": 15955 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 0.30609918339915915, + "learning_rate": 2.6031113913503337e-06, + "loss": 0.3663, + "step": 15960 + }, + { + "epoch": 2.2958009778544723, + "grad_norm": 0.3017644713676623, + "learning_rate": 2.5980465445901247e-06, + "loss": 0.3476, + "step": 15965 + }, + { + "epoch": 2.2965199884958296, + "grad_norm": 0.3096068413767663, + "learning_rate": 2.592985894204051e-06, + "loss": 0.3741, + "step": 15970 + }, + { + "epoch": 2.2972389991371873, + "grad_norm": 0.3093683860504157, + "learning_rate": 2.5879294430611346e-06, + "loss": 0.3747, + "step": 15975 + }, + { + "epoch": 2.2979580097785446, + "grad_norm": 0.3244995841406146, + "learning_rate": 2.582877194028014e-06, + "loss": 0.3611, + "step": 15980 + }, + { + "epoch": 2.2986770204199023, + "grad_norm": 0.3111852058029451, + "learning_rate": 2.5778291499689577e-06, + "loss": 0.3621, + "step": 15985 + }, + { + "epoch": 2.2993960310612596, + "grad_norm": 0.30975552140475193, + "learning_rate": 2.572785313745837e-06, + "loss": 0.3682, + "step": 15990 + }, + { + "epoch": 2.3001150417026173, + "grad_norm": 0.3101174849309566, + "learning_rate": 2.5677456882181463e-06, + "loss": 0.3623, + "step": 15995 + }, + { + "epoch": 2.3008340523439745, + "grad_norm": 0.31071583961660165, + "learning_rate": 2.562710276242992e-06, + "loss": 0.3592, + "step": 16000 + }, + { + "epoch": 2.3015530629853322, + "grad_norm": 0.31231135268967924, + "learning_rate": 2.5576790806750882e-06, + "loss": 0.3549, + "step": 16005 + }, + { + "epoch": 2.3022720736266895, + "grad_norm": 0.33928947652643504, + "learning_rate": 2.5526521043667564e-06, + "loss": 0.362, + "step": 16010 + }, + { + "epoch": 2.302991084268047, + "grad_norm": 0.29992825922532507, + "learning_rate": 2.547629350167936e-06, + "loss": 0.3647, + "step": 16015 + }, + { + "epoch": 2.3037100949094045, + "grad_norm": 0.3004583761327109, + "learning_rate": 2.5426108209261614e-06, + "loss": 0.3557, + "step": 16020 + }, + { + "epoch": 2.304429105550762, + "grad_norm": 0.3069875577536048, + "learning_rate": 2.5375965194865813e-06, + "loss": 0.3433, + "step": 16025 + }, + { + "epoch": 2.3051481161921195, + "grad_norm": 0.3277259137871057, + "learning_rate": 2.5325864486919417e-06, + "loss": 0.3633, + "step": 16030 + }, + { + "epoch": 2.305867126833477, + "grad_norm": 0.3038629438042739, + "learning_rate": 2.5275806113825885e-06, + "loss": 0.3562, + "step": 16035 + }, + { + "epoch": 2.3065861374748344, + "grad_norm": 0.31461803132898014, + "learning_rate": 2.522579010396472e-06, + "loss": 0.3619, + "step": 16040 + }, + { + "epoch": 2.307305148116192, + "grad_norm": 0.2959354817846541, + "learning_rate": 2.517581648569145e-06, + "loss": 0.3647, + "step": 16045 + }, + { + "epoch": 2.30802415875755, + "grad_norm": 0.30566584330916874, + "learning_rate": 2.5125885287337438e-06, + "loss": 0.3586, + "step": 16050 + }, + { + "epoch": 2.308743169398907, + "grad_norm": 0.3646508015892226, + "learning_rate": 2.5075996537210133e-06, + "loss": 0.3488, + "step": 16055 + }, + { + "epoch": 2.3094621800402644, + "grad_norm": 0.3153456121902386, + "learning_rate": 2.502615026359285e-06, + "loss": 0.3688, + "step": 16060 + }, + { + "epoch": 2.310181190681622, + "grad_norm": 0.29404551026941367, + "learning_rate": 2.4976346494744785e-06, + "loss": 0.3581, + "step": 16065 + }, + { + "epoch": 2.31090020132298, + "grad_norm": 0.3105472552136778, + "learning_rate": 2.492658525890115e-06, + "loss": 0.3496, + "step": 16070 + }, + { + "epoch": 2.311619211964337, + "grad_norm": 0.319832608016326, + "learning_rate": 2.487686658427295e-06, + "loss": 0.3508, + "step": 16075 + }, + { + "epoch": 2.3123382226056943, + "grad_norm": 0.3167547707643775, + "learning_rate": 2.482719049904706e-06, + "loss": 0.3531, + "step": 16080 + }, + { + "epoch": 2.313057233247052, + "grad_norm": 0.31759970183497527, + "learning_rate": 2.4777557031386302e-06, + "loss": 0.3485, + "step": 16085 + }, + { + "epoch": 2.3137762438884097, + "grad_norm": 0.30660907697399054, + "learning_rate": 2.472796620942922e-06, + "loss": 0.3479, + "step": 16090 + }, + { + "epoch": 2.314495254529767, + "grad_norm": 0.3063113107537106, + "learning_rate": 2.4678418061290253e-06, + "loss": 0.3559, + "step": 16095 + }, + { + "epoch": 2.3152142651711247, + "grad_norm": 0.2972991853403953, + "learning_rate": 2.4628912615059664e-06, + "loss": 0.3744, + "step": 16100 + }, + { + "epoch": 2.315933275812482, + "grad_norm": 0.31162519376561765, + "learning_rate": 2.4579449898803453e-06, + "loss": 0.36, + "step": 16105 + }, + { + "epoch": 2.3166522864538397, + "grad_norm": 0.3018722218181238, + "learning_rate": 2.453002994056337e-06, + "loss": 0.3538, + "step": 16110 + }, + { + "epoch": 2.317371297095197, + "grad_norm": 0.30595044958719786, + "learning_rate": 2.448065276835705e-06, + "loss": 0.3546, + "step": 16115 + }, + { + "epoch": 2.3180903077365547, + "grad_norm": 0.30736329196433115, + "learning_rate": 2.4431318410177705e-06, + "loss": 0.3481, + "step": 16120 + }, + { + "epoch": 2.318809318377912, + "grad_norm": 0.32241614377384, + "learning_rate": 2.4382026893994435e-06, + "loss": 0.3447, + "step": 16125 + }, + { + "epoch": 2.3195283290192696, + "grad_norm": 0.3419157713808682, + "learning_rate": 2.4332778247751953e-06, + "loss": 0.3501, + "step": 16130 + }, + { + "epoch": 2.320247339660627, + "grad_norm": 0.3124712043542506, + "learning_rate": 2.4283572499370655e-06, + "loss": 0.354, + "step": 16135 + }, + { + "epoch": 2.3209663503019846, + "grad_norm": 0.31444315193194494, + "learning_rate": 2.4234409676746673e-06, + "loss": 0.3709, + "step": 16140 + }, + { + "epoch": 2.321685360943342, + "grad_norm": 0.299491226974594, + "learning_rate": 2.4185289807751833e-06, + "loss": 0.3547, + "step": 16145 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 0.3028786378211861, + "learning_rate": 2.413621292023349e-06, + "loss": 0.3631, + "step": 16150 + }, + { + "epoch": 2.323123382226057, + "grad_norm": 0.3038788084800907, + "learning_rate": 2.4087179042014774e-06, + "loss": 0.3466, + "step": 16155 + }, + { + "epoch": 2.3238423928674146, + "grad_norm": 0.30843838680085833, + "learning_rate": 2.403818820089431e-06, + "loss": 0.3571, + "step": 16160 + }, + { + "epoch": 2.324561403508772, + "grad_norm": 0.2949958202416981, + "learning_rate": 2.3989240424646355e-06, + "loss": 0.3363, + "step": 16165 + }, + { + "epoch": 2.3252804141501295, + "grad_norm": 0.32492884881713374, + "learning_rate": 2.3940335741020826e-06, + "loss": 0.3531, + "step": 16170 + }, + { + "epoch": 2.325999424791487, + "grad_norm": 0.29904998162825586, + "learning_rate": 2.3891474177743136e-06, + "loss": 0.3578, + "step": 16175 + }, + { + "epoch": 2.3267184354328445, + "grad_norm": 0.3014079508481498, + "learning_rate": 2.3842655762514234e-06, + "loss": 0.3472, + "step": 16180 + }, + { + "epoch": 2.3274374460742018, + "grad_norm": 0.3003664642803967, + "learning_rate": 2.379388052301066e-06, + "loss": 0.3527, + "step": 16185 + }, + { + "epoch": 2.3281564567155595, + "grad_norm": 0.3205562401165908, + "learning_rate": 2.3745148486884505e-06, + "loss": 0.34, + "step": 16190 + }, + { + "epoch": 2.3288754673569168, + "grad_norm": 0.3022338320914003, + "learning_rate": 2.369645968176326e-06, + "loss": 0.3532, + "step": 16195 + }, + { + "epoch": 2.3295944779982745, + "grad_norm": 0.31456416907548845, + "learning_rate": 2.3647814135250025e-06, + "loss": 0.3635, + "step": 16200 + }, + { + "epoch": 2.3303134886396317, + "grad_norm": 0.31209161437727956, + "learning_rate": 2.359921187492329e-06, + "loss": 0.3557, + "step": 16205 + }, + { + "epoch": 2.3310324992809894, + "grad_norm": 0.31100188059013256, + "learning_rate": 2.3550652928336994e-06, + "loss": 0.3604, + "step": 16210 + }, + { + "epoch": 2.3317515099223467, + "grad_norm": 0.3047213678132922, + "learning_rate": 2.3502137323020636e-06, + "loss": 0.3498, + "step": 16215 + }, + { + "epoch": 2.3324705205637044, + "grad_norm": 0.30394871087647873, + "learning_rate": 2.3453665086479015e-06, + "loss": 0.3422, + "step": 16220 + }, + { + "epoch": 2.3331895312050617, + "grad_norm": 0.3002461274106404, + "learning_rate": 2.34052362461924e-06, + "loss": 0.3511, + "step": 16225 + }, + { + "epoch": 2.3339085418464194, + "grad_norm": 0.3019570152023722, + "learning_rate": 2.3356850829616486e-06, + "loss": 0.3543, + "step": 16230 + }, + { + "epoch": 2.3346275524877766, + "grad_norm": 0.3028763815910138, + "learning_rate": 2.3308508864182254e-06, + "loss": 0.3646, + "step": 16235 + }, + { + "epoch": 2.3353465631291344, + "grad_norm": 0.2939782533371511, + "learning_rate": 2.3260210377296166e-06, + "loss": 0.3445, + "step": 16240 + }, + { + "epoch": 2.3360655737704916, + "grad_norm": 0.306733886861416, + "learning_rate": 2.3211955396340003e-06, + "loss": 0.358, + "step": 16245 + }, + { + "epoch": 2.3367845844118493, + "grad_norm": 0.3187293409049959, + "learning_rate": 2.3163743948670793e-06, + "loss": 0.347, + "step": 16250 + }, + { + "epoch": 2.3375035950532066, + "grad_norm": 0.3123739995817767, + "learning_rate": 2.3115576061621024e-06, + "loss": 0.359, + "step": 16255 + }, + { + "epoch": 2.3382226056945643, + "grad_norm": 0.30354960150343874, + "learning_rate": 2.306745176249838e-06, + "loss": 0.3488, + "step": 16260 + }, + { + "epoch": 2.338941616335922, + "grad_norm": 0.4608452621624845, + "learning_rate": 2.301937107858584e-06, + "loss": 0.3377, + "step": 16265 + }, + { + "epoch": 2.3396606269772793, + "grad_norm": 0.3134200470172266, + "learning_rate": 2.2971334037141756e-06, + "loss": 0.3479, + "step": 16270 + }, + { + "epoch": 2.3403796376186365, + "grad_norm": 0.30648596554580404, + "learning_rate": 2.2923340665399617e-06, + "loss": 0.3548, + "step": 16275 + }, + { + "epoch": 2.3410986482599943, + "grad_norm": 0.3122376400261476, + "learning_rate": 2.2875390990568204e-06, + "loss": 0.3551, + "step": 16280 + }, + { + "epoch": 2.341817658901352, + "grad_norm": 0.33082925511950845, + "learning_rate": 2.2827485039831533e-06, + "loss": 0.3526, + "step": 16285 + }, + { + "epoch": 2.3425366695427092, + "grad_norm": 0.31757353800080146, + "learning_rate": 2.2779622840348868e-06, + "loss": 0.3624, + "step": 16290 + }, + { + "epoch": 2.3432556801840665, + "grad_norm": 0.3104326101785247, + "learning_rate": 2.2731804419254565e-06, + "loss": 0.3622, + "step": 16295 + }, + { + "epoch": 2.343974690825424, + "grad_norm": 0.3050757513897761, + "learning_rate": 2.268402980365828e-06, + "loss": 0.3603, + "step": 16300 + }, + { + "epoch": 2.344693701466782, + "grad_norm": 0.31463816006910816, + "learning_rate": 2.263629902064475e-06, + "loss": 0.3569, + "step": 16305 + }, + { + "epoch": 2.345412712108139, + "grad_norm": 0.33432411275730034, + "learning_rate": 2.2588612097273843e-06, + "loss": 0.3636, + "step": 16310 + }, + { + "epoch": 2.346131722749497, + "grad_norm": 0.3045011398905948, + "learning_rate": 2.2540969060580685e-06, + "loss": 0.3513, + "step": 16315 + }, + { + "epoch": 2.346850733390854, + "grad_norm": 0.3113277493082616, + "learning_rate": 2.2493369937575414e-06, + "loss": 0.3503, + "step": 16320 + }, + { + "epoch": 2.347569744032212, + "grad_norm": 0.3017165149030103, + "learning_rate": 2.2445814755243277e-06, + "loss": 0.3563, + "step": 16325 + }, + { + "epoch": 2.348288754673569, + "grad_norm": 0.32665381028134893, + "learning_rate": 2.2398303540544675e-06, + "loss": 0.3641, + "step": 16330 + }, + { + "epoch": 2.349007765314927, + "grad_norm": 0.311750150664556, + "learning_rate": 2.2350836320414994e-06, + "loss": 0.35, + "step": 16335 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 0.31191935995809517, + "learning_rate": 2.230341312176476e-06, + "loss": 0.3586, + "step": 16340 + }, + { + "epoch": 2.350445786597642, + "grad_norm": 0.3106659240495657, + "learning_rate": 2.225603397147953e-06, + "loss": 0.3624, + "step": 16345 + }, + { + "epoch": 2.351164797238999, + "grad_norm": 0.32111354852882784, + "learning_rate": 2.220869889641982e-06, + "loss": 0.3581, + "step": 16350 + }, + { + "epoch": 2.351883807880357, + "grad_norm": 0.31067249529602536, + "learning_rate": 2.216140792342125e-06, + "loss": 0.345, + "step": 16355 + }, + { + "epoch": 2.352602818521714, + "grad_norm": 0.3007292058247453, + "learning_rate": 2.211416107929437e-06, + "loss": 0.3507, + "step": 16360 + }, + { + "epoch": 2.3533218291630718, + "grad_norm": 0.32091144533272364, + "learning_rate": 2.206695839082472e-06, + "loss": 0.3424, + "step": 16365 + }, + { + "epoch": 2.354040839804429, + "grad_norm": 0.31334951666591937, + "learning_rate": 2.2019799884772862e-06, + "loss": 0.3395, + "step": 16370 + }, + { + "epoch": 2.3547598504457867, + "grad_norm": 0.3219576297127824, + "learning_rate": 2.1972685587874245e-06, + "loss": 0.3537, + "step": 16375 + }, + { + "epoch": 2.355478861087144, + "grad_norm": 0.2947101729287441, + "learning_rate": 2.192561552683926e-06, + "loss": 0.3604, + "step": 16380 + }, + { + "epoch": 2.3561978717285017, + "grad_norm": 0.30076076072427377, + "learning_rate": 2.187858972835326e-06, + "loss": 0.362, + "step": 16385 + }, + { + "epoch": 2.356916882369859, + "grad_norm": 0.32859975659304574, + "learning_rate": 2.1831608219076506e-06, + "loss": 0.3661, + "step": 16390 + }, + { + "epoch": 2.3576358930112167, + "grad_norm": 0.31032566988966753, + "learning_rate": 2.178467102564409e-06, + "loss": 0.3596, + "step": 16395 + }, + { + "epoch": 2.358354903652574, + "grad_norm": 0.31282176337577733, + "learning_rate": 2.1737778174666048e-06, + "loss": 0.3517, + "step": 16400 + }, + { + "epoch": 2.3590739142939317, + "grad_norm": 0.3148146729786297, + "learning_rate": 2.1690929692727246e-06, + "loss": 0.3663, + "step": 16405 + }, + { + "epoch": 2.359792924935289, + "grad_norm": 0.3117374664985557, + "learning_rate": 2.1644125606387346e-06, + "loss": 0.3609, + "step": 16410 + }, + { + "epoch": 2.3605119355766466, + "grad_norm": 0.3064740677110053, + "learning_rate": 2.159736594218097e-06, + "loss": 0.3647, + "step": 16415 + }, + { + "epoch": 2.361230946218004, + "grad_norm": 0.304115220511543, + "learning_rate": 2.1550650726617426e-06, + "loss": 0.3542, + "step": 16420 + }, + { + "epoch": 2.3619499568593616, + "grad_norm": 0.31155459585360695, + "learning_rate": 2.1503979986180866e-06, + "loss": 0.3412, + "step": 16425 + }, + { + "epoch": 2.362668967500719, + "grad_norm": 0.3104529776654304, + "learning_rate": 2.1457353747330247e-06, + "loss": 0.3663, + "step": 16430 + }, + { + "epoch": 2.3633879781420766, + "grad_norm": 0.3121733802967262, + "learning_rate": 2.1410772036499327e-06, + "loss": 0.3418, + "step": 16435 + }, + { + "epoch": 2.364106988783434, + "grad_norm": 0.30938847939299197, + "learning_rate": 2.1364234880096524e-06, + "loss": 0.3532, + "step": 16440 + }, + { + "epoch": 2.3648259994247915, + "grad_norm": 0.3135484840880156, + "learning_rate": 2.1317742304505097e-06, + "loss": 0.3591, + "step": 16445 + }, + { + "epoch": 2.365545010066149, + "grad_norm": 0.3175485579628117, + "learning_rate": 2.1271294336082936e-06, + "loss": 0.3465, + "step": 16450 + }, + { + "epoch": 2.3662640207075065, + "grad_norm": 0.30714339546031744, + "learning_rate": 2.1224891001162738e-06, + "loss": 0.3543, + "step": 16455 + }, + { + "epoch": 2.366983031348864, + "grad_norm": 0.30491007660977404, + "learning_rate": 2.1178532326051837e-06, + "loss": 0.3444, + "step": 16460 + }, + { + "epoch": 2.3677020419902215, + "grad_norm": 0.299116979713143, + "learning_rate": 2.1132218337032227e-06, + "loss": 0.3687, + "step": 16465 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.30244982913768803, + "learning_rate": 2.1085949060360654e-06, + "loss": 0.3456, + "step": 16470 + }, + { + "epoch": 2.3691400632729365, + "grad_norm": 0.3017966260338068, + "learning_rate": 2.1039724522268436e-06, + "loss": 0.3701, + "step": 16475 + }, + { + "epoch": 2.369859073914294, + "grad_norm": 0.2990197771685328, + "learning_rate": 2.0993544748961524e-06, + "loss": 0.3559, + "step": 16480 + }, + { + "epoch": 2.3705780845556514, + "grad_norm": 0.31138638604362306, + "learning_rate": 2.0947409766620562e-06, + "loss": 0.3469, + "step": 16485 + }, + { + "epoch": 2.3712970951970087, + "grad_norm": 0.31228122688384496, + "learning_rate": 2.0901319601400772e-06, + "loss": 0.3624, + "step": 16490 + }, + { + "epoch": 2.3720161058383664, + "grad_norm": 0.313733282980535, + "learning_rate": 2.0855274279431914e-06, + "loss": 0.3574, + "step": 16495 + }, + { + "epoch": 2.372735116479724, + "grad_norm": 0.28752112482067166, + "learning_rate": 2.080927382681841e-06, + "loss": 0.3386, + "step": 16500 + }, + { + "epoch": 2.3734541271210814, + "grad_norm": 0.3122161854370265, + "learning_rate": 2.0763318269639175e-06, + "loss": 0.3562, + "step": 16505 + }, + { + "epoch": 2.3741731377624387, + "grad_norm": 0.30338082102032904, + "learning_rate": 2.0717407633947683e-06, + "loss": 0.3626, + "step": 16510 + }, + { + "epoch": 2.3748921484037964, + "grad_norm": 0.31256480793426256, + "learning_rate": 2.0671541945772e-06, + "loss": 0.3673, + "step": 16515 + }, + { + "epoch": 2.375611159045154, + "grad_norm": 0.3022378542581336, + "learning_rate": 2.0625721231114638e-06, + "loss": 0.3554, + "step": 16520 + }, + { + "epoch": 2.3763301696865113, + "grad_norm": 0.3091646292671584, + "learning_rate": 2.0579945515952616e-06, + "loss": 0.3495, + "step": 16525 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.3052049807170149, + "learning_rate": 2.0534214826237486e-06, + "loss": 0.3541, + "step": 16530 + }, + { + "epoch": 2.3777681909692263, + "grad_norm": 0.29774851165940125, + "learning_rate": 2.048852918789529e-06, + "loss": 0.375, + "step": 16535 + }, + { + "epoch": 2.378487201610584, + "grad_norm": 0.31071838766471116, + "learning_rate": 2.044288862682643e-06, + "loss": 0.3557, + "step": 16540 + }, + { + "epoch": 2.3792062122519413, + "grad_norm": 0.3015799692278713, + "learning_rate": 2.0397293168905876e-06, + "loss": 0.3457, + "step": 16545 + }, + { + "epoch": 2.379925222893299, + "grad_norm": 0.3171182651740354, + "learning_rate": 2.0351742839982936e-06, + "loss": 0.3715, + "step": 16550 + }, + { + "epoch": 2.3806442335346563, + "grad_norm": 0.30145775610388537, + "learning_rate": 2.0306237665881336e-06, + "loss": 0.3438, + "step": 16555 + }, + { + "epoch": 2.381363244176014, + "grad_norm": 0.29783895885463646, + "learning_rate": 2.026077767239928e-06, + "loss": 0.3513, + "step": 16560 + }, + { + "epoch": 2.3820822548173712, + "grad_norm": 0.3083372562288726, + "learning_rate": 2.0215362885309253e-06, + "loss": 0.3653, + "step": 16565 + }, + { + "epoch": 2.382801265458729, + "grad_norm": 0.30894133256713824, + "learning_rate": 2.016999333035824e-06, + "loss": 0.365, + "step": 16570 + }, + { + "epoch": 2.383520276100086, + "grad_norm": 0.3100165022014743, + "learning_rate": 2.012466903326743e-06, + "loss": 0.3624, + "step": 16575 + }, + { + "epoch": 2.384239286741444, + "grad_norm": 0.3172582338793338, + "learning_rate": 2.007939001973249e-06, + "loss": 0.3632, + "step": 16580 + }, + { + "epoch": 2.384958297382801, + "grad_norm": 0.2993700156185241, + "learning_rate": 2.0034156315423325e-06, + "loss": 0.3411, + "step": 16585 + }, + { + "epoch": 2.385677308024159, + "grad_norm": 0.3067991007904251, + "learning_rate": 1.9988967945984216e-06, + "loss": 0.3765, + "step": 16590 + }, + { + "epoch": 2.386396318665516, + "grad_norm": 0.29813149304561987, + "learning_rate": 1.9943824937033675e-06, + "loss": 0.3673, + "step": 16595 + }, + { + "epoch": 2.387115329306874, + "grad_norm": 0.31150650600211865, + "learning_rate": 1.989872731416457e-06, + "loss": 0.3475, + "step": 16600 + }, + { + "epoch": 2.387834339948231, + "grad_norm": 0.2987440339189312, + "learning_rate": 1.985367510294398e-06, + "loss": 0.3473, + "step": 16605 + }, + { + "epoch": 2.388553350589589, + "grad_norm": 0.31135653279374875, + "learning_rate": 1.980866832891325e-06, + "loss": 0.3593, + "step": 16610 + }, + { + "epoch": 2.389272361230946, + "grad_norm": 0.345410883194383, + "learning_rate": 1.976370701758802e-06, + "loss": 0.3643, + "step": 16615 + }, + { + "epoch": 2.389991371872304, + "grad_norm": 0.29833245223834287, + "learning_rate": 1.9718791194458086e-06, + "loss": 0.3525, + "step": 16620 + }, + { + "epoch": 2.390710382513661, + "grad_norm": 0.3070446717095762, + "learning_rate": 1.9673920884987462e-06, + "loss": 0.3574, + "step": 16625 + }, + { + "epoch": 2.391429393155019, + "grad_norm": 0.3165435717253997, + "learning_rate": 1.96290961146144e-06, + "loss": 0.3602, + "step": 16630 + }, + { + "epoch": 2.392148403796376, + "grad_norm": 0.3061032201985541, + "learning_rate": 1.9584316908751334e-06, + "loss": 0.3575, + "step": 16635 + }, + { + "epoch": 2.3928674144377338, + "grad_norm": 0.33595527793917046, + "learning_rate": 1.9539583292784805e-06, + "loss": 0.3451, + "step": 16640 + }, + { + "epoch": 2.393586425079091, + "grad_norm": 0.31692637409251406, + "learning_rate": 1.94948952920756e-06, + "loss": 0.366, + "step": 16645 + }, + { + "epoch": 2.3943054357204487, + "grad_norm": 0.32106051722998524, + "learning_rate": 1.945025293195857e-06, + "loss": 0.3629, + "step": 16650 + }, + { + "epoch": 2.395024446361806, + "grad_norm": 0.31875715775304597, + "learning_rate": 1.9405656237742678e-06, + "loss": 0.3562, + "step": 16655 + }, + { + "epoch": 2.3957434570031637, + "grad_norm": 0.31159749223707356, + "learning_rate": 1.936110523471111e-06, + "loss": 0.3505, + "step": 16660 + }, + { + "epoch": 2.396462467644521, + "grad_norm": 0.31894280138883213, + "learning_rate": 1.9316599948121017e-06, + "loss": 0.3565, + "step": 16665 + }, + { + "epoch": 2.3971814782858787, + "grad_norm": 0.30113119455442694, + "learning_rate": 1.9272140403203687e-06, + "loss": 0.3394, + "step": 16670 + }, + { + "epoch": 2.397900488927236, + "grad_norm": 0.30222770577244146, + "learning_rate": 1.92277266251645e-06, + "loss": 0.3696, + "step": 16675 + }, + { + "epoch": 2.3986194995685937, + "grad_norm": 0.3197379106148971, + "learning_rate": 1.918335863918286e-06, + "loss": 0.3582, + "step": 16680 + }, + { + "epoch": 2.399338510209951, + "grad_norm": 0.30244520929727703, + "learning_rate": 1.913903647041224e-06, + "loss": 0.3435, + "step": 16685 + }, + { + "epoch": 2.4000575208513086, + "grad_norm": 0.30923941953194345, + "learning_rate": 1.9094760143980107e-06, + "loss": 0.3457, + "step": 16690 + }, + { + "epoch": 2.400776531492666, + "grad_norm": 0.3076681220598972, + "learning_rate": 1.9050529684987906e-06, + "loss": 0.3657, + "step": 16695 + }, + { + "epoch": 2.4014955421340236, + "grad_norm": 0.3063633406576403, + "learning_rate": 1.9006345118511171e-06, + "loss": 0.344, + "step": 16700 + }, + { + "epoch": 2.402214552775381, + "grad_norm": 0.31846373280083884, + "learning_rate": 1.8962206469599353e-06, + "loss": 0.3464, + "step": 16705 + }, + { + "epoch": 2.4029335634167386, + "grad_norm": 0.3067389528265799, + "learning_rate": 1.8918113763275847e-06, + "loss": 0.3622, + "step": 16710 + }, + { + "epoch": 2.4036525740580963, + "grad_norm": 0.3012500601430393, + "learning_rate": 1.887406702453809e-06, + "loss": 0.3537, + "step": 16715 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 0.3123994980644394, + "learning_rate": 1.8830066278357395e-06, + "loss": 0.3667, + "step": 16720 + }, + { + "epoch": 2.405090595340811, + "grad_norm": 0.3070931996515153, + "learning_rate": 1.8786111549678977e-06, + "loss": 0.3576, + "step": 16725 + }, + { + "epoch": 2.4058096059821685, + "grad_norm": 0.30152513549698934, + "learning_rate": 1.8742202863422033e-06, + "loss": 0.3582, + "step": 16730 + }, + { + "epoch": 2.4065286166235262, + "grad_norm": 0.31045639787630824, + "learning_rate": 1.869834024447964e-06, + "loss": 0.3627, + "step": 16735 + }, + { + "epoch": 2.4072476272648835, + "grad_norm": 0.313458442773504, + "learning_rate": 1.8654523717718697e-06, + "loss": 0.358, + "step": 16740 + }, + { + "epoch": 2.4079666379062408, + "grad_norm": 0.3121673881775161, + "learning_rate": 1.8610753307980068e-06, + "loss": 0.3422, + "step": 16745 + }, + { + "epoch": 2.4086856485475985, + "grad_norm": 0.3138157455498551, + "learning_rate": 1.85670290400784e-06, + "loss": 0.3514, + "step": 16750 + }, + { + "epoch": 2.409404659188956, + "grad_norm": 0.3062831057689979, + "learning_rate": 1.8523350938802165e-06, + "loss": 0.345, + "step": 16755 + }, + { + "epoch": 2.4101236698303135, + "grad_norm": 0.3088378047998563, + "learning_rate": 1.8479719028913746e-06, + "loss": 0.3428, + "step": 16760 + }, + { + "epoch": 2.410842680471671, + "grad_norm": 0.31155147696743823, + "learning_rate": 1.8436133335149276e-06, + "loss": 0.3702, + "step": 16765 + }, + { + "epoch": 2.4115616911130284, + "grad_norm": 0.31777544208044184, + "learning_rate": 1.839259388221868e-06, + "loss": 0.3589, + "step": 16770 + }, + { + "epoch": 2.412280701754386, + "grad_norm": 0.31445743776164564, + "learning_rate": 1.8349100694805711e-06, + "loss": 0.3543, + "step": 16775 + }, + { + "epoch": 2.4129997123957434, + "grad_norm": 0.33402574635843285, + "learning_rate": 1.8305653797567869e-06, + "loss": 0.3626, + "step": 16780 + }, + { + "epoch": 2.413718723037101, + "grad_norm": 0.30402025220962886, + "learning_rate": 1.8262253215136438e-06, + "loss": 0.3563, + "step": 16785 + }, + { + "epoch": 2.4144377336784584, + "grad_norm": 0.30518960338687, + "learning_rate": 1.8218898972116394e-06, + "loss": 0.3543, + "step": 16790 + }, + { + "epoch": 2.415156744319816, + "grad_norm": 0.3061444672734299, + "learning_rate": 1.8175591093086442e-06, + "loss": 0.3516, + "step": 16795 + }, + { + "epoch": 2.4158757549611733, + "grad_norm": 0.30806427596885644, + "learning_rate": 1.8132329602599097e-06, + "loss": 0.3648, + "step": 16800 + }, + { + "epoch": 2.416594765602531, + "grad_norm": 0.38849773803962784, + "learning_rate": 1.8089114525180451e-06, + "loss": 0.349, + "step": 16805 + }, + { + "epoch": 2.4173137762438883, + "grad_norm": 0.30563185571890733, + "learning_rate": 1.8045945885330341e-06, + "loss": 0.3537, + "step": 16810 + }, + { + "epoch": 2.418032786885246, + "grad_norm": 0.29673282061356554, + "learning_rate": 1.80028237075223e-06, + "loss": 0.3704, + "step": 16815 + }, + { + "epoch": 2.4187517975266033, + "grad_norm": 0.3070195687967652, + "learning_rate": 1.795974801620346e-06, + "loss": 0.3675, + "step": 16820 + }, + { + "epoch": 2.419470808167961, + "grad_norm": 0.30232957242307895, + "learning_rate": 1.791671883579469e-06, + "loss": 0.3489, + "step": 16825 + }, + { + "epoch": 2.4201898188093183, + "grad_norm": 0.31102097099611603, + "learning_rate": 1.787373619069036e-06, + "loss": 0.3619, + "step": 16830 + }, + { + "epoch": 2.420908829450676, + "grad_norm": 0.3613315161174587, + "learning_rate": 1.7830800105258605e-06, + "loss": 0.3602, + "step": 16835 + }, + { + "epoch": 2.4216278400920332, + "grad_norm": 0.3015445183271017, + "learning_rate": 1.778791060384104e-06, + "loss": 0.3492, + "step": 16840 + }, + { + "epoch": 2.422346850733391, + "grad_norm": 0.30762161087025014, + "learning_rate": 1.774506771075295e-06, + "loss": 0.3575, + "step": 16845 + }, + { + "epoch": 2.423065861374748, + "grad_norm": 0.3105962165052004, + "learning_rate": 1.770227145028316e-06, + "loss": 0.3519, + "step": 16850 + }, + { + "epoch": 2.423784872016106, + "grad_norm": 0.2996130875400581, + "learning_rate": 1.7659521846694039e-06, + "loss": 0.3611, + "step": 16855 + }, + { + "epoch": 2.424503882657463, + "grad_norm": 0.3030618387750977, + "learning_rate": 1.761681892422158e-06, + "loss": 0.3567, + "step": 16860 + }, + { + "epoch": 2.425222893298821, + "grad_norm": 0.3090647114349956, + "learning_rate": 1.7574162707075226e-06, + "loss": 0.3615, + "step": 16865 + }, + { + "epoch": 2.425941903940178, + "grad_norm": 0.3086330386760215, + "learning_rate": 1.753155321943797e-06, + "loss": 0.3697, + "step": 16870 + }, + { + "epoch": 2.426660914581536, + "grad_norm": 0.3076850664842922, + "learning_rate": 1.748899048546634e-06, + "loss": 0.3615, + "step": 16875 + }, + { + "epoch": 2.427379925222893, + "grad_norm": 0.2957884747974804, + "learning_rate": 1.7446474529290359e-06, + "loss": 0.3431, + "step": 16880 + }, + { + "epoch": 2.428098935864251, + "grad_norm": 0.3107362937467926, + "learning_rate": 1.7404005375013466e-06, + "loss": 0.3597, + "step": 16885 + }, + { + "epoch": 2.428817946505608, + "grad_norm": 0.312188974556344, + "learning_rate": 1.7361583046712649e-06, + "loss": 0.3715, + "step": 16890 + }, + { + "epoch": 2.429536957146966, + "grad_norm": 0.2967382756754711, + "learning_rate": 1.7319207568438278e-06, + "loss": 0.3599, + "step": 16895 + }, + { + "epoch": 2.430255967788323, + "grad_norm": 0.31529094661142887, + "learning_rate": 1.7276878964214227e-06, + "loss": 0.3403, + "step": 16900 + }, + { + "epoch": 2.430974978429681, + "grad_norm": 0.2986818445027334, + "learning_rate": 1.7234597258037756e-06, + "loss": 0.3519, + "step": 16905 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 0.3125459971936233, + "learning_rate": 1.719236247387951e-06, + "loss": 0.3656, + "step": 16910 + }, + { + "epoch": 2.4324129997123958, + "grad_norm": 0.3717572805196429, + "learning_rate": 1.7150174635683615e-06, + "loss": 0.3642, + "step": 16915 + }, + { + "epoch": 2.433132010353753, + "grad_norm": 0.3059666316215835, + "learning_rate": 1.7108033767367494e-06, + "loss": 0.3725, + "step": 16920 + }, + { + "epoch": 2.4338510209951107, + "grad_norm": 0.3196174908517126, + "learning_rate": 1.7065939892821992e-06, + "loss": 0.3495, + "step": 16925 + }, + { + "epoch": 2.4345700316364685, + "grad_norm": 0.30571009156213, + "learning_rate": 1.7023893035911355e-06, + "loss": 0.3706, + "step": 16930 + }, + { + "epoch": 2.4352890422778257, + "grad_norm": 0.30831064132873515, + "learning_rate": 1.6981893220473067e-06, + "loss": 0.3394, + "step": 16935 + }, + { + "epoch": 2.436008052919183, + "grad_norm": 0.33104396855694757, + "learning_rate": 1.6939940470317984e-06, + "loss": 0.3537, + "step": 16940 + }, + { + "epoch": 2.4367270635605407, + "grad_norm": 0.31128059060130037, + "learning_rate": 1.6898034809230334e-06, + "loss": 0.3753, + "step": 16945 + }, + { + "epoch": 2.4374460742018984, + "grad_norm": 0.3148376623822208, + "learning_rate": 1.6856176260967593e-06, + "loss": 0.3574, + "step": 16950 + }, + { + "epoch": 2.4381650848432557, + "grad_norm": 0.31121691153326964, + "learning_rate": 1.681436484926051e-06, + "loss": 0.349, + "step": 16955 + }, + { + "epoch": 2.438884095484613, + "grad_norm": 0.3097706154767799, + "learning_rate": 1.6772600597813194e-06, + "loss": 0.3545, + "step": 16960 + }, + { + "epoch": 2.4396031061259706, + "grad_norm": 0.3138394742795372, + "learning_rate": 1.673088353030291e-06, + "loss": 0.3583, + "step": 16965 + }, + { + "epoch": 2.4403221167673284, + "grad_norm": 0.3009795543198072, + "learning_rate": 1.668921367038029e-06, + "loss": 0.3557, + "step": 16970 + }, + { + "epoch": 2.4410411274086856, + "grad_norm": 0.3058900019055182, + "learning_rate": 1.6647591041669076e-06, + "loss": 0.3662, + "step": 16975 + }, + { + "epoch": 2.4417601380500433, + "grad_norm": 0.30977411122929144, + "learning_rate": 1.6606015667766362e-06, + "loss": 0.3404, + "step": 16980 + }, + { + "epoch": 2.4424791486914006, + "grad_norm": 0.3086532344833748, + "learning_rate": 1.6564487572242338e-06, + "loss": 0.3634, + "step": 16985 + }, + { + "epoch": 2.4431981593327583, + "grad_norm": 0.30919037973353314, + "learning_rate": 1.6523006778640472e-06, + "loss": 0.345, + "step": 16990 + }, + { + "epoch": 2.4439171699741156, + "grad_norm": 0.3350256855598322, + "learning_rate": 1.6481573310477384e-06, + "loss": 0.3553, + "step": 16995 + }, + { + "epoch": 2.4446361806154733, + "grad_norm": 0.3399771416992865, + "learning_rate": 1.644018719124283e-06, + "loss": 0.3373, + "step": 17000 + }, + { + "epoch": 2.4453551912568305, + "grad_norm": 0.31607703923232544, + "learning_rate": 1.6398848444399794e-06, + "loss": 0.3586, + "step": 17005 + }, + { + "epoch": 2.4460742018981882, + "grad_norm": 0.3201203019090241, + "learning_rate": 1.6357557093384335e-06, + "loss": 0.3595, + "step": 17010 + }, + { + "epoch": 2.4467932125395455, + "grad_norm": 0.3002187152760896, + "learning_rate": 1.6316313161605723e-06, + "loss": 0.3457, + "step": 17015 + }, + { + "epoch": 2.4475122231809032, + "grad_norm": 0.3243764298092394, + "learning_rate": 1.6275116672446235e-06, + "loss": 0.3576, + "step": 17020 + }, + { + "epoch": 2.4482312338222605, + "grad_norm": 0.3277410455544984, + "learning_rate": 1.6233967649261328e-06, + "loss": 0.362, + "step": 17025 + }, + { + "epoch": 2.448950244463618, + "grad_norm": 0.31461864856915484, + "learning_rate": 1.619286611537958e-06, + "loss": 0.3579, + "step": 17030 + }, + { + "epoch": 2.4496692551049755, + "grad_norm": 0.3160979685375077, + "learning_rate": 1.6151812094102548e-06, + "loss": 0.3611, + "step": 17035 + }, + { + "epoch": 2.450388265746333, + "grad_norm": 0.3040492263650492, + "learning_rate": 1.6110805608704904e-06, + "loss": 0.3596, + "step": 17040 + }, + { + "epoch": 2.4511072763876904, + "grad_norm": 0.31725131938443385, + "learning_rate": 1.606984668243441e-06, + "loss": 0.3631, + "step": 17045 + }, + { + "epoch": 2.451826287029048, + "grad_norm": 0.30864474580795526, + "learning_rate": 1.6028935338511786e-06, + "loss": 0.3338, + "step": 17050 + }, + { + "epoch": 2.4525452976704054, + "grad_norm": 0.3001447069907468, + "learning_rate": 1.5988071600130805e-06, + "loss": 0.3397, + "step": 17055 + }, + { + "epoch": 2.453264308311763, + "grad_norm": 0.3165638007878571, + "learning_rate": 1.5947255490458312e-06, + "loss": 0.3606, + "step": 17060 + }, + { + "epoch": 2.4539833189531204, + "grad_norm": 0.32135725369278234, + "learning_rate": 1.5906487032634055e-06, + "loss": 0.359, + "step": 17065 + }, + { + "epoch": 2.454702329594478, + "grad_norm": 0.3266234948475013, + "learning_rate": 1.586576624977082e-06, + "loss": 0.3553, + "step": 17070 + }, + { + "epoch": 2.4554213402358354, + "grad_norm": 0.31119789748060417, + "learning_rate": 1.5825093164954387e-06, + "loss": 0.3501, + "step": 17075 + }, + { + "epoch": 2.456140350877193, + "grad_norm": 0.29703370983787136, + "learning_rate": 1.578446780124344e-06, + "loss": 0.3546, + "step": 17080 + }, + { + "epoch": 2.4568593615185503, + "grad_norm": 0.31650867374625985, + "learning_rate": 1.5743890181669607e-06, + "loss": 0.342, + "step": 17085 + }, + { + "epoch": 2.457578372159908, + "grad_norm": 0.3250380853925492, + "learning_rate": 1.5703360329237526e-06, + "loss": 0.3555, + "step": 17090 + }, + { + "epoch": 2.4582973828012653, + "grad_norm": 0.31577974895217814, + "learning_rate": 1.5662878266924675e-06, + "loss": 0.362, + "step": 17095 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.29951875548924073, + "learning_rate": 1.5622444017681438e-06, + "loss": 0.3471, + "step": 17100 + }, + { + "epoch": 2.4597354040839803, + "grad_norm": 0.308005757258318, + "learning_rate": 1.5582057604431178e-06, + "loss": 0.3643, + "step": 17105 + }, + { + "epoch": 2.460454414725338, + "grad_norm": 0.30455057643136085, + "learning_rate": 1.5541719050070026e-06, + "loss": 0.352, + "step": 17110 + }, + { + "epoch": 2.4611734253666953, + "grad_norm": 0.3057923328411924, + "learning_rate": 1.5501428377467087e-06, + "loss": 0.3462, + "step": 17115 + }, + { + "epoch": 2.461892436008053, + "grad_norm": 0.30086318438027765, + "learning_rate": 1.5461185609464214e-06, + "loss": 0.3556, + "step": 17120 + }, + { + "epoch": 2.4626114466494102, + "grad_norm": 0.3099221276873916, + "learning_rate": 1.5420990768876175e-06, + "loss": 0.3562, + "step": 17125 + }, + { + "epoch": 2.463330457290768, + "grad_norm": 0.32262263906273925, + "learning_rate": 1.5380843878490592e-06, + "loss": 0.3659, + "step": 17130 + }, + { + "epoch": 2.464049467932125, + "grad_norm": 0.33418684176700825, + "learning_rate": 1.5340744961067821e-06, + "loss": 0.3462, + "step": 17135 + }, + { + "epoch": 2.464768478573483, + "grad_norm": 0.3097293293337475, + "learning_rate": 1.5300694039341035e-06, + "loss": 0.353, + "step": 17140 + }, + { + "epoch": 2.4654874892148406, + "grad_norm": 0.298674900463793, + "learning_rate": 1.526069113601627e-06, + "loss": 0.3484, + "step": 17145 + }, + { + "epoch": 2.466206499856198, + "grad_norm": 0.3158087180370093, + "learning_rate": 1.5220736273772263e-06, + "loss": 0.3517, + "step": 17150 + }, + { + "epoch": 2.466925510497555, + "grad_norm": 0.31834404918792936, + "learning_rate": 1.5180829475260517e-06, + "loss": 0.3744, + "step": 17155 + }, + { + "epoch": 2.467644521138913, + "grad_norm": 0.31709258011300673, + "learning_rate": 1.5140970763105356e-06, + "loss": 0.3544, + "step": 17160 + }, + { + "epoch": 2.4683635317802706, + "grad_norm": 0.3120763334555011, + "learning_rate": 1.510116015990376e-06, + "loss": 0.3513, + "step": 17165 + }, + { + "epoch": 2.469082542421628, + "grad_norm": 0.3162692296179125, + "learning_rate": 1.5061397688225477e-06, + "loss": 0.3557, + "step": 17170 + }, + { + "epoch": 2.469801553062985, + "grad_norm": 0.31157358107536154, + "learning_rate": 1.5021683370613017e-06, + "loss": 0.3685, + "step": 17175 + }, + { + "epoch": 2.470520563704343, + "grad_norm": 0.2954469245495106, + "learning_rate": 1.498201722958148e-06, + "loss": 0.3482, + "step": 17180 + }, + { + "epoch": 2.4712395743457005, + "grad_norm": 0.3118611389765953, + "learning_rate": 1.494239928761869e-06, + "loss": 0.3651, + "step": 17185 + }, + { + "epoch": 2.471958584987058, + "grad_norm": 0.3153770084694792, + "learning_rate": 1.490282956718524e-06, + "loss": 0.3539, + "step": 17190 + }, + { + "epoch": 2.4726775956284155, + "grad_norm": 0.30868214955442846, + "learning_rate": 1.4863308090714258e-06, + "loss": 0.3473, + "step": 17195 + }, + { + "epoch": 2.4733966062697728, + "grad_norm": 0.30759194940633955, + "learning_rate": 1.4823834880611554e-06, + "loss": 0.3507, + "step": 17200 + }, + { + "epoch": 2.4741156169111305, + "grad_norm": 0.3402795028334639, + "learning_rate": 1.4784409959255642e-06, + "loss": 0.3567, + "step": 17205 + }, + { + "epoch": 2.4748346275524877, + "grad_norm": 0.3175717638334829, + "learning_rate": 1.4745033348997572e-06, + "loss": 0.3613, + "step": 17210 + }, + { + "epoch": 2.4755536381938454, + "grad_norm": 0.3074937262153812, + "learning_rate": 1.470570507216108e-06, + "loss": 0.3522, + "step": 17215 + }, + { + "epoch": 2.4762726488352027, + "grad_norm": 0.3125326237358324, + "learning_rate": 1.4666425151042429e-06, + "loss": 0.3458, + "step": 17220 + }, + { + "epoch": 2.4769916594765604, + "grad_norm": 0.30608668052185395, + "learning_rate": 1.4627193607910516e-06, + "loss": 0.353, + "step": 17225 + }, + { + "epoch": 2.4777106701179177, + "grad_norm": 0.318666532375494, + "learning_rate": 1.458801046500683e-06, + "loss": 0.3549, + "step": 17230 + }, + { + "epoch": 2.4784296807592754, + "grad_norm": 0.3142834316070068, + "learning_rate": 1.4548875744545366e-06, + "loss": 0.367, + "step": 17235 + }, + { + "epoch": 2.4791486914006327, + "grad_norm": 0.30397581194824813, + "learning_rate": 1.4509789468712653e-06, + "loss": 0.3575, + "step": 17240 + }, + { + "epoch": 2.4798677020419904, + "grad_norm": 0.3057273682661973, + "learning_rate": 1.4470751659667849e-06, + "loss": 0.3443, + "step": 17245 + }, + { + "epoch": 2.4805867126833476, + "grad_norm": 0.32534194389958865, + "learning_rate": 1.4431762339542553e-06, + "loss": 0.3561, + "step": 17250 + }, + { + "epoch": 2.4813057233247053, + "grad_norm": 0.3160287954987181, + "learning_rate": 1.4392821530440882e-06, + "loss": 0.3516, + "step": 17255 + }, + { + "epoch": 2.4820247339660626, + "grad_norm": 0.3136330939409183, + "learning_rate": 1.4353929254439502e-06, + "loss": 0.3556, + "step": 17260 + }, + { + "epoch": 2.4827437446074203, + "grad_norm": 0.3215116924328709, + "learning_rate": 1.4315085533587502e-06, + "loss": 0.3562, + "step": 17265 + }, + { + "epoch": 2.4834627552487776, + "grad_norm": 0.31491915843149676, + "learning_rate": 1.4276290389906478e-06, + "loss": 0.342, + "step": 17270 + }, + { + "epoch": 2.4841817658901353, + "grad_norm": 0.3090368767273289, + "learning_rate": 1.423754384539051e-06, + "loss": 0.3492, + "step": 17275 + }, + { + "epoch": 2.4849007765314925, + "grad_norm": 0.3009441303694336, + "learning_rate": 1.419884592200609e-06, + "loss": 0.3484, + "step": 17280 + }, + { + "epoch": 2.4856197871728503, + "grad_norm": 0.32423942758517144, + "learning_rate": 1.4160196641692093e-06, + "loss": 0.3685, + "step": 17285 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 0.3032972586724873, + "learning_rate": 1.4121596026359951e-06, + "loss": 0.3579, + "step": 17290 + }, + { + "epoch": 2.4870578084555652, + "grad_norm": 0.3027485270411955, + "learning_rate": 1.4083044097893396e-06, + "loss": 0.3451, + "step": 17295 + }, + { + "epoch": 2.4877768190969225, + "grad_norm": 0.31783825112011593, + "learning_rate": 1.4044540878148572e-06, + "loss": 0.3567, + "step": 17300 + }, + { + "epoch": 2.48849582973828, + "grad_norm": 0.3157336204160479, + "learning_rate": 1.4006086388954066e-06, + "loss": 0.3693, + "step": 17305 + }, + { + "epoch": 2.4892148403796375, + "grad_norm": 0.30291168506729627, + "learning_rate": 1.3967680652110783e-06, + "loss": 0.3733, + "step": 17310 + }, + { + "epoch": 2.489933851020995, + "grad_norm": 0.32324881197648486, + "learning_rate": 1.3929323689391994e-06, + "loss": 0.3605, + "step": 17315 + }, + { + "epoch": 2.4906528616623524, + "grad_norm": 0.29364732713138836, + "learning_rate": 1.3891015522543382e-06, + "loss": 0.3464, + "step": 17320 + }, + { + "epoch": 2.49137187230371, + "grad_norm": 0.3261749345456971, + "learning_rate": 1.3852756173282889e-06, + "loss": 0.365, + "step": 17325 + }, + { + "epoch": 2.4920908829450674, + "grad_norm": 0.2997107742358322, + "learning_rate": 1.3814545663300783e-06, + "loss": 0.3591, + "step": 17330 + }, + { + "epoch": 2.492809893586425, + "grad_norm": 0.307601657265674, + "learning_rate": 1.3776384014259714e-06, + "loss": 0.3512, + "step": 17335 + }, + { + "epoch": 2.4935289042277824, + "grad_norm": 0.3163799337854286, + "learning_rate": 1.3738271247794533e-06, + "loss": 0.3467, + "step": 17340 + }, + { + "epoch": 2.49424791486914, + "grad_norm": 0.3190152944508011, + "learning_rate": 1.3700207385512497e-06, + "loss": 0.3561, + "step": 17345 + }, + { + "epoch": 2.4949669255104974, + "grad_norm": 0.3130223226974614, + "learning_rate": 1.3662192448993028e-06, + "loss": 0.3467, + "step": 17350 + }, + { + "epoch": 2.495685936151855, + "grad_norm": 0.306490021257793, + "learning_rate": 1.3624226459787849e-06, + "loss": 0.3517, + "step": 17355 + }, + { + "epoch": 2.496404946793213, + "grad_norm": 0.3229106798668172, + "learning_rate": 1.3586309439420985e-06, + "loss": 0.3484, + "step": 17360 + }, + { + "epoch": 2.49712395743457, + "grad_norm": 0.3088760623768454, + "learning_rate": 1.3548441409388591e-06, + "loss": 0.3536, + "step": 17365 + }, + { + "epoch": 2.4978429680759273, + "grad_norm": 0.3157699187140034, + "learning_rate": 1.3510622391159156e-06, + "loss": 0.3631, + "step": 17370 + }, + { + "epoch": 2.498561978717285, + "grad_norm": 0.3111580111211083, + "learning_rate": 1.3472852406173342e-06, + "loss": 0.3382, + "step": 17375 + }, + { + "epoch": 2.4992809893586427, + "grad_norm": 0.30354069525747657, + "learning_rate": 1.3435131475843988e-06, + "loss": 0.3717, + "step": 17380 + }, + { + "epoch": 2.5, + "grad_norm": 0.3196523648789475, + "learning_rate": 1.339745962155613e-06, + "loss": 0.3676, + "step": 17385 + }, + { + "epoch": 2.5007190106413573, + "grad_norm": 0.29607395991693086, + "learning_rate": 1.3359836864667043e-06, + "loss": 0.3413, + "step": 17390 + }, + { + "epoch": 2.501438021282715, + "grad_norm": 0.30646191943253703, + "learning_rate": 1.3322263226506072e-06, + "loss": 0.3753, + "step": 17395 + }, + { + "epoch": 2.5021570319240727, + "grad_norm": 0.29913069531853337, + "learning_rate": 1.3284738728374769e-06, + "loss": 0.3618, + "step": 17400 + }, + { + "epoch": 2.50287604256543, + "grad_norm": 0.30682008904914354, + "learning_rate": 1.3247263391546838e-06, + "loss": 0.3453, + "step": 17405 + }, + { + "epoch": 2.503595053206787, + "grad_norm": 0.3068404197607742, + "learning_rate": 1.3209837237268075e-06, + "loss": 0.3588, + "step": 17410 + }, + { + "epoch": 2.504314063848145, + "grad_norm": 0.30055187674162964, + "learning_rate": 1.3172460286756417e-06, + "loss": 0.3468, + "step": 17415 + }, + { + "epoch": 2.5050330744895026, + "grad_norm": 0.3043986575852982, + "learning_rate": 1.3135132561201925e-06, + "loss": 0.3541, + "step": 17420 + }, + { + "epoch": 2.50575208513086, + "grad_norm": 0.3029808494772463, + "learning_rate": 1.3097854081766715e-06, + "loss": 0.3579, + "step": 17425 + }, + { + "epoch": 2.506471095772217, + "grad_norm": 0.3063283709079658, + "learning_rate": 1.3060624869584959e-06, + "loss": 0.35, + "step": 17430 + }, + { + "epoch": 2.507190106413575, + "grad_norm": 0.3156085285867541, + "learning_rate": 1.3023444945762997e-06, + "loss": 0.3539, + "step": 17435 + }, + { + "epoch": 2.5079091170549326, + "grad_norm": 0.30120145554191713, + "learning_rate": 1.2986314331379147e-06, + "loss": 0.3527, + "step": 17440 + }, + { + "epoch": 2.50862812769629, + "grad_norm": 0.32084302624344235, + "learning_rate": 1.2949233047483756e-06, + "loss": 0.3541, + "step": 17445 + }, + { + "epoch": 2.5093471383376476, + "grad_norm": 0.31549607511388816, + "learning_rate": 1.29122011150993e-06, + "loss": 0.3478, + "step": 17450 + }, + { + "epoch": 2.510066148979005, + "grad_norm": 0.3061931307570909, + "learning_rate": 1.287521855522015e-06, + "loss": 0.3386, + "step": 17455 + }, + { + "epoch": 2.5107851596203625, + "grad_norm": 0.30550206028882093, + "learning_rate": 1.2838285388812788e-06, + "loss": 0.3597, + "step": 17460 + }, + { + "epoch": 2.51150417026172, + "grad_norm": 0.3115625018502871, + "learning_rate": 1.280140163681568e-06, + "loss": 0.3615, + "step": 17465 + }, + { + "epoch": 2.5122231809030775, + "grad_norm": 0.31812462882374754, + "learning_rate": 1.276456732013921e-06, + "loss": 0.3598, + "step": 17470 + }, + { + "epoch": 2.5129421915444348, + "grad_norm": 0.3179610690815218, + "learning_rate": 1.2727782459665816e-06, + "loss": 0.352, + "step": 17475 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 0.3089280360690141, + "learning_rate": 1.2691047076249852e-06, + "loss": 0.3478, + "step": 17480 + }, + { + "epoch": 2.5143802128271497, + "grad_norm": 0.3064307988896098, + "learning_rate": 1.26543611907176e-06, + "loss": 0.3581, + "step": 17485 + }, + { + "epoch": 2.5150992234685075, + "grad_norm": 0.3217412579727273, + "learning_rate": 1.2617724823867373e-06, + "loss": 0.3721, + "step": 17490 + }, + { + "epoch": 2.5158182341098647, + "grad_norm": 0.3215853577403647, + "learning_rate": 1.2581137996469306e-06, + "loss": 0.3672, + "step": 17495 + }, + { + "epoch": 2.5165372447512224, + "grad_norm": 0.3098704726577232, + "learning_rate": 1.2544600729265499e-06, + "loss": 0.3458, + "step": 17500 + }, + { + "epoch": 2.5172562553925797, + "grad_norm": 0.30909301288355273, + "learning_rate": 1.2508113042969972e-06, + "loss": 0.3637, + "step": 17505 + }, + { + "epoch": 2.5179752660339374, + "grad_norm": 0.3082915578651469, + "learning_rate": 1.2471674958268564e-06, + "loss": 0.3459, + "step": 17510 + }, + { + "epoch": 2.5186942766752947, + "grad_norm": 0.3054486820572545, + "learning_rate": 1.2435286495819088e-06, + "loss": 0.3626, + "step": 17515 + }, + { + "epoch": 2.5194132873166524, + "grad_norm": 0.32736410711487907, + "learning_rate": 1.2398947676251194e-06, + "loss": 0.3559, + "step": 17520 + }, + { + "epoch": 2.5201322979580096, + "grad_norm": 0.30674435608235473, + "learning_rate": 1.2362658520166348e-06, + "loss": 0.3599, + "step": 17525 + }, + { + "epoch": 2.5208513085993673, + "grad_norm": 0.30842284080151283, + "learning_rate": 1.232641904813785e-06, + "loss": 0.3498, + "step": 17530 + }, + { + "epoch": 2.5215703192407246, + "grad_norm": 0.3096068528572147, + "learning_rate": 1.2290229280710942e-06, + "loss": 0.3452, + "step": 17535 + }, + { + "epoch": 2.5222893298820823, + "grad_norm": 0.35087830788333857, + "learning_rate": 1.2254089238402567e-06, + "loss": 0.3536, + "step": 17540 + }, + { + "epoch": 2.5230083405234396, + "grad_norm": 0.3047042168981482, + "learning_rate": 1.2217998941701515e-06, + "loss": 0.3575, + "step": 17545 + }, + { + "epoch": 2.5237273511647973, + "grad_norm": 0.3151491714584883, + "learning_rate": 1.218195841106843e-06, + "loss": 0.3601, + "step": 17550 + }, + { + "epoch": 2.524446361806155, + "grad_norm": 0.31762041317369133, + "learning_rate": 1.2145967666935632e-06, + "loss": 0.3471, + "step": 17555 + }, + { + "epoch": 2.5251653724475123, + "grad_norm": 0.30093015661914774, + "learning_rate": 1.2110026729707325e-06, + "loss": 0.3583, + "step": 17560 + }, + { + "epoch": 2.5258843830888695, + "grad_norm": 0.32399559788590615, + "learning_rate": 1.2074135619759431e-06, + "loss": 0.356, + "step": 17565 + }, + { + "epoch": 2.5266033937302272, + "grad_norm": 0.30586256467197237, + "learning_rate": 1.2038294357439596e-06, + "loss": 0.3464, + "step": 17570 + }, + { + "epoch": 2.527322404371585, + "grad_norm": 0.30796096457299954, + "learning_rate": 1.2002502963067274e-06, + "loss": 0.3658, + "step": 17575 + }, + { + "epoch": 2.528041415012942, + "grad_norm": 0.30947016052734266, + "learning_rate": 1.1966761456933573e-06, + "loss": 0.3598, + "step": 17580 + }, + { + "epoch": 2.5287604256542995, + "grad_norm": 0.29855342196192786, + "learning_rate": 1.1931069859301335e-06, + "loss": 0.3493, + "step": 17585 + }, + { + "epoch": 2.529479436295657, + "grad_norm": 0.30982323067125345, + "learning_rate": 1.1895428190405168e-06, + "loss": 0.3545, + "step": 17590 + }, + { + "epoch": 2.530198446937015, + "grad_norm": 0.30672308153682704, + "learning_rate": 1.1859836470451314e-06, + "loss": 0.3546, + "step": 17595 + }, + { + "epoch": 2.530917457578372, + "grad_norm": 0.31941929039113764, + "learning_rate": 1.182429471961768e-06, + "loss": 0.3557, + "step": 17600 + }, + { + "epoch": 2.5316364682197294, + "grad_norm": 0.3083870835576022, + "learning_rate": 1.1788802958053924e-06, + "loss": 0.3569, + "step": 17605 + }, + { + "epoch": 2.532355478861087, + "grad_norm": 0.3163874409785299, + "learning_rate": 1.1753361205881275e-06, + "loss": 0.3535, + "step": 17610 + }, + { + "epoch": 2.533074489502445, + "grad_norm": 0.3133733691437593, + "learning_rate": 1.1717969483192671e-06, + "loss": 0.3573, + "step": 17615 + }, + { + "epoch": 2.533793500143802, + "grad_norm": 0.308815961752189, + "learning_rate": 1.1682627810052693e-06, + "loss": 0.3459, + "step": 17620 + }, + { + "epoch": 2.5345125107851594, + "grad_norm": 0.30696375516601637, + "learning_rate": 1.1647336206497505e-06, + "loss": 0.3695, + "step": 17625 + }, + { + "epoch": 2.535231521426517, + "grad_norm": 0.3374303060855002, + "learning_rate": 1.161209469253487e-06, + "loss": 0.3521, + "step": 17630 + }, + { + "epoch": 2.535950532067875, + "grad_norm": 0.3135294382429081, + "learning_rate": 1.1576903288144237e-06, + "loss": 0.3688, + "step": 17635 + }, + { + "epoch": 2.536669542709232, + "grad_norm": 0.31574909250794675, + "learning_rate": 1.154176201327658e-06, + "loss": 0.3549, + "step": 17640 + }, + { + "epoch": 2.5373885533505893, + "grad_norm": 0.3171119631374329, + "learning_rate": 1.1506670887854432e-06, + "loss": 0.3611, + "step": 17645 + }, + { + "epoch": 2.538107563991947, + "grad_norm": 0.3381488113116444, + "learning_rate": 1.1471629931771988e-06, + "loss": 0.3626, + "step": 17650 + }, + { + "epoch": 2.5388265746333047, + "grad_norm": 0.3127780772949529, + "learning_rate": 1.1436639164894893e-06, + "loss": 0.3521, + "step": 17655 + }, + { + "epoch": 2.539545585274662, + "grad_norm": 0.3041307380485427, + "learning_rate": 1.1401698607060418e-06, + "loss": 0.3536, + "step": 17660 + }, + { + "epoch": 2.5402645959160197, + "grad_norm": 0.31128470056928004, + "learning_rate": 1.1366808278077368e-06, + "loss": 0.3631, + "step": 17665 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 0.311601683935439, + "learning_rate": 1.1331968197725985e-06, + "loss": 0.3599, + "step": 17670 + }, + { + "epoch": 2.5417026171987347, + "grad_norm": 0.2928710699884809, + "learning_rate": 1.1297178385758146e-06, + "loss": 0.3679, + "step": 17675 + }, + { + "epoch": 2.542421627840092, + "grad_norm": 0.31033503445344474, + "learning_rate": 1.1262438861897117e-06, + "loss": 0.3461, + "step": 17680 + }, + { + "epoch": 2.5431406384814497, + "grad_norm": 0.3102263513449576, + "learning_rate": 1.1227749645837716e-06, + "loss": 0.3545, + "step": 17685 + }, + { + "epoch": 2.543859649122807, + "grad_norm": 0.3093251646618003, + "learning_rate": 1.1193110757246251e-06, + "loss": 0.345, + "step": 17690 + }, + { + "epoch": 2.5445786597641646, + "grad_norm": 0.31033525496857073, + "learning_rate": 1.115852221576047e-06, + "loss": 0.3604, + "step": 17695 + }, + { + "epoch": 2.545297670405522, + "grad_norm": 0.32487944012429926, + "learning_rate": 1.1123984040989532e-06, + "loss": 0.3446, + "step": 17700 + }, + { + "epoch": 2.5460166810468796, + "grad_norm": 0.30611626755303806, + "learning_rate": 1.1089496252514153e-06, + "loss": 0.3573, + "step": 17705 + }, + { + "epoch": 2.546735691688237, + "grad_norm": 0.31201259122817254, + "learning_rate": 1.1055058869886414e-06, + "loss": 0.3578, + "step": 17710 + }, + { + "epoch": 2.5474547023295946, + "grad_norm": 0.36987983935988966, + "learning_rate": 1.10206719126298e-06, + "loss": 0.3304, + "step": 17715 + }, + { + "epoch": 2.548173712970952, + "grad_norm": 0.30973641081492104, + "learning_rate": 1.0986335400239268e-06, + "loss": 0.3676, + "step": 17720 + }, + { + "epoch": 2.5488927236123096, + "grad_norm": 0.3063446783456226, + "learning_rate": 1.095204935218115e-06, + "loss": 0.3595, + "step": 17725 + }, + { + "epoch": 2.549611734253667, + "grad_norm": 0.2982906314716578, + "learning_rate": 1.0917813787893118e-06, + "loss": 0.3407, + "step": 17730 + }, + { + "epoch": 2.5503307448950245, + "grad_norm": 0.33960199106630035, + "learning_rate": 1.0883628726784323e-06, + "loss": 0.3699, + "step": 17735 + }, + { + "epoch": 2.551049755536382, + "grad_norm": 0.31833517392182115, + "learning_rate": 1.0849494188235198e-06, + "loss": 0.3476, + "step": 17740 + }, + { + "epoch": 2.5517687661777395, + "grad_norm": 0.3122112574473752, + "learning_rate": 1.0815410191597563e-06, + "loss": 0.3544, + "step": 17745 + }, + { + "epoch": 2.5524877768190968, + "grad_norm": 0.3128498719695118, + "learning_rate": 1.0781376756194628e-06, + "loss": 0.3553, + "step": 17750 + }, + { + "epoch": 2.5532067874604545, + "grad_norm": 0.30965344017042834, + "learning_rate": 1.0747393901320836e-06, + "loss": 0.3453, + "step": 17755 + }, + { + "epoch": 2.5539257981018118, + "grad_norm": 0.30062785024974664, + "learning_rate": 1.0713461646242063e-06, + "loss": 0.3557, + "step": 17760 + }, + { + "epoch": 2.5546448087431695, + "grad_norm": 0.3085812113907845, + "learning_rate": 1.0679580010195444e-06, + "loss": 0.3599, + "step": 17765 + }, + { + "epoch": 2.5553638193845267, + "grad_norm": 0.30933192998939213, + "learning_rate": 1.0645749012389438e-06, + "loss": 0.3653, + "step": 17770 + }, + { + "epoch": 2.5560828300258844, + "grad_norm": 0.3140084895452016, + "learning_rate": 1.0611968672003735e-06, + "loss": 0.3482, + "step": 17775 + }, + { + "epoch": 2.5568018406672417, + "grad_norm": 0.29400669684850556, + "learning_rate": 1.0578239008189406e-06, + "loss": 0.3525, + "step": 17780 + }, + { + "epoch": 2.5575208513085994, + "grad_norm": 0.31072248256953744, + "learning_rate": 1.0544560040068697e-06, + "loss": 0.3672, + "step": 17785 + }, + { + "epoch": 2.558239861949957, + "grad_norm": 0.30413205651252373, + "learning_rate": 1.0510931786735191e-06, + "loss": 0.3541, + "step": 17790 + }, + { + "epoch": 2.5589588725913144, + "grad_norm": 0.3209217544752419, + "learning_rate": 1.047735426725368e-06, + "loss": 0.3412, + "step": 17795 + }, + { + "epoch": 2.5596778832326716, + "grad_norm": 0.31029080755246485, + "learning_rate": 1.0443827500660152e-06, + "loss": 0.352, + "step": 17800 + }, + { + "epoch": 2.5603968938740294, + "grad_norm": 0.31861603957763635, + "learning_rate": 1.0410351505961912e-06, + "loss": 0.3636, + "step": 17805 + }, + { + "epoch": 2.561115904515387, + "grad_norm": 0.3134869954025055, + "learning_rate": 1.0376926302137435e-06, + "loss": 0.3471, + "step": 17810 + }, + { + "epoch": 2.5618349151567443, + "grad_norm": 0.31756572789531323, + "learning_rate": 1.0343551908136385e-06, + "loss": 0.3498, + "step": 17815 + }, + { + "epoch": 2.5625539257981016, + "grad_norm": 0.3161529464224881, + "learning_rate": 1.0310228342879658e-06, + "loss": 0.3523, + "step": 17820 + }, + { + "epoch": 2.5632729364394593, + "grad_norm": 0.30553748286795546, + "learning_rate": 1.0276955625259299e-06, + "loss": 0.3565, + "step": 17825 + }, + { + "epoch": 2.563991947080817, + "grad_norm": 0.3232474732656798, + "learning_rate": 1.024373377413853e-06, + "loss": 0.3724, + "step": 17830 + }, + { + "epoch": 2.5647109577221743, + "grad_norm": 0.2966854239027476, + "learning_rate": 1.0210562808351775e-06, + "loss": 0.369, + "step": 17835 + }, + { + "epoch": 2.5654299683635315, + "grad_norm": 0.31398220428883766, + "learning_rate": 1.017744274670457e-06, + "loss": 0.3637, + "step": 17840 + }, + { + "epoch": 2.5661489790048893, + "grad_norm": 0.3070092928631495, + "learning_rate": 1.0144373607973578e-06, + "loss": 0.3656, + "step": 17845 + }, + { + "epoch": 2.566867989646247, + "grad_norm": 0.30754228214286794, + "learning_rate": 1.0111355410906632e-06, + "loss": 0.3617, + "step": 17850 + }, + { + "epoch": 2.5675870002876042, + "grad_norm": 0.3079138574317495, + "learning_rate": 1.0078388174222696e-06, + "loss": 0.3558, + "step": 17855 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 0.2951486391053495, + "learning_rate": 1.004547191661178e-06, + "loss": 0.3581, + "step": 17860 + }, + { + "epoch": 2.569025021570319, + "grad_norm": 0.30423514391994716, + "learning_rate": 1.001260665673508e-06, + "loss": 0.3716, + "step": 17865 + }, + { + "epoch": 2.569744032211677, + "grad_norm": 0.31846350303726495, + "learning_rate": 9.979792413224775e-07, + "loss": 0.3706, + "step": 17870 + }, + { + "epoch": 2.570463042853034, + "grad_norm": 0.31488956701231063, + "learning_rate": 9.94702920468419e-07, + "loss": 0.3692, + "step": 17875 + }, + { + "epoch": 2.5711820534943914, + "grad_norm": 0.3092619322032862, + "learning_rate": 9.914317049687727e-07, + "loss": 0.3547, + "step": 17880 + }, + { + "epoch": 2.571901064135749, + "grad_norm": 0.3172533533313283, + "learning_rate": 9.88165596678079e-07, + "loss": 0.3549, + "step": 17885 + }, + { + "epoch": 2.572620074777107, + "grad_norm": 0.31089167574958426, + "learning_rate": 9.849045974479887e-07, + "loss": 0.3579, + "step": 17890 + }, + { + "epoch": 2.573339085418464, + "grad_norm": 0.3086333598820324, + "learning_rate": 9.81648709127252e-07, + "loss": 0.3663, + "step": 17895 + }, + { + "epoch": 2.574058096059822, + "grad_norm": 0.4560442882955573, + "learning_rate": 9.7839793356172e-07, + "loss": 0.3523, + "step": 17900 + }, + { + "epoch": 2.574777106701179, + "grad_norm": 0.3149997741880437, + "learning_rate": 9.751522725943519e-07, + "loss": 0.3577, + "step": 17905 + }, + { + "epoch": 2.575496117342537, + "grad_norm": 0.31545564164529466, + "learning_rate": 9.719117280652045e-07, + "loss": 0.3659, + "step": 17910 + }, + { + "epoch": 2.576215127983894, + "grad_norm": 0.29418506531220967, + "learning_rate": 9.686763018114299e-07, + "loss": 0.3609, + "step": 17915 + }, + { + "epoch": 2.576934138625252, + "grad_norm": 0.3551091360630454, + "learning_rate": 9.654459956672834e-07, + "loss": 0.3506, + "step": 17920 + }, + { + "epoch": 2.577653149266609, + "grad_norm": 0.30902365189976183, + "learning_rate": 9.622208114641163e-07, + "loss": 0.3554, + "step": 17925 + }, + { + "epoch": 2.5783721599079668, + "grad_norm": 0.31418474404733315, + "learning_rate": 9.590007510303711e-07, + "loss": 0.3663, + "step": 17930 + }, + { + "epoch": 2.579091170549324, + "grad_norm": 0.3012377127454265, + "learning_rate": 9.557858161915968e-07, + "loss": 0.3634, + "step": 17935 + }, + { + "epoch": 2.5798101811906817, + "grad_norm": 0.2982718072566914, + "learning_rate": 9.525760087704261e-07, + "loss": 0.3449, + "step": 17940 + }, + { + "epoch": 2.580529191832039, + "grad_norm": 0.31614761632902033, + "learning_rate": 9.493713305865859e-07, + "loss": 0.3554, + "step": 17945 + }, + { + "epoch": 2.5812482024733967, + "grad_norm": 0.31841954472989525, + "learning_rate": 9.461717834569007e-07, + "loss": 0.3593, + "step": 17950 + }, + { + "epoch": 2.581967213114754, + "grad_norm": 0.3125590065874319, + "learning_rate": 9.42977369195286e-07, + "loss": 0.3381, + "step": 17955 + }, + { + "epoch": 2.5826862237561117, + "grad_norm": 0.30646306122852585, + "learning_rate": 9.397880896127387e-07, + "loss": 0.3668, + "step": 17960 + }, + { + "epoch": 2.583405234397469, + "grad_norm": 0.31857745113245123, + "learning_rate": 9.366039465173549e-07, + "loss": 0.3409, + "step": 17965 + }, + { + "epoch": 2.5841242450388267, + "grad_norm": 0.30420519814884317, + "learning_rate": 9.334249417143126e-07, + "loss": 0.3542, + "step": 17970 + }, + { + "epoch": 2.584843255680184, + "grad_norm": 0.3189126112868735, + "learning_rate": 9.30251077005877e-07, + "loss": 0.3379, + "step": 17975 + }, + { + "epoch": 2.5855622663215416, + "grad_norm": 0.3112465994030675, + "learning_rate": 9.270823541914031e-07, + "loss": 0.3548, + "step": 17980 + }, + { + "epoch": 2.586281276962899, + "grad_norm": 0.3135723637127105, + "learning_rate": 9.239187750673284e-07, + "loss": 0.3598, + "step": 17985 + }, + { + "epoch": 2.5870002876042566, + "grad_norm": 0.3168485875913864, + "learning_rate": 9.207603414271704e-07, + "loss": 0.3442, + "step": 17990 + }, + { + "epoch": 2.587719298245614, + "grad_norm": 0.3130767429698904, + "learning_rate": 9.176070550615379e-07, + "loss": 0.3586, + "step": 17995 + }, + { + "epoch": 2.5884383088869716, + "grad_norm": 0.2986598627803862, + "learning_rate": 9.144589177581132e-07, + "loss": 0.3504, + "step": 18000 + }, + { + "epoch": 2.5891573195283293, + "grad_norm": 0.30468076985726344, + "learning_rate": 9.113159313016662e-07, + "loss": 0.3553, + "step": 18005 + }, + { + "epoch": 2.5898763301696865, + "grad_norm": 0.29062379652839426, + "learning_rate": 9.08178097474044e-07, + "loss": 0.3596, + "step": 18010 + }, + { + "epoch": 2.590595340811044, + "grad_norm": 0.31065467582213546, + "learning_rate": 9.050454180541679e-07, + "loss": 0.3576, + "step": 18015 + }, + { + "epoch": 2.5913143514524015, + "grad_norm": 0.31633070892778303, + "learning_rate": 9.019178948180474e-07, + "loss": 0.3548, + "step": 18020 + }, + { + "epoch": 2.5920333620937592, + "grad_norm": 0.3173632886828699, + "learning_rate": 8.987955295387596e-07, + "loss": 0.3699, + "step": 18025 + }, + { + "epoch": 2.5927523727351165, + "grad_norm": 0.29854965996870914, + "learning_rate": 8.956783239864586e-07, + "loss": 0.3514, + "step": 18030 + }, + { + "epoch": 2.5934713833764738, + "grad_norm": 0.31656456590031207, + "learning_rate": 8.925662799283797e-07, + "loss": 0.3668, + "step": 18035 + }, + { + "epoch": 2.5941903940178315, + "grad_norm": 0.5433124842010192, + "learning_rate": 8.894593991288259e-07, + "loss": 0.3555, + "step": 18040 + }, + { + "epoch": 2.594909404659189, + "grad_norm": 0.3222805926226404, + "learning_rate": 8.863576833491705e-07, + "loss": 0.348, + "step": 18045 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 0.3575555047987363, + "learning_rate": 8.832611343478681e-07, + "loss": 0.3617, + "step": 18050 + }, + { + "epoch": 2.5963474259419037, + "grad_norm": 0.3154306626253122, + "learning_rate": 8.801697538804377e-07, + "loss": 0.3497, + "step": 18055 + }, + { + "epoch": 2.5970664365832614, + "grad_norm": 0.2985330395295317, + "learning_rate": 8.770835436994674e-07, + "loss": 0.3508, + "step": 18060 + }, + { + "epoch": 2.597785447224619, + "grad_norm": 0.3031063965380999, + "learning_rate": 8.740025055546186e-07, + "loss": 0.3624, + "step": 18065 + }, + { + "epoch": 2.5985044578659764, + "grad_norm": 0.3214116364593015, + "learning_rate": 8.709266411926165e-07, + "loss": 0.3539, + "step": 18070 + }, + { + "epoch": 2.5992234685073337, + "grad_norm": 0.30763234128501515, + "learning_rate": 8.678559523572527e-07, + "loss": 0.3553, + "step": 18075 + }, + { + "epoch": 2.5999424791486914, + "grad_norm": 0.3001909852834488, + "learning_rate": 8.647904407893904e-07, + "loss": 0.3656, + "step": 18080 + }, + { + "epoch": 2.600661489790049, + "grad_norm": 0.30630237394703397, + "learning_rate": 8.617301082269514e-07, + "loss": 0.3554, + "step": 18085 + }, + { + "epoch": 2.6013805004314063, + "grad_norm": 0.31033057872110525, + "learning_rate": 8.586749564049223e-07, + "loss": 0.3544, + "step": 18090 + }, + { + "epoch": 2.6020995110727636, + "grad_norm": 0.3247236658783421, + "learning_rate": 8.556249870553546e-07, + "loss": 0.3477, + "step": 18095 + }, + { + "epoch": 2.6028185217141213, + "grad_norm": 0.32313911455388106, + "learning_rate": 8.525802019073647e-07, + "loss": 0.3568, + "step": 18100 + }, + { + "epoch": 2.603537532355479, + "grad_norm": 0.30326882259229654, + "learning_rate": 8.495406026871212e-07, + "loss": 0.3539, + "step": 18105 + }, + { + "epoch": 2.6042565429968363, + "grad_norm": 0.33029483328489695, + "learning_rate": 8.465061911178619e-07, + "loss": 0.3507, + "step": 18110 + }, + { + "epoch": 2.604975553638194, + "grad_norm": 0.31693139615258986, + "learning_rate": 8.434769689198763e-07, + "loss": 0.3484, + "step": 18115 + }, + { + "epoch": 2.6056945642795513, + "grad_norm": 0.30435444065947753, + "learning_rate": 8.404529378105186e-07, + "loss": 0.3524, + "step": 18120 + }, + { + "epoch": 2.606413574920909, + "grad_norm": 0.3145248094633032, + "learning_rate": 8.374340995041941e-07, + "loss": 0.3507, + "step": 18125 + }, + { + "epoch": 2.6071325855622662, + "grad_norm": 0.3027205535143405, + "learning_rate": 8.344204557123648e-07, + "loss": 0.3517, + "step": 18130 + }, + { + "epoch": 2.607851596203624, + "grad_norm": 0.3031565366950298, + "learning_rate": 8.314120081435539e-07, + "loss": 0.3615, + "step": 18135 + }, + { + "epoch": 2.608570606844981, + "grad_norm": 0.3123310425463929, + "learning_rate": 8.284087585033329e-07, + "loss": 0.3455, + "step": 18140 + }, + { + "epoch": 2.609289617486339, + "grad_norm": 0.3153991945372381, + "learning_rate": 8.254107084943241e-07, + "loss": 0.3657, + "step": 18145 + }, + { + "epoch": 2.610008628127696, + "grad_norm": 0.31486864682643856, + "learning_rate": 8.224178598162091e-07, + "loss": 0.3526, + "step": 18150 + }, + { + "epoch": 2.610727638769054, + "grad_norm": 0.30974477297603165, + "learning_rate": 8.194302141657185e-07, + "loss": 0.3504, + "step": 18155 + }, + { + "epoch": 2.611446649410411, + "grad_norm": 0.30736985502698316, + "learning_rate": 8.164477732366294e-07, + "loss": 0.3559, + "step": 18160 + }, + { + "epoch": 2.612165660051769, + "grad_norm": 0.31725108094571464, + "learning_rate": 8.134705387197728e-07, + "loss": 0.3564, + "step": 18165 + }, + { + "epoch": 2.612884670693126, + "grad_norm": 0.3200454927042961, + "learning_rate": 8.104985123030263e-07, + "loss": 0.3673, + "step": 18170 + }, + { + "epoch": 2.613603681334484, + "grad_norm": 0.30975804083250985, + "learning_rate": 8.075316956713119e-07, + "loss": 0.3436, + "step": 18175 + }, + { + "epoch": 2.614322691975841, + "grad_norm": 0.30489268103297146, + "learning_rate": 8.045700905066034e-07, + "loss": 0.3392, + "step": 18180 + }, + { + "epoch": 2.615041702617199, + "grad_norm": 0.3133491432561115, + "learning_rate": 8.016136984879175e-07, + "loss": 0.3717, + "step": 18185 + }, + { + "epoch": 2.615760713258556, + "grad_norm": 0.31546749604628177, + "learning_rate": 7.986625212913124e-07, + "loss": 0.3575, + "step": 18190 + }, + { + "epoch": 2.616479723899914, + "grad_norm": 0.30925664697615834, + "learning_rate": 7.957165605898964e-07, + "loss": 0.3481, + "step": 18195 + }, + { + "epoch": 2.617198734541271, + "grad_norm": 0.297558200940703, + "learning_rate": 7.927758180538158e-07, + "loss": 0.3432, + "step": 18200 + }, + { + "epoch": 2.6179177451826288, + "grad_norm": 0.3206407749722453, + "learning_rate": 7.898402953502582e-07, + "loss": 0.3409, + "step": 18205 + }, + { + "epoch": 2.618636755823986, + "grad_norm": 0.3243519309879927, + "learning_rate": 7.869099941434565e-07, + "loss": 0.3472, + "step": 18210 + }, + { + "epoch": 2.6193557664653437, + "grad_norm": 0.3168554065725209, + "learning_rate": 7.839849160946766e-07, + "loss": 0.3479, + "step": 18215 + }, + { + "epoch": 2.6200747771067014, + "grad_norm": 0.32068591660149026, + "learning_rate": 7.810650628622308e-07, + "loss": 0.3604, + "step": 18220 + }, + { + "epoch": 2.6207937877480587, + "grad_norm": 0.31186403526473955, + "learning_rate": 7.781504361014635e-07, + "loss": 0.3579, + "step": 18225 + }, + { + "epoch": 2.621512798389416, + "grad_norm": 0.3160010911992544, + "learning_rate": 7.752410374647557e-07, + "loss": 0.3615, + "step": 18230 + }, + { + "epoch": 2.6222318090307737, + "grad_norm": 0.31075518910777417, + "learning_rate": 7.723368686015309e-07, + "loss": 0.354, + "step": 18235 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.30430970186633305, + "learning_rate": 7.694379311582401e-07, + "loss": 0.3495, + "step": 18240 + }, + { + "epoch": 2.6236698303134887, + "grad_norm": 0.31048597447448645, + "learning_rate": 7.665442267783741e-07, + "loss": 0.3574, + "step": 18245 + }, + { + "epoch": 2.624388840954846, + "grad_norm": 0.309648764495263, + "learning_rate": 7.636557571024528e-07, + "loss": 0.3367, + "step": 18250 + }, + { + "epoch": 2.6251078515962036, + "grad_norm": 0.30943657694739996, + "learning_rate": 7.607725237680342e-07, + "loss": 0.3622, + "step": 18255 + }, + { + "epoch": 2.6258268622375613, + "grad_norm": 0.32244387001738317, + "learning_rate": 7.578945284096983e-07, + "loss": 0.354, + "step": 18260 + }, + { + "epoch": 2.6265458728789186, + "grad_norm": 0.3386284710455727, + "learning_rate": 7.550217726590658e-07, + "loss": 0.3562, + "step": 18265 + }, + { + "epoch": 2.627264883520276, + "grad_norm": 0.31781993035567807, + "learning_rate": 7.521542581447804e-07, + "loss": 0.3578, + "step": 18270 + }, + { + "epoch": 2.6279838941616336, + "grad_norm": 0.3027853958633486, + "learning_rate": 7.492919864925153e-07, + "loss": 0.3533, + "step": 18275 + }, + { + "epoch": 2.6287029048029913, + "grad_norm": 0.3042117835743225, + "learning_rate": 7.464349593249731e-07, + "loss": 0.3533, + "step": 18280 + }, + { + "epoch": 2.6294219154443486, + "grad_norm": 0.29235389032971715, + "learning_rate": 7.435831782618829e-07, + "loss": 0.3416, + "step": 18285 + }, + { + "epoch": 2.630140926085706, + "grad_norm": 0.30215436536724255, + "learning_rate": 7.407366449199959e-07, + "loss": 0.3579, + "step": 18290 + }, + { + "epoch": 2.6308599367270635, + "grad_norm": 0.3168111913387137, + "learning_rate": 7.378953609130946e-07, + "loss": 0.3599, + "step": 18295 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.37295092331405927, + "learning_rate": 7.350593278519824e-07, + "loss": 0.3532, + "step": 18300 + }, + { + "epoch": 2.6322979580097785, + "grad_norm": 0.32834135943493065, + "learning_rate": 7.322285473444835e-07, + "loss": 0.3683, + "step": 18305 + }, + { + "epoch": 2.6330169686511358, + "grad_norm": 0.3368581794847914, + "learning_rate": 7.294030209954494e-07, + "loss": 0.3553, + "step": 18310 + }, + { + "epoch": 2.6337359792924935, + "grad_norm": 0.3038531205271354, + "learning_rate": 7.265827504067479e-07, + "loss": 0.3599, + "step": 18315 + }, + { + "epoch": 2.634454989933851, + "grad_norm": 0.3133142852428987, + "learning_rate": 7.237677371772667e-07, + "loss": 0.353, + "step": 18320 + }, + { + "epoch": 2.6351740005752085, + "grad_norm": 0.31187021864545034, + "learning_rate": 7.209579829029211e-07, + "loss": 0.3354, + "step": 18325 + }, + { + "epoch": 2.635893011216566, + "grad_norm": 0.3087799419620964, + "learning_rate": 7.181534891766329e-07, + "loss": 0.3586, + "step": 18330 + }, + { + "epoch": 2.6366120218579234, + "grad_norm": 0.3013821823288862, + "learning_rate": 7.153542575883543e-07, + "loss": 0.3437, + "step": 18335 + }, + { + "epoch": 2.637331032499281, + "grad_norm": 0.31452374304676034, + "learning_rate": 7.125602897250427e-07, + "loss": 0.3544, + "step": 18340 + }, + { + "epoch": 2.6380500431406384, + "grad_norm": 0.3059157386158428, + "learning_rate": 7.097715871706778e-07, + "loss": 0.3714, + "step": 18345 + }, + { + "epoch": 2.638769053781996, + "grad_norm": 0.3128598559179821, + "learning_rate": 7.06988151506256e-07, + "loss": 0.3653, + "step": 18350 + }, + { + "epoch": 2.6394880644233534, + "grad_norm": 0.309303362265719, + "learning_rate": 7.042099843097827e-07, + "loss": 0.3426, + "step": 18355 + }, + { + "epoch": 2.640207075064711, + "grad_norm": 0.3038414633493157, + "learning_rate": 7.014370871562759e-07, + "loss": 0.354, + "step": 18360 + }, + { + "epoch": 2.6409260857060683, + "grad_norm": 0.31176618525619093, + "learning_rate": 6.986694616177736e-07, + "loss": 0.3691, + "step": 18365 + }, + { + "epoch": 2.641645096347426, + "grad_norm": 0.30105168726294435, + "learning_rate": 6.959071092633163e-07, + "loss": 0.3556, + "step": 18370 + }, + { + "epoch": 2.6423641069887833, + "grad_norm": 0.29985646196713983, + "learning_rate": 6.931500316589578e-07, + "loss": 0.351, + "step": 18375 + }, + { + "epoch": 2.643083117630141, + "grad_norm": 0.29909723415454065, + "learning_rate": 6.903982303677659e-07, + "loss": 0.348, + "step": 18380 + }, + { + "epoch": 2.6438021282714983, + "grad_norm": 0.2989167855128057, + "learning_rate": 6.876517069498123e-07, + "loss": 0.351, + "step": 18385 + }, + { + "epoch": 2.644521138912856, + "grad_norm": 0.30636967525330233, + "learning_rate": 6.84910462962175e-07, + "loss": 0.361, + "step": 18390 + }, + { + "epoch": 2.6452401495542133, + "grad_norm": 0.3426303376526915, + "learning_rate": 6.821744999589452e-07, + "loss": 0.3575, + "step": 18395 + }, + { + "epoch": 2.645959160195571, + "grad_norm": 0.3084242076335015, + "learning_rate": 6.794438194912168e-07, + "loss": 0.3355, + "step": 18400 + }, + { + "epoch": 2.6466781708369282, + "grad_norm": 0.3194661800181507, + "learning_rate": 6.767184231070855e-07, + "loss": 0.3658, + "step": 18405 + }, + { + "epoch": 2.647397181478286, + "grad_norm": 0.30745927021889674, + "learning_rate": 6.739983123516591e-07, + "loss": 0.3486, + "step": 18410 + }, + { + "epoch": 2.648116192119643, + "grad_norm": 0.3071331898673472, + "learning_rate": 6.712834887670417e-07, + "loss": 0.3545, + "step": 18415 + }, + { + "epoch": 2.648835202761001, + "grad_norm": 0.3124056453002358, + "learning_rate": 6.685739538923419e-07, + "loss": 0.3738, + "step": 18420 + }, + { + "epoch": 2.649554213402358, + "grad_norm": 0.3055154897012926, + "learning_rate": 6.658697092636735e-07, + "loss": 0.3396, + "step": 18425 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 0.3171116831309729, + "learning_rate": 6.631707564141454e-07, + "loss": 0.351, + "step": 18430 + }, + { + "epoch": 2.6509922346850736, + "grad_norm": 0.31313833552359555, + "learning_rate": 6.604770968738705e-07, + "loss": 0.3673, + "step": 18435 + }, + { + "epoch": 2.651711245326431, + "grad_norm": 0.314753899908688, + "learning_rate": 6.577887321699583e-07, + "loss": 0.3766, + "step": 18440 + }, + { + "epoch": 2.652430255967788, + "grad_norm": 0.3044781194749951, + "learning_rate": 6.551056638265208e-07, + "loss": 0.3582, + "step": 18445 + }, + { + "epoch": 2.653149266609146, + "grad_norm": 0.30834514659184414, + "learning_rate": 6.524278933646633e-07, + "loss": 0.3409, + "step": 18450 + }, + { + "epoch": 2.6538682772505036, + "grad_norm": 0.2953324976439212, + "learning_rate": 6.497554223024883e-07, + "loss": 0.3643, + "step": 18455 + }, + { + "epoch": 2.654587287891861, + "grad_norm": 0.31342055968388427, + "learning_rate": 6.470882521550914e-07, + "loss": 0.3388, + "step": 18460 + }, + { + "epoch": 2.655306298533218, + "grad_norm": 0.30962123859836094, + "learning_rate": 6.44426384434571e-07, + "loss": 0.3533, + "step": 18465 + }, + { + "epoch": 2.656025309174576, + "grad_norm": 0.2992654887729307, + "learning_rate": 6.417698206500123e-07, + "loss": 0.375, + "step": 18470 + }, + { + "epoch": 2.6567443198159335, + "grad_norm": 0.30336724108173907, + "learning_rate": 6.391185623074935e-07, + "loss": 0.3558, + "step": 18475 + }, + { + "epoch": 2.6574633304572908, + "grad_norm": 0.31580569462875396, + "learning_rate": 6.364726109100894e-07, + "loss": 0.3579, + "step": 18480 + }, + { + "epoch": 2.658182341098648, + "grad_norm": 0.30753080592729337, + "learning_rate": 6.338319679578619e-07, + "loss": 0.3444, + "step": 18485 + }, + { + "epoch": 2.6589013517400057, + "grad_norm": 0.30236913583372577, + "learning_rate": 6.311966349478671e-07, + "loss": 0.3552, + "step": 18490 + }, + { + "epoch": 2.6596203623813635, + "grad_norm": 0.30749247481492153, + "learning_rate": 6.285666133741463e-07, + "loss": 0.3707, + "step": 18495 + }, + { + "epoch": 2.6603393730227207, + "grad_norm": 0.32415010648046794, + "learning_rate": 6.25941904727736e-07, + "loss": 0.3373, + "step": 18500 + }, + { + "epoch": 2.661058383664078, + "grad_norm": 0.2989538667429938, + "learning_rate": 6.233225104966534e-07, + "loss": 0.3389, + "step": 18505 + }, + { + "epoch": 2.6617773943054357, + "grad_norm": 0.31222880480938014, + "learning_rate": 6.207084321659085e-07, + "loss": 0.3556, + "step": 18510 + }, + { + "epoch": 2.6624964049467934, + "grad_norm": 0.3107654979460972, + "learning_rate": 6.180996712174936e-07, + "loss": 0.3428, + "step": 18515 + }, + { + "epoch": 2.6632154155881507, + "grad_norm": 0.29323243544712446, + "learning_rate": 6.15496229130389e-07, + "loss": 0.3582, + "step": 18520 + }, + { + "epoch": 2.663934426229508, + "grad_norm": 0.30046781829391606, + "learning_rate": 6.128981073805585e-07, + "loss": 0.3563, + "step": 18525 + }, + { + "epoch": 2.6646534368708656, + "grad_norm": 0.31216589986443405, + "learning_rate": 6.103053074409515e-07, + "loss": 0.3473, + "step": 18530 + }, + { + "epoch": 2.6653724475122234, + "grad_norm": 0.30703745582960706, + "learning_rate": 6.077178307814946e-07, + "loss": 0.3644, + "step": 18535 + }, + { + "epoch": 2.6660914581535806, + "grad_norm": 0.31184638928636954, + "learning_rate": 6.051356788691032e-07, + "loss": 0.3564, + "step": 18540 + }, + { + "epoch": 2.6668104687949383, + "grad_norm": 0.32313849767841774, + "learning_rate": 6.025588531676719e-07, + "loss": 0.3751, + "step": 18545 + }, + { + "epoch": 2.6675294794362956, + "grad_norm": 0.3294691163967009, + "learning_rate": 5.999873551380753e-07, + "loss": 0.3478, + "step": 18550 + }, + { + "epoch": 2.6682484900776533, + "grad_norm": 0.32674976118151594, + "learning_rate": 5.974211862381673e-07, + "loss": 0.3488, + "step": 18555 + }, + { + "epoch": 2.6689675007190106, + "grad_norm": 0.31470234216041487, + "learning_rate": 5.948603479227777e-07, + "loss": 0.3561, + "step": 18560 + }, + { + "epoch": 2.6696865113603683, + "grad_norm": 0.3300101863212417, + "learning_rate": 5.923048416437215e-07, + "loss": 0.3509, + "step": 18565 + }, + { + "epoch": 2.6704055220017255, + "grad_norm": 0.31705094998007205, + "learning_rate": 5.897546688497857e-07, + "loss": 0.3671, + "step": 18570 + }, + { + "epoch": 2.6711245326430832, + "grad_norm": 0.30827868767097877, + "learning_rate": 5.872098309867314e-07, + "loss": 0.3593, + "step": 18575 + }, + { + "epoch": 2.6718435432844405, + "grad_norm": 0.31382404359434163, + "learning_rate": 5.84670329497301e-07, + "loss": 0.3579, + "step": 18580 + }, + { + "epoch": 2.6725625539257982, + "grad_norm": 0.31692913487461893, + "learning_rate": 5.821361658212077e-07, + "loss": 0.3561, + "step": 18585 + }, + { + "epoch": 2.6732815645671555, + "grad_norm": 0.3022309763466634, + "learning_rate": 5.796073413951398e-07, + "loss": 0.3601, + "step": 18590 + }, + { + "epoch": 2.674000575208513, + "grad_norm": 0.30453373089008384, + "learning_rate": 5.770838576527604e-07, + "loss": 0.3567, + "step": 18595 + }, + { + "epoch": 2.6747195858498705, + "grad_norm": 0.31273564745374643, + "learning_rate": 5.74565716024702e-07, + "loss": 0.3525, + "step": 18600 + }, + { + "epoch": 2.675438596491228, + "grad_norm": 0.3188454358204983, + "learning_rate": 5.720529179385659e-07, + "loss": 0.3626, + "step": 18605 + }, + { + "epoch": 2.6761576071325854, + "grad_norm": 0.3144833786152819, + "learning_rate": 5.695454648189336e-07, + "loss": 0.3489, + "step": 18610 + }, + { + "epoch": 2.676876617773943, + "grad_norm": 0.30619781719720385, + "learning_rate": 5.670433580873458e-07, + "loss": 0.3625, + "step": 18615 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 0.3035857668552543, + "learning_rate": 5.645465991623167e-07, + "loss": 0.3523, + "step": 18620 + }, + { + "epoch": 2.678314639056658, + "grad_norm": 0.31449667720556634, + "learning_rate": 5.620551894593318e-07, + "loss": 0.3473, + "step": 18625 + }, + { + "epoch": 2.6790336496980154, + "grad_norm": 0.3142448634332198, + "learning_rate": 5.595691303908368e-07, + "loss": 0.3458, + "step": 18630 + }, + { + "epoch": 2.679752660339373, + "grad_norm": 0.30135205870349274, + "learning_rate": 5.570884233662521e-07, + "loss": 0.3552, + "step": 18635 + }, + { + "epoch": 2.6804716709807304, + "grad_norm": 0.30971611305814795, + "learning_rate": 5.54613069791956e-07, + "loss": 0.3545, + "step": 18640 + }, + { + "epoch": 2.681190681622088, + "grad_norm": 0.3033914862578706, + "learning_rate": 5.521430710712994e-07, + "loss": 0.3495, + "step": 18645 + }, + { + "epoch": 2.6819096922634453, + "grad_norm": 0.30910133743457535, + "learning_rate": 5.496784286045898e-07, + "loss": 0.3553, + "step": 18650 + }, + { + "epoch": 2.682628702904803, + "grad_norm": 0.3221062115483798, + "learning_rate": 5.47219143789105e-07, + "loss": 0.3524, + "step": 18655 + }, + { + "epoch": 2.6833477135461603, + "grad_norm": 0.31435498970811887, + "learning_rate": 5.447652180190799e-07, + "loss": 0.3554, + "step": 18660 + }, + { + "epoch": 2.684066724187518, + "grad_norm": 0.3086658790808085, + "learning_rate": 5.42316652685716e-07, + "loss": 0.3533, + "step": 18665 + }, + { + "epoch": 2.6847857348288757, + "grad_norm": 0.3130566647412001, + "learning_rate": 5.398734491771718e-07, + "loss": 0.3338, + "step": 18670 + }, + { + "epoch": 2.685504745470233, + "grad_norm": 0.3141328453505302, + "learning_rate": 5.374356088785659e-07, + "loss": 0.3438, + "step": 18675 + }, + { + "epoch": 2.6862237561115903, + "grad_norm": 0.3196956028730137, + "learning_rate": 5.350031331719818e-07, + "loss": 0.36, + "step": 18680 + }, + { + "epoch": 2.686942766752948, + "grad_norm": 0.29995725083229124, + "learning_rate": 5.325760234364541e-07, + "loss": 0.3523, + "step": 18685 + }, + { + "epoch": 2.6876617773943057, + "grad_norm": 0.3084889722071449, + "learning_rate": 5.301542810479809e-07, + "loss": 0.3379, + "step": 18690 + }, + { + "epoch": 2.688380788035663, + "grad_norm": 0.2988514783129927, + "learning_rate": 5.277379073795175e-07, + "loss": 0.3523, + "step": 18695 + }, + { + "epoch": 2.68909979867702, + "grad_norm": 0.3201023826593751, + "learning_rate": 5.253269038009711e-07, + "loss": 0.3625, + "step": 18700 + }, + { + "epoch": 2.689818809318378, + "grad_norm": 0.31833673612930263, + "learning_rate": 5.229212716792065e-07, + "loss": 0.3449, + "step": 18705 + }, + { + "epoch": 2.6905378199597356, + "grad_norm": 0.3170454396678282, + "learning_rate": 5.205210123780468e-07, + "loss": 0.3753, + "step": 18710 + }, + { + "epoch": 2.691256830601093, + "grad_norm": 0.30917530066201315, + "learning_rate": 5.181261272582638e-07, + "loss": 0.3579, + "step": 18715 + }, + { + "epoch": 2.69197584124245, + "grad_norm": 0.3104175867900994, + "learning_rate": 5.157366176775835e-07, + "loss": 0.3562, + "step": 18720 + }, + { + "epoch": 2.692694851883808, + "grad_norm": 0.31521969091333557, + "learning_rate": 5.13352484990689e-07, + "loss": 0.3516, + "step": 18725 + }, + { + "epoch": 2.6934138625251656, + "grad_norm": 0.3154907210066356, + "learning_rate": 5.10973730549208e-07, + "loss": 0.353, + "step": 18730 + }, + { + "epoch": 2.694132873166523, + "grad_norm": 0.31154418289279073, + "learning_rate": 5.08600355701725e-07, + "loss": 0.352, + "step": 18735 + }, + { + "epoch": 2.69485188380788, + "grad_norm": 0.3218114148291255, + "learning_rate": 5.062323617937736e-07, + "loss": 0.3671, + "step": 18740 + }, + { + "epoch": 2.695570894449238, + "grad_norm": 0.30783533205590813, + "learning_rate": 5.038697501678336e-07, + "loss": 0.3639, + "step": 18745 + }, + { + "epoch": 2.6962899050905955, + "grad_norm": 0.32083279526223407, + "learning_rate": 5.015125221633355e-07, + "loss": 0.3592, + "step": 18750 + }, + { + "epoch": 2.697008915731953, + "grad_norm": 0.31814805964899057, + "learning_rate": 4.991606791166592e-07, + "loss": 0.3467, + "step": 18755 + }, + { + "epoch": 2.69772792637331, + "grad_norm": 0.3238850743949208, + "learning_rate": 4.968142223611306e-07, + "loss": 0.3682, + "step": 18760 + }, + { + "epoch": 2.6984469370146678, + "grad_norm": 0.29977506526631764, + "learning_rate": 4.944731532270175e-07, + "loss": 0.3479, + "step": 18765 + }, + { + "epoch": 2.6991659476560255, + "grad_norm": 0.3094979116924287, + "learning_rate": 4.921374730415418e-07, + "loss": 0.3532, + "step": 18770 + }, + { + "epoch": 2.6998849582973827, + "grad_norm": 0.3136896957980207, + "learning_rate": 4.898071831288631e-07, + "loss": 0.3531, + "step": 18775 + }, + { + "epoch": 2.7006039689387404, + "grad_norm": 0.300332682624589, + "learning_rate": 4.874822848100902e-07, + "loss": 0.3456, + "step": 18780 + }, + { + "epoch": 2.7013229795800977, + "grad_norm": 0.30510090318461636, + "learning_rate": 4.851627794032709e-07, + "loss": 0.3552, + "step": 18785 + }, + { + "epoch": 2.7020419902214554, + "grad_norm": 0.3053803908202146, + "learning_rate": 4.82848668223398e-07, + "loss": 0.3505, + "step": 18790 + }, + { + "epoch": 2.7027610008628127, + "grad_norm": 0.3145419940435831, + "learning_rate": 4.805399525824072e-07, + "loss": 0.3526, + "step": 18795 + }, + { + "epoch": 2.7034800115041704, + "grad_norm": 0.30657817865780707, + "learning_rate": 4.78236633789173e-07, + "loss": 0.329, + "step": 18800 + }, + { + "epoch": 2.7041990221455277, + "grad_norm": 0.30153501861176296, + "learning_rate": 4.759387131495097e-07, + "loss": 0.3476, + "step": 18805 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 0.3200553213274552, + "learning_rate": 4.73646191966175e-07, + "loss": 0.3509, + "step": 18810 + }, + { + "epoch": 2.7056370434282426, + "grad_norm": 0.32163306304030215, + "learning_rate": 4.7135907153886163e-07, + "loss": 0.3553, + "step": 18815 + }, + { + "epoch": 2.7063560540696003, + "grad_norm": 0.3101983446467971, + "learning_rate": 4.690773531642023e-07, + "loss": 0.3489, + "step": 18820 + }, + { + "epoch": 2.7070750647109576, + "grad_norm": 0.3045462102486363, + "learning_rate": 4.668010381357679e-07, + "loss": 0.3647, + "step": 18825 + }, + { + "epoch": 2.7077940753523153, + "grad_norm": 0.37845373384606695, + "learning_rate": 4.6453012774406283e-07, + "loss": 0.351, + "step": 18830 + }, + { + "epoch": 2.7085130859936726, + "grad_norm": 0.3085350917759782, + "learning_rate": 4.622646232765304e-07, + "loss": 0.3349, + "step": 18835 + }, + { + "epoch": 2.7092320966350303, + "grad_norm": 0.3136971389236542, + "learning_rate": 4.600045260175512e-07, + "loss": 0.3368, + "step": 18840 + }, + { + "epoch": 2.7099511072763875, + "grad_norm": 0.3124886404752603, + "learning_rate": 4.577498372484346e-07, + "loss": 0.3704, + "step": 18845 + }, + { + "epoch": 2.7106701179177453, + "grad_norm": 0.3111384819969063, + "learning_rate": 4.555005582474259e-07, + "loss": 0.3569, + "step": 18850 + }, + { + "epoch": 2.7113891285591025, + "grad_norm": 0.28801618774079224, + "learning_rate": 4.532566902897062e-07, + "loss": 0.3563, + "step": 18855 + }, + { + "epoch": 2.7121081392004602, + "grad_norm": 0.2971301894165523, + "learning_rate": 4.5101823464738683e-07, + "loss": 0.3438, + "step": 18860 + }, + { + "epoch": 2.7128271498418175, + "grad_norm": 0.31455805491954564, + "learning_rate": 4.4878519258950927e-07, + "loss": 0.3746, + "step": 18865 + }, + { + "epoch": 2.713546160483175, + "grad_norm": 0.3065010227643302, + "learning_rate": 4.4655756538204977e-07, + "loss": 0.339, + "step": 18870 + }, + { + "epoch": 2.7142651711245325, + "grad_norm": 0.3015148779282554, + "learning_rate": 4.443353542879092e-07, + "loss": 0.3555, + "step": 18875 + }, + { + "epoch": 2.71498418176589, + "grad_norm": 0.3211160666636949, + "learning_rate": 4.4211856056692424e-07, + "loss": 0.3699, + "step": 18880 + }, + { + "epoch": 2.715703192407248, + "grad_norm": 0.3102988134950522, + "learning_rate": 4.399071854758541e-07, + "loss": 0.3593, + "step": 18885 + }, + { + "epoch": 2.716422203048605, + "grad_norm": 0.3171698575154616, + "learning_rate": 4.377012302683914e-07, + "loss": 0.3732, + "step": 18890 + }, + { + "epoch": 2.7171412136899624, + "grad_norm": 0.31623857776697184, + "learning_rate": 4.3550069619515357e-07, + "loss": 0.3388, + "step": 18895 + }, + { + "epoch": 2.71786022433132, + "grad_norm": 0.29798574840923625, + "learning_rate": 4.33305584503686e-07, + "loss": 0.3484, + "step": 18900 + }, + { + "epoch": 2.718579234972678, + "grad_norm": 0.30965254627004185, + "learning_rate": 4.311158964384543e-07, + "loss": 0.3489, + "step": 18905 + }, + { + "epoch": 2.719298245614035, + "grad_norm": 0.3090538492119933, + "learning_rate": 4.2893163324085886e-07, + "loss": 0.3511, + "step": 18910 + }, + { + "epoch": 2.7200172562553924, + "grad_norm": 0.2998415100524849, + "learning_rate": 4.2675279614921683e-07, + "loss": 0.3453, + "step": 18915 + }, + { + "epoch": 2.72073626689675, + "grad_norm": 0.32312935380484853, + "learning_rate": 4.2457938639877126e-07, + "loss": 0.3573, + "step": 18920 + }, + { + "epoch": 2.721455277538108, + "grad_norm": 0.30687821717128466, + "learning_rate": 4.22411405221691e-07, + "loss": 0.3572, + "step": 18925 + }, + { + "epoch": 2.722174288179465, + "grad_norm": 0.3094724714007833, + "learning_rate": 4.202488538470628e-07, + "loss": 0.3552, + "step": 18930 + }, + { + "epoch": 2.7228932988208223, + "grad_norm": 0.30483700125558255, + "learning_rate": 4.180917335008994e-07, + "loss": 0.3512, + "step": 18935 + }, + { + "epoch": 2.72361230946218, + "grad_norm": 0.317142667617482, + "learning_rate": 4.159400454061324e-07, + "loss": 0.3608, + "step": 18940 + }, + { + "epoch": 2.7243313201035377, + "grad_norm": 0.30437333279705286, + "learning_rate": 4.1379379078261285e-07, + "loss": 0.3461, + "step": 18945 + }, + { + "epoch": 2.725050330744895, + "grad_norm": 0.315368021904134, + "learning_rate": 4.1165297084711176e-07, + "loss": 0.3539, + "step": 18950 + }, + { + "epoch": 2.7257693413862523, + "grad_norm": 0.2951030237927974, + "learning_rate": 4.095175868133228e-07, + "loss": 0.3325, + "step": 18955 + }, + { + "epoch": 2.72648835202761, + "grad_norm": 0.32123425416763773, + "learning_rate": 4.073876398918519e-07, + "loss": 0.3659, + "step": 18960 + }, + { + "epoch": 2.7272073626689677, + "grad_norm": 0.3138156016388089, + "learning_rate": 4.0526313129022556e-07, + "loss": 0.3643, + "step": 18965 + }, + { + "epoch": 2.727926373310325, + "grad_norm": 0.31792892655190597, + "learning_rate": 4.0314406221288904e-07, + "loss": 0.3548, + "step": 18970 + }, + { + "epoch": 2.728645383951682, + "grad_norm": 0.31750579980553373, + "learning_rate": 4.0103043386120034e-07, + "loss": 0.3534, + "step": 18975 + }, + { + "epoch": 2.72936439459304, + "grad_norm": 0.3127828508352821, + "learning_rate": 3.989222474334331e-07, + "loss": 0.3552, + "step": 18980 + }, + { + "epoch": 2.7300834052343976, + "grad_norm": 0.31537497523927804, + "learning_rate": 3.968195041247813e-07, + "loss": 0.3583, + "step": 18985 + }, + { + "epoch": 2.730802415875755, + "grad_norm": 0.31938161691099504, + "learning_rate": 3.947222051273436e-07, + "loss": 0.3501, + "step": 18990 + }, + { + "epoch": 2.7315214265171126, + "grad_norm": 0.31652148422752785, + "learning_rate": 3.9263035163014216e-07, + "loss": 0.3444, + "step": 18995 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 0.30126188868707954, + "learning_rate": 3.9054394481910507e-07, + "loss": 0.3586, + "step": 19000 + }, + { + "epoch": 2.7329594477998276, + "grad_norm": 0.3102429634472407, + "learning_rate": 3.8846298587707276e-07, + "loss": 0.3558, + "step": 19005 + }, + { + "epoch": 2.733678458441185, + "grad_norm": 0.32717685832361776, + "learning_rate": 3.863874759838027e-07, + "loss": 0.342, + "step": 19010 + }, + { + "epoch": 2.7343974690825426, + "grad_norm": 0.3122104808693987, + "learning_rate": 3.8431741631595577e-07, + "loss": 0.351, + "step": 19015 + }, + { + "epoch": 2.7351164797239, + "grad_norm": 0.309312955399503, + "learning_rate": 3.8225280804710884e-07, + "loss": 0.3453, + "step": 19020 + }, + { + "epoch": 2.7358354903652575, + "grad_norm": 0.31322827510848456, + "learning_rate": 3.8019365234774565e-07, + "loss": 0.351, + "step": 19025 + }, + { + "epoch": 2.736554501006615, + "grad_norm": 0.30609678361472104, + "learning_rate": 3.7813995038525785e-07, + "loss": 0.3467, + "step": 19030 + }, + { + "epoch": 2.7372735116479725, + "grad_norm": 0.3036416135499562, + "learning_rate": 3.760917033239475e-07, + "loss": 0.3696, + "step": 19035 + }, + { + "epoch": 2.7379925222893298, + "grad_norm": 0.3127183110799516, + "learning_rate": 3.740489123250246e-07, + "loss": 0.335, + "step": 19040 + }, + { + "epoch": 2.7387115329306875, + "grad_norm": 0.31472999885863273, + "learning_rate": 3.7201157854660276e-07, + "loss": 0.3531, + "step": 19045 + }, + { + "epoch": 2.7394305435720447, + "grad_norm": 0.3060140112019616, + "learning_rate": 3.6997970314370244e-07, + "loss": 0.333, + "step": 19050 + }, + { + "epoch": 2.7401495542134024, + "grad_norm": 0.31436707159088967, + "learning_rate": 3.679532872682523e-07, + "loss": 0.3564, + "step": 19055 + }, + { + "epoch": 2.7408685648547597, + "grad_norm": 0.2973837776497081, + "learning_rate": 3.659323320690833e-07, + "loss": 0.3583, + "step": 19060 + }, + { + "epoch": 2.7415875754961174, + "grad_norm": 0.31471514171052006, + "learning_rate": 3.6391683869193005e-07, + "loss": 0.3572, + "step": 19065 + }, + { + "epoch": 2.7423065861374747, + "grad_norm": 0.31183057464625613, + "learning_rate": 3.619068082794353e-07, + "loss": 0.3585, + "step": 19070 + }, + { + "epoch": 2.7430255967788324, + "grad_norm": 0.3187039125707166, + "learning_rate": 3.5990224197113843e-07, + "loss": 0.3604, + "step": 19075 + }, + { + "epoch": 2.7437446074201897, + "grad_norm": 0.32225379075816213, + "learning_rate": 3.579031409034839e-07, + "loss": 0.3545, + "step": 19080 + }, + { + "epoch": 2.7444636180615474, + "grad_norm": 0.31170249003966904, + "learning_rate": 3.559095062098217e-07, + "loss": 0.3418, + "step": 19085 + }, + { + "epoch": 2.7451826287029046, + "grad_norm": 0.3076672510727331, + "learning_rate": 3.5392133902039663e-07, + "loss": 0.3519, + "step": 19090 + }, + { + "epoch": 2.7459016393442623, + "grad_norm": 0.32914504991219135, + "learning_rate": 3.5193864046235373e-07, + "loss": 0.3479, + "step": 19095 + }, + { + "epoch": 2.74662064998562, + "grad_norm": 0.2894981089596311, + "learning_rate": 3.4996141165974494e-07, + "loss": 0.3551, + "step": 19100 + }, + { + "epoch": 2.7473396606269773, + "grad_norm": 0.3291202963850717, + "learning_rate": 3.479896537335126e-07, + "loss": 0.345, + "step": 19105 + }, + { + "epoch": 2.7480586712683346, + "grad_norm": 0.3156365442267992, + "learning_rate": 3.4602336780150345e-07, + "loss": 0.3368, + "step": 19110 + }, + { + "epoch": 2.7487776819096923, + "grad_norm": 0.3131468049975896, + "learning_rate": 3.440625549784604e-07, + "loss": 0.3651, + "step": 19115 + }, + { + "epoch": 2.74949669255105, + "grad_norm": 0.32355695103390114, + "learning_rate": 3.4210721637601973e-07, + "loss": 0.3485, + "step": 19120 + }, + { + "epoch": 2.7502157031924073, + "grad_norm": 0.3265074768625927, + "learning_rate": 3.4015735310272024e-07, + "loss": 0.3545, + "step": 19125 + }, + { + "epoch": 2.7509347138337645, + "grad_norm": 0.31465707754685873, + "learning_rate": 3.3821296626399436e-07, + "loss": 0.336, + "step": 19130 + }, + { + "epoch": 2.7516537244751222, + "grad_norm": 0.3164483446457853, + "learning_rate": 3.36274056962167e-07, + "loss": 0.3563, + "step": 19135 + }, + { + "epoch": 2.75237273511648, + "grad_norm": 0.2971233101419614, + "learning_rate": 3.343406262964621e-07, + "loss": 0.3439, + "step": 19140 + }, + { + "epoch": 2.753091745757837, + "grad_norm": 0.3035407471454841, + "learning_rate": 3.3241267536299524e-07, + "loss": 0.3623, + "step": 19145 + }, + { + "epoch": 2.7538107563991945, + "grad_norm": 0.30526484121954384, + "learning_rate": 3.3049020525477316e-07, + "loss": 0.3393, + "step": 19150 + }, + { + "epoch": 2.754529767040552, + "grad_norm": 0.3024804150027958, + "learning_rate": 3.2857321706170175e-07, + "loss": 0.3508, + "step": 19155 + }, + { + "epoch": 2.75524877768191, + "grad_norm": 0.3124146280953073, + "learning_rate": 3.2666171187057284e-07, + "loss": 0.3588, + "step": 19160 + }, + { + "epoch": 2.755967788323267, + "grad_norm": 0.35628555495546016, + "learning_rate": 3.2475569076507064e-07, + "loss": 0.3479, + "step": 19165 + }, + { + "epoch": 2.7566867989646244, + "grad_norm": 0.31606148166597947, + "learning_rate": 3.2285515482577524e-07, + "loss": 0.3464, + "step": 19170 + }, + { + "epoch": 2.757405809605982, + "grad_norm": 0.3141344280963558, + "learning_rate": 3.209601051301503e-07, + "loss": 0.342, + "step": 19175 + }, + { + "epoch": 2.75812482024734, + "grad_norm": 0.3032858758815662, + "learning_rate": 3.190705427525542e-07, + "loss": 0.357, + "step": 19180 + }, + { + "epoch": 2.758843830888697, + "grad_norm": 0.316002321061407, + "learning_rate": 3.171864687642334e-07, + "loss": 0.3501, + "step": 19185 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 0.3096518923870791, + "learning_rate": 3.1530788423332124e-07, + "loss": 0.3508, + "step": 19190 + }, + { + "epoch": 2.760281852171412, + "grad_norm": 0.3083885146318327, + "learning_rate": 3.1343479022483805e-07, + "loss": 0.3627, + "step": 19195 + }, + { + "epoch": 2.76100086281277, + "grad_norm": 0.3401606194708313, + "learning_rate": 3.115671878006965e-07, + "loss": 0.3619, + "step": 19200 + }, + { + "epoch": 2.761719873454127, + "grad_norm": 0.31816989117475597, + "learning_rate": 3.097050780196886e-07, + "loss": 0.3552, + "step": 19205 + }, + { + "epoch": 2.7624388840954848, + "grad_norm": 0.3215018520643212, + "learning_rate": 3.0784846193749995e-07, + "loss": 0.3632, + "step": 19210 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.306117019547072, + "learning_rate": 3.059973406066963e-07, + "loss": 0.369, + "step": 19215 + }, + { + "epoch": 2.7638769053781997, + "grad_norm": 0.3075540707527376, + "learning_rate": 3.0415171507673034e-07, + "loss": 0.3615, + "step": 19220 + }, + { + "epoch": 2.764595916019557, + "grad_norm": 0.3167288278275876, + "learning_rate": 3.0231158639393744e-07, + "loss": 0.359, + "step": 19225 + }, + { + "epoch": 2.7653149266609147, + "grad_norm": 0.31235025822274415, + "learning_rate": 3.004769556015408e-07, + "loss": 0.3621, + "step": 19230 + }, + { + "epoch": 2.766033937302272, + "grad_norm": 0.31043606133104146, + "learning_rate": 2.9864782373964064e-07, + "loss": 0.3627, + "step": 19235 + }, + { + "epoch": 2.7667529479436297, + "grad_norm": 0.3061465253691321, + "learning_rate": 2.968241918452264e-07, + "loss": 0.3508, + "step": 19240 + }, + { + "epoch": 2.767471958584987, + "grad_norm": 0.31869793718769496, + "learning_rate": 2.9500606095216323e-07, + "loss": 0.3503, + "step": 19245 + }, + { + "epoch": 2.7681909692263447, + "grad_norm": 0.2998803905548194, + "learning_rate": 2.931934320912011e-07, + "loss": 0.3595, + "step": 19250 + }, + { + "epoch": 2.768909979867702, + "grad_norm": 0.29028144343980955, + "learning_rate": 2.913863062899702e-07, + "loss": 0.3542, + "step": 19255 + }, + { + "epoch": 2.7696289905090596, + "grad_norm": 0.32494600515578226, + "learning_rate": 2.8958468457297996e-07, + "loss": 0.3553, + "step": 19260 + }, + { + "epoch": 2.770348001150417, + "grad_norm": 0.3034610623098884, + "learning_rate": 2.8778856796161994e-07, + "loss": 0.3567, + "step": 19265 + }, + { + "epoch": 2.7710670117917746, + "grad_norm": 0.3069232780605492, + "learning_rate": 2.859979574741589e-07, + "loss": 0.3572, + "step": 19270 + }, + { + "epoch": 2.771786022433132, + "grad_norm": 0.31245632549477803, + "learning_rate": 2.8421285412574607e-07, + "loss": 0.3592, + "step": 19275 + }, + { + "epoch": 2.7725050330744896, + "grad_norm": 0.3033053941899692, + "learning_rate": 2.824332589284029e-07, + "loss": 0.3417, + "step": 19280 + }, + { + "epoch": 2.773224043715847, + "grad_norm": 0.32735633280602244, + "learning_rate": 2.806591728910357e-07, + "loss": 0.3577, + "step": 19285 + }, + { + "epoch": 2.7739430543572046, + "grad_norm": 0.31162274414451313, + "learning_rate": 2.7889059701942e-07, + "loss": 0.3488, + "step": 19290 + }, + { + "epoch": 2.774662064998562, + "grad_norm": 0.307206554261849, + "learning_rate": 2.7712753231621036e-07, + "loss": 0.3561, + "step": 19295 + }, + { + "epoch": 2.7753810756399195, + "grad_norm": 0.31895368120415385, + "learning_rate": 2.753699797809406e-07, + "loss": 0.3605, + "step": 19300 + }, + { + "epoch": 2.776100086281277, + "grad_norm": 0.31873626300607877, + "learning_rate": 2.7361794041001474e-07, + "loss": 0.3524, + "step": 19305 + }, + { + "epoch": 2.7768190969226345, + "grad_norm": 0.31776333008390883, + "learning_rate": 2.7187141519671277e-07, + "loss": 0.3635, + "step": 19310 + }, + { + "epoch": 2.777538107563992, + "grad_norm": 0.30582751106832323, + "learning_rate": 2.7013040513118813e-07, + "loss": 0.3406, + "step": 19315 + }, + { + "epoch": 2.7782571182053495, + "grad_norm": 0.3108606741265266, + "learning_rate": 2.68394911200468e-07, + "loss": 0.3653, + "step": 19320 + }, + { + "epoch": 2.7789761288467068, + "grad_norm": 0.30765930009578635, + "learning_rate": 2.666649343884531e-07, + "loss": 0.3576, + "step": 19325 + }, + { + "epoch": 2.7796951394880645, + "grad_norm": 0.3095905947908685, + "learning_rate": 2.6494047567591664e-07, + "loss": 0.3711, + "step": 19330 + }, + { + "epoch": 2.780414150129422, + "grad_norm": 0.3026101419425275, + "learning_rate": 2.6322153604049994e-07, + "loss": 0.3516, + "step": 19335 + }, + { + "epoch": 2.7811331607707794, + "grad_norm": 0.3023031462941279, + "learning_rate": 2.61508116456719e-07, + "loss": 0.351, + "step": 19340 + }, + { + "epoch": 2.7818521714121367, + "grad_norm": 0.3175845507467177, + "learning_rate": 2.598002178959602e-07, + "loss": 0.3635, + "step": 19345 + }, + { + "epoch": 2.7825711820534944, + "grad_norm": 0.3023168961882304, + "learning_rate": 2.5809784132647786e-07, + "loss": 0.3511, + "step": 19350 + }, + { + "epoch": 2.783290192694852, + "grad_norm": 0.31426012236313855, + "learning_rate": 2.564009877133977e-07, + "loss": 0.3897, + "step": 19355 + }, + { + "epoch": 2.7840092033362094, + "grad_norm": 0.3233617148924042, + "learning_rate": 2.547096580187125e-07, + "loss": 0.3666, + "step": 19360 + }, + { + "epoch": 2.7847282139775666, + "grad_norm": 0.3085777263843596, + "learning_rate": 2.5302385320128295e-07, + "loss": 0.3374, + "step": 19365 + }, + { + "epoch": 2.7854472246189244, + "grad_norm": 0.315282681974723, + "learning_rate": 2.513435742168413e-07, + "loss": 0.3511, + "step": 19370 + }, + { + "epoch": 2.786166235260282, + "grad_norm": 0.3141889988453881, + "learning_rate": 2.4966882201798436e-07, + "loss": 0.3571, + "step": 19375 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.3239914217961741, + "learning_rate": 2.479995975541749e-07, + "loss": 0.3549, + "step": 19380 + }, + { + "epoch": 2.7876042565429966, + "grad_norm": 0.3011667215231966, + "learning_rate": 2.463359017717437e-07, + "loss": 0.3602, + "step": 19385 + }, + { + "epoch": 2.7883232671843543, + "grad_norm": 0.29901731608420934, + "learning_rate": 2.446777356138863e-07, + "loss": 0.3419, + "step": 19390 + }, + { + "epoch": 2.789042277825712, + "grad_norm": 0.38783833461175193, + "learning_rate": 2.430251000206618e-07, + "loss": 0.349, + "step": 19395 + }, + { + "epoch": 2.7897612884670693, + "grad_norm": 0.31196293453991447, + "learning_rate": 2.4137799592899857e-07, + "loss": 0.3711, + "step": 19400 + }, + { + "epoch": 2.7904802991084265, + "grad_norm": 0.31413563856842847, + "learning_rate": 2.3973642427268405e-07, + "loss": 0.3551, + "step": 19405 + }, + { + "epoch": 2.7911993097497843, + "grad_norm": 0.30669848987056847, + "learning_rate": 2.381003859823694e-07, + "loss": 0.3645, + "step": 19410 + }, + { + "epoch": 2.791918320391142, + "grad_norm": 0.3129950796498745, + "learning_rate": 2.3646988198557375e-07, + "loss": 0.3436, + "step": 19415 + }, + { + "epoch": 2.7926373310324992, + "grad_norm": 0.3020904742624162, + "learning_rate": 2.3484491320667324e-07, + "loss": 0.3515, + "step": 19420 + }, + { + "epoch": 2.793356341673857, + "grad_norm": 0.31348790424609296, + "learning_rate": 2.3322548056690763e-07, + "loss": 0.3411, + "step": 19425 + }, + { + "epoch": 2.794075352315214, + "grad_norm": 0.30315554360892016, + "learning_rate": 2.316115849843803e-07, + "loss": 0.3369, + "step": 19430 + }, + { + "epoch": 2.794794362956572, + "grad_norm": 0.3029707042387598, + "learning_rate": 2.3000322737405266e-07, + "loss": 0.345, + "step": 19435 + }, + { + "epoch": 2.795513373597929, + "grad_norm": 0.30827057562429894, + "learning_rate": 2.284004086477487e-07, + "loss": 0.3551, + "step": 19440 + }, + { + "epoch": 2.796232384239287, + "grad_norm": 0.31638290497312094, + "learning_rate": 2.268031297141504e-07, + "loss": 0.3552, + "step": 19445 + }, + { + "epoch": 2.796951394880644, + "grad_norm": 0.298083641375201, + "learning_rate": 2.252113914787979e-07, + "loss": 0.3601, + "step": 19450 + }, + { + "epoch": 2.797670405522002, + "grad_norm": 0.29388492637341807, + "learning_rate": 2.2362519484409484e-07, + "loss": 0.3582, + "step": 19455 + }, + { + "epoch": 2.798389416163359, + "grad_norm": 0.3074794572376518, + "learning_rate": 2.220445407092997e-07, + "loss": 0.3545, + "step": 19460 + }, + { + "epoch": 2.799108426804717, + "grad_norm": 0.33058963087328974, + "learning_rate": 2.20469429970529e-07, + "loss": 0.3657, + "step": 19465 + }, + { + "epoch": 2.799827437446074, + "grad_norm": 0.32059830509972015, + "learning_rate": 2.1889986352075621e-07, + "loss": 0.3598, + "step": 19470 + }, + { + "epoch": 2.800546448087432, + "grad_norm": 0.31344674596084215, + "learning_rate": 2.1733584224981396e-07, + "loss": 0.3576, + "step": 19475 + }, + { + "epoch": 2.801265458728789, + "grad_norm": 0.3138969602559058, + "learning_rate": 2.1577736704438746e-07, + "loss": 0.3523, + "step": 19480 + }, + { + "epoch": 2.801984469370147, + "grad_norm": 0.31218452641803646, + "learning_rate": 2.1422443878802323e-07, + "loss": 0.3504, + "step": 19485 + }, + { + "epoch": 2.802703480011504, + "grad_norm": 0.3103625493885958, + "learning_rate": 2.1267705836111708e-07, + "loss": 0.3481, + "step": 19490 + }, + { + "epoch": 2.8034224906528618, + "grad_norm": 0.3394743217767908, + "learning_rate": 2.1113522664092168e-07, + "loss": 0.3614, + "step": 19495 + }, + { + "epoch": 2.804141501294219, + "grad_norm": 0.3194563285631549, + "learning_rate": 2.0959894450154783e-07, + "loss": 0.3573, + "step": 19500 + }, + { + "epoch": 2.8048605119355767, + "grad_norm": 0.31943217420345055, + "learning_rate": 2.0806821281395328e-07, + "loss": 0.3645, + "step": 19505 + }, + { + "epoch": 2.805579522576934, + "grad_norm": 0.3215617418555956, + "learning_rate": 2.0654303244595274e-07, + "loss": 0.3506, + "step": 19510 + }, + { + "epoch": 2.8062985332182917, + "grad_norm": 0.3186694011640869, + "learning_rate": 2.0502340426221568e-07, + "loss": 0.3764, + "step": 19515 + }, + { + "epoch": 2.807017543859649, + "grad_norm": 0.30126128632485133, + "learning_rate": 2.035093291242607e-07, + "loss": 0.348, + "step": 19520 + }, + { + "epoch": 2.8077365545010067, + "grad_norm": 0.31837897587547037, + "learning_rate": 2.0200080789045895e-07, + "loss": 0.3509, + "step": 19525 + }, + { + "epoch": 2.808455565142364, + "grad_norm": 0.3049340670206854, + "learning_rate": 2.0049784141603525e-07, + "loss": 0.3732, + "step": 19530 + }, + { + "epoch": 2.8091745757837217, + "grad_norm": 0.3200739069071617, + "learning_rate": 1.9900043055306018e-07, + "loss": 0.3623, + "step": 19535 + }, + { + "epoch": 2.809893586425079, + "grad_norm": 0.30563109975452546, + "learning_rate": 1.9750857615045915e-07, + "loss": 0.3446, + "step": 19540 + }, + { + "epoch": 2.8106125970664366, + "grad_norm": 0.3222392767558234, + "learning_rate": 1.9602227905400673e-07, + "loss": 0.3498, + "step": 19545 + }, + { + "epoch": 2.8113316077077943, + "grad_norm": 0.30758975157613416, + "learning_rate": 1.9454154010632553e-07, + "loss": 0.3657, + "step": 19550 + }, + { + "epoch": 2.8120506183491516, + "grad_norm": 0.30649135149595613, + "learning_rate": 1.930663601468885e-07, + "loss": 0.3555, + "step": 19555 + }, + { + "epoch": 2.812769628990509, + "grad_norm": 0.3151315801542019, + "learning_rate": 1.9159674001201556e-07, + "loss": 0.3472, + "step": 19560 + }, + { + "epoch": 2.8134886396318666, + "grad_norm": 0.3016552975710565, + "learning_rate": 1.9013268053487465e-07, + "loss": 0.3536, + "step": 19565 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 0.31533068041057283, + "learning_rate": 1.8867418254548298e-07, + "loss": 0.3506, + "step": 19570 + }, + { + "epoch": 2.8149266609145815, + "grad_norm": 0.29714007702411543, + "learning_rate": 1.8722124687070574e-07, + "loss": 0.3403, + "step": 19575 + }, + { + "epoch": 2.815645671555939, + "grad_norm": 0.31169946439304297, + "learning_rate": 1.8577387433424854e-07, + "loss": 0.3583, + "step": 19580 + }, + { + "epoch": 2.8163646821972965, + "grad_norm": 0.30565219090359186, + "learning_rate": 1.8433206575667161e-07, + "loss": 0.348, + "step": 19585 + }, + { + "epoch": 2.8170836928386542, + "grad_norm": 0.309771796015684, + "learning_rate": 1.8289582195537337e-07, + "loss": 0.353, + "step": 19590 + }, + { + "epoch": 2.8178027034800115, + "grad_norm": 0.30738045526837676, + "learning_rate": 1.8146514374460134e-07, + "loss": 0.3427, + "step": 19595 + }, + { + "epoch": 2.8185217141213688, + "grad_norm": 0.32212616978895764, + "learning_rate": 1.8004003193544894e-07, + "loss": 0.3345, + "step": 19600 + }, + { + "epoch": 2.8192407247627265, + "grad_norm": 0.31935665499208105, + "learning_rate": 1.7862048733584882e-07, + "loss": 0.3436, + "step": 19605 + }, + { + "epoch": 2.819959735404084, + "grad_norm": 0.31011607261544066, + "learning_rate": 1.772065107505816e-07, + "loss": 0.3549, + "step": 19610 + }, + { + "epoch": 2.8206787460454414, + "grad_norm": 0.2963752944269382, + "learning_rate": 1.7579810298127054e-07, + "loss": 0.3637, + "step": 19615 + }, + { + "epoch": 2.8213977566867987, + "grad_norm": 0.303716726779769, + "learning_rate": 1.7439526482638136e-07, + "loss": 0.3603, + "step": 19620 + }, + { + "epoch": 2.8221167673281564, + "grad_norm": 0.3194904910954896, + "learning_rate": 1.7299799708122124e-07, + "loss": 0.3648, + "step": 19625 + }, + { + "epoch": 2.822835777969514, + "grad_norm": 0.32609672240121534, + "learning_rate": 1.7160630053794203e-07, + "loss": 0.3431, + "step": 19630 + }, + { + "epoch": 2.8235547886108714, + "grad_norm": 0.30902138416485625, + "learning_rate": 1.7022017598553376e-07, + "loss": 0.3488, + "step": 19635 + }, + { + "epoch": 2.8242737992522287, + "grad_norm": 0.32988805197025134, + "learning_rate": 1.6883962420982892e-07, + "loss": 0.3591, + "step": 19640 + }, + { + "epoch": 2.8249928098935864, + "grad_norm": 0.31791131140624473, + "learning_rate": 1.6746464599350253e-07, + "loss": 0.3705, + "step": 19645 + }, + { + "epoch": 2.825711820534944, + "grad_norm": 0.36161136472309374, + "learning_rate": 1.6609524211606666e-07, + "loss": 0.3498, + "step": 19650 + }, + { + "epoch": 2.8264308311763013, + "grad_norm": 0.3069902726420009, + "learning_rate": 1.6473141335387688e-07, + "loss": 0.3606, + "step": 19655 + }, + { + "epoch": 2.827149841817659, + "grad_norm": 0.30450847009491083, + "learning_rate": 1.6337316048012142e-07, + "loss": 0.3513, + "step": 19660 + }, + { + "epoch": 2.8278688524590163, + "grad_norm": 0.3090335390208858, + "learning_rate": 1.6202048426483652e-07, + "loss": 0.3593, + "step": 19665 + }, + { + "epoch": 2.828587863100374, + "grad_norm": 0.29473249703071286, + "learning_rate": 1.6067338547488875e-07, + "loss": 0.3409, + "step": 19670 + }, + { + "epoch": 2.8293068737417313, + "grad_norm": 0.30502673474783937, + "learning_rate": 1.5933186487398945e-07, + "loss": 0.3539, + "step": 19675 + }, + { + "epoch": 2.830025884383089, + "grad_norm": 0.3237522346303837, + "learning_rate": 1.579959232226802e-07, + "loss": 0.3565, + "step": 19680 + }, + { + "epoch": 2.8307448950244463, + "grad_norm": 0.29410601198753455, + "learning_rate": 1.566655612783452e-07, + "loss": 0.3494, + "step": 19685 + }, + { + "epoch": 2.831463905665804, + "grad_norm": 0.3201855612913448, + "learning_rate": 1.5534077979520558e-07, + "loss": 0.36, + "step": 19690 + }, + { + "epoch": 2.8321829163071612, + "grad_norm": 0.3176807634275686, + "learning_rate": 1.5402157952431385e-07, + "loss": 0.357, + "step": 19695 + }, + { + "epoch": 2.832901926948519, + "grad_norm": 0.32624540853994793, + "learning_rate": 1.5270796121356402e-07, + "loss": 0.3555, + "step": 19700 + }, + { + "epoch": 2.833620937589876, + "grad_norm": 0.30558565906106155, + "learning_rate": 1.5139992560768257e-07, + "loss": 0.3638, + "step": 19705 + }, + { + "epoch": 2.834339948231234, + "grad_norm": 0.3148425885106696, + "learning_rate": 1.5009747344822966e-07, + "loss": 0.3485, + "step": 19710 + }, + { + "epoch": 2.835058958872591, + "grad_norm": 0.3151890767325172, + "learning_rate": 1.488006054736024e-07, + "loss": 0.3486, + "step": 19715 + }, + { + "epoch": 2.835777969513949, + "grad_norm": 0.2990771173282484, + "learning_rate": 1.4750932241903382e-07, + "loss": 0.3583, + "step": 19720 + }, + { + "epoch": 2.836496980155306, + "grad_norm": 0.3092892688996735, + "learning_rate": 1.4622362501658495e-07, + "loss": 0.3478, + "step": 19725 + }, + { + "epoch": 2.837215990796664, + "grad_norm": 0.31105647203704995, + "learning_rate": 1.4494351399515604e-07, + "loss": 0.3742, + "step": 19730 + }, + { + "epoch": 2.837935001438021, + "grad_norm": 0.31333961454391107, + "learning_rate": 1.4366899008047774e-07, + "loss": 0.3457, + "step": 19735 + }, + { + "epoch": 2.838654012079379, + "grad_norm": 0.3098972660151477, + "learning_rate": 1.4240005399511091e-07, + "loss": 0.3445, + "step": 19740 + }, + { + "epoch": 2.839373022720736, + "grad_norm": 0.29904952311822824, + "learning_rate": 1.4113670645845345e-07, + "loss": 0.3674, + "step": 19745 + }, + { + "epoch": 2.840092033362094, + "grad_norm": 0.30724864773465854, + "learning_rate": 1.398789481867313e-07, + "loss": 0.3683, + "step": 19750 + }, + { + "epoch": 2.840811044003451, + "grad_norm": 0.3098518253670769, + "learning_rate": 1.3862677989300188e-07, + "loss": 0.3427, + "step": 19755 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 0.3007055183846121, + "learning_rate": 1.373802022871551e-07, + "loss": 0.3337, + "step": 19760 + }, + { + "epoch": 2.8422490652861665, + "grad_norm": 0.31362905222662657, + "learning_rate": 1.361392160759112e-07, + "loss": 0.3541, + "step": 19765 + }, + { + "epoch": 2.8429680759275238, + "grad_norm": 0.3086581602027274, + "learning_rate": 1.3490382196281959e-07, + "loss": 0.3366, + "step": 19770 + }, + { + "epoch": 2.843687086568881, + "grad_norm": 0.32186996399691165, + "learning_rate": 1.3367402064826007e-07, + "loss": 0.3569, + "step": 19775 + }, + { + "epoch": 2.8444060972102387, + "grad_norm": 0.30885699347696555, + "learning_rate": 1.3244981282944047e-07, + "loss": 0.3534, + "step": 19780 + }, + { + "epoch": 2.8451251078515964, + "grad_norm": 0.31284364176405044, + "learning_rate": 1.3123119920039894e-07, + "loss": 0.3544, + "step": 19785 + }, + { + "epoch": 2.8458441184929537, + "grad_norm": 0.3160906628771324, + "learning_rate": 1.3001818045200175e-07, + "loss": 0.3401, + "step": 19790 + }, + { + "epoch": 2.846563129134311, + "grad_norm": 0.32702047456560096, + "learning_rate": 1.2881075727194214e-07, + "loss": 0.3769, + "step": 19795 + }, + { + "epoch": 2.8472821397756687, + "grad_norm": 0.3090608205253169, + "learning_rate": 1.2760893034474254e-07, + "loss": 0.3499, + "step": 19800 + }, + { + "epoch": 2.8480011504170264, + "grad_norm": 0.3138204067749859, + "learning_rate": 1.2641270035175347e-07, + "loss": 0.3728, + "step": 19805 + }, + { + "epoch": 2.8487201610583837, + "grad_norm": 0.30587465368286, + "learning_rate": 1.25222067971148e-07, + "loss": 0.3591, + "step": 19810 + }, + { + "epoch": 2.849439171699741, + "grad_norm": 0.3151828226894727, + "learning_rate": 1.2403703387793176e-07, + "loss": 0.3524, + "step": 19815 + }, + { + "epoch": 2.8501581823410986, + "grad_norm": 0.31006606220294936, + "learning_rate": 1.228575987439329e-07, + "loss": 0.3546, + "step": 19820 + }, + { + "epoch": 2.8508771929824563, + "grad_norm": 0.29605394252109407, + "learning_rate": 1.2168376323780652e-07, + "loss": 0.356, + "step": 19825 + }, + { + "epoch": 2.8515962036238136, + "grad_norm": 0.31417656865686544, + "learning_rate": 1.205155280250314e-07, + "loss": 0.351, + "step": 19830 + }, + { + "epoch": 2.852315214265171, + "grad_norm": 0.30377191215236227, + "learning_rate": 1.193528937679145e-07, + "loss": 0.3546, + "step": 19835 + }, + { + "epoch": 2.8530342249065286, + "grad_norm": 0.3140116671650671, + "learning_rate": 1.1819586112558401e-07, + "loss": 0.3697, + "step": 19840 + }, + { + "epoch": 2.8537532355478863, + "grad_norm": 0.31720157777001484, + "learning_rate": 1.1704443075399418e-07, + "loss": 0.3534, + "step": 19845 + }, + { + "epoch": 2.8544722461892436, + "grad_norm": 0.2927661022495849, + "learning_rate": 1.1589860330592506e-07, + "loss": 0.3356, + "step": 19850 + }, + { + "epoch": 2.855191256830601, + "grad_norm": 0.3074670230872882, + "learning_rate": 1.147583794309759e-07, + "loss": 0.3433, + "step": 19855 + }, + { + "epoch": 2.8559102674719585, + "grad_norm": 0.31164255393992035, + "learning_rate": 1.1362375977557183e-07, + "loss": 0.3407, + "step": 19860 + }, + { + "epoch": 2.8566292781133162, + "grad_norm": 0.3144825590840295, + "learning_rate": 1.1249474498296053e-07, + "loss": 0.3461, + "step": 19865 + }, + { + "epoch": 2.8573482887546735, + "grad_norm": 0.315202884878571, + "learning_rate": 1.1137133569321335e-07, + "loss": 0.3491, + "step": 19870 + }, + { + "epoch": 2.858067299396031, + "grad_norm": 0.45033323533661335, + "learning_rate": 1.1025353254322191e-07, + "loss": 0.3529, + "step": 19875 + }, + { + "epoch": 2.8587863100373885, + "grad_norm": 0.3159910074980656, + "learning_rate": 1.0914133616669931e-07, + "loss": 0.3548, + "step": 19880 + }, + { + "epoch": 2.859505320678746, + "grad_norm": 0.31832852961200697, + "learning_rate": 1.0803474719418006e-07, + "loss": 0.3601, + "step": 19885 + }, + { + "epoch": 2.8602243313201035, + "grad_norm": 0.3227663752680385, + "learning_rate": 1.0693376625302232e-07, + "loss": 0.3533, + "step": 19890 + }, + { + "epoch": 2.860943341961461, + "grad_norm": 0.3201045100876346, + "learning_rate": 1.0583839396740126e-07, + "loss": 0.3418, + "step": 19895 + }, + { + "epoch": 2.8616623526028184, + "grad_norm": 0.3065616056971823, + "learning_rate": 1.0474863095831566e-07, + "loss": 0.3651, + "step": 19900 + }, + { + "epoch": 2.862381363244176, + "grad_norm": 0.3093595730538003, + "learning_rate": 1.0366447784358025e-07, + "loss": 0.3558, + "step": 19905 + }, + { + "epoch": 2.8631003738855334, + "grad_norm": 0.3127569775360688, + "learning_rate": 1.0258593523783444e-07, + "loss": 0.3487, + "step": 19910 + }, + { + "epoch": 2.863819384526891, + "grad_norm": 0.3127546755793686, + "learning_rate": 1.0151300375253138e-07, + "loss": 0.3515, + "step": 19915 + }, + { + "epoch": 2.8645383951682484, + "grad_norm": 0.3127556843332249, + "learning_rate": 1.0044568399594778e-07, + "loss": 0.3667, + "step": 19920 + }, + { + "epoch": 2.865257405809606, + "grad_norm": 0.3019032436054994, + "learning_rate": 9.938397657317633e-08, + "loss": 0.3417, + "step": 19925 + }, + { + "epoch": 2.8659764164509633, + "grad_norm": 0.32179642385988344, + "learning_rate": 9.832788208612998e-08, + "loss": 0.3506, + "step": 19930 + }, + { + "epoch": 2.866695427092321, + "grad_norm": 0.31825252312903735, + "learning_rate": 9.727740113353645e-08, + "loss": 0.3552, + "step": 19935 + }, + { + "epoch": 2.8674144377336783, + "grad_norm": 0.3059276048014507, + "learning_rate": 9.62325343109427e-08, + "loss": 0.3606, + "step": 19940 + }, + { + "epoch": 2.868133448375036, + "grad_norm": 0.31518111667148346, + "learning_rate": 9.519328221071378e-08, + "loss": 0.3538, + "step": 19945 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 0.33909437397438713, + "learning_rate": 9.415964542203059e-08, + "loss": 0.36, + "step": 19950 + }, + { + "epoch": 2.869571469657751, + "grad_norm": 0.3271387134786379, + "learning_rate": 9.313162453088997e-08, + "loss": 0.3628, + "step": 19955 + }, + { + "epoch": 2.8702904802991083, + "grad_norm": 0.3022971868672396, + "learning_rate": 9.210922012010681e-08, + "loss": 0.3524, + "step": 19960 + }, + { + "epoch": 2.871009490940466, + "grad_norm": 0.3032486265662045, + "learning_rate": 9.109243276930968e-08, + "loss": 0.3519, + "step": 19965 + }, + { + "epoch": 2.8717285015818232, + "grad_norm": 0.3166470766101174, + "learning_rate": 9.008126305494524e-08, + "loss": 0.3506, + "step": 19970 + }, + { + "epoch": 2.872447512223181, + "grad_norm": 0.30724298045971954, + "learning_rate": 8.907571155027272e-08, + "loss": 0.344, + "step": 19975 + }, + { + "epoch": 2.8731665228645387, + "grad_norm": 0.2998477886331141, + "learning_rate": 8.807577882536611e-08, + "loss": 0.3327, + "step": 19980 + }, + { + "epoch": 2.873885533505896, + "grad_norm": 0.31285567261052705, + "learning_rate": 8.708146544711749e-08, + "loss": 0.3558, + "step": 19985 + }, + { + "epoch": 2.874604544147253, + "grad_norm": 0.336148110439647, + "learning_rate": 8.609277197923038e-08, + "loss": 0.3574, + "step": 19990 + }, + { + "epoch": 2.875323554788611, + "grad_norm": 0.3142366128590145, + "learning_rate": 8.510969898222199e-08, + "loss": 0.3657, + "step": 19995 + }, + { + "epoch": 2.8760425654299686, + "grad_norm": 0.35590677060330256, + "learning_rate": 8.413224701342427e-08, + "loss": 0.336, + "step": 20000 + }, + { + "epoch": 2.876761576071326, + "grad_norm": 0.3155349391121181, + "learning_rate": 8.31604166269806e-08, + "loss": 0.3615, + "step": 20005 + }, + { + "epoch": 2.877480586712683, + "grad_norm": 0.3105909821116713, + "learning_rate": 8.219420837385139e-08, + "loss": 0.3558, + "step": 20010 + }, + { + "epoch": 2.878199597354041, + "grad_norm": 0.31943968035518244, + "learning_rate": 8.123362280180514e-08, + "loss": 0.3392, + "step": 20015 + }, + { + "epoch": 2.8789186079953986, + "grad_norm": 0.3188491758331916, + "learning_rate": 8.02786604554262e-08, + "loss": 0.3378, + "step": 20020 + }, + { + "epoch": 2.879637618636756, + "grad_norm": 0.31796106698426874, + "learning_rate": 7.93293218761071e-08, + "loss": 0.3755, + "step": 20025 + }, + { + "epoch": 2.880356629278113, + "grad_norm": 0.30798491903781205, + "learning_rate": 7.838560760205727e-08, + "loss": 0.3452, + "step": 20030 + }, + { + "epoch": 2.881075639919471, + "grad_norm": 0.3147825658639276, + "learning_rate": 7.74475181682921e-08, + "loss": 0.3589, + "step": 20035 + }, + { + "epoch": 2.8817946505608285, + "grad_norm": 0.30130707474401197, + "learning_rate": 7.651505410664284e-08, + "loss": 0.3433, + "step": 20040 + }, + { + "epoch": 2.8825136612021858, + "grad_norm": 0.3001199192213933, + "learning_rate": 7.558821594574773e-08, + "loss": 0.3523, + "step": 20045 + }, + { + "epoch": 2.883232671843543, + "grad_norm": 0.30609348693305766, + "learning_rate": 7.466700421105643e-08, + "loss": 0.3539, + "step": 20050 + }, + { + "epoch": 2.8839516824849007, + "grad_norm": 0.30529711807306814, + "learning_rate": 7.375141942483343e-08, + "loss": 0.3535, + "step": 20055 + }, + { + "epoch": 2.8846706931262585, + "grad_norm": 0.3085725150077207, + "learning_rate": 7.284146210614463e-08, + "loss": 0.3596, + "step": 20060 + }, + { + "epoch": 2.8853897037676157, + "grad_norm": 0.32379734232553514, + "learning_rate": 7.1937132770874e-08, + "loss": 0.3502, + "step": 20065 + }, + { + "epoch": 2.886108714408973, + "grad_norm": 0.30441899310242754, + "learning_rate": 7.103843193170924e-08, + "loss": 0.3643, + "step": 20070 + }, + { + "epoch": 2.8868277250503307, + "grad_norm": 0.30441945106316937, + "learning_rate": 7.014536009814943e-08, + "loss": 0.3623, + "step": 20075 + }, + { + "epoch": 2.8875467356916884, + "grad_norm": 0.307420928138311, + "learning_rate": 6.925791777650181e-08, + "loss": 0.3422, + "step": 20080 + }, + { + "epoch": 2.8882657463330457, + "grad_norm": 0.3133443991613013, + "learning_rate": 6.837610546988061e-08, + "loss": 0.3449, + "step": 20085 + }, + { + "epoch": 2.8889847569744034, + "grad_norm": 0.31544187213757585, + "learning_rate": 6.749992367821367e-08, + "loss": 0.362, + "step": 20090 + }, + { + "epoch": 2.8897037676157606, + "grad_norm": 0.3181257327525517, + "learning_rate": 6.662937289822924e-08, + "loss": 0.3524, + "step": 20095 + }, + { + "epoch": 2.8904227782571184, + "grad_norm": 0.31568453456892315, + "learning_rate": 6.576445362346917e-08, + "loss": 0.35, + "step": 20100 + }, + { + "epoch": 2.8911417888984756, + "grad_norm": 0.31390550340544804, + "learning_rate": 6.490516634427901e-08, + "loss": 0.3374, + "step": 20105 + }, + { + "epoch": 2.8918607995398333, + "grad_norm": 0.31081366225391455, + "learning_rate": 6.405151154781241e-08, + "loss": 0.3625, + "step": 20110 + }, + { + "epoch": 2.8925798101811906, + "grad_norm": 0.3177500145092479, + "learning_rate": 6.320348971803225e-08, + "loss": 0.3564, + "step": 20115 + }, + { + "epoch": 2.8932988208225483, + "grad_norm": 0.29677972139612535, + "learning_rate": 6.236110133570505e-08, + "loss": 0.3517, + "step": 20120 + }, + { + "epoch": 2.8940178314639056, + "grad_norm": 0.3206822403012554, + "learning_rate": 6.152434687840214e-08, + "loss": 0.3471, + "step": 20125 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.30565224332619295, + "learning_rate": 6.069322682050516e-08, + "loss": 0.359, + "step": 20130 + }, + { + "epoch": 2.8954558527466205, + "grad_norm": 0.2990491966371965, + "learning_rate": 5.986774163319942e-08, + "loss": 0.3442, + "step": 20135 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 0.328629335753113, + "learning_rate": 5.90478917844739e-08, + "loss": 0.3781, + "step": 20140 + }, + { + "epoch": 2.8968938740293355, + "grad_norm": 0.3011937892687381, + "learning_rate": 5.823367773912569e-08, + "loss": 0.3389, + "step": 20145 + }, + { + "epoch": 2.8976128846706932, + "grad_norm": 0.3068318241936056, + "learning_rate": 5.742509995875445e-08, + "loss": 0.3702, + "step": 20150 + }, + { + "epoch": 2.8983318953120505, + "grad_norm": 0.2999250357976834, + "learning_rate": 5.66221589017657e-08, + "loss": 0.3576, + "step": 20155 + }, + { + "epoch": 2.899050905953408, + "grad_norm": 0.30259364045053866, + "learning_rate": 5.582485502337087e-08, + "loss": 0.3578, + "step": 20160 + }, + { + "epoch": 2.8997699165947655, + "grad_norm": 0.31667873659095214, + "learning_rate": 5.503318877558172e-08, + "loss": 0.3567, + "step": 20165 + }, + { + "epoch": 2.900488927236123, + "grad_norm": 0.3115584463254306, + "learning_rate": 5.424716060721702e-08, + "loss": 0.3552, + "step": 20170 + }, + { + "epoch": 2.9012079378774804, + "grad_norm": 0.3079569355921638, + "learning_rate": 5.3466770963898074e-08, + "loss": 0.3515, + "step": 20175 + }, + { + "epoch": 2.901926948518838, + "grad_norm": 0.3130676722117026, + "learning_rate": 5.269202028804876e-08, + "loss": 0.3421, + "step": 20180 + }, + { + "epoch": 2.9026459591601954, + "grad_norm": 0.31657513998731357, + "learning_rate": 5.192290901889774e-08, + "loss": 0.3552, + "step": 20185 + }, + { + "epoch": 2.903364969801553, + "grad_norm": 0.3033146127249199, + "learning_rate": 5.11594375924751e-08, + "loss": 0.3312, + "step": 20190 + }, + { + "epoch": 2.904083980442911, + "grad_norm": 0.3090288154840797, + "learning_rate": 5.0401606441613515e-08, + "loss": 0.3829, + "step": 20195 + }, + { + "epoch": 2.904802991084268, + "grad_norm": 0.6597731135932114, + "learning_rate": 4.964941599595041e-08, + "loss": 0.3414, + "step": 20200 + }, + { + "epoch": 2.9055220017256254, + "grad_norm": 0.3039729546228988, + "learning_rate": 4.890286668192246e-08, + "loss": 0.3509, + "step": 20205 + }, + { + "epoch": 2.906241012366983, + "grad_norm": 0.31292629673882, + "learning_rate": 4.816195892276887e-08, + "loss": 0.3675, + "step": 20210 + }, + { + "epoch": 2.9069600230083408, + "grad_norm": 0.3091273065471905, + "learning_rate": 4.742669313853254e-08, + "loss": 0.3425, + "step": 20215 + }, + { + "epoch": 2.907679033649698, + "grad_norm": 0.31573160565098474, + "learning_rate": 4.669706974605559e-08, + "loss": 0.3676, + "step": 20220 + }, + { + "epoch": 2.9083980442910553, + "grad_norm": 0.32873206136837513, + "learning_rate": 4.5973089158980464e-08, + "loss": 0.3499, + "step": 20225 + }, + { + "epoch": 2.909117054932413, + "grad_norm": 0.3004924279055244, + "learning_rate": 4.5254751787753294e-08, + "loss": 0.3555, + "step": 20230 + }, + { + "epoch": 2.9098360655737707, + "grad_norm": 0.3169102105053304, + "learning_rate": 4.454205803961942e-08, + "loss": 0.3607, + "step": 20235 + }, + { + "epoch": 2.910555076215128, + "grad_norm": 0.3021816853831664, + "learning_rate": 4.383500831862342e-08, + "loss": 0.3476, + "step": 20240 + }, + { + "epoch": 2.9112740868564853, + "grad_norm": 0.3060949401927673, + "learning_rate": 4.3133603025614644e-08, + "loss": 0.3745, + "step": 20245 + }, + { + "epoch": 2.911993097497843, + "grad_norm": 0.3109649592297195, + "learning_rate": 4.243784255823613e-08, + "loss": 0.3462, + "step": 20250 + }, + { + "epoch": 2.9127121081392007, + "grad_norm": 0.32036028908374864, + "learning_rate": 4.1747727310935683e-08, + "loss": 0.3631, + "step": 20255 + }, + { + "epoch": 2.913431118780558, + "grad_norm": 0.31581149777093853, + "learning_rate": 4.106325767495811e-08, + "loss": 0.3466, + "step": 20260 + }, + { + "epoch": 2.914150129421915, + "grad_norm": 0.31533401382203285, + "learning_rate": 4.038443403834969e-08, + "loss": 0.3474, + "step": 20265 + }, + { + "epoch": 2.914869140063273, + "grad_norm": 0.30029568636648557, + "learning_rate": 3.9711256785953666e-08, + "loss": 0.335, + "step": 20270 + }, + { + "epoch": 2.9155881507046306, + "grad_norm": 0.30555415771095046, + "learning_rate": 3.9043726299412555e-08, + "loss": 0.3643, + "step": 20275 + }, + { + "epoch": 2.916307161345988, + "grad_norm": 0.31782358760179896, + "learning_rate": 3.838184295716807e-08, + "loss": 0.3509, + "step": 20280 + }, + { + "epoch": 2.917026171987345, + "grad_norm": 0.3076610569146311, + "learning_rate": 3.772560713446116e-08, + "loss": 0.3564, + "step": 20285 + }, + { + "epoch": 2.917745182628703, + "grad_norm": 0.29596165046549433, + "learning_rate": 3.7075019203329785e-08, + "loss": 0.3553, + "step": 20290 + }, + { + "epoch": 2.9184641932700606, + "grad_norm": 0.28946501766402427, + "learning_rate": 3.643007953261002e-08, + "loss": 0.3449, + "step": 20295 + }, + { + "epoch": 2.919183203911418, + "grad_norm": 0.3033439392793503, + "learning_rate": 3.579078848793605e-08, + "loss": 0.3634, + "step": 20300 + }, + { + "epoch": 2.9199022145527755, + "grad_norm": 0.32510429371217153, + "learning_rate": 3.5157146431741285e-08, + "loss": 0.3495, + "step": 20305 + }, + { + "epoch": 2.920621225194133, + "grad_norm": 0.31844070605679214, + "learning_rate": 3.452915372325394e-08, + "loss": 0.3543, + "step": 20310 + }, + { + "epoch": 2.9213402358354905, + "grad_norm": 0.31533466726486986, + "learning_rate": 3.390681071850033e-08, + "loss": 0.3672, + "step": 20315 + }, + { + "epoch": 2.922059246476848, + "grad_norm": 0.30434420248270794, + "learning_rate": 3.3290117770306e-08, + "loss": 0.3597, + "step": 20320 + }, + { + "epoch": 2.9227782571182055, + "grad_norm": 0.3566339511084951, + "learning_rate": 3.2679075228289056e-08, + "loss": 0.3476, + "step": 20325 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 0.3161302342664479, + "learning_rate": 3.2073683438866856e-08, + "loss": 0.3311, + "step": 20330 + }, + { + "epoch": 2.9242162784009205, + "grad_norm": 0.33681221067176464, + "learning_rate": 3.147394274525484e-08, + "loss": 0.3534, + "step": 20335 + }, + { + "epoch": 2.9249352890422777, + "grad_norm": 0.3183033251815415, + "learning_rate": 3.0879853487461034e-08, + "loss": 0.3531, + "step": 20340 + }, + { + "epoch": 2.9256542996836354, + "grad_norm": 0.31419493538006127, + "learning_rate": 3.029141600229157e-08, + "loss": 0.3563, + "step": 20345 + }, + { + "epoch": 2.9263733103249927, + "grad_norm": 0.32750253413973196, + "learning_rate": 2.97086306233485e-08, + "loss": 0.3617, + "step": 20350 + }, + { + "epoch": 2.9270923209663504, + "grad_norm": 0.3194802734755754, + "learning_rate": 2.913149768102752e-08, + "loss": 0.3698, + "step": 20355 + }, + { + "epoch": 2.9278113316077077, + "grad_norm": 0.3065275294013698, + "learning_rate": 2.8560017502524684e-08, + "loss": 0.3525, + "step": 20360 + }, + { + "epoch": 2.9285303422490654, + "grad_norm": 0.31590621149316106, + "learning_rate": 2.7994190411825272e-08, + "loss": 0.3543, + "step": 20365 + }, + { + "epoch": 2.9292493528904227, + "grad_norm": 0.2988018337298097, + "learning_rate": 2.7434016729712688e-08, + "loss": 0.3454, + "step": 20370 + }, + { + "epoch": 2.9299683635317804, + "grad_norm": 0.3111543446081254, + "learning_rate": 2.6879496773766223e-08, + "loss": 0.3492, + "step": 20375 + }, + { + "epoch": 2.9306873741731376, + "grad_norm": 0.3135170301967151, + "learning_rate": 2.6330630858358854e-08, + "loss": 0.3639, + "step": 20380 + }, + { + "epoch": 2.9314063848144953, + "grad_norm": 0.30293972283016546, + "learning_rate": 2.5787419294656113e-08, + "loss": 0.3563, + "step": 20385 + }, + { + "epoch": 2.9321253954558526, + "grad_norm": 0.3088210817503903, + "learning_rate": 2.524986239062166e-08, + "loss": 0.3533, + "step": 20390 + }, + { + "epoch": 2.9328444060972103, + "grad_norm": 0.3057889408584442, + "learning_rate": 2.4717960451010604e-08, + "loss": 0.3562, + "step": 20395 + }, + { + "epoch": 2.9335634167385676, + "grad_norm": 0.3140805414092627, + "learning_rate": 2.4191713777373947e-08, + "loss": 0.3419, + "step": 20400 + }, + { + "epoch": 2.9342824273799253, + "grad_norm": 0.3171695758890566, + "learning_rate": 2.3671122668054157e-08, + "loss": 0.3641, + "step": 20405 + }, + { + "epoch": 2.9350014380212825, + "grad_norm": 0.30623977691725907, + "learning_rate": 2.3156187418189592e-08, + "loss": 0.3439, + "step": 20410 + }, + { + "epoch": 2.9357204486626403, + "grad_norm": 0.3119455073162684, + "learning_rate": 2.264690831971228e-08, + "loss": 0.3542, + "step": 20415 + }, + { + "epoch": 2.9364394593039975, + "grad_norm": 0.31863085872319036, + "learning_rate": 2.2143285661345716e-08, + "loss": 0.3616, + "step": 20420 + }, + { + "epoch": 2.9371584699453552, + "grad_norm": 0.32105718207060935, + "learning_rate": 2.1645319728607063e-08, + "loss": 0.3602, + "step": 20425 + }, + { + "epoch": 2.937877480586713, + "grad_norm": 0.31514042136363335, + "learning_rate": 2.115301080380827e-08, + "loss": 0.3522, + "step": 20430 + }, + { + "epoch": 2.93859649122807, + "grad_norm": 0.3137089621665793, + "learning_rate": 2.066635916605386e-08, + "loss": 0.3518, + "step": 20435 + }, + { + "epoch": 2.9393155018694275, + "grad_norm": 0.31346497350839764, + "learning_rate": 2.0185365091237584e-08, + "loss": 0.3629, + "step": 20440 + }, + { + "epoch": 2.940034512510785, + "grad_norm": 0.31697462485362327, + "learning_rate": 1.971002885205131e-08, + "loss": 0.343, + "step": 20445 + }, + { + "epoch": 2.940753523152143, + "grad_norm": 0.30895496705501163, + "learning_rate": 1.924035071797392e-08, + "loss": 0.3546, + "step": 20450 + }, + { + "epoch": 2.9414725337935, + "grad_norm": 0.324785557475365, + "learning_rate": 1.87763309552802e-08, + "loss": 0.3589, + "step": 20455 + }, + { + "epoch": 2.9421915444348574, + "grad_norm": 0.3178029299759339, + "learning_rate": 1.8317969827036374e-08, + "loss": 0.3598, + "step": 20460 + }, + { + "epoch": 2.942910555076215, + "grad_norm": 0.31453703135147415, + "learning_rate": 1.7865267593099035e-08, + "loss": 0.3573, + "step": 20465 + }, + { + "epoch": 2.943629565717573, + "grad_norm": 0.31021776925122346, + "learning_rate": 1.741822451011954e-08, + "loss": 0.3607, + "step": 20470 + }, + { + "epoch": 2.94434857635893, + "grad_norm": 0.31773060517094964, + "learning_rate": 1.697684083153739e-08, + "loss": 0.3598, + "step": 20475 + }, + { + "epoch": 2.9450675870002874, + "grad_norm": 0.3242150798460791, + "learning_rate": 1.6541116807585746e-08, + "loss": 0.3556, + "step": 20480 + }, + { + "epoch": 2.945786597641645, + "grad_norm": 0.31054625988596773, + "learning_rate": 1.611105268528812e-08, + "loss": 0.3583, + "step": 20485 + }, + { + "epoch": 2.946505608283003, + "grad_norm": 0.3221031333992454, + "learning_rate": 1.5686648708461706e-08, + "loss": 0.358, + "step": 20490 + }, + { + "epoch": 2.94722461892436, + "grad_norm": 0.30824838943691657, + "learning_rate": 1.52679051177107e-08, + "loss": 0.3649, + "step": 20495 + }, + { + "epoch": 2.9479436295657173, + "grad_norm": 0.3032873167790253, + "learning_rate": 1.4854822150435211e-08, + "loss": 0.3365, + "step": 20500 + }, + { + "epoch": 2.948662640207075, + "grad_norm": 0.3096758621157358, + "learning_rate": 1.4447400040821236e-08, + "loss": 0.3488, + "step": 20505 + }, + { + "epoch": 2.9493816508484327, + "grad_norm": 0.3128423976533039, + "learning_rate": 1.4045639019848456e-08, + "loss": 0.3601, + "step": 20510 + }, + { + "epoch": 2.95010066148979, + "grad_norm": 0.31522955258712954, + "learning_rate": 1.3649539315285787e-08, + "loss": 0.3521, + "step": 20515 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.3011454778763948, + "learning_rate": 1.325910115169471e-08, + "loss": 0.3576, + "step": 20520 + }, + { + "epoch": 2.951538682772505, + "grad_norm": 0.3137639258571949, + "learning_rate": 1.2874324750424827e-08, + "loss": 0.3481, + "step": 20525 + }, + { + "epoch": 2.9522576934138627, + "grad_norm": 0.30701991466704626, + "learning_rate": 1.2495210329616091e-08, + "loss": 0.3481, + "step": 20530 + }, + { + "epoch": 2.95297670405522, + "grad_norm": 0.32093792697243734, + "learning_rate": 1.212175810419991e-08, + "loss": 0.3516, + "step": 20535 + }, + { + "epoch": 2.9536957146965777, + "grad_norm": 0.3314417785675213, + "learning_rate": 1.1753968285895812e-08, + "loss": 0.3637, + "step": 20540 + }, + { + "epoch": 2.954414725337935, + "grad_norm": 0.31720763388416395, + "learning_rate": 1.1391841083214783e-08, + "loss": 0.3375, + "step": 20545 + }, + { + "epoch": 2.9551337359792926, + "grad_norm": 0.3287030502099959, + "learning_rate": 1.1035376701457046e-08, + "loss": 0.3566, + "step": 20550 + }, + { + "epoch": 2.95585274662065, + "grad_norm": 0.3316071546997749, + "learning_rate": 1.0684575342710946e-08, + "loss": 0.3494, + "step": 20555 + }, + { + "epoch": 2.9565717572620076, + "grad_norm": 0.29597642878693026, + "learning_rate": 1.0339437205857395e-08, + "loss": 0.3406, + "step": 20560 + }, + { + "epoch": 2.957290767903365, + "grad_norm": 0.30733500521657, + "learning_rate": 9.999962486564319e-09, + "loss": 0.3464, + "step": 20565 + }, + { + "epoch": 2.9580097785447226, + "grad_norm": 0.31820148103238727, + "learning_rate": 9.666151377287768e-09, + "loss": 0.3521, + "step": 20570 + }, + { + "epoch": 2.95872878918608, + "grad_norm": 0.3099185849479619, + "learning_rate": 9.338004067277473e-09, + "loss": 0.3527, + "step": 20575 + }, + { + "epoch": 2.9594477998274376, + "grad_norm": 0.3173987262020995, + "learning_rate": 9.01552074256684e-09, + "loss": 0.3618, + "step": 20580 + }, + { + "epoch": 2.960166810468795, + "grad_norm": 0.32166638003526127, + "learning_rate": 8.69870158598074e-09, + "loss": 0.3605, + "step": 20585 + }, + { + "epoch": 2.9608858211101525, + "grad_norm": 0.3210814857400657, + "learning_rate": 8.387546777134382e-09, + "loss": 0.3503, + "step": 20590 + }, + { + "epoch": 2.96160483175151, + "grad_norm": 0.2969681480427528, + "learning_rate": 8.082056492428881e-09, + "loss": 0.3553, + "step": 20595 + }, + { + "epoch": 2.9623238423928675, + "grad_norm": 0.3288645255292419, + "learning_rate": 7.782230905055699e-09, + "loss": 0.355, + "step": 20600 + }, + { + "epoch": 2.9630428530342248, + "grad_norm": 0.3027674170191426, + "learning_rate": 7.488070184995532e-09, + "loss": 0.3662, + "step": 20605 + }, + { + "epoch": 2.9637618636755825, + "grad_norm": 0.3186717157693368, + "learning_rate": 7.1995744990138725e-09, + "loss": 0.3618, + "step": 20610 + }, + { + "epoch": 2.9644808743169397, + "grad_norm": 0.3236458510160991, + "learning_rate": 6.916744010667664e-09, + "loss": 0.3532, + "step": 20615 + }, + { + "epoch": 2.9651998849582974, + "grad_norm": 0.308800845677921, + "learning_rate": 6.639578880303088e-09, + "loss": 0.3481, + "step": 20620 + }, + { + "epoch": 2.9659188955996547, + "grad_norm": 0.31311455469072286, + "learning_rate": 6.3680792650511195e-09, + "loss": 0.3381, + "step": 20625 + }, + { + "epoch": 2.9666379062410124, + "grad_norm": 0.327509452324389, + "learning_rate": 6.102245318833078e-09, + "loss": 0.3658, + "step": 20630 + }, + { + "epoch": 2.9673569168823697, + "grad_norm": 0.3161914892201409, + "learning_rate": 5.842077192357298e-09, + "loss": 0.3611, + "step": 20635 + }, + { + "epoch": 2.9680759275237274, + "grad_norm": 0.30985081917277596, + "learning_rate": 5.587575033121351e-09, + "loss": 0.3539, + "step": 20640 + }, + { + "epoch": 2.968794938165085, + "grad_norm": 0.2981910052886259, + "learning_rate": 5.338738985407599e-09, + "loss": 0.3529, + "step": 20645 + }, + { + "epoch": 2.9695139488064424, + "grad_norm": 0.2944166779019275, + "learning_rate": 5.095569190290972e-09, + "loss": 0.3594, + "step": 20650 + }, + { + "epoch": 2.9702329594477996, + "grad_norm": 0.3085063397705311, + "learning_rate": 4.858065785627863e-09, + "loss": 0.3629, + "step": 20655 + }, + { + "epoch": 2.9709519700891573, + "grad_norm": 0.30295517426485213, + "learning_rate": 4.6262289060683414e-09, + "loss": 0.3572, + "step": 20660 + }, + { + "epoch": 2.971670980730515, + "grad_norm": 0.35923076109615476, + "learning_rate": 4.40005868304727e-09, + "loss": 0.3597, + "step": 20665 + }, + { + "epoch": 2.9723899913718723, + "grad_norm": 0.34893271399115416, + "learning_rate": 4.179555244784306e-09, + "loss": 0.3509, + "step": 20670 + }, + { + "epoch": 2.9731090020132296, + "grad_norm": 0.3143883926628604, + "learning_rate": 3.964718716291671e-09, + "loss": 0.3652, + "step": 20675 + }, + { + "epoch": 2.9738280126545873, + "grad_norm": 0.29830006241925805, + "learning_rate": 3.7555492193641626e-09, + "loss": 0.3589, + "step": 20680 + }, + { + "epoch": 2.974547023295945, + "grad_norm": 0.32414274647085906, + "learning_rate": 3.552046872586923e-09, + "loss": 0.3513, + "step": 20685 + }, + { + "epoch": 2.9752660339373023, + "grad_norm": 0.32909445069510174, + "learning_rate": 3.354211791330997e-09, + "loss": 0.3556, + "step": 20690 + }, + { + "epoch": 2.9759850445786595, + "grad_norm": 0.31026678511950995, + "learning_rate": 3.1620440877544455e-09, + "loss": 0.3606, + "step": 20695 + }, + { + "epoch": 2.9767040552200172, + "grad_norm": 0.305190412044465, + "learning_rate": 2.9755438708034545e-09, + "loss": 0.3598, + "step": 20700 + }, + { + "epoch": 2.977423065861375, + "grad_norm": 0.3194916805504214, + "learning_rate": 2.7947112462078928e-09, + "loss": 0.3553, + "step": 20705 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 0.3057262686960243, + "learning_rate": 2.6195463164901956e-09, + "loss": 0.3426, + "step": 20710 + }, + { + "epoch": 2.9788610871440895, + "grad_norm": 0.30864903152168977, + "learning_rate": 2.4500491809531514e-09, + "loss": 0.3618, + "step": 20715 + }, + { + "epoch": 2.979580097785447, + "grad_norm": 0.3097525173144828, + "learning_rate": 2.286219935689893e-09, + "loss": 0.3465, + "step": 20720 + }, + { + "epoch": 2.980299108426805, + "grad_norm": 0.3069420211958998, + "learning_rate": 2.1280586735816787e-09, + "loss": 0.3487, + "step": 20725 + }, + { + "epoch": 2.981018119068162, + "grad_norm": 0.3177879063527259, + "learning_rate": 1.9755654842923413e-09, + "loss": 0.3618, + "step": 20730 + }, + { + "epoch": 2.9817371297095194, + "grad_norm": 0.3056246865363535, + "learning_rate": 1.8287404542771669e-09, + "loss": 0.3551, + "step": 20735 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 0.3173333176806556, + "learning_rate": 1.6875836667729073e-09, + "loss": 0.3561, + "step": 20740 + }, + { + "epoch": 2.983175150992235, + "grad_norm": 0.30558313989198144, + "learning_rate": 1.5520952018055479e-09, + "loss": 0.3471, + "step": 20745 + }, + { + "epoch": 2.983894161633592, + "grad_norm": 0.3014106514790185, + "learning_rate": 1.4222751361880894e-09, + "loss": 0.345, + "step": 20750 + }, + { + "epoch": 2.98461317227495, + "grad_norm": 0.3088583558020864, + "learning_rate": 1.298123543519436e-09, + "loss": 0.365, + "step": 20755 + }, + { + "epoch": 2.985332182916307, + "grad_norm": 0.30252174119943026, + "learning_rate": 1.1796404941843975e-09, + "loss": 0.3653, + "step": 20760 + }, + { + "epoch": 2.986051193557665, + "grad_norm": 0.31549823464697346, + "learning_rate": 1.0668260553525767e-09, + "loss": 0.3644, + "step": 20765 + }, + { + "epoch": 2.986770204199022, + "grad_norm": 0.3013501884323165, + "learning_rate": 9.59680290983922e-10, + "loss": 0.3503, + "step": 20770 + }, + { + "epoch": 2.9874892148403798, + "grad_norm": 0.3121051698193485, + "learning_rate": 8.582032618220659e-10, + "loss": 0.3694, + "step": 20775 + }, + { + "epoch": 2.988208225481737, + "grad_norm": 0.31331924741224554, + "learning_rate": 7.62395025396545e-10, + "loss": 0.3695, + "step": 20780 + }, + { + "epoch": 2.9889272361230947, + "grad_norm": 0.30108692196709286, + "learning_rate": 6.722556360228006e-10, + "loss": 0.3546, + "step": 20785 + }, + { + "epoch": 2.989646246764452, + "grad_norm": 0.31417699206808497, + "learning_rate": 5.877851448055083e-10, + "loss": 0.3517, + "step": 20790 + }, + { + "epoch": 2.9903652574058097, + "grad_norm": 0.3198470224153279, + "learning_rate": 5.089835996319181e-10, + "loss": 0.3592, + "step": 20795 + }, + { + "epoch": 2.991084268047167, + "grad_norm": 0.3095277946847495, + "learning_rate": 4.3585104517629427e-10, + "loss": 0.3549, + "step": 20800 + }, + { + "epoch": 2.9918032786885247, + "grad_norm": 0.3064379816557907, + "learning_rate": 3.683875229010259e-10, + "loss": 0.3572, + "step": 20805 + }, + { + "epoch": 2.992522289329882, + "grad_norm": 0.31077957491930513, + "learning_rate": 3.0659307105218584e-10, + "loss": 0.3561, + "step": 20810 + }, + { + "epoch": 2.9932412999712397, + "grad_norm": 0.3094845779687916, + "learning_rate": 2.504677246628617e-10, + "loss": 0.3618, + "step": 20815 + }, + { + "epoch": 2.993960310612597, + "grad_norm": 0.30761750000478694, + "learning_rate": 2.0001151555315567e-10, + "loss": 0.3668, + "step": 20820 + }, + { + "epoch": 2.9946793212539546, + "grad_norm": 0.40069067232077094, + "learning_rate": 1.5522447232574345e-10, + "loss": 0.3679, + "step": 20825 + }, + { + "epoch": 2.995398331895312, + "grad_norm": 0.30539391121856113, + "learning_rate": 1.1610662037364607e-10, + "loss": 0.3552, + "step": 20830 + }, + { + "epoch": 2.9961173425366696, + "grad_norm": 0.30632811646513874, + "learning_rate": 8.265798187356844e-11, + "loss": 0.3503, + "step": 20835 + }, + { + "epoch": 2.996836353178027, + "grad_norm": 0.31324750295382386, + "learning_rate": 5.487857578811984e-11, + "loss": 0.3634, + "step": 20840 + }, + { + "epoch": 2.9975553638193846, + "grad_norm": 0.2974223358331177, + "learning_rate": 3.276841786581386e-11, + "loss": 0.3547, + "step": 20845 + }, + { + "epoch": 2.998274374460742, + "grad_norm": 0.3129318240396816, + "learning_rate": 1.6327520642178686e-11, + "loss": 0.3584, + "step": 20850 + }, + { + "epoch": 2.9989933851020996, + "grad_norm": 0.30680068198124816, + "learning_rate": 5.55589343864682e-12, + "loss": 0.3546, + "step": 20855 + }, + { + "epoch": 2.9997123957434573, + "grad_norm": 0.32602309806564345, + "learning_rate": 4.5354236033468e-13, + "loss": 0.3461, + "step": 20860 + }, + { + "epoch": 3.0, + "eval_loss": 0.45375168323516846, + "eval_runtime": 0.5768, + "eval_samples_per_second": 43.343, + "eval_steps_per_second": 1.734, + "step": 20862 + }, + { + "epoch": 3.0, + "step": 20862, + "total_flos": 2049636776804352.0, + "train_loss": 0.42822988295175696, + "train_runtime": 30718.8165, + "train_samples_per_second": 21.729, + "train_steps_per_second": 0.679 + } + ], + "logging_steps": 5, + "max_steps": 20862, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 2087, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2049636776804352.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}