diff --git "a/LORAs/300mb-DB-CodeFeedback-Tinyllama/checkpoint-313052/trainer_state.json" "b/LORAs/300mb-DB-CodeFeedback-Tinyllama/checkpoint-313052/trainer_state.json" new file mode 100644--- /dev/null +++ "b/LORAs/300mb-DB-CodeFeedback-Tinyllama/checkpoint-313052/trainer_state.json" @@ -0,0 +1,219168 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 313052, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.388714973870156e-05, + "grad_norm": 0.3715410828590393, + "learning_rate": 9.999999974822838e-05, + "loss": 1.0516, + "step": 10 + }, + { + "epoch": 0.00012777429947740312, + "grad_norm": 0.5814082026481628, + "learning_rate": 9.99999989929135e-05, + "loss": 1.1442, + "step": 20 + }, + { + "epoch": 0.00019166144921610467, + "grad_norm": 0.5080631375312805, + "learning_rate": 9.999999773405537e-05, + "loss": 0.786, + "step": 30 + }, + { + "epoch": 0.00025554859895480624, + "grad_norm": 0.9986467361450195, + "learning_rate": 9.999999636441773e-05, + "loss": 0.9332, + "step": 40 + }, + { + "epoch": 0.00031943574869350776, + "grad_norm": 0.9622060060501099, + "learning_rate": 9.99999941991818e-05, + "loss": 1.0711, + "step": 50 + }, + { + "epoch": 0.00038332289843220934, + "grad_norm": 0.7242945432662964, + "learning_rate": 9.999999153040267e-05, + "loss": 1.0533, + "step": 60 + }, + { + "epoch": 0.0004472100481709109, + "grad_norm": 0.6160323619842529, + "learning_rate": 9.999998835808037e-05, + "loss": 1.0065, + "step": 70 + }, + { + "epoch": 0.0005110971979096125, + "grad_norm": 1.1578069925308228, + "learning_rate": 9.999998468221492e-05, + "loss": 1.0088, + "step": 80 + }, + { + "epoch": 0.0005749843476483141, + "grad_norm": 0.7251982092857361, + "learning_rate": 9.999998050280638e-05, + "loss": 0.9858, + "step": 90 + }, + { + "epoch": 0.0006388714973870155, + "grad_norm": 0.9950217604637146, + "learning_rate": 
9.999997581985477e-05, + "loss": 1.0171, + "step": 100 + }, + { + "epoch": 0.0007027586471257171, + "grad_norm": 1.055307388305664, + "learning_rate": 9.999997063336015e-05, + "loss": 0.993, + "step": 110 + }, + { + "epoch": 0.0007666457968644187, + "grad_norm": 0.8479915857315063, + "learning_rate": 9.999996494332258e-05, + "loss": 0.9431, + "step": 120 + }, + { + "epoch": 0.0008305329466031203, + "grad_norm": 0.8658767938613892, + "learning_rate": 9.999995874974209e-05, + "loss": 0.8251, + "step": 130 + }, + { + "epoch": 0.0008944200963418218, + "grad_norm": 2.8936755657196045, + "learning_rate": 9.999995205261878e-05, + "loss": 1.2926, + "step": 140 + }, + { + "epoch": 0.0009583072460805234, + "grad_norm": 0.5815268158912659, + "learning_rate": 9.999994485195268e-05, + "loss": 1.0419, + "step": 150 + }, + { + "epoch": 0.001022194395819225, + "grad_norm": 1.1052826642990112, + "learning_rate": 9.999993714774389e-05, + "loss": 1.1736, + "step": 160 + }, + { + "epoch": 0.0010860815455579266, + "grad_norm": 1.4575506448745728, + "learning_rate": 9.999992893999246e-05, + "loss": 1.129, + "step": 170 + }, + { + "epoch": 0.0011499686952966281, + "grad_norm": 0.6741073131561279, + "learning_rate": 9.99999202286985e-05, + "loss": 1.0822, + "step": 180 + }, + { + "epoch": 0.0012138558450353297, + "grad_norm": 0.7011894583702087, + "learning_rate": 9.99999110138621e-05, + "loss": 0.8165, + "step": 190 + }, + { + "epoch": 0.001277742994774031, + "grad_norm": 0.812254011631012, + "learning_rate": 9.999990129548333e-05, + "loss": 0.901, + "step": 200 + }, + { + "epoch": 0.0013416301445127326, + "grad_norm": 2.268691301345825, + "learning_rate": 9.99998910735623e-05, + "loss": 1.0642, + "step": 210 + }, + { + "epoch": 0.0014055172942514342, + "grad_norm": 1.1439971923828125, + "learning_rate": 9.999988034809911e-05, + "loss": 1.0481, + "step": 220 + }, + { + "epoch": 0.0014694044439901358, + "grad_norm": 0.953459620475769, + "learning_rate": 9.999986911909385e-05, + "loss": 
0.9192, + "step": 230 + }, + { + "epoch": 0.0015332915937288374, + "grad_norm": 1.2826868295669556, + "learning_rate": 9.999985738654666e-05, + "loss": 0.9682, + "step": 240 + }, + { + "epoch": 0.001597178743467539, + "grad_norm": 0.9371192455291748, + "learning_rate": 9.999984515045768e-05, + "loss": 1.0604, + "step": 250 + }, + { + "epoch": 0.0016610658932062405, + "grad_norm": 1.085310459136963, + "learning_rate": 9.999983241082698e-05, + "loss": 0.9281, + "step": 260 + }, + { + "epoch": 0.001724953042944942, + "grad_norm": 0.906535267829895, + "learning_rate": 9.99998191676547e-05, + "loss": 0.9776, + "step": 270 + }, + { + "epoch": 0.0017888401926836437, + "grad_norm": 1.267155647277832, + "learning_rate": 9.9999805420941e-05, + "loss": 1.0792, + "step": 280 + }, + { + "epoch": 0.0018527273424223452, + "grad_norm": 0.6740846037864685, + "learning_rate": 9.9999791170686e-05, + "loss": 0.8078, + "step": 290 + }, + { + "epoch": 0.0019166144921610468, + "grad_norm": 1.0244088172912598, + "learning_rate": 9.999977641688985e-05, + "loss": 0.8445, + "step": 300 + }, + { + "epoch": 0.001980501641899748, + "grad_norm": 1.1094986200332642, + "learning_rate": 9.99997611595527e-05, + "loss": 0.9578, + "step": 310 + }, + { + "epoch": 0.00204438879163845, + "grad_norm": 1.1098616123199463, + "learning_rate": 9.99997453986747e-05, + "loss": 0.968, + "step": 320 + }, + { + "epoch": 0.0021082759413771513, + "grad_norm": 0.9596286416053772, + "learning_rate": 9.9999729134256e-05, + "loss": 1.0052, + "step": 330 + }, + { + "epoch": 0.002172163091115853, + "grad_norm": 1.476741075515747, + "learning_rate": 9.999971236629676e-05, + "loss": 1.0505, + "step": 340 + }, + { + "epoch": 0.0022360502408545545, + "grad_norm": 1.221603274345398, + "learning_rate": 9.999969509479718e-05, + "loss": 0.7985, + "step": 350 + }, + { + "epoch": 0.0022999373905932563, + "grad_norm": 1.42596435546875, + "learning_rate": 9.99996773197574e-05, + "loss": 1.0444, + "step": 360 + }, + { + "epoch": 
0.0023638245403319576, + "grad_norm": 2.004958152770996, + "learning_rate": 9.999965904117762e-05, + "loss": 0.9776, + "step": 370 + }, + { + "epoch": 0.0024277116900706594, + "grad_norm": 0.6965352892875671, + "learning_rate": 9.999964025905801e-05, + "loss": 0.8532, + "step": 380 + }, + { + "epoch": 0.0024915988398093608, + "grad_norm": 0.995827853679657, + "learning_rate": 9.999962097339879e-05, + "loss": 1.1039, + "step": 390 + }, + { + "epoch": 0.002555485989548062, + "grad_norm": 4.0594635009765625, + "learning_rate": 9.999960118420011e-05, + "loss": 0.954, + "step": 400 + }, + { + "epoch": 0.002619373139286764, + "grad_norm": 1.0161226987838745, + "learning_rate": 9.99995808914622e-05, + "loss": 0.9513, + "step": 410 + }, + { + "epoch": 0.0026832602890254653, + "grad_norm": 1.1862547397613525, + "learning_rate": 9.999956009518525e-05, + "loss": 0.8693, + "step": 420 + }, + { + "epoch": 0.002747147438764167, + "grad_norm": 1.0898480415344238, + "learning_rate": 9.999953879536947e-05, + "loss": 1.0837, + "step": 430 + }, + { + "epoch": 0.0028110345885028684, + "grad_norm": 0.5398057103157043, + "learning_rate": 9.999951699201509e-05, + "loss": 0.8603, + "step": 440 + }, + { + "epoch": 0.00287492173824157, + "grad_norm": 0.825309157371521, + "learning_rate": 9.999949468512231e-05, + "loss": 1.1857, + "step": 450 + }, + { + "epoch": 0.0029388088879802716, + "grad_norm": 1.3066387176513672, + "learning_rate": 9.999947187469137e-05, + "loss": 0.918, + "step": 460 + }, + { + "epoch": 0.0030026960377189734, + "grad_norm": 1.3756886720657349, + "learning_rate": 9.999944856072248e-05, + "loss": 0.9851, + "step": 470 + }, + { + "epoch": 0.0030665831874576747, + "grad_norm": 1.39496648311615, + "learning_rate": 9.99994247432159e-05, + "loss": 0.9551, + "step": 480 + }, + { + "epoch": 0.0031304703371963765, + "grad_norm": 1.3940093517303467, + "learning_rate": 9.999940042217184e-05, + "loss": 0.9794, + "step": 490 + }, + { + "epoch": 0.003194357486935078, + "grad_norm": 
0.708533763885498, + "learning_rate": 9.999937559759059e-05, + "loss": 0.8929, + "step": 500 + }, + { + "epoch": 0.0032582446366737792, + "grad_norm": 1.2305490970611572, + "learning_rate": 9.999935026947235e-05, + "loss": 0.8819, + "step": 510 + }, + { + "epoch": 0.003322131786412481, + "grad_norm": 1.3446779251098633, + "learning_rate": 9.99993244378174e-05, + "loss": 1.5002, + "step": 520 + }, + { + "epoch": 0.0033860189361511824, + "grad_norm": 1.0329649448394775, + "learning_rate": 9.9999298102626e-05, + "loss": 0.9671, + "step": 530 + }, + { + "epoch": 0.003449906085889884, + "grad_norm": 1.0138870477676392, + "learning_rate": 9.99992712638984e-05, + "loss": 0.9673, + "step": 540 + }, + { + "epoch": 0.0035137932356285855, + "grad_norm": 0.76459139585495, + "learning_rate": 9.999924392163491e-05, + "loss": 1.1123, + "step": 550 + }, + { + "epoch": 0.0035776803853672873, + "grad_norm": 1.9452675580978394, + "learning_rate": 9.999921607583576e-05, + "loss": 0.8708, + "step": 560 + }, + { + "epoch": 0.0036415675351059887, + "grad_norm": 0.7392802834510803, + "learning_rate": 9.999918772650126e-05, + "loss": 1.0164, + "step": 570 + }, + { + "epoch": 0.0037054546848446905, + "grad_norm": 1.3913438320159912, + "learning_rate": 9.999915887363167e-05, + "loss": 1.0721, + "step": 580 + }, + { + "epoch": 0.003769341834583392, + "grad_norm": 0.5592684745788574, + "learning_rate": 9.99991295172273e-05, + "loss": 1.0308, + "step": 590 + }, + { + "epoch": 0.0038332289843220936, + "grad_norm": 1.1413140296936035, + "learning_rate": 9.999909965728845e-05, + "loss": 0.8317, + "step": 600 + }, + { + "epoch": 0.003897116134060795, + "grad_norm": 1.4501404762268066, + "learning_rate": 9.99990692938154e-05, + "loss": 0.8625, + "step": 610 + }, + { + "epoch": 0.003961003283799496, + "grad_norm": 0.7926478981971741, + "learning_rate": 9.999903842680846e-05, + "loss": 1.0851, + "step": 620 + }, + { + "epoch": 0.004024890433538198, + "grad_norm": 2.0299394130706787, + "learning_rate": 
9.999900705626797e-05, + "loss": 0.8804, + "step": 630 + }, + { + "epoch": 0.0040887775832769, + "grad_norm": 0.8396856188774109, + "learning_rate": 9.99989751821942e-05, + "loss": 0.9552, + "step": 640 + }, + { + "epoch": 0.004152664733015601, + "grad_norm": 1.275235891342163, + "learning_rate": 9.999894280458752e-05, + "loss": 0.9358, + "step": 650 + }, + { + "epoch": 0.004216551882754303, + "grad_norm": 0.8721204400062561, + "learning_rate": 9.999890992344821e-05, + "loss": 0.8874, + "step": 660 + }, + { + "epoch": 0.004280439032493004, + "grad_norm": 0.6353357434272766, + "learning_rate": 9.999887653877663e-05, + "loss": 1.1176, + "step": 670 + }, + { + "epoch": 0.004344326182231706, + "grad_norm": 1.0310698747634888, + "learning_rate": 9.999884265057311e-05, + "loss": 0.9272, + "step": 680 + }, + { + "epoch": 0.004408213331970407, + "grad_norm": 0.8742356896400452, + "learning_rate": 9.999880825883798e-05, + "loss": 0.9773, + "step": 690 + }, + { + "epoch": 0.004472100481709109, + "grad_norm": 0.9229012131690979, + "learning_rate": 9.99987733635716e-05, + "loss": 1.0118, + "step": 700 + }, + { + "epoch": 0.004535987631447811, + "grad_norm": 1.0641270875930786, + "learning_rate": 9.999873796477433e-05, + "loss": 0.9236, + "step": 710 + }, + { + "epoch": 0.0045998747811865125, + "grad_norm": 1.2784768342971802, + "learning_rate": 9.99987020624465e-05, + "loss": 1.3926, + "step": 720 + }, + { + "epoch": 0.004663761930925213, + "grad_norm": 0.919906497001648, + "learning_rate": 9.999866565658848e-05, + "loss": 0.9255, + "step": 730 + }, + { + "epoch": 0.004727649080663915, + "grad_norm": 1.3947570323944092, + "learning_rate": 9.999862874720065e-05, + "loss": 0.9953, + "step": 740 + }, + { + "epoch": 0.004791536230402617, + "grad_norm": 1.0191991329193115, + "learning_rate": 9.999859133428338e-05, + "loss": 0.9042, + "step": 750 + }, + { + "epoch": 0.004855423380141319, + "grad_norm": 5.101995944976807, + "learning_rate": 9.999855341783703e-05, + "loss": 1.0483, + 
"step": 760 + }, + { + "epoch": 0.00491931052988002, + "grad_norm": 1.350167155265808, + "learning_rate": 9.9998514997862e-05, + "loss": 1.0955, + "step": 770 + }, + { + "epoch": 0.0049831976796187215, + "grad_norm": 2.278700351715088, + "learning_rate": 9.999847607435866e-05, + "loss": 0.9322, + "step": 780 + }, + { + "epoch": 0.005047084829357423, + "grad_norm": 0.9818449020385742, + "learning_rate": 9.999843664732743e-05, + "loss": 0.8905, + "step": 790 + }, + { + "epoch": 0.005110971979096124, + "grad_norm": 1.0081336498260498, + "learning_rate": 9.999839671676865e-05, + "loss": 1.0194, + "step": 800 + }, + { + "epoch": 0.005174859128834826, + "grad_norm": 1.2605959177017212, + "learning_rate": 9.999835628268279e-05, + "loss": 0.9553, + "step": 810 + }, + { + "epoch": 0.005238746278573528, + "grad_norm": 0.9298542737960815, + "learning_rate": 9.999831534507022e-05, + "loss": 0.7657, + "step": 820 + }, + { + "epoch": 0.00530263342831223, + "grad_norm": 1.2965129613876343, + "learning_rate": 9.999827390393136e-05, + "loss": 0.7605, + "step": 830 + }, + { + "epoch": 0.0053665205780509305, + "grad_norm": 0.6737092137336731, + "learning_rate": 9.999823195926663e-05, + "loss": 1.3731, + "step": 840 + }, + { + "epoch": 0.005430407727789632, + "grad_norm": 1.1260855197906494, + "learning_rate": 9.999818951107644e-05, + "loss": 1.1665, + "step": 850 + }, + { + "epoch": 0.005494294877528334, + "grad_norm": 0.9080353379249573, + "learning_rate": 9.999814655936123e-05, + "loss": 1.053, + "step": 860 + }, + { + "epoch": 0.005558182027267036, + "grad_norm": 0.7714121341705322, + "learning_rate": 9.999810310412146e-05, + "loss": 1.0622, + "step": 870 + }, + { + "epoch": 0.005622069177005737, + "grad_norm": 1.5367814302444458, + "learning_rate": 9.99980591453575e-05, + "loss": 0.832, + "step": 880 + }, + { + "epoch": 0.005685956326744439, + "grad_norm": 0.8397789597511292, + "learning_rate": 9.999801468306984e-05, + "loss": 0.8085, + "step": 890 + }, + { + "epoch": 
0.00574984347648314, + "grad_norm": 1.5233057737350464, + "learning_rate": 9.999796971725892e-05, + "loss": 0.7896, + "step": 900 + }, + { + "epoch": 0.005813730626221841, + "grad_norm": 0.987886369228363, + "learning_rate": 9.99979242479252e-05, + "loss": 1.2048, + "step": 910 + }, + { + "epoch": 0.005877617775960543, + "grad_norm": 0.8574057817459106, + "learning_rate": 9.999787827506911e-05, + "loss": 1.1049, + "step": 920 + }, + { + "epoch": 0.005941504925699245, + "grad_norm": 0.8249441385269165, + "learning_rate": 9.999783179869114e-05, + "loss": 0.8109, + "step": 930 + }, + { + "epoch": 0.006005392075437947, + "grad_norm": 1.311522364616394, + "learning_rate": 9.999778481879175e-05, + "loss": 1.0632, + "step": 940 + }, + { + "epoch": 0.006069279225176648, + "grad_norm": 0.7848984599113464, + "learning_rate": 9.999773733537141e-05, + "loss": 0.8783, + "step": 950 + }, + { + "epoch": 0.006133166374915349, + "grad_norm": 1.3800158500671387, + "learning_rate": 9.999768934843062e-05, + "loss": 1.0681, + "step": 960 + }, + { + "epoch": 0.006197053524654051, + "grad_norm": 2.259437322616577, + "learning_rate": 9.999764085796981e-05, + "loss": 1.0596, + "step": 970 + }, + { + "epoch": 0.006260940674392753, + "grad_norm": 0.876724123954773, + "learning_rate": 9.999759186398951e-05, + "loss": 0.9664, + "step": 980 + }, + { + "epoch": 0.006324827824131454, + "grad_norm": 0.6863592267036438, + "learning_rate": 9.999754236649023e-05, + "loss": 0.9189, + "step": 990 + }, + { + "epoch": 0.006388714973870156, + "grad_norm": 1.7007731199264526, + "learning_rate": 9.999749236547242e-05, + "loss": 1.3958, + "step": 1000 + }, + { + "epoch": 0.0064526021236088575, + "grad_norm": 1.7878336906433105, + "learning_rate": 9.999744186093662e-05, + "loss": 0.689, + "step": 1010 + }, + { + "epoch": 0.0065164892733475584, + "grad_norm": 0.8811324238777161, + "learning_rate": 9.999739085288333e-05, + "loss": 1.0409, + "step": 1020 + }, + { + "epoch": 0.00658037642308626, + "grad_norm": 
0.7681977152824402, + "learning_rate": 9.999733934131305e-05, + "loss": 0.6836, + "step": 1030 + }, + { + "epoch": 0.006644263572824962, + "grad_norm": 0.9528589844703674, + "learning_rate": 9.999728732622631e-05, + "loss": 0.8524, + "step": 1040 + }, + { + "epoch": 0.006708150722563664, + "grad_norm": 0.8264364004135132, + "learning_rate": 9.999723480762365e-05, + "loss": 1.2183, + "step": 1050 + }, + { + "epoch": 0.006772037872302365, + "grad_norm": 0.740313708782196, + "learning_rate": 9.999718178550556e-05, + "loss": 1.0717, + "step": 1060 + }, + { + "epoch": 0.0068359250220410665, + "grad_norm": 1.0919981002807617, + "learning_rate": 9.99971282598726e-05, + "loss": 0.8355, + "step": 1070 + }, + { + "epoch": 0.006899812171779768, + "grad_norm": 1.0758978128433228, + "learning_rate": 9.999707423072531e-05, + "loss": 0.853, + "step": 1080 + }, + { + "epoch": 0.00696369932151847, + "grad_norm": 1.2707561254501343, + "learning_rate": 9.999701969806424e-05, + "loss": 1.0517, + "step": 1090 + }, + { + "epoch": 0.007027586471257171, + "grad_norm": 0.8416491150856018, + "learning_rate": 9.99969646618899e-05, + "loss": 0.9035, + "step": 1100 + }, + { + "epoch": 0.007091473620995873, + "grad_norm": 1.4568763971328735, + "learning_rate": 9.99969091222029e-05, + "loss": 0.8714, + "step": 1110 + }, + { + "epoch": 0.007155360770734575, + "grad_norm": 1.4576863050460815, + "learning_rate": 9.999685307900376e-05, + "loss": 0.8144, + "step": 1120 + }, + { + "epoch": 0.007219247920473276, + "grad_norm": 1.0689126253128052, + "learning_rate": 9.999679653229304e-05, + "loss": 0.925, + "step": 1130 + }, + { + "epoch": 0.007283135070211977, + "grad_norm": 1.1141548156738281, + "learning_rate": 9.999673948207134e-05, + "loss": 1.1567, + "step": 1140 + }, + { + "epoch": 0.007347022219950679, + "grad_norm": 0.8566306829452515, + "learning_rate": 9.999668192833922e-05, + "loss": 0.9069, + "step": 1150 + }, + { + "epoch": 0.007410909369689381, + "grad_norm": 0.7586050629615784, + 
"learning_rate": 9.999662387109728e-05, + "loss": 0.9713, + "step": 1160 + }, + { + "epoch": 0.007474796519428082, + "grad_norm": 0.6867004036903381, + "learning_rate": 9.999656531034604e-05, + "loss": 0.9686, + "step": 1170 + }, + { + "epoch": 0.007538683669166784, + "grad_norm": 0.9020546078681946, + "learning_rate": 9.999650624608617e-05, + "loss": 0.8857, + "step": 1180 + }, + { + "epoch": 0.007602570818905485, + "grad_norm": 0.6556907892227173, + "learning_rate": 9.999644667831822e-05, + "loss": 0.7392, + "step": 1190 + }, + { + "epoch": 0.007666457968644187, + "grad_norm": 0.8906095027923584, + "learning_rate": 9.99963866070428e-05, + "loss": 0.9946, + "step": 1200 + }, + { + "epoch": 0.007730345118382888, + "grad_norm": 0.776619017124176, + "learning_rate": 9.99963260322605e-05, + "loss": 0.8289, + "step": 1210 + }, + { + "epoch": 0.00779423226812159, + "grad_norm": 0.7643131613731384, + "learning_rate": 9.999626495397197e-05, + "loss": 1.1158, + "step": 1220 + }, + { + "epoch": 0.007858119417860292, + "grad_norm": 0.6919121742248535, + "learning_rate": 9.999620337217778e-05, + "loss": 0.713, + "step": 1230 + }, + { + "epoch": 0.007922006567598993, + "grad_norm": 0.9334784150123596, + "learning_rate": 9.999614128687857e-05, + "loss": 1.1716, + "step": 1240 + }, + { + "epoch": 0.007985893717337695, + "grad_norm": 1.4314568042755127, + "learning_rate": 9.999607869807496e-05, + "loss": 1.1029, + "step": 1250 + }, + { + "epoch": 0.008049780867076396, + "grad_norm": 1.5527832508087158, + "learning_rate": 9.99960156057676e-05, + "loss": 0.9211, + "step": 1260 + }, + { + "epoch": 0.008113668016815097, + "grad_norm": 0.7879507541656494, + "learning_rate": 9.999595200995711e-05, + "loss": 1.0019, + "step": 1270 + }, + { + "epoch": 0.0081775551665538, + "grad_norm": 1.07510244846344, + "learning_rate": 9.999588791064412e-05, + "loss": 0.9774, + "step": 1280 + }, + { + "epoch": 0.0082414423162925, + "grad_norm": 0.6843255162239075, + "learning_rate": 
9.999582330782928e-05, + "loss": 0.9584, + "step": 1290 + }, + { + "epoch": 0.008305329466031202, + "grad_norm": 1.3522765636444092, + "learning_rate": 9.999575820151326e-05, + "loss": 0.8153, + "step": 1300 + }, + { + "epoch": 0.008369216615769904, + "grad_norm": 0.546192467212677, + "learning_rate": 9.99956925916967e-05, + "loss": 1.19, + "step": 1310 + }, + { + "epoch": 0.008433103765508605, + "grad_norm": 0.7880367636680603, + "learning_rate": 9.999562647838026e-05, + "loss": 1.017, + "step": 1320 + }, + { + "epoch": 0.008496990915247308, + "grad_norm": 0.9877641201019287, + "learning_rate": 9.999555986156461e-05, + "loss": 1.1224, + "step": 1330 + }, + { + "epoch": 0.008560878064986009, + "grad_norm": 1.541818618774414, + "learning_rate": 9.999549274125042e-05, + "loss": 0.8995, + "step": 1340 + }, + { + "epoch": 0.00862476521472471, + "grad_norm": 0.7599831223487854, + "learning_rate": 9.999542511743836e-05, + "loss": 1.0069, + "step": 1350 + }, + { + "epoch": 0.008688652364463412, + "grad_norm": 1.1491132974624634, + "learning_rate": 9.999535699012912e-05, + "loss": 0.882, + "step": 1360 + }, + { + "epoch": 0.008752539514202113, + "grad_norm": 0.8400082588195801, + "learning_rate": 9.999528835932339e-05, + "loss": 0.9932, + "step": 1370 + }, + { + "epoch": 0.008816426663940814, + "grad_norm": 0.897087037563324, + "learning_rate": 9.999521922502185e-05, + "loss": 0.8435, + "step": 1380 + }, + { + "epoch": 0.008880313813679517, + "grad_norm": 1.469058632850647, + "learning_rate": 9.99951495872252e-05, + "loss": 1.1746, + "step": 1390 + }, + { + "epoch": 0.008944200963418218, + "grad_norm": 0.7353557348251343, + "learning_rate": 9.999507944593413e-05, + "loss": 0.8944, + "step": 1400 + }, + { + "epoch": 0.009008088113156919, + "grad_norm": 0.8597003817558289, + "learning_rate": 9.999500880114938e-05, + "loss": 1.1237, + "step": 1410 + }, + { + "epoch": 0.009071975262895621, + "grad_norm": 1.2774052619934082, + "learning_rate": 9.999493765287164e-05, + "loss": 
1.0234, + "step": 1420 + }, + { + "epoch": 0.009135862412634322, + "grad_norm": 1.0299676656723022, + "learning_rate": 9.99948660011016e-05, + "loss": 1.2595, + "step": 1430 + }, + { + "epoch": 0.009199749562373025, + "grad_norm": 0.6526196002960205, + "learning_rate": 9.999479384584003e-05, + "loss": 0.9003, + "step": 1440 + }, + { + "epoch": 0.009263636712111726, + "grad_norm": 0.9184065461158752, + "learning_rate": 9.999472118708763e-05, + "loss": 0.8782, + "step": 1450 + }, + { + "epoch": 0.009327523861850427, + "grad_norm": 1.0141165256500244, + "learning_rate": 9.999464802484513e-05, + "loss": 0.8616, + "step": 1460 + }, + { + "epoch": 0.00939141101158913, + "grad_norm": 1.9449567794799805, + "learning_rate": 9.999457435911328e-05, + "loss": 0.9921, + "step": 1470 + }, + { + "epoch": 0.00945529816132783, + "grad_norm": 0.9585944414138794, + "learning_rate": 9.99945001898928e-05, + "loss": 0.9861, + "step": 1480 + }, + { + "epoch": 0.009519185311066531, + "grad_norm": 1.201170802116394, + "learning_rate": 9.999442551718448e-05, + "loss": 1.3192, + "step": 1490 + }, + { + "epoch": 0.009583072460805234, + "grad_norm": 0.8674155473709106, + "learning_rate": 9.999435034098901e-05, + "loss": 0.8345, + "step": 1500 + }, + { + "epoch": 0.009646959610543935, + "grad_norm": 1.0349905490875244, + "learning_rate": 9.999427466130721e-05, + "loss": 1.1643, + "step": 1510 + }, + { + "epoch": 0.009710846760282638, + "grad_norm": 0.8286603689193726, + "learning_rate": 9.99941984781398e-05, + "loss": 1.0253, + "step": 1520 + }, + { + "epoch": 0.009774733910021339, + "grad_norm": 1.2230565547943115, + "learning_rate": 9.999412179148756e-05, + "loss": 1.1343, + "step": 1530 + }, + { + "epoch": 0.00983862105976004, + "grad_norm": 0.7413927912712097, + "learning_rate": 9.999404460135126e-05, + "loss": 0.9257, + "step": 1540 + }, + { + "epoch": 0.009902508209498742, + "grad_norm": 1.2482092380523682, + "learning_rate": 9.999396690773169e-05, + "loss": 1.2573, + "step": 1550 + }, + 
{ + "epoch": 0.009966395359237443, + "grad_norm": 1.8260524272918701, + "learning_rate": 9.99938887106296e-05, + "loss": 0.9978, + "step": 1560 + }, + { + "epoch": 0.010030282508976144, + "grad_norm": 0.7294577956199646, + "learning_rate": 9.999381001004582e-05, + "loss": 0.8249, + "step": 1570 + }, + { + "epoch": 0.010094169658714847, + "grad_norm": 0.8026980757713318, + "learning_rate": 9.999373080598112e-05, + "loss": 0.9687, + "step": 1580 + }, + { + "epoch": 0.010158056808453548, + "grad_norm": 0.9354428052902222, + "learning_rate": 9.99936510984363e-05, + "loss": 1.0309, + "step": 1590 + }, + { + "epoch": 0.010221943958192248, + "grad_norm": 1.3766313791275024, + "learning_rate": 9.999357088741216e-05, + "loss": 1.0006, + "step": 1600 + }, + { + "epoch": 0.010285831107930951, + "grad_norm": 0.6556980609893799, + "learning_rate": 9.999349017290951e-05, + "loss": 0.9616, + "step": 1610 + }, + { + "epoch": 0.010349718257669652, + "grad_norm": 0.6386595368385315, + "learning_rate": 9.999340895492917e-05, + "loss": 0.9685, + "step": 1620 + }, + { + "epoch": 0.010413605407408355, + "grad_norm": 0.859089195728302, + "learning_rate": 9.999332723347194e-05, + "loss": 1.1593, + "step": 1630 + }, + { + "epoch": 0.010477492557147056, + "grad_norm": 0.8099786043167114, + "learning_rate": 9.999324500853866e-05, + "loss": 1.1586, + "step": 1640 + }, + { + "epoch": 0.010541379706885757, + "grad_norm": 0.8301547169685364, + "learning_rate": 9.999316228013016e-05, + "loss": 0.9436, + "step": 1650 + }, + { + "epoch": 0.01060526685662446, + "grad_norm": 1.2509781122207642, + "learning_rate": 9.999307904824725e-05, + "loss": 0.8458, + "step": 1660 + }, + { + "epoch": 0.01066915400636316, + "grad_norm": 1.0006517171859741, + "learning_rate": 9.99929953128908e-05, + "loss": 0.9221, + "step": 1670 + }, + { + "epoch": 0.010733041156101861, + "grad_norm": 0.8921092748641968, + "learning_rate": 9.999291107406163e-05, + "loss": 0.918, + "step": 1680 + }, + { + "epoch": 
0.010796928305840564, + "grad_norm": 0.8920373916625977, + "learning_rate": 9.999282633176059e-05, + "loss": 1.059, + "step": 1690 + }, + { + "epoch": 0.010860815455579265, + "grad_norm": 0.7163852453231812, + "learning_rate": 9.999274108598854e-05, + "loss": 1.1965, + "step": 1700 + }, + { + "epoch": 0.010924702605317966, + "grad_norm": 0.7184985876083374, + "learning_rate": 9.999265533674635e-05, + "loss": 1.1157, + "step": 1710 + }, + { + "epoch": 0.010988589755056668, + "grad_norm": 1.6878572702407837, + "learning_rate": 9.999256908403485e-05, + "loss": 0.7872, + "step": 1720 + }, + { + "epoch": 0.01105247690479537, + "grad_norm": 2.4965457916259766, + "learning_rate": 9.999248232785494e-05, + "loss": 0.7284, + "step": 1730 + }, + { + "epoch": 0.011116364054534072, + "grad_norm": 0.6647805571556091, + "learning_rate": 9.999239506820749e-05, + "loss": 1.0634, + "step": 1740 + }, + { + "epoch": 0.011180251204272773, + "grad_norm": 1.112949252128601, + "learning_rate": 9.999230730509337e-05, + "loss": 1.0865, + "step": 1750 + }, + { + "epoch": 0.011244138354011474, + "grad_norm": 0.7501624822616577, + "learning_rate": 9.999221903851346e-05, + "loss": 1.0212, + "step": 1760 + }, + { + "epoch": 0.011308025503750176, + "grad_norm": 0.6178969144821167, + "learning_rate": 9.999213026846865e-05, + "loss": 0.9825, + "step": 1770 + }, + { + "epoch": 0.011371912653488877, + "grad_norm": 0.7546608448028564, + "learning_rate": 9.999204099495984e-05, + "loss": 0.8365, + "step": 1780 + }, + { + "epoch": 0.011435799803227578, + "grad_norm": 0.6355531811714172, + "learning_rate": 9.999195121798795e-05, + "loss": 1.1684, + "step": 1790 + }, + { + "epoch": 0.01149968695296628, + "grad_norm": 1.0356401205062866, + "learning_rate": 9.999186093755385e-05, + "loss": 1.0752, + "step": 1800 + }, + { + "epoch": 0.011563574102704982, + "grad_norm": 0.9333721399307251, + "learning_rate": 9.999177015365844e-05, + "loss": 0.9288, + "step": 1810 + }, + { + "epoch": 0.011627461252443683, + 
"grad_norm": 0.9251835942268372, + "learning_rate": 9.999167886630269e-05, + "loss": 0.748, + "step": 1820 + }, + { + "epoch": 0.011691348402182385, + "grad_norm": 1.0885391235351562, + "learning_rate": 9.999158707548745e-05, + "loss": 1.1773, + "step": 1830 + }, + { + "epoch": 0.011755235551921086, + "grad_norm": 1.2786647081375122, + "learning_rate": 9.99914947812137e-05, + "loss": 1.1812, + "step": 1840 + }, + { + "epoch": 0.011819122701659789, + "grad_norm": 0.6569780111312866, + "learning_rate": 9.999140198348236e-05, + "loss": 0.989, + "step": 1850 + }, + { + "epoch": 0.01188300985139849, + "grad_norm": 1.241723656654358, + "learning_rate": 9.999130868229434e-05, + "loss": 1.0771, + "step": 1860 + }, + { + "epoch": 0.01194689700113719, + "grad_norm": 1.3552509546279907, + "learning_rate": 9.999121487765058e-05, + "loss": 1.0246, + "step": 1870 + }, + { + "epoch": 0.012010784150875893, + "grad_norm": 0.6326724290847778, + "learning_rate": 9.999112056955205e-05, + "loss": 0.9514, + "step": 1880 + }, + { + "epoch": 0.012074671300614594, + "grad_norm": 2.2786476612091064, + "learning_rate": 9.99910257579997e-05, + "loss": 0.6857, + "step": 1890 + }, + { + "epoch": 0.012138558450353295, + "grad_norm": 1.0282983779907227, + "learning_rate": 9.999093044299446e-05, + "loss": 0.7788, + "step": 1900 + }, + { + "epoch": 0.012202445600091998, + "grad_norm": 1.1858989000320435, + "learning_rate": 9.999083462453728e-05, + "loss": 0.9619, + "step": 1910 + }, + { + "epoch": 0.012266332749830699, + "grad_norm": 0.6922428011894226, + "learning_rate": 9.999073830262918e-05, + "loss": 1.1683, + "step": 1920 + }, + { + "epoch": 0.0123302198995694, + "grad_norm": 0.6754278540611267, + "learning_rate": 9.999064147727109e-05, + "loss": 0.7358, + "step": 1930 + }, + { + "epoch": 0.012394107049308102, + "grad_norm": 0.7409210205078125, + "learning_rate": 9.999054414846398e-05, + "loss": 1.1866, + "step": 1940 + }, + { + "epoch": 0.012457994199046803, + "grad_norm": 0.8322914242744446, 
+ "learning_rate": 9.999044631620887e-05, + "loss": 0.7945, + "step": 1950 + }, + { + "epoch": 0.012521881348785506, + "grad_norm": 1.1325633525848389, + "learning_rate": 9.999034798050668e-05, + "loss": 0.9324, + "step": 1960 + }, + { + "epoch": 0.012585768498524207, + "grad_norm": 0.9204065203666687, + "learning_rate": 9.999024914135846e-05, + "loss": 0.8747, + "step": 1970 + }, + { + "epoch": 0.012649655648262908, + "grad_norm": 1.3020517826080322, + "learning_rate": 9.999014979876517e-05, + "loss": 1.0649, + "step": 1980 + }, + { + "epoch": 0.01271354279800161, + "grad_norm": 1.0476547479629517, + "learning_rate": 9.999004995272785e-05, + "loss": 1.0729, + "step": 1990 + }, + { + "epoch": 0.012777429947740311, + "grad_norm": 0.8980121612548828, + "learning_rate": 9.998994960324746e-05, + "loss": 0.8566, + "step": 2000 + }, + { + "epoch": 0.012841317097479012, + "grad_norm": 2.678067684173584, + "learning_rate": 9.998984875032503e-05, + "loss": 1.01, + "step": 2010 + }, + { + "epoch": 0.012905204247217715, + "grad_norm": 1.1093647480010986, + "learning_rate": 9.998974739396159e-05, + "loss": 0.886, + "step": 2020 + }, + { + "epoch": 0.012969091396956416, + "grad_norm": 0.5292948484420776, + "learning_rate": 9.998964553415813e-05, + "loss": 0.9973, + "step": 2030 + }, + { + "epoch": 0.013032978546695117, + "grad_norm": 1.0876336097717285, + "learning_rate": 9.998954317091568e-05, + "loss": 1.09, + "step": 2040 + }, + { + "epoch": 0.01309686569643382, + "grad_norm": 1.7153469324111938, + "learning_rate": 9.998944030423531e-05, + "loss": 0.8236, + "step": 2050 + }, + { + "epoch": 0.01316075284617252, + "grad_norm": 0.5473589897155762, + "learning_rate": 9.998933693411802e-05, + "loss": 0.8271, + "step": 2060 + }, + { + "epoch": 0.013224639995911223, + "grad_norm": 0.930847704410553, + "learning_rate": 9.998923306056487e-05, + "loss": 1.0062, + "step": 2070 + }, + { + "epoch": 0.013288527145649924, + "grad_norm": 1.016547679901123, + "learning_rate": 
9.998912868357688e-05, + "loss": 0.9092, + "step": 2080 + }, + { + "epoch": 0.013352414295388625, + "grad_norm": 0.8655534386634827, + "learning_rate": 9.99890238031551e-05, + "loss": 0.8901, + "step": 2090 + }, + { + "epoch": 0.013416301445127328, + "grad_norm": 0.7575225830078125, + "learning_rate": 9.998891841930064e-05, + "loss": 1.1021, + "step": 2100 + }, + { + "epoch": 0.013480188594866029, + "grad_norm": 0.8108758330345154, + "learning_rate": 9.998881253201452e-05, + "loss": 1.0897, + "step": 2110 + }, + { + "epoch": 0.01354407574460473, + "grad_norm": 1.2894190549850464, + "learning_rate": 9.998870614129781e-05, + "loss": 1.1317, + "step": 2120 + }, + { + "epoch": 0.013607962894343432, + "grad_norm": 1.173697590827942, + "learning_rate": 9.998859924715157e-05, + "loss": 0.7373, + "step": 2130 + }, + { + "epoch": 0.013671850044082133, + "grad_norm": 0.7047708034515381, + "learning_rate": 9.998849184957689e-05, + "loss": 0.7464, + "step": 2140 + }, + { + "epoch": 0.013735737193820836, + "grad_norm": 0.7167409062385559, + "learning_rate": 9.998838394857486e-05, + "loss": 0.9529, + "step": 2150 + }, + { + "epoch": 0.013799624343559537, + "grad_norm": 0.8524914383888245, + "learning_rate": 9.998827554414656e-05, + "loss": 1.0282, + "step": 2160 + }, + { + "epoch": 0.013863511493298238, + "grad_norm": 0.7894335389137268, + "learning_rate": 9.998816663629307e-05, + "loss": 1.0432, + "step": 2170 + }, + { + "epoch": 0.01392739864303694, + "grad_norm": 0.7883844971656799, + "learning_rate": 9.99880572250155e-05, + "loss": 1.1062, + "step": 2180 + }, + { + "epoch": 0.013991285792775641, + "grad_norm": 1.115862250328064, + "learning_rate": 9.998794731031494e-05, + "loss": 1.0593, + "step": 2190 + }, + { + "epoch": 0.014055172942514342, + "grad_norm": 0.5956576466560364, + "learning_rate": 9.998783689219251e-05, + "loss": 0.8832, + "step": 2200 + }, + { + "epoch": 0.014119060092253045, + "grad_norm": 1.0389795303344727, + "learning_rate": 9.998773708545755e-05, + 
"loss": 0.9955, + "step": 2210 + }, + { + "epoch": 0.014182947241991746, + "grad_norm": 1.3539459705352783, + "learning_rate": 9.998762571083662e-05, + "loss": 0.9878, + "step": 2220 + }, + { + "epoch": 0.014246834391730447, + "grad_norm": 1.031422734260559, + "learning_rate": 9.998751383279706e-05, + "loss": 0.9666, + "step": 2230 + }, + { + "epoch": 0.01431072154146915, + "grad_norm": 0.5059804320335388, + "learning_rate": 9.998740145134e-05, + "loss": 0.7655, + "step": 2240 + }, + { + "epoch": 0.01437460869120785, + "grad_norm": 0.7834402322769165, + "learning_rate": 9.998728856646656e-05, + "loss": 1.0262, + "step": 2250 + }, + { + "epoch": 0.014438495840946553, + "grad_norm": 0.7399794459342957, + "learning_rate": 9.998717517817786e-05, + "loss": 0.982, + "step": 2260 + }, + { + "epoch": 0.014502382990685254, + "grad_norm": 0.7037153840065002, + "learning_rate": 9.998706128647508e-05, + "loss": 0.7902, + "step": 2270 + }, + { + "epoch": 0.014566270140423955, + "grad_norm": 0.8694613575935364, + "learning_rate": 9.998694689135934e-05, + "loss": 1.0659, + "step": 2280 + }, + { + "epoch": 0.014630157290162657, + "grad_norm": 1.4297699928283691, + "learning_rate": 9.99868319928318e-05, + "loss": 1.0498, + "step": 2290 + }, + { + "epoch": 0.014694044439901358, + "grad_norm": 1.0179654359817505, + "learning_rate": 9.998671659089361e-05, + "loss": 0.9041, + "step": 2300 + }, + { + "epoch": 0.01475793158964006, + "grad_norm": 0.9118665456771851, + "learning_rate": 9.998660068554596e-05, + "loss": 1.0452, + "step": 2310 + }, + { + "epoch": 0.014821818739378762, + "grad_norm": 1.0615768432617188, + "learning_rate": 9.998649594031891e-05, + "loss": 0.9364, + "step": 2320 + }, + { + "epoch": 0.014885705889117463, + "grad_norm": 1.8446980714797974, + "learning_rate": 9.998637907849646e-05, + "loss": 1.0038, + "step": 2330 + }, + { + "epoch": 0.014949593038856164, + "grad_norm": 1.1372798681259155, + "learning_rate": 9.998626171326792e-05, + "loss": 1.0814, + "step": 2340 + 
}, + { + "epoch": 0.015013480188594866, + "grad_norm": 1.2520413398742676, + "learning_rate": 9.998614384463449e-05, + "loss": 0.9373, + "step": 2350 + }, + { + "epoch": 0.015077367338333567, + "grad_norm": 0.7592064738273621, + "learning_rate": 9.998602547259734e-05, + "loss": 1.0605, + "step": 2360 + }, + { + "epoch": 0.01514125448807227, + "grad_norm": 0.8538485169410706, + "learning_rate": 9.998590659715766e-05, + "loss": 0.8727, + "step": 2370 + }, + { + "epoch": 0.01520514163781097, + "grad_norm": 0.7715469002723694, + "learning_rate": 9.998578721831666e-05, + "loss": 1.0918, + "step": 2380 + }, + { + "epoch": 0.015269028787549672, + "grad_norm": 1.0266464948654175, + "learning_rate": 9.998566733607554e-05, + "loss": 1.0816, + "step": 2390 + }, + { + "epoch": 0.015332915937288374, + "grad_norm": 0.564927339553833, + "learning_rate": 9.998554695043552e-05, + "loss": 0.8394, + "step": 2400 + }, + { + "epoch": 0.015396803087027075, + "grad_norm": 1.2067440748214722, + "learning_rate": 9.998542606139779e-05, + "loss": 1.1371, + "step": 2410 + }, + { + "epoch": 0.015460690236765776, + "grad_norm": 1.1786682605743408, + "learning_rate": 9.998530466896357e-05, + "loss": 0.9845, + "step": 2420 + }, + { + "epoch": 0.015524577386504479, + "grad_norm": 3.820138454437256, + "learning_rate": 9.99851827731341e-05, + "loss": 0.9517, + "step": 2430 + }, + { + "epoch": 0.01558846453624318, + "grad_norm": 0.8492526412010193, + "learning_rate": 9.998506037391058e-05, + "loss": 0.989, + "step": 2440 + }, + { + "epoch": 0.01565235168598188, + "grad_norm": 1.1744376420974731, + "learning_rate": 9.998493747129428e-05, + "loss": 0.8713, + "step": 2450 + }, + { + "epoch": 0.015716238835720583, + "grad_norm": 1.1239817142486572, + "learning_rate": 9.99848140652864e-05, + "loss": 0.801, + "step": 2460 + }, + { + "epoch": 0.015780125985459286, + "grad_norm": 0.8037886023521423, + "learning_rate": 9.99846901558882e-05, + "loss": 0.8116, + "step": 2470 + }, + { + "epoch": 
0.015844013135197985, + "grad_norm": 0.9169192314147949, + "learning_rate": 9.998456574310094e-05, + "loss": 1.0343, + "step": 2480 + }, + { + "epoch": 0.015907900284936688, + "grad_norm": 0.7503566145896912, + "learning_rate": 9.998444082692585e-05, + "loss": 1.0077, + "step": 2490 + }, + { + "epoch": 0.01597178743467539, + "grad_norm": 1.1476398706436157, + "learning_rate": 9.99843154073642e-05, + "loss": 0.8427, + "step": 2500 + }, + { + "epoch": 0.01603567458441409, + "grad_norm": 0.7474212646484375, + "learning_rate": 9.998418948441726e-05, + "loss": 0.7488, + "step": 2510 + }, + { + "epoch": 0.016099561734152792, + "grad_norm": 0.9779971837997437, + "learning_rate": 9.998406305808627e-05, + "loss": 0.8778, + "step": 2520 + }, + { + "epoch": 0.016163448883891495, + "grad_norm": 1.0902825593948364, + "learning_rate": 9.998393612837254e-05, + "loss": 1.2649, + "step": 2530 + }, + { + "epoch": 0.016227336033630194, + "grad_norm": 0.9004558324813843, + "learning_rate": 9.998380869527732e-05, + "loss": 0.7976, + "step": 2540 + }, + { + "epoch": 0.016291223183368897, + "grad_norm": 0.8847173452377319, + "learning_rate": 9.998368075880192e-05, + "loss": 0.9168, + "step": 2550 + }, + { + "epoch": 0.0163551103331076, + "grad_norm": 1.1703412532806396, + "learning_rate": 9.99835523189476e-05, + "loss": 0.9307, + "step": 2560 + }, + { + "epoch": 0.0164189974828463, + "grad_norm": 0.7630004286766052, + "learning_rate": 9.998342337571565e-05, + "loss": 0.8969, + "step": 2570 + }, + { + "epoch": 0.016482884632585, + "grad_norm": 0.9424830079078674, + "learning_rate": 9.998329392910741e-05, + "loss": 1.0097, + "step": 2580 + }, + { + "epoch": 0.016546771782323704, + "grad_norm": 0.891345739364624, + "learning_rate": 9.998316397912415e-05, + "loss": 0.9626, + "step": 2590 + }, + { + "epoch": 0.016610658932062403, + "grad_norm": 0.7180986404418945, + "learning_rate": 9.998303352576719e-05, + "loss": 0.9426, + "step": 2600 + }, + { + "epoch": 0.016674546081801106, + 
"grad_norm": 1.2385119199752808, + "learning_rate": 9.998290256903784e-05, + "loss": 0.7992, + "step": 2610 + }, + { + "epoch": 0.01673843323153981, + "grad_norm": 0.9304938316345215, + "learning_rate": 9.998277110893741e-05, + "loss": 1.1183, + "step": 2620 + }, + { + "epoch": 0.01680232038127851, + "grad_norm": 4.244834899902344, + "learning_rate": 9.998263914546724e-05, + "loss": 1.1446, + "step": 2630 + }, + { + "epoch": 0.01686620753101721, + "grad_norm": 1.0744621753692627, + "learning_rate": 9.998250667862868e-05, + "loss": 0.7592, + "step": 2640 + }, + { + "epoch": 0.016930094680755913, + "grad_norm": 1.1547142267227173, + "learning_rate": 9.9982373708423e-05, + "loss": 0.9098, + "step": 2650 + }, + { + "epoch": 0.016993981830494616, + "grad_norm": 0.8676884770393372, + "learning_rate": 9.998224023485159e-05, + "loss": 1.1234, + "step": 2660 + }, + { + "epoch": 0.017057868980233315, + "grad_norm": 1.3594059944152832, + "learning_rate": 9.998210625791578e-05, + "loss": 0.7285, + "step": 2670 + }, + { + "epoch": 0.017121756129972018, + "grad_norm": 0.9443914294242859, + "learning_rate": 9.998197177761692e-05, + "loss": 1.0057, + "step": 2680 + }, + { + "epoch": 0.01718564327971072, + "grad_norm": 0.7387935519218445, + "learning_rate": 9.998183679395636e-05, + "loss": 0.8873, + "step": 2690 + }, + { + "epoch": 0.01724953042944942, + "grad_norm": 0.9435983896255493, + "learning_rate": 9.998170130693545e-05, + "loss": 0.8891, + "step": 2700 + }, + { + "epoch": 0.017313417579188122, + "grad_norm": 1.0034334659576416, + "learning_rate": 9.998156531655557e-05, + "loss": 1.0039, + "step": 2710 + }, + { + "epoch": 0.017377304728926825, + "grad_norm": 1.2125136852264404, + "learning_rate": 9.99814288228181e-05, + "loss": 0.7617, + "step": 2720 + }, + { + "epoch": 0.017441191878665524, + "grad_norm": 0.9862277507781982, + "learning_rate": 9.998129182572442e-05, + "loss": 0.8764, + "step": 2730 + }, + { + "epoch": 0.017505079028404227, + "grad_norm": 1.1421021223068237, 
+ "learning_rate": 9.998115432527586e-05, + "loss": 0.9241, + "step": 2740 + }, + { + "epoch": 0.01756896617814293, + "grad_norm": 0.8746705651283264, + "learning_rate": 9.998101632147385e-05, + "loss": 0.9238, + "step": 2750 + }, + { + "epoch": 0.01763285332788163, + "grad_norm": 0.6663450002670288, + "learning_rate": 9.998087781431977e-05, + "loss": 1.0525, + "step": 2760 + }, + { + "epoch": 0.01769674047762033, + "grad_norm": 1.4795788526535034, + "learning_rate": 9.9980738803815e-05, + "loss": 0.9025, + "step": 2770 + }, + { + "epoch": 0.017760627627359034, + "grad_norm": 0.7279462218284607, + "learning_rate": 9.998059928996095e-05, + "loss": 1.1858, + "step": 2780 + }, + { + "epoch": 0.017824514777097733, + "grad_norm": 0.7917711138725281, + "learning_rate": 9.998045927275903e-05, + "loss": 0.9119, + "step": 2790 + }, + { + "epoch": 0.017888401926836436, + "grad_norm": 1.2472501993179321, + "learning_rate": 9.998031875221065e-05, + "loss": 0.839, + "step": 2800 + }, + { + "epoch": 0.01795228907657514, + "grad_norm": 0.9328956604003906, + "learning_rate": 9.998017772831723e-05, + "loss": 0.9749, + "step": 2810 + }, + { + "epoch": 0.018016176226313838, + "grad_norm": 0.732351541519165, + "learning_rate": 9.998003620108017e-05, + "loss": 1.0359, + "step": 2820 + }, + { + "epoch": 0.01808006337605254, + "grad_norm": 0.8829627633094788, + "learning_rate": 9.99798941705009e-05, + "loss": 0.9921, + "step": 2830 + }, + { + "epoch": 0.018143950525791243, + "grad_norm": 0.7300599813461304, + "learning_rate": 9.997975163658086e-05, + "loss": 0.9041, + "step": 2840 + }, + { + "epoch": 0.018207837675529946, + "grad_norm": 1.0057677030563354, + "learning_rate": 9.997960859932148e-05, + "loss": 1.1656, + "step": 2850 + }, + { + "epoch": 0.018271724825268645, + "grad_norm": 0.6405202746391296, + "learning_rate": 9.997946505872421e-05, + "loss": 0.7273, + "step": 2860 + }, + { + "epoch": 0.018335611975007347, + "grad_norm": 1.383867621421814, + "learning_rate": 
9.997932101479049e-05, + "loss": 1.0818, + "step": 2870 + }, + { + "epoch": 0.01839949912474605, + "grad_norm": 2.046144723892212, + "learning_rate": 9.997917646752175e-05, + "loss": 1.0075, + "step": 2880 + }, + { + "epoch": 0.01846338627448475, + "grad_norm": 0.6531755924224854, + "learning_rate": 9.99790314169195e-05, + "loss": 0.978, + "step": 2890 + }, + { + "epoch": 0.018527273424223452, + "grad_norm": 0.8605973720550537, + "learning_rate": 9.997888586298514e-05, + "loss": 1.0424, + "step": 2900 + }, + { + "epoch": 0.018591160573962155, + "grad_norm": 1.2451750040054321, + "learning_rate": 9.997873980572017e-05, + "loss": 0.9909, + "step": 2910 + }, + { + "epoch": 0.018655047723700854, + "grad_norm": 1.1829801797866821, + "learning_rate": 9.997859324512604e-05, + "loss": 0.8175, + "step": 2920 + }, + { + "epoch": 0.018718934873439556, + "grad_norm": 1.987342357635498, + "learning_rate": 9.997844618120424e-05, + "loss": 1.1086, + "step": 2930 + }, + { + "epoch": 0.01878282202317826, + "grad_norm": 1.5796905755996704, + "learning_rate": 9.997829861395627e-05, + "loss": 0.9863, + "step": 2940 + }, + { + "epoch": 0.018846709172916958, + "grad_norm": 0.5378701686859131, + "learning_rate": 9.997815054338357e-05, + "loss": 0.7471, + "step": 2950 + }, + { + "epoch": 0.01891059632265566, + "grad_norm": 1.4551935195922852, + "learning_rate": 9.997800196948768e-05, + "loss": 1.1466, + "step": 2960 + }, + { + "epoch": 0.018974483472394364, + "grad_norm": 0.5287359356880188, + "learning_rate": 9.997785289227007e-05, + "loss": 0.8842, + "step": 2970 + }, + { + "epoch": 0.019038370622133063, + "grad_norm": 0.6062310338020325, + "learning_rate": 9.997770331173221e-05, + "loss": 0.8015, + "step": 2980 + }, + { + "epoch": 0.019102257771871765, + "grad_norm": 0.9560365676879883, + "learning_rate": 9.997755322787568e-05, + "loss": 1.1405, + "step": 2990 + }, + { + "epoch": 0.019166144921610468, + "grad_norm": 0.7935013175010681, + "learning_rate": 9.997740264070194e-05, + 
"loss": 0.8133, + "step": 3000 + }, + { + "epoch": 0.019230032071349167, + "grad_norm": 0.8417540788650513, + "learning_rate": 9.997725155021253e-05, + "loss": 0.8547, + "step": 3010 + }, + { + "epoch": 0.01929391922108787, + "grad_norm": 0.5501998662948608, + "learning_rate": 9.997709995640894e-05, + "loss": 0.9299, + "step": 3020 + }, + { + "epoch": 0.019357806370826573, + "grad_norm": 0.821506917476654, + "learning_rate": 9.997694785929273e-05, + "loss": 0.9835, + "step": 3030 + }, + { + "epoch": 0.019421693520565275, + "grad_norm": 0.817926287651062, + "learning_rate": 9.997679525886541e-05, + "loss": 1.2224, + "step": 3040 + }, + { + "epoch": 0.019485580670303974, + "grad_norm": 2.5229651927948, + "learning_rate": 9.997664215512854e-05, + "loss": 1.0535, + "step": 3050 + }, + { + "epoch": 0.019549467820042677, + "grad_norm": 0.8168900609016418, + "learning_rate": 9.997648854808364e-05, + "loss": 1.0088, + "step": 3060 + }, + { + "epoch": 0.01961335496978138, + "grad_norm": 0.522985577583313, + "learning_rate": 9.997633443773226e-05, + "loss": 0.9106, + "step": 3070 + }, + { + "epoch": 0.01967724211952008, + "grad_norm": 0.5633349418640137, + "learning_rate": 9.997617982407595e-05, + "loss": 0.9174, + "step": 3080 + }, + { + "epoch": 0.01974112926925878, + "grad_norm": 2.293459892272949, + "learning_rate": 9.997602470711628e-05, + "loss": 0.8805, + "step": 3090 + }, + { + "epoch": 0.019805016418997484, + "grad_norm": 0.6353404521942139, + "learning_rate": 9.997586908685481e-05, + "loss": 0.9407, + "step": 3100 + }, + { + "epoch": 0.019868903568736183, + "grad_norm": 0.6325660943984985, + "learning_rate": 9.997571296329312e-05, + "loss": 0.6832, + "step": 3110 + }, + { + "epoch": 0.019932790718474886, + "grad_norm": 0.7705810070037842, + "learning_rate": 9.997555633643274e-05, + "loss": 0.7368, + "step": 3120 + }, + { + "epoch": 0.01999667786821359, + "grad_norm": 0.601768434047699, + "learning_rate": 9.997539920627527e-05, + "loss": 1.0854, + "step": 3130 + }, 
+ { + "epoch": 0.020060565017952288, + "grad_norm": 1.055450439453125, + "learning_rate": 9.997524157282231e-05, + "loss": 1.1712, + "step": 3140 + }, + { + "epoch": 0.02012445216769099, + "grad_norm": 0.5919578671455383, + "learning_rate": 9.997508343607542e-05, + "loss": 0.9698, + "step": 3150 + }, + { + "epoch": 0.020188339317429693, + "grad_norm": 1.1966851949691772, + "learning_rate": 9.997492479603623e-05, + "loss": 0.9131, + "step": 3160 + }, + { + "epoch": 0.020252226467168392, + "grad_norm": 0.5245844721794128, + "learning_rate": 9.997476565270629e-05, + "loss": 0.9533, + "step": 3170 + }, + { + "epoch": 0.020316113616907095, + "grad_norm": 0.6640262603759766, + "learning_rate": 9.997460600608723e-05, + "loss": 0.954, + "step": 3180 + }, + { + "epoch": 0.020380000766645798, + "grad_norm": 1.1632764339447021, + "learning_rate": 9.997444585618066e-05, + "loss": 0.9693, + "step": 3190 + }, + { + "epoch": 0.020443887916384497, + "grad_norm": 0.8746532797813416, + "learning_rate": 9.997428520298817e-05, + "loss": 0.9353, + "step": 3200 + }, + { + "epoch": 0.0205077750661232, + "grad_norm": 0.7248082756996155, + "learning_rate": 9.997412404651141e-05, + "loss": 1.0746, + "step": 3210 + }, + { + "epoch": 0.020571662215861902, + "grad_norm": 1.0290027856826782, + "learning_rate": 9.997396238675198e-05, + "loss": 1.013, + "step": 3220 + }, + { + "epoch": 0.0206355493656006, + "grad_norm": 1.3203686475753784, + "learning_rate": 9.997380022371153e-05, + "loss": 0.9819, + "step": 3230 + }, + { + "epoch": 0.020699436515339304, + "grad_norm": 1.1412265300750732, + "learning_rate": 9.997363755739166e-05, + "loss": 0.756, + "step": 3240 + }, + { + "epoch": 0.020763323665078007, + "grad_norm": 1.012272596359253, + "learning_rate": 9.997347438779403e-05, + "loss": 0.8896, + "step": 3250 + }, + { + "epoch": 0.02082721081481671, + "grad_norm": 0.6581144332885742, + "learning_rate": 9.997331071492028e-05, + "loss": 0.9093, + "step": 3260 + }, + { + "epoch": 
0.02089109796455541, + "grad_norm": 0.6292199492454529, + "learning_rate": 9.997314653877206e-05, + "loss": 0.8898, + "step": 3270 + }, + { + "epoch": 0.02095498511429411, + "grad_norm": 0.8514048457145691, + "learning_rate": 9.997298185935102e-05, + "loss": 1.0219, + "step": 3280 + }, + { + "epoch": 0.021018872264032814, + "grad_norm": 0.8251546621322632, + "learning_rate": 9.99728166766588e-05, + "loss": 1.1417, + "step": 3290 + }, + { + "epoch": 0.021082759413771513, + "grad_norm": 0.7164210081100464, + "learning_rate": 9.997265099069712e-05, + "loss": 0.8025, + "step": 3300 + }, + { + "epoch": 0.021146646563510216, + "grad_norm": 0.6162307858467102, + "learning_rate": 9.997248480146759e-05, + "loss": 1.1907, + "step": 3310 + }, + { + "epoch": 0.02121053371324892, + "grad_norm": 0.9600428938865662, + "learning_rate": 9.997231810897191e-05, + "loss": 1.0066, + "step": 3320 + }, + { + "epoch": 0.021274420862987618, + "grad_norm": 1.1238371133804321, + "learning_rate": 9.997215091321178e-05, + "loss": 0.8551, + "step": 3330 + }, + { + "epoch": 0.02133830801272632, + "grad_norm": 1.7699466943740845, + "learning_rate": 9.997198321418881e-05, + "loss": 1.049, + "step": 3340 + }, + { + "epoch": 0.021402195162465023, + "grad_norm": 1.0499175786972046, + "learning_rate": 9.997181501190478e-05, + "loss": 1.11, + "step": 3350 + }, + { + "epoch": 0.021466082312203722, + "grad_norm": 0.9096553325653076, + "learning_rate": 9.997164630636132e-05, + "loss": 0.9371, + "step": 3360 + }, + { + "epoch": 0.021529969461942425, + "grad_norm": 0.8059217929840088, + "learning_rate": 9.997147709756014e-05, + "loss": 0.8638, + "step": 3370 + }, + { + "epoch": 0.021593856611681127, + "grad_norm": 0.6484128832817078, + "learning_rate": 9.997130738550298e-05, + "loss": 0.9817, + "step": 3380 + }, + { + "epoch": 0.021657743761419827, + "grad_norm": 1.1222511529922485, + "learning_rate": 9.997113717019151e-05, + "loss": 0.7598, + "step": 3390 + }, + { + "epoch": 0.02172163091115853, + 
"grad_norm": 1.0018411874771118, + "learning_rate": 9.997096645162745e-05, + "loss": 0.9593, + "step": 3400 + }, + { + "epoch": 0.021785518060897232, + "grad_norm": 0.6298023462295532, + "learning_rate": 9.997079522981254e-05, + "loss": 0.8118, + "step": 3410 + }, + { + "epoch": 0.02184940521063593, + "grad_norm": 0.5194735527038574, + "learning_rate": 9.997062350474849e-05, + "loss": 0.8344, + "step": 3420 + }, + { + "epoch": 0.021913292360374634, + "grad_norm": 0.7458469271659851, + "learning_rate": 9.997045127643703e-05, + "loss": 1.1305, + "step": 3430 + }, + { + "epoch": 0.021977179510113336, + "grad_norm": 1.092467188835144, + "learning_rate": 9.997027854487988e-05, + "loss": 0.7839, + "step": 3440 + }, + { + "epoch": 0.022041066659852036, + "grad_norm": 0.5377646088600159, + "learning_rate": 9.997010531007879e-05, + "loss": 0.9457, + "step": 3450 + }, + { + "epoch": 0.02210495380959074, + "grad_norm": 0.8158820271492004, + "learning_rate": 9.996993157203554e-05, + "loss": 1.0827, + "step": 3460 + }, + { + "epoch": 0.02216884095932944, + "grad_norm": 0.9033936858177185, + "learning_rate": 9.996975733075184e-05, + "loss": 0.8901, + "step": 3470 + }, + { + "epoch": 0.022232728109068144, + "grad_norm": 0.6493645310401917, + "learning_rate": 9.996958258622944e-05, + "loss": 1.0609, + "step": 3480 + }, + { + "epoch": 0.022296615258806843, + "grad_norm": 1.416635274887085, + "learning_rate": 9.996940733847013e-05, + "loss": 0.9017, + "step": 3490 + }, + { + "epoch": 0.022360502408545545, + "grad_norm": 0.9830083847045898, + "learning_rate": 9.996923158747564e-05, + "loss": 0.8952, + "step": 3500 + }, + { + "epoch": 0.022424389558284248, + "grad_norm": 1.130096197128296, + "learning_rate": 9.996905533324777e-05, + "loss": 0.8779, + "step": 3510 + }, + { + "epoch": 0.022488276708022947, + "grad_norm": 0.7025210857391357, + "learning_rate": 9.996887857578828e-05, + "loss": 1.0576, + "step": 3520 + }, + { + "epoch": 0.02255216385776165, + "grad_norm": 
0.7813702821731567, + "learning_rate": 9.996870131509897e-05, + "loss": 1.1396, + "step": 3530 + }, + { + "epoch": 0.022616051007500353, + "grad_norm": 0.9451877474784851, + "learning_rate": 9.996852355118158e-05, + "loss": 0.8531, + "step": 3540 + }, + { + "epoch": 0.022679938157239052, + "grad_norm": 0.9123436212539673, + "learning_rate": 9.996834528403795e-05, + "loss": 0.8832, + "step": 3550 + }, + { + "epoch": 0.022743825306977754, + "grad_norm": 3.4489307403564453, + "learning_rate": 9.996816651366985e-05, + "loss": 0.8413, + "step": 3560 + }, + { + "epoch": 0.022807712456716457, + "grad_norm": 1.0235848426818848, + "learning_rate": 9.996798724007907e-05, + "loss": 0.9275, + "step": 3570 + }, + { + "epoch": 0.022871599606455156, + "grad_norm": 0.7772485017776489, + "learning_rate": 9.996780746326743e-05, + "loss": 1.0924, + "step": 3580 + }, + { + "epoch": 0.02293548675619386, + "grad_norm": 0.7384485006332397, + "learning_rate": 9.996762718323677e-05, + "loss": 0.8231, + "step": 3590 + }, + { + "epoch": 0.02299937390593256, + "grad_norm": 0.9038792848587036, + "learning_rate": 9.996744639998885e-05, + "loss": 0.7318, + "step": 3600 + }, + { + "epoch": 0.02306326105567126, + "grad_norm": 0.7685703039169312, + "learning_rate": 9.996726511352553e-05, + "loss": 0.7945, + "step": 3610 + }, + { + "epoch": 0.023127148205409963, + "grad_norm": 0.9612904787063599, + "learning_rate": 9.996708332384862e-05, + "loss": 0.7389, + "step": 3620 + }, + { + "epoch": 0.023191035355148666, + "grad_norm": 0.6820782423019409, + "learning_rate": 9.996690103095995e-05, + "loss": 0.7208, + "step": 3630 + }, + { + "epoch": 0.023254922504887365, + "grad_norm": 0.7813957333564758, + "learning_rate": 9.996671823486135e-05, + "loss": 1.1023, + "step": 3640 + }, + { + "epoch": 0.023318809654626068, + "grad_norm": 0.41932976245880127, + "learning_rate": 9.996653493555469e-05, + "loss": 0.8274, + "step": 3650 + }, + { + "epoch": 0.02338269680436477, + "grad_norm": 1.1898959875106812, + 
"learning_rate": 9.996635113304178e-05, + "loss": 0.862, + "step": 3660 + }, + { + "epoch": 0.023446583954103473, + "grad_norm": 1.4429035186767578, + "learning_rate": 9.99661668273245e-05, + "loss": 0.9036, + "step": 3670 + }, + { + "epoch": 0.023510471103842173, + "grad_norm": 0.9616169929504395, + "learning_rate": 9.996598201840469e-05, + "loss": 0.9577, + "step": 3680 + }, + { + "epoch": 0.023574358253580875, + "grad_norm": 0.8261591196060181, + "learning_rate": 9.99657967062842e-05, + "loss": 1.0401, + "step": 3690 + }, + { + "epoch": 0.023638245403319578, + "grad_norm": 0.8811150789260864, + "learning_rate": 9.996561089096493e-05, + "loss": 0.8111, + "step": 3700 + }, + { + "epoch": 0.023702132553058277, + "grad_norm": 0.5696326494216919, + "learning_rate": 9.996542457244871e-05, + "loss": 0.7984, + "step": 3710 + }, + { + "epoch": 0.02376601970279698, + "grad_norm": 0.9691576361656189, + "learning_rate": 9.996523775073746e-05, + "loss": 0.9321, + "step": 3720 + }, + { + "epoch": 0.023829906852535682, + "grad_norm": 0.7957014441490173, + "learning_rate": 9.996505042583303e-05, + "loss": 0.9805, + "step": 3730 + }, + { + "epoch": 0.02389379400227438, + "grad_norm": 0.920781135559082, + "learning_rate": 9.996486259773732e-05, + "loss": 0.83, + "step": 3740 + }, + { + "epoch": 0.023957681152013084, + "grad_norm": 1.661712646484375, + "learning_rate": 9.996467426645221e-05, + "loss": 0.7208, + "step": 3750 + }, + { + "epoch": 0.024021568301751787, + "grad_norm": 0.6871623396873474, + "learning_rate": 9.99644854319796e-05, + "loss": 0.7284, + "step": 3760 + }, + { + "epoch": 0.024085455451490486, + "grad_norm": 0.9017264246940613, + "learning_rate": 9.99642960943214e-05, + "loss": 1.1523, + "step": 3770 + }, + { + "epoch": 0.02414934260122919, + "grad_norm": 0.894895613193512, + "learning_rate": 9.996410625347953e-05, + "loss": 0.7732, + "step": 3780 + }, + { + "epoch": 0.02421322975096789, + "grad_norm": 0.8452061414718628, + "learning_rate": 
9.996391590945588e-05, + "loss": 0.913, + "step": 3790 + }, + { + "epoch": 0.02427711690070659, + "grad_norm": 0.7204217910766602, + "learning_rate": 9.996372506225235e-05, + "loss": 0.8552, + "step": 3800 + }, + { + "epoch": 0.024341004050445293, + "grad_norm": 2.9905290603637695, + "learning_rate": 9.996353371187091e-05, + "loss": 0.8771, + "step": 3810 + }, + { + "epoch": 0.024404891200183996, + "grad_norm": 0.9556611180305481, + "learning_rate": 9.996334185831346e-05, + "loss": 0.8103, + "step": 3820 + }, + { + "epoch": 0.024468778349922695, + "grad_norm": 0.775848925113678, + "learning_rate": 9.996314950158192e-05, + "loss": 0.8078, + "step": 3830 + }, + { + "epoch": 0.024532665499661398, + "grad_norm": 0.9693676829338074, + "learning_rate": 9.996295664167824e-05, + "loss": 0.9336, + "step": 3840 + }, + { + "epoch": 0.0245965526494001, + "grad_norm": 1.195697546005249, + "learning_rate": 9.996276327860436e-05, + "loss": 1.2527, + "step": 3850 + }, + { + "epoch": 0.0246604397991388, + "grad_norm": 0.8424214124679565, + "learning_rate": 9.996256941236223e-05, + "loss": 0.966, + "step": 3860 + }, + { + "epoch": 0.024724326948877502, + "grad_norm": 0.6259729266166687, + "learning_rate": 9.996237504295382e-05, + "loss": 0.9363, + "step": 3870 + }, + { + "epoch": 0.024788214098616205, + "grad_norm": 0.7807269096374512, + "learning_rate": 9.996218017038106e-05, + "loss": 0.6411, + "step": 3880 + }, + { + "epoch": 0.024852101248354908, + "grad_norm": 0.6563220024108887, + "learning_rate": 9.996198479464591e-05, + "loss": 0.8191, + "step": 3890 + }, + { + "epoch": 0.024915988398093607, + "grad_norm": 0.831295371055603, + "learning_rate": 9.996178891575037e-05, + "loss": 0.8589, + "step": 3900 + }, + { + "epoch": 0.02497987554783231, + "grad_norm": 1.157340168952942, + "learning_rate": 9.996159253369638e-05, + "loss": 0.9202, + "step": 3910 + }, + { + "epoch": 0.025043762697571012, + "grad_norm": 0.7473374009132385, + "learning_rate": 9.996139564848594e-05, + "loss": 
0.829, + "step": 3920 + }, + { + "epoch": 0.02510764984730971, + "grad_norm": 1.1940234899520874, + "learning_rate": 9.996119826012101e-05, + "loss": 0.9879, + "step": 3930 + }, + { + "epoch": 0.025171536997048414, + "grad_norm": 0.7762036323547363, + "learning_rate": 9.99610003686036e-05, + "loss": 0.9162, + "step": 3940 + }, + { + "epoch": 0.025235424146787117, + "grad_norm": 1.1545424461364746, + "learning_rate": 9.996080197393569e-05, + "loss": 0.9567, + "step": 3950 + }, + { + "epoch": 0.025299311296525816, + "grad_norm": 0.6979715824127197, + "learning_rate": 9.996060307611927e-05, + "loss": 0.9685, + "step": 3960 + }, + { + "epoch": 0.02536319844626452, + "grad_norm": 0.9557220339775085, + "learning_rate": 9.996040367515638e-05, + "loss": 1.0768, + "step": 3970 + }, + { + "epoch": 0.02542708559600322, + "grad_norm": 0.8868962526321411, + "learning_rate": 9.996020377104898e-05, + "loss": 1.0351, + "step": 3980 + }, + { + "epoch": 0.02549097274574192, + "grad_norm": 0.5406913757324219, + "learning_rate": 9.996000336379913e-05, + "loss": 0.9042, + "step": 3990 + }, + { + "epoch": 0.025554859895480623, + "grad_norm": 0.64485764503479, + "learning_rate": 9.995980245340881e-05, + "loss": 1.1883, + "step": 4000 + }, + { + "epoch": 0.025618747045219326, + "grad_norm": 1.2904107570648193, + "learning_rate": 9.995960103988005e-05, + "loss": 0.936, + "step": 4010 + }, + { + "epoch": 0.025682634194958025, + "grad_norm": 1.247886061668396, + "learning_rate": 9.99593991232149e-05, + "loss": 0.8806, + "step": 4020 + }, + { + "epoch": 0.025746521344696727, + "grad_norm": 0.9545615911483765, + "learning_rate": 9.995919670341538e-05, + "loss": 1.1493, + "step": 4030 + }, + { + "epoch": 0.02581040849443543, + "grad_norm": 1.999590277671814, + "learning_rate": 9.995899378048352e-05, + "loss": 0.6754, + "step": 4040 + }, + { + "epoch": 0.02587429564417413, + "grad_norm": 0.7333373427391052, + "learning_rate": 9.995879035442138e-05, + "loss": 0.8109, + "step": 4050 + }, + { + 
"epoch": 0.025938182793912832, + "grad_norm": 0.7739579081535339, + "learning_rate": 9.995858642523099e-05, + "loss": 0.8638, + "step": 4060 + }, + { + "epoch": 0.026002069943651535, + "grad_norm": 1.069405198097229, + "learning_rate": 9.995838199291443e-05, + "loss": 0.9313, + "step": 4070 + }, + { + "epoch": 0.026065957093390234, + "grad_norm": 1.366487979888916, + "learning_rate": 9.995817705747372e-05, + "loss": 1.0205, + "step": 4080 + }, + { + "epoch": 0.026129844243128936, + "grad_norm": 1.6458861827850342, + "learning_rate": 9.995797161891097e-05, + "loss": 0.9609, + "step": 4090 + }, + { + "epoch": 0.02619373139286764, + "grad_norm": 1.0026328563690186, + "learning_rate": 9.995776567722822e-05, + "loss": 1.0618, + "step": 4100 + }, + { + "epoch": 0.02625761854260634, + "grad_norm": 1.0415229797363281, + "learning_rate": 9.995755923242754e-05, + "loss": 0.761, + "step": 4110 + }, + { + "epoch": 0.02632150569234504, + "grad_norm": 1.169027328491211, + "learning_rate": 9.995735228451103e-05, + "loss": 0.92, + "step": 4120 + }, + { + "epoch": 0.026385392842083744, + "grad_norm": 1.2535079717636108, + "learning_rate": 9.995714483348076e-05, + "loss": 0.8859, + "step": 4130 + }, + { + "epoch": 0.026449279991822446, + "grad_norm": 0.6948879957199097, + "learning_rate": 9.995693687933883e-05, + "loss": 0.7189, + "step": 4140 + }, + { + "epoch": 0.026513167141561145, + "grad_norm": 0.7670521140098572, + "learning_rate": 9.995672842208731e-05, + "loss": 1.0072, + "step": 4150 + }, + { + "epoch": 0.026577054291299848, + "grad_norm": 0.8560011982917786, + "learning_rate": 9.995651946172833e-05, + "loss": 1.1125, + "step": 4160 + }, + { + "epoch": 0.02664094144103855, + "grad_norm": 0.762663722038269, + "learning_rate": 9.995630999826397e-05, + "loss": 0.9922, + "step": 4170 + }, + { + "epoch": 0.02670482859077725, + "grad_norm": 1.432151198387146, + "learning_rate": 9.995610003169635e-05, + "loss": 1.0305, + "step": 4180 + }, + { + "epoch": 0.026768715740515953, + 
"grad_norm": 1.0463693141937256, + "learning_rate": 9.99558895620276e-05, + "loss": 0.9721, + "step": 4190 + }, + { + "epoch": 0.026832602890254655, + "grad_norm": 0.6497074961662292, + "learning_rate": 9.99556785892598e-05, + "loss": 0.8886, + "step": 4200 + }, + { + "epoch": 0.026896490039993354, + "grad_norm": 0.8750442266464233, + "learning_rate": 9.995546711339512e-05, + "loss": 1.1452, + "step": 4210 + }, + { + "epoch": 0.026960377189732057, + "grad_norm": 0.5352575778961182, + "learning_rate": 9.995525513443566e-05, + "loss": 1.3216, + "step": 4220 + }, + { + "epoch": 0.02702426433947076, + "grad_norm": 0.7286153435707092, + "learning_rate": 9.995504265238357e-05, + "loss": 0.9927, + "step": 4230 + }, + { + "epoch": 0.02708815148920946, + "grad_norm": 1.133766770362854, + "learning_rate": 9.995482966724098e-05, + "loss": 0.9198, + "step": 4240 + }, + { + "epoch": 0.02715203863894816, + "grad_norm": 1.060925006866455, + "learning_rate": 9.995461617901004e-05, + "loss": 0.984, + "step": 4250 + }, + { + "epoch": 0.027215925788686864, + "grad_norm": 0.8017410039901733, + "learning_rate": 9.995440218769288e-05, + "loss": 0.8302, + "step": 4260 + }, + { + "epoch": 0.027279812938425563, + "grad_norm": 0.6474617719650269, + "learning_rate": 9.995418769329171e-05, + "loss": 0.8526, + "step": 4270 + }, + { + "epoch": 0.027343700088164266, + "grad_norm": 0.7051038146018982, + "learning_rate": 9.995397269580862e-05, + "loss": 0.6267, + "step": 4280 + }, + { + "epoch": 0.02740758723790297, + "grad_norm": 0.8523268699645996, + "learning_rate": 9.995375719524582e-05, + "loss": 0.7513, + "step": 4290 + }, + { + "epoch": 0.02747147438764167, + "grad_norm": 0.5515130162239075, + "learning_rate": 9.995354119160546e-05, + "loss": 0.8045, + "step": 4300 + }, + { + "epoch": 0.02753536153738037, + "grad_norm": 0.6105387806892395, + "learning_rate": 9.995332468488974e-05, + "loss": 0.9739, + "step": 4310 + }, + { + "epoch": 0.027599248687119073, + "grad_norm": 0.9270747303962708, + 
"learning_rate": 9.99531076751008e-05, + "loss": 0.8789, + "step": 4320 + }, + { + "epoch": 0.027663135836857776, + "grad_norm": 0.46213430166244507, + "learning_rate": 9.995289016224087e-05, + "loss": 0.8914, + "step": 4330 + }, + { + "epoch": 0.027727022986596475, + "grad_norm": 0.8763656616210938, + "learning_rate": 9.995267214631213e-05, + "loss": 0.9085, + "step": 4340 + }, + { + "epoch": 0.027790910136335178, + "grad_norm": 1.6064941883087158, + "learning_rate": 9.995245362731676e-05, + "loss": 1.0047, + "step": 4350 + }, + { + "epoch": 0.02785479728607388, + "grad_norm": 1.2199528217315674, + "learning_rate": 9.995223460525696e-05, + "loss": 0.749, + "step": 4360 + }, + { + "epoch": 0.02791868443581258, + "grad_norm": 0.9066464304924011, + "learning_rate": 9.995201508013494e-05, + "loss": 1.0363, + "step": 4370 + }, + { + "epoch": 0.027982571585551282, + "grad_norm": 0.8760823011398315, + "learning_rate": 9.995179505195291e-05, + "loss": 1.1568, + "step": 4380 + }, + { + "epoch": 0.028046458735289985, + "grad_norm": 0.6646769046783447, + "learning_rate": 9.99515745207131e-05, + "loss": 1.3106, + "step": 4390 + }, + { + "epoch": 0.028110345885028684, + "grad_norm": 0.7811892032623291, + "learning_rate": 9.995135348641771e-05, + "loss": 0.8003, + "step": 4400 + }, + { + "epoch": 0.028174233034767387, + "grad_norm": 1.2583142518997192, + "learning_rate": 9.995113194906899e-05, + "loss": 0.934, + "step": 4410 + }, + { + "epoch": 0.02823812018450609, + "grad_norm": 1.4330214262008667, + "learning_rate": 9.995090990866915e-05, + "loss": 0.8924, + "step": 4420 + }, + { + "epoch": 0.02830200733424479, + "grad_norm": 0.7987727522850037, + "learning_rate": 9.995068736522044e-05, + "loss": 1.257, + "step": 4430 + }, + { + "epoch": 0.02836589448398349, + "grad_norm": 0.90681391954422, + "learning_rate": 9.995046431872507e-05, + "loss": 0.9746, + "step": 4440 + }, + { + "epoch": 0.028429781633722194, + "grad_norm": 1.1222659349441528, + "learning_rate": 
9.995024076918534e-05, + "loss": 0.8702, + "step": 4450 + }, + { + "epoch": 0.028493668783460893, + "grad_norm": 1.4470833539962769, + "learning_rate": 9.995001671660347e-05, + "loss": 0.9072, + "step": 4460 + }, + { + "epoch": 0.028557555933199596, + "grad_norm": 0.9265400767326355, + "learning_rate": 9.994979216098171e-05, + "loss": 0.9651, + "step": 4470 + }, + { + "epoch": 0.0286214430829383, + "grad_norm": 0.40936312079429626, + "learning_rate": 9.994956710232232e-05, + "loss": 0.9576, + "step": 4480 + }, + { + "epoch": 0.028685330232676998, + "grad_norm": 0.7994583249092102, + "learning_rate": 9.99493415406276e-05, + "loss": 0.8773, + "step": 4490 + }, + { + "epoch": 0.0287492173824157, + "grad_norm": 0.8965862989425659, + "learning_rate": 9.994911547589979e-05, + "loss": 0.9247, + "step": 4500 + }, + { + "epoch": 0.028813104532154403, + "grad_norm": 0.5341432690620422, + "learning_rate": 9.994888890814116e-05, + "loss": 0.9735, + "step": 4510 + }, + { + "epoch": 0.028876991681893106, + "grad_norm": 0.796406090259552, + "learning_rate": 9.994866183735403e-05, + "loss": 1.0474, + "step": 4520 + }, + { + "epoch": 0.028940878831631805, + "grad_norm": 0.6537384986877441, + "learning_rate": 9.994843426354064e-05, + "loss": 0.7858, + "step": 4530 + }, + { + "epoch": 0.029004765981370507, + "grad_norm": 0.7321698665618896, + "learning_rate": 9.994820618670332e-05, + "loss": 1.017, + "step": 4540 + }, + { + "epoch": 0.02906865313110921, + "grad_norm": 0.9634839296340942, + "learning_rate": 9.994797760684435e-05, + "loss": 0.9671, + "step": 4550 + }, + { + "epoch": 0.02913254028084791, + "grad_norm": 0.7006617784500122, + "learning_rate": 9.994774852396603e-05, + "loss": 1.053, + "step": 4560 + }, + { + "epoch": 0.029196427430586612, + "grad_norm": 0.7608281373977661, + "learning_rate": 9.994751893807068e-05, + "loss": 0.7445, + "step": 4570 + }, + { + "epoch": 0.029260314580325315, + "grad_norm": 1.0257230997085571, + "learning_rate": 9.99472888491606e-05, + "loss": 
0.7304, + "step": 4580 + }, + { + "epoch": 0.029324201730064014, + "grad_norm": 0.6806319355964661, + "learning_rate": 9.994705825723811e-05, + "loss": 1.1287, + "step": 4590 + }, + { + "epoch": 0.029388088879802717, + "grad_norm": 1.2967884540557861, + "learning_rate": 9.994682716230552e-05, + "loss": 0.948, + "step": 4600 + }, + { + "epoch": 0.02945197602954142, + "grad_norm": 1.0324482917785645, + "learning_rate": 9.994659556436518e-05, + "loss": 1.0642, + "step": 4610 + }, + { + "epoch": 0.02951586317928012, + "grad_norm": 0.5615150928497314, + "learning_rate": 9.994636346341943e-05, + "loss": 0.6903, + "step": 4620 + }, + { + "epoch": 0.02957975032901882, + "grad_norm": 0.6164289712905884, + "learning_rate": 9.994613085947058e-05, + "loss": 0.8748, + "step": 4630 + }, + { + "epoch": 0.029643637478757524, + "grad_norm": 0.9414746761322021, + "learning_rate": 9.994589775252097e-05, + "loss": 0.9157, + "step": 4640 + }, + { + "epoch": 0.029707524628496223, + "grad_norm": 0.8447662591934204, + "learning_rate": 9.994566414257297e-05, + "loss": 1.1894, + "step": 4650 + }, + { + "epoch": 0.029771411778234926, + "grad_norm": 0.8695082664489746, + "learning_rate": 9.994543002962892e-05, + "loss": 1.173, + "step": 4660 + }, + { + "epoch": 0.029835298927973628, + "grad_norm": 1.3696662187576294, + "learning_rate": 9.994519541369119e-05, + "loss": 0.8384, + "step": 4670 + }, + { + "epoch": 0.029899186077712327, + "grad_norm": 0.6377172470092773, + "learning_rate": 9.994496029476213e-05, + "loss": 0.8018, + "step": 4680 + }, + { + "epoch": 0.02996307322745103, + "grad_norm": 1.396103858947754, + "learning_rate": 9.99447246728441e-05, + "loss": 0.8777, + "step": 4690 + }, + { + "epoch": 0.030026960377189733, + "grad_norm": 0.741669774055481, + "learning_rate": 9.99444885479395e-05, + "loss": 1.1431, + "step": 4700 + }, + { + "epoch": 0.030090847526928435, + "grad_norm": 0.8591098189353943, + "learning_rate": 9.994425192005067e-05, + "loss": 1.0976, + "step": 4710 + }, + { + 
"epoch": 0.030154734676667135, + "grad_norm": 0.6573971509933472, + "learning_rate": 9.994401478918003e-05, + "loss": 0.9112, + "step": 4720 + }, + { + "epoch": 0.030218621826405837, + "grad_norm": 0.7204700708389282, + "learning_rate": 9.994377715532996e-05, + "loss": 0.8728, + "step": 4730 + }, + { + "epoch": 0.03028250897614454, + "grad_norm": 1.0097802877426147, + "learning_rate": 9.994353901850283e-05, + "loss": 1.0269, + "step": 4740 + }, + { + "epoch": 0.03034639612588324, + "grad_norm": 1.4376720190048218, + "learning_rate": 9.994330037870107e-05, + "loss": 0.8102, + "step": 4750 + }, + { + "epoch": 0.03041028327562194, + "grad_norm": 0.7325295209884644, + "learning_rate": 9.994306123592704e-05, + "loss": 0.9336, + "step": 4760 + }, + { + "epoch": 0.030474170425360644, + "grad_norm": 0.7449788451194763, + "learning_rate": 9.994282159018323e-05, + "loss": 0.8539, + "step": 4770 + }, + { + "epoch": 0.030538057575099344, + "grad_norm": 0.7632824778556824, + "learning_rate": 9.994258144147195e-05, + "loss": 0.6919, + "step": 4780 + }, + { + "epoch": 0.030601944724838046, + "grad_norm": 0.9371885657310486, + "learning_rate": 9.99423407897957e-05, + "loss": 0.8518, + "step": 4790 + }, + { + "epoch": 0.03066583187457675, + "grad_norm": 0.9703862071037292, + "learning_rate": 9.994209963515684e-05, + "loss": 0.8882, + "step": 4800 + }, + { + "epoch": 0.030729719024315448, + "grad_norm": 0.6255933046340942, + "learning_rate": 9.994185797755787e-05, + "loss": 0.9969, + "step": 4810 + }, + { + "epoch": 0.03079360617405415, + "grad_norm": 2.328423261642456, + "learning_rate": 9.994161581700115e-05, + "loss": 0.8677, + "step": 4820 + }, + { + "epoch": 0.030857493323792853, + "grad_norm": 0.8444818258285522, + "learning_rate": 9.994137315348917e-05, + "loss": 0.9273, + "step": 4830 + }, + { + "epoch": 0.030921380473531553, + "grad_norm": 0.6778904795646667, + "learning_rate": 9.994112998702434e-05, + "loss": 1.136, + "step": 4840 + }, + { + "epoch": 0.030985267623270255, 
+ "grad_norm": 0.7254196405410767, + "learning_rate": 9.994088631760914e-05, + "loss": 0.9659, + "step": 4850 + }, + { + "epoch": 0.031049154773008958, + "grad_norm": 2.3594653606414795, + "learning_rate": 9.994064214524602e-05, + "loss": 0.9981, + "step": 4860 + }, + { + "epoch": 0.031113041922747657, + "grad_norm": 0.9346766471862793, + "learning_rate": 9.994039746993742e-05, + "loss": 1.2296, + "step": 4870 + }, + { + "epoch": 0.03117692907248636, + "grad_norm": 1.1860244274139404, + "learning_rate": 9.994015229168581e-05, + "loss": 1.0124, + "step": 4880 + }, + { + "epoch": 0.031240816222225062, + "grad_norm": 0.977857232093811, + "learning_rate": 9.993990661049366e-05, + "loss": 0.8632, + "step": 4890 + }, + { + "epoch": 0.03130470337196376, + "grad_norm": 0.9144421815872192, + "learning_rate": 9.993966042636345e-05, + "loss": 0.9927, + "step": 4900 + }, + { + "epoch": 0.03136859052170247, + "grad_norm": 1.034429669380188, + "learning_rate": 9.993941373929764e-05, + "loss": 0.8818, + "step": 4910 + }, + { + "epoch": 0.03143247767144117, + "grad_norm": 0.4996863007545471, + "learning_rate": 9.993916654929876e-05, + "loss": 0.6711, + "step": 4920 + }, + { + "epoch": 0.031496364821179866, + "grad_norm": 0.6924141049385071, + "learning_rate": 9.993891885636925e-05, + "loss": 0.9002, + "step": 4930 + }, + { + "epoch": 0.03156025197091857, + "grad_norm": 0.7536648511886597, + "learning_rate": 9.993867066051163e-05, + "loss": 1.0268, + "step": 4940 + }, + { + "epoch": 0.03162413912065727, + "grad_norm": 1.059717059135437, + "learning_rate": 9.993842196172838e-05, + "loss": 1.4731, + "step": 4950 + }, + { + "epoch": 0.03168802627039597, + "grad_norm": 0.9447365999221802, + "learning_rate": 9.993817276002203e-05, + "loss": 0.8936, + "step": 4960 + }, + { + "epoch": 0.03175191342013468, + "grad_norm": 2.9407436847686768, + "learning_rate": 9.993792305539507e-05, + "loss": 0.9535, + "step": 4970 + }, + { + "epoch": 0.031815800569873376, + "grad_norm": 0.9434256553649902, 
+ "learning_rate": 9.993767284785003e-05, + "loss": 0.9241, + "step": 4980 + }, + { + "epoch": 0.031879687719612075, + "grad_norm": 0.5843566060066223, + "learning_rate": 9.993742213738942e-05, + "loss": 1.1005, + "step": 4990 + }, + { + "epoch": 0.03194357486935078, + "grad_norm": 0.5183364748954773, + "learning_rate": 9.993717092401577e-05, + "loss": 1.0861, + "step": 5000 + }, + { + "epoch": 0.03200746201908948, + "grad_norm": 0.716195821762085, + "learning_rate": 9.99369192077316e-05, + "loss": 1.0468, + "step": 5010 + }, + { + "epoch": 0.03207134916882818, + "grad_norm": 0.6783444285392761, + "learning_rate": 9.993666698853946e-05, + "loss": 0.9456, + "step": 5020 + }, + { + "epoch": 0.032135236318566886, + "grad_norm": 0.8905858397483826, + "learning_rate": 9.99364142664419e-05, + "loss": 0.9607, + "step": 5030 + }, + { + "epoch": 0.032199123468305585, + "grad_norm": 1.1394882202148438, + "learning_rate": 9.993616104144141e-05, + "loss": 0.7845, + "step": 5040 + }, + { + "epoch": 0.032263010618044284, + "grad_norm": 0.9417553544044495, + "learning_rate": 9.99359073135406e-05, + "loss": 0.9869, + "step": 5050 + }, + { + "epoch": 0.03232689776778299, + "grad_norm": 0.6557328104972839, + "learning_rate": 9.993565308274199e-05, + "loss": 1.132, + "step": 5060 + }, + { + "epoch": 0.03239078491752169, + "grad_norm": 1.505283236503601, + "learning_rate": 9.993539834904816e-05, + "loss": 0.6938, + "step": 5070 + }, + { + "epoch": 0.03245467206726039, + "grad_norm": 0.7740111947059631, + "learning_rate": 9.993514311246166e-05, + "loss": 0.9475, + "step": 5080 + }, + { + "epoch": 0.032518559216999095, + "grad_norm": 1.1379529237747192, + "learning_rate": 9.993488737298509e-05, + "loss": 0.7626, + "step": 5090 + }, + { + "epoch": 0.032582446366737794, + "grad_norm": 0.5552259683609009, + "learning_rate": 9.993463113062099e-05, + "loss": 0.9058, + "step": 5100 + }, + { + "epoch": 0.03264633351647649, + "grad_norm": 0.7772766351699829, + "learning_rate": 
9.993437438537194e-05, + "loss": 1.0914, + "step": 5110 + }, + { + "epoch": 0.0327102206662152, + "grad_norm": 0.7294765114784241, + "learning_rate": 9.993411713724056e-05, + "loss": 0.9447, + "step": 5120 + }, + { + "epoch": 0.0327741078159539, + "grad_norm": 0.8332342505455017, + "learning_rate": 9.993385938622942e-05, + "loss": 0.7607, + "step": 5130 + }, + { + "epoch": 0.0328379949656926, + "grad_norm": 0.759425163269043, + "learning_rate": 9.993360113234111e-05, + "loss": 0.8551, + "step": 5140 + }, + { + "epoch": 0.032901882115431304, + "grad_norm": 0.8883112668991089, + "learning_rate": 9.993334237557825e-05, + "loss": 0.815, + "step": 5150 + }, + { + "epoch": 0.03296576926517, + "grad_norm": 0.5959163308143616, + "learning_rate": 9.993308311594343e-05, + "loss": 1.0528, + "step": 5160 + }, + { + "epoch": 0.0330296564149087, + "grad_norm": 1.0523767471313477, + "learning_rate": 9.993282335343925e-05, + "loss": 1.0073, + "step": 5170 + }, + { + "epoch": 0.03309354356464741, + "grad_norm": 0.8208662271499634, + "learning_rate": 9.993256308806835e-05, + "loss": 0.8802, + "step": 5180 + }, + { + "epoch": 0.03315743071438611, + "grad_norm": 0.7097920775413513, + "learning_rate": 9.993230231983334e-05, + "loss": 1.0191, + "step": 5190 + }, + { + "epoch": 0.03322131786412481, + "grad_norm": 0.7505048513412476, + "learning_rate": 9.993204104873686e-05, + "loss": 1.0811, + "step": 5200 + }, + { + "epoch": 0.03328520501386351, + "grad_norm": 0.9009354710578918, + "learning_rate": 9.993177927478152e-05, + "loss": 0.9172, + "step": 5210 + }, + { + "epoch": 0.03334909216360221, + "grad_norm": 0.681164562702179, + "learning_rate": 9.993151699796996e-05, + "loss": 0.8789, + "step": 5220 + }, + { + "epoch": 0.03341297931334091, + "grad_norm": 0.9279341101646423, + "learning_rate": 9.993125421830484e-05, + "loss": 0.7841, + "step": 5230 + }, + { + "epoch": 0.03347686646307962, + "grad_norm": 0.8030073642730713, + "learning_rate": 9.993099093578879e-05, + "loss": 1.1084, + 
"step": 5240 + }, + { + "epoch": 0.033540753612818316, + "grad_norm": 0.8783805966377258, + "learning_rate": 9.993072715042447e-05, + "loss": 0.9935, + "step": 5250 + }, + { + "epoch": 0.03360464076255702, + "grad_norm": 1.2054526805877686, + "learning_rate": 9.99304628622145e-05, + "loss": 1.0037, + "step": 5260 + }, + { + "epoch": 0.03366852791229572, + "grad_norm": 0.7649316787719727, + "learning_rate": 9.99301980711616e-05, + "loss": 0.7857, + "step": 5270 + }, + { + "epoch": 0.03373241506203442, + "grad_norm": 1.0451691150665283, + "learning_rate": 9.992993277726841e-05, + "loss": 1.0657, + "step": 5280 + }, + { + "epoch": 0.03379630221177313, + "grad_norm": 1.1677067279815674, + "learning_rate": 9.99296669805376e-05, + "loss": 0.8225, + "step": 5290 + }, + { + "epoch": 0.033860189361511826, + "grad_norm": 0.8038674592971802, + "learning_rate": 9.992940068097184e-05, + "loss": 0.8793, + "step": 5300 + }, + { + "epoch": 0.033924076511250525, + "grad_norm": 0.8285770416259766, + "learning_rate": 9.992913387857383e-05, + "loss": 1.175, + "step": 5310 + }, + { + "epoch": 0.03398796366098923, + "grad_norm": 1.8478131294250488, + "learning_rate": 9.992886657334624e-05, + "loss": 1.1025, + "step": 5320 + }, + { + "epoch": 0.03405185081072793, + "grad_norm": 0.6567774415016174, + "learning_rate": 9.992859876529177e-05, + "loss": 0.979, + "step": 5330 + }, + { + "epoch": 0.03411573796046663, + "grad_norm": 1.635343074798584, + "learning_rate": 9.992833045441312e-05, + "loss": 0.9373, + "step": 5340 + }, + { + "epoch": 0.034179625110205336, + "grad_norm": 0.6428894400596619, + "learning_rate": 9.992806164071298e-05, + "loss": 0.9726, + "step": 5350 + }, + { + "epoch": 0.034243512259944035, + "grad_norm": 0.9768702983856201, + "learning_rate": 9.992779232419407e-05, + "loss": 1.1691, + "step": 5360 + }, + { + "epoch": 0.034307399409682734, + "grad_norm": 0.9969322681427002, + "learning_rate": 9.99275225048591e-05, + "loss": 0.9453, + "step": 5370 + }, + { + "epoch": 
0.03437128655942144, + "grad_norm": 1.498533010482788, + "learning_rate": 9.992725218271078e-05, + "loss": 0.9161, + "step": 5380 + }, + { + "epoch": 0.03443517370916014, + "grad_norm": 0.6910355687141418, + "learning_rate": 9.992698135775185e-05, + "loss": 0.8751, + "step": 5390 + }, + { + "epoch": 0.03449906085889884, + "grad_norm": 0.7530591487884521, + "learning_rate": 9.992671002998502e-05, + "loss": 1.0573, + "step": 5400 + }, + { + "epoch": 0.034562948008637545, + "grad_norm": 0.9451344013214111, + "learning_rate": 9.992643819941301e-05, + "loss": 0.8682, + "step": 5410 + }, + { + "epoch": 0.034626835158376244, + "grad_norm": 1.7209718227386475, + "learning_rate": 9.992616586603859e-05, + "loss": 0.8826, + "step": 5420 + }, + { + "epoch": 0.034690722308114944, + "grad_norm": 0.7069958448410034, + "learning_rate": 9.992589302986448e-05, + "loss": 0.8965, + "step": 5430 + }, + { + "epoch": 0.03475460945785365, + "grad_norm": 0.6233651041984558, + "learning_rate": 9.992561969089345e-05, + "loss": 0.9789, + "step": 5440 + }, + { + "epoch": 0.03481849660759235, + "grad_norm": 0.7849096655845642, + "learning_rate": 9.992534584912823e-05, + "loss": 1.0208, + "step": 5450 + }, + { + "epoch": 0.03488238375733105, + "grad_norm": 0.7504194378852844, + "learning_rate": 9.992507150457158e-05, + "loss": 0.7951, + "step": 5460 + }, + { + "epoch": 0.034946270907069754, + "grad_norm": 1.141536831855774, + "learning_rate": 9.992479665722627e-05, + "loss": 0.7366, + "step": 5470 + }, + { + "epoch": 0.03501015805680845, + "grad_norm": 0.8907060623168945, + "learning_rate": 9.992452130709507e-05, + "loss": 1.1784, + "step": 5480 + }, + { + "epoch": 0.03507404520654715, + "grad_norm": 0.9252203106880188, + "learning_rate": 9.992424545418074e-05, + "loss": 0.9195, + "step": 5490 + }, + { + "epoch": 0.03513793235628586, + "grad_norm": 0.9670997262001038, + "learning_rate": 9.992396909848608e-05, + "loss": 0.8106, + "step": 5500 + }, + { + "epoch": 0.03520181950602456, + 
"grad_norm": 0.9867545962333679, + "learning_rate": 9.992369224001386e-05, + "loss": 0.8976, + "step": 5510 + }, + { + "epoch": 0.03526570665576326, + "grad_norm": 1.0230097770690918, + "learning_rate": 9.992341487876686e-05, + "loss": 0.8986, + "step": 5520 + }, + { + "epoch": 0.03532959380550196, + "grad_norm": 0.7679455876350403, + "learning_rate": 9.99231370147479e-05, + "loss": 0.9554, + "step": 5530 + }, + { + "epoch": 0.03539348095524066, + "grad_norm": 0.6599009037017822, + "learning_rate": 9.992285864795974e-05, + "loss": 0.8623, + "step": 5540 + }, + { + "epoch": 0.03545736810497936, + "grad_norm": 1.114585041999817, + "learning_rate": 9.992257977840521e-05, + "loss": 1.0822, + "step": 5550 + }, + { + "epoch": 0.03552125525471807, + "grad_norm": 0.6967979073524475, + "learning_rate": 9.992230040608713e-05, + "loss": 1.0806, + "step": 5560 + }, + { + "epoch": 0.03558514240445677, + "grad_norm": 2.6597609519958496, + "learning_rate": 9.992202053100826e-05, + "loss": 0.958, + "step": 5570 + }, + { + "epoch": 0.035649029554195466, + "grad_norm": 0.7488609552383423, + "learning_rate": 9.992174015317148e-05, + "loss": 0.6722, + "step": 5580 + }, + { + "epoch": 0.03571291670393417, + "grad_norm": 1.290249228477478, + "learning_rate": 9.992145927257958e-05, + "loss": 1.1259, + "step": 5590 + }, + { + "epoch": 0.03577680385367287, + "grad_norm": 0.7017959952354431, + "learning_rate": 9.99211778892354e-05, + "loss": 0.9599, + "step": 5600 + }, + { + "epoch": 0.03584069100341157, + "grad_norm": 0.6516076922416687, + "learning_rate": 9.992089600314179e-05, + "loss": 1.0698, + "step": 5610 + }, + { + "epoch": 0.03590457815315028, + "grad_norm": 0.860114336013794, + "learning_rate": 9.992061361430153e-05, + "loss": 0.8568, + "step": 5620 + }, + { + "epoch": 0.035968465302888976, + "grad_norm": 0.6573166847229004, + "learning_rate": 9.992033072271754e-05, + "loss": 0.9076, + "step": 5630 + }, + { + "epoch": 0.036032352452627675, + "grad_norm": 1.0699505805969238, + 
"learning_rate": 9.992004732839261e-05, + "loss": 0.8982, + "step": 5640 + }, + { + "epoch": 0.03609623960236638, + "grad_norm": 0.8025882840156555, + "learning_rate": 9.991976343132963e-05, + "loss": 0.9928, + "step": 5650 + }, + { + "epoch": 0.03616012675210508, + "grad_norm": 0.7112436294555664, + "learning_rate": 9.991947903153143e-05, + "loss": 1.0748, + "step": 5660 + }, + { + "epoch": 0.036224013901843787, + "grad_norm": 0.8061192631721497, + "learning_rate": 9.991919412900091e-05, + "loss": 1.0776, + "step": 5670 + }, + { + "epoch": 0.036287901051582486, + "grad_norm": 3.550689220428467, + "learning_rate": 9.99189087237409e-05, + "loss": 0.8897, + "step": 5680 + }, + { + "epoch": 0.036351788201321185, + "grad_norm": 0.6956158876419067, + "learning_rate": 9.991862281575431e-05, + "loss": 0.9601, + "step": 5690 + }, + { + "epoch": 0.03641567535105989, + "grad_norm": 2.4917612075805664, + "learning_rate": 9.991833640504397e-05, + "loss": 1.2047, + "step": 5700 + }, + { + "epoch": 0.03647956250079859, + "grad_norm": 0.8588683009147644, + "learning_rate": 9.991804949161284e-05, + "loss": 0.8791, + "step": 5710 + }, + { + "epoch": 0.03654344965053729, + "grad_norm": 1.8225440979003906, + "learning_rate": 9.991776207546373e-05, + "loss": 1.1723, + "step": 5720 + }, + { + "epoch": 0.036607336800275996, + "grad_norm": 0.6750584244728088, + "learning_rate": 9.991747415659959e-05, + "loss": 1.0424, + "step": 5730 + }, + { + "epoch": 0.036671223950014695, + "grad_norm": 1.0814725160598755, + "learning_rate": 9.99171857350233e-05, + "loss": 0.7245, + "step": 5740 + }, + { + "epoch": 0.036735111099753394, + "grad_norm": 0.6731589436531067, + "learning_rate": 9.991689681073776e-05, + "loss": 0.7107, + "step": 5750 + }, + { + "epoch": 0.0367989982494921, + "grad_norm": 1.090672492980957, + "learning_rate": 9.991660738374589e-05, + "loss": 1.1092, + "step": 5760 + }, + { + "epoch": 0.0368628853992308, + "grad_norm": 0.9638064503669739, + "learning_rate": 
9.991631745405059e-05, + "loss": 1.0152, + "step": 5770 + }, + { + "epoch": 0.0369267725489695, + "grad_norm": 0.6535985469818115, + "learning_rate": 9.99160270216548e-05, + "loss": 0.9731, + "step": 5780 + }, + { + "epoch": 0.036990659698708205, + "grad_norm": 0.8303619623184204, + "learning_rate": 9.991573608656144e-05, + "loss": 1.0109, + "step": 5790 + }, + { + "epoch": 0.037054546848446904, + "grad_norm": 0.8238627910614014, + "learning_rate": 9.991544464877342e-05, + "loss": 1.1488, + "step": 5800 + }, + { + "epoch": 0.0371184339981856, + "grad_norm": 0.7430026531219482, + "learning_rate": 9.991515270829369e-05, + "loss": 0.9808, + "step": 5810 + }, + { + "epoch": 0.03718232114792431, + "grad_norm": 1.1487149000167847, + "learning_rate": 9.99148602651252e-05, + "loss": 0.8169, + "step": 5820 + }, + { + "epoch": 0.03724620829766301, + "grad_norm": 0.8699382543563843, + "learning_rate": 9.991456731927087e-05, + "loss": 0.9892, + "step": 5830 + }, + { + "epoch": 0.03731009544740171, + "grad_norm": 0.92801833152771, + "learning_rate": 9.991427387073367e-05, + "loss": 1.1314, + "step": 5840 + }, + { + "epoch": 0.037373982597140414, + "grad_norm": 0.9899303913116455, + "learning_rate": 9.991397991951656e-05, + "loss": 0.7899, + "step": 5850 + }, + { + "epoch": 0.03743786974687911, + "grad_norm": 0.6273317933082581, + "learning_rate": 9.991368546562249e-05, + "loss": 1.0946, + "step": 5860 + }, + { + "epoch": 0.03750175689661781, + "grad_norm": 1.1781492233276367, + "learning_rate": 9.991339050905442e-05, + "loss": 0.9631, + "step": 5870 + }, + { + "epoch": 0.03756564404635652, + "grad_norm": 1.5557823181152344, + "learning_rate": 9.991309504981533e-05, + "loss": 0.8755, + "step": 5880 + }, + { + "epoch": 0.03762953119609522, + "grad_norm": 1.418256402015686, + "learning_rate": 9.991279908790818e-05, + "loss": 1.0737, + "step": 5890 + }, + { + "epoch": 0.037693418345833916, + "grad_norm": 1.275620460510254, + "learning_rate": 9.991250262333597e-05, + "loss": 0.7169, 
+ "step": 5900 + }, + { + "epoch": 0.03775730549557262, + "grad_norm": 0.9257436394691467, + "learning_rate": 9.991220565610169e-05, + "loss": 1.0117, + "step": 5910 + }, + { + "epoch": 0.03782119264531132, + "grad_norm": 0.6086337566375732, + "learning_rate": 9.99119081862083e-05, + "loss": 0.9319, + "step": 5920 + }, + { + "epoch": 0.03788507979505002, + "grad_norm": 1.3489453792572021, + "learning_rate": 9.991161021365882e-05, + "loss": 1.1381, + "step": 5930 + }, + { + "epoch": 0.03794896694478873, + "grad_norm": 0.7379159927368164, + "learning_rate": 9.991131173845624e-05, + "loss": 1.1553, + "step": 5940 + }, + { + "epoch": 0.038012854094527426, + "grad_norm": 0.8401197195053101, + "learning_rate": 9.991101276060358e-05, + "loss": 0.8074, + "step": 5950 + }, + { + "epoch": 0.038076741244266125, + "grad_norm": 1.0958367586135864, + "learning_rate": 9.991071328010384e-05, + "loss": 1.1319, + "step": 5960 + }, + { + "epoch": 0.03814062839400483, + "grad_norm": 0.9215190410614014, + "learning_rate": 9.991041329696005e-05, + "loss": 1.1632, + "step": 5970 + }, + { + "epoch": 0.03820451554374353, + "grad_norm": 1.5827072858810425, + "learning_rate": 9.991011281117521e-05, + "loss": 0.9153, + "step": 5980 + }, + { + "epoch": 0.03826840269348223, + "grad_norm": 0.67779141664505, + "learning_rate": 9.990981182275236e-05, + "loss": 0.968, + "step": 5990 + }, + { + "epoch": 0.038332289843220936, + "grad_norm": 1.1568547487258911, + "learning_rate": 9.990951033169451e-05, + "loss": 0.9781, + "step": 6000 + }, + { + "epoch": 0.038396176992959635, + "grad_norm": 0.7177845239639282, + "learning_rate": 9.990920833800472e-05, + "loss": 0.9362, + "step": 6010 + }, + { + "epoch": 0.038460064142698334, + "grad_norm": 0.7867560982704163, + "learning_rate": 9.990890584168604e-05, + "loss": 0.8053, + "step": 6020 + }, + { + "epoch": 0.03852395129243704, + "grad_norm": 0.9753761887550354, + "learning_rate": 9.990860284274148e-05, + "loss": 0.9772, + "step": 6030 + }, + { + "epoch": 
0.03858783844217574, + "grad_norm": 1.043918490409851, + "learning_rate": 9.990829934117413e-05, + "loss": 1.0062, + "step": 6040 + }, + { + "epoch": 0.03865172559191444, + "grad_norm": 0.6653173565864563, + "learning_rate": 9.990799533698703e-05, + "loss": 0.946, + "step": 6050 + }, + { + "epoch": 0.038715612741653145, + "grad_norm": 0.6706075072288513, + "learning_rate": 9.990769083018322e-05, + "loss": 0.9202, + "step": 6060 + }, + { + "epoch": 0.038779499891391844, + "grad_norm": 1.005500078201294, + "learning_rate": 9.99073858207658e-05, + "loss": 1.2583, + "step": 6070 + }, + { + "epoch": 0.03884338704113055, + "grad_norm": 0.9135782122612, + "learning_rate": 9.990708030873783e-05, + "loss": 1.1592, + "step": 6080 + }, + { + "epoch": 0.03890727419086925, + "grad_norm": 0.8927890658378601, + "learning_rate": 9.990677429410237e-05, + "loss": 1.0624, + "step": 6090 + }, + { + "epoch": 0.03897116134060795, + "grad_norm": 1.1654282808303833, + "learning_rate": 9.990646777686255e-05, + "loss": 0.8439, + "step": 6100 + }, + { + "epoch": 0.039035048490346655, + "grad_norm": 0.5983591079711914, + "learning_rate": 9.99061607570214e-05, + "loss": 0.8542, + "step": 6110 + }, + { + "epoch": 0.039098935640085354, + "grad_norm": 0.9841302633285522, + "learning_rate": 9.990585323458204e-05, + "loss": 1.0852, + "step": 6120 + }, + { + "epoch": 0.03916282278982405, + "grad_norm": 1.078748106956482, + "learning_rate": 9.990554520954755e-05, + "loss": 0.8696, + "step": 6130 + }, + { + "epoch": 0.03922670993956276, + "grad_norm": 0.9046047925949097, + "learning_rate": 9.990523668192106e-05, + "loss": 0.9837, + "step": 6140 + }, + { + "epoch": 0.03929059708930146, + "grad_norm": 0.6112083196640015, + "learning_rate": 9.990492765170567e-05, + "loss": 1.445, + "step": 6150 + }, + { + "epoch": 0.03935448423904016, + "grad_norm": 0.8192219138145447, + "learning_rate": 9.990461811890447e-05, + "loss": 0.7521, + "step": 6160 + }, + { + "epoch": 0.039418371388778864, + "grad_norm": 
1.2310230731964111, + "learning_rate": 9.99043080835206e-05, + "loss": 0.7873, + "step": 6170 + }, + { + "epoch": 0.03948225853851756, + "grad_norm": 0.5166013836860657, + "learning_rate": 9.990399754555717e-05, + "loss": 1.0726, + "step": 6180 + }, + { + "epoch": 0.03954614568825626, + "grad_norm": 0.8496847748756409, + "learning_rate": 9.990368650501731e-05, + "loss": 0.8312, + "step": 6190 + }, + { + "epoch": 0.03961003283799497, + "grad_norm": 1.445300579071045, + "learning_rate": 9.990337496190416e-05, + "loss": 0.8953, + "step": 6200 + }, + { + "epoch": 0.03967391998773367, + "grad_norm": 2.797938108444214, + "learning_rate": 9.990306291622085e-05, + "loss": 0.8305, + "step": 6210 + }, + { + "epoch": 0.03973780713747237, + "grad_norm": 0.5867908596992493, + "learning_rate": 9.990275036797054e-05, + "loss": 0.7997, + "step": 6220 + }, + { + "epoch": 0.03980169428721107, + "grad_norm": 0.5474823713302612, + "learning_rate": 9.990243731715634e-05, + "loss": 1.3339, + "step": 6230 + }, + { + "epoch": 0.03986558143694977, + "grad_norm": 1.1061484813690186, + "learning_rate": 9.990212376378143e-05, + "loss": 0.8513, + "step": 6240 + }, + { + "epoch": 0.03992946858668847, + "grad_norm": 1.0674853324890137, + "learning_rate": 9.990180970784897e-05, + "loss": 1.0124, + "step": 6250 + }, + { + "epoch": 0.03999335573642718, + "grad_norm": 0.7848487496376038, + "learning_rate": 9.99014951493621e-05, + "loss": 0.9535, + "step": 6260 + }, + { + "epoch": 0.04005724288616588, + "grad_norm": 0.7292889356613159, + "learning_rate": 9.9901180088324e-05, + "loss": 1.1792, + "step": 6270 + }, + { + "epoch": 0.040121130035904576, + "grad_norm": 0.7035486698150635, + "learning_rate": 9.990086452473785e-05, + "loss": 0.8471, + "step": 6280 + }, + { + "epoch": 0.04018501718564328, + "grad_norm": 0.6115634441375732, + "learning_rate": 9.990054845860683e-05, + "loss": 1.1244, + "step": 6290 + }, + { + "epoch": 0.04024890433538198, + "grad_norm": 2.171461582183838, + "learning_rate": 
9.990023188993412e-05, + "loss": 1.0045, + "step": 6300 + }, + { + "epoch": 0.04031279148512068, + "grad_norm": 0.8362821936607361, + "learning_rate": 9.989991481872292e-05, + "loss": 1.0352, + "step": 6310 + }, + { + "epoch": 0.040376678634859386, + "grad_norm": 0.8392160534858704, + "learning_rate": 9.989959724497638e-05, + "loss": 0.785, + "step": 6320 + }, + { + "epoch": 0.040440565784598086, + "grad_norm": 0.4593855142593384, + "learning_rate": 9.989927916869773e-05, + "loss": 0.8819, + "step": 6330 + }, + { + "epoch": 0.040504452934336785, + "grad_norm": 0.6949111223220825, + "learning_rate": 9.98989605898902e-05, + "loss": 0.9824, + "step": 6340 + }, + { + "epoch": 0.04056834008407549, + "grad_norm": 0.6681846976280212, + "learning_rate": 9.989864150855693e-05, + "loss": 0.7795, + "step": 6350 + }, + { + "epoch": 0.04063222723381419, + "grad_norm": 0.9278548359870911, + "learning_rate": 9.989832192470118e-05, + "loss": 0.9975, + "step": 6360 + }, + { + "epoch": 0.04069611438355289, + "grad_norm": 0.7522639632225037, + "learning_rate": 9.989800183832616e-05, + "loss": 1.0204, + "step": 6370 + }, + { + "epoch": 0.040760001533291595, + "grad_norm": 0.9609561562538147, + "learning_rate": 9.98976812494351e-05, + "loss": 1.0157, + "step": 6380 + }, + { + "epoch": 0.040823888683030295, + "grad_norm": 0.7092857956886292, + "learning_rate": 9.989736015803123e-05, + "loss": 0.9443, + "step": 6390 + }, + { + "epoch": 0.040887775832768994, + "grad_norm": 4.257565498352051, + "learning_rate": 9.989703856411776e-05, + "loss": 1.134, + "step": 6400 + }, + { + "epoch": 0.0409516629825077, + "grad_norm": 1.1755651235580444, + "learning_rate": 9.989671646769796e-05, + "loss": 1.1108, + "step": 6410 + }, + { + "epoch": 0.0410155501322464, + "grad_norm": 0.8459087610244751, + "learning_rate": 9.989639386877505e-05, + "loss": 1.0194, + "step": 6420 + }, + { + "epoch": 0.0410794372819851, + "grad_norm": 1.175000786781311, + "learning_rate": 9.989607076735229e-05, + "loss": 
0.8072, + "step": 6430 + }, + { + "epoch": 0.041143324431723804, + "grad_norm": 1.2269272804260254, + "learning_rate": 9.989574716343294e-05, + "loss": 1.1758, + "step": 6440 + }, + { + "epoch": 0.041207211581462504, + "grad_norm": 0.7292816042900085, + "learning_rate": 9.989542305702022e-05, + "loss": 0.9037, + "step": 6450 + }, + { + "epoch": 0.0412710987312012, + "grad_norm": 1.1013445854187012, + "learning_rate": 9.989509844811745e-05, + "loss": 0.7594, + "step": 6460 + }, + { + "epoch": 0.04133498588093991, + "grad_norm": 1.5162911415100098, + "learning_rate": 9.989477333672787e-05, + "loss": 0.8458, + "step": 6470 + }, + { + "epoch": 0.04139887303067861, + "grad_norm": 0.5727777481079102, + "learning_rate": 9.989444772285475e-05, + "loss": 1.0281, + "step": 6480 + }, + { + "epoch": 0.041462760180417314, + "grad_norm": 0.940905749797821, + "learning_rate": 9.989412160650137e-05, + "loss": 0.8714, + "step": 6490 + }, + { + "epoch": 0.041526647330156014, + "grad_norm": 1.0898019075393677, + "learning_rate": 9.989379498767104e-05, + "loss": 0.8905, + "step": 6500 + }, + { + "epoch": 0.04159053447989471, + "grad_norm": 1.05965256690979, + "learning_rate": 9.989346786636701e-05, + "loss": 1.0419, + "step": 6510 + }, + { + "epoch": 0.04165442162963342, + "grad_norm": 1.0670409202575684, + "learning_rate": 9.989314024259262e-05, + "loss": 0.7306, + "step": 6520 + }, + { + "epoch": 0.04171830877937212, + "grad_norm": 0.9134021401405334, + "learning_rate": 9.989281211635114e-05, + "loss": 0.9002, + "step": 6530 + }, + { + "epoch": 0.04178219592911082, + "grad_norm": 0.9163311719894409, + "learning_rate": 9.989248348764586e-05, + "loss": 0.9131, + "step": 6540 + }, + { + "epoch": 0.04184608307884952, + "grad_norm": 0.6874496936798096, + "learning_rate": 9.989215435648011e-05, + "loss": 0.9497, + "step": 6550 + }, + { + "epoch": 0.04190997022858822, + "grad_norm": 0.9504197239875793, + "learning_rate": 9.989182472285721e-05, + "loss": 1.06, + "step": 6560 + }, + { + 
"epoch": 0.04197385737832692, + "grad_norm": 0.794982373714447, + "learning_rate": 9.989149458678046e-05, + "loss": 0.8137, + "step": 6570 + }, + { + "epoch": 0.04203774452806563, + "grad_norm": 0.9030359983444214, + "learning_rate": 9.989116394825322e-05, + "loss": 0.7989, + "step": 6580 + }, + { + "epoch": 0.04210163167780433, + "grad_norm": 0.7701511979103088, + "learning_rate": 9.989083280727878e-05, + "loss": 1.0566, + "step": 6590 + }, + { + "epoch": 0.042165518827543026, + "grad_norm": 0.8130073547363281, + "learning_rate": 9.98905011638605e-05, + "loss": 0.9397, + "step": 6600 + }, + { + "epoch": 0.04222940597728173, + "grad_norm": 0.6246233582496643, + "learning_rate": 9.989016901800171e-05, + "loss": 0.8776, + "step": 6610 + }, + { + "epoch": 0.04229329312702043, + "grad_norm": 0.7861520648002625, + "learning_rate": 9.988983636970576e-05, + "loss": 1.0794, + "step": 6620 + }, + { + "epoch": 0.04235718027675913, + "grad_norm": 1.3345977067947388, + "learning_rate": 9.988950321897599e-05, + "loss": 0.8676, + "step": 6630 + }, + { + "epoch": 0.04242106742649784, + "grad_norm": 0.56337571144104, + "learning_rate": 9.988916956581577e-05, + "loss": 0.8426, + "step": 6640 + }, + { + "epoch": 0.042484954576236536, + "grad_norm": 1.3534024953842163, + "learning_rate": 9.988883541022844e-05, + "loss": 0.7897, + "step": 6650 + }, + { + "epoch": 0.042548841725975235, + "grad_norm": 1.3062078952789307, + "learning_rate": 9.988850075221738e-05, + "loss": 1.1495, + "step": 6660 + }, + { + "epoch": 0.04261272887571394, + "grad_norm": 0.8563300967216492, + "learning_rate": 9.988816559178597e-05, + "loss": 0.7691, + "step": 6670 + }, + { + "epoch": 0.04267661602545264, + "grad_norm": 0.6267048120498657, + "learning_rate": 9.988782992893757e-05, + "loss": 0.9558, + "step": 6680 + }, + { + "epoch": 0.04274050317519134, + "grad_norm": 1.3723206520080566, + "learning_rate": 9.988749376367556e-05, + "loss": 0.9185, + "step": 6690 + }, + { + "epoch": 0.042804390324930046, + 
"grad_norm": 1.9447133541107178, + "learning_rate": 9.988715709600332e-05, + "loss": 1.0383, + "step": 6700 + }, + { + "epoch": 0.042868277474668745, + "grad_norm": 0.8852369785308838, + "learning_rate": 9.988681992592426e-05, + "loss": 0.8813, + "step": 6710 + }, + { + "epoch": 0.042932164624407444, + "grad_norm": 2.174041986465454, + "learning_rate": 9.988648225344177e-05, + "loss": 0.9662, + "step": 6720 + }, + { + "epoch": 0.04299605177414615, + "grad_norm": 1.9878665208816528, + "learning_rate": 9.988614407855924e-05, + "loss": 0.9924, + "step": 6730 + }, + { + "epoch": 0.04305993892388485, + "grad_norm": 0.9836265444755554, + "learning_rate": 9.988580540128008e-05, + "loss": 1.2755, + "step": 6740 + }, + { + "epoch": 0.04312382607362355, + "grad_norm": 0.999160647392273, + "learning_rate": 9.98854662216077e-05, + "loss": 0.9726, + "step": 6750 + }, + { + "epoch": 0.043187713223362255, + "grad_norm": 1.9516860246658325, + "learning_rate": 9.988512653954552e-05, + "loss": 0.7816, + "step": 6760 + }, + { + "epoch": 0.043251600373100954, + "grad_norm": 0.7745450735092163, + "learning_rate": 9.988478635509696e-05, + "loss": 0.7726, + "step": 6770 + }, + { + "epoch": 0.04331548752283965, + "grad_norm": 0.8929428458213806, + "learning_rate": 9.988444566826544e-05, + "loss": 1.0001, + "step": 6780 + }, + { + "epoch": 0.04337937467257836, + "grad_norm": 0.895820140838623, + "learning_rate": 9.98841044790544e-05, + "loss": 0.8765, + "step": 6790 + }, + { + "epoch": 0.04344326182231706, + "grad_norm": 0.6711694598197937, + "learning_rate": 9.988376278746727e-05, + "loss": 0.9975, + "step": 6800 + }, + { + "epoch": 0.04350714897205576, + "grad_norm": 0.9492961764335632, + "learning_rate": 9.988342059350751e-05, + "loss": 1.0356, + "step": 6810 + }, + { + "epoch": 0.043571036121794464, + "grad_norm": 0.7187815308570862, + "learning_rate": 9.988307789717853e-05, + "loss": 0.8538, + "step": 6820 + }, + { + "epoch": 0.04363492327153316, + "grad_norm": 0.9014946222305298, + 
"learning_rate": 9.98827346984838e-05, + "loss": 1.0214, + "step": 6830 + }, + { + "epoch": 0.04369881042127186, + "grad_norm": 0.5608994960784912, + "learning_rate": 9.98823909974268e-05, + "loss": 0.8462, + "step": 6840 + }, + { + "epoch": 0.04376269757101057, + "grad_norm": 0.8809041976928711, + "learning_rate": 9.988204679401094e-05, + "loss": 0.813, + "step": 6850 + }, + { + "epoch": 0.04382658472074927, + "grad_norm": 0.7527191638946533, + "learning_rate": 9.988170208823972e-05, + "loss": 1.0194, + "step": 6860 + }, + { + "epoch": 0.04389047187048797, + "grad_norm": 0.7817595601081848, + "learning_rate": 9.988135688011662e-05, + "loss": 0.8165, + "step": 6870 + }, + { + "epoch": 0.04395435902022667, + "grad_norm": 0.8186140656471252, + "learning_rate": 9.988101116964508e-05, + "loss": 0.8789, + "step": 6880 + }, + { + "epoch": 0.04401824616996537, + "grad_norm": 0.6612401008605957, + "learning_rate": 9.988066495682863e-05, + "loss": 0.8621, + "step": 6890 + }, + { + "epoch": 0.04408213331970407, + "grad_norm": 0.8166273832321167, + "learning_rate": 9.988031824167073e-05, + "loss": 1.0722, + "step": 6900 + }, + { + "epoch": 0.04414602046944278, + "grad_norm": 1.0065597295761108, + "learning_rate": 9.987997102417486e-05, + "loss": 1.082, + "step": 6910 + }, + { + "epoch": 0.04420990761918148, + "grad_norm": 1.0010764598846436, + "learning_rate": 9.987962330434452e-05, + "loss": 0.8206, + "step": 6920 + }, + { + "epoch": 0.04427379476892018, + "grad_norm": 0.7217119932174683, + "learning_rate": 9.987927508218324e-05, + "loss": 0.8516, + "step": 6930 + }, + { + "epoch": 0.04433768191865888, + "grad_norm": 1.464766502380371, + "learning_rate": 9.987892635769449e-05, + "loss": 1.1353, + "step": 6940 + }, + { + "epoch": 0.04440156906839758, + "grad_norm": 0.887629508972168, + "learning_rate": 9.987857713088182e-05, + "loss": 0.8636, + "step": 6950 + }, + { + "epoch": 0.04446545621813629, + "grad_norm": 1.562030553817749, + "learning_rate": 9.987822740174871e-05, + 
"loss": 1.2412, + "step": 6960 + }, + { + "epoch": 0.044529343367874986, + "grad_norm": 0.6418665647506714, + "learning_rate": 9.987787717029871e-05, + "loss": 1.1301, + "step": 6970 + }, + { + "epoch": 0.044593230517613686, + "grad_norm": 0.7377752065658569, + "learning_rate": 9.987752643653533e-05, + "loss": 0.89, + "step": 6980 + }, + { + "epoch": 0.04465711766735239, + "grad_norm": 0.709084689617157, + "learning_rate": 9.987717520046211e-05, + "loss": 0.9194, + "step": 6990 + }, + { + "epoch": 0.04472100481709109, + "grad_norm": 0.7699615359306335, + "learning_rate": 9.98768234620826e-05, + "loss": 0.995, + "step": 7000 + }, + { + "epoch": 0.04478489196682979, + "grad_norm": 0.8531057238578796, + "learning_rate": 9.987647122140031e-05, + "loss": 0.8096, + "step": 7010 + }, + { + "epoch": 0.044848779116568496, + "grad_norm": 1.1459274291992188, + "learning_rate": 9.987611847841883e-05, + "loss": 0.9038, + "step": 7020 + }, + { + "epoch": 0.044912666266307195, + "grad_norm": 0.966291606426239, + "learning_rate": 9.987576523314167e-05, + "loss": 0.9996, + "step": 7030 + }, + { + "epoch": 0.044976553416045895, + "grad_norm": 1.0549588203430176, + "learning_rate": 9.987541148557238e-05, + "loss": 0.7135, + "step": 7040 + }, + { + "epoch": 0.0450404405657846, + "grad_norm": 0.8475518226623535, + "learning_rate": 9.987505723571458e-05, + "loss": 0.7685, + "step": 7050 + }, + { + "epoch": 0.0451043277155233, + "grad_norm": 0.8754829168319702, + "learning_rate": 9.98747024835718e-05, + "loss": 0.9184, + "step": 7060 + }, + { + "epoch": 0.045168214865262, + "grad_norm": 0.8908385038375854, + "learning_rate": 9.987434722914762e-05, + "loss": 1.0456, + "step": 7070 + }, + { + "epoch": 0.045232102015000705, + "grad_norm": 0.9609813094139099, + "learning_rate": 9.987399147244562e-05, + "loss": 1.1562, + "step": 7080 + }, + { + "epoch": 0.045295989164739404, + "grad_norm": 0.681609034538269, + "learning_rate": 9.987363521346937e-05, + "loss": 0.8802, + "step": 7090 + }, + { + 
"epoch": 0.045359876314478104, + "grad_norm": 0.6809660792350769, + "learning_rate": 9.987327845222246e-05, + "loss": 0.9104, + "step": 7100 + }, + { + "epoch": 0.04542376346421681, + "grad_norm": 0.5972456932067871, + "learning_rate": 9.98729211887085e-05, + "loss": 0.9686, + "step": 7110 + }, + { + "epoch": 0.04548765061395551, + "grad_norm": 2.145796537399292, + "learning_rate": 9.987256342293108e-05, + "loss": 0.8764, + "step": 7120 + }, + { + "epoch": 0.04555153776369421, + "grad_norm": 1.2157313823699951, + "learning_rate": 9.98722051548938e-05, + "loss": 0.8955, + "step": 7130 + }, + { + "epoch": 0.045615424913432914, + "grad_norm": 0.8759172558784485, + "learning_rate": 9.987184638460026e-05, + "loss": 0.8679, + "step": 7140 + }, + { + "epoch": 0.04567931206317161, + "grad_norm": 1.0199391841888428, + "learning_rate": 9.987148711205408e-05, + "loss": 0.7592, + "step": 7150 + }, + { + "epoch": 0.04574319921291031, + "grad_norm": 0.7216569185256958, + "learning_rate": 9.98711273372589e-05, + "loss": 0.7954, + "step": 7160 + }, + { + "epoch": 0.04580708636264902, + "grad_norm": 1.0680534839630127, + "learning_rate": 9.98707670602183e-05, + "loss": 1.0779, + "step": 7170 + }, + { + "epoch": 0.04587097351238772, + "grad_norm": 0.9365562796592712, + "learning_rate": 9.987040628093594e-05, + "loss": 1.0918, + "step": 7180 + }, + { + "epoch": 0.04593486066212642, + "grad_norm": 1.0162864923477173, + "learning_rate": 9.987004499941545e-05, + "loss": 0.791, + "step": 7190 + }, + { + "epoch": 0.04599874781186512, + "grad_norm": 0.9427816271781921, + "learning_rate": 9.986968321566045e-05, + "loss": 0.8263, + "step": 7200 + }, + { + "epoch": 0.04606263496160382, + "grad_norm": 0.9530696868896484, + "learning_rate": 9.98693209296746e-05, + "loss": 1.0719, + "step": 7210 + }, + { + "epoch": 0.04612652211134252, + "grad_norm": 0.687778890132904, + "learning_rate": 9.986895814146156e-05, + "loss": 0.8541, + "step": 7220 + }, + { + "epoch": 0.04619040926108123, + 
"grad_norm": 0.8100598454475403, + "learning_rate": 9.986859485102495e-05, + "loss": 1.0194, + "step": 7230 + }, + { + "epoch": 0.04625429641081993, + "grad_norm": 0.5516176819801331, + "learning_rate": 9.986823105836847e-05, + "loss": 0.8347, + "step": 7240 + }, + { + "epoch": 0.046318183560558626, + "grad_norm": 0.8812345862388611, + "learning_rate": 9.986786676349573e-05, + "loss": 1.0472, + "step": 7250 + }, + { + "epoch": 0.04638207071029733, + "grad_norm": 1.0025354623794556, + "learning_rate": 9.986750196641047e-05, + "loss": 1.0196, + "step": 7260 + }, + { + "epoch": 0.04644595786003603, + "grad_norm": 1.2470890283584595, + "learning_rate": 9.986713666711629e-05, + "loss": 0.7237, + "step": 7270 + }, + { + "epoch": 0.04650984500977473, + "grad_norm": 0.7719841599464417, + "learning_rate": 9.986677086561691e-05, + "loss": 0.9012, + "step": 7280 + }, + { + "epoch": 0.04657373215951344, + "grad_norm": 0.5865141749382019, + "learning_rate": 9.9866404561916e-05, + "loss": 0.7885, + "step": 7290 + }, + { + "epoch": 0.046637619309252136, + "grad_norm": 0.8722718954086304, + "learning_rate": 9.986603775601728e-05, + "loss": 0.9654, + "step": 7300 + }, + { + "epoch": 0.046701506458990835, + "grad_norm": 0.9440786838531494, + "learning_rate": 9.98656704479244e-05, + "loss": 1.1144, + "step": 7310 + }, + { + "epoch": 0.04676539360872954, + "grad_norm": 0.8505666851997375, + "learning_rate": 9.986530263764108e-05, + "loss": 0.9502, + "step": 7320 + }, + { + "epoch": 0.04682928075846824, + "grad_norm": 0.7318026423454285, + "learning_rate": 9.986493432517103e-05, + "loss": 0.6851, + "step": 7330 + }, + { + "epoch": 0.04689316790820695, + "grad_norm": 1.4378130435943604, + "learning_rate": 9.986456551051795e-05, + "loss": 0.8454, + "step": 7340 + }, + { + "epoch": 0.046957055057945646, + "grad_norm": 0.9807822704315186, + "learning_rate": 9.986419619368554e-05, + "loss": 1.0638, + "step": 7350 + }, + { + "epoch": 0.047020942207684345, + "grad_norm": 1.2284691333770752, + 
"learning_rate": 9.986382637467757e-05, + "loss": 0.9615, + "step": 7360 + }, + { + "epoch": 0.04708482935742305, + "grad_norm": 0.7769535183906555, + "learning_rate": 9.986345605349769e-05, + "loss": 0.8708, + "step": 7370 + }, + { + "epoch": 0.04714871650716175, + "grad_norm": 1.48138427734375, + "learning_rate": 9.98630852301497e-05, + "loss": 0.7993, + "step": 7380 + }, + { + "epoch": 0.04721260365690045, + "grad_norm": 0.605939507484436, + "learning_rate": 9.986271390463728e-05, + "loss": 0.8898, + "step": 7390 + }, + { + "epoch": 0.047276490806639156, + "grad_norm": 0.7884547710418701, + "learning_rate": 9.986234207696421e-05, + "loss": 0.9975, + "step": 7400 + }, + { + "epoch": 0.047340377956377855, + "grad_norm": 0.9767579436302185, + "learning_rate": 9.986196974713422e-05, + "loss": 0.9493, + "step": 7410 + }, + { + "epoch": 0.047404265106116554, + "grad_norm": 0.9091633558273315, + "learning_rate": 9.986159691515105e-05, + "loss": 0.7876, + "step": 7420 + }, + { + "epoch": 0.04746815225585526, + "grad_norm": 0.6155557036399841, + "learning_rate": 9.986122358101847e-05, + "loss": 0.5978, + "step": 7430 + }, + { + "epoch": 0.04753203940559396, + "grad_norm": 0.8261324763298035, + "learning_rate": 9.986084974474024e-05, + "loss": 0.9533, + "step": 7440 + }, + { + "epoch": 0.04759592655533266, + "grad_norm": 0.5973717570304871, + "learning_rate": 9.98604754063201e-05, + "loss": 0.8045, + "step": 7450 + }, + { + "epoch": 0.047659813705071365, + "grad_norm": 1.0176916122436523, + "learning_rate": 9.986010056576184e-05, + "loss": 1.0215, + "step": 7460 + }, + { + "epoch": 0.047723700854810064, + "grad_norm": 0.5865172147750854, + "learning_rate": 9.985972522306923e-05, + "loss": 0.7648, + "step": 7470 + }, + { + "epoch": 0.04778758800454876, + "grad_norm": 1.0286486148834229, + "learning_rate": 9.985934937824605e-05, + "loss": 0.8718, + "step": 7480 + }, + { + "epoch": 0.04785147515428747, + "grad_norm": 1.0322641134262085, + "learning_rate": 
9.98589730312961e-05, + "loss": 0.9538, + "step": 7490 + }, + { + "epoch": 0.04791536230402617, + "grad_norm": 0.8804035782814026, + "learning_rate": 9.985859618222316e-05, + "loss": 0.7283, + "step": 7500 + }, + { + "epoch": 0.04797924945376487, + "grad_norm": 0.7622368931770325, + "learning_rate": 9.985821883103102e-05, + "loss": 0.7618, + "step": 7510 + }, + { + "epoch": 0.048043136603503574, + "grad_norm": 1.1401050090789795, + "learning_rate": 9.985784097772347e-05, + "loss": 1.0667, + "step": 7520 + }, + { + "epoch": 0.04810702375324227, + "grad_norm": 0.6780824661254883, + "learning_rate": 9.985746262230433e-05, + "loss": 0.9327, + "step": 7530 + }, + { + "epoch": 0.04817091090298097, + "grad_norm": 1.0564121007919312, + "learning_rate": 9.985708376477743e-05, + "loss": 0.857, + "step": 7540 + }, + { + "epoch": 0.04823479805271968, + "grad_norm": 0.45248645544052124, + "learning_rate": 9.985670440514654e-05, + "loss": 0.7797, + "step": 7550 + }, + { + "epoch": 0.04829868520245838, + "grad_norm": 0.9228289127349854, + "learning_rate": 9.985632454341551e-05, + "loss": 1.2661, + "step": 7560 + }, + { + "epoch": 0.04836257235219708, + "grad_norm": 0.665448784828186, + "learning_rate": 9.985594417958816e-05, + "loss": 0.8736, + "step": 7570 + }, + { + "epoch": 0.04842645950193578, + "grad_norm": 0.7093620896339417, + "learning_rate": 9.985556331366832e-05, + "loss": 0.9296, + "step": 7580 + }, + { + "epoch": 0.04849034665167448, + "grad_norm": 1.1496485471725464, + "learning_rate": 9.985518194565983e-05, + "loss": 1.0429, + "step": 7590 + }, + { + "epoch": 0.04855423380141318, + "grad_norm": 0.8305206298828125, + "learning_rate": 9.985480007556653e-05, + "loss": 0.9499, + "step": 7600 + }, + { + "epoch": 0.04861812095115189, + "grad_norm": 0.8451396822929382, + "learning_rate": 9.985441770339226e-05, + "loss": 0.9502, + "step": 7610 + }, + { + "epoch": 0.048682008100890586, + "grad_norm": 1.2433000802993774, + "learning_rate": 9.985403482914087e-05, + "loss": 
0.6543, + "step": 7620 + }, + { + "epoch": 0.048745895250629286, + "grad_norm": 0.8674241304397583, + "learning_rate": 9.985365145281622e-05, + "loss": 1.1627, + "step": 7630 + }, + { + "epoch": 0.04880978240036799, + "grad_norm": 0.5980839133262634, + "learning_rate": 9.985326757442217e-05, + "loss": 1.1205, + "step": 7640 + }, + { + "epoch": 0.04887366955010669, + "grad_norm": 1.4166803359985352, + "learning_rate": 9.98528831939626e-05, + "loss": 0.8682, + "step": 7650 + }, + { + "epoch": 0.04893755669984539, + "grad_norm": 0.8415298461914062, + "learning_rate": 9.985249831144135e-05, + "loss": 0.9133, + "step": 7660 + }, + { + "epoch": 0.049001443849584096, + "grad_norm": 1.0600535869598389, + "learning_rate": 9.985211292686231e-05, + "loss": 0.9593, + "step": 7670 + }, + { + "epoch": 0.049065330999322795, + "grad_norm": 0.5692518353462219, + "learning_rate": 9.985172704022939e-05, + "loss": 1.1105, + "step": 7680 + }, + { + "epoch": 0.049129218149061495, + "grad_norm": 1.1608545780181885, + "learning_rate": 9.985134065154643e-05, + "loss": 0.9287, + "step": 7690 + }, + { + "epoch": 0.0491931052988002, + "grad_norm": 0.9091508984565735, + "learning_rate": 9.985095376081734e-05, + "loss": 0.8312, + "step": 7700 + }, + { + "epoch": 0.0492569924485389, + "grad_norm": 0.8366988897323608, + "learning_rate": 9.985056636804604e-05, + "loss": 1.0451, + "step": 7710 + }, + { + "epoch": 0.0493208795982776, + "grad_norm": 1.0978457927703857, + "learning_rate": 9.98501784732364e-05, + "loss": 0.8821, + "step": 7720 + }, + { + "epoch": 0.049384766748016305, + "grad_norm": 1.7002284526824951, + "learning_rate": 9.984979007639233e-05, + "loss": 0.7092, + "step": 7730 + }, + { + "epoch": 0.049448653897755004, + "grad_norm": 1.77642023563385, + "learning_rate": 9.984940117751773e-05, + "loss": 1.0623, + "step": 7740 + }, + { + "epoch": 0.04951254104749371, + "grad_norm": 0.800308883190155, + "learning_rate": 9.984901177661656e-05, + "loss": 1.3445, + "step": 7750 + }, + { + 
"epoch": 0.04957642819723241, + "grad_norm": 0.9408762454986572, + "learning_rate": 9.98486218736927e-05, + "loss": 0.8466, + "step": 7760 + }, + { + "epoch": 0.04964031534697111, + "grad_norm": 0.7024977207183838, + "learning_rate": 9.98482314687501e-05, + "loss": 0.7186, + "step": 7770 + }, + { + "epoch": 0.049704202496709815, + "grad_norm": 0.7420535087585449, + "learning_rate": 9.98478405617927e-05, + "loss": 1.0408, + "step": 7780 + }, + { + "epoch": 0.049768089646448514, + "grad_norm": 1.0378546714782715, + "learning_rate": 9.98474491528244e-05, + "loss": 0.8885, + "step": 7790 + }, + { + "epoch": 0.04983197679618721, + "grad_norm": 1.380505919456482, + "learning_rate": 9.984705724184917e-05, + "loss": 1.113, + "step": 7800 + }, + { + "epoch": 0.04989586394592592, + "grad_norm": 1.8946232795715332, + "learning_rate": 9.984666482887096e-05, + "loss": 0.8355, + "step": 7810 + }, + { + "epoch": 0.04995975109566462, + "grad_norm": 1.4878778457641602, + "learning_rate": 9.98462719138937e-05, + "loss": 0.992, + "step": 7820 + }, + { + "epoch": 0.05002363824540332, + "grad_norm": 0.7730852365493774, + "learning_rate": 9.984587849692136e-05, + "loss": 0.7539, + "step": 7830 + }, + { + "epoch": 0.050087525395142024, + "grad_norm": 0.83015376329422, + "learning_rate": 9.984548457795791e-05, + "loss": 0.8696, + "step": 7840 + }, + { + "epoch": 0.05015141254488072, + "grad_norm": 0.7511310577392578, + "learning_rate": 9.98450901570073e-05, + "loss": 0.9013, + "step": 7850 + }, + { + "epoch": 0.05021529969461942, + "grad_norm": 0.9059261679649353, + "learning_rate": 9.984469523407349e-05, + "loss": 0.8444, + "step": 7860 + }, + { + "epoch": 0.05027918684435813, + "grad_norm": 0.9825949668884277, + "learning_rate": 9.98442998091605e-05, + "loss": 0.8864, + "step": 7870 + }, + { + "epoch": 0.05034307399409683, + "grad_norm": 0.904929518699646, + "learning_rate": 9.984390388227228e-05, + "loss": 0.7628, + "step": 7880 + }, + { + "epoch": 0.05040696114383553, + "grad_norm": 
0.736785888671875, + "learning_rate": 9.984350745341284e-05, + "loss": 0.6913, + "step": 7890 + }, + { + "epoch": 0.05047084829357423, + "grad_norm": 0.7877079248428345, + "learning_rate": 9.984311052258615e-05, + "loss": 1.2899, + "step": 7900 + }, + { + "epoch": 0.05053473544331293, + "grad_norm": 3.8321728706359863, + "learning_rate": 9.984271308979622e-05, + "loss": 0.9465, + "step": 7910 + }, + { + "epoch": 0.05059862259305163, + "grad_norm": 0.729813277721405, + "learning_rate": 9.984231515504705e-05, + "loss": 1.1176, + "step": 7920 + }, + { + "epoch": 0.05066250974279034, + "grad_norm": 1.07712984085083, + "learning_rate": 9.984191671834264e-05, + "loss": 0.821, + "step": 7930 + }, + { + "epoch": 0.05072639689252904, + "grad_norm": 0.6421816349029541, + "learning_rate": 9.984151777968701e-05, + "loss": 0.8634, + "step": 7940 + }, + { + "epoch": 0.050790284042267736, + "grad_norm": 1.0871955156326294, + "learning_rate": 9.984111833908419e-05, + "loss": 0.9175, + "step": 7950 + }, + { + "epoch": 0.05085417119200644, + "grad_norm": 0.9562147855758667, + "learning_rate": 9.984071839653817e-05, + "loss": 0.8648, + "step": 7960 + }, + { + "epoch": 0.05091805834174514, + "grad_norm": 0.8465697169303894, + "learning_rate": 9.9840317952053e-05, + "loss": 0.7018, + "step": 7970 + }, + { + "epoch": 0.05098194549148384, + "grad_norm": 0.4053485095500946, + "learning_rate": 9.983991700563273e-05, + "loss": 0.8683, + "step": 7980 + }, + { + "epoch": 0.05104583264122255, + "grad_norm": 0.7025613188743591, + "learning_rate": 9.983951555728135e-05, + "loss": 0.9431, + "step": 7990 + }, + { + "epoch": 0.051109719790961246, + "grad_norm": 0.7401816248893738, + "learning_rate": 9.983911360700296e-05, + "loss": 1.1364, + "step": 8000 + }, + { + "epoch": 0.051173606940699945, + "grad_norm": 0.41972461342811584, + "learning_rate": 9.983871115480155e-05, + "loss": 0.9925, + "step": 8010 + }, + { + "epoch": 0.05123749409043865, + "grad_norm": 0.577347457408905, + "learning_rate": 
9.983830820068123e-05, + "loss": 0.7687, + "step": 8020 + }, + { + "epoch": 0.05130138124017735, + "grad_norm": 0.8155549764633179, + "learning_rate": 9.983790474464601e-05, + "loss": 0.9115, + "step": 8030 + }, + { + "epoch": 0.05136526838991605, + "grad_norm": 0.9730279445648193, + "learning_rate": 9.983750078669998e-05, + "loss": 1.1313, + "step": 8040 + }, + { + "epoch": 0.051429155539654756, + "grad_norm": 0.8205385208129883, + "learning_rate": 9.98370963268472e-05, + "loss": 0.9971, + "step": 8050 + }, + { + "epoch": 0.051493042689393455, + "grad_norm": 0.5464890599250793, + "learning_rate": 9.983669136509175e-05, + "loss": 0.7868, + "step": 8060 + }, + { + "epoch": 0.051556929839132154, + "grad_norm": 1.3623446226119995, + "learning_rate": 9.98362859014377e-05, + "loss": 0.9343, + "step": 8070 + }, + { + "epoch": 0.05162081698887086, + "grad_norm": 0.8901773691177368, + "learning_rate": 9.983587993588914e-05, + "loss": 0.7135, + "step": 8080 + }, + { + "epoch": 0.05168470413860956, + "grad_norm": 0.7160339951515198, + "learning_rate": 9.983547346845015e-05, + "loss": 1.2925, + "step": 8090 + }, + { + "epoch": 0.05174859128834826, + "grad_norm": 0.6623441576957703, + "learning_rate": 9.983506649912482e-05, + "loss": 0.923, + "step": 8100 + }, + { + "epoch": 0.051812478438086965, + "grad_norm": 0.469149112701416, + "learning_rate": 9.983465902791726e-05, + "loss": 0.94, + "step": 8110 + }, + { + "epoch": 0.051876365587825664, + "grad_norm": 0.5665640234947205, + "learning_rate": 9.98342510548316e-05, + "loss": 1.0792, + "step": 8120 + }, + { + "epoch": 0.05194025273756436, + "grad_norm": 1.4578264951705933, + "learning_rate": 9.983384257987189e-05, + "loss": 1.0587, + "step": 8130 + }, + { + "epoch": 0.05200413988730307, + "grad_norm": 0.8157141804695129, + "learning_rate": 9.983343360304227e-05, + "loss": 1.2347, + "step": 8140 + }, + { + "epoch": 0.05206802703704177, + "grad_norm": 0.9772050976753235, + "learning_rate": 9.983302412434688e-05, + "loss": 
1.1827, + "step": 8150 + }, + { + "epoch": 0.05213191418678047, + "grad_norm": 1.2389028072357178, + "learning_rate": 9.983261414378982e-05, + "loss": 0.8998, + "step": 8160 + }, + { + "epoch": 0.052195801336519174, + "grad_norm": 1.1656652688980103, + "learning_rate": 9.983220366137522e-05, + "loss": 0.9351, + "step": 8170 + }, + { + "epoch": 0.05225968848625787, + "grad_norm": 1.1599071025848389, + "learning_rate": 9.983179267710721e-05, + "loss": 0.8263, + "step": 8180 + }, + { + "epoch": 0.05232357563599658, + "grad_norm": 1.0255035161972046, + "learning_rate": 9.983138119098993e-05, + "loss": 1.1271, + "step": 8190 + }, + { + "epoch": 0.05238746278573528, + "grad_norm": 0.7129418849945068, + "learning_rate": 9.983096920302755e-05, + "loss": 0.7378, + "step": 8200 + }, + { + "epoch": 0.05245134993547398, + "grad_norm": 1.268712043762207, + "learning_rate": 9.983055671322421e-05, + "loss": 0.7541, + "step": 8210 + }, + { + "epoch": 0.05251523708521268, + "grad_norm": 0.8625295758247375, + "learning_rate": 9.983014372158403e-05, + "loss": 0.7441, + "step": 8220 + }, + { + "epoch": 0.05257912423495138, + "grad_norm": 3.3473124504089355, + "learning_rate": 9.982973022811122e-05, + "loss": 1.0331, + "step": 8230 + }, + { + "epoch": 0.05264301138469008, + "grad_norm": 1.0163989067077637, + "learning_rate": 9.982931623280989e-05, + "loss": 0.7206, + "step": 8240 + }, + { + "epoch": 0.05270689853442879, + "grad_norm": 0.6889442205429077, + "learning_rate": 9.982890173568426e-05, + "loss": 0.7788, + "step": 8250 + }, + { + "epoch": 0.05277078568416749, + "grad_norm": 0.7175592184066772, + "learning_rate": 9.982848673673846e-05, + "loss": 0.847, + "step": 8260 + }, + { + "epoch": 0.052834672833906186, + "grad_norm": 1.019832730293274, + "learning_rate": 9.98280712359767e-05, + "loss": 0.8542, + "step": 8270 + }, + { + "epoch": 0.05289855998364489, + "grad_norm": 0.9718403220176697, + "learning_rate": 9.982765523340316e-05, + "loss": 1.0609, + "step": 8280 + }, + { + 
"epoch": 0.05296244713338359, + "grad_norm": 2.7732856273651123, + "learning_rate": 9.982723872902202e-05, + "loss": 0.9938, + "step": 8290 + }, + { + "epoch": 0.05302633428312229, + "grad_norm": 0.6997831463813782, + "learning_rate": 9.982682172283748e-05, + "loss": 0.7695, + "step": 8300 + }, + { + "epoch": 0.053090221432861, + "grad_norm": 1.2381385564804077, + "learning_rate": 9.982640421485374e-05, + "loss": 0.8043, + "step": 8310 + }, + { + "epoch": 0.053154108582599696, + "grad_norm": 1.2460087537765503, + "learning_rate": 9.9825986205075e-05, + "loss": 0.9461, + "step": 8320 + }, + { + "epoch": 0.053217995732338395, + "grad_norm": 0.7866740822792053, + "learning_rate": 9.982556769350549e-05, + "loss": 0.88, + "step": 8330 + }, + { + "epoch": 0.0532818828820771, + "grad_norm": 1.1013973951339722, + "learning_rate": 9.982514868014938e-05, + "loss": 0.7032, + "step": 8340 + }, + { + "epoch": 0.0533457700318158, + "grad_norm": 0.7456531524658203, + "learning_rate": 9.982472916501093e-05, + "loss": 0.8763, + "step": 8350 + }, + { + "epoch": 0.0534096571815545, + "grad_norm": 0.6022664904594421, + "learning_rate": 9.982430914809437e-05, + "loss": 1.0766, + "step": 8360 + }, + { + "epoch": 0.053473544331293206, + "grad_norm": 0.6867753267288208, + "learning_rate": 9.982388862940389e-05, + "loss": 0.8823, + "step": 8370 + }, + { + "epoch": 0.053537431481031905, + "grad_norm": 1.045599102973938, + "learning_rate": 9.982346760894375e-05, + "loss": 1.0784, + "step": 8380 + }, + { + "epoch": 0.053601318630770604, + "grad_norm": 1.3521573543548584, + "learning_rate": 9.982304608671819e-05, + "loss": 1.1522, + "step": 8390 + }, + { + "epoch": 0.05366520578050931, + "grad_norm": 0.6618836522102356, + "learning_rate": 9.982262406273146e-05, + "loss": 0.863, + "step": 8400 + }, + { + "epoch": 0.05372909293024801, + "grad_norm": 0.6689035892486572, + "learning_rate": 9.98222015369878e-05, + "loss": 0.9005, + "step": 8410 + }, + { + "epoch": 0.05379298007998671, + 
"grad_norm": 1.0590460300445557, + "learning_rate": 9.982177850949147e-05, + "loss": 1.0022, + "step": 8420 + }, + { + "epoch": 0.053856867229725415, + "grad_norm": 0.6324277520179749, + "learning_rate": 9.982135498024673e-05, + "loss": 0.7492, + "step": 8430 + }, + { + "epoch": 0.053920754379464114, + "grad_norm": 0.5392162203788757, + "learning_rate": 9.982093094925784e-05, + "loss": 0.991, + "step": 8440 + }, + { + "epoch": 0.05398464152920281, + "grad_norm": 0.6738571524620056, + "learning_rate": 9.982050641652908e-05, + "loss": 1.0112, + "step": 8450 + }, + { + "epoch": 0.05404852867894152, + "grad_norm": 0.8277943730354309, + "learning_rate": 9.98200813820647e-05, + "loss": 0.6247, + "step": 8460 + }, + { + "epoch": 0.05411241582868022, + "grad_norm": 1.3968684673309326, + "learning_rate": 9.981965584586901e-05, + "loss": 1.0051, + "step": 8470 + }, + { + "epoch": 0.05417630297841892, + "grad_norm": 1.391640543937683, + "learning_rate": 9.981922980794629e-05, + "loss": 0.9332, + "step": 8480 + }, + { + "epoch": 0.054240190128157624, + "grad_norm": 2.0874507427215576, + "learning_rate": 9.981880326830083e-05, + "loss": 1.135, + "step": 8490 + }, + { + "epoch": 0.05430407727789632, + "grad_norm": 1.9418469667434692, + "learning_rate": 9.981837622693692e-05, + "loss": 0.8689, + "step": 8500 + }, + { + "epoch": 0.05436796442763502, + "grad_norm": 0.9285494089126587, + "learning_rate": 9.981794868385886e-05, + "loss": 0.8521, + "step": 8510 + }, + { + "epoch": 0.05443185157737373, + "grad_norm": 1.062789797782898, + "learning_rate": 9.981752063907096e-05, + "loss": 1.0655, + "step": 8520 + }, + { + "epoch": 0.05449573872711243, + "grad_norm": 0.6997897624969482, + "learning_rate": 9.981709209257752e-05, + "loss": 0.9636, + "step": 8530 + }, + { + "epoch": 0.05455962587685113, + "grad_norm": 0.8409900665283203, + "learning_rate": 9.981666304438286e-05, + "loss": 0.9073, + "step": 8540 + }, + { + "epoch": 0.05462351302658983, + "grad_norm": 0.7529276013374329, + 
"learning_rate": 9.981623349449131e-05, + "loss": 0.695, + "step": 8550 + }, + { + "epoch": 0.05468740017632853, + "grad_norm": 0.6798946261405945, + "learning_rate": 9.981580344290722e-05, + "loss": 0.9083, + "step": 8560 + }, + { + "epoch": 0.05475128732606723, + "grad_norm": 0.537013828754425, + "learning_rate": 9.981537288963487e-05, + "loss": 0.9872, + "step": 8570 + }, + { + "epoch": 0.05481517447580594, + "grad_norm": 0.9144914150238037, + "learning_rate": 9.981494183467861e-05, + "loss": 0.9987, + "step": 8580 + }, + { + "epoch": 0.05487906162554464, + "grad_norm": 1.6605632305145264, + "learning_rate": 9.98145102780428e-05, + "loss": 0.9811, + "step": 8590 + }, + { + "epoch": 0.05494294877528334, + "grad_norm": 0.8611153960227966, + "learning_rate": 9.981407821973176e-05, + "loss": 1.0801, + "step": 8600 + }, + { + "epoch": 0.05500683592502204, + "grad_norm": 0.9995184540748596, + "learning_rate": 9.981364565974988e-05, + "loss": 0.9886, + "step": 8610 + }, + { + "epoch": 0.05507072307476074, + "grad_norm": 1.9788289070129395, + "learning_rate": 9.981321259810149e-05, + "loss": 0.8339, + "step": 8620 + }, + { + "epoch": 0.05513461022449945, + "grad_norm": 0.6516178250312805, + "learning_rate": 9.981277903479095e-05, + "loss": 0.87, + "step": 8630 + }, + { + "epoch": 0.05519849737423815, + "grad_norm": 0.6122477054595947, + "learning_rate": 9.981234496982262e-05, + "loss": 0.9143, + "step": 8640 + }, + { + "epoch": 0.055262384523976846, + "grad_norm": 0.6674822568893433, + "learning_rate": 9.98119104032009e-05, + "loss": 0.9851, + "step": 8650 + }, + { + "epoch": 0.05532627167371555, + "grad_norm": 0.7896667122840881, + "learning_rate": 9.981147533493013e-05, + "loss": 0.9507, + "step": 8660 + }, + { + "epoch": 0.05539015882345425, + "grad_norm": 0.5288309454917908, + "learning_rate": 9.981103976501474e-05, + "loss": 0.8592, + "step": 8670 + }, + { + "epoch": 0.05545404597319295, + "grad_norm": 1.2801772356033325, + "learning_rate": 9.981060369345905e-05, + 
"loss": 0.799, + "step": 8680 + }, + { + "epoch": 0.055517933122931656, + "grad_norm": 1.178462266921997, + "learning_rate": 9.981016712026752e-05, + "loss": 0.8998, + "step": 8690 + }, + { + "epoch": 0.055581820272670356, + "grad_norm": 0.5843381285667419, + "learning_rate": 9.98097300454445e-05, + "loss": 0.9922, + "step": 8700 + }, + { + "epoch": 0.055645707422409055, + "grad_norm": 1.011044979095459, + "learning_rate": 9.980929246899441e-05, + "loss": 0.8379, + "step": 8710 + }, + { + "epoch": 0.05570959457214776, + "grad_norm": 0.9214301109313965, + "learning_rate": 9.980885439092165e-05, + "loss": 0.6383, + "step": 8720 + }, + { + "epoch": 0.05577348172188646, + "grad_norm": 0.6694325804710388, + "learning_rate": 9.980841581123064e-05, + "loss": 1.1735, + "step": 8730 + }, + { + "epoch": 0.05583736887162516, + "grad_norm": 0.6404210329055786, + "learning_rate": 9.98079767299258e-05, + "loss": 0.7791, + "step": 8740 + }, + { + "epoch": 0.055901256021363865, + "grad_norm": 1.0678333044052124, + "learning_rate": 9.980753714701152e-05, + "loss": 0.8481, + "step": 8750 + }, + { + "epoch": 0.055965143171102565, + "grad_norm": 2.0661401748657227, + "learning_rate": 9.980709706249227e-05, + "loss": 1.0899, + "step": 8760 + }, + { + "epoch": 0.056029030320841264, + "grad_norm": 1.161922812461853, + "learning_rate": 9.980665647637246e-05, + "loss": 0.7383, + "step": 8770 + }, + { + "epoch": 0.05609291747057997, + "grad_norm": 0.5117482542991638, + "learning_rate": 9.980621538865654e-05, + "loss": 0.9479, + "step": 8780 + }, + { + "epoch": 0.05615680462031867, + "grad_norm": 1.124691367149353, + "learning_rate": 9.980581798085118e-05, + "loss": 1.0286, + "step": 8790 + }, + { + "epoch": 0.05622069177005737, + "grad_norm": 0.9648489952087402, + "learning_rate": 9.980537594011486e-05, + "loss": 0.7825, + "step": 8800 + }, + { + "epoch": 0.056284578919796074, + "grad_norm": 0.9334906339645386, + "learning_rate": 9.980493339779533e-05, + "loss": 0.8359, + "step": 8810 + }, 
+ { + "epoch": 0.056348466069534774, + "grad_norm": 0.8304029107093811, + "learning_rate": 9.980449035389702e-05, + "loss": 0.7827, + "step": 8820 + }, + { + "epoch": 0.05641235321927347, + "grad_norm": 0.8497231006622314, + "learning_rate": 9.980404680842441e-05, + "loss": 1.0369, + "step": 8830 + }, + { + "epoch": 0.05647624036901218, + "grad_norm": 0.7895182371139526, + "learning_rate": 9.980360276138196e-05, + "loss": 0.8317, + "step": 8840 + }, + { + "epoch": 0.05654012751875088, + "grad_norm": 2.521169900894165, + "learning_rate": 9.980315821277415e-05, + "loss": 1.1953, + "step": 8850 + }, + { + "epoch": 0.05660401466848958, + "grad_norm": 0.8677668571472168, + "learning_rate": 9.980271316260544e-05, + "loss": 0.6768, + "step": 8860 + }, + { + "epoch": 0.05666790181822828, + "grad_norm": 0.6117026805877686, + "learning_rate": 9.980226761088033e-05, + "loss": 0.8991, + "step": 8870 + }, + { + "epoch": 0.05673178896796698, + "grad_norm": 0.5636959075927734, + "learning_rate": 9.98018215576033e-05, + "loss": 0.9742, + "step": 8880 + }, + { + "epoch": 0.05679567611770568, + "grad_norm": 1.0202407836914062, + "learning_rate": 9.980137500277885e-05, + "loss": 0.8069, + "step": 8890 + }, + { + "epoch": 0.05685956326744439, + "grad_norm": 0.7063365578651428, + "learning_rate": 9.980092794641144e-05, + "loss": 0.7919, + "step": 8900 + }, + { + "epoch": 0.05692345041718309, + "grad_norm": 0.6419750452041626, + "learning_rate": 9.980048038850564e-05, + "loss": 1.0765, + "step": 8910 + }, + { + "epoch": 0.056987337566921786, + "grad_norm": 0.8232806921005249, + "learning_rate": 9.98000323290659e-05, + "loss": 0.9938, + "step": 8920 + }, + { + "epoch": 0.05705122471666049, + "grad_norm": 0.846300482749939, + "learning_rate": 9.979958376809675e-05, + "loss": 0.9364, + "step": 8930 + }, + { + "epoch": 0.05711511186639919, + "grad_norm": 0.9861621856689453, + "learning_rate": 9.979913470560271e-05, + "loss": 0.9568, + "step": 8940 + }, + { + "epoch": 0.05717899901613789, + 
"grad_norm": 1.035204529762268, + "learning_rate": 9.97986851415883e-05, + "loss": 1.1155, + "step": 8950 + }, + { + "epoch": 0.0572428861658766, + "grad_norm": 0.901535153388977, + "learning_rate": 9.979823507605806e-05, + "loss": 0.9014, + "step": 8960 + }, + { + "epoch": 0.057306773315615296, + "grad_norm": 0.7851259708404541, + "learning_rate": 9.97977845090165e-05, + "loss": 0.9278, + "step": 8970 + }, + { + "epoch": 0.057370660465353995, + "grad_norm": 0.8578255772590637, + "learning_rate": 9.979733344046818e-05, + "loss": 1.0668, + "step": 8980 + }, + { + "epoch": 0.0574345476150927, + "grad_norm": 0.5631706714630127, + "learning_rate": 9.979688187041761e-05, + "loss": 0.7958, + "step": 8990 + }, + { + "epoch": 0.0574984347648314, + "grad_norm": 0.9356205463409424, + "learning_rate": 9.979642979886938e-05, + "loss": 0.9709, + "step": 9000 + }, + { + "epoch": 0.05756232191457011, + "grad_norm": 1.1016316413879395, + "learning_rate": 9.979597722582801e-05, + "loss": 1.0941, + "step": 9010 + }, + { + "epoch": 0.057626209064308806, + "grad_norm": 0.7269836068153381, + "learning_rate": 9.979552415129806e-05, + "loss": 0.8328, + "step": 9020 + }, + { + "epoch": 0.057690096214047505, + "grad_norm": 1.0700838565826416, + "learning_rate": 9.979507057528412e-05, + "loss": 1.0288, + "step": 9030 + }, + { + "epoch": 0.05775398336378621, + "grad_norm": 0.9521405100822449, + "learning_rate": 9.979461649779074e-05, + "loss": 0.8238, + "step": 9040 + }, + { + "epoch": 0.05781787051352491, + "grad_norm": 1.3190817832946777, + "learning_rate": 9.97941619188225e-05, + "loss": 0.9142, + "step": 9050 + }, + { + "epoch": 0.05788175766326361, + "grad_norm": 0.7254020571708679, + "learning_rate": 9.979370683838396e-05, + "loss": 0.952, + "step": 9060 + }, + { + "epoch": 0.057945644813002316, + "grad_norm": 0.6510186195373535, + "learning_rate": 9.979325125647972e-05, + "loss": 0.9684, + "step": 9070 + }, + { + "epoch": 0.058009531962741015, + "grad_norm": 0.8847187757492065, + 
"learning_rate": 9.979279517311435e-05, + "loss": 0.867, + "step": 9080 + }, + { + "epoch": 0.058073419112479714, + "grad_norm": 1.1535148620605469, + "learning_rate": 9.979233858829246e-05, + "loss": 1.1381, + "step": 9090 + }, + { + "epoch": 0.05813730626221842, + "grad_norm": 0.8919582366943359, + "learning_rate": 9.979188150201866e-05, + "loss": 0.9911, + "step": 9100 + }, + { + "epoch": 0.05820119341195712, + "grad_norm": 0.8089918494224548, + "learning_rate": 9.979142391429753e-05, + "loss": 1.0435, + "step": 9110 + }, + { + "epoch": 0.05826508056169582, + "grad_norm": 1.1607420444488525, + "learning_rate": 9.979096582513366e-05, + "loss": 0.8656, + "step": 9120 + }, + { + "epoch": 0.058328967711434525, + "grad_norm": 0.8984375596046448, + "learning_rate": 9.979050723453171e-05, + "loss": 0.7627, + "step": 9130 + }, + { + "epoch": 0.058392854861173224, + "grad_norm": 1.1916580200195312, + "learning_rate": 9.979004814249629e-05, + "loss": 0.8041, + "step": 9140 + }, + { + "epoch": 0.05845674201091192, + "grad_norm": 1.0592631101608276, + "learning_rate": 9.978958854903198e-05, + "loss": 0.8423, + "step": 9150 + }, + { + "epoch": 0.05852062916065063, + "grad_norm": 0.8369486331939697, + "learning_rate": 9.978912845414347e-05, + "loss": 0.7743, + "step": 9160 + }, + { + "epoch": 0.05858451631038933, + "grad_norm": 0.8720589280128479, + "learning_rate": 9.978866785783533e-05, + "loss": 0.8537, + "step": 9170 + }, + { + "epoch": 0.05864840346012803, + "grad_norm": 1.644795298576355, + "learning_rate": 9.978820676011227e-05, + "loss": 0.7972, + "step": 9180 + }, + { + "epoch": 0.058712290609866734, + "grad_norm": 2.7408289909362793, + "learning_rate": 9.978774516097886e-05, + "loss": 1.3147, + "step": 9190 + }, + { + "epoch": 0.05877617775960543, + "grad_norm": 0.5846157670021057, + "learning_rate": 9.97872830604398e-05, + "loss": 0.948, + "step": 9200 + }, + { + "epoch": 0.05884006490934413, + "grad_norm": 0.5981711149215698, + "learning_rate": 
9.978682045849975e-05, + "loss": 0.9317, + "step": 9210 + }, + { + "epoch": 0.05890395205908284, + "grad_norm": 0.8747972249984741, + "learning_rate": 9.97863573551633e-05, + "loss": 1.0936, + "step": 9220 + }, + { + "epoch": 0.05896783920882154, + "grad_norm": 0.8239877223968506, + "learning_rate": 9.978589375043519e-05, + "loss": 0.9512, + "step": 9230 + }, + { + "epoch": 0.05903172635856024, + "grad_norm": 0.48801517486572266, + "learning_rate": 9.978542964432005e-05, + "loss": 0.8562, + "step": 9240 + }, + { + "epoch": 0.05909561350829894, + "grad_norm": 2.152211904525757, + "learning_rate": 9.978496503682258e-05, + "loss": 1.0353, + "step": 9250 + }, + { + "epoch": 0.05915950065803764, + "grad_norm": 0.9108942747116089, + "learning_rate": 9.978449992794742e-05, + "loss": 0.9188, + "step": 9260 + }, + { + "epoch": 0.05922338780777634, + "grad_norm": 0.41772526502609253, + "learning_rate": 9.978403431769927e-05, + "loss": 0.8177, + "step": 9270 + }, + { + "epoch": 0.05928727495751505, + "grad_norm": 0.5694353580474854, + "learning_rate": 9.978356820608284e-05, + "loss": 0.8956, + "step": 9280 + }, + { + "epoch": 0.059351162107253747, + "grad_norm": 0.620496928691864, + "learning_rate": 9.978310159310282e-05, + "loss": 0.631, + "step": 9290 + }, + { + "epoch": 0.059415049256992446, + "grad_norm": 3.017289638519287, + "learning_rate": 9.978263447876388e-05, + "loss": 0.7887, + "step": 9300 + }, + { + "epoch": 0.05947893640673115, + "grad_norm": 1.065492868423462, + "learning_rate": 9.978216686307075e-05, + "loss": 0.8404, + "step": 9310 + }, + { + "epoch": 0.05954282355646985, + "grad_norm": 0.5826980471611023, + "learning_rate": 9.978169874602813e-05, + "loss": 0.8956, + "step": 9320 + }, + { + "epoch": 0.05960671070620855, + "grad_norm": 0.9797850847244263, + "learning_rate": 9.978123012764074e-05, + "loss": 1.0606, + "step": 9330 + }, + { + "epoch": 0.059670597855947256, + "grad_norm": 0.6139065027236938, + "learning_rate": 9.97807610079133e-05, + "loss": 
0.9635, + "step": 9340 + }, + { + "epoch": 0.059734485005685956, + "grad_norm": 0.7059098482131958, + "learning_rate": 9.978029138685052e-05, + "loss": 0.861, + "step": 9350 + }, + { + "epoch": 0.059798372155424655, + "grad_norm": 0.8045505285263062, + "learning_rate": 9.977982126445712e-05, + "loss": 0.923, + "step": 9360 + }, + { + "epoch": 0.05986225930516336, + "grad_norm": NaN, + "learning_rate": 9.977939772566934e-05, + "loss": 1.1034, + "step": 9370 + }, + { + "epoch": 0.05992614645490206, + "grad_norm": 0.8647670745849609, + "learning_rate": 9.977892665076088e-05, + "loss": 0.9245, + "step": 9380 + }, + { + "epoch": 0.05999003360464076, + "grad_norm": 1.1494901180267334, + "learning_rate": 9.977845507453554e-05, + "loss": 0.9954, + "step": 9390 + }, + { + "epoch": 0.060053920754379465, + "grad_norm": 0.6723154783248901, + "learning_rate": 9.977798299699811e-05, + "loss": 0.7105, + "step": 9400 + }, + { + "epoch": 0.060117807904118165, + "grad_norm": 1.0932044982910156, + "learning_rate": 9.977751041815333e-05, + "loss": 0.994, + "step": 9410 + }, + { + "epoch": 0.06018169505385687, + "grad_norm": 1.036632776260376, + "learning_rate": 9.977703733800594e-05, + "loss": 0.9975, + "step": 9420 + }, + { + "epoch": 0.06024558220359557, + "grad_norm": 0.9420500993728638, + "learning_rate": 9.977656375656072e-05, + "loss": 0.8054, + "step": 9430 + }, + { + "epoch": 0.06030946935333427, + "grad_norm": 0.8750972747802734, + "learning_rate": 9.977608967382246e-05, + "loss": 1.233, + "step": 9440 + }, + { + "epoch": 0.060373356503072975, + "grad_norm": 2.4813504219055176, + "learning_rate": 9.977561508979591e-05, + "loss": 1.0237, + "step": 9450 + }, + { + "epoch": 0.060437243652811674, + "grad_norm": 0.8663612604141235, + "learning_rate": 9.977514000448584e-05, + "loss": 0.9739, + "step": 9460 + }, + { + "epoch": 0.060501130802550374, + "grad_norm": 0.5622289776802063, + "learning_rate": 9.977466441789707e-05, + "loss": 0.6195, + "step": 9470 + }, + { + "epoch": 
0.06056501795228908, + "grad_norm": 0.7465640902519226, + "learning_rate": 9.977418833003436e-05, + "loss": 0.6977, + "step": 9480 + }, + { + "epoch": 0.06062890510202778, + "grad_norm": 0.8643200993537903, + "learning_rate": 9.97737117409025e-05, + "loss": 1.1056, + "step": 9490 + }, + { + "epoch": 0.06069279225176648, + "grad_norm": 0.8004162311553955, + "learning_rate": 9.977323465050631e-05, + "loss": 0.8349, + "step": 9500 + }, + { + "epoch": 0.060756679401505184, + "grad_norm": 0.7937789559364319, + "learning_rate": 9.977275705885058e-05, + "loss": 1.0755, + "step": 9510 + }, + { + "epoch": 0.06082056655124388, + "grad_norm": 0.7888356447219849, + "learning_rate": 9.977227896594014e-05, + "loss": 1.242, + "step": 9520 + }, + { + "epoch": 0.06088445370098258, + "grad_norm": 0.6252095103263855, + "learning_rate": 9.977180037177979e-05, + "loss": 1.1968, + "step": 9530 + }, + { + "epoch": 0.06094834085072129, + "grad_norm": 0.6318356990814209, + "learning_rate": 9.977132127637434e-05, + "loss": 1.0921, + "step": 9540 + }, + { + "epoch": 0.06101222800045999, + "grad_norm": 0.6336533427238464, + "learning_rate": 9.977084167972863e-05, + "loss": 0.7744, + "step": 9550 + }, + { + "epoch": 0.06107611515019869, + "grad_norm": 0.7241688966751099, + "learning_rate": 9.97703615818475e-05, + "loss": 0.7843, + "step": 9560 + }, + { + "epoch": 0.06114000229993739, + "grad_norm": 1.5715322494506836, + "learning_rate": 9.976988098273576e-05, + "loss": 1.1104, + "step": 9570 + }, + { + "epoch": 0.06120388944967609, + "grad_norm": 0.5444793105125427, + "learning_rate": 9.976939988239826e-05, + "loss": 0.9894, + "step": 9580 + }, + { + "epoch": 0.06126777659941479, + "grad_norm": 0.785284698009491, + "learning_rate": 9.976891828083985e-05, + "loss": 0.9782, + "step": 9590 + }, + { + "epoch": 0.0613316637491535, + "grad_norm": 1.1315600872039795, + "learning_rate": 9.976843617806538e-05, + "loss": 0.9443, + "step": 9600 + }, + { + "epoch": 0.0613955508988922, + "grad_norm": 
5.169201850891113, + "learning_rate": 9.97679535740797e-05, + "loss": 0.9371, + "step": 9610 + }, + { + "epoch": 0.061459438048630896, + "grad_norm": 0.6818580031394958, + "learning_rate": 9.976747046888767e-05, + "loss": 0.8102, + "step": 9620 + }, + { + "epoch": 0.0615233251983696, + "grad_norm": 0.8099622130393982, + "learning_rate": 9.976698686249416e-05, + "loss": 1.0892, + "step": 9630 + }, + { + "epoch": 0.0615872123481083, + "grad_norm": 0.8413625955581665, + "learning_rate": 9.976650275490404e-05, + "loss": 0.9822, + "step": 9640 + }, + { + "epoch": 0.061651099497847, + "grad_norm": 1.4564588069915771, + "learning_rate": 9.976601814612217e-05, + "loss": 1.1034, + "step": 9650 + }, + { + "epoch": 0.06171498664758571, + "grad_norm": 0.897906482219696, + "learning_rate": 9.976553303615346e-05, + "loss": 0.9956, + "step": 9660 + }, + { + "epoch": 0.061778873797324406, + "grad_norm": 0.5349118113517761, + "learning_rate": 9.976504742500277e-05, + "loss": 0.9361, + "step": 9670 + }, + { + "epoch": 0.061842760947063105, + "grad_norm": 0.8950748443603516, + "learning_rate": 9.9764561312675e-05, + "loss": 0.8486, + "step": 9680 + }, + { + "epoch": 0.06190664809680181, + "grad_norm": 1.0401899814605713, + "learning_rate": 9.976407469917504e-05, + "loss": 0.855, + "step": 9690 + }, + { + "epoch": 0.06197053524654051, + "grad_norm": 1.0119845867156982, + "learning_rate": 9.976358758450781e-05, + "loss": 0.8117, + "step": 9700 + }, + { + "epoch": 0.06203442239627921, + "grad_norm": 1.028308629989624, + "learning_rate": 9.976309996867819e-05, + "loss": 1.0832, + "step": 9710 + }, + { + "epoch": 0.062098309546017916, + "grad_norm": 0.6654931902885437, + "learning_rate": 9.976261185169111e-05, + "loss": 0.9543, + "step": 9720 + }, + { + "epoch": 0.062162196695756615, + "grad_norm": 0.7170969843864441, + "learning_rate": 9.976212323355148e-05, + "loss": 0.7589, + "step": 9730 + }, + { + "epoch": 0.062226083845495314, + "grad_norm": 0.7951648831367493, + "learning_rate": 
9.97616341142642e-05, + "loss": 0.7643, + "step": 9740 + }, + { + "epoch": 0.06228997099523402, + "grad_norm": 0.8642029166221619, + "learning_rate": 9.976114449383422e-05, + "loss": 0.7792, + "step": 9750 + }, + { + "epoch": 0.06235385814497272, + "grad_norm": 0.7159494757652283, + "learning_rate": 9.976065437226648e-05, + "loss": 0.7695, + "step": 9760 + }, + { + "epoch": 0.06241774529471142, + "grad_norm": 0.8568373918533325, + "learning_rate": 9.976016374956589e-05, + "loss": 0.9835, + "step": 9770 + }, + { + "epoch": 0.062481632444450125, + "grad_norm": 0.7960609793663025, + "learning_rate": 9.97596726257374e-05, + "loss": 0.7966, + "step": 9780 + }, + { + "epoch": 0.06254551959418883, + "grad_norm": 0.9307446479797363, + "learning_rate": 9.975918100078598e-05, + "loss": 0.9899, + "step": 9790 + }, + { + "epoch": 0.06260940674392752, + "grad_norm": 0.6713595986366272, + "learning_rate": 9.975868887471654e-05, + "loss": 0.9225, + "step": 9800 + }, + { + "epoch": 0.06267329389366623, + "grad_norm": 0.8115236163139343, + "learning_rate": 9.975819624753405e-05, + "loss": 1.0275, + "step": 9810 + }, + { + "epoch": 0.06273718104340494, + "grad_norm": 1.7901041507720947, + "learning_rate": 9.975770311924348e-05, + "loss": 1.0027, + "step": 9820 + }, + { + "epoch": 0.06280106819314363, + "grad_norm": 1.7000713348388672, + "learning_rate": 9.975720948984981e-05, + "loss": 0.7821, + "step": 9830 + }, + { + "epoch": 0.06286495534288233, + "grad_norm": 0.7414657473564148, + "learning_rate": 9.975671535935797e-05, + "loss": 1.1558, + "step": 9840 + }, + { + "epoch": 0.06292884249262104, + "grad_norm": 1.6867907047271729, + "learning_rate": 9.975622072777299e-05, + "loss": 0.8346, + "step": 9850 + }, + { + "epoch": 0.06299272964235973, + "grad_norm": 0.8193734884262085, + "learning_rate": 9.97557255950998e-05, + "loss": 1.0878, + "step": 9860 + }, + { + "epoch": 0.06305661679209844, + "grad_norm": 0.8677065968513489, + "learning_rate": 9.975522996134341e-05, + "loss": 
1.0119, + "step": 9870 + }, + { + "epoch": 0.06312050394183714, + "grad_norm": 0.9119147658348083, + "learning_rate": 9.975473382650882e-05, + "loss": 0.9826, + "step": 9880 + }, + { + "epoch": 0.06318439109157584, + "grad_norm": 0.5988776683807373, + "learning_rate": 9.9754237190601e-05, + "loss": 0.9762, + "step": 9890 + }, + { + "epoch": 0.06324827824131454, + "grad_norm": 0.7673864364624023, + "learning_rate": 9.9753740053625e-05, + "loss": 1.0638, + "step": 9900 + }, + { + "epoch": 0.06331216539105325, + "grad_norm": 0.7230051159858704, + "learning_rate": 9.975324241558577e-05, + "loss": 1.005, + "step": 9910 + }, + { + "epoch": 0.06337605254079194, + "grad_norm": 0.7979596257209778, + "learning_rate": 9.975274427648834e-05, + "loss": 0.816, + "step": 9920 + }, + { + "epoch": 0.06343993969053065, + "grad_norm": 0.8814641237258911, + "learning_rate": 9.975224563633774e-05, + "loss": 0.808, + "step": 9930 + }, + { + "epoch": 0.06350382684026935, + "grad_norm": 1.0135507583618164, + "learning_rate": 9.975174649513899e-05, + "loss": 0.6825, + "step": 9940 + }, + { + "epoch": 0.06356771399000805, + "grad_norm": 2.055793046951294, + "learning_rate": 9.97512468528971e-05, + "loss": 0.9164, + "step": 9950 + }, + { + "epoch": 0.06363160113974675, + "grad_norm": 1.3944244384765625, + "learning_rate": 9.975074670961712e-05, + "loss": 0.6302, + "step": 9960 + }, + { + "epoch": 0.06369548828948546, + "grad_norm": 0.828702986240387, + "learning_rate": 9.97502460653041e-05, + "loss": 0.8697, + "step": 9970 + }, + { + "epoch": 0.06375937543922415, + "grad_norm": 0.6198043823242188, + "learning_rate": 9.974974491996303e-05, + "loss": 1.019, + "step": 9980 + }, + { + "epoch": 0.06382326258896286, + "grad_norm": 1.0051112174987793, + "learning_rate": 9.9749243273599e-05, + "loss": 0.8931, + "step": 9990 + }, + { + "epoch": 0.06388714973870156, + "grad_norm": 0.7894333004951477, + "learning_rate": 9.974874112621706e-05, + "loss": 0.729, + "step": 10000 + }, + { + "epoch": 
0.06395103688844025, + "grad_norm": 1.0666780471801758, + "learning_rate": 9.974823847782226e-05, + "loss": 0.8405, + "step": 10010 + }, + { + "epoch": 0.06401492403817896, + "grad_norm": 0.8409984111785889, + "learning_rate": 9.974773532841965e-05, + "loss": 0.7593, + "step": 10020 + }, + { + "epoch": 0.06407881118791767, + "grad_norm": 0.7679229974746704, + "learning_rate": 9.97472316780143e-05, + "loss": 0.9023, + "step": 10030 + }, + { + "epoch": 0.06414269833765636, + "grad_norm": 0.893464207649231, + "learning_rate": 9.97467275266113e-05, + "loss": 0.9048, + "step": 10040 + }, + { + "epoch": 0.06420658548739507, + "grad_norm": 0.8160121440887451, + "learning_rate": 9.974622287421571e-05, + "loss": 0.7204, + "step": 10050 + }, + { + "epoch": 0.06427047263713377, + "grad_norm": 1.0811116695404053, + "learning_rate": 9.974571772083264e-05, + "loss": 1.0378, + "step": 10060 + }, + { + "epoch": 0.06433435978687246, + "grad_norm": 2.037599802017212, + "learning_rate": 9.974521206646714e-05, + "loss": 1.205, + "step": 10070 + }, + { + "epoch": 0.06439824693661117, + "grad_norm": 1.153348445892334, + "learning_rate": 9.974470591112431e-05, + "loss": 1.1017, + "step": 10080 + }, + { + "epoch": 0.06446213408634988, + "grad_norm": 0.8410546183586121, + "learning_rate": 9.974419925480927e-05, + "loss": 0.9647, + "step": 10090 + }, + { + "epoch": 0.06452602123608857, + "grad_norm": 1.0550462007522583, + "learning_rate": 9.97436920975271e-05, + "loss": 0.6909, + "step": 10100 + }, + { + "epoch": 0.06458990838582727, + "grad_norm": 0.7067312598228455, + "learning_rate": 9.974318443928292e-05, + "loss": 1.0198, + "step": 10110 + }, + { + "epoch": 0.06465379553556598, + "grad_norm": 0.9904884696006775, + "learning_rate": 9.974267628008184e-05, + "loss": 0.853, + "step": 10120 + }, + { + "epoch": 0.06471768268530467, + "grad_norm": 0.9248889684677124, + "learning_rate": 9.974216761992899e-05, + "loss": 0.8722, + "step": 10130 + }, + { + "epoch": 0.06478156983504338, + 
"grad_norm": 0.9939360618591309, + "learning_rate": 9.974165845882946e-05, + "loss": 0.7184, + "step": 10140 + }, + { + "epoch": 0.06484545698478208, + "grad_norm": 0.7473933696746826, + "learning_rate": 9.97411487967884e-05, + "loss": 1.1064, + "step": 10150 + }, + { + "epoch": 0.06490934413452078, + "grad_norm": 0.6957441568374634, + "learning_rate": 9.974063863381093e-05, + "loss": 1.0598, + "step": 10160 + }, + { + "epoch": 0.06497323128425948, + "grad_norm": 0.5153073072433472, + "learning_rate": 9.974012796990222e-05, + "loss": 1.0821, + "step": 10170 + }, + { + "epoch": 0.06503711843399819, + "grad_norm": 0.6289156675338745, + "learning_rate": 9.973961680506736e-05, + "loss": 0.7954, + "step": 10180 + }, + { + "epoch": 0.06510100558373688, + "grad_norm": 0.8114803433418274, + "learning_rate": 9.973910513931155e-05, + "loss": 1.0314, + "step": 10190 + }, + { + "epoch": 0.06516489273347559, + "grad_norm": 0.9270540475845337, + "learning_rate": 9.973859297263992e-05, + "loss": 1.0626, + "step": 10200 + }, + { + "epoch": 0.0652287798832143, + "grad_norm": 0.7939660549163818, + "learning_rate": 9.973808030505762e-05, + "loss": 1.0844, + "step": 10210 + }, + { + "epoch": 0.06529266703295299, + "grad_norm": 0.7727285027503967, + "learning_rate": 9.973756713656983e-05, + "loss": 1.1614, + "step": 10220 + }, + { + "epoch": 0.06535655418269169, + "grad_norm": 0.628436803817749, + "learning_rate": 9.973705346718172e-05, + "loss": 1.0243, + "step": 10230 + }, + { + "epoch": 0.0654204413324304, + "grad_norm": 0.6849284172058105, + "learning_rate": 9.973653929689843e-05, + "loss": 0.9389, + "step": 10240 + }, + { + "epoch": 0.06548432848216909, + "grad_norm": 1.1843525171279907, + "learning_rate": 9.973602462572517e-05, + "loss": 1.1492, + "step": 10250 + }, + { + "epoch": 0.0655482156319078, + "grad_norm": 0.8269469141960144, + "learning_rate": 9.973550945366713e-05, + "loss": 1.2698, + "step": 10260 + }, + { + "epoch": 0.0656121027816465, + "grad_norm": 
1.4048844575881958, + "learning_rate": 9.973499378072945e-05, + "loss": 0.9471, + "step": 10270 + }, + { + "epoch": 0.0656759899313852, + "grad_norm": 0.4660175144672394, + "learning_rate": 9.973447760691738e-05, + "loss": 1.0006, + "step": 10280 + }, + { + "epoch": 0.0657398770811239, + "grad_norm": 0.5896976590156555, + "learning_rate": 9.973396093223609e-05, + "loss": 0.9568, + "step": 10290 + }, + { + "epoch": 0.06580376423086261, + "grad_norm": 0.8697670102119446, + "learning_rate": 9.973344375669078e-05, + "loss": 1.0002, + "step": 10300 + }, + { + "epoch": 0.0658676513806013, + "grad_norm": 0.6355106234550476, + "learning_rate": 9.973292608028667e-05, + "loss": 0.847, + "step": 10310 + }, + { + "epoch": 0.06593153853034, + "grad_norm": 0.8912832736968994, + "learning_rate": 9.973240790302898e-05, + "loss": 0.9665, + "step": 10320 + }, + { + "epoch": 0.06599542568007871, + "grad_norm": 0.662343442440033, + "learning_rate": 9.97318892249229e-05, + "loss": 0.8517, + "step": 10330 + }, + { + "epoch": 0.0660593128298174, + "grad_norm": 0.7244953513145447, + "learning_rate": 9.973137004597368e-05, + "loss": 0.8731, + "step": 10340 + }, + { + "epoch": 0.06612319997955611, + "grad_norm": 0.9315572381019592, + "learning_rate": 9.973085036618655e-05, + "loss": 0.8918, + "step": 10350 + }, + { + "epoch": 0.06618708712929482, + "grad_norm": 0.7289429306983948, + "learning_rate": 9.973033018556671e-05, + "loss": 0.8263, + "step": 10360 + }, + { + "epoch": 0.06625097427903351, + "grad_norm": 1.0025968551635742, + "learning_rate": 9.972980950411944e-05, + "loss": 0.8438, + "step": 10370 + }, + { + "epoch": 0.06631486142877221, + "grad_norm": 1.0387686491012573, + "learning_rate": 9.972928832184996e-05, + "loss": 1.0417, + "step": 10380 + }, + { + "epoch": 0.06637874857851092, + "grad_norm": 1.1046053171157837, + "learning_rate": 9.972876663876352e-05, + "loss": 1.0033, + "step": 10390 + }, + { + "epoch": 0.06644263572824961, + "grad_norm": 0.7507469058036804, + 
"learning_rate": 9.972824445486539e-05, + "loss": 0.7265, + "step": 10400 + }, + { + "epoch": 0.06650652287798832, + "grad_norm": 0.8238981366157532, + "learning_rate": 9.972772177016081e-05, + "loss": 1.0, + "step": 10410 + }, + { + "epoch": 0.06657041002772703, + "grad_norm": 0.9973478317260742, + "learning_rate": 9.972719858465504e-05, + "loss": 1.1175, + "step": 10420 + }, + { + "epoch": 0.06663429717746572, + "grad_norm": 1.0181374549865723, + "learning_rate": 9.972667489835338e-05, + "loss": 0.9529, + "step": 10430 + }, + { + "epoch": 0.06669818432720442, + "grad_norm": 0.5428194403648376, + "learning_rate": 9.972615071126108e-05, + "loss": 0.6749, + "step": 10440 + }, + { + "epoch": 0.06676207147694313, + "grad_norm": 1.1994624137878418, + "learning_rate": 9.972562602338341e-05, + "loss": 0.8246, + "step": 10450 + }, + { + "epoch": 0.06682595862668182, + "grad_norm": 1.502936601638794, + "learning_rate": 9.972510083472569e-05, + "loss": 0.9699, + "step": 10460 + }, + { + "epoch": 0.06688984577642053, + "grad_norm": 0.9399340748786926, + "learning_rate": 9.972457514529316e-05, + "loss": 0.8597, + "step": 10470 + }, + { + "epoch": 0.06695373292615923, + "grad_norm": 1.0776817798614502, + "learning_rate": 9.972404895509116e-05, + "loss": 1.0443, + "step": 10480 + }, + { + "epoch": 0.06701762007589794, + "grad_norm": 1.5870468616485596, + "learning_rate": 9.972352226412495e-05, + "loss": 1.1327, + "step": 10490 + }, + { + "epoch": 0.06708150722563663, + "grad_norm": 0.8504364490509033, + "learning_rate": 9.972299507239988e-05, + "loss": 0.9158, + "step": 10500 + }, + { + "epoch": 0.06714539437537534, + "grad_norm": 0.7087526321411133, + "learning_rate": 9.972246737992122e-05, + "loss": 0.9687, + "step": 10510 + }, + { + "epoch": 0.06720928152511405, + "grad_norm": 0.9799100756645203, + "learning_rate": 9.972193918669429e-05, + "loss": 1.1421, + "step": 10520 + }, + { + "epoch": 0.06727316867485274, + "grad_norm": 0.6044210195541382, + "learning_rate": 
9.972141049272444e-05, + "loss": 1.1096, + "step": 10530 + }, + { + "epoch": 0.06733705582459144, + "grad_norm": 0.8850777745246887, + "learning_rate": 9.972088129801693e-05, + "loss": 0.8467, + "step": 10540 + }, + { + "epoch": 0.06740094297433015, + "grad_norm": 0.8483796715736389, + "learning_rate": 9.972035160257717e-05, + "loss": 0.9819, + "step": 10550 + }, + { + "epoch": 0.06746483012406884, + "grad_norm": 1.1407147645950317, + "learning_rate": 9.971982140641043e-05, + "loss": 0.9107, + "step": 10560 + }, + { + "epoch": 0.06752871727380755, + "grad_norm": 0.834553599357605, + "learning_rate": 9.971929070952209e-05, + "loss": 1.1262, + "step": 10570 + }, + { + "epoch": 0.06759260442354625, + "grad_norm": 1.0828417539596558, + "learning_rate": 9.971875951191747e-05, + "loss": 0.9017, + "step": 10580 + }, + { + "epoch": 0.06765649157328495, + "grad_norm": 0.5860454440116882, + "learning_rate": 9.971822781360194e-05, + "loss": 0.7191, + "step": 10590 + }, + { + "epoch": 0.06772037872302365, + "grad_norm": 0.767382025718689, + "learning_rate": 9.971769561458084e-05, + "loss": 1.048, + "step": 10600 + }, + { + "epoch": 0.06778426587276236, + "grad_norm": 0.6914779543876648, + "learning_rate": 9.971716291485953e-05, + "loss": 0.949, + "step": 10610 + }, + { + "epoch": 0.06784815302250105, + "grad_norm": 1.306636929512024, + "learning_rate": 9.971662971444338e-05, + "loss": 0.8191, + "step": 10620 + }, + { + "epoch": 0.06791204017223976, + "grad_norm": 1.0141420364379883, + "learning_rate": 9.971609601333776e-05, + "loss": 0.9747, + "step": 10630 + }, + { + "epoch": 0.06797592732197846, + "grad_norm": 0.7582118511199951, + "learning_rate": 9.971556181154802e-05, + "loss": 0.7757, + "step": 10640 + }, + { + "epoch": 0.06803981447171716, + "grad_norm": 0.6744197010993958, + "learning_rate": 9.971502710907958e-05, + "loss": 0.7907, + "step": 10650 + }, + { + "epoch": 0.06810370162145586, + "grad_norm": 1.1960172653198242, + "learning_rate": 9.971449190593782e-05, + 
"loss": 0.9023, + "step": 10660 + }, + { + "epoch": 0.06816758877119457, + "grad_norm": 1.0107911825180054, + "learning_rate": 9.971395620212811e-05, + "loss": 0.918, + "step": 10670 + }, + { + "epoch": 0.06823147592093326, + "grad_norm": 0.6501746773719788, + "learning_rate": 9.971341999765585e-05, + "loss": 0.9352, + "step": 10680 + }, + { + "epoch": 0.06829536307067197, + "grad_norm": 0.9184291362762451, + "learning_rate": 9.971288329252644e-05, + "loss": 1.1747, + "step": 10690 + }, + { + "epoch": 0.06835925022041067, + "grad_norm": 0.5910547971725464, + "learning_rate": 9.971234608674529e-05, + "loss": 0.7598, + "step": 10700 + }, + { + "epoch": 0.06842313737014936, + "grad_norm": 0.8851799964904785, + "learning_rate": 9.97118083803178e-05, + "loss": 0.9643, + "step": 10710 + }, + { + "epoch": 0.06848702451988807, + "grad_norm": 0.6597937941551208, + "learning_rate": 9.97112701732494e-05, + "loss": 0.9897, + "step": 10720 + }, + { + "epoch": 0.06855091166962678, + "grad_norm": 0.6581412553787231, + "learning_rate": 9.97107314655455e-05, + "loss": 1.1446, + "step": 10730 + }, + { + "epoch": 0.06861479881936547, + "grad_norm": 0.5868738293647766, + "learning_rate": 9.971019225721153e-05, + "loss": 1.0789, + "step": 10740 + }, + { + "epoch": 0.06867868596910418, + "grad_norm": 0.6730684041976929, + "learning_rate": 9.970965254825292e-05, + "loss": 0.9802, + "step": 10750 + }, + { + "epoch": 0.06874257311884288, + "grad_norm": 0.8661940097808838, + "learning_rate": 9.970911233867511e-05, + "loss": 1.0777, + "step": 10760 + }, + { + "epoch": 0.06880646026858157, + "grad_norm": 1.0571337938308716, + "learning_rate": 9.970857162848352e-05, + "loss": 1.0175, + "step": 10770 + }, + { + "epoch": 0.06887034741832028, + "grad_norm": 1.2184176445007324, + "learning_rate": 9.970803041768362e-05, + "loss": 0.9196, + "step": 10780 + }, + { + "epoch": 0.06893423456805899, + "grad_norm": 0.6517652869224548, + "learning_rate": 9.970748870628083e-05, + "loss": 0.9498, + "step": 
10790 + }, + { + "epoch": 0.06899812171779768, + "grad_norm": 1.2037395238876343, + "learning_rate": 9.970694649428065e-05, + "loss": 0.785, + "step": 10800 + }, + { + "epoch": 0.06906200886753638, + "grad_norm": 0.8196636438369751, + "learning_rate": 9.97064037816885e-05, + "loss": 0.9136, + "step": 10810 + }, + { + "epoch": 0.06912589601727509, + "grad_norm": 0.9403445720672607, + "learning_rate": 9.970586056850988e-05, + "loss": 0.847, + "step": 10820 + }, + { + "epoch": 0.06918978316701378, + "grad_norm": 0.5096237659454346, + "learning_rate": 9.970531685475024e-05, + "loss": 0.8693, + "step": 10830 + }, + { + "epoch": 0.06925367031675249, + "grad_norm": 0.5676767230033875, + "learning_rate": 9.970477264041505e-05, + "loss": 0.9367, + "step": 10840 + }, + { + "epoch": 0.0693175574664912, + "grad_norm": 0.9769662618637085, + "learning_rate": 9.970422792550978e-05, + "loss": 0.9091, + "step": 10850 + }, + { + "epoch": 0.06938144461622989, + "grad_norm": 0.6873984932899475, + "learning_rate": 9.970368271003995e-05, + "loss": 0.9392, + "step": 10860 + }, + { + "epoch": 0.0694453317659686, + "grad_norm": 1.1281991004943848, + "learning_rate": 9.970313699401104e-05, + "loss": 0.8311, + "step": 10870 + }, + { + "epoch": 0.0695092189157073, + "grad_norm": 0.8184236288070679, + "learning_rate": 9.970259077742855e-05, + "loss": 0.7781, + "step": 10880 + }, + { + "epoch": 0.06957310606544599, + "grad_norm": 0.7411293983459473, + "learning_rate": 9.970204406029796e-05, + "loss": 0.8319, + "step": 10890 + }, + { + "epoch": 0.0696369932151847, + "grad_norm": 0.8405719995498657, + "learning_rate": 9.97014968426248e-05, + "loss": 1.1157, + "step": 10900 + }, + { + "epoch": 0.0697008803649234, + "grad_norm": 0.8236634731292725, + "learning_rate": 9.970094912441454e-05, + "loss": 0.8209, + "step": 10910 + }, + { + "epoch": 0.0697647675146621, + "grad_norm": 0.7503064870834351, + "learning_rate": 9.970040090567275e-05, + "loss": 1.1207, + "step": 10920 + }, + { + "epoch": 
0.0698286546644008, + "grad_norm": 1.037656545639038, + "learning_rate": 9.969985218640492e-05, + "loss": 1.0938, + "step": 10930 + }, + { + "epoch": 0.06989254181413951, + "grad_norm": 2.2834203243255615, + "learning_rate": 9.969930296661658e-05, + "loss": 1.0299, + "step": 10940 + }, + { + "epoch": 0.0699564289638782, + "grad_norm": 0.47441643476486206, + "learning_rate": 9.969875324631327e-05, + "loss": 0.8998, + "step": 10950 + }, + { + "epoch": 0.0700203161136169, + "grad_norm": 0.8986606597900391, + "learning_rate": 9.969820302550051e-05, + "loss": 0.8735, + "step": 10960 + }, + { + "epoch": 0.07008420326335561, + "grad_norm": 0.6057919263839722, + "learning_rate": 9.969765230418386e-05, + "loss": 0.8311, + "step": 10970 + }, + { + "epoch": 0.0701480904130943, + "grad_norm": 0.9726822972297668, + "learning_rate": 9.969710108236885e-05, + "loss": 1.0337, + "step": 10980 + }, + { + "epoch": 0.07021197756283301, + "grad_norm": 0.875328779220581, + "learning_rate": 9.969654936006102e-05, + "loss": 0.978, + "step": 10990 + }, + { + "epoch": 0.07027586471257172, + "grad_norm": 1.4699301719665527, + "learning_rate": 9.969599713726599e-05, + "loss": 0.709, + "step": 11000 + }, + { + "epoch": 0.07033975186231041, + "grad_norm": 0.9150874614715576, + "learning_rate": 9.969544441398924e-05, + "loss": 0.7534, + "step": 11010 + }, + { + "epoch": 0.07040363901204912, + "grad_norm": 0.9999013543128967, + "learning_rate": 9.969489119023638e-05, + "loss": 1.0469, + "step": 11020 + }, + { + "epoch": 0.07046752616178782, + "grad_norm": 1.0596497058868408, + "learning_rate": 9.969433746601298e-05, + "loss": 0.8638, + "step": 11030 + }, + { + "epoch": 0.07053141331152651, + "grad_norm": 0.5560715198516846, + "learning_rate": 9.96937832413246e-05, + "loss": 0.8614, + "step": 11040 + }, + { + "epoch": 0.07059530046126522, + "grad_norm": 0.7285141944885254, + "learning_rate": 9.969322851617684e-05, + "loss": 0.7894, + "step": 11050 + }, + { + "epoch": 0.07065918761100393, + 
"grad_norm": 0.8218443393707275, + "learning_rate": 9.969267329057526e-05, + "loss": 1.116, + "step": 11060 + }, + { + "epoch": 0.07072307476074262, + "grad_norm": 0.7729995250701904, + "learning_rate": 9.96921175645255e-05, + "loss": 0.9044, + "step": 11070 + }, + { + "epoch": 0.07078696191048132, + "grad_norm": 0.719794511795044, + "learning_rate": 9.96915613380331e-05, + "loss": 1.182, + "step": 11080 + }, + { + "epoch": 0.07085084906022003, + "grad_norm": 0.9527838230133057, + "learning_rate": 9.96910046111037e-05, + "loss": 1.0074, + "step": 11090 + }, + { + "epoch": 0.07091473620995872, + "grad_norm": 0.7101008892059326, + "learning_rate": 9.969044738374289e-05, + "loss": 1.0559, + "step": 11100 + }, + { + "epoch": 0.07097862335969743, + "grad_norm": 0.4492223560810089, + "learning_rate": 9.968988965595629e-05, + "loss": 0.727, + "step": 11110 + }, + { + "epoch": 0.07104251050943614, + "grad_norm": 0.6947804093360901, + "learning_rate": 9.968933142774952e-05, + "loss": 0.9424, + "step": 11120 + }, + { + "epoch": 0.07110639765917483, + "grad_norm": 1.0676300525665283, + "learning_rate": 9.968877269912819e-05, + "loss": 0.7982, + "step": 11130 + }, + { + "epoch": 0.07117028480891353, + "grad_norm": 0.7446919679641724, + "learning_rate": 9.968821347009792e-05, + "loss": 0.9773, + "step": 11140 + }, + { + "epoch": 0.07123417195865224, + "grad_norm": 1.2251659631729126, + "learning_rate": 9.968765374066437e-05, + "loss": 0.8226, + "step": 11150 + }, + { + "epoch": 0.07129805910839093, + "grad_norm": 0.635826051235199, + "learning_rate": 9.968709351083315e-05, + "loss": 0.7913, + "step": 11160 + }, + { + "epoch": 0.07136194625812964, + "grad_norm": 1.43468177318573, + "learning_rate": 9.968653278060992e-05, + "loss": 0.7686, + "step": 11170 + }, + { + "epoch": 0.07142583340786834, + "grad_norm": 6.540151596069336, + "learning_rate": 9.968597155000033e-05, + "loss": 0.8114, + "step": 11180 + }, + { + "epoch": 0.07148972055760704, + "grad_norm": 1.3150196075439453, + 
"learning_rate": 9.968540981901e-05, + "loss": 0.6353, + "step": 11190 + }, + { + "epoch": 0.07155360770734574, + "grad_norm": 0.5050914883613586, + "learning_rate": 9.968484758764462e-05, + "loss": 0.6865, + "step": 11200 + }, + { + "epoch": 0.07161749485708445, + "grad_norm": 0.9180815815925598, + "learning_rate": 9.968428485590983e-05, + "loss": 0.9142, + "step": 11210 + }, + { + "epoch": 0.07168138200682314, + "grad_norm": 1.4517556428909302, + "learning_rate": 9.968372162381133e-05, + "loss": 0.7999, + "step": 11220 + }, + { + "epoch": 0.07174526915656185, + "grad_norm": 0.6034737229347229, + "learning_rate": 9.968315789135475e-05, + "loss": 1.23, + "step": 11230 + }, + { + "epoch": 0.07180915630630055, + "grad_norm": 0.9869849681854248, + "learning_rate": 9.96825936585458e-05, + "loss": 1.0731, + "step": 11240 + }, + { + "epoch": 0.07187304345603925, + "grad_norm": 0.6998457908630371, + "learning_rate": 9.968202892539014e-05, + "loss": 1.1126, + "step": 11250 + }, + { + "epoch": 0.07193693060577795, + "grad_norm": 0.7587766647338867, + "learning_rate": 9.968146369189349e-05, + "loss": 0.9376, + "step": 11260 + }, + { + "epoch": 0.07200081775551666, + "grad_norm": 0.9407736659049988, + "learning_rate": 9.96808979580615e-05, + "loss": 0.7904, + "step": 11270 + }, + { + "epoch": 0.07206470490525535, + "grad_norm": 1.7557258605957031, + "learning_rate": 9.968033172389989e-05, + "loss": 0.8119, + "step": 11280 + }, + { + "epoch": 0.07212859205499406, + "grad_norm": 0.6084944605827332, + "learning_rate": 9.967976498941436e-05, + "loss": 0.7708, + "step": 11290 + }, + { + "epoch": 0.07219247920473276, + "grad_norm": 0.7556819915771484, + "learning_rate": 9.967919775461063e-05, + "loss": 0.7996, + "step": 11300 + }, + { + "epoch": 0.07225636635447147, + "grad_norm": 0.7954988479614258, + "learning_rate": 9.967863001949438e-05, + "loss": 1.1191, + "step": 11310 + }, + { + "epoch": 0.07232025350421016, + "grad_norm": 0.7278555631637573, + "learning_rate": 
9.967806178407135e-05, + "loss": 0.8343, + "step": 11320 + }, + { + "epoch": 0.07238414065394887, + "grad_norm": 0.7036782503128052, + "learning_rate": 9.967749304834728e-05, + "loss": 0.8194, + "step": 11330 + }, + { + "epoch": 0.07244802780368757, + "grad_norm": 1.3781989812850952, + "learning_rate": 9.967692381232786e-05, + "loss": 0.8285, + "step": 11340 + }, + { + "epoch": 0.07251191495342627, + "grad_norm": 0.885075569152832, + "learning_rate": 9.967635407601886e-05, + "loss": 1.0042, + "step": 11350 + }, + { + "epoch": 0.07257580210316497, + "grad_norm": 0.6959792375564575, + "learning_rate": 9.967578383942597e-05, + "loss": 0.8172, + "step": 11360 + }, + { + "epoch": 0.07263968925290368, + "grad_norm": 1.800525188446045, + "learning_rate": 9.967521310255498e-05, + "loss": 0.8708, + "step": 11370 + }, + { + "epoch": 0.07270357640264237, + "grad_norm": 0.6853658556938171, + "learning_rate": 9.96746418654116e-05, + "loss": 0.8518, + "step": 11380 + }, + { + "epoch": 0.07276746355238108, + "grad_norm": 0.7943517565727234, + "learning_rate": 9.967407012800163e-05, + "loss": 0.7797, + "step": 11390 + }, + { + "epoch": 0.07283135070211978, + "grad_norm": 0.7777195572853088, + "learning_rate": 9.967349789033078e-05, + "loss": 0.7811, + "step": 11400 + }, + { + "epoch": 0.07289523785185847, + "grad_norm": 0.9152284860610962, + "learning_rate": 9.967292515240486e-05, + "loss": 0.7322, + "step": 11410 + }, + { + "epoch": 0.07295912500159718, + "grad_norm": 1.2940709590911865, + "learning_rate": 9.967235191422957e-05, + "loss": 0.7784, + "step": 11420 + }, + { + "epoch": 0.07302301215133589, + "grad_norm": 1.4273176193237305, + "learning_rate": 9.967177817581075e-05, + "loss": 1.334, + "step": 11430 + }, + { + "epoch": 0.07308689930107458, + "grad_norm": 0.9415301084518433, + "learning_rate": 9.967120393715414e-05, + "loss": 1.33, + "step": 11440 + }, + { + "epoch": 0.07315078645081328, + "grad_norm": 1.6769905090332031, + "learning_rate": 9.967062919826552e-05, + 
"loss": 0.8804, + "step": 11450 + }, + { + "epoch": 0.07321467360055199, + "grad_norm": 0.8233237266540527, + "learning_rate": 9.967005395915072e-05, + "loss": 0.9747, + "step": 11460 + }, + { + "epoch": 0.07327856075029068, + "grad_norm": 0.793849527835846, + "learning_rate": 9.966947821981551e-05, + "loss": 0.736, + "step": 11470 + }, + { + "epoch": 0.07334244790002939, + "grad_norm": 0.8288117051124573, + "learning_rate": 9.966890198026566e-05, + "loss": 0.9165, + "step": 11480 + }, + { + "epoch": 0.0734063350497681, + "grad_norm": 0.7047694325447083, + "learning_rate": 9.966832524050702e-05, + "loss": 0.8662, + "step": 11490 + }, + { + "epoch": 0.07347022219950679, + "grad_norm": 0.6443949937820435, + "learning_rate": 9.966774800054535e-05, + "loss": 1.0167, + "step": 11500 + }, + { + "epoch": 0.0735341093492455, + "grad_norm": 0.6362110376358032, + "learning_rate": 9.966717026038651e-05, + "loss": 1.0175, + "step": 11510 + }, + { + "epoch": 0.0735979964989842, + "grad_norm": 0.7651115655899048, + "learning_rate": 9.96665920200363e-05, + "loss": 0.914, + "step": 11520 + }, + { + "epoch": 0.07366188364872289, + "grad_norm": 0.7375466823577881, + "learning_rate": 9.966601327950052e-05, + "loss": 0.9936, + "step": 11530 + }, + { + "epoch": 0.0737257707984616, + "grad_norm": 0.7288793325424194, + "learning_rate": 9.966543403878503e-05, + "loss": 1.1943, + "step": 11540 + }, + { + "epoch": 0.0737896579482003, + "grad_norm": 0.8896105289459229, + "learning_rate": 9.966485429789565e-05, + "loss": 1.0228, + "step": 11550 + }, + { + "epoch": 0.073853545097939, + "grad_norm": 1.1143486499786377, + "learning_rate": 9.966427405683823e-05, + "loss": 0.8327, + "step": 11560 + }, + { + "epoch": 0.0739174322476777, + "grad_norm": 0.9701015949249268, + "learning_rate": 9.96636933156186e-05, + "loss": 0.8488, + "step": 11570 + }, + { + "epoch": 0.07398131939741641, + "grad_norm": 0.8440617322921753, + "learning_rate": 9.966311207424261e-05, + "loss": 1.1248, + "step": 11580 + }, 
+ { + "epoch": 0.0740452065471551, + "grad_norm": 1.1028122901916504, + "learning_rate": 9.96625303327161e-05, + "loss": 0.941, + "step": 11590 + }, + { + "epoch": 0.07410909369689381, + "grad_norm": 0.8367504477500916, + "learning_rate": 9.966194809104498e-05, + "loss": 1.0069, + "step": 11600 + }, + { + "epoch": 0.07417298084663251, + "grad_norm": 0.6582353115081787, + "learning_rate": 9.966136534923507e-05, + "loss": 1.0914, + "step": 11610 + }, + { + "epoch": 0.0742368679963712, + "grad_norm": 0.720551609992981, + "learning_rate": 9.966078210729224e-05, + "loss": 0.8932, + "step": 11620 + }, + { + "epoch": 0.07430075514610991, + "grad_norm": 1.5726115703582764, + "learning_rate": 9.966019836522235e-05, + "loss": 0.666, + "step": 11630 + }, + { + "epoch": 0.07436464229584862, + "grad_norm": 0.8888491988182068, + "learning_rate": 9.965961412303133e-05, + "loss": 0.8511, + "step": 11640 + }, + { + "epoch": 0.07442852944558731, + "grad_norm": 0.9958298206329346, + "learning_rate": 9.965902938072503e-05, + "loss": 0.8403, + "step": 11650 + }, + { + "epoch": 0.07449241659532602, + "grad_norm": 0.9258823394775391, + "learning_rate": 9.965844413830934e-05, + "loss": 0.9406, + "step": 11660 + }, + { + "epoch": 0.07455630374506472, + "grad_norm": 0.6303139328956604, + "learning_rate": 9.965785839579016e-05, + "loss": 0.8162, + "step": 11670 + }, + { + "epoch": 0.07462019089480341, + "grad_norm": 0.8224695920944214, + "learning_rate": 9.965727215317338e-05, + "loss": 0.8578, + "step": 11680 + }, + { + "epoch": 0.07468407804454212, + "grad_norm": 0.7703375816345215, + "learning_rate": 9.965668541046491e-05, + "loss": 0.9871, + "step": 11690 + }, + { + "epoch": 0.07474796519428083, + "grad_norm": 0.5986992716789246, + "learning_rate": 9.965609816767066e-05, + "loss": 0.793, + "step": 11700 + }, + { + "epoch": 0.07481185234401952, + "grad_norm": 0.7556684613227844, + "learning_rate": 9.965551042479655e-05, + "loss": 0.9343, + "step": 11710 + }, + { + "epoch": 
0.07487573949375823, + "grad_norm": 0.7659729719161987, + "learning_rate": 9.965492218184848e-05, + "loss": 0.8594, + "step": 11720 + }, + { + "epoch": 0.07493962664349693, + "grad_norm": 0.7803331017494202, + "learning_rate": 9.965433343883239e-05, + "loss": 0.7292, + "step": 11730 + }, + { + "epoch": 0.07500351379323562, + "grad_norm": 0.9800279140472412, + "learning_rate": 9.96537441957542e-05, + "loss": 0.7982, + "step": 11740 + }, + { + "epoch": 0.07506740094297433, + "grad_norm": 1.3977315425872803, + "learning_rate": 9.965315445261986e-05, + "loss": 0.7011, + "step": 11750 + }, + { + "epoch": 0.07513128809271304, + "grad_norm": 0.6457341313362122, + "learning_rate": 9.965256420943529e-05, + "loss": 0.8958, + "step": 11760 + }, + { + "epoch": 0.07519517524245173, + "grad_norm": 0.789249062538147, + "learning_rate": 9.965197346620645e-05, + "loss": 0.8956, + "step": 11770 + }, + { + "epoch": 0.07525906239219043, + "grad_norm": 0.8489546179771423, + "learning_rate": 9.965138222293928e-05, + "loss": 0.8684, + "step": 11780 + }, + { + "epoch": 0.07532294954192914, + "grad_norm": 0.7303208112716675, + "learning_rate": 9.965079047963974e-05, + "loss": 0.9646, + "step": 11790 + }, + { + "epoch": 0.07538683669166783, + "grad_norm": 3.839034080505371, + "learning_rate": 9.965019823631378e-05, + "loss": 0.8553, + "step": 11800 + }, + { + "epoch": 0.07545072384140654, + "grad_norm": 1.2064359188079834, + "learning_rate": 9.964960549296736e-05, + "loss": 1.0195, + "step": 11810 + }, + { + "epoch": 0.07551461099114525, + "grad_norm": 0.7502697706222534, + "learning_rate": 9.964901224960647e-05, + "loss": 0.9259, + "step": 11820 + }, + { + "epoch": 0.07557849814088394, + "grad_norm": 0.5781645774841309, + "learning_rate": 9.964841850623709e-05, + "loss": 0.8668, + "step": 11830 + }, + { + "epoch": 0.07564238529062264, + "grad_norm": 0.8652671575546265, + "learning_rate": 9.964782426286516e-05, + "loss": 0.8489, + "step": 11840 + }, + { + "epoch": 0.07570627244036135, + 
"grad_norm": 0.9653028845787048, + "learning_rate": 9.96472295194967e-05, + "loss": 0.9514, + "step": 11850 + }, + { + "epoch": 0.07577015959010004, + "grad_norm": 2.5349843502044678, + "learning_rate": 9.964663427613769e-05, + "loss": 1.0536, + "step": 11860 + }, + { + "epoch": 0.07583404673983875, + "grad_norm": 1.0257644653320312, + "learning_rate": 9.96460385327941e-05, + "loss": 1.05, + "step": 11870 + }, + { + "epoch": 0.07589793388957745, + "grad_norm": 0.6599146723747253, + "learning_rate": 9.964544228947199e-05, + "loss": 0.9347, + "step": 11880 + }, + { + "epoch": 0.07596182103931615, + "grad_norm": 1.0453253984451294, + "learning_rate": 9.96448455461773e-05, + "loss": 0.9054, + "step": 11890 + }, + { + "epoch": 0.07602570818905485, + "grad_norm": 0.5662599802017212, + "learning_rate": 9.964424830291607e-05, + "loss": 0.9117, + "step": 11900 + }, + { + "epoch": 0.07608959533879356, + "grad_norm": 0.6186836361885071, + "learning_rate": 9.964365055969431e-05, + "loss": 0.9725, + "step": 11910 + }, + { + "epoch": 0.07615348248853225, + "grad_norm": 0.8609874844551086, + "learning_rate": 9.964305231651804e-05, + "loss": 0.9634, + "step": 11920 + }, + { + "epoch": 0.07621736963827096, + "grad_norm": 0.8729275465011597, + "learning_rate": 9.96424535733933e-05, + "loss": 0.9384, + "step": 11930 + }, + { + "epoch": 0.07628125678800966, + "grad_norm": 0.9938400387763977, + "learning_rate": 9.964185433032609e-05, + "loss": 0.8695, + "step": 11940 + }, + { + "epoch": 0.07634514393774836, + "grad_norm": 0.836526095867157, + "learning_rate": 9.964125458732247e-05, + "loss": 0.9405, + "step": 11950 + }, + { + "epoch": 0.07640903108748706, + "grad_norm": 0.7302273511886597, + "learning_rate": 9.964065434438846e-05, + "loss": 1.0793, + "step": 11960 + }, + { + "epoch": 0.07647291823722577, + "grad_norm": 0.49212926626205444, + "learning_rate": 9.964005360153013e-05, + "loss": 0.8772, + "step": 11970 + }, + { + "epoch": 0.07653680538696446, + "grad_norm": 
0.6889157295227051, + "learning_rate": 9.963945235875351e-05, + "loss": 0.9, + "step": 11980 + }, + { + "epoch": 0.07660069253670317, + "grad_norm": 0.9073895215988159, + "learning_rate": 9.963885061606466e-05, + "loss": 1.2127, + "step": 11990 + }, + { + "epoch": 0.07666457968644187, + "grad_norm": 0.8105494976043701, + "learning_rate": 9.963824837346963e-05, + "loss": 0.8683, + "step": 12000 + }, + { + "epoch": 0.07672846683618056, + "grad_norm": 0.9559453129768372, + "learning_rate": 9.963764563097451e-05, + "loss": 0.8229, + "step": 12010 + }, + { + "epoch": 0.07679235398591927, + "grad_norm": 0.7197737693786621, + "learning_rate": 9.963704238858535e-05, + "loss": 1.0417, + "step": 12020 + }, + { + "epoch": 0.07685624113565798, + "grad_norm": 1.704092025756836, + "learning_rate": 9.963643864630823e-05, + "loss": 0.8046, + "step": 12030 + }, + { + "epoch": 0.07692012828539667, + "grad_norm": 0.7579613327980042, + "learning_rate": 9.963583440414923e-05, + "loss": 0.9269, + "step": 12040 + }, + { + "epoch": 0.07698401543513538, + "grad_norm": 1.0408282279968262, + "learning_rate": 9.963522966211444e-05, + "loss": 1.0785, + "step": 12050 + }, + { + "epoch": 0.07704790258487408, + "grad_norm": 0.5655786991119385, + "learning_rate": 9.963462442020994e-05, + "loss": 0.8481, + "step": 12060 + }, + { + "epoch": 0.07711178973461277, + "grad_norm": 0.6558650732040405, + "learning_rate": 9.963401867844184e-05, + "loss": 0.9213, + "step": 12070 + }, + { + "epoch": 0.07717567688435148, + "grad_norm": 0.9138306975364685, + "learning_rate": 9.963341243681623e-05, + "loss": 0.8109, + "step": 12080 + }, + { + "epoch": 0.07723956403409019, + "grad_norm": 0.8476769924163818, + "learning_rate": 9.963280569533923e-05, + "loss": 0.8877, + "step": 12090 + }, + { + "epoch": 0.07730345118382888, + "grad_norm": 1.9213597774505615, + "learning_rate": 9.963219845401692e-05, + "loss": 0.8959, + "step": 12100 + }, + { + "epoch": 0.07736733833356758, + "grad_norm": 0.6933993697166443, + 
"learning_rate": 9.963159071285544e-05, + "loss": 0.8968, + "step": 12110 + }, + { + "epoch": 0.07743122548330629, + "grad_norm": 0.6891202926635742, + "learning_rate": 9.963098247186091e-05, + "loss": 1.2008, + "step": 12120 + }, + { + "epoch": 0.07749511263304498, + "grad_norm": 0.7064499855041504, + "learning_rate": 9.963037373103944e-05, + "loss": 0.9018, + "step": 12130 + }, + { + "epoch": 0.07755899978278369, + "grad_norm": 0.7487188577651978, + "learning_rate": 9.962976449039717e-05, + "loss": 1.0011, + "step": 12140 + }, + { + "epoch": 0.0776228869325224, + "grad_norm": 0.8367332816123962, + "learning_rate": 9.962915474994023e-05, + "loss": 0.9068, + "step": 12150 + }, + { + "epoch": 0.0776867740822611, + "grad_norm": 1.0736783742904663, + "learning_rate": 9.962854450967478e-05, + "loss": 0.9293, + "step": 12160 + }, + { + "epoch": 0.0777506612319998, + "grad_norm": 0.715390682220459, + "learning_rate": 9.962793376960695e-05, + "loss": 0.9036, + "step": 12170 + }, + { + "epoch": 0.0778145483817385, + "grad_norm": 1.1531165838241577, + "learning_rate": 9.962732252974289e-05, + "loss": 0.7847, + "step": 12180 + }, + { + "epoch": 0.0778784355314772, + "grad_norm": 0.6619348526000977, + "learning_rate": 9.962671079008876e-05, + "loss": 1.0075, + "step": 12190 + }, + { + "epoch": 0.0779423226812159, + "grad_norm": 1.0544220209121704, + "learning_rate": 9.962609855065072e-05, + "loss": 0.9982, + "step": 12200 + }, + { + "epoch": 0.0780062098309546, + "grad_norm": 0.6626638174057007, + "learning_rate": 9.962548581143494e-05, + "loss": 1.0559, + "step": 12210 + }, + { + "epoch": 0.07807009698069331, + "grad_norm": 1.291588544845581, + "learning_rate": 9.962487257244757e-05, + "loss": 1.0497, + "step": 12220 + }, + { + "epoch": 0.078133984130432, + "grad_norm": 0.7503036260604858, + "learning_rate": 9.962425883369481e-05, + "loss": 0.9837, + "step": 12230 + }, + { + "epoch": 0.07819787128017071, + "grad_norm": 0.789021909236908, + "learning_rate": 
9.962364459518283e-05, + "loss": 0.8779, + "step": 12240 + }, + { + "epoch": 0.07826175842990941, + "grad_norm": 1.2305183410644531, + "learning_rate": 9.962302985691783e-05, + "loss": 0.9292, + "step": 12250 + }, + { + "epoch": 0.0783256455796481, + "grad_norm": 1.5961018800735474, + "learning_rate": 9.962241461890598e-05, + "loss": 0.9467, + "step": 12260 + }, + { + "epoch": 0.07838953272938681, + "grad_norm": 0.5835550427436829, + "learning_rate": 9.962179888115348e-05, + "loss": 1.0957, + "step": 12270 + }, + { + "epoch": 0.07845341987912552, + "grad_norm": 1.0020620822906494, + "learning_rate": 9.962118264366655e-05, + "loss": 0.9427, + "step": 12280 + }, + { + "epoch": 0.07851730702886421, + "grad_norm": 0.6819837689399719, + "learning_rate": 9.962056590645136e-05, + "loss": 1.0855, + "step": 12290 + }, + { + "epoch": 0.07858119417860292, + "grad_norm": 1.3488112688064575, + "learning_rate": 9.961994866951416e-05, + "loss": 0.6407, + "step": 12300 + }, + { + "epoch": 0.07864508132834162, + "grad_norm": 0.8530036807060242, + "learning_rate": 9.961933093286115e-05, + "loss": 1.0095, + "step": 12310 + }, + { + "epoch": 0.07870896847808032, + "grad_norm": 0.7318217158317566, + "learning_rate": 9.961871269649854e-05, + "loss": 0.8607, + "step": 12320 + }, + { + "epoch": 0.07877285562781902, + "grad_norm": 0.5192087292671204, + "learning_rate": 9.96180939604326e-05, + "loss": 0.7035, + "step": 12330 + }, + { + "epoch": 0.07883674277755773, + "grad_norm": 0.8365872502326965, + "learning_rate": 9.961747472466949e-05, + "loss": 1.4109, + "step": 12340 + }, + { + "epoch": 0.07890062992729642, + "grad_norm": 0.9271693229675293, + "learning_rate": 9.96168549892155e-05, + "loss": 0.842, + "step": 12350 + }, + { + "epoch": 0.07896451707703513, + "grad_norm": 1.00367271900177, + "learning_rate": 9.961623475407684e-05, + "loss": 1.0556, + "step": 12360 + }, + { + "epoch": 0.07902840422677383, + "grad_norm": 1.339418888092041, + "learning_rate": 9.96156140192598e-05, + 
"loss": 0.8171, + "step": 12370 + }, + { + "epoch": 0.07909229137651252, + "grad_norm": 1.03416109085083, + "learning_rate": 9.961499278477058e-05, + "loss": 0.8902, + "step": 12380 + }, + { + "epoch": 0.07915617852625123, + "grad_norm": 0.847169041633606, + "learning_rate": 9.961437105061546e-05, + "loss": 0.9201, + "step": 12390 + }, + { + "epoch": 0.07922006567598994, + "grad_norm": 1.1525788307189941, + "learning_rate": 9.961374881680072e-05, + "loss": 1.054, + "step": 12400 + }, + { + "epoch": 0.07928395282572863, + "grad_norm": 0.7588199973106384, + "learning_rate": 9.96131260833326e-05, + "loss": 0.9179, + "step": 12410 + }, + { + "epoch": 0.07934783997546734, + "grad_norm": 1.2406294345855713, + "learning_rate": 9.961250285021737e-05, + "loss": 1.1218, + "step": 12420 + }, + { + "epoch": 0.07941172712520604, + "grad_norm": 0.7575234174728394, + "learning_rate": 9.961187911746133e-05, + "loss": 1.0122, + "step": 12430 + }, + { + "epoch": 0.07947561427494473, + "grad_norm": 0.7496919631958008, + "learning_rate": 9.961125488507072e-05, + "loss": 1.0282, + "step": 12440 + }, + { + "epoch": 0.07953950142468344, + "grad_norm": 0.8383338451385498, + "learning_rate": 9.961063015305188e-05, + "loss": 0.9828, + "step": 12450 + }, + { + "epoch": 0.07960338857442215, + "grad_norm": 1.0005531311035156, + "learning_rate": 9.961000492141106e-05, + "loss": 1.061, + "step": 12460 + }, + { + "epoch": 0.07966727572416084, + "grad_norm": 0.9767794013023376, + "learning_rate": 9.960937919015458e-05, + "loss": 1.0097, + "step": 12470 + }, + { + "epoch": 0.07973116287389954, + "grad_norm": 0.7348878383636475, + "learning_rate": 9.960875295928874e-05, + "loss": 0.8203, + "step": 12480 + }, + { + "epoch": 0.07979505002363825, + "grad_norm": 0.7473248243331909, + "learning_rate": 9.960812622881982e-05, + "loss": 0.8261, + "step": 12490 + }, + { + "epoch": 0.07985893717337694, + "grad_norm": 0.6296994686126709, + "learning_rate": 9.960749899875417e-05, + "loss": 0.9531, + "step": 
12500 + }, + { + "epoch": 0.07992282432311565, + "grad_norm": 0.48655831813812256, + "learning_rate": 9.960687126909807e-05, + "loss": 0.8131, + "step": 12510 + }, + { + "epoch": 0.07998671147285435, + "grad_norm": 0.8312428593635559, + "learning_rate": 9.960624303985787e-05, + "loss": 0.7988, + "step": 12520 + }, + { + "epoch": 0.08005059862259305, + "grad_norm": 0.7593886256217957, + "learning_rate": 9.96056143110399e-05, + "loss": 0.6993, + "step": 12530 + }, + { + "epoch": 0.08011448577233175, + "grad_norm": 0.9787190556526184, + "learning_rate": 9.960498508265046e-05, + "loss": 1.1168, + "step": 12540 + }, + { + "epoch": 0.08017837292207046, + "grad_norm": 1.374013066291809, + "learning_rate": 9.960435535469591e-05, + "loss": 0.959, + "step": 12550 + }, + { + "epoch": 0.08024226007180915, + "grad_norm": 0.632503867149353, + "learning_rate": 9.960372512718258e-05, + "loss": 0.9161, + "step": 12560 + }, + { + "epoch": 0.08030614722154786, + "grad_norm": 0.7403663992881775, + "learning_rate": 9.960309440011685e-05, + "loss": 0.5914, + "step": 12570 + }, + { + "epoch": 0.08037003437128656, + "grad_norm": 0.691646158695221, + "learning_rate": 9.960246317350503e-05, + "loss": 0.8991, + "step": 12580 + }, + { + "epoch": 0.08043392152102526, + "grad_norm": 0.5965979099273682, + "learning_rate": 9.960183144735348e-05, + "loss": 0.81, + "step": 12590 + }, + { + "epoch": 0.08049780867076396, + "grad_norm": 0.9545162320137024, + "learning_rate": 9.960119922166859e-05, + "loss": 1.0659, + "step": 12600 + }, + { + "epoch": 0.08056169582050267, + "grad_norm": 2.2266764640808105, + "learning_rate": 9.960056649645673e-05, + "loss": 1.2056, + "step": 12610 + }, + { + "epoch": 0.08062558297024136, + "grad_norm": 1.257367730140686, + "learning_rate": 9.959993327172423e-05, + "loss": 1.0144, + "step": 12620 + }, + { + "epoch": 0.08068947011998007, + "grad_norm": 0.8366072177886963, + "learning_rate": 9.959929954747751e-05, + "loss": 0.896, + "step": 12630 + }, + { + "epoch": 
0.08075335726971877, + "grad_norm": 0.71613609790802, + "learning_rate": 9.959866532372292e-05, + "loss": 0.7121, + "step": 12640 + }, + { + "epoch": 0.08081724441945747, + "grad_norm": 0.678428053855896, + "learning_rate": 9.959803060046687e-05, + "loss": 0.8114, + "step": 12650 + }, + { + "epoch": 0.08088113156919617, + "grad_norm": 0.8528268337249756, + "learning_rate": 9.959739537771573e-05, + "loss": 0.9052, + "step": 12660 + }, + { + "epoch": 0.08094501871893488, + "grad_norm": 0.8090612292289734, + "learning_rate": 9.959675965547592e-05, + "loss": 0.9429, + "step": 12670 + }, + { + "epoch": 0.08100890586867357, + "grad_norm": 1.0413676500320435, + "learning_rate": 9.959612343375385e-05, + "loss": 0.9671, + "step": 12680 + }, + { + "epoch": 0.08107279301841228, + "grad_norm": 0.6349504590034485, + "learning_rate": 9.959548671255588e-05, + "loss": 1.0272, + "step": 12690 + }, + { + "epoch": 0.08113668016815098, + "grad_norm": 1.0371969938278198, + "learning_rate": 9.959484949188846e-05, + "loss": 0.7439, + "step": 12700 + }, + { + "epoch": 0.08120056731788967, + "grad_norm": 0.7047412991523743, + "learning_rate": 9.9594211771758e-05, + "loss": 0.8986, + "step": 12710 + }, + { + "epoch": 0.08126445446762838, + "grad_norm": 0.659905195236206, + "learning_rate": 9.959357355217093e-05, + "loss": 0.7917, + "step": 12720 + }, + { + "epoch": 0.08132834161736709, + "grad_norm": 0.7714025378227234, + "learning_rate": 9.959293483313368e-05, + "loss": 0.826, + "step": 12730 + }, + { + "epoch": 0.08139222876710578, + "grad_norm": 1.3492543697357178, + "learning_rate": 9.959229561465266e-05, + "loss": 1.0079, + "step": 12740 + }, + { + "epoch": 0.08145611591684448, + "grad_norm": 0.7474777698516846, + "learning_rate": 9.959165589673432e-05, + "loss": 0.8973, + "step": 12750 + }, + { + "epoch": 0.08152000306658319, + "grad_norm": 0.6047500371932983, + "learning_rate": 9.959101567938509e-05, + "loss": 0.8909, + "step": 12760 + }, + { + "epoch": 0.08158389021632188, + 
"grad_norm": 0.7488225698471069, + "learning_rate": 9.959037496261146e-05, + "loss": 0.9554, + "step": 12770 + }, + { + "epoch": 0.08164777736606059, + "grad_norm": 1.0440471172332764, + "learning_rate": 9.958973374641982e-05, + "loss": 0.7622, + "step": 12780 + }, + { + "epoch": 0.0817116645157993, + "grad_norm": 0.6892119646072388, + "learning_rate": 9.958909203081668e-05, + "loss": 0.9316, + "step": 12790 + }, + { + "epoch": 0.08177555166553799, + "grad_norm": 0.7813330292701721, + "learning_rate": 9.958844981580847e-05, + "loss": 1.0202, + "step": 12800 + }, + { + "epoch": 0.0818394388152767, + "grad_norm": 0.926389217376709, + "learning_rate": 9.958780710140167e-05, + "loss": 1.0061, + "step": 12810 + }, + { + "epoch": 0.0819033259650154, + "grad_norm": 0.7981832027435303, + "learning_rate": 9.958716388760277e-05, + "loss": 0.9619, + "step": 12820 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.7643110752105713, + "learning_rate": 9.958652017441822e-05, + "loss": 1.0358, + "step": 12830 + }, + { + "epoch": 0.0820311002644928, + "grad_norm": 2.3932769298553467, + "learning_rate": 9.958587596185451e-05, + "loss": 0.8638, + "step": 12840 + }, + { + "epoch": 0.0820949874142315, + "grad_norm": 0.6485501527786255, + "learning_rate": 9.958523124991814e-05, + "loss": 0.8252, + "step": 12850 + }, + { + "epoch": 0.0821588745639702, + "grad_norm": 1.1081517934799194, + "learning_rate": 9.958458603861559e-05, + "loss": 0.6834, + "step": 12860 + }, + { + "epoch": 0.0822227617137089, + "grad_norm": 0.6985851526260376, + "learning_rate": 9.958394032795335e-05, + "loss": 0.8498, + "step": 12870 + }, + { + "epoch": 0.08228664886344761, + "grad_norm": 0.9049435257911682, + "learning_rate": 9.958329411793796e-05, + "loss": 0.832, + "step": 12880 + }, + { + "epoch": 0.0823505360131863, + "grad_norm": 1.0366233587265015, + "learning_rate": 9.958264740857588e-05, + "loss": 0.6583, + "step": 12890 + }, + { + "epoch": 0.08241442316292501, + "grad_norm": 0.5812174081802368, 
+ "learning_rate": 9.958200019987364e-05, + "loss": 0.8656, + "step": 12900 + }, + { + "epoch": 0.08247831031266371, + "grad_norm": 0.5848665237426758, + "learning_rate": 9.95813524918378e-05, + "loss": 0.9233, + "step": 12910 + }, + { + "epoch": 0.0825421974624024, + "grad_norm": 0.8434141278266907, + "learning_rate": 9.958070428447481e-05, + "loss": 0.8677, + "step": 12920 + }, + { + "epoch": 0.08260608461214111, + "grad_norm": 0.6627490520477295, + "learning_rate": 9.958005557779125e-05, + "loss": 1.0076, + "step": 12930 + }, + { + "epoch": 0.08266997176187982, + "grad_norm": 0.5368894934654236, + "learning_rate": 9.957940637179364e-05, + "loss": 1.0226, + "step": 12940 + }, + { + "epoch": 0.08273385891161851, + "grad_norm": 0.9018540978431702, + "learning_rate": 9.95787566664885e-05, + "loss": 0.9392, + "step": 12950 + }, + { + "epoch": 0.08279774606135722, + "grad_norm": 0.9104921817779541, + "learning_rate": 9.957810646188242e-05, + "loss": 0.8816, + "step": 12960 + }, + { + "epoch": 0.08286163321109592, + "grad_norm": 1.005777359008789, + "learning_rate": 9.957745575798189e-05, + "loss": 0.9567, + "step": 12970 + }, + { + "epoch": 0.08292552036083463, + "grad_norm": 0.9677864909172058, + "learning_rate": 9.957680455479348e-05, + "loss": 0.865, + "step": 12980 + }, + { + "epoch": 0.08298940751057332, + "grad_norm": 0.5736163854598999, + "learning_rate": 9.957615285232379e-05, + "loss": 0.9897, + "step": 12990 + }, + { + "epoch": 0.08305329466031203, + "grad_norm": 1.2024660110473633, + "learning_rate": 9.957550065057932e-05, + "loss": 0.8672, + "step": 13000 + }, + { + "epoch": 0.08311718181005073, + "grad_norm": 0.7755523920059204, + "learning_rate": 9.95748479495667e-05, + "loss": 0.833, + "step": 13010 + }, + { + "epoch": 0.08318106895978943, + "grad_norm": 2.249293088912964, + "learning_rate": 9.957419474929246e-05, + "loss": 0.9011, + "step": 13020 + }, + { + "epoch": 0.08324495610952813, + "grad_norm": 1.1623985767364502, + "learning_rate": 
9.957354104976317e-05, + "loss": 0.9665, + "step": 13030 + }, + { + "epoch": 0.08330884325926684, + "grad_norm": 0.9698325395584106, + "learning_rate": 9.957288685098547e-05, + "loss": 0.581, + "step": 13040 + }, + { + "epoch": 0.08337273040900553, + "grad_norm": 1.5064680576324463, + "learning_rate": 9.957223215296589e-05, + "loss": 0.993, + "step": 13050 + }, + { + "epoch": 0.08343661755874424, + "grad_norm": 0.9795089960098267, + "learning_rate": 9.957157695571106e-05, + "loss": 0.8646, + "step": 13060 + }, + { + "epoch": 0.08350050470848294, + "grad_norm": 1.1535509824752808, + "learning_rate": 9.957092125922755e-05, + "loss": 0.9196, + "step": 13070 + }, + { + "epoch": 0.08356439185822163, + "grad_norm": 0.5842729210853577, + "learning_rate": 9.957026506352198e-05, + "loss": 1.1444, + "step": 13080 + }, + { + "epoch": 0.08362827900796034, + "grad_norm": 1.164316177368164, + "learning_rate": 9.956960836860096e-05, + "loss": 0.8979, + "step": 13090 + }, + { + "epoch": 0.08369216615769905, + "grad_norm": 1.0627108812332153, + "learning_rate": 9.956895117447112e-05, + "loss": 1.1704, + "step": 13100 + }, + { + "epoch": 0.08375605330743774, + "grad_norm": 0.5449188947677612, + "learning_rate": 9.956829348113903e-05, + "loss": 0.9608, + "step": 13110 + }, + { + "epoch": 0.08381994045717645, + "grad_norm": 0.8680428862571716, + "learning_rate": 9.956763528861135e-05, + "loss": 0.9228, + "step": 13120 + }, + { + "epoch": 0.08388382760691515, + "grad_norm": 0.9110902547836304, + "learning_rate": 9.95669765968947e-05, + "loss": 1.4213, + "step": 13130 + }, + { + "epoch": 0.08394771475665384, + "grad_norm": 2.3549108505249023, + "learning_rate": 9.956631740599571e-05, + "loss": 0.9036, + "step": 13140 + }, + { + "epoch": 0.08401160190639255, + "grad_norm": 0.9437476992607117, + "learning_rate": 9.956565771592103e-05, + "loss": 0.9577, + "step": 13150 + }, + { + "epoch": 0.08407548905613126, + "grad_norm": 0.5156351923942566, + "learning_rate": 9.956499752667729e-05, + 
"loss": 1.0223, + "step": 13160 + }, + { + "epoch": 0.08413937620586995, + "grad_norm": 0.6962876915931702, + "learning_rate": 9.956433683827115e-05, + "loss": 0.7827, + "step": 13170 + }, + { + "epoch": 0.08420326335560865, + "grad_norm": 1.191227912902832, + "learning_rate": 9.956367565070927e-05, + "loss": 0.7738, + "step": 13180 + }, + { + "epoch": 0.08426715050534736, + "grad_norm": 0.9918831586837769, + "learning_rate": 9.956301396399829e-05, + "loss": 1.2268, + "step": 13190 + }, + { + "epoch": 0.08433103765508605, + "grad_norm": 1.3545849323272705, + "learning_rate": 9.956235177814488e-05, + "loss": 0.9728, + "step": 13200 + }, + { + "epoch": 0.08439492480482476, + "grad_norm": 0.8052165508270264, + "learning_rate": 9.956168909315571e-05, + "loss": 0.8022, + "step": 13210 + }, + { + "epoch": 0.08445881195456346, + "grad_norm": 1.1841431856155396, + "learning_rate": 9.956102590903744e-05, + "loss": 0.8663, + "step": 13220 + }, + { + "epoch": 0.08452269910430216, + "grad_norm": 1.1858928203582764, + "learning_rate": 9.956036222579679e-05, + "loss": 0.8862, + "step": 13230 + }, + { + "epoch": 0.08458658625404086, + "grad_norm": 0.6900216937065125, + "learning_rate": 9.955969804344039e-05, + "loss": 0.7973, + "step": 13240 + }, + { + "epoch": 0.08465047340377957, + "grad_norm": 0.737177848815918, + "learning_rate": 9.955903336197497e-05, + "loss": 0.6908, + "step": 13250 + }, + { + "epoch": 0.08471436055351826, + "grad_norm": 1.1123918294906616, + "learning_rate": 9.955836818140721e-05, + "loss": 0.8086, + "step": 13260 + }, + { + "epoch": 0.08477824770325697, + "grad_norm": 0.9774020910263062, + "learning_rate": 9.95577025017438e-05, + "loss": 0.8224, + "step": 13270 + }, + { + "epoch": 0.08484213485299567, + "grad_norm": 1.0861930847167969, + "learning_rate": 9.955703632299144e-05, + "loss": 1.216, + "step": 13280 + }, + { + "epoch": 0.08490602200273437, + "grad_norm": 0.6377803683280945, + "learning_rate": 9.955636964515688e-05, + "loss": 1.0431, + "step": 
13290 + }, + { + "epoch": 0.08496990915247307, + "grad_norm": 0.799303412437439, + "learning_rate": 9.95557024682468e-05, + "loss": 0.8008, + "step": 13300 + }, + { + "epoch": 0.08503379630221178, + "grad_norm": 0.6764736175537109, + "learning_rate": 9.955503479226791e-05, + "loss": 0.856, + "step": 13310 + }, + { + "epoch": 0.08509768345195047, + "grad_norm": 0.7718757390975952, + "learning_rate": 9.955436661722696e-05, + "loss": 1.1674, + "step": 13320 + }, + { + "epoch": 0.08516157060168918, + "grad_norm": 0.8467085957527161, + "learning_rate": 9.955369794313066e-05, + "loss": 0.7126, + "step": 13330 + }, + { + "epoch": 0.08522545775142788, + "grad_norm": 0.7613494992256165, + "learning_rate": 9.955302876998576e-05, + "loss": 0.8779, + "step": 13340 + }, + { + "epoch": 0.08528934490116657, + "grad_norm": 1.5320026874542236, + "learning_rate": 9.955235909779898e-05, + "loss": 0.92, + "step": 13350 + }, + { + "epoch": 0.08535323205090528, + "grad_norm": 0.9841747879981995, + "learning_rate": 9.955168892657709e-05, + "loss": 1.195, + "step": 13360 + }, + { + "epoch": 0.08541711920064399, + "grad_norm": 0.9456724524497986, + "learning_rate": 9.955101825632681e-05, + "loss": 0.8966, + "step": 13370 + }, + { + "epoch": 0.08548100635038268, + "grad_norm": 0.6288855671882629, + "learning_rate": 9.95503470870549e-05, + "loss": 1.018, + "step": 13380 + }, + { + "epoch": 0.08554489350012139, + "grad_norm": 0.6074085831642151, + "learning_rate": 9.954967541876816e-05, + "loss": 1.1021, + "step": 13390 + }, + { + "epoch": 0.08560878064986009, + "grad_norm": 0.6871976852416992, + "learning_rate": 9.954900325147329e-05, + "loss": 0.7936, + "step": 13400 + }, + { + "epoch": 0.08567266779959878, + "grad_norm": 1.1917479038238525, + "learning_rate": 9.954833058517712e-05, + "loss": 1.0316, + "step": 13410 + }, + { + "epoch": 0.08573655494933749, + "grad_norm": 0.8669334650039673, + "learning_rate": 9.954765741988638e-05, + "loss": 0.7559, + "step": 13420 + }, + { + "epoch": 
0.0858004420990762, + "grad_norm": 1.0920523405075073, + "learning_rate": 9.954698375560786e-05, + "loss": 1.0566, + "step": 13430 + }, + { + "epoch": 0.08586432924881489, + "grad_norm": 0.6692205667495728, + "learning_rate": 9.954630959234835e-05, + "loss": 1.1381, + "step": 13440 + }, + { + "epoch": 0.0859282163985536, + "grad_norm": 0.435250461101532, + "learning_rate": 9.954563493011464e-05, + "loss": 0.656, + "step": 13450 + }, + { + "epoch": 0.0859921035482923, + "grad_norm": 0.719704806804657, + "learning_rate": 9.954495976891354e-05, + "loss": 0.9106, + "step": 13460 + }, + { + "epoch": 0.08605599069803099, + "grad_norm": 1.0210596323013306, + "learning_rate": 9.95442841087518e-05, + "loss": 1.0513, + "step": 13470 + }, + { + "epoch": 0.0861198778477697, + "grad_norm": 0.8312535881996155, + "learning_rate": 9.954360794963629e-05, + "loss": 0.9642, + "step": 13480 + }, + { + "epoch": 0.0861837649975084, + "grad_norm": 0.7173671126365662, + "learning_rate": 9.954299897983244e-05, + "loss": 0.9963, + "step": 13490 + }, + { + "epoch": 0.0862476521472471, + "grad_norm": 0.8660849928855896, + "learning_rate": 9.954232187272345e-05, + "loss": 0.7152, + "step": 13500 + }, + { + "epoch": 0.0863115392969858, + "grad_norm": 0.757793664932251, + "learning_rate": 9.954164426668044e-05, + "loss": 1.0053, + "step": 13510 + }, + { + "epoch": 0.08637542644672451, + "grad_norm": 0.6356269717216492, + "learning_rate": 9.954096616171018e-05, + "loss": 1.0546, + "step": 13520 + }, + { + "epoch": 0.0864393135964632, + "grad_norm": 0.6191072463989258, + "learning_rate": 9.954028755781956e-05, + "loss": 0.8486, + "step": 13530 + }, + { + "epoch": 0.08650320074620191, + "grad_norm": 1.0523960590362549, + "learning_rate": 9.953960845501537e-05, + "loss": 0.8107, + "step": 13540 + }, + { + "epoch": 0.08656708789594061, + "grad_norm": 0.7796614170074463, + "learning_rate": 9.953892885330447e-05, + "loss": 0.8723, + "step": 13550 + }, + { + "epoch": 0.0866309750456793, + "grad_norm": 
0.7295846939086914, + "learning_rate": 9.953824875269369e-05, + "loss": 0.913, + "step": 13560 + }, + { + "epoch": 0.08669486219541801, + "grad_norm": 1.0830540657043457, + "learning_rate": 9.95375681531899e-05, + "loss": 0.8378, + "step": 13570 + }, + { + "epoch": 0.08675874934515672, + "grad_norm": 1.3589000701904297, + "learning_rate": 9.953688705479994e-05, + "loss": 0.9502, + "step": 13580 + }, + { + "epoch": 0.08682263649489541, + "grad_norm": 0.796097993850708, + "learning_rate": 9.953620545753067e-05, + "loss": 0.6924, + "step": 13590 + }, + { + "epoch": 0.08688652364463412, + "grad_norm": 0.933182954788208, + "learning_rate": 9.953552336138896e-05, + "loss": 1.0789, + "step": 13600 + }, + { + "epoch": 0.08695041079437282, + "grad_norm": 0.830242395401001, + "learning_rate": 9.953484076638166e-05, + "loss": 0.7949, + "step": 13610 + }, + { + "epoch": 0.08701429794411152, + "grad_norm": 0.751649022102356, + "learning_rate": 9.953415767251568e-05, + "loss": 0.8326, + "step": 13620 + }, + { + "epoch": 0.08707818509385022, + "grad_norm": 3.0472450256347656, + "learning_rate": 9.953347407979788e-05, + "loss": 0.7271, + "step": 13630 + }, + { + "epoch": 0.08714207224358893, + "grad_norm": 0.4621819853782654, + "learning_rate": 9.953278998823513e-05, + "loss": 1.0762, + "step": 13640 + }, + { + "epoch": 0.08720595939332762, + "grad_norm": 0.8232291340827942, + "learning_rate": 9.953210539783434e-05, + "loss": 0.7763, + "step": 13650 + }, + { + "epoch": 0.08726984654306633, + "grad_norm": 1.4312729835510254, + "learning_rate": 9.953142030860238e-05, + "loss": 0.8253, + "step": 13660 + }, + { + "epoch": 0.08733373369280503, + "grad_norm": 0.9248529672622681, + "learning_rate": 9.95307347205462e-05, + "loss": 1.0337, + "step": 13670 + }, + { + "epoch": 0.08739762084254372, + "grad_norm": 0.6953186392784119, + "learning_rate": 9.953004863367264e-05, + "loss": 0.93, + "step": 13680 + }, + { + "epoch": 0.08746150799228243, + "grad_norm": 0.8455583453178406, + 
"learning_rate": 9.952936204798866e-05, + "loss": 0.8386, + "step": 13690 + }, + { + "epoch": 0.08752539514202114, + "grad_norm": 1.119112253189087, + "learning_rate": 9.952867496350115e-05, + "loss": 0.8611, + "step": 13700 + }, + { + "epoch": 0.08758928229175983, + "grad_norm": 0.8125833868980408, + "learning_rate": 9.952798738021703e-05, + "loss": 0.9875, + "step": 13710 + }, + { + "epoch": 0.08765316944149854, + "grad_norm": 0.6726895570755005, + "learning_rate": 9.952729929814323e-05, + "loss": 1.0715, + "step": 13720 + }, + { + "epoch": 0.08771705659123724, + "grad_norm": 0.6909586787223816, + "learning_rate": 9.952661071728669e-05, + "loss": 0.9544, + "step": 13730 + }, + { + "epoch": 0.08778094374097593, + "grad_norm": 0.983298122882843, + "learning_rate": 9.952592163765432e-05, + "loss": 1.0024, + "step": 13740 + }, + { + "epoch": 0.08784483089071464, + "grad_norm": 1.025319218635559, + "learning_rate": 9.952523205925309e-05, + "loss": 0.7382, + "step": 13750 + }, + { + "epoch": 0.08790871804045335, + "grad_norm": 2.100965976715088, + "learning_rate": 9.952454198208991e-05, + "loss": 0.8063, + "step": 13760 + }, + { + "epoch": 0.08797260519019204, + "grad_norm": 0.9277796149253845, + "learning_rate": 9.952385140617174e-05, + "loss": 0.963, + "step": 13770 + }, + { + "epoch": 0.08803649233993074, + "grad_norm": 1.1502079963684082, + "learning_rate": 9.952316033150556e-05, + "loss": 0.857, + "step": 13780 + }, + { + "epoch": 0.08810037948966945, + "grad_norm": 0.5683081746101379, + "learning_rate": 9.952246875809831e-05, + "loss": 0.8632, + "step": 13790 + }, + { + "epoch": 0.08816426663940814, + "grad_norm": 0.7985507249832153, + "learning_rate": 9.952177668595695e-05, + "loss": 0.693, + "step": 13800 + }, + { + "epoch": 0.08822815378914685, + "grad_norm": 0.9982643723487854, + "learning_rate": 9.952108411508845e-05, + "loss": 0.7695, + "step": 13810 + }, + { + "epoch": 0.08829204093888555, + "grad_norm": 0.8026944994926453, + "learning_rate": 
9.952039104549981e-05, + "loss": 0.9093, + "step": 13820 + }, + { + "epoch": 0.08835592808862426, + "grad_norm": 0.9833221435546875, + "learning_rate": 9.951969747719798e-05, + "loss": 1.1041, + "step": 13830 + }, + { + "epoch": 0.08841981523836295, + "grad_norm": 0.7445331811904907, + "learning_rate": 9.951900341018996e-05, + "loss": 1.1706, + "step": 13840 + }, + { + "epoch": 0.08848370238810166, + "grad_norm": 0.7326770424842834, + "learning_rate": 9.951830884448274e-05, + "loss": 1.1022, + "step": 13850 + }, + { + "epoch": 0.08854758953784037, + "grad_norm": 1.3713650703430176, + "learning_rate": 9.95176137800833e-05, + "loss": 0.812, + "step": 13860 + }, + { + "epoch": 0.08861147668757906, + "grad_norm": 0.8719102740287781, + "learning_rate": 9.951691821699864e-05, + "loss": 1.037, + "step": 13870 + }, + { + "epoch": 0.08867536383731776, + "grad_norm": 0.7241623997688293, + "learning_rate": 9.951622215523579e-05, + "loss": 0.9797, + "step": 13880 + }, + { + "epoch": 0.08873925098705647, + "grad_norm": 0.9998733401298523, + "learning_rate": 9.951552559480176e-05, + "loss": 1.0036, + "step": 13890 + }, + { + "epoch": 0.08880313813679516, + "grad_norm": 1.31692373752594, + "learning_rate": 9.951482853570353e-05, + "loss": 1.0621, + "step": 13900 + }, + { + "epoch": 0.08886702528653387, + "grad_norm": 0.509678840637207, + "learning_rate": 9.951413097794816e-05, + "loss": 0.7828, + "step": 13910 + }, + { + "epoch": 0.08893091243627257, + "grad_norm": 0.6443775296211243, + "learning_rate": 9.951343292154263e-05, + "loss": 0.8265, + "step": 13920 + }, + { + "epoch": 0.08899479958601127, + "grad_norm": 1.014041781425476, + "learning_rate": 9.9512734366494e-05, + "loss": 1.0371, + "step": 13930 + }, + { + "epoch": 0.08905868673574997, + "grad_norm": 0.8309150338172913, + "learning_rate": 9.951203531280931e-05, + "loss": 0.9042, + "step": 13940 + }, + { + "epoch": 0.08912257388548868, + "grad_norm": 0.6780155897140503, + "learning_rate": 9.951133576049558e-05, + "loss": 
0.8917, + "step": 13950 + }, + { + "epoch": 0.08918646103522737, + "grad_norm": 0.7868662476539612, + "learning_rate": 9.951063570955988e-05, + "loss": 0.9667, + "step": 13960 + }, + { + "epoch": 0.08925034818496608, + "grad_norm": 0.6636529564857483, + "learning_rate": 9.950993516000924e-05, + "loss": 0.8601, + "step": 13970 + }, + { + "epoch": 0.08931423533470478, + "grad_norm": 0.8302227854728699, + "learning_rate": 9.950923411185071e-05, + "loss": 0.9081, + "step": 13980 + }, + { + "epoch": 0.08937812248444348, + "grad_norm": 0.9507797360420227, + "learning_rate": 9.950853256509138e-05, + "loss": 0.7923, + "step": 13990 + }, + { + "epoch": 0.08944200963418218, + "grad_norm": 0.5564282536506653, + "learning_rate": 9.950783051973828e-05, + "loss": 0.9981, + "step": 14000 + }, + { + "epoch": 0.08950589678392089, + "grad_norm": 1.1084082126617432, + "learning_rate": 9.950712797579849e-05, + "loss": 0.7917, + "step": 14010 + }, + { + "epoch": 0.08956978393365958, + "grad_norm": 1.2243750095367432, + "learning_rate": 9.950642493327911e-05, + "loss": 1.0782, + "step": 14020 + }, + { + "epoch": 0.08963367108339829, + "grad_norm": 1.1874489784240723, + "learning_rate": 9.950572139218719e-05, + "loss": 0.9879, + "step": 14030 + }, + { + "epoch": 0.08969755823313699, + "grad_norm": 0.6582461595535278, + "learning_rate": 9.950501735252984e-05, + "loss": 0.8992, + "step": 14040 + }, + { + "epoch": 0.08976144538287568, + "grad_norm": 0.945318341255188, + "learning_rate": 9.950431281431413e-05, + "loss": 0.9753, + "step": 14050 + }, + { + "epoch": 0.08982533253261439, + "grad_norm": 1.02214777469635, + "learning_rate": 9.950360777754716e-05, + "loss": 0.8625, + "step": 14060 + }, + { + "epoch": 0.0898892196823531, + "grad_norm": 0.6554903388023376, + "learning_rate": 9.950290224223604e-05, + "loss": 0.7558, + "step": 14070 + }, + { + "epoch": 0.08995310683209179, + "grad_norm": 0.9139891266822815, + "learning_rate": 9.950219620838786e-05, + "loss": 0.9843, + "step": 14080 + 
}, + { + "epoch": 0.0900169939818305, + "grad_norm": 0.6926449537277222, + "learning_rate": 9.950148967600974e-05, + "loss": 0.6626, + "step": 14090 + }, + { + "epoch": 0.0900808811315692, + "grad_norm": 1.608420968055725, + "learning_rate": 9.95007826451088e-05, + "loss": 1.0037, + "step": 14100 + }, + { + "epoch": 0.0901447682813079, + "grad_norm": 0.9414392113685608, + "learning_rate": 9.950007511569214e-05, + "loss": 0.9188, + "step": 14110 + }, + { + "epoch": 0.0902086554310466, + "grad_norm": 0.8587938547134399, + "learning_rate": 9.949936708776691e-05, + "loss": 0.9312, + "step": 14120 + }, + { + "epoch": 0.0902725425807853, + "grad_norm": 1.4284396171569824, + "learning_rate": 9.949865856134024e-05, + "loss": 1.1385, + "step": 14130 + }, + { + "epoch": 0.090336429730524, + "grad_norm": 0.7485639452934265, + "learning_rate": 9.949794953641925e-05, + "loss": 0.9224, + "step": 14140 + }, + { + "epoch": 0.0904003168802627, + "grad_norm": 0.7703597545623779, + "learning_rate": 9.949724001301108e-05, + "loss": 0.8031, + "step": 14150 + }, + { + "epoch": 0.09046420403000141, + "grad_norm": 0.6931461095809937, + "learning_rate": 9.949652999112289e-05, + "loss": 0.8585, + "step": 14160 + }, + { + "epoch": 0.0905280911797401, + "grad_norm": 0.9867964386940002, + "learning_rate": 9.94958194707618e-05, + "loss": 0.9662, + "step": 14170 + }, + { + "epoch": 0.09059197832947881, + "grad_norm": 0.7029063105583191, + "learning_rate": 9.949510845193501e-05, + "loss": 0.9446, + "step": 14180 + }, + { + "epoch": 0.09065586547921752, + "grad_norm": 0.6712666153907776, + "learning_rate": 9.949439693464965e-05, + "loss": 0.7581, + "step": 14190 + }, + { + "epoch": 0.09071975262895621, + "grad_norm": 0.8002526760101318, + "learning_rate": 9.94936849189129e-05, + "loss": 1.1783, + "step": 14200 + }, + { + "epoch": 0.09078363977869491, + "grad_norm": 1.9806957244873047, + "learning_rate": 9.949297240473192e-05, + "loss": 0.8167, + "step": 14210 + }, + { + "epoch": 
0.09084752692843362, + "grad_norm": 0.6431198716163635, + "learning_rate": 9.949225939211391e-05, + "loss": 1.1454, + "step": 14220 + }, + { + "epoch": 0.09091141407817231, + "grad_norm": 1.083142638206482, + "learning_rate": 9.9491545881066e-05, + "loss": 1.2549, + "step": 14230 + }, + { + "epoch": 0.09097530122791102, + "grad_norm": 0.520418643951416, + "learning_rate": 9.949083187159542e-05, + "loss": 0.7501, + "step": 14240 + }, + { + "epoch": 0.09103918837764972, + "grad_norm": 1.0432296991348267, + "learning_rate": 9.949011736370935e-05, + "loss": 0.8595, + "step": 14250 + }, + { + "epoch": 0.09110307552738842, + "grad_norm": 0.8031591773033142, + "learning_rate": 9.948940235741499e-05, + "loss": 0.7955, + "step": 14260 + }, + { + "epoch": 0.09116696267712712, + "grad_norm": 0.7311345934867859, + "learning_rate": 9.948868685271952e-05, + "loss": 0.9517, + "step": 14270 + }, + { + "epoch": 0.09123084982686583, + "grad_norm": 1.3706258535385132, + "learning_rate": 9.948797084963016e-05, + "loss": 0.9347, + "step": 14280 + }, + { + "epoch": 0.09129473697660452, + "grad_norm": 0.5846802592277527, + "learning_rate": 9.948725434815413e-05, + "loss": 0.7575, + "step": 14290 + }, + { + "epoch": 0.09135862412634323, + "grad_norm": 0.7384892702102661, + "learning_rate": 9.948653734829863e-05, + "loss": 0.9603, + "step": 14300 + }, + { + "epoch": 0.09142251127608193, + "grad_norm": 2.91487717628479, + "learning_rate": 9.948581985007089e-05, + "loss": 1.0739, + "step": 14310 + }, + { + "epoch": 0.09148639842582063, + "grad_norm": 0.6311538815498352, + "learning_rate": 9.948510185347813e-05, + "loss": 1.0676, + "step": 14320 + }, + { + "epoch": 0.09155028557555933, + "grad_norm": 0.6362346410751343, + "learning_rate": 9.948438335852759e-05, + "loss": 1.1728, + "step": 14330 + }, + { + "epoch": 0.09161417272529804, + "grad_norm": 0.6874721646308899, + "learning_rate": 9.94836643652265e-05, + "loss": 1.2022, + "step": 14340 + }, + { + "epoch": 0.09167805987503673, + 
"grad_norm": 0.721106231212616, + "learning_rate": 9.948294487358208e-05, + "loss": 1.097, + "step": 14350 + }, + { + "epoch": 0.09174194702477544, + "grad_norm": 1.0813249349594116, + "learning_rate": 9.948222488360162e-05, + "loss": 1.2724, + "step": 14360 + }, + { + "epoch": 0.09180583417451414, + "grad_norm": 0.8952019810676575, + "learning_rate": 9.948150439529233e-05, + "loss": 0.8907, + "step": 14370 + }, + { + "epoch": 0.09186972132425283, + "grad_norm": 0.8344172835350037, + "learning_rate": 9.94807834086615e-05, + "loss": 0.8321, + "step": 14380 + }, + { + "epoch": 0.09193360847399154, + "grad_norm": 0.9786416888237, + "learning_rate": 9.948006192371635e-05, + "loss": 0.7653, + "step": 14390 + }, + { + "epoch": 0.09199749562373025, + "grad_norm": 1.2197997570037842, + "learning_rate": 9.947933994046419e-05, + "loss": 0.9922, + "step": 14400 + }, + { + "epoch": 0.09206138277346894, + "grad_norm": 0.915473222732544, + "learning_rate": 9.947861745891227e-05, + "loss": 0.921, + "step": 14410 + }, + { + "epoch": 0.09212526992320764, + "grad_norm": 0.9322916865348816, + "learning_rate": 9.947789447906785e-05, + "loss": 1.0827, + "step": 14420 + }, + { + "epoch": 0.09218915707294635, + "grad_norm": 1.073462963104248, + "learning_rate": 9.947717100093825e-05, + "loss": 0.9149, + "step": 14430 + }, + { + "epoch": 0.09225304422268504, + "grad_norm": 1.7424027919769287, + "learning_rate": 9.947644702453072e-05, + "loss": 0.9262, + "step": 14440 + }, + { + "epoch": 0.09231693137242375, + "grad_norm": 0.5602442026138306, + "learning_rate": 9.947572254985258e-05, + "loss": 0.8065, + "step": 14450 + }, + { + "epoch": 0.09238081852216246, + "grad_norm": 0.7667688727378845, + "learning_rate": 9.94749975769111e-05, + "loss": 0.9672, + "step": 14460 + }, + { + "epoch": 0.09244470567190115, + "grad_norm": 0.8217202425003052, + "learning_rate": 9.947427210571359e-05, + "loss": 0.8194, + "step": 14470 + }, + { + "epoch": 0.09250859282163985, + "grad_norm": 0.7690846920013428, 
+ "learning_rate": 9.947354613626737e-05, + "loss": 0.6602, + "step": 14480 + }, + { + "epoch": 0.09257247997137856, + "grad_norm": 0.7123977541923523, + "learning_rate": 9.947281966857973e-05, + "loss": 0.9875, + "step": 14490 + }, + { + "epoch": 0.09263636712111725, + "grad_norm": 1.3590373992919922, + "learning_rate": 9.947209270265801e-05, + "loss": 1.0355, + "step": 14500 + }, + { + "epoch": 0.09270025427085596, + "grad_norm": 2.0925168991088867, + "learning_rate": 9.947136523850949e-05, + "loss": 0.9441, + "step": 14510 + }, + { + "epoch": 0.09276414142059466, + "grad_norm": 0.7630490064620972, + "learning_rate": 9.947063727614155e-05, + "loss": 0.7035, + "step": 14520 + }, + { + "epoch": 0.09282802857033336, + "grad_norm": 0.5995486378669739, + "learning_rate": 9.946990881556148e-05, + "loss": 0.8794, + "step": 14530 + }, + { + "epoch": 0.09289191572007206, + "grad_norm": 0.5936999917030334, + "learning_rate": 9.946917985677664e-05, + "loss": 0.8, + "step": 14540 + }, + { + "epoch": 0.09295580286981077, + "grad_norm": 2.0189425945281982, + "learning_rate": 9.946845039979436e-05, + "loss": 0.9379, + "step": 14550 + }, + { + "epoch": 0.09301969001954946, + "grad_norm": 0.9083710312843323, + "learning_rate": 9.946772044462197e-05, + "loss": 1.1928, + "step": 14560 + }, + { + "epoch": 0.09308357716928817, + "grad_norm": 0.7872990965843201, + "learning_rate": 9.946698999126686e-05, + "loss": 0.9303, + "step": 14570 + }, + { + "epoch": 0.09314746431902687, + "grad_norm": 0.9097589254379272, + "learning_rate": 9.946625903973636e-05, + "loss": 0.8706, + "step": 14580 + }, + { + "epoch": 0.09321135146876557, + "grad_norm": 1.2268530130386353, + "learning_rate": 9.946552759003783e-05, + "loss": 0.7452, + "step": 14590 + }, + { + "epoch": 0.09327523861850427, + "grad_norm": 0.7525649070739746, + "learning_rate": 9.946479564217866e-05, + "loss": 1.1206, + "step": 14600 + }, + { + "epoch": 0.09333912576824298, + "grad_norm": 0.9777686595916748, + "learning_rate": 
9.946406319616619e-05, + "loss": 0.9522, + "step": 14610 + }, + { + "epoch": 0.09340301291798167, + "grad_norm": 0.7327966690063477, + "learning_rate": 9.946333025200781e-05, + "loss": 0.7119, + "step": 14620 + }, + { + "epoch": 0.09346690006772038, + "grad_norm": 0.8345320820808411, + "learning_rate": 9.946259680971091e-05, + "loss": 0.9164, + "step": 14630 + }, + { + "epoch": 0.09353078721745908, + "grad_norm": 1.128624439239502, + "learning_rate": 9.946186286928288e-05, + "loss": 0.8583, + "step": 14640 + }, + { + "epoch": 0.09359467436719779, + "grad_norm": 0.753193199634552, + "learning_rate": 9.946112843073107e-05, + "loss": 1.0453, + "step": 14650 + }, + { + "epoch": 0.09365856151693648, + "grad_norm": 0.9466274380683899, + "learning_rate": 9.946039349406294e-05, + "loss": 1.1494, + "step": 14660 + }, + { + "epoch": 0.09372244866667519, + "grad_norm": 0.8753125667572021, + "learning_rate": 9.945965805928583e-05, + "loss": 0.7926, + "step": 14670 + }, + { + "epoch": 0.0937863358164139, + "grad_norm": 0.7783929109573364, + "learning_rate": 9.94589221264072e-05, + "loss": 1.0401, + "step": 14680 + }, + { + "epoch": 0.09385022296615259, + "grad_norm": 1.0802748203277588, + "learning_rate": 9.945818569543441e-05, + "loss": 0.7928, + "step": 14690 + }, + { + "epoch": 0.09391411011589129, + "grad_norm": 0.8250336647033691, + "learning_rate": 9.945744876637491e-05, + "loss": 0.9204, + "step": 14700 + }, + { + "epoch": 0.09397799726563, + "grad_norm": 0.6883922815322876, + "learning_rate": 9.945671133923614e-05, + "loss": 0.8513, + "step": 14710 + }, + { + "epoch": 0.09404188441536869, + "grad_norm": 0.4683299958705902, + "learning_rate": 9.945597341402547e-05, + "loss": 0.6514, + "step": 14720 + }, + { + "epoch": 0.0941057715651074, + "grad_norm": 0.6585717797279358, + "learning_rate": 9.945523499075037e-05, + "loss": 0.9824, + "step": 14730 + }, + { + "epoch": 0.0941696587148461, + "grad_norm": 0.5519923567771912, + "learning_rate": 9.945449606941826e-05, + "loss": 
1.007, + "step": 14740 + }, + { + "epoch": 0.0942335458645848, + "grad_norm": 0.6457942128181458, + "learning_rate": 9.945375665003661e-05, + "loss": 0.6664, + "step": 14750 + }, + { + "epoch": 0.0942974330143235, + "grad_norm": 0.906104326248169, + "learning_rate": 9.945301673261285e-05, + "loss": 0.8221, + "step": 14760 + }, + { + "epoch": 0.0943613201640622, + "grad_norm": 0.8347557187080383, + "learning_rate": 9.945227631715442e-05, + "loss": 0.8833, + "step": 14770 + }, + { + "epoch": 0.0944252073138009, + "grad_norm": 0.6181365847587585, + "learning_rate": 9.945153540366877e-05, + "loss": 1.0287, + "step": 14780 + }, + { + "epoch": 0.0944890944635396, + "grad_norm": 0.6475619077682495, + "learning_rate": 9.945079399216339e-05, + "loss": 0.8144, + "step": 14790 + }, + { + "epoch": 0.09455298161327831, + "grad_norm": 0.6462060809135437, + "learning_rate": 9.945005208264572e-05, + "loss": 0.8489, + "step": 14800 + }, + { + "epoch": 0.094616868763017, + "grad_norm": 0.6881303787231445, + "learning_rate": 9.944938393828552e-05, + "loss": 0.8207, + "step": 14810 + }, + { + "epoch": 0.09468075591275571, + "grad_norm": 0.6425718069076538, + "learning_rate": 9.944864108256513e-05, + "loss": 0.91, + "step": 14820 + }, + { + "epoch": 0.09474464306249442, + "grad_norm": 1.9519624710083008, + "learning_rate": 9.944789772885414e-05, + "loss": 0.8698, + "step": 14830 + }, + { + "epoch": 0.09480853021223311, + "grad_norm": 0.887140154838562, + "learning_rate": 9.944715387716004e-05, + "loss": 0.909, + "step": 14840 + }, + { + "epoch": 0.09487241736197181, + "grad_norm": 0.7273536920547485, + "learning_rate": 9.944640952749033e-05, + "loss": 1.1605, + "step": 14850 + }, + { + "epoch": 0.09493630451171052, + "grad_norm": 0.928715169429779, + "learning_rate": 9.944566467985249e-05, + "loss": 1.0493, + "step": 14860 + }, + { + "epoch": 0.09500019166144921, + "grad_norm": 0.5552724003791809, + "learning_rate": 9.944491933425403e-05, + "loss": 1.1027, + "step": 14870 + }, + { + 
"epoch": 0.09506407881118792, + "grad_norm": 0.8260436058044434, + "learning_rate": 9.944417349070247e-05, + "loss": 0.7093, + "step": 14880 + }, + { + "epoch": 0.09512796596092662, + "grad_norm": 2.4791147708892822, + "learning_rate": 9.944342714920529e-05, + "loss": 1.1502, + "step": 14890 + }, + { + "epoch": 0.09519185311066532, + "grad_norm": 0.8212199211120605, + "learning_rate": 9.944268030977003e-05, + "loss": 1.0912, + "step": 14900 + }, + { + "epoch": 0.09525574026040402, + "grad_norm": 0.8238768577575684, + "learning_rate": 9.94419329724042e-05, + "loss": 0.8248, + "step": 14910 + }, + { + "epoch": 0.09531962741014273, + "grad_norm": 1.0283452272415161, + "learning_rate": 9.944118513711535e-05, + "loss": 1.0666, + "step": 14920 + }, + { + "epoch": 0.09538351455988142, + "grad_norm": 0.7515852451324463, + "learning_rate": 9.944043680391098e-05, + "loss": 0.798, + "step": 14930 + }, + { + "epoch": 0.09544740170962013, + "grad_norm": 0.8797821998596191, + "learning_rate": 9.943968797279864e-05, + "loss": 0.8629, + "step": 14940 + }, + { + "epoch": 0.09551128885935883, + "grad_norm": 0.8942396640777588, + "learning_rate": 9.943893864378587e-05, + "loss": 0.8589, + "step": 14950 + }, + { + "epoch": 0.09557517600909753, + "grad_norm": 0.7868557572364807, + "learning_rate": 9.943818881688023e-05, + "loss": 0.7879, + "step": 14960 + }, + { + "epoch": 0.09563906315883623, + "grad_norm": 0.766189694404602, + "learning_rate": 9.943743849208924e-05, + "loss": 1.0051, + "step": 14970 + }, + { + "epoch": 0.09570295030857494, + "grad_norm": 0.7284533381462097, + "learning_rate": 9.943668766942049e-05, + "loss": 0.6991, + "step": 14980 + }, + { + "epoch": 0.09576683745831363, + "grad_norm": 1.0945543050765991, + "learning_rate": 9.943593634888151e-05, + "loss": 0.8595, + "step": 14990 + }, + { + "epoch": 0.09583072460805234, + "grad_norm": 1.704253077507019, + "learning_rate": 9.943518453047988e-05, + "loss": 1.0841, + "step": 15000 + }, + { + "epoch": 
0.09589461175779104, + "grad_norm": 0.537315309047699, + "learning_rate": 9.943443221422319e-05, + "loss": 1.0965, + "step": 15010 + }, + { + "epoch": 0.09595849890752974, + "grad_norm": 1.1799222230911255, + "learning_rate": 9.9433679400119e-05, + "loss": 0.9025, + "step": 15020 + }, + { + "epoch": 0.09602238605726844, + "grad_norm": 3.8464369773864746, + "learning_rate": 9.943292608817489e-05, + "loss": 0.8995, + "step": 15030 + }, + { + "epoch": 0.09608627320700715, + "grad_norm": 1.1854133605957031, + "learning_rate": 9.943217227839845e-05, + "loss": 1.2093, + "step": 15040 + }, + { + "epoch": 0.09615016035674584, + "grad_norm": 1.119036078453064, + "learning_rate": 9.943141797079727e-05, + "loss": 0.6415, + "step": 15050 + }, + { + "epoch": 0.09621404750648455, + "grad_norm": 0.9091972708702087, + "learning_rate": 9.943066316537895e-05, + "loss": 0.7339, + "step": 15060 + }, + { + "epoch": 0.09627793465622325, + "grad_norm": 2.1518936157226562, + "learning_rate": 9.942990786215107e-05, + "loss": 0.7829, + "step": 15070 + }, + { + "epoch": 0.09634182180596194, + "grad_norm": 0.8024427890777588, + "learning_rate": 9.942915206112126e-05, + "loss": 0.9612, + "step": 15080 + }, + { + "epoch": 0.09640570895570065, + "grad_norm": 0.90773606300354, + "learning_rate": 9.942839576229714e-05, + "loss": 1.0113, + "step": 15090 + }, + { + "epoch": 0.09646959610543936, + "grad_norm": 1.2031515836715698, + "learning_rate": 9.942763896568632e-05, + "loss": 0.929, + "step": 15100 + }, + { + "epoch": 0.09653348325517805, + "grad_norm": 1.1134458780288696, + "learning_rate": 9.942688167129639e-05, + "loss": 1.0391, + "step": 15110 + }, + { + "epoch": 0.09659737040491675, + "grad_norm": 1.0063025951385498, + "learning_rate": 9.942612387913501e-05, + "loss": 0.8559, + "step": 15120 + }, + { + "epoch": 0.09666125755465546, + "grad_norm": 0.737177848815918, + "learning_rate": 9.94253655892098e-05, + "loss": 1.0731, + "step": 15130 + }, + { + "epoch": 0.09672514470439415, + 
"grad_norm": 0.8199975490570068, + "learning_rate": 9.942460680152842e-05, + "loss": 0.8919, + "step": 15140 + }, + { + "epoch": 0.09678903185413286, + "grad_norm": 0.9995172023773193, + "learning_rate": 9.942384751609848e-05, + "loss": 0.9533, + "step": 15150 + }, + { + "epoch": 0.09685291900387157, + "grad_norm": 1.6807196140289307, + "learning_rate": 9.942308773292764e-05, + "loss": 1.2186, + "step": 15160 + }, + { + "epoch": 0.09691680615361026, + "grad_norm": 0.6781327724456787, + "learning_rate": 9.942232745202353e-05, + "loss": 0.9126, + "step": 15170 + }, + { + "epoch": 0.09698069330334896, + "grad_norm": 0.8096178770065308, + "learning_rate": 9.942156667339385e-05, + "loss": 0.8445, + "step": 15180 + }, + { + "epoch": 0.09704458045308767, + "grad_norm": 0.4493632912635803, + "learning_rate": 9.942080539704621e-05, + "loss": 0.9263, + "step": 15190 + }, + { + "epoch": 0.09710846760282636, + "grad_norm": 1.0077593326568604, + "learning_rate": 9.942004362298834e-05, + "loss": 0.8551, + "step": 15200 + }, + { + "epoch": 0.09717235475256507, + "grad_norm": 0.7614121437072754, + "learning_rate": 9.941928135122784e-05, + "loss": 0.9088, + "step": 15210 + }, + { + "epoch": 0.09723624190230377, + "grad_norm": 1.770782470703125, + "learning_rate": 9.941851858177244e-05, + "loss": 0.8671, + "step": 15220 + }, + { + "epoch": 0.09730012905204247, + "grad_norm": 0.8057569861412048, + "learning_rate": 9.941775531462982e-05, + "loss": 0.8172, + "step": 15230 + }, + { + "epoch": 0.09736401620178117, + "grad_norm": 0.6936876177787781, + "learning_rate": 9.941699154980763e-05, + "loss": 0.8575, + "step": 15240 + }, + { + "epoch": 0.09742790335151988, + "grad_norm": 0.6702722311019897, + "learning_rate": 9.941622728731359e-05, + "loss": 1.004, + "step": 15250 + }, + { + "epoch": 0.09749179050125857, + "grad_norm": 1.0262168645858765, + "learning_rate": 9.94154625271554e-05, + "loss": 0.9267, + "step": 15260 + }, + { + "epoch": 0.09755567765099728, + "grad_norm": 
1.287480115890503, + "learning_rate": 9.941469726934074e-05, + "loss": 0.8412, + "step": 15270 + }, + { + "epoch": 0.09761956480073598, + "grad_norm": 1.0471506118774414, + "learning_rate": 9.941393151387734e-05, + "loss": 0.9556, + "step": 15280 + }, + { + "epoch": 0.09768345195047468, + "grad_norm": 0.948810875415802, + "learning_rate": 9.941316526077289e-05, + "loss": 1.0511, + "step": 15290 + }, + { + "epoch": 0.09774733910021338, + "grad_norm": 0.6042103171348572, + "learning_rate": 9.941239851003511e-05, + "loss": 0.858, + "step": 15300 + }, + { + "epoch": 0.09781122624995209, + "grad_norm": 0.7108423113822937, + "learning_rate": 9.941163126167175e-05, + "loss": 1.0698, + "step": 15310 + }, + { + "epoch": 0.09787511339969078, + "grad_norm": 0.8583425283432007, + "learning_rate": 9.94108635156905e-05, + "loss": 0.9262, + "step": 15320 + }, + { + "epoch": 0.09793900054942949, + "grad_norm": 1.3478715419769287, + "learning_rate": 9.941009527209911e-05, + "loss": 0.8279, + "step": 15330 + }, + { + "epoch": 0.09800288769916819, + "grad_norm": 0.7297415137290955, + "learning_rate": 9.940932653090532e-05, + "loss": 0.7739, + "step": 15340 + }, + { + "epoch": 0.09806677484890688, + "grad_norm": 0.6165359616279602, + "learning_rate": 9.940855729211687e-05, + "loss": 0.9152, + "step": 15350 + }, + { + "epoch": 0.09813066199864559, + "grad_norm": 0.6644479632377625, + "learning_rate": 9.940778755574149e-05, + "loss": 0.8523, + "step": 15360 + }, + { + "epoch": 0.0981945491483843, + "grad_norm": 0.9046561121940613, + "learning_rate": 9.940701732178695e-05, + "loss": 0.9418, + "step": 15370 + }, + { + "epoch": 0.09825843629812299, + "grad_norm": 0.6211059093475342, + "learning_rate": 9.9406246590261e-05, + "loss": 0.7587, + "step": 15380 + }, + { + "epoch": 0.0983223234478617, + "grad_norm": 1.164886236190796, + "learning_rate": 9.940547536117142e-05, + "loss": 1.2, + "step": 15390 + }, + { + "epoch": 0.0983862105976004, + "grad_norm": 0.9881723523139954, + 
"learning_rate": 9.940470363452596e-05, + "loss": 0.7596, + "step": 15400 + }, + { + "epoch": 0.0984500977473391, + "grad_norm": 6.108283042907715, + "learning_rate": 9.940393141033238e-05, + "loss": 1.05, + "step": 15410 + }, + { + "epoch": 0.0985139848970778, + "grad_norm": 0.5831863880157471, + "learning_rate": 9.940315868859847e-05, + "loss": 1.2292, + "step": 15420 + }, + { + "epoch": 0.0985778720468165, + "grad_norm": 1.4908435344696045, + "learning_rate": 9.940238546933203e-05, + "loss": 0.99, + "step": 15430 + }, + { + "epoch": 0.0986417591965552, + "grad_norm": 0.80536949634552, + "learning_rate": 9.940161175254082e-05, + "loss": 0.9417, + "step": 15440 + }, + { + "epoch": 0.0987056463462939, + "grad_norm": 0.6706516146659851, + "learning_rate": 9.940083753823263e-05, + "loss": 1.249, + "step": 15450 + }, + { + "epoch": 0.09876953349603261, + "grad_norm": 0.6131950616836548, + "learning_rate": 9.940006282641527e-05, + "loss": 0.7975, + "step": 15460 + }, + { + "epoch": 0.0988334206457713, + "grad_norm": 0.9210124611854553, + "learning_rate": 9.939928761709655e-05, + "loss": 0.7322, + "step": 15470 + }, + { + "epoch": 0.09889730779551001, + "grad_norm": 0.8976283669471741, + "learning_rate": 9.939851191028426e-05, + "loss": 0.9391, + "step": 15480 + }, + { + "epoch": 0.09896119494524871, + "grad_norm": 0.7244909405708313, + "learning_rate": 9.939773570598623e-05, + "loss": 0.7818, + "step": 15490 + }, + { + "epoch": 0.09902508209498742, + "grad_norm": 1.1001940965652466, + "learning_rate": 9.939695900421024e-05, + "loss": 0.9527, + "step": 15500 + }, + { + "epoch": 0.09908896924472611, + "grad_norm": 0.7406299114227295, + "learning_rate": 9.939618180496417e-05, + "loss": 0.8922, + "step": 15510 + }, + { + "epoch": 0.09915285639446482, + "grad_norm": 1.2300517559051514, + "learning_rate": 9.93954041082558e-05, + "loss": 1.2021, + "step": 15520 + }, + { + "epoch": 0.09921674354420353, + "grad_norm": 0.9667423963546753, + "learning_rate": 9.9394625914093e-05, 
+ "loss": 1.1348, + "step": 15530 + }, + { + "epoch": 0.09928063069394222, + "grad_norm": 0.8901247382164001, + "learning_rate": 9.939384722248355e-05, + "loss": 1.2461, + "step": 15540 + }, + { + "epoch": 0.09934451784368092, + "grad_norm": 0.8347676992416382, + "learning_rate": 9.939306803343533e-05, + "loss": 0.7845, + "step": 15550 + }, + { + "epoch": 0.09940840499341963, + "grad_norm": 0.9552205801010132, + "learning_rate": 9.93922883469562e-05, + "loss": 0.6641, + "step": 15560 + }, + { + "epoch": 0.09947229214315832, + "grad_norm": 0.8416782021522522, + "learning_rate": 9.939150816305399e-05, + "loss": 0.9133, + "step": 15570 + }, + { + "epoch": 0.09953617929289703, + "grad_norm": 1.2031623125076294, + "learning_rate": 9.939072748173656e-05, + "loss": 0.7874, + "step": 15580 + }, + { + "epoch": 0.09960006644263573, + "grad_norm": 1.0405542850494385, + "learning_rate": 9.938994630301179e-05, + "loss": 1.0763, + "step": 15590 + }, + { + "epoch": 0.09966395359237443, + "grad_norm": 0.7080594301223755, + "learning_rate": 9.938916462688753e-05, + "loss": 1.1229, + "step": 15600 + }, + { + "epoch": 0.09972784074211313, + "grad_norm": 0.6351432204246521, + "learning_rate": 9.938838245337163e-05, + "loss": 0.8626, + "step": 15610 + }, + { + "epoch": 0.09979172789185184, + "grad_norm": 1.3848146200180054, + "learning_rate": 9.938759978247201e-05, + "loss": 0.8473, + "step": 15620 + }, + { + "epoch": 0.09985561504159053, + "grad_norm": 0.9175819754600525, + "learning_rate": 9.938681661419654e-05, + "loss": 0.8902, + "step": 15630 + }, + { + "epoch": 0.09991950219132924, + "grad_norm": 0.729713499546051, + "learning_rate": 9.938603294855309e-05, + "loss": 0.8599, + "step": 15640 + }, + { + "epoch": 0.09998338934106794, + "grad_norm": 0.8896664381027222, + "learning_rate": 9.938524878554956e-05, + "loss": 1.2631, + "step": 15650 + }, + { + "epoch": 0.10004727649080664, + "grad_norm": 1.1083167791366577, + "learning_rate": 9.938446412519387e-05, + "loss": 0.8752, + 
"step": 15660 + }, + { + "epoch": 0.10011116364054534, + "grad_norm": 0.9350288510322571, + "learning_rate": 9.938367896749388e-05, + "loss": 0.891, + "step": 15670 + }, + { + "epoch": 0.10017505079028405, + "grad_norm": 0.8491414785385132, + "learning_rate": 9.938289331245753e-05, + "loss": 0.9962, + "step": 15680 + }, + { + "epoch": 0.10023893794002274, + "grad_norm": 1.3653219938278198, + "learning_rate": 9.938210716009272e-05, + "loss": 0.8964, + "step": 15690 + }, + { + "epoch": 0.10030282508976145, + "grad_norm": 1.137112021446228, + "learning_rate": 9.938132051040736e-05, + "loss": 0.9982, + "step": 15700 + }, + { + "epoch": 0.10036671223950015, + "grad_norm": 0.8561280965805054, + "learning_rate": 9.93805333634094e-05, + "loss": 0.7421, + "step": 15710 + }, + { + "epoch": 0.10043059938923884, + "grad_norm": 1.1215713024139404, + "learning_rate": 9.937974571910674e-05, + "loss": 0.926, + "step": 15720 + }, + { + "epoch": 0.10049448653897755, + "grad_norm": 0.6843059659004211, + "learning_rate": 9.937895757750733e-05, + "loss": 0.934, + "step": 15730 + }, + { + "epoch": 0.10055837368871626, + "grad_norm": 0.8098707795143127, + "learning_rate": 9.937816893861909e-05, + "loss": 0.8128, + "step": 15740 + }, + { + "epoch": 0.10062226083845495, + "grad_norm": 0.8894488215446472, + "learning_rate": 9.937737980244997e-05, + "loss": 0.9038, + "step": 15750 + }, + { + "epoch": 0.10068614798819366, + "grad_norm": 1.0936787128448486, + "learning_rate": 9.937659016900791e-05, + "loss": 0.8245, + "step": 15760 + }, + { + "epoch": 0.10075003513793236, + "grad_norm": 1.0727956295013428, + "learning_rate": 9.937580003830088e-05, + "loss": 0.8693, + "step": 15770 + }, + { + "epoch": 0.10081392228767105, + "grad_norm": 1.4079822301864624, + "learning_rate": 9.937500941033682e-05, + "loss": 0.6185, + "step": 15780 + }, + { + "epoch": 0.10087780943740976, + "grad_norm": 1.4234700202941895, + "learning_rate": 9.937421828512371e-05, + "loss": 0.7397, + "step": 15790 + }, + { + 
"epoch": 0.10094169658714847, + "grad_norm": 0.8071795701980591, + "learning_rate": 9.937342666266951e-05, + "loss": 1.1495, + "step": 15800 + }, + { + "epoch": 0.10100558373688716, + "grad_norm": 2.0237574577331543, + "learning_rate": 9.937263454298217e-05, + "loss": 1.0899, + "step": 15810 + }, + { + "epoch": 0.10106947088662586, + "grad_norm": 0.6750722527503967, + "learning_rate": 9.93718419260697e-05, + "loss": 0.7826, + "step": 15820 + }, + { + "epoch": 0.10113335803636457, + "grad_norm": 0.9621725678443909, + "learning_rate": 9.937104881194008e-05, + "loss": 1.0047, + "step": 15830 + }, + { + "epoch": 0.10119724518610326, + "grad_norm": 0.7849874496459961, + "learning_rate": 9.937025520060127e-05, + "loss": 0.8557, + "step": 15840 + }, + { + "epoch": 0.10126113233584197, + "grad_norm": 0.6543164253234863, + "learning_rate": 9.936946109206129e-05, + "loss": 0.8556, + "step": 15850 + }, + { + "epoch": 0.10132501948558068, + "grad_norm": 0.8516491651535034, + "learning_rate": 9.936866648632811e-05, + "loss": 0.8175, + "step": 15860 + }, + { + "epoch": 0.10138890663531937, + "grad_norm": 0.7480735778808594, + "learning_rate": 9.936787138340976e-05, + "loss": 0.9132, + "step": 15870 + }, + { + "epoch": 0.10145279378505807, + "grad_norm": 0.7891073822975159, + "learning_rate": 9.936707578331423e-05, + "loss": 0.9786, + "step": 15880 + }, + { + "epoch": 0.10151668093479678, + "grad_norm": 0.6075239181518555, + "learning_rate": 9.936627968604955e-05, + "loss": 0.7032, + "step": 15890 + }, + { + "epoch": 0.10158056808453547, + "grad_norm": 2.8596110343933105, + "learning_rate": 9.93654830916237e-05, + "loss": 0.9138, + "step": 15900 + }, + { + "epoch": 0.10164445523427418, + "grad_norm": 1.1982015371322632, + "learning_rate": 9.936468600004477e-05, + "loss": 0.7194, + "step": 15910 + }, + { + "epoch": 0.10170834238401288, + "grad_norm": 0.6473510265350342, + "learning_rate": 9.936388841132071e-05, + "loss": 0.7809, + "step": 15920 + }, + { + "epoch": 
0.10177222953375158, + "grad_norm": 1.089911937713623, + "learning_rate": 9.936309032545961e-05, + "loss": 0.846, + "step": 15930 + }, + { + "epoch": 0.10183611668349028, + "grad_norm": 0.9146657586097717, + "learning_rate": 9.936229174246947e-05, + "loss": 0.8369, + "step": 15940 + }, + { + "epoch": 0.10190000383322899, + "grad_norm": 0.599389374256134, + "learning_rate": 9.936149266235835e-05, + "loss": 0.9457, + "step": 15950 + }, + { + "epoch": 0.10196389098296768, + "grad_norm": 0.5718626976013184, + "learning_rate": 9.93606930851343e-05, + "loss": 1.1508, + "step": 15960 + }, + { + "epoch": 0.10202777813270639, + "grad_norm": 0.5820611715316772, + "learning_rate": 9.935989301080535e-05, + "loss": 0.5636, + "step": 15970 + }, + { + "epoch": 0.1020916652824451, + "grad_norm": 0.9194528460502625, + "learning_rate": 9.935909243937959e-05, + "loss": 0.9002, + "step": 15980 + }, + { + "epoch": 0.10215555243218379, + "grad_norm": 1.094212293624878, + "learning_rate": 9.935829137086508e-05, + "loss": 1.2759, + "step": 15990 + }, + { + "epoch": 0.10221943958192249, + "grad_norm": 0.8695144653320312, + "learning_rate": 9.935748980526986e-05, + "loss": 0.6543, + "step": 16000 + }, + { + "epoch": 0.1022833267316612, + "grad_norm": 1.7058948278427124, + "learning_rate": 9.935668774260202e-05, + "loss": 0.8703, + "step": 16010 + }, + { + "epoch": 0.10234721388139989, + "grad_norm": 2.493241310119629, + "learning_rate": 9.935588518286963e-05, + "loss": 1.282, + "step": 16020 + }, + { + "epoch": 0.1024111010311386, + "grad_norm": 0.7929388284683228, + "learning_rate": 9.935508212608078e-05, + "loss": 0.6585, + "step": 16030 + }, + { + "epoch": 0.1024749881808773, + "grad_norm": 0.8106563091278076, + "learning_rate": 9.935427857224356e-05, + "loss": 0.9955, + "step": 16040 + }, + { + "epoch": 0.102538875330616, + "grad_norm": 2.413360357284546, + "learning_rate": 9.935347452136606e-05, + "loss": 1.0771, + "step": 16050 + }, + { + "epoch": 0.1026027624803547, + "grad_norm": 
0.7023759484291077, + "learning_rate": 9.935266997345636e-05, + "loss": 1.0192, + "step": 16060 + }, + { + "epoch": 0.1026666496300934, + "grad_norm": 1.3818843364715576, + "learning_rate": 9.935186492852258e-05, + "loss": 1.1104, + "step": 16070 + }, + { + "epoch": 0.1027305367798321, + "grad_norm": 1.1903809309005737, + "learning_rate": 9.935105938657283e-05, + "loss": 0.8756, + "step": 16080 + }, + { + "epoch": 0.1027944239295708, + "grad_norm": 0.610237181186676, + "learning_rate": 9.935025334761523e-05, + "loss": 0.9345, + "step": 16090 + }, + { + "epoch": 0.10285831107930951, + "grad_norm": 0.8631981015205383, + "learning_rate": 9.934944681165786e-05, + "loss": 1.0976, + "step": 16100 + }, + { + "epoch": 0.1029221982290482, + "grad_norm": 0.5845250487327576, + "learning_rate": 9.934863977870889e-05, + "loss": 0.8406, + "step": 16110 + }, + { + "epoch": 0.10298608537878691, + "grad_norm": 0.5269205570220947, + "learning_rate": 9.93478322487764e-05, + "loss": 0.9238, + "step": 16120 + }, + { + "epoch": 0.10304997252852562, + "grad_norm": 0.6796483397483826, + "learning_rate": 9.934702422186857e-05, + "loss": 0.9912, + "step": 16130 + }, + { + "epoch": 0.10311385967826431, + "grad_norm": 0.9061000347137451, + "learning_rate": 9.93462156979935e-05, + "loss": 1.0622, + "step": 16140 + }, + { + "epoch": 0.10317774682800301, + "grad_norm": 0.5684584379196167, + "learning_rate": 9.934540667715936e-05, + "loss": 0.8797, + "step": 16150 + }, + { + "epoch": 0.10324163397774172, + "grad_norm": 0.8343471884727478, + "learning_rate": 9.934459715937428e-05, + "loss": 0.8628, + "step": 16160 + }, + { + "epoch": 0.10330552112748041, + "grad_norm": 0.9811477065086365, + "learning_rate": 9.934378714464642e-05, + "loss": 1.1671, + "step": 16170 + }, + { + "epoch": 0.10336940827721912, + "grad_norm": 0.9283135533332825, + "learning_rate": 9.934297663298393e-05, + "loss": 0.7027, + "step": 16180 + }, + { + "epoch": 0.10343329542695782, + "grad_norm": 0.7332042455673218, + 
"learning_rate": 9.934216562439498e-05, + "loss": 0.8026, + "step": 16190 + }, + { + "epoch": 0.10349718257669652, + "grad_norm": 3.353732109069824, + "learning_rate": 9.934135411888773e-05, + "loss": 1.1843, + "step": 16200 + }, + { + "epoch": 0.10356106972643522, + "grad_norm": 1.056642770767212, + "learning_rate": 9.934054211647036e-05, + "loss": 0.8445, + "step": 16210 + }, + { + "epoch": 0.10362495687617393, + "grad_norm": 0.6340813636779785, + "learning_rate": 9.933972961715104e-05, + "loss": 1.0407, + "step": 16220 + }, + { + "epoch": 0.10368884402591262, + "grad_norm": 0.823939859867096, + "learning_rate": 9.933891662093797e-05, + "loss": 0.9409, + "step": 16230 + }, + { + "epoch": 0.10375273117565133, + "grad_norm": 1.3675154447555542, + "learning_rate": 9.933810312783932e-05, + "loss": 0.7627, + "step": 16240 + }, + { + "epoch": 0.10381661832539003, + "grad_norm": 2.952162742614746, + "learning_rate": 9.933728913786328e-05, + "loss": 0.7343, + "step": 16250 + }, + { + "epoch": 0.10388050547512873, + "grad_norm": 0.5602843165397644, + "learning_rate": 9.933647465101807e-05, + "loss": 0.9949, + "step": 16260 + }, + { + "epoch": 0.10394439262486743, + "grad_norm": 1.035836935043335, + "learning_rate": 9.933565966731187e-05, + "loss": 0.733, + "step": 16270 + }, + { + "epoch": 0.10400827977460614, + "grad_norm": 0.599962055683136, + "learning_rate": 9.93348441867529e-05, + "loss": 0.8972, + "step": 16280 + }, + { + "epoch": 0.10407216692434483, + "grad_norm": 1.3323990106582642, + "learning_rate": 9.933402820934936e-05, + "loss": 0.8854, + "step": 16290 + }, + { + "epoch": 0.10413605407408354, + "grad_norm": 1.9497777223587036, + "learning_rate": 9.933321173510949e-05, + "loss": 0.9189, + "step": 16300 + }, + { + "epoch": 0.10419994122382224, + "grad_norm": 1.2453469038009644, + "learning_rate": 9.933239476404149e-05, + "loss": 0.9895, + "step": 16310 + }, + { + "epoch": 0.10426382837356094, + "grad_norm": 1.1778478622436523, + "learning_rate": 
9.933157729615359e-05, + "loss": 0.9034, + "step": 16320 + }, + { + "epoch": 0.10432771552329964, + "grad_norm": 0.7370180487632751, + "learning_rate": 9.933075933145404e-05, + "loss": 0.9827, + "step": 16330 + }, + { + "epoch": 0.10439160267303835, + "grad_norm": 0.992669403553009, + "learning_rate": 9.932994086995107e-05, + "loss": 0.696, + "step": 16340 + }, + { + "epoch": 0.10445548982277705, + "grad_norm": 0.8469734191894531, + "learning_rate": 9.93291219116529e-05, + "loss": 0.7605, + "step": 16350 + }, + { + "epoch": 0.10451937697251575, + "grad_norm": 1.4844669103622437, + "learning_rate": 9.932830245656782e-05, + "loss": 0.8848, + "step": 16360 + }, + { + "epoch": 0.10458326412225445, + "grad_norm": 0.7089157104492188, + "learning_rate": 9.932748250470403e-05, + "loss": 0.8722, + "step": 16370 + }, + { + "epoch": 0.10464715127199316, + "grad_norm": 0.6361833214759827, + "learning_rate": 9.932666205606984e-05, + "loss": 1.0907, + "step": 16380 + }, + { + "epoch": 0.10471103842173185, + "grad_norm": 1.060922384262085, + "learning_rate": 9.932584111067348e-05, + "loss": 0.9377, + "step": 16390 + }, + { + "epoch": 0.10477492557147056, + "grad_norm": 1.2127258777618408, + "learning_rate": 9.932501966852323e-05, + "loss": 1.1433, + "step": 16400 + }, + { + "epoch": 0.10483881272120926, + "grad_norm": 0.6231849193572998, + "learning_rate": 9.932419772962735e-05, + "loss": 0.925, + "step": 16410 + }, + { + "epoch": 0.10490269987094795, + "grad_norm": 0.5481915473937988, + "learning_rate": 9.932337529399415e-05, + "loss": 0.8031, + "step": 16420 + }, + { + "epoch": 0.10496658702068666, + "grad_norm": 0.5232637524604797, + "learning_rate": 9.932255236163187e-05, + "loss": 0.8512, + "step": 16430 + }, + { + "epoch": 0.10503047417042537, + "grad_norm": 0.6596049666404724, + "learning_rate": 9.932172893254884e-05, + "loss": 0.7366, + "step": 16440 + }, + { + "epoch": 0.10509436132016406, + "grad_norm": 0.826575517654419, + "learning_rate": 9.932090500675331e-05, + 
"loss": 0.7942, + "step": 16450 + }, + { + "epoch": 0.10515824846990277, + "grad_norm": 0.6646784543991089, + "learning_rate": 9.932008058425359e-05, + "loss": 1.1065, + "step": 16460 + }, + { + "epoch": 0.10522213561964147, + "grad_norm": 0.6288832426071167, + "learning_rate": 9.931925566505802e-05, + "loss": 0.9242, + "step": 16470 + }, + { + "epoch": 0.10528602276938016, + "grad_norm": 0.7876302003860474, + "learning_rate": 9.931843024917484e-05, + "loss": 0.9227, + "step": 16480 + }, + { + "epoch": 0.10534990991911887, + "grad_norm": 0.6333622336387634, + "learning_rate": 9.931760433661244e-05, + "loss": 0.9783, + "step": 16490 + }, + { + "epoch": 0.10541379706885758, + "grad_norm": 1.2118867635726929, + "learning_rate": 9.931677792737907e-05, + "loss": 0.727, + "step": 16500 + }, + { + "epoch": 0.10547768421859627, + "grad_norm": 0.8063325881958008, + "learning_rate": 9.931595102148309e-05, + "loss": 1.2654, + "step": 16510 + }, + { + "epoch": 0.10554157136833497, + "grad_norm": 0.5137673020362854, + "learning_rate": 9.931512361893283e-05, + "loss": 0.7905, + "step": 16520 + }, + { + "epoch": 0.10560545851807368, + "grad_norm": 1.0696414709091187, + "learning_rate": 9.93142957197366e-05, + "loss": 1.0821, + "step": 16530 + }, + { + "epoch": 0.10566934566781237, + "grad_norm": 1.1155736446380615, + "learning_rate": 9.931346732390274e-05, + "loss": 0.7375, + "step": 16540 + }, + { + "epoch": 0.10573323281755108, + "grad_norm": 0.784761369228363, + "learning_rate": 9.931263843143962e-05, + "loss": 0.7859, + "step": 16550 + }, + { + "epoch": 0.10579711996728978, + "grad_norm": 0.9071635007858276, + "learning_rate": 9.931180904235557e-05, + "loss": 1.0189, + "step": 16560 + }, + { + "epoch": 0.10586100711702848, + "grad_norm": 0.6615142822265625, + "learning_rate": 9.931097915665892e-05, + "loss": 0.9826, + "step": 16570 + }, + { + "epoch": 0.10592489426676718, + "grad_norm": 1.0913355350494385, + "learning_rate": 9.931014877435806e-05, + "loss": 1.2501, + "step": 
16580 + }, + { + "epoch": 0.10598878141650589, + "grad_norm": 0.7185521125793457, + "learning_rate": 9.930931789546136e-05, + "loss": 0.9584, + "step": 16590 + }, + { + "epoch": 0.10605266856624458, + "grad_norm": 0.9962629079818726, + "learning_rate": 9.930848651997716e-05, + "loss": 1.2084, + "step": 16600 + }, + { + "epoch": 0.10611655571598329, + "grad_norm": 0.5388261079788208, + "learning_rate": 9.930765464791383e-05, + "loss": 0.7474, + "step": 16610 + }, + { + "epoch": 0.106180442865722, + "grad_norm": 0.963033139705658, + "learning_rate": 9.930682227927978e-05, + "loss": 0.8856, + "step": 16620 + }, + { + "epoch": 0.10624433001546069, + "grad_norm": 0.8740180730819702, + "learning_rate": 9.930598941408335e-05, + "loss": 0.9665, + "step": 16630 + }, + { + "epoch": 0.10630821716519939, + "grad_norm": 0.7706631422042847, + "learning_rate": 9.930515605233297e-05, + "loss": 0.9538, + "step": 16640 + }, + { + "epoch": 0.1063721043149381, + "grad_norm": 1.0172282457351685, + "learning_rate": 9.930432219403702e-05, + "loss": 0.9451, + "step": 16650 + }, + { + "epoch": 0.10643599146467679, + "grad_norm": 1.1416665315628052, + "learning_rate": 9.930348783920387e-05, + "loss": 0.812, + "step": 16660 + }, + { + "epoch": 0.1064998786144155, + "grad_norm": 1.248719573020935, + "learning_rate": 9.930265298784196e-05, + "loss": 1.0079, + "step": 16670 + }, + { + "epoch": 0.1065637657641542, + "grad_norm": 0.8804942965507507, + "learning_rate": 9.930181763995968e-05, + "loss": 1.0038, + "step": 16680 + }, + { + "epoch": 0.1066276529138929, + "grad_norm": 0.9898728132247925, + "learning_rate": 9.930098179556543e-05, + "loss": 0.9694, + "step": 16690 + }, + { + "epoch": 0.1066915400636316, + "grad_norm": 1.1314060688018799, + "learning_rate": 9.930014545466765e-05, + "loss": 1.0318, + "step": 16700 + }, + { + "epoch": 0.10675542721337031, + "grad_norm": 1.0899930000305176, + "learning_rate": 9.929930861727476e-05, + "loss": 1.1298, + "step": 16710 + }, + { + "epoch": 
0.106819314363109, + "grad_norm": 1.2332922220230103, + "learning_rate": 9.929847128339517e-05, + "loss": 0.9744, + "step": 16720 + }, + { + "epoch": 0.1068832015128477, + "grad_norm": 1.1803171634674072, + "learning_rate": 9.929763345303733e-05, + "loss": 0.9733, + "step": 16730 + }, + { + "epoch": 0.10694708866258641, + "grad_norm": 0.8435320854187012, + "learning_rate": 9.929679512620969e-05, + "loss": 0.8418, + "step": 16740 + }, + { + "epoch": 0.1070109758123251, + "grad_norm": 0.68702632188797, + "learning_rate": 9.929595630292066e-05, + "loss": 1.0078, + "step": 16750 + }, + { + "epoch": 0.10707486296206381, + "grad_norm": 0.8807457089424133, + "learning_rate": 9.92951169831787e-05, + "loss": 0.8933, + "step": 16760 + }, + { + "epoch": 0.10713875011180252, + "grad_norm": 0.922346293926239, + "learning_rate": 9.929427716699227e-05, + "loss": 0.765, + "step": 16770 + }, + { + "epoch": 0.10720263726154121, + "grad_norm": 0.6668721437454224, + "learning_rate": 9.929343685436982e-05, + "loss": 0.7723, + "step": 16780 + }, + { + "epoch": 0.10726652441127991, + "grad_norm": 1.0509366989135742, + "learning_rate": 9.929259604531981e-05, + "loss": 0.9128, + "step": 16790 + }, + { + "epoch": 0.10733041156101862, + "grad_norm": 0.9233303070068359, + "learning_rate": 9.929175473985073e-05, + "loss": 0.8772, + "step": 16800 + }, + { + "epoch": 0.10739429871075731, + "grad_norm": 0.5858426094055176, + "learning_rate": 9.929091293797102e-05, + "loss": 0.9377, + "step": 16810 + }, + { + "epoch": 0.10745818586049602, + "grad_norm": 0.7452363967895508, + "learning_rate": 9.929007063968919e-05, + "loss": 0.9821, + "step": 16820 + }, + { + "epoch": 0.10752207301023473, + "grad_norm": 0.8996424078941345, + "learning_rate": 9.92892278450137e-05, + "loss": 0.969, + "step": 16830 + }, + { + "epoch": 0.10758596015997342, + "grad_norm": 0.9038456082344055, + "learning_rate": 9.928838455395304e-05, + "loss": 0.9136, + "step": 16840 + }, + { + "epoch": 0.10764984730971212, + 
"grad_norm": 2.0651540756225586, + "learning_rate": 9.928754076651571e-05, + "loss": 0.9447, + "step": 16850 + }, + { + "epoch": 0.10771373445945083, + "grad_norm": 0.9123902916908264, + "learning_rate": 9.928669648271021e-05, + "loss": 1.0723, + "step": 16860 + }, + { + "epoch": 0.10777762160918952, + "grad_norm": 0.7702105641365051, + "learning_rate": 9.928585170254503e-05, + "loss": 1.0555, + "step": 16870 + }, + { + "epoch": 0.10784150875892823, + "grad_norm": 0.8191667795181274, + "learning_rate": 9.928500642602869e-05, + "loss": 0.952, + "step": 16880 + }, + { + "epoch": 0.10790539590866693, + "grad_norm": 0.8521249890327454, + "learning_rate": 9.928416065316969e-05, + "loss": 0.9182, + "step": 16890 + }, + { + "epoch": 0.10796928305840563, + "grad_norm": 0.4355503022670746, + "learning_rate": 9.928331438397655e-05, + "loss": 0.7828, + "step": 16900 + }, + { + "epoch": 0.10803317020814433, + "grad_norm": 0.8053306937217712, + "learning_rate": 9.928246761845782e-05, + "loss": 1.073, + "step": 16910 + }, + { + "epoch": 0.10809705735788304, + "grad_norm": 1.1718153953552246, + "learning_rate": 9.928162035662199e-05, + "loss": 0.8979, + "step": 16920 + }, + { + "epoch": 0.10816094450762173, + "grad_norm": 0.8112810850143433, + "learning_rate": 9.928077259847761e-05, + "loss": 1.0277, + "step": 16930 + }, + { + "epoch": 0.10822483165736044, + "grad_norm": 0.5165520906448364, + "learning_rate": 9.927992434403322e-05, + "loss": 1.0714, + "step": 16940 + }, + { + "epoch": 0.10828871880709914, + "grad_norm": 0.9523488283157349, + "learning_rate": 9.927907559329736e-05, + "loss": 0.9623, + "step": 16950 + }, + { + "epoch": 0.10835260595683784, + "grad_norm": 0.5549238324165344, + "learning_rate": 9.927822634627857e-05, + "loss": 0.7777, + "step": 16960 + }, + { + "epoch": 0.10841649310657654, + "grad_norm": 0.8362735509872437, + "learning_rate": 9.927737660298541e-05, + "loss": 0.9429, + "step": 16970 + }, + { + "epoch": 0.10848038025631525, + "grad_norm": 
1.324947714805603, + "learning_rate": 9.927652636342645e-05, + "loss": 0.9626, + "step": 16980 + }, + { + "epoch": 0.10854426740605394, + "grad_norm": 0.8219287395477295, + "learning_rate": 9.927567562761021e-05, + "loss": 1.1411, + "step": 16990 + }, + { + "epoch": 0.10860815455579265, + "grad_norm": 0.7673150897026062, + "learning_rate": 9.927482439554532e-05, + "loss": 0.9758, + "step": 17000 + }, + { + "epoch": 0.10867204170553135, + "grad_norm": 0.7057496905326843, + "learning_rate": 9.92739726672403e-05, + "loss": 0.8181, + "step": 17010 + }, + { + "epoch": 0.10873592885527004, + "grad_norm": 1.2595868110656738, + "learning_rate": 9.927312044270375e-05, + "loss": 0.9396, + "step": 17020 + }, + { + "epoch": 0.10879981600500875, + "grad_norm": 0.8270642161369324, + "learning_rate": 9.927226772194426e-05, + "loss": 0.9074, + "step": 17030 + }, + { + "epoch": 0.10886370315474746, + "grad_norm": 0.5199679732322693, + "learning_rate": 9.927141450497039e-05, + "loss": 0.9427, + "step": 17040 + }, + { + "epoch": 0.10892759030448615, + "grad_norm": 0.7724682688713074, + "learning_rate": 9.927056079179076e-05, + "loss": 0.8286, + "step": 17050 + }, + { + "epoch": 0.10899147745422486, + "grad_norm": 0.614035964012146, + "learning_rate": 9.926970658241397e-05, + "loss": 0.8915, + "step": 17060 + }, + { + "epoch": 0.10905536460396356, + "grad_norm": 1.045047402381897, + "learning_rate": 9.926885187684859e-05, + "loss": 0.8422, + "step": 17070 + }, + { + "epoch": 0.10911925175370225, + "grad_norm": 0.7779353857040405, + "learning_rate": 9.926799667510326e-05, + "loss": 0.8882, + "step": 17080 + }, + { + "epoch": 0.10918313890344096, + "grad_norm": 0.9462752938270569, + "learning_rate": 9.926714097718657e-05, + "loss": 0.923, + "step": 17090 + }, + { + "epoch": 0.10924702605317967, + "grad_norm": 0.5807504057884216, + "learning_rate": 9.926628478310715e-05, + "loss": 0.8799, + "step": 17100 + }, + { + "epoch": 0.10931091320291836, + "grad_norm": 0.8692427277565002, + 
"learning_rate": 9.926542809287364e-05, + "loss": 0.9051, + "step": 17110 + }, + { + "epoch": 0.10937480035265706, + "grad_norm": 0.8406835794448853, + "learning_rate": 9.926457090649462e-05, + "loss": 0.7788, + "step": 17120 + }, + { + "epoch": 0.10943868750239577, + "grad_norm": 0.8116185665130615, + "learning_rate": 9.926371322397877e-05, + "loss": 0.7086, + "step": 17130 + }, + { + "epoch": 0.10950257465213446, + "grad_norm": 1.1767171621322632, + "learning_rate": 9.92628550453347e-05, + "loss": 0.9115, + "step": 17140 + }, + { + "epoch": 0.10956646180187317, + "grad_norm": 0.8342850804328918, + "learning_rate": 9.926199637057108e-05, + "loss": 0.8859, + "step": 17150 + }, + { + "epoch": 0.10963034895161188, + "grad_norm": 1.0540907382965088, + "learning_rate": 9.926113719969652e-05, + "loss": 1.1537, + "step": 17160 + }, + { + "epoch": 0.10969423610135058, + "grad_norm": 1.0921380519866943, + "learning_rate": 9.926027753271969e-05, + "loss": 0.6981, + "step": 17170 + }, + { + "epoch": 0.10975812325108927, + "grad_norm": 1.9400385618209839, + "learning_rate": 9.925941736964925e-05, + "loss": 0.953, + "step": 17180 + }, + { + "epoch": 0.10982201040082798, + "grad_norm": 0.9200677275657654, + "learning_rate": 9.925855671049387e-05, + "loss": 1.0603, + "step": 17190 + }, + { + "epoch": 0.10988589755056669, + "grad_norm": 0.8739213347434998, + "learning_rate": 9.92576955552622e-05, + "loss": 0.9993, + "step": 17200 + }, + { + "epoch": 0.10994978470030538, + "grad_norm": 0.4887886345386505, + "learning_rate": 9.925683390396292e-05, + "loss": 0.9623, + "step": 17210 + }, + { + "epoch": 0.11001367185004408, + "grad_norm": 0.7912802696228027, + "learning_rate": 9.925597175660472e-05, + "loss": 0.9074, + "step": 17220 + }, + { + "epoch": 0.11007755899978279, + "grad_norm": 0.8125321865081787, + "learning_rate": 9.925510911319626e-05, + "loss": 0.8537, + "step": 17230 + }, + { + "epoch": 0.11014144614952148, + "grad_norm": 0.5782091617584229, + "learning_rate": 
9.925424597374626e-05, + "loss": 0.8458, + "step": 17240 + }, + { + "epoch": 0.11020533329926019, + "grad_norm": 0.777730405330658, + "learning_rate": 9.925338233826338e-05, + "loss": 0.8778, + "step": 17250 + }, + { + "epoch": 0.1102692204489989, + "grad_norm": 0.8471282124519348, + "learning_rate": 9.925251820675633e-05, + "loss": 0.8727, + "step": 17260 + }, + { + "epoch": 0.11033310759873759, + "grad_norm": 0.7456023097038269, + "learning_rate": 9.92516535792338e-05, + "loss": 0.9105, + "step": 17270 + }, + { + "epoch": 0.1103969947484763, + "grad_norm": 1.1746282577514648, + "learning_rate": 9.925078845570452e-05, + "loss": 0.9972, + "step": 17280 + }, + { + "epoch": 0.110460881898215, + "grad_norm": 1.0486959218978882, + "learning_rate": 9.92499228361772e-05, + "loss": 0.923, + "step": 17290 + }, + { + "epoch": 0.11052476904795369, + "grad_norm": 0.8411831259727478, + "learning_rate": 9.924905672066054e-05, + "loss": 0.8309, + "step": 17300 + }, + { + "epoch": 0.1105886561976924, + "grad_norm": 0.8965527415275574, + "learning_rate": 9.924819010916328e-05, + "loss": 0.66, + "step": 17310 + }, + { + "epoch": 0.1106525433474311, + "grad_norm": 0.8932517170906067, + "learning_rate": 9.924732300169414e-05, + "loss": 0.9388, + "step": 17320 + }, + { + "epoch": 0.1107164304971698, + "grad_norm": 0.4896504878997803, + "learning_rate": 9.924645539826184e-05, + "loss": 0.9554, + "step": 17330 + }, + { + "epoch": 0.1107803176469085, + "grad_norm": 1.2872638702392578, + "learning_rate": 9.924558729887514e-05, + "loss": 1.0668, + "step": 17340 + }, + { + "epoch": 0.11084420479664721, + "grad_norm": 0.7142483592033386, + "learning_rate": 9.924471870354277e-05, + "loss": 0.9363, + "step": 17350 + }, + { + "epoch": 0.1109080919463859, + "grad_norm": 0.8296705484390259, + "learning_rate": 9.924384961227348e-05, + "loss": 0.7258, + "step": 17360 + }, + { + "epoch": 0.1109719790961246, + "grad_norm": 0.609883189201355, + "learning_rate": 9.924298002507602e-05, + "loss": 1.0268, 
+ "step": 17370 + }, + { + "epoch": 0.11103586624586331, + "grad_norm": 1.5724817514419556, + "learning_rate": 9.924210994195915e-05, + "loss": 0.9622, + "step": 17380 + }, + { + "epoch": 0.111099753395602, + "grad_norm": 0.8032723069190979, + "learning_rate": 9.924123936293164e-05, + "loss": 0.7995, + "step": 17390 + }, + { + "epoch": 0.11116364054534071, + "grad_norm": 0.6989961266517639, + "learning_rate": 9.924036828800223e-05, + "loss": 0.8276, + "step": 17400 + }, + { + "epoch": 0.11122752769507942, + "grad_norm": 0.6586987972259521, + "learning_rate": 9.923949671717973e-05, + "loss": 0.9443, + "step": 17410 + }, + { + "epoch": 0.11129141484481811, + "grad_norm": 1.005934715270996, + "learning_rate": 9.923862465047291e-05, + "loss": 0.7672, + "step": 17420 + }, + { + "epoch": 0.11135530199455682, + "grad_norm": 0.7518536448478699, + "learning_rate": 9.923775208789053e-05, + "loss": 0.9946, + "step": 17430 + }, + { + "epoch": 0.11141918914429552, + "grad_norm": 0.8329457640647888, + "learning_rate": 9.923687902944138e-05, + "loss": 0.8019, + "step": 17440 + }, + { + "epoch": 0.11148307629403421, + "grad_norm": 0.5968138575553894, + "learning_rate": 9.923600547513427e-05, + "loss": 0.779, + "step": 17450 + }, + { + "epoch": 0.11154696344377292, + "grad_norm": 0.5740717053413391, + "learning_rate": 9.9235131424978e-05, + "loss": 1.1269, + "step": 17460 + }, + { + "epoch": 0.11161085059351163, + "grad_norm": 0.4525648355484009, + "learning_rate": 9.923425687898135e-05, + "loss": 0.9606, + "step": 17470 + }, + { + "epoch": 0.11167473774325032, + "grad_norm": 0.7562941312789917, + "learning_rate": 9.923338183715314e-05, + "loss": 1.3565, + "step": 17480 + }, + { + "epoch": 0.11173862489298902, + "grad_norm": 0.8217481970787048, + "learning_rate": 9.923250629950218e-05, + "loss": 0.9787, + "step": 17490 + }, + { + "epoch": 0.11180251204272773, + "grad_norm": 1.1421339511871338, + "learning_rate": 9.92316302660373e-05, + "loss": 1.118, + "step": 17500 + }, + { + 
"epoch": 0.11186639919246642, + "grad_norm": 1.1751115322113037, + "learning_rate": 9.92307537367673e-05, + "loss": 0.8283, + "step": 17510 + }, + { + "epoch": 0.11193028634220513, + "grad_norm": 0.8353852033615112, + "learning_rate": 9.922987671170103e-05, + "loss": 1.1464, + "step": 17520 + }, + { + "epoch": 0.11199417349194384, + "grad_norm": 0.6148945689201355, + "learning_rate": 9.92289991908473e-05, + "loss": 0.8993, + "step": 17530 + }, + { + "epoch": 0.11205806064168253, + "grad_norm": 0.7143790125846863, + "learning_rate": 9.922812117421496e-05, + "loss": 0.8293, + "step": 17540 + }, + { + "epoch": 0.11212194779142123, + "grad_norm": 0.6704200506210327, + "learning_rate": 9.922724266181286e-05, + "loss": 0.9002, + "step": 17550 + }, + { + "epoch": 0.11218583494115994, + "grad_norm": 0.6758965253829956, + "learning_rate": 9.922636365364984e-05, + "loss": 0.868, + "step": 17560 + }, + { + "epoch": 0.11224972209089863, + "grad_norm": 1.4119186401367188, + "learning_rate": 9.922548414973473e-05, + "loss": 0.8967, + "step": 17570 + }, + { + "epoch": 0.11231360924063734, + "grad_norm": 0.7103084921836853, + "learning_rate": 9.922460415007644e-05, + "loss": 0.7774, + "step": 17580 + }, + { + "epoch": 0.11237749639037604, + "grad_norm": 1.5748227834701538, + "learning_rate": 9.922372365468378e-05, + "loss": 0.8543, + "step": 17590 + }, + { + "epoch": 0.11244138354011474, + "grad_norm": 0.8554244637489319, + "learning_rate": 9.922284266356565e-05, + "loss": 0.9862, + "step": 17600 + }, + { + "epoch": 0.11250527068985344, + "grad_norm": 0.9203200936317444, + "learning_rate": 9.92219611767309e-05, + "loss": 0.8462, + "step": 17610 + }, + { + "epoch": 0.11256915783959215, + "grad_norm": 0.8570156097412109, + "learning_rate": 9.922107919418842e-05, + "loss": 0.9768, + "step": 17620 + }, + { + "epoch": 0.11263304498933084, + "grad_norm": 0.8079208135604858, + "learning_rate": 9.92201967159471e-05, + "loss": 0.7745, + "step": 17630 + }, + { + "epoch": 
0.11269693213906955, + "grad_norm": 0.8128913640975952, + "learning_rate": 9.92193137420158e-05, + "loss": 0.7183, + "step": 17640 + }, + { + "epoch": 0.11276081928880825, + "grad_norm": 1.0222535133361816, + "learning_rate": 9.921843027240345e-05, + "loss": 0.762, + "step": 17650 + }, + { + "epoch": 0.11282470643854695, + "grad_norm": 0.782536506652832, + "learning_rate": 9.921754630711891e-05, + "loss": 1.0573, + "step": 17660 + }, + { + "epoch": 0.11288859358828565, + "grad_norm": 0.7294056415557861, + "learning_rate": 9.921666184617111e-05, + "loss": 1.1262, + "step": 17670 + }, + { + "epoch": 0.11295248073802436, + "grad_norm": 0.7423584461212158, + "learning_rate": 9.921577688956893e-05, + "loss": 0.9985, + "step": 17680 + }, + { + "epoch": 0.11301636788776305, + "grad_norm": 0.7123269438743591, + "learning_rate": 9.921489143732133e-05, + "loss": 0.9849, + "step": 17690 + }, + { + "epoch": 0.11308025503750176, + "grad_norm": 0.9806658625602722, + "learning_rate": 9.921400548943718e-05, + "loss": 0.8499, + "step": 17700 + }, + { + "epoch": 0.11314414218724046, + "grad_norm": 2.702582359313965, + "learning_rate": 9.921311904592541e-05, + "loss": 0.9368, + "step": 17710 + }, + { + "epoch": 0.11320802933697915, + "grad_norm": 0.627751350402832, + "learning_rate": 9.921223210679495e-05, + "loss": 1.0154, + "step": 17720 + }, + { + "epoch": 0.11327191648671786, + "grad_norm": 1.1272038221359253, + "learning_rate": 9.921134467205477e-05, + "loss": 1.0128, + "step": 17730 + }, + { + "epoch": 0.11333580363645657, + "grad_norm": 1.0452537536621094, + "learning_rate": 9.921045674171374e-05, + "loss": 0.9581, + "step": 17740 + }, + { + "epoch": 0.11339969078619526, + "grad_norm": 0.6000169515609741, + "learning_rate": 9.920956831578086e-05, + "loss": 1.0127, + "step": 17750 + }, + { + "epoch": 0.11346357793593397, + "grad_norm": 0.8441605567932129, + "learning_rate": 9.920867939426505e-05, + "loss": 1.0766, + "step": 17760 + }, + { + "epoch": 0.11352746508567267, + 
"grad_norm": 1.0325100421905518, + "learning_rate": 9.920778997717527e-05, + "loss": 0.673, + "step": 17770 + }, + { + "epoch": 0.11359135223541136, + "grad_norm": 0.8646054863929749, + "learning_rate": 9.920690006452047e-05, + "loss": 0.8475, + "step": 17780 + }, + { + "epoch": 0.11365523938515007, + "grad_norm": 1.1158571243286133, + "learning_rate": 9.920600965630962e-05, + "loss": 0.7743, + "step": 17790 + }, + { + "epoch": 0.11371912653488878, + "grad_norm": 0.790447473526001, + "learning_rate": 9.920511875255168e-05, + "loss": 0.8564, + "step": 17800 + }, + { + "epoch": 0.11378301368462747, + "grad_norm": 0.6469011902809143, + "learning_rate": 9.920422735325561e-05, + "loss": 0.9071, + "step": 17810 + }, + { + "epoch": 0.11384690083436617, + "grad_norm": 0.8129775524139404, + "learning_rate": 9.920333545843042e-05, + "loss": 0.9754, + "step": 17820 + }, + { + "epoch": 0.11391078798410488, + "grad_norm": 1.0118224620819092, + "learning_rate": 9.920244306808509e-05, + "loss": 0.8034, + "step": 17830 + }, + { + "epoch": 0.11397467513384357, + "grad_norm": 0.8558486104011536, + "learning_rate": 9.920155018222857e-05, + "loss": 1.0181, + "step": 17840 + }, + { + "epoch": 0.11403856228358228, + "grad_norm": 1.0910837650299072, + "learning_rate": 9.920065680086988e-05, + "loss": 0.9216, + "step": 17850 + }, + { + "epoch": 0.11410244943332098, + "grad_norm": 0.6649434566497803, + "learning_rate": 9.9199762924018e-05, + "loss": 0.7545, + "step": 17860 + }, + { + "epoch": 0.11416633658305968, + "grad_norm": 2.092512369155884, + "learning_rate": 9.919886855168196e-05, + "loss": 1.0409, + "step": 17870 + }, + { + "epoch": 0.11423022373279838, + "grad_norm": 1.0226621627807617, + "learning_rate": 9.919797368387073e-05, + "loss": 0.9839, + "step": 17880 + }, + { + "epoch": 0.11429411088253709, + "grad_norm": 0.9362402558326721, + "learning_rate": 9.919707832059337e-05, + "loss": 0.9349, + "step": 17890 + }, + { + "epoch": 0.11435799803227578, + "grad_norm": 
0.6043878793716431, + "learning_rate": 9.919618246185886e-05, + "loss": 0.9667, + "step": 17900 + }, + { + "epoch": 0.11442188518201449, + "grad_norm": 0.7030009031295776, + "learning_rate": 9.919528610767622e-05, + "loss": 0.8868, + "step": 17910 + }, + { + "epoch": 0.1144857723317532, + "grad_norm": 0.54000324010849, + "learning_rate": 9.919438925805451e-05, + "loss": 0.9966, + "step": 17920 + }, + { + "epoch": 0.11454965948149189, + "grad_norm": 0.7529541254043579, + "learning_rate": 9.919349191300272e-05, + "loss": 0.8292, + "step": 17930 + }, + { + "epoch": 0.11461354663123059, + "grad_norm": 1.047979712486267, + "learning_rate": 9.919259407252992e-05, + "loss": 0.722, + "step": 17940 + }, + { + "epoch": 0.1146774337809693, + "grad_norm": 0.6364821195602417, + "learning_rate": 9.919169573664513e-05, + "loss": 1.0382, + "step": 17950 + }, + { + "epoch": 0.11474132093070799, + "grad_norm": 1.298886775970459, + "learning_rate": 9.919079690535742e-05, + "loss": 1.0275, + "step": 17960 + }, + { + "epoch": 0.1148052080804467, + "grad_norm": 0.9060257077217102, + "learning_rate": 9.918989757867583e-05, + "loss": 0.8959, + "step": 17970 + }, + { + "epoch": 0.1148690952301854, + "grad_norm": 1.0557827949523926, + "learning_rate": 9.91889977566094e-05, + "loss": 0.8224, + "step": 17980 + }, + { + "epoch": 0.1149329823799241, + "grad_norm": 1.0880374908447266, + "learning_rate": 9.918809743916722e-05, + "loss": 1.0926, + "step": 17990 + }, + { + "epoch": 0.1149968695296628, + "grad_norm": 0.9131140112876892, + "learning_rate": 9.918719662635834e-05, + "loss": 0.8125, + "step": 18000 + }, + { + "epoch": 0.11506075667940151, + "grad_norm": 0.956883430480957, + "learning_rate": 9.918629531819184e-05, + "loss": 0.7358, + "step": 18010 + }, + { + "epoch": 0.11512464382914021, + "grad_norm": 1.1593812704086304, + "learning_rate": 9.91853935146768e-05, + "loss": 0.9325, + "step": 18020 + }, + { + "epoch": 0.1151885309788789, + "grad_norm": 0.8647767901420593, + "learning_rate": 
9.918449121582228e-05, + "loss": 0.889, + "step": 18030 + }, + { + "epoch": 0.11525241812861761, + "grad_norm": 0.875560462474823, + "learning_rate": 9.91835884216374e-05, + "loss": 0.7376, + "step": 18040 + }, + { + "epoch": 0.11531630527835632, + "grad_norm": 1.0609110593795776, + "learning_rate": 9.918268513213123e-05, + "loss": 0.9935, + "step": 18050 + }, + { + "epoch": 0.11538019242809501, + "grad_norm": 0.7033603191375732, + "learning_rate": 9.918178134731286e-05, + "loss": 0.9307, + "step": 18060 + }, + { + "epoch": 0.11544407957783372, + "grad_norm": 0.7909555435180664, + "learning_rate": 9.918087706719141e-05, + "loss": 1.0967, + "step": 18070 + }, + { + "epoch": 0.11550796672757242, + "grad_norm": 1.5477937459945679, + "learning_rate": 9.917997229177597e-05, + "loss": 0.921, + "step": 18080 + }, + { + "epoch": 0.11557185387731111, + "grad_norm": 1.373567819595337, + "learning_rate": 9.91790670210757e-05, + "loss": 0.7096, + "step": 18090 + }, + { + "epoch": 0.11563574102704982, + "grad_norm": 0.5353577136993408, + "learning_rate": 9.917816125509965e-05, + "loss": 0.8476, + "step": 18100 + }, + { + "epoch": 0.11569962817678853, + "grad_norm": 0.6826961040496826, + "learning_rate": 9.917725499385698e-05, + "loss": 1.0802, + "step": 18110 + }, + { + "epoch": 0.11576351532652722, + "grad_norm": 0.9268578290939331, + "learning_rate": 9.917634823735678e-05, + "loss": 1.0728, + "step": 18120 + }, + { + "epoch": 0.11582740247626593, + "grad_norm": 0.9943346381187439, + "learning_rate": 9.917544098560824e-05, + "loss": 1.2018, + "step": 18130 + }, + { + "epoch": 0.11589128962600463, + "grad_norm": 1.2347413301467896, + "learning_rate": 9.917453323862046e-05, + "loss": 0.8933, + "step": 18140 + }, + { + "epoch": 0.11595517677574332, + "grad_norm": 0.7425234913825989, + "learning_rate": 9.91736249964026e-05, + "loss": 0.7152, + "step": 18150 + }, + { + "epoch": 0.11601906392548203, + "grad_norm": 1.068671703338623, + "learning_rate": 9.917271625896377e-05, + 
"loss": 0.9737, + "step": 18160 + }, + { + "epoch": 0.11608295107522074, + "grad_norm": 0.9823939204216003, + "learning_rate": 9.917180702631316e-05, + "loss": 0.9365, + "step": 18170 + }, + { + "epoch": 0.11614683822495943, + "grad_norm": 0.7012134194374084, + "learning_rate": 9.917089729845991e-05, + "loss": 0.9741, + "step": 18180 + }, + { + "epoch": 0.11621072537469813, + "grad_norm": 0.8662933111190796, + "learning_rate": 9.916998707541319e-05, + "loss": 0.9238, + "step": 18190 + }, + { + "epoch": 0.11627461252443684, + "grad_norm": 0.5047873258590698, + "learning_rate": 9.916907635718216e-05, + "loss": 0.772, + "step": 18200 + }, + { + "epoch": 0.11633849967417553, + "grad_norm": 1.2148154973983765, + "learning_rate": 9.916816514377598e-05, + "loss": 0.8872, + "step": 18210 + }, + { + "epoch": 0.11640238682391424, + "grad_norm": 0.6862503886222839, + "learning_rate": 9.916725343520386e-05, + "loss": 0.9914, + "step": 18220 + }, + { + "epoch": 0.11646627397365295, + "grad_norm": 0.7228761315345764, + "learning_rate": 9.916634123147495e-05, + "loss": 1.3034, + "step": 18230 + }, + { + "epoch": 0.11653016112339164, + "grad_norm": 0.5457968711853027, + "learning_rate": 9.916542853259848e-05, + "loss": 0.8272, + "step": 18240 + }, + { + "epoch": 0.11659404827313034, + "grad_norm": 2.023207187652588, + "learning_rate": 9.916451533858358e-05, + "loss": 0.7746, + "step": 18250 + }, + { + "epoch": 0.11665793542286905, + "grad_norm": 0.9167050123214722, + "learning_rate": 9.916360164943947e-05, + "loss": 0.9439, + "step": 18260 + }, + { + "epoch": 0.11672182257260774, + "grad_norm": 0.7956591248512268, + "learning_rate": 9.916268746517537e-05, + "loss": 1.0798, + "step": 18270 + }, + { + "epoch": 0.11678570972234645, + "grad_norm": 0.8357956409454346, + "learning_rate": 9.916177278580047e-05, + "loss": 1.0405, + "step": 18280 + }, + { + "epoch": 0.11684959687208515, + "grad_norm": 0.7955309152603149, + "learning_rate": 9.9160857611324e-05, + "loss": 0.7501, + "step": 
18290 + }, + { + "epoch": 0.11691348402182385, + "grad_norm": 0.8821001052856445, + "learning_rate": 9.915994194175516e-05, + "loss": 0.916, + "step": 18300 + }, + { + "epoch": 0.11697737117156255, + "grad_norm": 0.7497395873069763, + "learning_rate": 9.915902577710318e-05, + "loss": 0.9209, + "step": 18310 + }, + { + "epoch": 0.11704125832130126, + "grad_norm": 0.7040755152702332, + "learning_rate": 9.915810911737727e-05, + "loss": 0.807, + "step": 18320 + }, + { + "epoch": 0.11710514547103995, + "grad_norm": 0.640442430973053, + "learning_rate": 9.915719196258668e-05, + "loss": 0.8374, + "step": 18330 + }, + { + "epoch": 0.11716903262077866, + "grad_norm": 0.8393665552139282, + "learning_rate": 9.915627431274064e-05, + "loss": 1.118, + "step": 18340 + }, + { + "epoch": 0.11723291977051736, + "grad_norm": 0.9538019895553589, + "learning_rate": 9.915535616784838e-05, + "loss": 0.6021, + "step": 18350 + }, + { + "epoch": 0.11729680692025606, + "grad_norm": 0.5672876238822937, + "learning_rate": 9.915443752791917e-05, + "loss": 0.7703, + "step": 18360 + }, + { + "epoch": 0.11736069406999476, + "grad_norm": 0.6178574562072754, + "learning_rate": 9.915351839296225e-05, + "loss": 1.1465, + "step": 18370 + }, + { + "epoch": 0.11742458121973347, + "grad_norm": 0.9924026131629944, + "learning_rate": 9.915259876298688e-05, + "loss": 0.9693, + "step": 18380 + }, + { + "epoch": 0.11748846836947216, + "grad_norm": 0.9154996275901794, + "learning_rate": 9.91516786380023e-05, + "loss": 1.5368, + "step": 18390 + }, + { + "epoch": 0.11755235551921087, + "grad_norm": 0.8077566623687744, + "learning_rate": 9.91507580180178e-05, + "loss": 0.9521, + "step": 18400 + }, + { + "epoch": 0.11761624266894957, + "grad_norm": 0.8165660500526428, + "learning_rate": 9.914983690304266e-05, + "loss": 0.977, + "step": 18410 + }, + { + "epoch": 0.11768012981868826, + "grad_norm": 0.46091389656066895, + "learning_rate": 9.914891529308614e-05, + "loss": 0.9916, + "step": 18420 + }, + { + "epoch": 
0.11774401696842697, + "grad_norm": 0.8578134775161743, + "learning_rate": 9.914799318815751e-05, + "loss": 0.8512, + "step": 18430 + }, + { + "epoch": 0.11780790411816568, + "grad_norm": 1.149581789970398, + "learning_rate": 9.914707058826607e-05, + "loss": 1.0471, + "step": 18440 + }, + { + "epoch": 0.11787179126790437, + "grad_norm": 1.0105202198028564, + "learning_rate": 9.91461474934211e-05, + "loss": 0.8365, + "step": 18450 + }, + { + "epoch": 0.11793567841764308, + "grad_norm": 0.5020955801010132, + "learning_rate": 9.914522390363194e-05, + "loss": 0.9565, + "step": 18460 + }, + { + "epoch": 0.11799956556738178, + "grad_norm": 0.5407631993293762, + "learning_rate": 9.914429981890783e-05, + "loss": 0.885, + "step": 18470 + }, + { + "epoch": 0.11806345271712047, + "grad_norm": 0.5676096081733704, + "learning_rate": 9.914337523925812e-05, + "loss": 0.9684, + "step": 18480 + }, + { + "epoch": 0.11812733986685918, + "grad_norm": 0.7046330571174622, + "learning_rate": 9.91424501646921e-05, + "loss": 1.2259, + "step": 18490 + }, + { + "epoch": 0.11819122701659789, + "grad_norm": 0.501208484172821, + "learning_rate": 9.914152459521909e-05, + "loss": 0.8909, + "step": 18500 + }, + { + "epoch": 0.11825511416633658, + "grad_norm": 1.0521641969680786, + "learning_rate": 9.914059853084842e-05, + "loss": 0.8647, + "step": 18510 + }, + { + "epoch": 0.11831900131607528, + "grad_norm": 1.0477256774902344, + "learning_rate": 9.913967197158942e-05, + "loss": 0.8122, + "step": 18520 + }, + { + "epoch": 0.11838288846581399, + "grad_norm": 1.0611625909805298, + "learning_rate": 9.913874491745138e-05, + "loss": 1.0057, + "step": 18530 + }, + { + "epoch": 0.11844677561555268, + "grad_norm": 0.833010733127594, + "learning_rate": 9.91378173684437e-05, + "loss": 0.8437, + "step": 18540 + }, + { + "epoch": 0.11851066276529139, + "grad_norm": 0.7416166067123413, + "learning_rate": 9.913688932457567e-05, + "loss": 0.9933, + "step": 18550 + }, + { + "epoch": 0.1185745499150301, + 
"grad_norm": 1.7045838832855225, + "learning_rate": 9.913596078585667e-05, + "loss": 0.7402, + "step": 18560 + }, + { + "epoch": 0.11863843706476879, + "grad_norm": 1.204579472541809, + "learning_rate": 9.913503175229603e-05, + "loss": 0.73, + "step": 18570 + }, + { + "epoch": 0.11870232421450749, + "grad_norm": 2.575094223022461, + "learning_rate": 9.91341022239031e-05, + "loss": 1.0376, + "step": 18580 + }, + { + "epoch": 0.1187662113642462, + "grad_norm": 1.0583864450454712, + "learning_rate": 9.913317220068728e-05, + "loss": 0.9919, + "step": 18590 + }, + { + "epoch": 0.11883009851398489, + "grad_norm": 1.465122938156128, + "learning_rate": 9.913224168265788e-05, + "loss": 0.8039, + "step": 18600 + }, + { + "epoch": 0.1188939856637236, + "grad_norm": 0.8531835675239563, + "learning_rate": 9.913131066982431e-05, + "loss": 0.846, + "step": 18610 + }, + { + "epoch": 0.1189578728134623, + "grad_norm": 0.6930166482925415, + "learning_rate": 9.913037916219594e-05, + "loss": 1.1698, + "step": 18620 + }, + { + "epoch": 0.119021759963201, + "grad_norm": 0.8985093832015991, + "learning_rate": 9.912944715978215e-05, + "loss": 1.0585, + "step": 18630 + }, + { + "epoch": 0.1190856471129397, + "grad_norm": 0.7134751677513123, + "learning_rate": 9.912851466259232e-05, + "loss": 0.8098, + "step": 18640 + }, + { + "epoch": 0.11914953426267841, + "grad_norm": 1.5828766822814941, + "learning_rate": 9.912758167063585e-05, + "loss": 1.116, + "step": 18650 + }, + { + "epoch": 0.1192134214124171, + "grad_norm": 0.602565586566925, + "learning_rate": 9.912664818392213e-05, + "loss": 1.0292, + "step": 18660 + }, + { + "epoch": 0.1192773085621558, + "grad_norm": 0.9910022616386414, + "learning_rate": 9.912571420246057e-05, + "loss": 1.0432, + "step": 18670 + }, + { + "epoch": 0.11934119571189451, + "grad_norm": 0.8652639389038086, + "learning_rate": 9.912477972626055e-05, + "loss": 0.941, + "step": 18680 + }, + { + "epoch": 0.1194050828616332, + "grad_norm": 0.6660580039024353, + 
"learning_rate": 9.912384475533152e-05, + "loss": 0.8312, + "step": 18690 + }, + { + "epoch": 0.11946897001137191, + "grad_norm": 1.2698357105255127, + "learning_rate": 9.912290928968286e-05, + "loss": 0.6955, + "step": 18700 + }, + { + "epoch": 0.11953285716111062, + "grad_norm": 0.7728399634361267, + "learning_rate": 9.9121973329324e-05, + "loss": 0.6761, + "step": 18710 + }, + { + "epoch": 0.11959674431084931, + "grad_norm": 1.1762244701385498, + "learning_rate": 9.91210368742644e-05, + "loss": 0.9973, + "step": 18720 + }, + { + "epoch": 0.11966063146058802, + "grad_norm": 0.9727983474731445, + "learning_rate": 9.912009992451343e-05, + "loss": 1.0287, + "step": 18730 + }, + { + "epoch": 0.11972451861032672, + "grad_norm": 1.166279673576355, + "learning_rate": 9.911916248008058e-05, + "loss": 0.9455, + "step": 18740 + }, + { + "epoch": 0.11978840576006541, + "grad_norm": 1.3871594667434692, + "learning_rate": 9.911822454097526e-05, + "loss": 0.8691, + "step": 18750 + }, + { + "epoch": 0.11985229290980412, + "grad_norm": 0.7483668923377991, + "learning_rate": 9.911728610720693e-05, + "loss": 0.7596, + "step": 18760 + }, + { + "epoch": 0.11991618005954283, + "grad_norm": 0.7471362352371216, + "learning_rate": 9.911634717878505e-05, + "loss": 0.7925, + "step": 18770 + }, + { + "epoch": 0.11998006720928152, + "grad_norm": 0.7331792712211609, + "learning_rate": 9.911540775571903e-05, + "loss": 0.7732, + "step": 18780 + }, + { + "epoch": 0.12004395435902022, + "grad_norm": 0.8485783934593201, + "learning_rate": 9.911446783801839e-05, + "loss": 1.0558, + "step": 18790 + }, + { + "epoch": 0.12010784150875893, + "grad_norm": 0.63601154088974, + "learning_rate": 9.911352742569255e-05, + "loss": 0.8409, + "step": 18800 + }, + { + "epoch": 0.12017172865849762, + "grad_norm": 1.0364725589752197, + "learning_rate": 9.911258651875102e-05, + "loss": 1.1726, + "step": 18810 + }, + { + "epoch": 0.12023561580823633, + "grad_norm": 1.1578558683395386, + "learning_rate": 
9.911164511720324e-05, + "loss": 1.0072, + "step": 18820 + }, + { + "epoch": 0.12029950295797504, + "grad_norm": 0.622075617313385, + "learning_rate": 9.911070322105871e-05, + "loss": 0.7986, + "step": 18830 + }, + { + "epoch": 0.12036339010771374, + "grad_norm": 0.9480080604553223, + "learning_rate": 9.91097608303269e-05, + "loss": 0.7835, + "step": 18840 + }, + { + "epoch": 0.12042727725745243, + "grad_norm": 0.6373130679130554, + "learning_rate": 9.910881794501734e-05, + "loss": 1.2013, + "step": 18850 + }, + { + "epoch": 0.12049116440719114, + "grad_norm": 1.1628334522247314, + "learning_rate": 9.910787456513948e-05, + "loss": 0.8801, + "step": 18860 + }, + { + "epoch": 0.12055505155692985, + "grad_norm": 1.2941060066223145, + "learning_rate": 9.910693069070285e-05, + "loss": 0.8426, + "step": 18870 + }, + { + "epoch": 0.12061893870666854, + "grad_norm": 1.0892646312713623, + "learning_rate": 9.910598632171692e-05, + "loss": 0.7019, + "step": 18880 + }, + { + "epoch": 0.12068282585640724, + "grad_norm": 1.0153416395187378, + "learning_rate": 9.910504145819124e-05, + "loss": 0.9361, + "step": 18890 + }, + { + "epoch": 0.12074671300614595, + "grad_norm": 0.8913525342941284, + "learning_rate": 9.910409610013531e-05, + "loss": 1.2171, + "step": 18900 + }, + { + "epoch": 0.12081060015588464, + "grad_norm": 1.518178105354309, + "learning_rate": 9.910315024755866e-05, + "loss": 0.8538, + "step": 18910 + }, + { + "epoch": 0.12087448730562335, + "grad_norm": 0.8142111301422119, + "learning_rate": 9.910220390047081e-05, + "loss": 0.9446, + "step": 18920 + }, + { + "epoch": 0.12093837445536205, + "grad_norm": 0.6663020849227905, + "learning_rate": 9.910125705888127e-05, + "loss": 0.9821, + "step": 18930 + }, + { + "epoch": 0.12100226160510075, + "grad_norm": 0.7732610106468201, + "learning_rate": 9.91003097227996e-05, + "loss": 0.7072, + "step": 18940 + }, + { + "epoch": 0.12106614875483945, + "grad_norm": 0.8097338080406189, + "learning_rate": 9.909936189223533e-05, + 
"loss": 1.0208, + "step": 18950 + }, + { + "epoch": 0.12113003590457816, + "grad_norm": 1.0220088958740234, + "learning_rate": 9.909841356719802e-05, + "loss": 0.7898, + "step": 18960 + }, + { + "epoch": 0.12119392305431685, + "grad_norm": 2.5410892963409424, + "learning_rate": 9.909746474769718e-05, + "loss": 0.884, + "step": 18970 + }, + { + "epoch": 0.12125781020405556, + "grad_norm": 1.1213639974594116, + "learning_rate": 9.909651543374243e-05, + "loss": 0.9554, + "step": 18980 + }, + { + "epoch": 0.12132169735379426, + "grad_norm": 0.8598119020462036, + "learning_rate": 9.909556562534327e-05, + "loss": 1.0724, + "step": 18990 + }, + { + "epoch": 0.12138558450353296, + "grad_norm": 0.89163738489151, + "learning_rate": 9.90946153225093e-05, + "loss": 0.9145, + "step": 19000 + }, + { + "epoch": 0.12144947165327166, + "grad_norm": 0.8153218030929565, + "learning_rate": 9.909366452525009e-05, + "loss": 0.8033, + "step": 19010 + }, + { + "epoch": 0.12151335880301037, + "grad_norm": 0.8267776966094971, + "learning_rate": 9.90927132335752e-05, + "loss": 0.9408, + "step": 19020 + }, + { + "epoch": 0.12157724595274906, + "grad_norm": 0.794154942035675, + "learning_rate": 9.909176144749421e-05, + "loss": 0.9167, + "step": 19030 + }, + { + "epoch": 0.12164113310248777, + "grad_norm": 0.9239640831947327, + "learning_rate": 9.909080916701672e-05, + "loss": 0.9062, + "step": 19040 + }, + { + "epoch": 0.12170502025222647, + "grad_norm": 1.426063060760498, + "learning_rate": 9.908995169188589e-05, + "loss": 0.9635, + "step": 19050 + }, + { + "epoch": 0.12176890740196517, + "grad_norm": 0.840755820274353, + "learning_rate": 9.908899847208145e-05, + "loss": 0.6732, + "step": 19060 + }, + { + "epoch": 0.12183279455170387, + "grad_norm": 1.245961308479309, + "learning_rate": 9.908804475790834e-05, + "loss": 1.0316, + "step": 19070 + }, + { + "epoch": 0.12189668170144258, + "grad_norm": 0.5957521796226501, + "learning_rate": 9.908709054937615e-05, + "loss": 0.7994, + "step": 19080 
+ }, + { + "epoch": 0.12196056885118127, + "grad_norm": 0.753171980381012, + "learning_rate": 9.908613584649447e-05, + "loss": 0.7903, + "step": 19090 + }, + { + "epoch": 0.12202445600091998, + "grad_norm": 0.5334873199462891, + "learning_rate": 9.908518064927297e-05, + "loss": 0.8806, + "step": 19100 + }, + { + "epoch": 0.12208834315065868, + "grad_norm": 0.7774950265884399, + "learning_rate": 9.908422495772121e-05, + "loss": 0.785, + "step": 19110 + }, + { + "epoch": 0.12215223030039737, + "grad_norm": 1.0679373741149902, + "learning_rate": 9.908326877184885e-05, + "loss": 1.1829, + "step": 19120 + }, + { + "epoch": 0.12221611745013608, + "grad_norm": 0.9180088043212891, + "learning_rate": 9.908231209166552e-05, + "loss": 0.943, + "step": 19130 + }, + { + "epoch": 0.12228000459987479, + "grad_norm": 2.2565629482269287, + "learning_rate": 9.908135491718082e-05, + "loss": 0.7051, + "step": 19140 + }, + { + "epoch": 0.12234389174961348, + "grad_norm": 0.5851088762283325, + "learning_rate": 9.908039724840444e-05, + "loss": 0.8, + "step": 19150 + }, + { + "epoch": 0.12240777889935218, + "grad_norm": 1.1300508975982666, + "learning_rate": 9.9079439085346e-05, + "loss": 1.0469, + "step": 19160 + }, + { + "epoch": 0.12247166604909089, + "grad_norm": 1.3692076206207275, + "learning_rate": 9.907848042801514e-05, + "loss": 0.8056, + "step": 19170 + }, + { + "epoch": 0.12253555319882958, + "grad_norm": 0.7391330599784851, + "learning_rate": 9.907752127642151e-05, + "loss": 1.0543, + "step": 19180 + }, + { + "epoch": 0.12259944034856829, + "grad_norm": 1.7373781204223633, + "learning_rate": 9.90765616305748e-05, + "loss": 1.1021, + "step": 19190 + }, + { + "epoch": 0.122663327498307, + "grad_norm": 1.2597390413284302, + "learning_rate": 9.907560149048465e-05, + "loss": 1.0209, + "step": 19200 + }, + { + "epoch": 0.12272721464804569, + "grad_norm": 0.7740830183029175, + "learning_rate": 9.907464085616073e-05, + "loss": 0.8195, + "step": 19210 + }, + { + "epoch": 
0.1227911017977844, + "grad_norm": 0.8929482698440552, + "learning_rate": 9.907367972761273e-05, + "loss": 0.8193, + "step": 19220 + }, + { + "epoch": 0.1228549889475231, + "grad_norm": 0.854239821434021, + "learning_rate": 9.907271810485033e-05, + "loss": 0.9699, + "step": 19230 + }, + { + "epoch": 0.12291887609726179, + "grad_norm": 1.0040228366851807, + "learning_rate": 9.907175598788319e-05, + "loss": 0.8653, + "step": 19240 + }, + { + "epoch": 0.1229827632470005, + "grad_norm": 0.9501043558120728, + "learning_rate": 9.907079337672102e-05, + "loss": 1.2441, + "step": 19250 + }, + { + "epoch": 0.1230466503967392, + "grad_norm": 0.9891424179077148, + "learning_rate": 9.90698302713735e-05, + "loss": 1.0697, + "step": 19260 + }, + { + "epoch": 0.1231105375464779, + "grad_norm": 0.7450829148292542, + "learning_rate": 9.906886667185034e-05, + "loss": 0.883, + "step": 19270 + }, + { + "epoch": 0.1231744246962166, + "grad_norm": 0.9859048128128052, + "learning_rate": 9.906790257816125e-05, + "loss": 1.0223, + "step": 19280 + }, + { + "epoch": 0.12323831184595531, + "grad_norm": 0.6718336343765259, + "learning_rate": 9.906693799031593e-05, + "loss": 0.7721, + "step": 19290 + }, + { + "epoch": 0.123302198995694, + "grad_norm": 0.9734120965003967, + "learning_rate": 9.90659729083241e-05, + "loss": 1.1092, + "step": 19300 + }, + { + "epoch": 0.12336608614543271, + "grad_norm": 0.5610973238945007, + "learning_rate": 9.906500733219545e-05, + "loss": 0.8074, + "step": 19310 + }, + { + "epoch": 0.12342997329517141, + "grad_norm": 0.9786707162857056, + "learning_rate": 9.906404126193976e-05, + "loss": 0.8548, + "step": 19320 + }, + { + "epoch": 0.1234938604449101, + "grad_norm": 0.71066814661026, + "learning_rate": 9.90630746975667e-05, + "loss": 0.9489, + "step": 19330 + }, + { + "epoch": 0.12355774759464881, + "grad_norm": 1.0802106857299805, + "learning_rate": 9.906210763908606e-05, + "loss": 1.0818, + "step": 19340 + }, + { + "epoch": 0.12362163474438752, + "grad_norm": 
0.8210603594779968, + "learning_rate": 9.906114008650753e-05, + "loss": 1.1651, + "step": 19350 + }, + { + "epoch": 0.12368552189412621, + "grad_norm": 1.066074252128601, + "learning_rate": 9.906017203984089e-05, + "loss": 1.1113, + "step": 19360 + }, + { + "epoch": 0.12374940904386492, + "grad_norm": 1.0652400255203247, + "learning_rate": 9.905920349909587e-05, + "loss": 0.8688, + "step": 19370 + }, + { + "epoch": 0.12381329619360362, + "grad_norm": 0.6207056045532227, + "learning_rate": 9.905823446428222e-05, + "loss": 0.7867, + "step": 19380 + }, + { + "epoch": 0.12387718334334231, + "grad_norm": 0.508903443813324, + "learning_rate": 9.905726493540972e-05, + "loss": 0.7805, + "step": 19390 + }, + { + "epoch": 0.12394107049308102, + "grad_norm": 1.3334448337554932, + "learning_rate": 9.905629491248812e-05, + "loss": 1.1862, + "step": 19400 + }, + { + "epoch": 0.12400495764281973, + "grad_norm": 0.6775515675544739, + "learning_rate": 9.905532439552718e-05, + "loss": 1.0348, + "step": 19410 + }, + { + "epoch": 0.12406884479255842, + "grad_norm": 0.628044605255127, + "learning_rate": 9.905435338453668e-05, + "loss": 0.8879, + "step": 19420 + }, + { + "epoch": 0.12413273194229713, + "grad_norm": 0.4216572940349579, + "learning_rate": 9.905338187952642e-05, + "loss": 0.9814, + "step": 19430 + }, + { + "epoch": 0.12419661909203583, + "grad_norm": 0.9256001710891724, + "learning_rate": 9.905240988050616e-05, + "loss": 0.834, + "step": 19440 + }, + { + "epoch": 0.12426050624177452, + "grad_norm": 1.2580517530441284, + "learning_rate": 9.90514373874857e-05, + "loss": 0.8597, + "step": 19450 + }, + { + "epoch": 0.12432439339151323, + "grad_norm": 0.9285855889320374, + "learning_rate": 9.905046440047483e-05, + "loss": 0.8476, + "step": 19460 + }, + { + "epoch": 0.12438828054125194, + "grad_norm": 4.265834808349609, + "learning_rate": 9.904949091948335e-05, + "loss": 0.7808, + "step": 19470 + }, + { + "epoch": 0.12445216769099063, + "grad_norm": 1.1027454137802124, + 
"learning_rate": 9.904851694452105e-05, + "loss": 0.9509, + "step": 19480 + }, + { + "epoch": 0.12451605484072933, + "grad_norm": 0.7222440838813782, + "learning_rate": 9.904754247559776e-05, + "loss": 1.19, + "step": 19490 + }, + { + "epoch": 0.12457994199046804, + "grad_norm": 0.9820877909660339, + "learning_rate": 9.904656751272328e-05, + "loss": 1.1383, + "step": 19500 + }, + { + "epoch": 0.12464382914020673, + "grad_norm": 0.521395742893219, + "learning_rate": 9.904559205590744e-05, + "loss": 0.7945, + "step": 19510 + }, + { + "epoch": 0.12470771628994544, + "grad_norm": 0.819299042224884, + "learning_rate": 9.904461610516006e-05, + "loss": 0.9847, + "step": 19520 + }, + { + "epoch": 0.12477160343968415, + "grad_norm": 0.7167036533355713, + "learning_rate": 9.904363966049098e-05, + "loss": 0.9058, + "step": 19530 + }, + { + "epoch": 0.12483549058942284, + "grad_norm": 0.9135296940803528, + "learning_rate": 9.904266272190999e-05, + "loss": 1.0799, + "step": 19540 + }, + { + "epoch": 0.12489937773916154, + "grad_norm": 0.9460045695304871, + "learning_rate": 9.904168528942696e-05, + "loss": 0.8938, + "step": 19550 + }, + { + "epoch": 0.12496326488890025, + "grad_norm": 0.8096686601638794, + "learning_rate": 9.904070736305176e-05, + "loss": 0.9354, + "step": 19560 + }, + { + "epoch": 0.12502715203863896, + "grad_norm": 0.8548075556755066, + "learning_rate": 9.903972894279419e-05, + "loss": 1.0604, + "step": 19570 + }, + { + "epoch": 0.12509103918837766, + "grad_norm": 0.9655779600143433, + "learning_rate": 9.903875002866412e-05, + "loss": 0.9133, + "step": 19580 + }, + { + "epoch": 0.12515492633811634, + "grad_norm": 0.6967488527297974, + "learning_rate": 9.903777062067142e-05, + "loss": 0.9566, + "step": 19590 + }, + { + "epoch": 0.12521881348785505, + "grad_norm": 0.5470744967460632, + "learning_rate": 9.903679071882594e-05, + "loss": 1.1614, + "step": 19600 + }, + { + "epoch": 0.12528270063759375, + "grad_norm": 0.44721782207489014, + "learning_rate": 
9.903581032313757e-05, + "loss": 0.8072, + "step": 19610 + }, + { + "epoch": 0.12534658778733246, + "grad_norm": 0.6788471937179565, + "learning_rate": 9.903482943361616e-05, + "loss": 0.7861, + "step": 19620 + }, + { + "epoch": 0.12541047493707116, + "grad_norm": 1.1422935724258423, + "learning_rate": 9.90338480502716e-05, + "loss": 0.7205, + "step": 19630 + }, + { + "epoch": 0.12547436208680987, + "grad_norm": 0.9505433440208435, + "learning_rate": 9.903286617311375e-05, + "loss": 0.9314, + "step": 19640 + }, + { + "epoch": 0.12553824923654855, + "grad_norm": 1.1754378080368042, + "learning_rate": 9.903188380215254e-05, + "loss": 1.0777, + "step": 19650 + }, + { + "epoch": 0.12560213638628726, + "grad_norm": 0.8856581449508667, + "learning_rate": 9.903090093739784e-05, + "loss": 0.8573, + "step": 19660 + }, + { + "epoch": 0.12566602353602596, + "grad_norm": 0.9377738237380981, + "learning_rate": 9.902991757885955e-05, + "loss": 1.1693, + "step": 19670 + }, + { + "epoch": 0.12572991068576467, + "grad_norm": 1.0631327629089355, + "learning_rate": 9.902893372654755e-05, + "loss": 1.0915, + "step": 19680 + }, + { + "epoch": 0.12579379783550337, + "grad_norm": 0.9726115465164185, + "learning_rate": 9.902794938047179e-05, + "loss": 0.8837, + "step": 19690 + }, + { + "epoch": 0.12585768498524208, + "grad_norm": 1.0716935396194458, + "learning_rate": 9.902696454064218e-05, + "loss": 0.9323, + "step": 19700 + }, + { + "epoch": 0.12592157213498076, + "grad_norm": 0.6799229383468628, + "learning_rate": 9.90259792070686e-05, + "loss": 1.0071, + "step": 19710 + }, + { + "epoch": 0.12598545928471946, + "grad_norm": 0.6276800632476807, + "learning_rate": 9.9024993379761e-05, + "loss": 0.7668, + "step": 19720 + }, + { + "epoch": 0.12604934643445817, + "grad_norm": 0.6206457018852234, + "learning_rate": 9.902400705872931e-05, + "loss": 0.9062, + "step": 19730 + }, + { + "epoch": 0.12611323358419688, + "grad_norm": 0.8407792448997498, + "learning_rate": 9.902302024398344e-05, + 
"loss": 0.9027, + "step": 19740 + }, + { + "epoch": 0.12617712073393558, + "grad_norm": 1.0708434581756592, + "learning_rate": 9.902203293553337e-05, + "loss": 1.1047, + "step": 19750 + }, + { + "epoch": 0.1262410078836743, + "grad_norm": 0.7790530920028687, + "learning_rate": 9.902104513338901e-05, + "loss": 0.9325, + "step": 19760 + }, + { + "epoch": 0.12630489503341297, + "grad_norm": 0.8316869139671326, + "learning_rate": 9.90200568375603e-05, + "loss": 0.7061, + "step": 19770 + }, + { + "epoch": 0.12636878218315167, + "grad_norm": 1.0187642574310303, + "learning_rate": 9.901906804805723e-05, + "loss": 1.0777, + "step": 19780 + }, + { + "epoch": 0.12643266933289038, + "grad_norm": 0.5136988759040833, + "learning_rate": 9.901807876488973e-05, + "loss": 0.9242, + "step": 19790 + }, + { + "epoch": 0.12649655648262909, + "grad_norm": 0.8000445365905762, + "learning_rate": 9.901708898806777e-05, + "loss": 0.9573, + "step": 19800 + }, + { + "epoch": 0.1265604436323678, + "grad_norm": 0.6587111353874207, + "learning_rate": 9.901609871760132e-05, + "loss": 1.0622, + "step": 19810 + }, + { + "epoch": 0.1266243307821065, + "grad_norm": 0.7772683501243591, + "learning_rate": 9.901510795350035e-05, + "loss": 0.9968, + "step": 19820 + }, + { + "epoch": 0.12668821793184518, + "grad_norm": 0.578628659248352, + "learning_rate": 9.901411669577484e-05, + "loss": 0.8025, + "step": 19830 + }, + { + "epoch": 0.12675210508158388, + "grad_norm": 0.5878568887710571, + "learning_rate": 9.901312494443477e-05, + "loss": 0.9639, + "step": 19840 + }, + { + "epoch": 0.1268159922313226, + "grad_norm": 1.2923487424850464, + "learning_rate": 9.901213269949013e-05, + "loss": 0.8744, + "step": 19850 + }, + { + "epoch": 0.1268798793810613, + "grad_norm": 0.8328975439071655, + "learning_rate": 9.90111399609509e-05, + "loss": 0.8774, + "step": 19860 + }, + { + "epoch": 0.1269437665308, + "grad_norm": 0.5888987183570862, + "learning_rate": 9.901024607425051e-05, + "loss": 0.8943, + "step": 19870 + 
}, + { + "epoch": 0.1270076536805387, + "grad_norm": 0.5450535416603088, + "learning_rate": 9.900925239790913e-05, + "loss": 0.8265, + "step": 19880 + }, + { + "epoch": 0.12707154083027739, + "grad_norm": 1.1033037900924683, + "learning_rate": 9.90082582280022e-05, + "loss": 1.0133, + "step": 19890 + }, + { + "epoch": 0.1271354279800161, + "grad_norm": 0.7691605687141418, + "learning_rate": 9.90072635645397e-05, + "loss": 1.0807, + "step": 19900 + }, + { + "epoch": 0.1271993151297548, + "grad_norm": 0.5714837908744812, + "learning_rate": 9.900626840753167e-05, + "loss": 0.8473, + "step": 19910 + }, + { + "epoch": 0.1272632022794935, + "grad_norm": 0.5955528020858765, + "learning_rate": 9.90052727569881e-05, + "loss": 0.9808, + "step": 19920 + }, + { + "epoch": 0.1273270894292322, + "grad_norm": 0.6563436388969421, + "learning_rate": 9.900427661291904e-05, + "loss": 0.9406, + "step": 19930 + }, + { + "epoch": 0.12739097657897092, + "grad_norm": 1.398422360420227, + "learning_rate": 9.900327997533454e-05, + "loss": 1.1866, + "step": 19940 + }, + { + "epoch": 0.1274548637287096, + "grad_norm": 0.8855098485946655, + "learning_rate": 9.900228284424459e-05, + "loss": 1.167, + "step": 19950 + }, + { + "epoch": 0.1275187508784483, + "grad_norm": 0.8752385973930359, + "learning_rate": 9.900128521965927e-05, + "loss": 0.9714, + "step": 19960 + }, + { + "epoch": 0.127582638028187, + "grad_norm": 0.7587289810180664, + "learning_rate": 9.900028710158865e-05, + "loss": 1.1985, + "step": 19970 + }, + { + "epoch": 0.1276465251779257, + "grad_norm": 0.683338463306427, + "learning_rate": 9.899928849004269e-05, + "loss": 1.0779, + "step": 19980 + }, + { + "epoch": 0.12771041232766442, + "grad_norm": 0.738228440284729, + "learning_rate": 9.899828938503155e-05, + "loss": 0.8112, + "step": 19990 + }, + { + "epoch": 0.12777429947740312, + "grad_norm": 1.1224406957626343, + "learning_rate": 9.899728978656521e-05, + "loss": 0.707, + "step": 20000 + }, + { + "epoch": 0.1278381866271418, + 
"grad_norm": 1.0595028400421143, + "learning_rate": 9.89962896946538e-05, + "loss": 1.296, + "step": 20010 + }, + { + "epoch": 0.1279020737768805, + "grad_norm": 0.972698986530304, + "learning_rate": 9.899528910930736e-05, + "loss": 0.9258, + "step": 20020 + }, + { + "epoch": 0.12796596092661922, + "grad_norm": 0.7331506609916687, + "learning_rate": 9.899428803053597e-05, + "loss": 0.8608, + "step": 20030 + }, + { + "epoch": 0.12802984807635792, + "grad_norm": 0.9206950664520264, + "learning_rate": 9.899328645834971e-05, + "loss": 0.9087, + "step": 20040 + }, + { + "epoch": 0.12809373522609663, + "grad_norm": 2.2389299869537354, + "learning_rate": 9.899228439275867e-05, + "loss": 0.9422, + "step": 20050 + }, + { + "epoch": 0.12815762237583533, + "grad_norm": 1.7067959308624268, + "learning_rate": 9.899128183377294e-05, + "loss": 0.8746, + "step": 20060 + }, + { + "epoch": 0.128221509525574, + "grad_norm": 0.6370442509651184, + "learning_rate": 9.899027878140264e-05, + "loss": 1.1108, + "step": 20070 + }, + { + "epoch": 0.12828539667531272, + "grad_norm": 0.7334869503974915, + "learning_rate": 9.898927523565782e-05, + "loss": 0.7668, + "step": 20080 + }, + { + "epoch": 0.12834928382505142, + "grad_norm": 0.948521077632904, + "learning_rate": 9.898827119654864e-05, + "loss": 0.9522, + "step": 20090 + }, + { + "epoch": 0.12841317097479013, + "grad_norm": 1.9327528476715088, + "learning_rate": 9.898726666408516e-05, + "loss": 0.8717, + "step": 20100 + }, + { + "epoch": 0.12847705812452884, + "grad_norm": 0.8920581936836243, + "learning_rate": 9.898626163827755e-05, + "loss": 0.956, + "step": 20110 + }, + { + "epoch": 0.12854094527426754, + "grad_norm": 0.7983399033546448, + "learning_rate": 9.89852561191359e-05, + "loss": 0.8899, + "step": 20120 + }, + { + "epoch": 0.12860483242400625, + "grad_norm": 0.9559574723243713, + "learning_rate": 9.898425010667035e-05, + "loss": 0.9156, + "step": 20130 + }, + { + "epoch": 0.12866871957374493, + "grad_norm": 0.5370156764984131, 
+ "learning_rate": 9.898324360089099e-05, + "loss": 0.8623, + "step": 20140 + }, + { + "epoch": 0.12873260672348363, + "grad_norm": 1.084375262260437, + "learning_rate": 9.898223660180802e-05, + "loss": 0.7424, + "step": 20150 + }, + { + "epoch": 0.12879649387322234, + "grad_norm": 0.6435216069221497, + "learning_rate": 9.898122910943155e-05, + "loss": 0.7816, + "step": 20160 + }, + { + "epoch": 0.12886038102296105, + "grad_norm": 0.8738903999328613, + "learning_rate": 9.898022112377172e-05, + "loss": 0.7824, + "step": 20170 + }, + { + "epoch": 0.12892426817269975, + "grad_norm": 0.943022608757019, + "learning_rate": 9.89792126448387e-05, + "loss": 1.1974, + "step": 20180 + }, + { + "epoch": 0.12898815532243846, + "grad_norm": 0.9258697032928467, + "learning_rate": 9.897820367264262e-05, + "loss": 0.9837, + "step": 20190 + }, + { + "epoch": 0.12905204247217714, + "grad_norm": 0.8255495429039001, + "learning_rate": 9.897719420719367e-05, + "loss": 0.8147, + "step": 20200 + }, + { + "epoch": 0.12911592962191584, + "grad_norm": 0.5483478307723999, + "learning_rate": 9.897618424850199e-05, + "loss": 0.9607, + "step": 20210 + }, + { + "epoch": 0.12917981677165455, + "grad_norm": 0.976705014705658, + "learning_rate": 9.897517379657778e-05, + "loss": 0.9184, + "step": 20220 + }, + { + "epoch": 0.12924370392139325, + "grad_norm": 0.66350257396698, + "learning_rate": 9.89741628514312e-05, + "loss": 0.8475, + "step": 20230 + }, + { + "epoch": 0.12930759107113196, + "grad_norm": 0.9961204528808594, + "learning_rate": 9.897315141307242e-05, + "loss": 0.9149, + "step": 20240 + }, + { + "epoch": 0.12937147822087067, + "grad_norm": 0.8872457146644592, + "learning_rate": 9.897213948151165e-05, + "loss": 0.8368, + "step": 20250 + }, + { + "epoch": 0.12943536537060935, + "grad_norm": 1.1536744832992554, + "learning_rate": 9.897112705675906e-05, + "loss": 0.8775, + "step": 20260 + }, + { + "epoch": 0.12949925252034805, + "grad_norm": 0.835328221321106, + "learning_rate": 
9.897011413882484e-05, + "loss": 0.8357, + "step": 20270 + }, + { + "epoch": 0.12956313967008676, + "grad_norm": 0.5641841292381287, + "learning_rate": 9.896910072771924e-05, + "loss": 0.9148, + "step": 20280 + }, + { + "epoch": 0.12962702681982546, + "grad_norm": 0.9598913192749023, + "learning_rate": 9.89680868234524e-05, + "loss": 0.8754, + "step": 20290 + }, + { + "epoch": 0.12969091396956417, + "grad_norm": 0.7789944410324097, + "learning_rate": 9.896707242603457e-05, + "loss": 0.8845, + "step": 20300 + }, + { + "epoch": 0.12975480111930288, + "grad_norm": 1.1389309167861938, + "learning_rate": 9.896605753547596e-05, + "loss": 1.028, + "step": 20310 + }, + { + "epoch": 0.12981868826904155, + "grad_norm": 0.8242889642715454, + "learning_rate": 9.896504215178681e-05, + "loss": 0.7889, + "step": 20320 + }, + { + "epoch": 0.12988257541878026, + "grad_norm": 1.3238638639450073, + "learning_rate": 9.89640262749773e-05, + "loss": 0.8451, + "step": 20330 + }, + { + "epoch": 0.12994646256851897, + "grad_norm": 1.0306720733642578, + "learning_rate": 9.896300990505768e-05, + "loss": 0.9655, + "step": 20340 + }, + { + "epoch": 0.13001034971825767, + "grad_norm": 0.7990890145301819, + "learning_rate": 9.896199304203821e-05, + "loss": 0.9537, + "step": 20350 + }, + { + "epoch": 0.13007423686799638, + "grad_norm": 0.8819360136985779, + "learning_rate": 9.89609756859291e-05, + "loss": 0.9703, + "step": 20360 + }, + { + "epoch": 0.13013812401773509, + "grad_norm": 0.8472315669059753, + "learning_rate": 9.895995783674061e-05, + "loss": 1.1459, + "step": 20370 + }, + { + "epoch": 0.13020201116747376, + "grad_norm": 0.8132781386375427, + "learning_rate": 9.895893949448301e-05, + "loss": 1.2826, + "step": 20380 + }, + { + "epoch": 0.13026589831721247, + "grad_norm": 1.0438861846923828, + "learning_rate": 9.89579206591665e-05, + "loss": 0.846, + "step": 20390 + }, + { + "epoch": 0.13032978546695118, + "grad_norm": 1.2839152812957764, + "learning_rate": 9.89569013308014e-05, + 
"loss": 0.798, + "step": 20400 + }, + { + "epoch": 0.13039367261668988, + "grad_norm": 0.7642764449119568, + "learning_rate": 9.895588150939794e-05, + "loss": 1.2106, + "step": 20410 + }, + { + "epoch": 0.1304575597664286, + "grad_norm": 1.4906141757965088, + "learning_rate": 9.89548611949664e-05, + "loss": 1.0197, + "step": 20420 + }, + { + "epoch": 0.1305214469161673, + "grad_norm": 1.0365071296691895, + "learning_rate": 9.895384038751705e-05, + "loss": 0.793, + "step": 20430 + }, + { + "epoch": 0.13058533406590597, + "grad_norm": 0.7034469842910767, + "learning_rate": 9.895281908706018e-05, + "loss": 1.0824, + "step": 20440 + }, + { + "epoch": 0.13064922121564468, + "grad_norm": 0.8058176636695862, + "learning_rate": 9.895179729360606e-05, + "loss": 0.9053, + "step": 20450 + }, + { + "epoch": 0.13071310836538338, + "grad_norm": 1.0343101024627686, + "learning_rate": 9.8950775007165e-05, + "loss": 1.0945, + "step": 20460 + }, + { + "epoch": 0.1307769955151221, + "grad_norm": 0.7652077674865723, + "learning_rate": 9.89497522277473e-05, + "loss": 0.8568, + "step": 20470 + }, + { + "epoch": 0.1308408826648608, + "grad_norm": 0.6593330502510071, + "learning_rate": 9.894872895536325e-05, + "loss": 0.9574, + "step": 20480 + }, + { + "epoch": 0.1309047698145995, + "grad_norm": 1.9122685194015503, + "learning_rate": 9.894770519002314e-05, + "loss": 0.8306, + "step": 20490 + }, + { + "epoch": 0.13096865696433818, + "grad_norm": 0.57440185546875, + "learning_rate": 9.894668093173729e-05, + "loss": 0.6806, + "step": 20500 + }, + { + "epoch": 0.1310325441140769, + "grad_norm": 0.5228521227836609, + "learning_rate": 9.894565618051603e-05, + "loss": 0.9544, + "step": 20510 + }, + { + "epoch": 0.1310964312638156, + "grad_norm": 0.6962705850601196, + "learning_rate": 9.894463093636966e-05, + "loss": 0.7487, + "step": 20520 + }, + { + "epoch": 0.1311603184135543, + "grad_norm": 0.92603999376297, + "learning_rate": 9.89436051993085e-05, + "loss": 0.7536, + "step": 20530 + }, + { + 
"epoch": 0.131224205563293, + "grad_norm": 1.5977349281311035, + "learning_rate": 9.894257896934292e-05, + "loss": 1.0637, + "step": 20540 + }, + { + "epoch": 0.1312880927130317, + "grad_norm": 1.1071442365646362, + "learning_rate": 9.894155224648322e-05, + "loss": 0.7441, + "step": 20550 + }, + { + "epoch": 0.1313519798627704, + "grad_norm": 0.576611340045929, + "learning_rate": 9.894052503073973e-05, + "loss": 0.7077, + "step": 20560 + }, + { + "epoch": 0.1314158670125091, + "grad_norm": 0.7525666356086731, + "learning_rate": 9.893949732212284e-05, + "loss": 0.9049, + "step": 20570 + }, + { + "epoch": 0.1314797541622478, + "grad_norm": 0.7113981246948242, + "learning_rate": 9.893846912064287e-05, + "loss": 1.0453, + "step": 20580 + }, + { + "epoch": 0.1315436413119865, + "grad_norm": 0.9703547358512878, + "learning_rate": 9.893744042631016e-05, + "loss": 0.903, + "step": 20590 + }, + { + "epoch": 0.13160752846172522, + "grad_norm": 0.8187039494514465, + "learning_rate": 9.89364112391351e-05, + "loss": 0.7462, + "step": 20600 + }, + { + "epoch": 0.13167141561146392, + "grad_norm": 0.6756948232650757, + "learning_rate": 9.893538155912804e-05, + "loss": 0.8157, + "step": 20610 + }, + { + "epoch": 0.1317353027612026, + "grad_norm": 1.0830146074295044, + "learning_rate": 9.893435138629936e-05, + "loss": 0.7643, + "step": 20620 + }, + { + "epoch": 0.1317991899109413, + "grad_norm": 1.8327852487564087, + "learning_rate": 9.893332072065942e-05, + "loss": 0.9862, + "step": 20630 + }, + { + "epoch": 0.13186307706068, + "grad_norm": 2.034275770187378, + "learning_rate": 9.893228956221861e-05, + "loss": 0.8296, + "step": 20640 + }, + { + "epoch": 0.13192696421041872, + "grad_norm": 0.7762085795402527, + "learning_rate": 9.893125791098729e-05, + "loss": 0.8986, + "step": 20650 + }, + { + "epoch": 0.13199085136015742, + "grad_norm": 1.0018727779388428, + "learning_rate": 9.89302257669759e-05, + "loss": 0.9179, + "step": 20660 + }, + { + "epoch": 0.13205473850989613, + 
"grad_norm": 1.3458504676818848, + "learning_rate": 9.89291931301948e-05, + "loss": 0.7677, + "step": 20670 + }, + { + "epoch": 0.1321186256596348, + "grad_norm": 0.7849268913269043, + "learning_rate": 9.89281600006544e-05, + "loss": 1.1385, + "step": 20680 + }, + { + "epoch": 0.13218251280937351, + "grad_norm": 0.9244788289070129, + "learning_rate": 9.892712637836507e-05, + "loss": 0.8751, + "step": 20690 + }, + { + "epoch": 0.13224639995911222, + "grad_norm": 0.7756919860839844, + "learning_rate": 9.892609226333728e-05, + "loss": 0.8581, + "step": 20700 + }, + { + "epoch": 0.13231028710885093, + "grad_norm": 0.7075464129447937, + "learning_rate": 9.89250576555814e-05, + "loss": 0.8242, + "step": 20710 + }, + { + "epoch": 0.13237417425858963, + "grad_norm": 0.8638562560081482, + "learning_rate": 9.892402255510786e-05, + "loss": 0.9992, + "step": 20720 + }, + { + "epoch": 0.13243806140832834, + "grad_norm": 0.9571630954742432, + "learning_rate": 9.89229869619271e-05, + "loss": 0.9963, + "step": 20730 + }, + { + "epoch": 0.13250194855806702, + "grad_norm": 1.0435787439346313, + "learning_rate": 9.892195087604954e-05, + "loss": 0.8632, + "step": 20740 + }, + { + "epoch": 0.13256583570780572, + "grad_norm": 1.1710478067398071, + "learning_rate": 9.89209142974856e-05, + "loss": 0.8433, + "step": 20750 + }, + { + "epoch": 0.13262972285754443, + "grad_norm": 0.6886267066001892, + "learning_rate": 9.891987722624574e-05, + "loss": 1.0012, + "step": 20760 + }, + { + "epoch": 0.13269361000728314, + "grad_norm": 1.172371745109558, + "learning_rate": 9.89188396623404e-05, + "loss": 0.9275, + "step": 20770 + }, + { + "epoch": 0.13275749715702184, + "grad_norm": 0.8536580204963684, + "learning_rate": 9.891780160577999e-05, + "loss": 1.0204, + "step": 20780 + }, + { + "epoch": 0.13282138430676055, + "grad_norm": 0.8853366374969482, + "learning_rate": 9.891676305657502e-05, + "loss": 0.913, + "step": 20790 + }, + { + "epoch": 0.13288527145649923, + "grad_norm": 0.9350702166557312, 
+ "learning_rate": 9.891572401473594e-05, + "loss": 0.8241, + "step": 20800 + }, + { + "epoch": 0.13294915860623793, + "grad_norm": 0.7683811783790588, + "learning_rate": 9.891468448027318e-05, + "loss": 0.7284, + "step": 20810 + }, + { + "epoch": 0.13301304575597664, + "grad_norm": 1.978036880493164, + "learning_rate": 9.891364445319723e-05, + "loss": 0.9082, + "step": 20820 + }, + { + "epoch": 0.13307693290571534, + "grad_norm": 0.828632652759552, + "learning_rate": 9.891260393351858e-05, + "loss": 0.7259, + "step": 20830 + }, + { + "epoch": 0.13314082005545405, + "grad_norm": 0.8856496810913086, + "learning_rate": 9.891156292124768e-05, + "loss": 0.8231, + "step": 20840 + }, + { + "epoch": 0.13320470720519276, + "grad_norm": 0.9500540494918823, + "learning_rate": 9.891052141639505e-05, + "loss": 0.8454, + "step": 20850 + }, + { + "epoch": 0.13326859435493144, + "grad_norm": 0.6504539251327515, + "learning_rate": 9.890947941897113e-05, + "loss": 0.8924, + "step": 20860 + }, + { + "epoch": 0.13333248150467014, + "grad_norm": 0.9036272168159485, + "learning_rate": 9.890843692898644e-05, + "loss": 1.2291, + "step": 20870 + }, + { + "epoch": 0.13339636865440885, + "grad_norm": 0.487404465675354, + "learning_rate": 9.890739394645149e-05, + "loss": 0.9082, + "step": 20880 + }, + { + "epoch": 0.13346025580414755, + "grad_norm": 0.6194189786911011, + "learning_rate": 9.890635047137678e-05, + "loss": 0.8234, + "step": 20890 + }, + { + "epoch": 0.13352414295388626, + "grad_norm": 1.1986579895019531, + "learning_rate": 9.890530650377279e-05, + "loss": 1.0975, + "step": 20900 + }, + { + "epoch": 0.13358803010362497, + "grad_norm": 0.5254888534545898, + "learning_rate": 9.890426204365006e-05, + "loss": 0.9337, + "step": 20910 + }, + { + "epoch": 0.13365191725336364, + "grad_norm": 2.1500959396362305, + "learning_rate": 9.890321709101911e-05, + "loss": 0.9268, + "step": 20920 + }, + { + "epoch": 0.13371580440310235, + "grad_norm": 1.653495192527771, + "learning_rate": 
9.890217164589044e-05, + "loss": 0.805, + "step": 20930 + }, + { + "epoch": 0.13377969155284106, + "grad_norm": 0.9262358546257019, + "learning_rate": 9.890112570827461e-05, + "loss": 0.7364, + "step": 20940 + }, + { + "epoch": 0.13384357870257976, + "grad_norm": 0.7505791187286377, + "learning_rate": 9.890007927818214e-05, + "loss": 0.836, + "step": 20950 + }, + { + "epoch": 0.13390746585231847, + "grad_norm": 0.7554075717926025, + "learning_rate": 9.889903235562357e-05, + "loss": 1.0677, + "step": 20960 + }, + { + "epoch": 0.13397135300205718, + "grad_norm": 1.8679813146591187, + "learning_rate": 9.889798494060942e-05, + "loss": 0.818, + "step": 20970 + }, + { + "epoch": 0.13403524015179588, + "grad_norm": 1.584902286529541, + "learning_rate": 9.889693703315029e-05, + "loss": 1.1151, + "step": 20980 + }, + { + "epoch": 0.13409912730153456, + "grad_norm": 0.8589569330215454, + "learning_rate": 9.889588863325667e-05, + "loss": 0.9884, + "step": 20990 + }, + { + "epoch": 0.13416301445127327, + "grad_norm": 0.9949905872344971, + "learning_rate": 9.889483974093917e-05, + "loss": 0.925, + "step": 21000 + }, + { + "epoch": 0.13422690160101197, + "grad_norm": 0.6873974204063416, + "learning_rate": 9.889379035620833e-05, + "loss": 0.9067, + "step": 21010 + }, + { + "epoch": 0.13429078875075068, + "grad_norm": 2.3519535064697266, + "learning_rate": 9.889274047907472e-05, + "loss": 0.9542, + "step": 21020 + }, + { + "epoch": 0.13435467590048938, + "grad_norm": 0.6520812511444092, + "learning_rate": 9.889169010954892e-05, + "loss": 0.9918, + "step": 21030 + }, + { + "epoch": 0.1344185630502281, + "grad_norm": 0.6458450555801392, + "learning_rate": 9.88906392476415e-05, + "loss": 1.0032, + "step": 21040 + }, + { + "epoch": 0.13448245019996677, + "grad_norm": 1.0632940530776978, + "learning_rate": 9.888958789336304e-05, + "loss": 1.0281, + "step": 21050 + }, + { + "epoch": 0.13454633734970547, + "grad_norm": 0.8738301992416382, + "learning_rate": 9.888853604672415e-05, + 
"loss": 0.8943, + "step": 21060 + }, + { + "epoch": 0.13461022449944418, + "grad_norm": 0.8271169662475586, + "learning_rate": 9.88874837077354e-05, + "loss": 0.8891, + "step": 21070 + }, + { + "epoch": 0.1346741116491829, + "grad_norm": 0.7805771827697754, + "learning_rate": 9.888643087640739e-05, + "loss": 0.9641, + "step": 21080 + }, + { + "epoch": 0.1347379987989216, + "grad_norm": 1.1134415864944458, + "learning_rate": 9.888537755275073e-05, + "loss": 0.8162, + "step": 21090 + }, + { + "epoch": 0.1348018859486603, + "grad_norm": 0.9039101004600525, + "learning_rate": 9.888432373677602e-05, + "loss": 1.0201, + "step": 21100 + }, + { + "epoch": 0.13486577309839898, + "grad_norm": 0.8428747653961182, + "learning_rate": 9.888326942849389e-05, + "loss": 0.8404, + "step": 21110 + }, + { + "epoch": 0.13492966024813768, + "grad_norm": 0.8015506267547607, + "learning_rate": 9.888221462791493e-05, + "loss": 1.0085, + "step": 21120 + }, + { + "epoch": 0.1349935473978764, + "grad_norm": 1.1341489553451538, + "learning_rate": 9.88811593350498e-05, + "loss": 0.9912, + "step": 21130 + }, + { + "epoch": 0.1350574345476151, + "grad_norm": 0.8308176398277283, + "learning_rate": 9.888010354990911e-05, + "loss": 0.7831, + "step": 21140 + }, + { + "epoch": 0.1351213216973538, + "grad_norm": 0.8086538910865784, + "learning_rate": 9.887904727250348e-05, + "loss": 0.8645, + "step": 21150 + }, + { + "epoch": 0.1351852088470925, + "grad_norm": 0.5411624908447266, + "learning_rate": 9.887799050284355e-05, + "loss": 1.0745, + "step": 21160 + }, + { + "epoch": 0.1352490959968312, + "grad_norm": 0.8454309105873108, + "learning_rate": 9.887693324093998e-05, + "loss": 0.8502, + "step": 21170 + }, + { + "epoch": 0.1353129831465699, + "grad_norm": 0.8707975149154663, + "learning_rate": 9.88758754868034e-05, + "loss": 0.9231, + "step": 21180 + }, + { + "epoch": 0.1353768702963086, + "grad_norm": 0.819693386554718, + "learning_rate": 9.887481724044447e-05, + "loss": 0.8355, + "step": 21190 + }, 
+ { + "epoch": 0.1354407574460473, + "grad_norm": 0.7734857201576233, + "learning_rate": 9.887375850187386e-05, + "loss": 1.1568, + "step": 21200 + }, + { + "epoch": 0.135504644595786, + "grad_norm": 0.6396207809448242, + "learning_rate": 9.887269927110222e-05, + "loss": 0.9182, + "step": 21210 + }, + { + "epoch": 0.13556853174552472, + "grad_norm": 1.281610369682312, + "learning_rate": 9.88716395481402e-05, + "loss": 0.9132, + "step": 21220 + }, + { + "epoch": 0.1356324188952634, + "grad_norm": 1.1592093706130981, + "learning_rate": 9.88705793329985e-05, + "loss": 0.9757, + "step": 21230 + }, + { + "epoch": 0.1356963060450021, + "grad_norm": 0.7820732593536377, + "learning_rate": 9.88695186256878e-05, + "loss": 0.9314, + "step": 21240 + }, + { + "epoch": 0.1357601931947408, + "grad_norm": 0.7652541399002075, + "learning_rate": 9.886845742621876e-05, + "loss": 1.0022, + "step": 21250 + }, + { + "epoch": 0.13582408034447951, + "grad_norm": 0.7700982689857483, + "learning_rate": 9.886739573460207e-05, + "loss": 1.0373, + "step": 21260 + }, + { + "epoch": 0.13588796749421822, + "grad_norm": 1.0912948846817017, + "learning_rate": 9.886633355084843e-05, + "loss": 0.9453, + "step": 21270 + }, + { + "epoch": 0.13595185464395693, + "grad_norm": 0.6350242495536804, + "learning_rate": 9.886527087496853e-05, + "loss": 0.7426, + "step": 21280 + }, + { + "epoch": 0.1360157417936956, + "grad_norm": 0.7051372528076172, + "learning_rate": 9.886420770697309e-05, + "loss": 0.823, + "step": 21290 + }, + { + "epoch": 0.1360796289434343, + "grad_norm": 0.8976541757583618, + "learning_rate": 9.88631440468728e-05, + "loss": 0.9737, + "step": 21300 + }, + { + "epoch": 0.13614351609317302, + "grad_norm": 2.363358974456787, + "learning_rate": 9.886207989467837e-05, + "loss": 0.9999, + "step": 21310 + }, + { + "epoch": 0.13620740324291172, + "grad_norm": 1.8028829097747803, + "learning_rate": 9.886101525040055e-05, + "loss": 0.8832, + "step": 21320 + }, + { + "epoch": 0.13627129039265043, + 
"grad_norm": 1.440885305404663, + "learning_rate": 9.885995011405e-05, + "loss": 0.922, + "step": 21330 + }, + { + "epoch": 0.13633517754238914, + "grad_norm": 0.9806457161903381, + "learning_rate": 9.88588844856375e-05, + "loss": 0.9399, + "step": 21340 + }, + { + "epoch": 0.1363990646921278, + "grad_norm": 0.8839708566665649, + "learning_rate": 9.885781836517377e-05, + "loss": 0.7167, + "step": 21350 + }, + { + "epoch": 0.13646295184186652, + "grad_norm": 1.2500883340835571, + "learning_rate": 9.885675175266953e-05, + "loss": 1.177, + "step": 21360 + }, + { + "epoch": 0.13652683899160523, + "grad_norm": 1.0007693767547607, + "learning_rate": 9.885568464813554e-05, + "loss": 0.9975, + "step": 21370 + }, + { + "epoch": 0.13659072614134393, + "grad_norm": 0.8086827397346497, + "learning_rate": 9.885461705158254e-05, + "loss": 0.8139, + "step": 21380 + }, + { + "epoch": 0.13665461329108264, + "grad_norm": 0.947471022605896, + "learning_rate": 9.885354896302128e-05, + "loss": 1.1116, + "step": 21390 + }, + { + "epoch": 0.13671850044082134, + "grad_norm": 0.8959566950798035, + "learning_rate": 9.885248038246251e-05, + "loss": 1.1009, + "step": 21400 + }, + { + "epoch": 0.13678238759056002, + "grad_norm": 1.1560317277908325, + "learning_rate": 9.8851411309917e-05, + "loss": 0.6756, + "step": 21410 + }, + { + "epoch": 0.13684627474029873, + "grad_norm": 1.5213913917541504, + "learning_rate": 9.885034174539552e-05, + "loss": 0.7979, + "step": 21420 + }, + { + "epoch": 0.13691016189003744, + "grad_norm": 0.5531548261642456, + "learning_rate": 9.884927168890884e-05, + "loss": 0.9408, + "step": 21430 + }, + { + "epoch": 0.13697404903977614, + "grad_norm": 0.7810382843017578, + "learning_rate": 9.884820114046774e-05, + "loss": 0.8515, + "step": 21440 + }, + { + "epoch": 0.13703793618951485, + "grad_norm": 1.0958387851715088, + "learning_rate": 9.884713010008298e-05, + "loss": 0.869, + "step": 21450 + }, + { + "epoch": 0.13710182333925355, + "grad_norm": 0.4343324899673462, + 
"learning_rate": 9.884605856776537e-05, + "loss": 0.8596, + "step": 21460 + }, + { + "epoch": 0.13716571048899223, + "grad_norm": 0.9415945410728455, + "learning_rate": 9.884498654352567e-05, + "loss": 0.7679, + "step": 21470 + }, + { + "epoch": 0.13722959763873094, + "grad_norm": 0.6814182996749878, + "learning_rate": 9.884391402737473e-05, + "loss": 0.9849, + "step": 21480 + }, + { + "epoch": 0.13729348478846964, + "grad_norm": 0.8244829177856445, + "learning_rate": 9.88428410193233e-05, + "loss": 0.9204, + "step": 21490 + }, + { + "epoch": 0.13735737193820835, + "grad_norm": 0.5591076612472534, + "learning_rate": 9.884176751938222e-05, + "loss": 1.0907, + "step": 21500 + }, + { + "epoch": 0.13742125908794706, + "grad_norm": 0.6328865885734558, + "learning_rate": 9.884069352756228e-05, + "loss": 0.7108, + "step": 21510 + }, + { + "epoch": 0.13748514623768576, + "grad_norm": 0.6090789437294006, + "learning_rate": 9.883961904387431e-05, + "loss": 0.8593, + "step": 21520 + }, + { + "epoch": 0.13754903338742444, + "grad_norm": 1.0573042631149292, + "learning_rate": 9.88385440683291e-05, + "loss": 1.2391, + "step": 21530 + }, + { + "epoch": 0.13761292053716315, + "grad_norm": 1.0376691818237305, + "learning_rate": 9.883746860093752e-05, + "loss": 0.8013, + "step": 21540 + }, + { + "epoch": 0.13767680768690185, + "grad_norm": 0.9400094747543335, + "learning_rate": 9.883639264171038e-05, + "loss": 0.8789, + "step": 21550 + }, + { + "epoch": 0.13774069483664056, + "grad_norm": 2.4618563652038574, + "learning_rate": 9.88353161906585e-05, + "loss": 1.1602, + "step": 21560 + }, + { + "epoch": 0.13780458198637927, + "grad_norm": 1.2091678380966187, + "learning_rate": 9.883423924779277e-05, + "loss": 0.7947, + "step": 21570 + }, + { + "epoch": 0.13786846913611797, + "grad_norm": 0.7721507549285889, + "learning_rate": 9.883316181312398e-05, + "loss": 1.0147, + "step": 21580 + }, + { + "epoch": 0.13793235628585665, + "grad_norm": 0.9228678941726685, + "learning_rate": 
9.8832083886663e-05, + "loss": 0.8414, + "step": 21590 + }, + { + "epoch": 0.13799624343559536, + "grad_norm": 0.6696807742118835, + "learning_rate": 9.883100546842071e-05, + "loss": 1.0162, + "step": 21600 + }, + { + "epoch": 0.13806013058533406, + "grad_norm": 0.8186768889427185, + "learning_rate": 9.882992655840793e-05, + "loss": 0.8442, + "step": 21610 + }, + { + "epoch": 0.13812401773507277, + "grad_norm": 1.444062352180481, + "learning_rate": 9.882884715663557e-05, + "loss": 1.2117, + "step": 21620 + }, + { + "epoch": 0.13818790488481147, + "grad_norm": 0.7770470380783081, + "learning_rate": 9.882776726311445e-05, + "loss": 0.7657, + "step": 21630 + }, + { + "epoch": 0.13825179203455018, + "grad_norm": 1.0606368780136108, + "learning_rate": 9.882668687785548e-05, + "loss": 0.8434, + "step": 21640 + }, + { + "epoch": 0.13831567918428886, + "grad_norm": 1.0077322721481323, + "learning_rate": 9.882560600086954e-05, + "loss": 0.8445, + "step": 21650 + }, + { + "epoch": 0.13837956633402757, + "grad_norm": 0.9700446128845215, + "learning_rate": 9.882452463216749e-05, + "loss": 0.7823, + "step": 21660 + }, + { + "epoch": 0.13844345348376627, + "grad_norm": 0.7618522644042969, + "learning_rate": 9.882344277176025e-05, + "loss": 1.0858, + "step": 21670 + }, + { + "epoch": 0.13850734063350498, + "grad_norm": 0.5642924904823303, + "learning_rate": 9.882236041965871e-05, + "loss": 0.9753, + "step": 21680 + }, + { + "epoch": 0.13857122778324368, + "grad_norm": 0.6261829733848572, + "learning_rate": 9.882127757587377e-05, + "loss": 0.773, + "step": 21690 + }, + { + "epoch": 0.1386351149329824, + "grad_norm": 0.48715344071388245, + "learning_rate": 9.882019424041629e-05, + "loss": 0.7998, + "step": 21700 + }, + { + "epoch": 0.13869900208272107, + "grad_norm": 0.850307285785675, + "learning_rate": 9.881911041329726e-05, + "loss": 0.9011, + "step": 21710 + }, + { + "epoch": 0.13876288923245977, + "grad_norm": 0.7470149993896484, + "learning_rate": 9.881802609452753e-05, + 
"loss": 0.9515, + "step": 21720 + }, + { + "epoch": 0.13882677638219848, + "grad_norm": 0.6368236541748047, + "learning_rate": 9.881694128411804e-05, + "loss": 1.0206, + "step": 21730 + }, + { + "epoch": 0.1388906635319372, + "grad_norm": 1.1505577564239502, + "learning_rate": 9.881585598207973e-05, + "loss": 1.0826, + "step": 21740 + }, + { + "epoch": 0.1389545506816759, + "grad_norm": 2.4669744968414307, + "learning_rate": 9.881477018842352e-05, + "loss": 1.018, + "step": 21750 + }, + { + "epoch": 0.1390184378314146, + "grad_norm": 1.7851297855377197, + "learning_rate": 9.881368390316033e-05, + "loss": 0.7395, + "step": 21760 + }, + { + "epoch": 0.1390823249811533, + "grad_norm": 0.6467908620834351, + "learning_rate": 9.881259712630113e-05, + "loss": 0.9388, + "step": 21770 + }, + { + "epoch": 0.13914621213089198, + "grad_norm": 1.0224095582962036, + "learning_rate": 9.881150985785683e-05, + "loss": 0.9804, + "step": 21780 + }, + { + "epoch": 0.1392100992806307, + "grad_norm": 0.8423238396644592, + "learning_rate": 9.881042209783842e-05, + "loss": 0.8013, + "step": 21790 + }, + { + "epoch": 0.1392739864303694, + "grad_norm": 0.8437933325767517, + "learning_rate": 9.880933384625681e-05, + "loss": 1.0403, + "step": 21800 + }, + { + "epoch": 0.1393378735801081, + "grad_norm": 0.8127179145812988, + "learning_rate": 9.880824510312301e-05, + "loss": 1.0857, + "step": 21810 + }, + { + "epoch": 0.1394017607298468, + "grad_norm": 0.7408185005187988, + "learning_rate": 9.880715586844793e-05, + "loss": 0.8628, + "step": 21820 + }, + { + "epoch": 0.1394656478795855, + "grad_norm": 0.8337761759757996, + "learning_rate": 9.880606614224256e-05, + "loss": 0.9279, + "step": 21830 + }, + { + "epoch": 0.1395295350293242, + "grad_norm": 0.7604190707206726, + "learning_rate": 9.880497592451791e-05, + "loss": 0.6789, + "step": 21840 + }, + { + "epoch": 0.1395934221790629, + "grad_norm": 0.8677889704704285, + "learning_rate": 9.880388521528491e-05, + "loss": 1.2008, + "step": 21850 + 
}, + { + "epoch": 0.1396573093288016, + "grad_norm": 0.6006574630737305, + "learning_rate": 9.880279401455459e-05, + "loss": 0.9166, + "step": 21860 + }, + { + "epoch": 0.1397211964785403, + "grad_norm": 1.9674246311187744, + "learning_rate": 9.880170232233789e-05, + "loss": 0.7204, + "step": 21870 + }, + { + "epoch": 0.13978508362827902, + "grad_norm": 1.377977967262268, + "learning_rate": 9.880061013864583e-05, + "loss": 0.9785, + "step": 21880 + }, + { + "epoch": 0.13984897077801772, + "grad_norm": 0.8513831496238708, + "learning_rate": 9.879951746348942e-05, + "loss": 1.1253, + "step": 21890 + }, + { + "epoch": 0.1399128579277564, + "grad_norm": 0.7017676830291748, + "learning_rate": 9.879842429687964e-05, + "loss": 0.6319, + "step": 21900 + }, + { + "epoch": 0.1399767450774951, + "grad_norm": 0.8190149068832397, + "learning_rate": 9.87973306388275e-05, + "loss": 1.0802, + "step": 21910 + }, + { + "epoch": 0.1400406322272338, + "grad_norm": 0.7550898790359497, + "learning_rate": 9.879623648934404e-05, + "loss": 0.8998, + "step": 21920 + }, + { + "epoch": 0.14010451937697252, + "grad_norm": 0.8115261793136597, + "learning_rate": 9.879514184844027e-05, + "loss": 0.6637, + "step": 21930 + }, + { + "epoch": 0.14016840652671123, + "grad_norm": 0.6252816319465637, + "learning_rate": 9.87940467161272e-05, + "loss": 0.9156, + "step": 21940 + }, + { + "epoch": 0.14023229367644993, + "grad_norm": 2.5343711376190186, + "learning_rate": 9.879295109241587e-05, + "loss": 1.0213, + "step": 21950 + }, + { + "epoch": 0.1402961808261886, + "grad_norm": 0.9597296714782715, + "learning_rate": 9.87918549773173e-05, + "loss": 0.7637, + "step": 21960 + }, + { + "epoch": 0.14036006797592732, + "grad_norm": 0.613199770450592, + "learning_rate": 9.879075837084255e-05, + "loss": 0.9528, + "step": 21970 + }, + { + "epoch": 0.14042395512566602, + "grad_norm": 1.902039885520935, + "learning_rate": 9.878966127300264e-05, + "loss": 0.7262, + "step": 21980 + }, + { + "epoch": 
0.14048784227540473, + "grad_norm": 0.6681598424911499, + "learning_rate": 9.878856368380864e-05, + "loss": 0.9852, + "step": 21990 + }, + { + "epoch": 0.14055172942514343, + "grad_norm": 0.593425989151001, + "learning_rate": 9.87874656032716e-05, + "loss": 0.8866, + "step": 22000 + }, + { + "epoch": 0.14061561657488214, + "grad_norm": 0.8456883430480957, + "learning_rate": 9.878636703140257e-05, + "loss": 0.7837, + "step": 22010 + }, + { + "epoch": 0.14067950372462082, + "grad_norm": 0.719262421131134, + "learning_rate": 9.878526796821261e-05, + "loss": 1.0117, + "step": 22020 + }, + { + "epoch": 0.14074339087435953, + "grad_norm": 0.740960419178009, + "learning_rate": 9.878416841371282e-05, + "loss": 1.0046, + "step": 22030 + }, + { + "epoch": 0.14080727802409823, + "grad_norm": 1.0368300676345825, + "learning_rate": 9.878306836791423e-05, + "loss": 0.8077, + "step": 22040 + }, + { + "epoch": 0.14087116517383694, + "grad_norm": 1.3289177417755127, + "learning_rate": 9.878196783082793e-05, + "loss": 0.777, + "step": 22050 + }, + { + "epoch": 0.14093505232357564, + "grad_norm": 0.9713805913925171, + "learning_rate": 9.878086680246504e-05, + "loss": 0.8098, + "step": 22060 + }, + { + "epoch": 0.14099893947331435, + "grad_norm": 0.5410668253898621, + "learning_rate": 9.877976528283661e-05, + "loss": 0.9304, + "step": 22070 + }, + { + "epoch": 0.14106282662305303, + "grad_norm": 0.8843280673027039, + "learning_rate": 9.877866327195373e-05, + "loss": 0.8307, + "step": 22080 + }, + { + "epoch": 0.14112671377279173, + "grad_norm": 1.040749430656433, + "learning_rate": 9.877756076982751e-05, + "loss": 0.8895, + "step": 22090 + }, + { + "epoch": 0.14119060092253044, + "grad_norm": 0.8764167428016663, + "learning_rate": 9.877645777646907e-05, + "loss": 0.9634, + "step": 22100 + }, + { + "epoch": 0.14125448807226915, + "grad_norm": 0.5217092633247375, + "learning_rate": 9.87753542918895e-05, + "loss": 1.0052, + "step": 22110 + }, + { + "epoch": 0.14131837522200785, + 
"grad_norm": 0.6405453681945801, + "learning_rate": 9.87742503160999e-05, + "loss": 0.9599, + "step": 22120 + }, + { + "epoch": 0.14138226237174656, + "grad_norm": 0.7412799000740051, + "learning_rate": 9.877314584911143e-05, + "loss": 0.7852, + "step": 22130 + }, + { + "epoch": 0.14144614952148524, + "grad_norm": 1.6060749292373657, + "learning_rate": 9.877204089093516e-05, + "loss": 1.2637, + "step": 22140 + }, + { + "epoch": 0.14151003667122394, + "grad_norm": 1.0910207033157349, + "learning_rate": 9.877093544158227e-05, + "loss": 0.8333, + "step": 22150 + }, + { + "epoch": 0.14157392382096265, + "grad_norm": 1.1602824926376343, + "learning_rate": 9.876982950106384e-05, + "loss": 0.9858, + "step": 22160 + }, + { + "epoch": 0.14163781097070136, + "grad_norm": 0.8228923082351685, + "learning_rate": 9.876872306939105e-05, + "loss": 1.0867, + "step": 22170 + }, + { + "epoch": 0.14170169812044006, + "grad_norm": 0.825602650642395, + "learning_rate": 9.876761614657504e-05, + "loss": 0.8261, + "step": 22180 + }, + { + "epoch": 0.14176558527017877, + "grad_norm": 0.7997944355010986, + "learning_rate": 9.876650873262692e-05, + "loss": 0.8914, + "step": 22190 + }, + { + "epoch": 0.14182947241991745, + "grad_norm": 0.6990909576416016, + "learning_rate": 9.876540082755788e-05, + "loss": 1.1852, + "step": 22200 + }, + { + "epoch": 0.14189335956965615, + "grad_norm": 0.7908971309661865, + "learning_rate": 9.876429243137906e-05, + "loss": 0.7917, + "step": 22210 + }, + { + "epoch": 0.14195724671939486, + "grad_norm": 1.2012591361999512, + "learning_rate": 9.876318354410163e-05, + "loss": 0.9249, + "step": 22220 + }, + { + "epoch": 0.14202113386913356, + "grad_norm": 0.7461243867874146, + "learning_rate": 9.876207416573677e-05, + "loss": 0.9312, + "step": 22230 + }, + { + "epoch": 0.14208502101887227, + "grad_norm": 0.8374635577201843, + "learning_rate": 9.876096429629563e-05, + "loss": 0.8613, + "step": 22240 + }, + { + "epoch": 0.14214890816861098, + "grad_norm": 
0.6775134801864624, + "learning_rate": 9.875985393578938e-05, + "loss": 0.9284, + "step": 22250 + }, + { + "epoch": 0.14221279531834966, + "grad_norm": 1.069081425666809, + "learning_rate": 9.875874308422923e-05, + "loss": 0.809, + "step": 22260 + }, + { + "epoch": 0.14227668246808836, + "grad_norm": 0.8016782402992249, + "learning_rate": 9.875763174162635e-05, + "loss": 0.8151, + "step": 22270 + }, + { + "epoch": 0.14234056961782707, + "grad_norm": 0.7888844609260559, + "learning_rate": 9.875651990799196e-05, + "loss": 0.8556, + "step": 22280 + }, + { + "epoch": 0.14240445676756577, + "grad_norm": 0.8360929489135742, + "learning_rate": 9.875540758333721e-05, + "loss": 0.7994, + "step": 22290 + }, + { + "epoch": 0.14246834391730448, + "grad_norm": 0.520611584186554, + "learning_rate": 9.875429476767333e-05, + "loss": 0.8767, + "step": 22300 + }, + { + "epoch": 0.14253223106704319, + "grad_norm": 0.47477564215660095, + "learning_rate": 9.875318146101151e-05, + "loss": 0.7093, + "step": 22310 + }, + { + "epoch": 0.14259611821678186, + "grad_norm": 0.7633807063102722, + "learning_rate": 9.8752067663363e-05, + "loss": 1.1947, + "step": 22320 + }, + { + "epoch": 0.14266000536652057, + "grad_norm": 0.7206790447235107, + "learning_rate": 9.875095337473899e-05, + "loss": 0.8928, + "step": 22330 + }, + { + "epoch": 0.14272389251625928, + "grad_norm": 0.7361767888069153, + "learning_rate": 9.874983859515069e-05, + "loss": 0.8716, + "step": 22340 + }, + { + "epoch": 0.14278777966599798, + "grad_norm": 0.8034409880638123, + "learning_rate": 9.874872332460934e-05, + "loss": 0.9446, + "step": 22350 + }, + { + "epoch": 0.1428516668157367, + "grad_norm": 0.8999035954475403, + "learning_rate": 9.874760756312617e-05, + "loss": 1.0096, + "step": 22360 + }, + { + "epoch": 0.1429155539654754, + "grad_norm": 0.8220607042312622, + "learning_rate": 9.874649131071244e-05, + "loss": 0.9535, + "step": 22370 + }, + { + "epoch": 0.14297944111521407, + "grad_norm": 1.6880701780319214, + 
"learning_rate": 9.874537456737936e-05, + "loss": 0.9347, + "step": 22380 + }, + { + "epoch": 0.14304332826495278, + "grad_norm": 2.4227957725524902, + "learning_rate": 9.874425733313819e-05, + "loss": 0.9415, + "step": 22390 + }, + { + "epoch": 0.14310721541469149, + "grad_norm": 0.665111243724823, + "learning_rate": 9.874313960800017e-05, + "loss": 0.8991, + "step": 22400 + }, + { + "epoch": 0.1431711025644302, + "grad_norm": 1.0277364253997803, + "learning_rate": 9.874202139197657e-05, + "loss": 1.0399, + "step": 22410 + }, + { + "epoch": 0.1432349897141689, + "grad_norm": 0.8064048290252686, + "learning_rate": 9.874090268507866e-05, + "loss": 0.9161, + "step": 22420 + }, + { + "epoch": 0.1432988768639076, + "grad_norm": 0.5607860684394836, + "learning_rate": 9.873978348731767e-05, + "loss": 0.8696, + "step": 22430 + }, + { + "epoch": 0.14336276401364628, + "grad_norm": 0.6954864263534546, + "learning_rate": 9.873866379870492e-05, + "loss": 0.6301, + "step": 22440 + }, + { + "epoch": 0.143426651163385, + "grad_norm": 0.675815999507904, + "learning_rate": 9.873754361925162e-05, + "loss": 0.9119, + "step": 22450 + }, + { + "epoch": 0.1434905383131237, + "grad_norm": 1.266095757484436, + "learning_rate": 9.873642294896913e-05, + "loss": 0.9423, + "step": 22460 + }, + { + "epoch": 0.1435544254628624, + "grad_norm": 0.8914671540260315, + "learning_rate": 9.873530178786868e-05, + "loss": 1.034, + "step": 22470 + }, + { + "epoch": 0.1436183126126011, + "grad_norm": 0.953437864780426, + "learning_rate": 9.873418013596159e-05, + "loss": 0.9487, + "step": 22480 + }, + { + "epoch": 0.1436821997623398, + "grad_norm": 0.6912809014320374, + "learning_rate": 9.873305799325914e-05, + "loss": 1.1522, + "step": 22490 + }, + { + "epoch": 0.1437460869120785, + "grad_norm": 0.6595206260681152, + "learning_rate": 9.873193535977263e-05, + "loss": 0.977, + "step": 22500 + }, + { + "epoch": 0.1438099740618172, + "grad_norm": 0.9730925559997559, + "learning_rate": 9.873081223551338e-05, 
+ "loss": 0.7952, + "step": 22510 + }, + { + "epoch": 0.1438738612115559, + "grad_norm": 4.339688777923584, + "learning_rate": 9.872968862049268e-05, + "loss": 0.9139, + "step": 22520 + }, + { + "epoch": 0.1439377483612946, + "grad_norm": 0.939578652381897, + "learning_rate": 9.872856451472188e-05, + "loss": 1.0464, + "step": 22530 + }, + { + "epoch": 0.14400163551103332, + "grad_norm": 0.8998389840126038, + "learning_rate": 9.872743991821227e-05, + "loss": 0.9492, + "step": 22540 + }, + { + "epoch": 0.14406552266077202, + "grad_norm": 0.7495961785316467, + "learning_rate": 9.872631483097518e-05, + "loss": 0.8357, + "step": 22550 + }, + { + "epoch": 0.1441294098105107, + "grad_norm": 0.7158836126327515, + "learning_rate": 9.872518925302195e-05, + "loss": 0.6346, + "step": 22560 + }, + { + "epoch": 0.1441932969602494, + "grad_norm": 1.3562219142913818, + "learning_rate": 9.872406318436391e-05, + "loss": 0.7683, + "step": 22570 + }, + { + "epoch": 0.1442571841099881, + "grad_norm": 2.4515798091888428, + "learning_rate": 9.872293662501239e-05, + "loss": 0.93, + "step": 22580 + }, + { + "epoch": 0.14432107125972682, + "grad_norm": 0.6932923197746277, + "learning_rate": 9.872180957497876e-05, + "loss": 0.8557, + "step": 22590 + }, + { + "epoch": 0.14438495840946552, + "grad_norm": 0.8083714842796326, + "learning_rate": 9.872068203427434e-05, + "loss": 0.9603, + "step": 22600 + }, + { + "epoch": 0.14444884555920423, + "grad_norm": 0.6430138945579529, + "learning_rate": 9.871955400291052e-05, + "loss": 1.0151, + "step": 22610 + }, + { + "epoch": 0.14451273270894294, + "grad_norm": 0.5157865881919861, + "learning_rate": 9.871842548089864e-05, + "loss": 1.0402, + "step": 22620 + }, + { + "epoch": 0.14457661985868162, + "grad_norm": 0.7073084115982056, + "learning_rate": 9.871729646825008e-05, + "loss": 1.1601, + "step": 22630 + }, + { + "epoch": 0.14464050700842032, + "grad_norm": 0.8356124758720398, + "learning_rate": 9.871616696497618e-05, + "loss": 0.7882, + "step": 
22640 + }, + { + "epoch": 0.14470439415815903, + "grad_norm": 0.7543877959251404, + "learning_rate": 9.871503697108833e-05, + "loss": 1.1977, + "step": 22650 + }, + { + "epoch": 0.14476828130789773, + "grad_norm": 0.5048431158065796, + "learning_rate": 9.871390648659793e-05, + "loss": 0.6942, + "step": 22660 + }, + { + "epoch": 0.14483216845763644, + "grad_norm": 0.8877227306365967, + "learning_rate": 9.871277551151635e-05, + "loss": 1.0161, + "step": 22670 + }, + { + "epoch": 0.14489605560737515, + "grad_norm": 1.6515774726867676, + "learning_rate": 9.871164404585496e-05, + "loss": 0.7984, + "step": 22680 + }, + { + "epoch": 0.14495994275711382, + "grad_norm": 0.7503309845924377, + "learning_rate": 9.871051208962518e-05, + "loss": 1.4356, + "step": 22690 + }, + { + "epoch": 0.14502382990685253, + "grad_norm": 0.5918260216712952, + "learning_rate": 9.87093796428384e-05, + "loss": 0.8047, + "step": 22700 + }, + { + "epoch": 0.14508771705659124, + "grad_norm": 0.7670891880989075, + "learning_rate": 9.870824670550603e-05, + "loss": 1.0355, + "step": 22710 + }, + { + "epoch": 0.14515160420632994, + "grad_norm": 0.7030889987945557, + "learning_rate": 9.870711327763947e-05, + "loss": 0.9419, + "step": 22720 + }, + { + "epoch": 0.14521549135606865, + "grad_norm": 1.9804078340530396, + "learning_rate": 9.870597935925016e-05, + "loss": 1.0519, + "step": 22730 + }, + { + "epoch": 0.14527937850580736, + "grad_norm": 0.5866715312004089, + "learning_rate": 9.870484495034948e-05, + "loss": 0.8467, + "step": 22740 + }, + { + "epoch": 0.14534326565554603, + "grad_norm": 1.0047521591186523, + "learning_rate": 9.87037100509489e-05, + "loss": 0.9234, + "step": 22750 + }, + { + "epoch": 0.14540715280528474, + "grad_norm": 0.8460586667060852, + "learning_rate": 9.87025746610598e-05, + "loss": 0.98, + "step": 22760 + }, + { + "epoch": 0.14547103995502345, + "grad_norm": 0.6952506303787231, + "learning_rate": 9.870143878069364e-05, + "loss": 0.8913, + "step": 22770 + }, + { + "epoch": 
0.14553492710476215, + "grad_norm": 0.8370442986488342, + "learning_rate": 9.870030240986188e-05, + "loss": 0.7564, + "step": 22780 + }, + { + "epoch": 0.14559881425450086, + "grad_norm": 2.1772940158843994, + "learning_rate": 9.869916554857593e-05, + "loss": 1.0058, + "step": 22790 + }, + { + "epoch": 0.14566270140423956, + "grad_norm": 1.9751546382904053, + "learning_rate": 9.869802819684726e-05, + "loss": 0.8494, + "step": 22800 + }, + { + "epoch": 0.14572658855397824, + "grad_norm": 1.1138042211532593, + "learning_rate": 9.86968903546873e-05, + "loss": 0.745, + "step": 22810 + }, + { + "epoch": 0.14579047570371695, + "grad_norm": 0.9470332264900208, + "learning_rate": 9.869575202210754e-05, + "loss": 0.9222, + "step": 22820 + }, + { + "epoch": 0.14585436285345565, + "grad_norm": 0.6957728862762451, + "learning_rate": 9.869461319911944e-05, + "loss": 1.0055, + "step": 22830 + }, + { + "epoch": 0.14591825000319436, + "grad_norm": 0.7304112911224365, + "learning_rate": 9.869347388573443e-05, + "loss": 0.8063, + "step": 22840 + }, + { + "epoch": 0.14598213715293307, + "grad_norm": 0.4859442710876465, + "learning_rate": 9.869233408196403e-05, + "loss": 0.7749, + "step": 22850 + }, + { + "epoch": 0.14604602430267177, + "grad_norm": 0.6382431387901306, + "learning_rate": 9.86911937878197e-05, + "loss": 0.9488, + "step": 22860 + }, + { + "epoch": 0.14610991145241045, + "grad_norm": 0.6626219153404236, + "learning_rate": 9.869005300331291e-05, + "loss": 0.6605, + "step": 22870 + }, + { + "epoch": 0.14617379860214916, + "grad_norm": 0.9865225553512573, + "learning_rate": 9.868891172845519e-05, + "loss": 1.0758, + "step": 22880 + }, + { + "epoch": 0.14623768575188786, + "grad_norm": 0.7838436365127563, + "learning_rate": 9.868776996325799e-05, + "loss": 1.0838, + "step": 22890 + }, + { + "epoch": 0.14630157290162657, + "grad_norm": 0.7881513833999634, + "learning_rate": 9.868662770773282e-05, + "loss": 0.8395, + "step": 22900 + }, + { + "epoch": 0.14636546005136528, + 
"grad_norm": 0.6249982118606567, + "learning_rate": 9.86854849618912e-05, + "loss": 1.0855, + "step": 22910 + }, + { + "epoch": 0.14642934720110398, + "grad_norm": 0.7879114151000977, + "learning_rate": 9.868434172574462e-05, + "loss": 1.0791, + "step": 22920 + }, + { + "epoch": 0.14649323435084266, + "grad_norm": 0.872688353061676, + "learning_rate": 9.86831979993046e-05, + "loss": 1.189, + "step": 22930 + }, + { + "epoch": 0.14655712150058137, + "grad_norm": 0.6431063413619995, + "learning_rate": 9.868205378258266e-05, + "loss": 1.0102, + "step": 22940 + }, + { + "epoch": 0.14662100865032007, + "grad_norm": 0.9336161017417908, + "learning_rate": 9.868090907559033e-05, + "loss": 1.1622, + "step": 22950 + }, + { + "epoch": 0.14668489580005878, + "grad_norm": 1.0055698156356812, + "learning_rate": 9.867976387833913e-05, + "loss": 0.8623, + "step": 22960 + }, + { + "epoch": 0.14674878294979748, + "grad_norm": 1.0225908756256104, + "learning_rate": 9.867861819084059e-05, + "loss": 0.7738, + "step": 22970 + }, + { + "epoch": 0.1468126700995362, + "grad_norm": 0.9196385741233826, + "learning_rate": 9.867747201310626e-05, + "loss": 0.8153, + "step": 22980 + }, + { + "epoch": 0.14687655724927487, + "grad_norm": 1.0798165798187256, + "learning_rate": 9.867632534514766e-05, + "loss": 0.9407, + "step": 22990 + }, + { + "epoch": 0.14694044439901358, + "grad_norm": 0.8176427483558655, + "learning_rate": 9.867517818697636e-05, + "loss": 0.9316, + "step": 23000 + }, + { + "epoch": 0.14700433154875228, + "grad_norm": 1.2678016424179077, + "learning_rate": 9.867403053860391e-05, + "loss": 0.7385, + "step": 23010 + }, + { + "epoch": 0.147068218698491, + "grad_norm": 1.1173145771026611, + "learning_rate": 9.867288240004185e-05, + "loss": 0.9177, + "step": 23020 + }, + { + "epoch": 0.1471321058482297, + "grad_norm": 0.6615016460418701, + "learning_rate": 9.867173377130177e-05, + "loss": 0.9355, + "step": 23030 + }, + { + "epoch": 0.1471959929979684, + "grad_norm": 0.5626130104064941, 
+ "learning_rate": 9.867058465239522e-05, + "loss": 0.73, + "step": 23040 + }, + { + "epoch": 0.14725988014770708, + "grad_norm": 0.9644745588302612, + "learning_rate": 9.866943504333377e-05, + "loss": 0.8876, + "step": 23050 + }, + { + "epoch": 0.14732376729744578, + "grad_norm": 1.4023088216781616, + "learning_rate": 9.866828494412901e-05, + "loss": 0.8923, + "step": 23060 + }, + { + "epoch": 0.1473876544471845, + "grad_norm": 0.6760227680206299, + "learning_rate": 9.866713435479252e-05, + "loss": 0.8072, + "step": 23070 + }, + { + "epoch": 0.1474515415969232, + "grad_norm": 0.9531158804893494, + "learning_rate": 9.866598327533589e-05, + "loss": 0.8004, + "step": 23080 + }, + { + "epoch": 0.1475154287466619, + "grad_norm": 0.6163201928138733, + "learning_rate": 9.866483170577069e-05, + "loss": 0.9639, + "step": 23090 + }, + { + "epoch": 0.1475793158964006, + "grad_norm": 0.6841567158699036, + "learning_rate": 9.866367964610854e-05, + "loss": 1.0902, + "step": 23100 + }, + { + "epoch": 0.1476432030461393, + "grad_norm": 0.8613043427467346, + "learning_rate": 9.866252709636104e-05, + "loss": 0.8745, + "step": 23110 + }, + { + "epoch": 0.147707090195878, + "grad_norm": 0.9095843434333801, + "learning_rate": 9.86613740565398e-05, + "loss": 0.8784, + "step": 23120 + }, + { + "epoch": 0.1477709773456167, + "grad_norm": 0.6751396059989929, + "learning_rate": 9.86602205266564e-05, + "loss": 1.0972, + "step": 23130 + }, + { + "epoch": 0.1478348644953554, + "grad_norm": 0.7569636106491089, + "learning_rate": 9.86590665067225e-05, + "loss": 1.0461, + "step": 23140 + }, + { + "epoch": 0.1478987516450941, + "grad_norm": 1.0290982723236084, + "learning_rate": 9.86579119967497e-05, + "loss": 1.112, + "step": 23150 + }, + { + "epoch": 0.14796263879483282, + "grad_norm": 0.6011145114898682, + "learning_rate": 9.865675699674964e-05, + "loss": 1.0506, + "step": 23160 + }, + { + "epoch": 0.1480265259445715, + "grad_norm": 0.8810587525367737, + "learning_rate": 9.865560150673392e-05, 
+ "loss": 0.9679, + "step": 23170 + }, + { + "epoch": 0.1480904130943102, + "grad_norm": 0.7942286133766174, + "learning_rate": 9.865444552671422e-05, + "loss": 0.8441, + "step": 23180 + }, + { + "epoch": 0.1481543002440489, + "grad_norm": 1.2883180379867554, + "learning_rate": 9.865328905670215e-05, + "loss": 0.9123, + "step": 23190 + }, + { + "epoch": 0.14821818739378761, + "grad_norm": 0.9160734415054321, + "learning_rate": 9.865213209670939e-05, + "loss": 0.8103, + "step": 23200 + }, + { + "epoch": 0.14828207454352632, + "grad_norm": 0.5292953848838806, + "learning_rate": 9.865097464674754e-05, + "loss": 0.7631, + "step": 23210 + }, + { + "epoch": 0.14834596169326503, + "grad_norm": 1.5886908769607544, + "learning_rate": 9.86498167068283e-05, + "loss": 0.9782, + "step": 23220 + }, + { + "epoch": 0.1484098488430037, + "grad_norm": 1.2354532480239868, + "learning_rate": 9.864865827696333e-05, + "loss": 1.0666, + "step": 23230 + }, + { + "epoch": 0.1484737359927424, + "grad_norm": 0.902732789516449, + "learning_rate": 9.864749935716427e-05, + "loss": 0.8587, + "step": 23240 + }, + { + "epoch": 0.14853762314248112, + "grad_norm": 0.9489061236381531, + "learning_rate": 9.86463399474428e-05, + "loss": 0.9015, + "step": 23250 + }, + { + "epoch": 0.14860151029221982, + "grad_norm": 1.0594868659973145, + "learning_rate": 9.86451800478106e-05, + "loss": 0.9639, + "step": 23260 + }, + { + "epoch": 0.14866539744195853, + "grad_norm": 0.9709058403968811, + "learning_rate": 9.864401965827936e-05, + "loss": 0.9575, + "step": 23270 + }, + { + "epoch": 0.14872928459169724, + "grad_norm": 0.7420225143432617, + "learning_rate": 9.864285877886076e-05, + "loss": 0.8139, + "step": 23280 + }, + { + "epoch": 0.14879317174143591, + "grad_norm": 1.2411167621612549, + "learning_rate": 9.86416974095665e-05, + "loss": 0.9154, + "step": 23290 + }, + { + "epoch": 0.14885705889117462, + "grad_norm": 0.9969791769981384, + "learning_rate": 9.864053555040826e-05, + "loss": 0.7712, + "step": 
23300 + }, + { + "epoch": 0.14892094604091333, + "grad_norm": 0.7000773549079895, + "learning_rate": 9.863937320139774e-05, + "loss": 0.9034, + "step": 23310 + }, + { + "epoch": 0.14898483319065203, + "grad_norm": 0.8266654014587402, + "learning_rate": 9.863821036254666e-05, + "loss": 0.9289, + "step": 23320 + }, + { + "epoch": 0.14904872034039074, + "grad_norm": 0.5291149616241455, + "learning_rate": 9.863704703386671e-05, + "loss": 0.8965, + "step": 23330 + }, + { + "epoch": 0.14911260749012945, + "grad_norm": 1.1645135879516602, + "learning_rate": 9.863588321536964e-05, + "loss": 1.0616, + "step": 23340 + }, + { + "epoch": 0.14917649463986812, + "grad_norm": 0.7084513902664185, + "learning_rate": 9.863471890706714e-05, + "loss": 1.0098, + "step": 23350 + }, + { + "epoch": 0.14924038178960683, + "grad_norm": 0.6941312551498413, + "learning_rate": 9.863355410897095e-05, + "loss": 0.9369, + "step": 23360 + }, + { + "epoch": 0.14930426893934554, + "grad_norm": 1.0156537294387817, + "learning_rate": 9.863238882109278e-05, + "loss": 1.1076, + "step": 23370 + }, + { + "epoch": 0.14936815608908424, + "grad_norm": 0.8023911714553833, + "learning_rate": 9.863122304344439e-05, + "loss": 0.8709, + "step": 23380 + }, + { + "epoch": 0.14943204323882295, + "grad_norm": 0.8865915536880493, + "learning_rate": 9.863005677603752e-05, + "loss": 0.8393, + "step": 23390 + }, + { + "epoch": 0.14949593038856165, + "grad_norm": 1.4520982503890991, + "learning_rate": 9.86288900188839e-05, + "loss": 1.138, + "step": 23400 + }, + { + "epoch": 0.14955981753830033, + "grad_norm": 1.1401234865188599, + "learning_rate": 9.862772277199529e-05, + "loss": 1.1788, + "step": 23410 + }, + { + "epoch": 0.14962370468803904, + "grad_norm": 0.632628858089447, + "learning_rate": 9.862655503538344e-05, + "loss": 0.8879, + "step": 23420 + }, + { + "epoch": 0.14968759183777774, + "grad_norm": 0.6416946649551392, + "learning_rate": 9.862538680906012e-05, + "loss": 0.8936, + "step": 23430 + }, + { + "epoch": 
0.14975147898751645, + "grad_norm": 0.6808968186378479, + "learning_rate": 9.862421809303708e-05, + "loss": 0.8778, + "step": 23440 + }, + { + "epoch": 0.14981536613725516, + "grad_norm": 0.9920696020126343, + "learning_rate": 9.86230488873261e-05, + "loss": 0.8278, + "step": 23450 + }, + { + "epoch": 0.14987925328699386, + "grad_norm": 0.8314083218574524, + "learning_rate": 9.862187919193895e-05, + "loss": 0.9445, + "step": 23460 + }, + { + "epoch": 0.14994314043673257, + "grad_norm": 0.7839555740356445, + "learning_rate": 9.862070900688742e-05, + "loss": 0.9105, + "step": 23470 + }, + { + "epoch": 0.15000702758647125, + "grad_norm": 0.7194756865501404, + "learning_rate": 9.861953833218329e-05, + "loss": 0.8104, + "step": 23480 + }, + { + "epoch": 0.15007091473620995, + "grad_norm": 0.8320297002792358, + "learning_rate": 9.861836716783834e-05, + "loss": 0.9076, + "step": 23490 + }, + { + "epoch": 0.15013480188594866, + "grad_norm": 0.744303822517395, + "learning_rate": 9.861719551386437e-05, + "loss": 0.7775, + "step": 23500 + }, + { + "epoch": 0.15019868903568737, + "grad_norm": 1.1499621868133545, + "learning_rate": 9.861602337027318e-05, + "loss": 1.0126, + "step": 23510 + }, + { + "epoch": 0.15026257618542607, + "grad_norm": 0.893481969833374, + "learning_rate": 9.861485073707658e-05, + "loss": 0.9876, + "step": 23520 + }, + { + "epoch": 0.15032646333516478, + "grad_norm": 1.0423784255981445, + "learning_rate": 9.861367761428638e-05, + "loss": 0.831, + "step": 23530 + }, + { + "epoch": 0.15039035048490346, + "grad_norm": 0.7774150371551514, + "learning_rate": 9.861250400191438e-05, + "loss": 0.8752, + "step": 23540 + }, + { + "epoch": 0.15045423763464216, + "grad_norm": 0.9276893138885498, + "learning_rate": 9.861132989997242e-05, + "loss": 0.815, + "step": 23550 + }, + { + "epoch": 0.15051812478438087, + "grad_norm": 1.5479460954666138, + "learning_rate": 9.86101553084723e-05, + "loss": 1.1705, + "step": 23560 + }, + { + "epoch": 0.15058201193411958, + 
"grad_norm": 1.3702467679977417, + "learning_rate": 9.860898022742587e-05, + "loss": 1.1229, + "step": 23570 + }, + { + "epoch": 0.15064589908385828, + "grad_norm": 0.8833318948745728, + "learning_rate": 9.860780465684497e-05, + "loss": 0.8501, + "step": 23580 + }, + { + "epoch": 0.150709786233597, + "grad_norm": 0.8857479691505432, + "learning_rate": 9.860662859674139e-05, + "loss": 0.9028, + "step": 23590 + }, + { + "epoch": 0.15077367338333567, + "grad_norm": 0.9464370608329773, + "learning_rate": 9.860545204712703e-05, + "loss": 0.8605, + "step": 23600 + }, + { + "epoch": 0.15083756053307437, + "grad_norm": 0.9219076037406921, + "learning_rate": 9.860427500801372e-05, + "loss": 0.8217, + "step": 23610 + }, + { + "epoch": 0.15090144768281308, + "grad_norm": 2.4392945766448975, + "learning_rate": 9.860309747941333e-05, + "loss": 0.8927, + "step": 23620 + }, + { + "epoch": 0.15096533483255178, + "grad_norm": 1.1871190071105957, + "learning_rate": 9.860191946133766e-05, + "loss": 1.1577, + "step": 23630 + }, + { + "epoch": 0.1510292219822905, + "grad_norm": 1.2772961854934692, + "learning_rate": 9.860074095379863e-05, + "loss": 0.9204, + "step": 23640 + }, + { + "epoch": 0.1510931091320292, + "grad_norm": 0.6214377284049988, + "learning_rate": 9.859956195680811e-05, + "loss": 0.8562, + "step": 23650 + }, + { + "epoch": 0.15115699628176787, + "grad_norm": 0.7957346439361572, + "learning_rate": 9.859838247037794e-05, + "loss": 0.7878, + "step": 23660 + }, + { + "epoch": 0.15122088343150658, + "grad_norm": 0.7047122716903687, + "learning_rate": 9.859720249452003e-05, + "loss": 0.9215, + "step": 23670 + }, + { + "epoch": 0.1512847705812453, + "grad_norm": 0.8219524025917053, + "learning_rate": 9.859602202924623e-05, + "loss": 0.884, + "step": 23680 + }, + { + "epoch": 0.151348657730984, + "grad_norm": 0.844274640083313, + "learning_rate": 9.859484107456846e-05, + "loss": 0.8565, + "step": 23690 + }, + { + "epoch": 0.1514125448807227, + "grad_norm": 0.8894696831703186, 
+ "learning_rate": 9.859365963049858e-05, + "loss": 0.8738, + "step": 23700 + }, + { + "epoch": 0.1514764320304614, + "grad_norm": 1.032109260559082, + "learning_rate": 9.859247769704854e-05, + "loss": 0.7034, + "step": 23710 + }, + { + "epoch": 0.15154031918020008, + "grad_norm": 0.8953695297241211, + "learning_rate": 9.859129527423019e-05, + "loss": 0.9061, + "step": 23720 + }, + { + "epoch": 0.1516042063299388, + "grad_norm": 0.7908507585525513, + "learning_rate": 9.859011236205547e-05, + "loss": 0.9427, + "step": 23730 + }, + { + "epoch": 0.1516680934796775, + "grad_norm": 0.7494611144065857, + "learning_rate": 9.858892896053626e-05, + "loss": 0.7095, + "step": 23740 + }, + { + "epoch": 0.1517319806294162, + "grad_norm": 0.7644729614257812, + "learning_rate": 9.858774506968451e-05, + "loss": 0.9053, + "step": 23750 + }, + { + "epoch": 0.1517958677791549, + "grad_norm": 1.1524786949157715, + "learning_rate": 9.858656068951215e-05, + "loss": 0.7965, + "step": 23760 + }, + { + "epoch": 0.15185975492889361, + "grad_norm": 0.8188411593437195, + "learning_rate": 9.858537582003107e-05, + "loss": 0.986, + "step": 23770 + }, + { + "epoch": 0.1519236420786323, + "grad_norm": 0.9521570801734924, + "learning_rate": 9.858419046125322e-05, + "loss": 0.791, + "step": 23780 + }, + { + "epoch": 0.151987529228371, + "grad_norm": 1.1801695823669434, + "learning_rate": 9.858300461319057e-05, + "loss": 0.8084, + "step": 23790 + }, + { + "epoch": 0.1520514163781097, + "grad_norm": 0.66313236951828, + "learning_rate": 9.8581818275855e-05, + "loss": 1.0134, + "step": 23800 + }, + { + "epoch": 0.1521153035278484, + "grad_norm": 0.7492579817771912, + "learning_rate": 9.85806314492585e-05, + "loss": 0.892, + "step": 23810 + }, + { + "epoch": 0.15217919067758712, + "grad_norm": 0.7110322713851929, + "learning_rate": 9.857944413341304e-05, + "loss": 1.1158, + "step": 23820 + }, + { + "epoch": 0.15224307782732582, + "grad_norm": 1.010519027709961, + "learning_rate": 9.857825632833053e-05, + 
"loss": 0.9537, + "step": 23830 + }, + { + "epoch": 0.1523069649770645, + "grad_norm": 0.8604142069816589, + "learning_rate": 9.857706803402294e-05, + "loss": 0.936, + "step": 23840 + }, + { + "epoch": 0.1523708521268032, + "grad_norm": 0.5838251113891602, + "learning_rate": 9.857587925050226e-05, + "loss": 0.9363, + "step": 23850 + }, + { + "epoch": 0.15243473927654191, + "grad_norm": 0.7778534889221191, + "learning_rate": 9.857468997778046e-05, + "loss": 1.0045, + "step": 23860 + }, + { + "epoch": 0.15249862642628062, + "grad_norm": 1.257494568824768, + "learning_rate": 9.85735002158695e-05, + "loss": 0.847, + "step": 23870 + }, + { + "epoch": 0.15256251357601933, + "grad_norm": 0.7079510688781738, + "learning_rate": 9.857230996478137e-05, + "loss": 1.0672, + "step": 23880 + }, + { + "epoch": 0.15262640072575803, + "grad_norm": 2.4514129161834717, + "learning_rate": 9.857111922452807e-05, + "loss": 0.7693, + "step": 23890 + }, + { + "epoch": 0.1526902878754967, + "grad_norm": 0.5904504060745239, + "learning_rate": 9.856992799512157e-05, + "loss": 0.9016, + "step": 23900 + }, + { + "epoch": 0.15275417502523542, + "grad_norm": 0.7344809770584106, + "learning_rate": 9.856873627657387e-05, + "loss": 0.7255, + "step": 23910 + }, + { + "epoch": 0.15281806217497412, + "grad_norm": 1.4561502933502197, + "learning_rate": 9.856754406889698e-05, + "loss": 0.9038, + "step": 23920 + }, + { + "epoch": 0.15288194932471283, + "grad_norm": 0.8599551916122437, + "learning_rate": 9.85663513721029e-05, + "loss": 0.7992, + "step": 23930 + }, + { + "epoch": 0.15294583647445154, + "grad_norm": 2.2323386669158936, + "learning_rate": 9.856515818620367e-05, + "loss": 0.9681, + "step": 23940 + }, + { + "epoch": 0.15300972362419024, + "grad_norm": 1.3280889987945557, + "learning_rate": 9.856396451121125e-05, + "loss": 0.6727, + "step": 23950 + }, + { + "epoch": 0.15307361077392892, + "grad_norm": 1.3691116571426392, + "learning_rate": 9.856277034713772e-05, + "loss": 0.8038, + "step": 23960 
+ }, + { + "epoch": 0.15313749792366763, + "grad_norm": 1.1116257905960083, + "learning_rate": 9.856157569399507e-05, + "loss": 0.749, + "step": 23970 + }, + { + "epoch": 0.15320138507340633, + "grad_norm": 1.1849030256271362, + "learning_rate": 9.856038055179535e-05, + "loss": 0.9773, + "step": 23980 + }, + { + "epoch": 0.15326527222314504, + "grad_norm": 0.88172447681427, + "learning_rate": 9.855918492055057e-05, + "loss": 1.1426, + "step": 23990 + }, + { + "epoch": 0.15332915937288374, + "grad_norm": 1.102968454360962, + "learning_rate": 9.855798880027279e-05, + "loss": 0.9212, + "step": 24000 + }, + { + "epoch": 0.15339304652262245, + "grad_norm": 1.179286003112793, + "learning_rate": 9.855679219097407e-05, + "loss": 0.9407, + "step": 24010 + }, + { + "epoch": 0.15345693367236113, + "grad_norm": 0.7198648452758789, + "learning_rate": 9.855559509266644e-05, + "loss": 0.9663, + "step": 24020 + }, + { + "epoch": 0.15352082082209983, + "grad_norm": 0.9259359240531921, + "learning_rate": 9.855439750536195e-05, + "loss": 1.0747, + "step": 24030 + }, + { + "epoch": 0.15358470797183854, + "grad_norm": 0.9067502021789551, + "learning_rate": 9.855319942907268e-05, + "loss": 0.7373, + "step": 24040 + }, + { + "epoch": 0.15364859512157725, + "grad_norm": 0.6593869924545288, + "learning_rate": 9.855200086381068e-05, + "loss": 0.9685, + "step": 24050 + }, + { + "epoch": 0.15371248227131595, + "grad_norm": 0.810939610004425, + "learning_rate": 9.855080180958803e-05, + "loss": 0.7862, + "step": 24060 + }, + { + "epoch": 0.15377636942105466, + "grad_norm": 0.8420569896697998, + "learning_rate": 9.854960226641681e-05, + "loss": 0.8562, + "step": 24070 + }, + { + "epoch": 0.15384025657079334, + "grad_norm": 0.7327421307563782, + "learning_rate": 9.854840223430909e-05, + "loss": 0.852, + "step": 24080 + }, + { + "epoch": 0.15390414372053204, + "grad_norm": 0.8360452055931091, + "learning_rate": 9.854720171327696e-05, + "loss": 0.9425, + "step": 24090 + }, + { + "epoch": 
0.15396803087027075, + "grad_norm": 0.6557414531707764, + "learning_rate": 9.854600070333251e-05, + "loss": 0.754, + "step": 24100 + }, + { + "epoch": 0.15403191802000946, + "grad_norm": 0.9082469940185547, + "learning_rate": 9.854479920448782e-05, + "loss": 0.9427, + "step": 24110 + }, + { + "epoch": 0.15409580516974816, + "grad_norm": 0.7796029448509216, + "learning_rate": 9.854359721675503e-05, + "loss": 0.8438, + "step": 24120 + }, + { + "epoch": 0.15415969231948687, + "grad_norm": 0.6190805435180664, + "learning_rate": 9.85423947401462e-05, + "loss": 0.8237, + "step": 24130 + }, + { + "epoch": 0.15422357946922555, + "grad_norm": 0.813653290271759, + "learning_rate": 9.854119177467347e-05, + "loss": 0.8553, + "step": 24140 + }, + { + "epoch": 0.15428746661896425, + "grad_norm": 0.8362258672714233, + "learning_rate": 9.853998832034894e-05, + "loss": 0.9488, + "step": 24150 + }, + { + "epoch": 0.15435135376870296, + "grad_norm": 1.0680490732192993, + "learning_rate": 9.853878437718473e-05, + "loss": 0.9838, + "step": 24160 + }, + { + "epoch": 0.15441524091844167, + "grad_norm": 1.0183037519454956, + "learning_rate": 9.853757994519299e-05, + "loss": 0.6685, + "step": 24170 + }, + { + "epoch": 0.15447912806818037, + "grad_norm": 0.7617247700691223, + "learning_rate": 9.853637502438582e-05, + "loss": 0.8784, + "step": 24180 + }, + { + "epoch": 0.15454301521791908, + "grad_norm": 0.633660614490509, + "learning_rate": 9.853516961477535e-05, + "loss": 0.8068, + "step": 24190 + }, + { + "epoch": 0.15460690236765776, + "grad_norm": 0.8987011313438416, + "learning_rate": 9.853396371637374e-05, + "loss": 0.6322, + "step": 24200 + }, + { + "epoch": 0.15467078951739646, + "grad_norm": 0.8973355889320374, + "learning_rate": 9.853275732919314e-05, + "loss": 1.2822, + "step": 24210 + }, + { + "epoch": 0.15473467666713517, + "grad_norm": 1.284421682357788, + "learning_rate": 9.853155045324567e-05, + "loss": 0.9294, + "step": 24220 + }, + { + "epoch": 0.15479856381687387, + 
"grad_norm": 1.0189619064331055, + "learning_rate": 9.85303430885435e-05, + "loss": 0.7897, + "step": 24230 + }, + { + "epoch": 0.15486245096661258, + "grad_norm": 0.8572905659675598, + "learning_rate": 9.85291352350988e-05, + "loss": 0.9204, + "step": 24240 + }, + { + "epoch": 0.1549263381163513, + "grad_norm": 1.0044801235198975, + "learning_rate": 9.852792689292373e-05, + "loss": 1.0265, + "step": 24250 + }, + { + "epoch": 0.15499022526608996, + "grad_norm": 0.8651962280273438, + "learning_rate": 9.852671806203045e-05, + "loss": 0.6892, + "step": 24260 + }, + { + "epoch": 0.15505411241582867, + "grad_norm": 1.309009075164795, + "learning_rate": 9.852550874243111e-05, + "loss": 1.0858, + "step": 24270 + }, + { + "epoch": 0.15511799956556738, + "grad_norm": 0.9584972262382507, + "learning_rate": 9.852429893413795e-05, + "loss": 0.9216, + "step": 24280 + }, + { + "epoch": 0.15518188671530608, + "grad_norm": 0.6010156869888306, + "learning_rate": 9.852308863716311e-05, + "loss": 0.8739, + "step": 24290 + }, + { + "epoch": 0.1552457738650448, + "grad_norm": 0.8952304124832153, + "learning_rate": 9.852187785151879e-05, + "loss": 0.9147, + "step": 24300 + }, + { + "epoch": 0.1553096610147835, + "grad_norm": 0.6536133885383606, + "learning_rate": 9.85206665772172e-05, + "loss": 0.8771, + "step": 24310 + }, + { + "epoch": 0.1553735481645222, + "grad_norm": 0.7753522992134094, + "learning_rate": 9.851945481427048e-05, + "loss": 1.0301, + "step": 24320 + }, + { + "epoch": 0.15543743531426088, + "grad_norm": 1.4516469240188599, + "learning_rate": 9.851824256269092e-05, + "loss": 1.0265, + "step": 24330 + }, + { + "epoch": 0.1555013224639996, + "grad_norm": 0.934195339679718, + "learning_rate": 9.851702982249065e-05, + "loss": 0.995, + "step": 24340 + }, + { + "epoch": 0.1555652096137383, + "grad_norm": 0.7957481741905212, + "learning_rate": 9.851581659368192e-05, + "loss": 0.8226, + "step": 24350 + }, + { + "epoch": 0.155629096763477, + "grad_norm": 0.7475680708885193, + 
"learning_rate": 9.851460287627695e-05, + "loss": 1.0825, + "step": 24360 + }, + { + "epoch": 0.1556929839132157, + "grad_norm": 0.65959233045578, + "learning_rate": 9.851338867028797e-05, + "loss": 1.1795, + "step": 24370 + }, + { + "epoch": 0.1557568710629544, + "grad_norm": 0.6770491600036621, + "learning_rate": 9.851217397572718e-05, + "loss": 0.9308, + "step": 24380 + }, + { + "epoch": 0.1558207582126931, + "grad_norm": 0.6056157946586609, + "learning_rate": 9.851095879260684e-05, + "loss": 1.0731, + "step": 24390 + }, + { + "epoch": 0.1558846453624318, + "grad_norm": 0.8914613127708435, + "learning_rate": 9.850974312093918e-05, + "loss": 0.7644, + "step": 24400 + }, + { + "epoch": 0.1559485325121705, + "grad_norm": 0.8289129137992859, + "learning_rate": 9.850852696073643e-05, + "loss": 1.1423, + "step": 24410 + }, + { + "epoch": 0.1560124196619092, + "grad_norm": 1.1932592391967773, + "learning_rate": 9.850731031201084e-05, + "loss": 0.7908, + "step": 24420 + }, + { + "epoch": 0.1560763068116479, + "grad_norm": 0.8615885376930237, + "learning_rate": 9.850609317477468e-05, + "loss": 0.7105, + "step": 24430 + }, + { + "epoch": 0.15614019396138662, + "grad_norm": 0.647098958492279, + "learning_rate": 9.85048755490402e-05, + "loss": 1.0009, + "step": 24440 + }, + { + "epoch": 0.1562040811111253, + "grad_norm": 0.6660744547843933, + "learning_rate": 9.850365743481965e-05, + "loss": 0.8714, + "step": 24450 + }, + { + "epoch": 0.156267968260864, + "grad_norm": 0.84688800573349, + "learning_rate": 9.850243883212531e-05, + "loss": 0.942, + "step": 24460 + }, + { + "epoch": 0.1563318554106027, + "grad_norm": 0.48218655586242676, + "learning_rate": 9.850121974096946e-05, + "loss": 1.0805, + "step": 24470 + }, + { + "epoch": 0.15639574256034142, + "grad_norm": 0.9218449592590332, + "learning_rate": 9.850000016136437e-05, + "loss": 1.0481, + "step": 24480 + }, + { + "epoch": 0.15645962971008012, + "grad_norm": 0.584633469581604, + "learning_rate": 9.849878009332231e-05, + 
"loss": 0.8474, + "step": 24490 + }, + { + "epoch": 0.15652351685981883, + "grad_norm": 0.8491461873054504, + "learning_rate": 9.849755953685557e-05, + "loss": 0.9905, + "step": 24500 + }, + { + "epoch": 0.1565874040095575, + "grad_norm": 0.961509644985199, + "learning_rate": 9.849633849197649e-05, + "loss": 1.1605, + "step": 24510 + }, + { + "epoch": 0.1566512911592962, + "grad_norm": 0.8623896241188049, + "learning_rate": 9.849511695869728e-05, + "loss": 0.7161, + "step": 24520 + }, + { + "epoch": 0.15671517830903492, + "grad_norm": 0.6448975205421448, + "learning_rate": 9.84938949370303e-05, + "loss": 0.9754, + "step": 24530 + }, + { + "epoch": 0.15677906545877363, + "grad_norm": 0.5791314244270325, + "learning_rate": 9.849267242698785e-05, + "loss": 0.7836, + "step": 24540 + }, + { + "epoch": 0.15684295260851233, + "grad_norm": 0.5874826312065125, + "learning_rate": 9.849144942858224e-05, + "loss": 0.8067, + "step": 24550 + }, + { + "epoch": 0.15690683975825104, + "grad_norm": 0.7695150375366211, + "learning_rate": 9.849022594182577e-05, + "loss": 1.153, + "step": 24560 + }, + { + "epoch": 0.15697072690798972, + "grad_norm": 0.7399982213973999, + "learning_rate": 9.848900196673079e-05, + "loss": 1.2349, + "step": 24570 + }, + { + "epoch": 0.15703461405772842, + "grad_norm": 0.8517500758171082, + "learning_rate": 9.848777750330961e-05, + "loss": 1.01, + "step": 24580 + }, + { + "epoch": 0.15709850120746713, + "grad_norm": 0.6582129001617432, + "learning_rate": 9.848655255157456e-05, + "loss": 0.71, + "step": 24590 + }, + { + "epoch": 0.15716238835720583, + "grad_norm": 0.5711886286735535, + "learning_rate": 9.848532711153797e-05, + "loss": 0.9785, + "step": 24600 + }, + { + "epoch": 0.15722627550694454, + "grad_norm": 0.7866716980934143, + "learning_rate": 9.848410118321221e-05, + "loss": 0.8093, + "step": 24610 + }, + { + "epoch": 0.15729016265668325, + "grad_norm": 0.6282891631126404, + "learning_rate": 9.848287476660958e-05, + "loss": 0.8937, + "step": 24620 
+ }, + { + "epoch": 0.15735404980642193, + "grad_norm": 1.6044594049453735, + "learning_rate": 9.848164786174248e-05, + "loss": 1.0449, + "step": 24630 + }, + { + "epoch": 0.15741793695616063, + "grad_norm": 1.279166579246521, + "learning_rate": 9.848042046862322e-05, + "loss": 1.2909, + "step": 24640 + }, + { + "epoch": 0.15748182410589934, + "grad_norm": 1.3262732028961182, + "learning_rate": 9.847919258726421e-05, + "loss": 0.9336, + "step": 24650 + }, + { + "epoch": 0.15754571125563804, + "grad_norm": 0.7303173542022705, + "learning_rate": 9.847796421767777e-05, + "loss": 0.8935, + "step": 24660 + }, + { + "epoch": 0.15760959840537675, + "grad_norm": 0.8746846914291382, + "learning_rate": 9.84767353598763e-05, + "loss": 0.8438, + "step": 24670 + }, + { + "epoch": 0.15767348555511546, + "grad_norm": 1.244907259941101, + "learning_rate": 9.847550601387217e-05, + "loss": 0.672, + "step": 24680 + }, + { + "epoch": 0.15773737270485413, + "grad_norm": 0.7882753610610962, + "learning_rate": 9.847427617967775e-05, + "loss": 0.8104, + "step": 24690 + }, + { + "epoch": 0.15780125985459284, + "grad_norm": 0.5869142413139343, + "learning_rate": 9.847304585730544e-05, + "loss": 0.9445, + "step": 24700 + }, + { + "epoch": 0.15786514700433155, + "grad_norm": 0.8743402361869812, + "learning_rate": 9.847181504676761e-05, + "loss": 1.0129, + "step": 24710 + }, + { + "epoch": 0.15792903415407025, + "grad_norm": 0.8246279358863831, + "learning_rate": 9.847058374807669e-05, + "loss": 0.8171, + "step": 24720 + }, + { + "epoch": 0.15799292130380896, + "grad_norm": 0.7410875558853149, + "learning_rate": 9.846935196124504e-05, + "loss": 0.9308, + "step": 24730 + }, + { + "epoch": 0.15805680845354766, + "grad_norm": 0.9520349502563477, + "learning_rate": 9.846811968628509e-05, + "loss": 1.0484, + "step": 24740 + }, + { + "epoch": 0.15812069560328634, + "grad_norm": 1.908379316329956, + "learning_rate": 9.846688692320925e-05, + "loss": 0.9074, + "step": 24750 + }, + { + "epoch": 
0.15818458275302505, + "grad_norm": 1.148059368133545, + "learning_rate": 9.846565367202992e-05, + "loss": 1.0573, + "step": 24760 + }, + { + "epoch": 0.15824846990276376, + "grad_norm": 0.6221771836280823, + "learning_rate": 9.846441993275952e-05, + "loss": 0.9355, + "step": 24770 + }, + { + "epoch": 0.15831235705250246, + "grad_norm": 0.7107810974121094, + "learning_rate": 9.84631857054105e-05, + "loss": 0.8245, + "step": 24780 + }, + { + "epoch": 0.15837624420224117, + "grad_norm": 2.3203704357147217, + "learning_rate": 9.846195098999527e-05, + "loss": 0.7197, + "step": 24790 + }, + { + "epoch": 0.15844013135197987, + "grad_norm": 0.8047979474067688, + "learning_rate": 9.846071578652627e-05, + "loss": 1.0095, + "step": 24800 + }, + { + "epoch": 0.15850401850171855, + "grad_norm": 0.848024845123291, + "learning_rate": 9.845948009501593e-05, + "loss": 0.9665, + "step": 24810 + }, + { + "epoch": 0.15856790565145726, + "grad_norm": 0.5435264706611633, + "learning_rate": 9.845824391547671e-05, + "loss": 0.7763, + "step": 24820 + }, + { + "epoch": 0.15863179280119596, + "grad_norm": 0.6636167764663696, + "learning_rate": 9.845700724792104e-05, + "loss": 0.966, + "step": 24830 + }, + { + "epoch": 0.15869567995093467, + "grad_norm": 0.9921244382858276, + "learning_rate": 9.84557700923614e-05, + "loss": 0.8407, + "step": 24840 + }, + { + "epoch": 0.15875956710067338, + "grad_norm": 0.6068295836448669, + "learning_rate": 9.845453244881022e-05, + "loss": 0.7625, + "step": 24850 + }, + { + "epoch": 0.15882345425041208, + "grad_norm": 0.5496127605438232, + "learning_rate": 9.845329431728e-05, + "loss": 0.8734, + "step": 24860 + }, + { + "epoch": 0.15888734140015076, + "grad_norm": 1.1657304763793945, + "learning_rate": 9.845205569778316e-05, + "loss": 0.8215, + "step": 24870 + }, + { + "epoch": 0.15895122854988947, + "grad_norm": 0.6050916910171509, + "learning_rate": 9.845081659033221e-05, + "loss": 0.7701, + "step": 24880 + }, + { + "epoch": 0.15901511569962817, + 
"grad_norm": 0.7160899043083191, + "learning_rate": 9.844957699493964e-05, + "loss": 1.0013, + "step": 24890 + }, + { + "epoch": 0.15907900284936688, + "grad_norm": 0.8572732210159302, + "learning_rate": 9.84483369116179e-05, + "loss": 0.8973, + "step": 24900 + }, + { + "epoch": 0.15914288999910559, + "grad_norm": 0.8619921803474426, + "learning_rate": 9.84470963403795e-05, + "loss": 0.9491, + "step": 24910 + }, + { + "epoch": 0.1592067771488443, + "grad_norm": 0.6899974942207336, + "learning_rate": 9.844585528123692e-05, + "loss": 0.8375, + "step": 24920 + }, + { + "epoch": 0.15927066429858297, + "grad_norm": 0.7540447115898132, + "learning_rate": 9.844461373420267e-05, + "loss": 0.8525, + "step": 24930 + }, + { + "epoch": 0.15933455144832168, + "grad_norm": 0.8030637502670288, + "learning_rate": 9.844337169928926e-05, + "loss": 0.8833, + "step": 24940 + }, + { + "epoch": 0.15939843859806038, + "grad_norm": 0.8504492044448853, + "learning_rate": 9.844212917650917e-05, + "loss": 0.9273, + "step": 24950 + }, + { + "epoch": 0.1594623257477991, + "grad_norm": 1.3353928327560425, + "learning_rate": 9.844088616587493e-05, + "loss": 0.8097, + "step": 24960 + }, + { + "epoch": 0.1595262128975378, + "grad_norm": 1.6527575254440308, + "learning_rate": 9.843964266739907e-05, + "loss": 0.7699, + "step": 24970 + }, + { + "epoch": 0.1595901000472765, + "grad_norm": 0.6608484387397766, + "learning_rate": 9.84383986810941e-05, + "loss": 0.9171, + "step": 24980 + }, + { + "epoch": 0.15965398719701518, + "grad_norm": 0.8177617788314819, + "learning_rate": 9.843715420697254e-05, + "loss": 0.9391, + "step": 24990 + }, + { + "epoch": 0.15971787434675389, + "grad_norm": 0.8526275753974915, + "learning_rate": 9.843590924504696e-05, + "loss": 0.9272, + "step": 25000 + }, + { + "epoch": 0.1597817614964926, + "grad_norm": 0.753639817237854, + "learning_rate": 9.843466379532985e-05, + "loss": 0.6739, + "step": 25010 + }, + { + "epoch": 0.1598456486462313, + "grad_norm": 0.8092784881591797, 
+ "learning_rate": 9.843341785783377e-05, + "loss": 0.7158, + "step": 25020 + }, + { + "epoch": 0.15990953579597, + "grad_norm": 1.0467857122421265, + "learning_rate": 9.843217143257126e-05, + "loss": 0.7562, + "step": 25030 + }, + { + "epoch": 0.1599734229457087, + "grad_norm": 0.5774504542350769, + "learning_rate": 9.843092451955491e-05, + "loss": 0.7832, + "step": 25040 + }, + { + "epoch": 0.1600373100954474, + "grad_norm": 1.2460720539093018, + "learning_rate": 9.842967711879725e-05, + "loss": 0.7436, + "step": 25050 + }, + { + "epoch": 0.1601011972451861, + "grad_norm": 1.1241377592086792, + "learning_rate": 9.842842923031084e-05, + "loss": 0.7252, + "step": 25060 + }, + { + "epoch": 0.1601650843949248, + "grad_norm": 0.565549910068512, + "learning_rate": 9.842718085410823e-05, + "loss": 0.8209, + "step": 25070 + }, + { + "epoch": 0.1602289715446635, + "grad_norm": 0.5020076036453247, + "learning_rate": 9.842593199020203e-05, + "loss": 0.661, + "step": 25080 + }, + { + "epoch": 0.1602928586944022, + "grad_norm": 0.7566838264465332, + "learning_rate": 9.842480759571027e-05, + "loss": 0.9231, + "step": 25090 + }, + { + "epoch": 0.16035674584414092, + "grad_norm": 0.9458664655685425, + "learning_rate": 9.842355780520187e-05, + "loss": 0.7723, + "step": 25100 + }, + { + "epoch": 0.1604206329938796, + "grad_norm": 1.0208630561828613, + "learning_rate": 9.842230752702635e-05, + "loss": 0.7883, + "step": 25110 + }, + { + "epoch": 0.1604845201436183, + "grad_norm": 0.7197948098182678, + "learning_rate": 9.84210567611963e-05, + "loss": 0.8673, + "step": 25120 + }, + { + "epoch": 0.160548407293357, + "grad_norm": 0.9319686889648438, + "learning_rate": 9.841980550772433e-05, + "loss": 0.7893, + "step": 25130 + }, + { + "epoch": 0.16061229444309572, + "grad_norm": 0.8447830677032471, + "learning_rate": 9.841855376662302e-05, + "loss": 1.0086, + "step": 25140 + }, + { + "epoch": 0.16067618159283442, + "grad_norm": 1.1380891799926758, + "learning_rate": 
9.841730153790499e-05, + "loss": 0.7411, + "step": 25150 + }, + { + "epoch": 0.16074006874257313, + "grad_norm": 0.988677442073822, + "learning_rate": 9.841604882158285e-05, + "loss": 0.8238, + "step": 25160 + }, + { + "epoch": 0.16080395589231183, + "grad_norm": 0.6261546611785889, + "learning_rate": 9.84147956176692e-05, + "loss": 0.9153, + "step": 25170 + }, + { + "epoch": 0.1608678430420505, + "grad_norm": 1.1242022514343262, + "learning_rate": 9.841354192617667e-05, + "loss": 0.8479, + "step": 25180 + }, + { + "epoch": 0.16093173019178922, + "grad_norm": 0.8760757446289062, + "learning_rate": 9.84122877471179e-05, + "loss": 1.0954, + "step": 25190 + }, + { + "epoch": 0.16099561734152792, + "grad_norm": 0.8859489560127258, + "learning_rate": 9.841103308050552e-05, + "loss": 0.6732, + "step": 25200 + }, + { + "epoch": 0.16105950449126663, + "grad_norm": 1.3529788255691528, + "learning_rate": 9.840977792635215e-05, + "loss": 1.0534, + "step": 25210 + }, + { + "epoch": 0.16112339164100534, + "grad_norm": 0.721413254737854, + "learning_rate": 9.840852228467041e-05, + "loss": 0.9705, + "step": 25220 + }, + { + "epoch": 0.16118727879074404, + "grad_norm": 0.9626721739768982, + "learning_rate": 9.8407266155473e-05, + "loss": 0.8799, + "step": 25230 + }, + { + "epoch": 0.16125116594048272, + "grad_norm": 0.5856235027313232, + "learning_rate": 9.840600953877253e-05, + "loss": 1.2152, + "step": 25240 + }, + { + "epoch": 0.16131505309022143, + "grad_norm": 1.074049711227417, + "learning_rate": 9.840475243458167e-05, + "loss": 0.8506, + "step": 25250 + }, + { + "epoch": 0.16137894023996013, + "grad_norm": 0.7193922400474548, + "learning_rate": 9.840349484291308e-05, + "loss": 0.7899, + "step": 25260 + }, + { + "epoch": 0.16144282738969884, + "grad_norm": 1.0390762090682983, + "learning_rate": 9.840223676377942e-05, + "loss": 0.9389, + "step": 25270 + }, + { + "epoch": 0.16150671453943755, + "grad_norm": 1.7726080417633057, + "learning_rate": 9.840097819719336e-05, + 
"loss": 0.9474, + "step": 25280 + }, + { + "epoch": 0.16157060168917625, + "grad_norm": 0.6403753757476807, + "learning_rate": 9.839971914316757e-05, + "loss": 0.8837, + "step": 25290 + }, + { + "epoch": 0.16163448883891493, + "grad_norm": 0.8878451585769653, + "learning_rate": 9.839845960171475e-05, + "loss": 0.9911, + "step": 25300 + }, + { + "epoch": 0.16169837598865364, + "grad_norm": 0.9376581907272339, + "learning_rate": 9.839719957284756e-05, + "loss": 1.0247, + "step": 25310 + }, + { + "epoch": 0.16176226313839234, + "grad_norm": 0.6702033877372742, + "learning_rate": 9.839593905657871e-05, + "loss": 0.9453, + "step": 25320 + }, + { + "epoch": 0.16182615028813105, + "grad_norm": 0.4987049400806427, + "learning_rate": 9.839467805292089e-05, + "loss": 0.7227, + "step": 25330 + }, + { + "epoch": 0.16189003743786975, + "grad_norm": 0.6735382080078125, + "learning_rate": 9.839341656188677e-05, + "loss": 0.8046, + "step": 25340 + }, + { + "epoch": 0.16195392458760846, + "grad_norm": 0.8256925344467163, + "learning_rate": 9.839215458348909e-05, + "loss": 1.342, + "step": 25350 + }, + { + "epoch": 0.16201781173734714, + "grad_norm": 1.0099321603775024, + "learning_rate": 9.839089211774056e-05, + "loss": 0.9079, + "step": 25360 + }, + { + "epoch": 0.16208169888708585, + "grad_norm": 0.9464432597160339, + "learning_rate": 9.838962916465388e-05, + "loss": 1.0935, + "step": 25370 + }, + { + "epoch": 0.16214558603682455, + "grad_norm": 1.0927412509918213, + "learning_rate": 9.838836572424176e-05, + "loss": 0.6795, + "step": 25380 + }, + { + "epoch": 0.16220947318656326, + "grad_norm": 0.6880885362625122, + "learning_rate": 9.838710179651694e-05, + "loss": 0.9407, + "step": 25390 + }, + { + "epoch": 0.16227336033630196, + "grad_norm": 0.9150338768959045, + "learning_rate": 9.838583738149215e-05, + "loss": 0.9107, + "step": 25400 + }, + { + "epoch": 0.16233724748604067, + "grad_norm": 1.087501049041748, + "learning_rate": 9.838457247918012e-05, + "loss": 0.7319, + "step": 
25410 + }, + { + "epoch": 0.16240113463577935, + "grad_norm": 0.7410935163497925, + "learning_rate": 9.838330708959358e-05, + "loss": 0.755, + "step": 25420 + }, + { + "epoch": 0.16246502178551805, + "grad_norm": 0.7320923209190369, + "learning_rate": 9.838204121274527e-05, + "loss": 0.9022, + "step": 25430 + }, + { + "epoch": 0.16252890893525676, + "grad_norm": 0.7874162793159485, + "learning_rate": 9.838077484864796e-05, + "loss": 0.8658, + "step": 25440 + }, + { + "epoch": 0.16259279608499547, + "grad_norm": 0.6988115310668945, + "learning_rate": 9.83795079973144e-05, + "loss": 0.8592, + "step": 25450 + }, + { + "epoch": 0.16265668323473417, + "grad_norm": 1.1536266803741455, + "learning_rate": 9.837824065875733e-05, + "loss": 0.9147, + "step": 25460 + }, + { + "epoch": 0.16272057038447288, + "grad_norm": 0.8450478911399841, + "learning_rate": 9.837697283298952e-05, + "loss": 0.8379, + "step": 25470 + }, + { + "epoch": 0.16278445753421156, + "grad_norm": 0.545207679271698, + "learning_rate": 9.837570452002375e-05, + "loss": 0.8029, + "step": 25480 + }, + { + "epoch": 0.16284834468395026, + "grad_norm": 0.8712167739868164, + "learning_rate": 9.837443571987277e-05, + "loss": 0.9546, + "step": 25490 + }, + { + "epoch": 0.16291223183368897, + "grad_norm": 1.3103466033935547, + "learning_rate": 9.837316643254938e-05, + "loss": 0.8578, + "step": 25500 + }, + { + "epoch": 0.16297611898342768, + "grad_norm": 0.6110614538192749, + "learning_rate": 9.837189665806637e-05, + "loss": 0.9893, + "step": 25510 + }, + { + "epoch": 0.16304000613316638, + "grad_norm": 0.8567008972167969, + "learning_rate": 9.83706263964365e-05, + "loss": 0.7717, + "step": 25520 + }, + { + "epoch": 0.1631038932829051, + "grad_norm": 0.844247579574585, + "learning_rate": 9.836935564767257e-05, + "loss": 1.0621, + "step": 25530 + }, + { + "epoch": 0.16316778043264377, + "grad_norm": 0.8914816379547119, + "learning_rate": 9.836808441178739e-05, + "loss": 0.8287, + "step": 25540 + }, + { + "epoch": 
0.16323166758238247, + "grad_norm": 0.6251090168952942, + "learning_rate": 9.836681268879377e-05, + "loss": 1.051, + "step": 25550 + }, + { + "epoch": 0.16329555473212118, + "grad_norm": 0.6964147090911865, + "learning_rate": 9.836554047870447e-05, + "loss": 0.9595, + "step": 25560 + }, + { + "epoch": 0.16335944188185988, + "grad_norm": 0.5562779307365417, + "learning_rate": 9.836426778153236e-05, + "loss": 0.8304, + "step": 25570 + }, + { + "epoch": 0.1634233290315986, + "grad_norm": 0.6539714336395264, + "learning_rate": 9.836299459729023e-05, + "loss": 0.9026, + "step": 25580 + }, + { + "epoch": 0.1634872161813373, + "grad_norm": 0.789167582988739, + "learning_rate": 9.836172092599089e-05, + "loss": 0.806, + "step": 25590 + }, + { + "epoch": 0.16355110333107598, + "grad_norm": 0.7832333445549011, + "learning_rate": 9.83604467676472e-05, + "loss": 0.8309, + "step": 25600 + }, + { + "epoch": 0.16361499048081468, + "grad_norm": 0.9938201308250427, + "learning_rate": 9.835917212227197e-05, + "loss": 0.9, + "step": 25610 + }, + { + "epoch": 0.1636788776305534, + "grad_norm": 0.7347666621208191, + "learning_rate": 9.835789698987802e-05, + "loss": 0.7665, + "step": 25620 + }, + { + "epoch": 0.1637427647802921, + "grad_norm": 0.7416117191314697, + "learning_rate": 9.835662137047824e-05, + "loss": 0.8239, + "step": 25630 + }, + { + "epoch": 0.1638066519300308, + "grad_norm": 0.6439573764801025, + "learning_rate": 9.835534526408543e-05, + "loss": 0.9106, + "step": 25640 + }, + { + "epoch": 0.1638705390797695, + "grad_norm": 1.0646562576293945, + "learning_rate": 9.835406867071247e-05, + "loss": 0.7518, + "step": 25650 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 1.135383129119873, + "learning_rate": 9.83527915903722e-05, + "loss": 0.723, + "step": 25660 + }, + { + "epoch": 0.1639983133792469, + "grad_norm": 0.9141467213630676, + "learning_rate": 9.83515140230775e-05, + "loss": 1.104, + "step": 25670 + }, + { + "epoch": 0.1640622005289856, + "grad_norm": 
0.7846889495849609, + "learning_rate": 9.83502359688412e-05, + "loss": 0.8969, + "step": 25680 + }, + { + "epoch": 0.1641260876787243, + "grad_norm": 0.8037777543067932, + "learning_rate": 9.834895742767622e-05, + "loss": 0.9751, + "step": 25690 + }, + { + "epoch": 0.164189974828463, + "grad_norm": 1.0449095964431763, + "learning_rate": 9.83476783995954e-05, + "loss": 1.0799, + "step": 25700 + }, + { + "epoch": 0.16425386197820172, + "grad_norm": 0.6123198866844177, + "learning_rate": 9.834639888461162e-05, + "loss": 0.8884, + "step": 25710 + }, + { + "epoch": 0.1643177491279404, + "grad_norm": 0.7933758497238159, + "learning_rate": 9.834511888273778e-05, + "loss": 0.9816, + "step": 25720 + }, + { + "epoch": 0.1643816362776791, + "grad_norm": 1.233192801475525, + "learning_rate": 9.83438383939868e-05, + "loss": 0.9493, + "step": 25730 + }, + { + "epoch": 0.1644455234274178, + "grad_norm": 0.9002760052680969, + "learning_rate": 9.834255741837151e-05, + "loss": 1.0682, + "step": 25740 + }, + { + "epoch": 0.1645094105771565, + "grad_norm": 0.6131082773208618, + "learning_rate": 9.834127595590485e-05, + "loss": 0.964, + "step": 25750 + }, + { + "epoch": 0.16457329772689522, + "grad_norm": 1.53384530544281, + "learning_rate": 9.833999400659972e-05, + "loss": 0.9393, + "step": 25760 + }, + { + "epoch": 0.16463718487663392, + "grad_norm": 0.8691433072090149, + "learning_rate": 9.833871157046904e-05, + "loss": 0.708, + "step": 25770 + }, + { + "epoch": 0.1647010720263726, + "grad_norm": 0.6749919652938843, + "learning_rate": 9.833742864752571e-05, + "loss": 1.1174, + "step": 25780 + }, + { + "epoch": 0.1647649591761113, + "grad_norm": 0.6683396100997925, + "learning_rate": 9.833614523778266e-05, + "loss": 0.8302, + "step": 25790 + }, + { + "epoch": 0.16482884632585001, + "grad_norm": 0.8051975965499878, + "learning_rate": 9.833486134125281e-05, + "loss": 1.2393, + "step": 25800 + }, + { + "epoch": 0.16489273347558872, + "grad_norm": 0.6575607657432556, + "learning_rate": 
9.833357695794909e-05, + "loss": 1.0257, + "step": 25810 + }, + { + "epoch": 0.16495662062532743, + "grad_norm": 0.9496917128562927, + "learning_rate": 9.833229208788443e-05, + "loss": 0.8261, + "step": 25820 + }, + { + "epoch": 0.16502050777506613, + "grad_norm": 0.7231150269508362, + "learning_rate": 9.833100673107179e-05, + "loss": 0.6341, + "step": 25830 + }, + { + "epoch": 0.1650843949248048, + "grad_norm": 1.2418237924575806, + "learning_rate": 9.832972088752407e-05, + "loss": 0.803, + "step": 25840 + }, + { + "epoch": 0.16514828207454352, + "grad_norm": 0.6519736051559448, + "learning_rate": 9.832843455725427e-05, + "loss": 0.918, + "step": 25850 + }, + { + "epoch": 0.16521216922428222, + "grad_norm": 0.6396727561950684, + "learning_rate": 9.832714774027534e-05, + "loss": 1.0144, + "step": 25860 + }, + { + "epoch": 0.16527605637402093, + "grad_norm": 1.0266163349151611, + "learning_rate": 9.832586043660019e-05, + "loss": 0.7874, + "step": 25870 + }, + { + "epoch": 0.16533994352375964, + "grad_norm": 0.9573850035667419, + "learning_rate": 9.832457264624184e-05, + "loss": 0.8346, + "step": 25880 + }, + { + "epoch": 0.16540383067349834, + "grad_norm": 0.7382820844650269, + "learning_rate": 9.832328436921324e-05, + "loss": 0.7884, + "step": 25890 + }, + { + "epoch": 0.16546771782323702, + "grad_norm": 0.8257744908332825, + "learning_rate": 9.832199560552734e-05, + "loss": 0.9137, + "step": 25900 + }, + { + "epoch": 0.16553160497297573, + "grad_norm": 0.4377366006374359, + "learning_rate": 9.832070635519715e-05, + "loss": 0.7715, + "step": 25910 + }, + { + "epoch": 0.16559549212271443, + "grad_norm": 0.6788588166236877, + "learning_rate": 9.831941661823564e-05, + "loss": 0.8829, + "step": 25920 + }, + { + "epoch": 0.16565937927245314, + "grad_norm": 0.7223168611526489, + "learning_rate": 9.831812639465581e-05, + "loss": 0.9969, + "step": 25930 + }, + { + "epoch": 0.16572326642219185, + "grad_norm": 0.5885007977485657, + "learning_rate": 9.831683568447064e-05, + 
"loss": 0.8589, + "step": 25940 + }, + { + "epoch": 0.16578715357193055, + "grad_norm": 0.4553689956665039, + "learning_rate": 9.831554448769314e-05, + "loss": 1.0332, + "step": 25950 + }, + { + "epoch": 0.16585104072166926, + "grad_norm": 0.8313513398170471, + "learning_rate": 9.831425280433631e-05, + "loss": 0.8301, + "step": 25960 + }, + { + "epoch": 0.16591492787140794, + "grad_norm": 0.8566281795501709, + "learning_rate": 9.831296063441315e-05, + "loss": 0.8196, + "step": 25970 + }, + { + "epoch": 0.16597881502114664, + "grad_norm": 0.9477049708366394, + "learning_rate": 9.831166797793668e-05, + "loss": 1.0331, + "step": 25980 + }, + { + "epoch": 0.16604270217088535, + "grad_norm": 0.9263975620269775, + "learning_rate": 9.831037483491991e-05, + "loss": 1.0746, + "step": 25990 + }, + { + "epoch": 0.16610658932062405, + "grad_norm": 0.5900850296020508, + "learning_rate": 9.83090812053759e-05, + "loss": 0.9527, + "step": 26000 + }, + { + "epoch": 0.16617047647036276, + "grad_norm": 1.1755515336990356, + "learning_rate": 9.830778708931762e-05, + "loss": 0.8315, + "step": 26010 + }, + { + "epoch": 0.16623436362010147, + "grad_norm": 0.9088423848152161, + "learning_rate": 9.830649248675814e-05, + "loss": 0.9244, + "step": 26020 + }, + { + "epoch": 0.16629825076984014, + "grad_norm": 0.7324301600456238, + "learning_rate": 9.83051973977105e-05, + "loss": 0.904, + "step": 26030 + }, + { + "epoch": 0.16636213791957885, + "grad_norm": 0.8652370572090149, + "learning_rate": 9.830390182218771e-05, + "loss": 0.9646, + "step": 26040 + }, + { + "epoch": 0.16642602506931756, + "grad_norm": 1.0668914318084717, + "learning_rate": 9.830260576020286e-05, + "loss": 1.0844, + "step": 26050 + }, + { + "epoch": 0.16648991221905626, + "grad_norm": 1.8865516185760498, + "learning_rate": 9.830130921176898e-05, + "loss": 0.9959, + "step": 26060 + }, + { + "epoch": 0.16655379936879497, + "grad_norm": 0.7931020855903625, + "learning_rate": 9.830001217689913e-05, + "loss": 0.8263, + "step": 
26070 + }, + { + "epoch": 0.16661768651853368, + "grad_norm": 1.674660325050354, + "learning_rate": 9.829871465560637e-05, + "loss": 0.8527, + "step": 26080 + }, + { + "epoch": 0.16668157366827235, + "grad_norm": 1.0010359287261963, + "learning_rate": 9.829741664790376e-05, + "loss": 0.7847, + "step": 26090 + }, + { + "epoch": 0.16674546081801106, + "grad_norm": 1.0310410261154175, + "learning_rate": 9.829611815380439e-05, + "loss": 0.8471, + "step": 26100 + }, + { + "epoch": 0.16680934796774977, + "grad_norm": 0.8554787039756775, + "learning_rate": 9.829481917332132e-05, + "loss": 0.8849, + "step": 26110 + }, + { + "epoch": 0.16687323511748847, + "grad_norm": 1.3090757131576538, + "learning_rate": 9.829351970646764e-05, + "loss": 0.921, + "step": 26120 + }, + { + "epoch": 0.16693712226722718, + "grad_norm": 1.068739414215088, + "learning_rate": 9.829221975325644e-05, + "loss": 0.9898, + "step": 26130 + }, + { + "epoch": 0.16700100941696588, + "grad_norm": 0.8239749073982239, + "learning_rate": 9.829091931370082e-05, + "loss": 1.1161, + "step": 26140 + }, + { + "epoch": 0.16706489656670456, + "grad_norm": 1.0234822034835815, + "learning_rate": 9.828961838781385e-05, + "loss": 1.0181, + "step": 26150 + }, + { + "epoch": 0.16712878371644327, + "grad_norm": 1.0589160919189453, + "learning_rate": 9.828831697560865e-05, + "loss": 1.0243, + "step": 26160 + }, + { + "epoch": 0.16719267086618197, + "grad_norm": 0.8593624234199524, + "learning_rate": 9.828701507709832e-05, + "loss": 1.3933, + "step": 26170 + }, + { + "epoch": 0.16725655801592068, + "grad_norm": 1.0242136716842651, + "learning_rate": 9.828571269229598e-05, + "loss": 0.9601, + "step": 26180 + }, + { + "epoch": 0.1673204451656594, + "grad_norm": 0.47015684843063354, + "learning_rate": 9.828440982121473e-05, + "loss": 1.2651, + "step": 26190 + }, + { + "epoch": 0.1673843323153981, + "grad_norm": 0.9608283042907715, + "learning_rate": 9.828310646386772e-05, + "loss": 0.7508, + "step": 26200 + }, + { + "epoch": 
0.16744821946513677, + "grad_norm": 0.8850206136703491, + "learning_rate": 9.828180262026805e-05, + "loss": 0.9822, + "step": 26210 + }, + { + "epoch": 0.16751210661487548, + "grad_norm": 1.7484781742095947, + "learning_rate": 9.828049829042884e-05, + "loss": 0.9558, + "step": 26220 + }, + { + "epoch": 0.16757599376461418, + "grad_norm": 0.625224769115448, + "learning_rate": 9.827919347436328e-05, + "loss": 0.8881, + "step": 26230 + }, + { + "epoch": 0.1676398809143529, + "grad_norm": 0.576524555683136, + "learning_rate": 9.827788817208444e-05, + "loss": 0.9399, + "step": 26240 + }, + { + "epoch": 0.1677037680640916, + "grad_norm": 1.0603713989257812, + "learning_rate": 9.827658238360553e-05, + "loss": 0.8588, + "step": 26250 + }, + { + "epoch": 0.1677676552138303, + "grad_norm": 0.7877979278564453, + "learning_rate": 9.827527610893964e-05, + "loss": 0.8973, + "step": 26260 + }, + { + "epoch": 0.16783154236356898, + "grad_norm": 1.2610846757888794, + "learning_rate": 9.827396934809997e-05, + "loss": 0.7684, + "step": 26270 + }, + { + "epoch": 0.1678954295133077, + "grad_norm": 0.49026232957839966, + "learning_rate": 9.827266210109967e-05, + "loss": 1.061, + "step": 26280 + }, + { + "epoch": 0.1679593166630464, + "grad_norm": 1.0687637329101562, + "learning_rate": 9.827135436795189e-05, + "loss": 0.8798, + "step": 26290 + }, + { + "epoch": 0.1680232038127851, + "grad_norm": 0.9565626978874207, + "learning_rate": 9.827004614866981e-05, + "loss": 0.8781, + "step": 26300 + }, + { + "epoch": 0.1680870909625238, + "grad_norm": 1.148451566696167, + "learning_rate": 9.826873744326661e-05, + "loss": 0.7915, + "step": 26310 + }, + { + "epoch": 0.1681509781122625, + "grad_norm": 0.6154188513755798, + "learning_rate": 9.826742825175547e-05, + "loss": 0.8317, + "step": 26320 + }, + { + "epoch": 0.1682148652620012, + "grad_norm": 0.9438403844833374, + "learning_rate": 9.826611857414957e-05, + "loss": 0.8347, + "step": 26330 + }, + { + "epoch": 0.1682787524117399, + "grad_norm": 
0.6729276776313782, + "learning_rate": 9.82648084104621e-05, + "loss": 0.969, + "step": 26340 + }, + { + "epoch": 0.1683426395614786, + "grad_norm": 0.6888546943664551, + "learning_rate": 9.826349776070625e-05, + "loss": 1.0223, + "step": 26350 + }, + { + "epoch": 0.1684065267112173, + "grad_norm": 0.8470525741577148, + "learning_rate": 9.826218662489521e-05, + "loss": 0.8919, + "step": 26360 + }, + { + "epoch": 0.16847041386095601, + "grad_norm": 0.653862714767456, + "learning_rate": 9.826087500304222e-05, + "loss": 1.0743, + "step": 26370 + }, + { + "epoch": 0.16853430101069472, + "grad_norm": 0.7015219926834106, + "learning_rate": 9.825956289516046e-05, + "loss": 1.1053, + "step": 26380 + }, + { + "epoch": 0.1685981881604334, + "grad_norm": 1.116733193397522, + "learning_rate": 9.825825030126315e-05, + "loss": 1.199, + "step": 26390 + }, + { + "epoch": 0.1686620753101721, + "grad_norm": 0.8197908401489258, + "learning_rate": 9.825693722136351e-05, + "loss": 0.9155, + "step": 26400 + }, + { + "epoch": 0.1687259624599108, + "grad_norm": 0.9840227365493774, + "learning_rate": 9.825562365547477e-05, + "loss": 0.9655, + "step": 26410 + }, + { + "epoch": 0.16878984960964952, + "grad_norm": 0.6856445074081421, + "learning_rate": 9.825430960361015e-05, + "loss": 0.7135, + "step": 26420 + }, + { + "epoch": 0.16885373675938822, + "grad_norm": 1.0108433961868286, + "learning_rate": 9.825299506578288e-05, + "loss": 1.1918, + "step": 26430 + }, + { + "epoch": 0.16891762390912693, + "grad_norm": 0.7306868433952332, + "learning_rate": 9.82516800420062e-05, + "loss": 0.9422, + "step": 26440 + }, + { + "epoch": 0.1689815110588656, + "grad_norm": 0.8072736859321594, + "learning_rate": 9.825036453229336e-05, + "loss": 0.8563, + "step": 26450 + }, + { + "epoch": 0.1690453982086043, + "grad_norm": 0.9829355478286743, + "learning_rate": 9.824904853665764e-05, + "loss": 1.2103, + "step": 26460 + }, + { + "epoch": 0.16910928535834302, + "grad_norm": 0.7728550434112549, + 
"learning_rate": 9.824773205511222e-05, + "loss": 0.986, + "step": 26470 + }, + { + "epoch": 0.16917317250808173, + "grad_norm": 0.7675033211708069, + "learning_rate": 9.824641508767042e-05, + "loss": 1.1175, + "step": 26480 + }, + { + "epoch": 0.16923705965782043, + "grad_norm": 1.7961393594741821, + "learning_rate": 9.824509763434548e-05, + "loss": 0.8564, + "step": 26490 + }, + { + "epoch": 0.16930094680755914, + "grad_norm": 0.782536506652832, + "learning_rate": 9.824377969515065e-05, + "loss": 1.0492, + "step": 26500 + }, + { + "epoch": 0.16936483395729782, + "grad_norm": 0.6397303342819214, + "learning_rate": 9.824246127009924e-05, + "loss": 0.922, + "step": 26510 + }, + { + "epoch": 0.16942872110703652, + "grad_norm": 0.7447948455810547, + "learning_rate": 9.82411423592045e-05, + "loss": 0.9926, + "step": 26520 + }, + { + "epoch": 0.16949260825677523, + "grad_norm": 0.7400467991828918, + "learning_rate": 9.823982296247972e-05, + "loss": 1.0191, + "step": 26530 + }, + { + "epoch": 0.16955649540651394, + "grad_norm": 0.6189865469932556, + "learning_rate": 9.82385030799382e-05, + "loss": 1.0518, + "step": 26540 + }, + { + "epoch": 0.16962038255625264, + "grad_norm": 0.8793081641197205, + "learning_rate": 9.823718271159321e-05, + "loss": 0.8839, + "step": 26550 + }, + { + "epoch": 0.16968426970599135, + "grad_norm": 0.6479794979095459, + "learning_rate": 9.823586185745808e-05, + "loss": 1.1906, + "step": 26560 + }, + { + "epoch": 0.16974815685573003, + "grad_norm": 0.9083991646766663, + "learning_rate": 9.823454051754605e-05, + "loss": 0.8276, + "step": 26570 + }, + { + "epoch": 0.16981204400546873, + "grad_norm": 0.7456206679344177, + "learning_rate": 9.823321869187051e-05, + "loss": 1.6253, + "step": 26580 + }, + { + "epoch": 0.16987593115520744, + "grad_norm": 0.7797310948371887, + "learning_rate": 9.823189638044473e-05, + "loss": 0.9139, + "step": 26590 + }, + { + "epoch": 0.16993981830494614, + "grad_norm": 0.6889947056770325, + "learning_rate": 
9.8230573583282e-05, + "loss": 0.8341, + "step": 26600 + }, + { + "epoch": 0.17000370545468485, + "grad_norm": 0.9696791172027588, + "learning_rate": 9.822925030039567e-05, + "loss": 0.8444, + "step": 26610 + }, + { + "epoch": 0.17006759260442356, + "grad_norm": 0.5877872705459595, + "learning_rate": 9.822792653179908e-05, + "loss": 1.012, + "step": 26620 + }, + { + "epoch": 0.17013147975416223, + "grad_norm": 0.7431389093399048, + "learning_rate": 9.822660227750554e-05, + "loss": 0.7642, + "step": 26630 + }, + { + "epoch": 0.17019536690390094, + "grad_norm": 0.7920153737068176, + "learning_rate": 9.822527753752839e-05, + "loss": 0.8715, + "step": 26640 + }, + { + "epoch": 0.17025925405363965, + "grad_norm": 0.8526118397712708, + "learning_rate": 9.822395231188099e-05, + "loss": 0.816, + "step": 26650 + }, + { + "epoch": 0.17032314120337835, + "grad_norm": 0.8121978640556335, + "learning_rate": 9.822262660057666e-05, + "loss": 0.9923, + "step": 26660 + }, + { + "epoch": 0.17038702835311706, + "grad_norm": 1.0887260437011719, + "learning_rate": 9.822130040362875e-05, + "loss": 0.9544, + "step": 26670 + }, + { + "epoch": 0.17045091550285577, + "grad_norm": 0.5011045336723328, + "learning_rate": 9.821997372105065e-05, + "loss": 0.7011, + "step": 26680 + }, + { + "epoch": 0.17051480265259444, + "grad_norm": 1.1520075798034668, + "learning_rate": 9.821864655285569e-05, + "loss": 0.963, + "step": 26690 + }, + { + "epoch": 0.17057868980233315, + "grad_norm": 0.7860487699508667, + "learning_rate": 9.821731889905722e-05, + "loss": 0.8835, + "step": 26700 + }, + { + "epoch": 0.17064257695207186, + "grad_norm": 0.7170895934104919, + "learning_rate": 9.821599075966868e-05, + "loss": 0.8771, + "step": 26710 + }, + { + "epoch": 0.17070646410181056, + "grad_norm": 1.2707265615463257, + "learning_rate": 9.821466213470337e-05, + "loss": 1.134, + "step": 26720 + }, + { + "epoch": 0.17077035125154927, + "grad_norm": 0.8907286524772644, + "learning_rate": 9.82133330241747e-05, + 
"loss": 0.8502, + "step": 26730 + }, + { + "epoch": 0.17083423840128797, + "grad_norm": 0.6497828960418701, + "learning_rate": 9.821200342809606e-05, + "loss": 1.2541, + "step": 26740 + }, + { + "epoch": 0.17089812555102665, + "grad_norm": 0.7036256194114685, + "learning_rate": 9.821067334648084e-05, + "loss": 0.9958, + "step": 26750 + }, + { + "epoch": 0.17096201270076536, + "grad_norm": 0.8180755376815796, + "learning_rate": 9.820934277934243e-05, + "loss": 1.0885, + "step": 26760 + }, + { + "epoch": 0.17102589985050407, + "grad_norm": 0.9146037697792053, + "learning_rate": 9.820801172669425e-05, + "loss": 0.8732, + "step": 26770 + }, + { + "epoch": 0.17108978700024277, + "grad_norm": 0.9962326884269714, + "learning_rate": 9.820668018854966e-05, + "loss": 0.711, + "step": 26780 + }, + { + "epoch": 0.17115367414998148, + "grad_norm": 0.9202134609222412, + "learning_rate": 9.82053481649221e-05, + "loss": 1.0103, + "step": 26790 + }, + { + "epoch": 0.17121756129972018, + "grad_norm": 1.4406436681747437, + "learning_rate": 9.820401565582498e-05, + "loss": 1.1804, + "step": 26800 + }, + { + "epoch": 0.1712814484494589, + "grad_norm": 0.8345924019813538, + "learning_rate": 9.820268266127173e-05, + "loss": 0.7762, + "step": 26810 + }, + { + "epoch": 0.17134533559919757, + "grad_norm": 1.7090119123458862, + "learning_rate": 9.820134918127576e-05, + "loss": 0.867, + "step": 26820 + }, + { + "epoch": 0.17140922274893627, + "grad_norm": 0.7631736397743225, + "learning_rate": 9.82000152158505e-05, + "loss": 1.1823, + "step": 26830 + }, + { + "epoch": 0.17147310989867498, + "grad_norm": 0.5878487825393677, + "learning_rate": 9.81986807650094e-05, + "loss": 0.8455, + "step": 26840 + }, + { + "epoch": 0.1715369970484137, + "grad_norm": 1.0740894079208374, + "learning_rate": 9.819734582876587e-05, + "loss": 0.8497, + "step": 26850 + }, + { + "epoch": 0.1716008841981524, + "grad_norm": 0.8244208097457886, + "learning_rate": 9.819601040713337e-05, + "loss": 0.7606, + "step": 26860 
+ }, + { + "epoch": 0.1716647713478911, + "grad_norm": 0.9550793170928955, + "learning_rate": 9.819467450012536e-05, + "loss": 0.8171, + "step": 26870 + }, + { + "epoch": 0.17172865849762978, + "grad_norm": 0.7170982360839844, + "learning_rate": 9.819333810775528e-05, + "loss": 0.823, + "step": 26880 + }, + { + "epoch": 0.17179254564736848, + "grad_norm": 0.8252397775650024, + "learning_rate": 9.81920012300366e-05, + "loss": 0.7653, + "step": 26890 + }, + { + "epoch": 0.1718564327971072, + "grad_norm": 1.1286877393722534, + "learning_rate": 9.819066386698277e-05, + "loss": 0.991, + "step": 26900 + }, + { + "epoch": 0.1719203199468459, + "grad_norm": 0.7603797912597656, + "learning_rate": 9.818932601860727e-05, + "loss": 0.9141, + "step": 26910 + }, + { + "epoch": 0.1719842070965846, + "grad_norm": 0.7588580250740051, + "learning_rate": 9.818798768492354e-05, + "loss": 0.8255, + "step": 26920 + }, + { + "epoch": 0.1720480942463233, + "grad_norm": 0.9968806505203247, + "learning_rate": 9.81866488659451e-05, + "loss": 0.6958, + "step": 26930 + }, + { + "epoch": 0.17211198139606199, + "grad_norm": 0.7764785885810852, + "learning_rate": 9.818530956168543e-05, + "loss": 1.1488, + "step": 26940 + }, + { + "epoch": 0.1721758685458007, + "grad_norm": 0.6332468390464783, + "learning_rate": 9.818396977215801e-05, + "loss": 0.6837, + "step": 26950 + }, + { + "epoch": 0.1722397556955394, + "grad_norm": 0.8513321876525879, + "learning_rate": 9.818262949737632e-05, + "loss": 0.7871, + "step": 26960 + }, + { + "epoch": 0.1723036428452781, + "grad_norm": 0.6733559370040894, + "learning_rate": 9.818128873735386e-05, + "loss": 0.8591, + "step": 26970 + }, + { + "epoch": 0.1723675299950168, + "grad_norm": 1.0465015172958374, + "learning_rate": 9.817994749210415e-05, + "loss": 0.8665, + "step": 26980 + }, + { + "epoch": 0.17243141714475552, + "grad_norm": 0.6963700652122498, + "learning_rate": 9.817860576164069e-05, + "loss": 0.8684, + "step": 26990 + }, + { + "epoch": 
0.1724953042944942, + "grad_norm": 1.0664374828338623, + "learning_rate": 9.817726354597699e-05, + "loss": 0.6893, + "step": 27000 + }, + { + "epoch": 0.1725591914442329, + "grad_norm": 0.7583040595054626, + "learning_rate": 9.817592084512655e-05, + "loss": 0.9267, + "step": 27010 + }, + { + "epoch": 0.1726230785939716, + "grad_norm": 0.8282020092010498, + "learning_rate": 9.817457765910292e-05, + "loss": 0.7665, + "step": 27020 + }, + { + "epoch": 0.1726869657437103, + "grad_norm": 0.8650298118591309, + "learning_rate": 9.817323398791961e-05, + "loss": 1.0732, + "step": 27030 + }, + { + "epoch": 0.17275085289344902, + "grad_norm": 0.5665771961212158, + "learning_rate": 9.817188983159016e-05, + "loss": 0.92, + "step": 27040 + }, + { + "epoch": 0.17281474004318773, + "grad_norm": 1.4481645822525024, + "learning_rate": 9.817054519012811e-05, + "loss": 0.8976, + "step": 27050 + }, + { + "epoch": 0.1728786271929264, + "grad_norm": 0.7741625308990479, + "learning_rate": 9.8169200063547e-05, + "loss": 1.0959, + "step": 27060 + }, + { + "epoch": 0.1729425143426651, + "grad_norm": 0.7932523488998413, + "learning_rate": 9.816785445186036e-05, + "loss": 0.9241, + "step": 27070 + }, + { + "epoch": 0.17300640149240382, + "grad_norm": 0.6542154550552368, + "learning_rate": 9.816650835508177e-05, + "loss": 0.9807, + "step": 27080 + }, + { + "epoch": 0.17307028864214252, + "grad_norm": 0.7726758718490601, + "learning_rate": 9.816516177322477e-05, + "loss": 0.8918, + "step": 27090 + }, + { + "epoch": 0.17313417579188123, + "grad_norm": 0.8398792743682861, + "learning_rate": 9.81638147063029e-05, + "loss": 1.1571, + "step": 27100 + }, + { + "epoch": 0.17319806294161993, + "grad_norm": 3.0609123706817627, + "learning_rate": 9.816246715432977e-05, + "loss": 1.0103, + "step": 27110 + }, + { + "epoch": 0.1732619500913586, + "grad_norm": 0.8899956941604614, + "learning_rate": 9.816111911731892e-05, + "loss": 0.878, + "step": 27120 + }, + { + "epoch": 0.17332583724109732, + "grad_norm": 
1.076644778251648, + "learning_rate": 9.815977059528393e-05, + "loss": 1.0136, + "step": 27130 + }, + { + "epoch": 0.17338972439083603, + "grad_norm": 2.1969175338745117, + "learning_rate": 9.81584215882384e-05, + "loss": 0.7375, + "step": 27140 + }, + { + "epoch": 0.17345361154057473, + "grad_norm": 0.9302259087562561, + "learning_rate": 9.815707209619589e-05, + "loss": 0.802, + "step": 27150 + }, + { + "epoch": 0.17351749869031344, + "grad_norm": 0.6798985004425049, + "learning_rate": 9.815572211917001e-05, + "loss": 0.7363, + "step": 27160 + }, + { + "epoch": 0.17358138584005214, + "grad_norm": 0.7445381879806519, + "learning_rate": 9.815437165717435e-05, + "loss": 1.024, + "step": 27170 + }, + { + "epoch": 0.17364527298979082, + "grad_norm": 0.7571766972541809, + "learning_rate": 9.81530207102225e-05, + "loss": 0.8216, + "step": 27180 + }, + { + "epoch": 0.17370916013952953, + "grad_norm": 1.2653508186340332, + "learning_rate": 9.815166927832809e-05, + "loss": 0.8769, + "step": 27190 + }, + { + "epoch": 0.17377304728926823, + "grad_norm": 1.0241389274597168, + "learning_rate": 9.815031736150468e-05, + "loss": 0.8065, + "step": 27200 + }, + { + "epoch": 0.17383693443900694, + "grad_norm": 0.6065948605537415, + "learning_rate": 9.814896495976595e-05, + "loss": 0.8726, + "step": 27210 + }, + { + "epoch": 0.17390082158874565, + "grad_norm": 0.7081197500228882, + "learning_rate": 9.814761207312547e-05, + "loss": 0.9101, + "step": 27220 + }, + { + "epoch": 0.17396470873848435, + "grad_norm": 1.0318403244018555, + "learning_rate": 9.814625870159688e-05, + "loss": 0.9142, + "step": 27230 + }, + { + "epoch": 0.17402859588822303, + "grad_norm": 1.1322126388549805, + "learning_rate": 9.814490484519384e-05, + "loss": 0.8966, + "step": 27240 + }, + { + "epoch": 0.17409248303796174, + "grad_norm": 1.0569275617599487, + "learning_rate": 9.814355050392993e-05, + "loss": 0.8479, + "step": 27250 + }, + { + "epoch": 0.17415637018770044, + "grad_norm": 0.6752243041992188, + 
"learning_rate": 9.814219567781882e-05, + "loss": 0.8054, + "step": 27260 + }, + { + "epoch": 0.17422025733743915, + "grad_norm": 2.970486640930176, + "learning_rate": 9.814084036687417e-05, + "loss": 0.7318, + "step": 27270 + }, + { + "epoch": 0.17428414448717786, + "grad_norm": 1.1387560367584229, + "learning_rate": 9.813948457110957e-05, + "loss": 0.7659, + "step": 27280 + }, + { + "epoch": 0.17434803163691656, + "grad_norm": 0.7417890429496765, + "learning_rate": 9.813812829053874e-05, + "loss": 0.7819, + "step": 27290 + }, + { + "epoch": 0.17441191878665524, + "grad_norm": 1.0214507579803467, + "learning_rate": 9.813677152517533e-05, + "loss": 0.8779, + "step": 27300 + }, + { + "epoch": 0.17447580593639395, + "grad_norm": 0.9005577564239502, + "learning_rate": 9.813541427503296e-05, + "loss": 0.826, + "step": 27310 + }, + { + "epoch": 0.17453969308613265, + "grad_norm": 0.5254817008972168, + "learning_rate": 9.813405654012533e-05, + "loss": 0.7745, + "step": 27320 + }, + { + "epoch": 0.17460358023587136, + "grad_norm": 0.8588125109672546, + "learning_rate": 9.813269832046612e-05, + "loss": 0.8896, + "step": 27330 + }, + { + "epoch": 0.17466746738561006, + "grad_norm": 0.9681766033172607, + "learning_rate": 9.813133961606899e-05, + "loss": 0.9978, + "step": 27340 + }, + { + "epoch": 0.17473135453534877, + "grad_norm": 0.6579704880714417, + "learning_rate": 9.812998042694762e-05, + "loss": 0.9591, + "step": 27350 + }, + { + "epoch": 0.17479524168508745, + "grad_norm": 1.3134688138961792, + "learning_rate": 9.812862075311572e-05, + "loss": 0.9493, + "step": 27360 + }, + { + "epoch": 0.17485912883482616, + "grad_norm": 1.0650473833084106, + "learning_rate": 9.812726059458697e-05, + "loss": 0.6251, + "step": 27370 + }, + { + "epoch": 0.17492301598456486, + "grad_norm": 0.9300364851951599, + "learning_rate": 9.812589995137507e-05, + "loss": 0.8485, + "step": 27380 + }, + { + "epoch": 0.17498690313430357, + "grad_norm": 0.8550617098808289, + "learning_rate": 
9.812453882349373e-05, + "loss": 0.9799, + "step": 27390 + }, + { + "epoch": 0.17505079028404227, + "grad_norm": 1.0517045259475708, + "learning_rate": 9.812317721095662e-05, + "loss": 1.092, + "step": 27400 + }, + { + "epoch": 0.17511467743378098, + "grad_norm": 0.8268793821334839, + "learning_rate": 9.812181511377752e-05, + "loss": 0.8651, + "step": 27410 + }, + { + "epoch": 0.17517856458351966, + "grad_norm": 1.0271008014678955, + "learning_rate": 9.81204525319701e-05, + "loss": 1.0251, + "step": 27420 + }, + { + "epoch": 0.17524245173325836, + "grad_norm": 1.1085052490234375, + "learning_rate": 9.811908946554809e-05, + "loss": 0.897, + "step": 27430 + }, + { + "epoch": 0.17530633888299707, + "grad_norm": 0.9341952204704285, + "learning_rate": 9.811772591452521e-05, + "loss": 0.9069, + "step": 27440 + }, + { + "epoch": 0.17537022603273578, + "grad_norm": 1.8567582368850708, + "learning_rate": 9.811636187891521e-05, + "loss": 0.8957, + "step": 27450 + }, + { + "epoch": 0.17543411318247448, + "grad_norm": 0.8161446452140808, + "learning_rate": 9.811499735873182e-05, + "loss": 0.9018, + "step": 27460 + }, + { + "epoch": 0.1754980003322132, + "grad_norm": 0.8577879667282104, + "learning_rate": 9.811363235398878e-05, + "loss": 0.9191, + "step": 27470 + }, + { + "epoch": 0.17556188748195187, + "grad_norm": 1.067243218421936, + "learning_rate": 9.811226686469985e-05, + "loss": 0.6471, + "step": 27480 + }, + { + "epoch": 0.17562577463169057, + "grad_norm": 1.763016939163208, + "learning_rate": 9.811090089087875e-05, + "loss": 0.7081, + "step": 27490 + }, + { + "epoch": 0.17568966178142928, + "grad_norm": 1.0972936153411865, + "learning_rate": 9.810953443253927e-05, + "loss": 0.821, + "step": 27500 + }, + { + "epoch": 0.17575354893116799, + "grad_norm": 0.691754937171936, + "learning_rate": 9.810816748969516e-05, + "loss": 0.9142, + "step": 27510 + }, + { + "epoch": 0.1758174360809067, + "grad_norm": 0.7978219389915466, + "learning_rate": 9.810680006236017e-05, + "loss": 
1.0896, + "step": 27520 + }, + { + "epoch": 0.1758813232306454, + "grad_norm": 0.5945133566856384, + "learning_rate": 9.81054321505481e-05, + "loss": 1.1876, + "step": 27530 + }, + { + "epoch": 0.17594521038038408, + "grad_norm": 0.7158066034317017, + "learning_rate": 9.81040637542727e-05, + "loss": 0.8112, + "step": 27540 + }, + { + "epoch": 0.17600909753012278, + "grad_norm": 0.7002230882644653, + "learning_rate": 9.810269487354777e-05, + "loss": 0.7471, + "step": 27550 + }, + { + "epoch": 0.1760729846798615, + "grad_norm": 1.0922120809555054, + "learning_rate": 9.810132550838709e-05, + "loss": 0.6824, + "step": 27560 + }, + { + "epoch": 0.1761368718296002, + "grad_norm": 0.7432847023010254, + "learning_rate": 9.809995565880443e-05, + "loss": 0.9265, + "step": 27570 + }, + { + "epoch": 0.1762007589793389, + "grad_norm": 0.5448877215385437, + "learning_rate": 9.809858532481362e-05, + "loss": 0.8096, + "step": 27580 + }, + { + "epoch": 0.1762646461290776, + "grad_norm": 0.7894873023033142, + "learning_rate": 9.809721450642844e-05, + "loss": 0.9688, + "step": 27590 + }, + { + "epoch": 0.17632853327881629, + "grad_norm": 1.4557750225067139, + "learning_rate": 9.80958432036627e-05, + "loss": 0.8877, + "step": 27600 + }, + { + "epoch": 0.176392420428555, + "grad_norm": 0.8581323623657227, + "learning_rate": 9.809447141653022e-05, + "loss": 0.8595, + "step": 27610 + }, + { + "epoch": 0.1764563075782937, + "grad_norm": 1.0392162799835205, + "learning_rate": 9.809309914504479e-05, + "loss": 0.9148, + "step": 27620 + }, + { + "epoch": 0.1765201947280324, + "grad_norm": 0.5153777003288269, + "learning_rate": 9.809172638922024e-05, + "loss": 0.9317, + "step": 27630 + }, + { + "epoch": 0.1765840818777711, + "grad_norm": 0.6191779971122742, + "learning_rate": 9.809035314907043e-05, + "loss": 0.7501, + "step": 27640 + }, + { + "epoch": 0.17664796902750982, + "grad_norm": 1.2180255651474, + "learning_rate": 9.808897942460912e-05, + "loss": 0.9112, + "step": 27650 + }, + { + 
"epoch": 0.17671185617724852, + "grad_norm": 0.8534625768661499, + "learning_rate": 9.808760521585021e-05, + "loss": 1.1213, + "step": 27660 + }, + { + "epoch": 0.1767757433269872, + "grad_norm": 0.7606062889099121, + "learning_rate": 9.808623052280752e-05, + "loss": 0.9272, + "step": 27670 + }, + { + "epoch": 0.1768396304767259, + "grad_norm": 0.8535296320915222, + "learning_rate": 9.808485534549488e-05, + "loss": 0.9289, + "step": 27680 + }, + { + "epoch": 0.1769035176264646, + "grad_norm": 0.9565229415893555, + "learning_rate": 9.808347968392613e-05, + "loss": 1.1181, + "step": 27690 + }, + { + "epoch": 0.17696740477620332, + "grad_norm": 0.8111469149589539, + "learning_rate": 9.808210353811516e-05, + "loss": 0.9397, + "step": 27700 + }, + { + "epoch": 0.17703129192594202, + "grad_norm": 0.8592471480369568, + "learning_rate": 9.808072690807582e-05, + "loss": 0.9435, + "step": 27710 + }, + { + "epoch": 0.17709517907568073, + "grad_norm": 0.4907069206237793, + "learning_rate": 9.807934979382194e-05, + "loss": 1.0336, + "step": 27720 + }, + { + "epoch": 0.1771590662254194, + "grad_norm": 1.013027310371399, + "learning_rate": 9.807797219536743e-05, + "loss": 0.8746, + "step": 27730 + }, + { + "epoch": 0.17722295337515812, + "grad_norm": 0.910508394241333, + "learning_rate": 9.807659411272614e-05, + "loss": 0.7623, + "step": 27740 + }, + { + "epoch": 0.17728684052489682, + "grad_norm": 1.0840027332305908, + "learning_rate": 9.807521554591194e-05, + "loss": 1.2327, + "step": 27750 + }, + { + "epoch": 0.17735072767463553, + "grad_norm": 0.9532760977745056, + "learning_rate": 9.807383649493875e-05, + "loss": 0.8192, + "step": 27760 + }, + { + "epoch": 0.17741461482437423, + "grad_norm": 1.1489735841751099, + "learning_rate": 9.807245695982044e-05, + "loss": 0.7777, + "step": 27770 + }, + { + "epoch": 0.17747850197411294, + "grad_norm": 0.6683622598648071, + "learning_rate": 9.807107694057089e-05, + "loss": 0.6466, + "step": 27780 + }, + { + "epoch": 0.17754238912385162, 
+ "grad_norm": 1.4319005012512207, + "learning_rate": 9.806969643720401e-05, + "loss": 1.1009, + "step": 27790 + }, + { + "epoch": 0.17760627627359032, + "grad_norm": 1.017777919769287, + "learning_rate": 9.80683154497337e-05, + "loss": 0.9284, + "step": 27800 + }, + { + "epoch": 0.17767016342332903, + "grad_norm": 0.8920938968658447, + "learning_rate": 9.806693397817386e-05, + "loss": 0.8675, + "step": 27810 + }, + { + "epoch": 0.17773405057306774, + "grad_norm": 1.0226699113845825, + "learning_rate": 9.806555202253842e-05, + "loss": 1.0085, + "step": 27820 + }, + { + "epoch": 0.17779793772280644, + "grad_norm": 0.841672956943512, + "learning_rate": 9.806416958284127e-05, + "loss": 0.9486, + "step": 27830 + }, + { + "epoch": 0.17786182487254515, + "grad_norm": 0.7303531765937805, + "learning_rate": 9.806278665909638e-05, + "loss": 0.9338, + "step": 27840 + }, + { + "epoch": 0.17792571202228383, + "grad_norm": 0.723166823387146, + "learning_rate": 9.806140325131763e-05, + "loss": 0.9934, + "step": 27850 + }, + { + "epoch": 0.17798959917202253, + "grad_norm": 1.413759469985962, + "learning_rate": 9.806001935951899e-05, + "loss": 1.061, + "step": 27860 + }, + { + "epoch": 0.17805348632176124, + "grad_norm": 0.8165162205696106, + "learning_rate": 9.805863498371435e-05, + "loss": 0.9142, + "step": 27870 + }, + { + "epoch": 0.17811737347149995, + "grad_norm": 0.6334624886512756, + "learning_rate": 9.805725012391768e-05, + "loss": 0.9758, + "step": 27880 + }, + { + "epoch": 0.17818126062123865, + "grad_norm": 0.7921863794326782, + "learning_rate": 9.805586478014294e-05, + "loss": 1.4444, + "step": 27890 + }, + { + "epoch": 0.17824514777097736, + "grad_norm": 0.94256192445755, + "learning_rate": 9.805447895240407e-05, + "loss": 0.7907, + "step": 27900 + }, + { + "epoch": 0.17830903492071604, + "grad_norm": 0.948287844657898, + "learning_rate": 9.805309264071502e-05, + "loss": 0.9496, + "step": 27910 + }, + { + "epoch": 0.17837292207045474, + "grad_norm": 
0.5825172066688538, + "learning_rate": 9.805170584508976e-05, + "loss": 1.1519, + "step": 27920 + }, + { + "epoch": 0.17843680922019345, + "grad_norm": 1.1197121143341064, + "learning_rate": 9.80504573152731e-05, + "loss": 0.866, + "step": 27930 + }, + { + "epoch": 0.17850069636993215, + "grad_norm": 0.8723785877227783, + "learning_rate": 9.804906960020751e-05, + "loss": 0.7456, + "step": 27940 + }, + { + "epoch": 0.17856458351967086, + "grad_norm": 0.6122041940689087, + "learning_rate": 9.804768140124621e-05, + "loss": 0.7238, + "step": 27950 + }, + { + "epoch": 0.17862847066940957, + "grad_norm": 0.7413936853408813, + "learning_rate": 9.80462927184032e-05, + "loss": 0.863, + "step": 27960 + }, + { + "epoch": 0.17869235781914825, + "grad_norm": 0.7080979943275452, + "learning_rate": 9.804490355169246e-05, + "loss": 0.8773, + "step": 27970 + }, + { + "epoch": 0.17875624496888695, + "grad_norm": 0.9712502956390381, + "learning_rate": 9.804351390112799e-05, + "loss": 0.9399, + "step": 27980 + }, + { + "epoch": 0.17882013211862566, + "grad_norm": 1.720031499862671, + "learning_rate": 9.804212376672375e-05, + "loss": 1.4551, + "step": 27990 + }, + { + "epoch": 0.17888401926836436, + "grad_norm": 3.504847526550293, + "learning_rate": 9.804073314849375e-05, + "loss": 1.1386, + "step": 28000 + }, + { + "epoch": 0.17894790641810307, + "grad_norm": 0.8636149168014526, + "learning_rate": 9.803934204645202e-05, + "loss": 0.8948, + "step": 28010 + }, + { + "epoch": 0.17901179356784178, + "grad_norm": 1.0400105714797974, + "learning_rate": 9.803795046061257e-05, + "loss": 0.8915, + "step": 28020 + }, + { + "epoch": 0.17907568071758045, + "grad_norm": 0.6742110848426819, + "learning_rate": 9.803655839098938e-05, + "loss": 1.0636, + "step": 28030 + }, + { + "epoch": 0.17913956786731916, + "grad_norm": 1.9153518676757812, + "learning_rate": 9.80351658375965e-05, + "loss": 0.8614, + "step": 28040 + }, + { + "epoch": 0.17920345501705787, + "grad_norm": 0.7775312662124634, + 
"learning_rate": 9.803377280044794e-05, + "loss": 0.869, + "step": 28050 + }, + { + "epoch": 0.17926734216679657, + "grad_norm": 0.558363676071167, + "learning_rate": 9.803237927955772e-05, + "loss": 0.7641, + "step": 28060 + }, + { + "epoch": 0.17933122931653528, + "grad_norm": 0.7154206037521362, + "learning_rate": 9.80309852749399e-05, + "loss": 0.7582, + "step": 28070 + }, + { + "epoch": 0.17939511646627399, + "grad_norm": 0.7916398048400879, + "learning_rate": 9.802959078660851e-05, + "loss": 1.0197, + "step": 28080 + }, + { + "epoch": 0.17945900361601266, + "grad_norm": 1.3828551769256592, + "learning_rate": 9.802819581457758e-05, + "loss": 0.9683, + "step": 28090 + }, + { + "epoch": 0.17952289076575137, + "grad_norm": 1.6986253261566162, + "learning_rate": 9.802680035886118e-05, + "loss": 1.0508, + "step": 28100 + }, + { + "epoch": 0.17958677791549008, + "grad_norm": 0.576038658618927, + "learning_rate": 9.802540441947334e-05, + "loss": 0.6362, + "step": 28110 + }, + { + "epoch": 0.17965066506522878, + "grad_norm": 0.8584470748901367, + "learning_rate": 9.802400799642814e-05, + "loss": 0.8484, + "step": 28120 + }, + { + "epoch": 0.1797145522149675, + "grad_norm": 0.6002673506736755, + "learning_rate": 9.802261108973962e-05, + "loss": 0.6569, + "step": 28130 + }, + { + "epoch": 0.1797784393647062, + "grad_norm": 1.76115083694458, + "learning_rate": 9.802121369942188e-05, + "loss": 1.0472, + "step": 28140 + }, + { + "epoch": 0.17984232651444487, + "grad_norm": 0.6964778304100037, + "learning_rate": 9.801981582548896e-05, + "loss": 1.0831, + "step": 28150 + }, + { + "epoch": 0.17990621366418358, + "grad_norm": 0.6689683198928833, + "learning_rate": 9.801841746795495e-05, + "loss": 1.013, + "step": 28160 + }, + { + "epoch": 0.17997010081392228, + "grad_norm": 1.5421873331069946, + "learning_rate": 9.801701862683393e-05, + "loss": 0.7561, + "step": 28170 + }, + { + "epoch": 0.180033987963661, + "grad_norm": 0.8853926062583923, + "learning_rate": 
9.801561930214001e-05, + "loss": 0.7668, + "step": 28180 + }, + { + "epoch": 0.1800978751133997, + "grad_norm": 0.7320166826248169, + "learning_rate": 9.801421949388723e-05, + "loss": 0.8514, + "step": 28190 + }, + { + "epoch": 0.1801617622631384, + "grad_norm": 1.5770325660705566, + "learning_rate": 9.801281920208976e-05, + "loss": 1.2304, + "step": 28200 + }, + { + "epoch": 0.18022564941287708, + "grad_norm": 0.8628795146942139, + "learning_rate": 9.801141842676164e-05, + "loss": 0.999, + "step": 28210 + }, + { + "epoch": 0.1802895365626158, + "grad_norm": 1.4478768110275269, + "learning_rate": 9.801001716791701e-05, + "loss": 0.7788, + "step": 28220 + }, + { + "epoch": 0.1803534237123545, + "grad_norm": 1.1721216440200806, + "learning_rate": 9.800861542556998e-05, + "loss": 0.6793, + "step": 28230 + }, + { + "epoch": 0.1804173108620932, + "grad_norm": 1.0601638555526733, + "learning_rate": 9.800721319973465e-05, + "loss": 0.914, + "step": 28240 + }, + { + "epoch": 0.1804811980118319, + "grad_norm": 1.330712914466858, + "learning_rate": 9.800581049042515e-05, + "loss": 0.9251, + "step": 28250 + }, + { + "epoch": 0.1805450851615706, + "grad_norm": 1.723365306854248, + "learning_rate": 9.80044072976556e-05, + "loss": 0.8571, + "step": 28260 + }, + { + "epoch": 0.1806089723113093, + "grad_norm": 1.0684921741485596, + "learning_rate": 9.800300362144015e-05, + "loss": 0.8753, + "step": 28270 + }, + { + "epoch": 0.180672859461048, + "grad_norm": 0.870155394077301, + "learning_rate": 9.800159946179292e-05, + "loss": 0.8745, + "step": 28280 + }, + { + "epoch": 0.1807367466107867, + "grad_norm": 0.8147633075714111, + "learning_rate": 9.800019481872807e-05, + "loss": 0.8873, + "step": 28290 + }, + { + "epoch": 0.1808006337605254, + "grad_norm": 0.8370197415351868, + "learning_rate": 9.799878969225971e-05, + "loss": 0.7692, + "step": 28300 + }, + { + "epoch": 0.18086452091026411, + "grad_norm": 0.695644199848175, + "learning_rate": 9.799738408240202e-05, + "loss": 1.0125, + 
"step": 28310 + }, + { + "epoch": 0.18092840806000282, + "grad_norm": 0.8963587284088135, + "learning_rate": 9.799597798916915e-05, + "loss": 0.9593, + "step": 28320 + }, + { + "epoch": 0.1809922952097415, + "grad_norm": 0.9512690305709839, + "learning_rate": 9.799457141257527e-05, + "loss": 0.9553, + "step": 28330 + }, + { + "epoch": 0.1810561823594802, + "grad_norm": 0.8540796637535095, + "learning_rate": 9.799316435263452e-05, + "loss": 0.8412, + "step": 28340 + }, + { + "epoch": 0.1811200695092189, + "grad_norm": 0.7773367762565613, + "learning_rate": 9.799175680936109e-05, + "loss": 0.8601, + "step": 28350 + }, + { + "epoch": 0.18118395665895762, + "grad_norm": 2.9732205867767334, + "learning_rate": 9.799034878276916e-05, + "loss": 0.8188, + "step": 28360 + }, + { + "epoch": 0.18124784380869632, + "grad_norm": 1.0311912298202515, + "learning_rate": 9.798894027287289e-05, + "loss": 0.6879, + "step": 28370 + }, + { + "epoch": 0.18131173095843503, + "grad_norm": 1.366125464439392, + "learning_rate": 9.798753127968647e-05, + "loss": 0.7352, + "step": 28380 + }, + { + "epoch": 0.1813756181081737, + "grad_norm": 0.7077022790908813, + "learning_rate": 9.79861218032241e-05, + "loss": 0.9083, + "step": 28390 + }, + { + "epoch": 0.18143950525791241, + "grad_norm": 0.9163293242454529, + "learning_rate": 9.798471184349997e-05, + "loss": 1.0788, + "step": 28400 + }, + { + "epoch": 0.18150339240765112, + "grad_norm": 0.7429232001304626, + "learning_rate": 9.798330140052829e-05, + "loss": 1.2201, + "step": 28410 + }, + { + "epoch": 0.18156727955738983, + "grad_norm": 0.7430415749549866, + "learning_rate": 9.798189047432323e-05, + "loss": 0.7114, + "step": 28420 + }, + { + "epoch": 0.18163116670712853, + "grad_norm": 0.9560526013374329, + "learning_rate": 9.798047906489905e-05, + "loss": 0.9409, + "step": 28430 + }, + { + "epoch": 0.18169505385686724, + "grad_norm": 1.2373318672180176, + "learning_rate": 9.797906717226992e-05, + "loss": 0.9829, + "step": 28440 + }, + { + 
"epoch": 0.18175894100660592, + "grad_norm": 1.158624291419983, + "learning_rate": 9.797765479645007e-05, + "loss": 0.8655, + "step": 28450 + }, + { + "epoch": 0.18182282815634462, + "grad_norm": 0.6600698232650757, + "learning_rate": 9.797624193745374e-05, + "loss": 0.9877, + "step": 28460 + }, + { + "epoch": 0.18188671530608333, + "grad_norm": 0.8037683963775635, + "learning_rate": 9.797482859529514e-05, + "loss": 0.7506, + "step": 28470 + }, + { + "epoch": 0.18195060245582204, + "grad_norm": 0.7499133348464966, + "learning_rate": 9.797341476998853e-05, + "loss": 0.7967, + "step": 28480 + }, + { + "epoch": 0.18201448960556074, + "grad_norm": 0.6633144617080688, + "learning_rate": 9.797200046154811e-05, + "loss": 0.8313, + "step": 28490 + }, + { + "epoch": 0.18207837675529945, + "grad_norm": 1.5353120565414429, + "learning_rate": 9.797058566998816e-05, + "loss": 0.7916, + "step": 28500 + }, + { + "epoch": 0.18214226390503815, + "grad_norm": 1.1196563243865967, + "learning_rate": 9.79691703953229e-05, + "loss": 0.8152, + "step": 28510 + }, + { + "epoch": 0.18220615105477683, + "grad_norm": 0.7169744968414307, + "learning_rate": 9.79677546375666e-05, + "loss": 0.9211, + "step": 28520 + }, + { + "epoch": 0.18227003820451554, + "grad_norm": 0.8993495106697083, + "learning_rate": 9.796633839673352e-05, + "loss": 1.0358, + "step": 28530 + }, + { + "epoch": 0.18233392535425424, + "grad_norm": 0.600199282169342, + "learning_rate": 9.79649216728379e-05, + "loss": 0.9579, + "step": 28540 + }, + { + "epoch": 0.18239781250399295, + "grad_norm": 0.901833176612854, + "learning_rate": 9.796350446589404e-05, + "loss": 0.8611, + "step": 28550 + }, + { + "epoch": 0.18246169965373166, + "grad_norm": 0.5698120594024658, + "learning_rate": 9.796208677591619e-05, + "loss": 0.6931, + "step": 28560 + }, + { + "epoch": 0.18252558680347036, + "grad_norm": 0.9085325598716736, + "learning_rate": 9.796066860291861e-05, + "loss": 0.7067, + "step": 28570 + }, + { + "epoch": 0.18258947395320904, 
+ "grad_norm": 0.8795328140258789, + "learning_rate": 9.795924994691564e-05, + "loss": 0.938, + "step": 28580 + }, + { + "epoch": 0.18265336110294775, + "grad_norm": 0.7105121612548828, + "learning_rate": 9.795783080792151e-05, + "loss": 1.0905, + "step": 28590 + }, + { + "epoch": 0.18271724825268645, + "grad_norm": 1.094942569732666, + "learning_rate": 9.795641118595053e-05, + "loss": 0.9418, + "step": 28600 + }, + { + "epoch": 0.18278113540242516, + "grad_norm": 1.1036394834518433, + "learning_rate": 9.795499108101702e-05, + "loss": 0.7659, + "step": 28610 + }, + { + "epoch": 0.18284502255216387, + "grad_norm": 0.9667114019393921, + "learning_rate": 9.795357049313526e-05, + "loss": 0.7326, + "step": 28620 + }, + { + "epoch": 0.18290890970190257, + "grad_norm": 0.8882653713226318, + "learning_rate": 9.795214942231956e-05, + "loss": 0.9086, + "step": 28630 + }, + { + "epoch": 0.18297279685164125, + "grad_norm": 0.941718339920044, + "learning_rate": 9.795072786858421e-05, + "loss": 0.8087, + "step": 28640 + }, + { + "epoch": 0.18303668400137996, + "grad_norm": 0.749993622303009, + "learning_rate": 9.794930583194357e-05, + "loss": 0.8691, + "step": 28650 + }, + { + "epoch": 0.18310057115111866, + "grad_norm": 0.9505361318588257, + "learning_rate": 9.794788331241193e-05, + "loss": 0.9151, + "step": 28660 + }, + { + "epoch": 0.18316445830085737, + "grad_norm": 0.7003071308135986, + "learning_rate": 9.794646031000363e-05, + "loss": 0.9178, + "step": 28670 + }, + { + "epoch": 0.18322834545059608, + "grad_norm": 0.7516195178031921, + "learning_rate": 9.7945036824733e-05, + "loss": 0.9932, + "step": 28680 + }, + { + "epoch": 0.18329223260033478, + "grad_norm": 0.737834095954895, + "learning_rate": 9.794361285661435e-05, + "loss": 1.0445, + "step": 28690 + }, + { + "epoch": 0.18335611975007346, + "grad_norm": 1.5142183303833008, + "learning_rate": 9.794218840566205e-05, + "loss": 0.9432, + "step": 28700 + }, + { + "epoch": 0.18342000689981217, + "grad_norm": 
0.9545480012893677, + "learning_rate": 9.794076347189045e-05, + "loss": 1.0564, + "step": 28710 + }, + { + "epoch": 0.18348389404955087, + "grad_norm": 1.4519827365875244, + "learning_rate": 9.793933805531387e-05, + "loss": 1.0927, + "step": 28720 + }, + { + "epoch": 0.18354778119928958, + "grad_norm": 1.179065465927124, + "learning_rate": 9.793791215594669e-05, + "loss": 0.8412, + "step": 28730 + }, + { + "epoch": 0.18361166834902828, + "grad_norm": 0.5378461480140686, + "learning_rate": 9.793648577380325e-05, + "loss": 0.9532, + "step": 28740 + }, + { + "epoch": 0.183675555498767, + "grad_norm": 0.9860353469848633, + "learning_rate": 9.793505890889795e-05, + "loss": 0.7965, + "step": 28750 + }, + { + "epoch": 0.18373944264850567, + "grad_norm": 0.7210092544555664, + "learning_rate": 9.793363156124513e-05, + "loss": 0.9562, + "step": 28760 + }, + { + "epoch": 0.18380332979824437, + "grad_norm": 0.9851694703102112, + "learning_rate": 9.793220373085917e-05, + "loss": 0.9258, + "step": 28770 + }, + { + "epoch": 0.18386721694798308, + "grad_norm": 1.2864528894424438, + "learning_rate": 9.793077541775444e-05, + "loss": 0.8495, + "step": 28780 + }, + { + "epoch": 0.1839311040977218, + "grad_norm": 0.5326701402664185, + "learning_rate": 9.792934662194534e-05, + "loss": 0.7886, + "step": 28790 + }, + { + "epoch": 0.1839949912474605, + "grad_norm": 0.9040879011154175, + "learning_rate": 9.792791734344627e-05, + "loss": 0.7028, + "step": 28800 + }, + { + "epoch": 0.1840588783971992, + "grad_norm": 0.7170331478118896, + "learning_rate": 9.792648758227159e-05, + "loss": 0.957, + "step": 28810 + }, + { + "epoch": 0.18412276554693788, + "grad_norm": 1.0186604261398315, + "learning_rate": 9.792505733843573e-05, + "loss": 0.8086, + "step": 28820 + }, + { + "epoch": 0.18418665269667658, + "grad_norm": 1.5476514101028442, + "learning_rate": 9.792362661195307e-05, + "loss": 0.9259, + "step": 28830 + }, + { + "epoch": 0.1842505398464153, + "grad_norm": 0.7610865235328674, + 
"learning_rate": 9.792219540283804e-05, + "loss": 0.867, + "step": 28840 + }, + { + "epoch": 0.184314426996154, + "grad_norm": 0.6964796185493469, + "learning_rate": 9.792076371110503e-05, + "loss": 0.8641, + "step": 28850 + }, + { + "epoch": 0.1843783141458927, + "grad_norm": 1.106491208076477, + "learning_rate": 9.791933153676849e-05, + "loss": 0.9952, + "step": 28860 + }, + { + "epoch": 0.1844422012956314, + "grad_norm": 1.025023102760315, + "learning_rate": 9.791789887984282e-05, + "loss": 0.9773, + "step": 28870 + }, + { + "epoch": 0.1845060884453701, + "grad_norm": 0.7797799706459045, + "learning_rate": 9.791646574034245e-05, + "loss": 0.7025, + "step": 28880 + }, + { + "epoch": 0.1845699755951088, + "grad_norm": 0.6405588984489441, + "learning_rate": 9.791503211828182e-05, + "loss": 0.7509, + "step": 28890 + }, + { + "epoch": 0.1846338627448475, + "grad_norm": 1.01836097240448, + "learning_rate": 9.791359801367536e-05, + "loss": 0.7725, + "step": 28900 + }, + { + "epoch": 0.1846977498945862, + "grad_norm": 1.2316473722457886, + "learning_rate": 9.791216342653751e-05, + "loss": 0.8199, + "step": 28910 + }, + { + "epoch": 0.1847616370443249, + "grad_norm": 1.0237054824829102, + "learning_rate": 9.791072835688274e-05, + "loss": 0.7915, + "step": 28920 + }, + { + "epoch": 0.18482552419406362, + "grad_norm": 0.6611847877502441, + "learning_rate": 9.790929280472547e-05, + "loss": 0.8064, + "step": 28930 + }, + { + "epoch": 0.1848894113438023, + "grad_norm": 0.6756503582000732, + "learning_rate": 9.790785677008018e-05, + "loss": 0.7544, + "step": 28940 + }, + { + "epoch": 0.184953298493541, + "grad_norm": 1.1825060844421387, + "learning_rate": 9.790642025296134e-05, + "loss": 1.0022, + "step": 28950 + }, + { + "epoch": 0.1850171856432797, + "grad_norm": 1.195821762084961, + "learning_rate": 9.790498325338339e-05, + "loss": 1.0366, + "step": 28960 + }, + { + "epoch": 0.18508107279301841, + "grad_norm": 3.405341863632202, + "learning_rate": 9.790354577136083e-05, + 
"loss": 0.978, + "step": 28970 + }, + { + "epoch": 0.18514495994275712, + "grad_norm": 1.0382331609725952, + "learning_rate": 9.790210780690811e-05, + "loss": 0.9581, + "step": 28980 + }, + { + "epoch": 0.18520884709249583, + "grad_norm": 0.6907293200492859, + "learning_rate": 9.790066936003972e-05, + "loss": 0.9692, + "step": 28990 + }, + { + "epoch": 0.1852727342422345, + "grad_norm": 0.8222552537918091, + "learning_rate": 9.789923043077015e-05, + "loss": 1.1995, + "step": 29000 + }, + { + "epoch": 0.1853366213919732, + "grad_norm": 0.5325214862823486, + "learning_rate": 9.78977910191139e-05, + "loss": 0.9084, + "step": 29010 + }, + { + "epoch": 0.18540050854171192, + "grad_norm": 0.6033929586410522, + "learning_rate": 9.789635112508544e-05, + "loss": 0.9668, + "step": 29020 + }, + { + "epoch": 0.18546439569145062, + "grad_norm": 0.604171872138977, + "learning_rate": 9.78949107486993e-05, + "loss": 0.8394, + "step": 29030 + }, + { + "epoch": 0.18552828284118933, + "grad_norm": 0.6410810947418213, + "learning_rate": 9.789346988996997e-05, + "loss": 1.072, + "step": 29040 + }, + { + "epoch": 0.18559216999092804, + "grad_norm": 0.8470253348350525, + "learning_rate": 9.789202854891198e-05, + "loss": 0.9716, + "step": 29050 + }, + { + "epoch": 0.1856560571406667, + "grad_norm": 0.9727482199668884, + "learning_rate": 9.789058672553982e-05, + "loss": 0.9176, + "step": 29060 + }, + { + "epoch": 0.18571994429040542, + "grad_norm": 0.9362789988517761, + "learning_rate": 9.7889144419868e-05, + "loss": 0.8712, + "step": 29070 + }, + { + "epoch": 0.18578383144014413, + "grad_norm": 0.6700981259346008, + "learning_rate": 9.788770163191108e-05, + "loss": 0.6975, + "step": 29080 + }, + { + "epoch": 0.18584771858988283, + "grad_norm": 0.863276481628418, + "learning_rate": 9.788625836168359e-05, + "loss": 1.2225, + "step": 29090 + }, + { + "epoch": 0.18591160573962154, + "grad_norm": 0.9833418130874634, + "learning_rate": 9.788481460920003e-05, + "loss": 0.92, + "step": 29100 + }, 
+ { + "epoch": 0.18597549288936024, + "grad_norm": 1.1162675619125366, + "learning_rate": 9.788337037447497e-05, + "loss": 0.765, + "step": 29110 + }, + { + "epoch": 0.18603938003909892, + "grad_norm": 0.8579927086830139, + "learning_rate": 9.788192565752294e-05, + "loss": 0.8593, + "step": 29120 + }, + { + "epoch": 0.18610326718883763, + "grad_norm": 0.6623185276985168, + "learning_rate": 9.788048045835851e-05, + "loss": 0.9438, + "step": 29130 + }, + { + "epoch": 0.18616715433857633, + "grad_norm": 1.0203254222869873, + "learning_rate": 9.78790347769962e-05, + "loss": 0.9786, + "step": 29140 + }, + { + "epoch": 0.18623104148831504, + "grad_norm": 1.5538065433502197, + "learning_rate": 9.78775886134506e-05, + "loss": 0.7127, + "step": 29150 + }, + { + "epoch": 0.18629492863805375, + "grad_norm": 0.5423676371574402, + "learning_rate": 9.787614196773627e-05, + "loss": 0.9467, + "step": 29160 + }, + { + "epoch": 0.18635881578779245, + "grad_norm": 0.8626308441162109, + "learning_rate": 9.787469483986775e-05, + "loss": 1.054, + "step": 29170 + }, + { + "epoch": 0.18642270293753113, + "grad_norm": 2.267576217651367, + "learning_rate": 9.787324722985966e-05, + "loss": 0.8235, + "step": 29180 + }, + { + "epoch": 0.18648659008726984, + "grad_norm": 1.2194722890853882, + "learning_rate": 9.787179913772653e-05, + "loss": 1.046, + "step": 29190 + }, + { + "epoch": 0.18655047723700854, + "grad_norm": 1.2716878652572632, + "learning_rate": 9.787035056348298e-05, + "loss": 1.0831, + "step": 29200 + }, + { + "epoch": 0.18661436438674725, + "grad_norm": 0.5902767181396484, + "learning_rate": 9.786890150714359e-05, + "loss": 0.9125, + "step": 29210 + }, + { + "epoch": 0.18667825153648596, + "grad_norm": 0.6737661361694336, + "learning_rate": 9.786745196872295e-05, + "loss": 0.8752, + "step": 29220 + }, + { + "epoch": 0.18674213868622466, + "grad_norm": 0.7880046367645264, + "learning_rate": 9.786600194823565e-05, + "loss": 0.7642, + "step": 29230 + }, + { + "epoch": 
0.18680602583596334, + "grad_norm": 1.327628254890442, + "learning_rate": 9.78645514456963e-05, + "loss": 0.668, + "step": 29240 + }, + { + "epoch": 0.18686991298570205, + "grad_norm": 1.034236192703247, + "learning_rate": 9.786310046111951e-05, + "loss": 0.9501, + "step": 29250 + }, + { + "epoch": 0.18693380013544075, + "grad_norm": 0.7702693939208984, + "learning_rate": 9.78616489945199e-05, + "loss": 0.9508, + "step": 29260 + }, + { + "epoch": 0.18699768728517946, + "grad_norm": 0.8737154603004456, + "learning_rate": 9.786019704591206e-05, + "loss": 0.8081, + "step": 29270 + }, + { + "epoch": 0.18706157443491817, + "grad_norm": 0.7933652400970459, + "learning_rate": 9.785874461531064e-05, + "loss": 0.8241, + "step": 29280 + }, + { + "epoch": 0.18712546158465687, + "grad_norm": 1.6798765659332275, + "learning_rate": 9.785729170273026e-05, + "loss": 0.8096, + "step": 29290 + }, + { + "epoch": 0.18718934873439558, + "grad_norm": 0.7516373991966248, + "learning_rate": 9.785583830818554e-05, + "loss": 0.8489, + "step": 29300 + }, + { + "epoch": 0.18725323588413426, + "grad_norm": 0.5362650752067566, + "learning_rate": 9.785438443169115e-05, + "loss": 0.8583, + "step": 29310 + }, + { + "epoch": 0.18731712303387296, + "grad_norm": 0.5288386940956116, + "learning_rate": 9.785293007326169e-05, + "loss": 0.8078, + "step": 29320 + }, + { + "epoch": 0.18738101018361167, + "grad_norm": 0.7445020079612732, + "learning_rate": 9.785147523291183e-05, + "loss": 0.9432, + "step": 29330 + }, + { + "epoch": 0.18744489733335037, + "grad_norm": 0.8663593530654907, + "learning_rate": 9.78500199106562e-05, + "loss": 0.8345, + "step": 29340 + }, + { + "epoch": 0.18750878448308908, + "grad_norm": 1.6068364381790161, + "learning_rate": 9.784856410650951e-05, + "loss": 1.0205, + "step": 29350 + }, + { + "epoch": 0.1875726716328278, + "grad_norm": 0.7024542689323425, + "learning_rate": 9.784710782048636e-05, + "loss": 0.891, + "step": 29360 + }, + { + "epoch": 0.18763655878256646, + 
"grad_norm": 0.6852838397026062, + "learning_rate": 9.784565105260145e-05, + "loss": 0.7938, + "step": 29370 + }, + { + "epoch": 0.18770044593230517, + "grad_norm": 0.5752915740013123, + "learning_rate": 9.784419380286944e-05, + "loss": 0.9839, + "step": 29380 + }, + { + "epoch": 0.18776433308204388, + "grad_norm": 1.396058201789856, + "learning_rate": 9.784273607130501e-05, + "loss": 0.8067, + "step": 29390 + }, + { + "epoch": 0.18782822023178258, + "grad_norm": 0.9546979665756226, + "learning_rate": 9.784127785792283e-05, + "loss": 0.8, + "step": 29400 + }, + { + "epoch": 0.1878921073815213, + "grad_norm": 1.17519211769104, + "learning_rate": 9.783981916273758e-05, + "loss": 1.1313, + "step": 29410 + }, + { + "epoch": 0.18795599453126, + "grad_norm": 2.2271242141723633, + "learning_rate": 9.783835998576398e-05, + "loss": 0.8251, + "step": 29420 + }, + { + "epoch": 0.18801988168099867, + "grad_norm": 1.1907005310058594, + "learning_rate": 9.78369003270167e-05, + "loss": 0.7798, + "step": 29430 + }, + { + "epoch": 0.18808376883073738, + "grad_norm": 1.2218221426010132, + "learning_rate": 9.783544018651048e-05, + "loss": 0.9479, + "step": 29440 + }, + { + "epoch": 0.1881476559804761, + "grad_norm": 0.7123143076896667, + "learning_rate": 9.783397956425997e-05, + "loss": 0.8228, + "step": 29450 + }, + { + "epoch": 0.1882115431302148, + "grad_norm": 1.4676718711853027, + "learning_rate": 9.78325184602799e-05, + "loss": 0.9251, + "step": 29460 + }, + { + "epoch": 0.1882754302799535, + "grad_norm": 0.7313151359558105, + "learning_rate": 9.783105687458499e-05, + "loss": 0.9859, + "step": 29470 + }, + { + "epoch": 0.1883393174296922, + "grad_norm": 0.7722935080528259, + "learning_rate": 9.782959480718997e-05, + "loss": 0.7907, + "step": 29480 + }, + { + "epoch": 0.18840320457943088, + "grad_norm": 1.3157824277877808, + "learning_rate": 9.782813225810953e-05, + "loss": 1.0648, + "step": 29490 + }, + { + "epoch": 0.1884670917291696, + "grad_norm": 1.3221862316131592, + 
"learning_rate": 9.782666922735843e-05, + "loss": 0.7726, + "step": 29500 + }, + { + "epoch": 0.1885309788789083, + "grad_norm": 0.5356481671333313, + "learning_rate": 9.78252057149514e-05, + "loss": 0.8444, + "step": 29510 + }, + { + "epoch": 0.188594866028647, + "grad_norm": 1.3000450134277344, + "learning_rate": 9.782374172090318e-05, + "loss": 0.7855, + "step": 29520 + }, + { + "epoch": 0.1886587531783857, + "grad_norm": 0.7123465538024902, + "learning_rate": 9.78222772452285e-05, + "loss": 0.8986, + "step": 29530 + }, + { + "epoch": 0.1887226403281244, + "grad_norm": 0.8324477076530457, + "learning_rate": 9.78208122879421e-05, + "loss": 0.795, + "step": 29540 + }, + { + "epoch": 0.1887865274778631, + "grad_norm": 1.0922014713287354, + "learning_rate": 9.781934684905879e-05, + "loss": 0.8251, + "step": 29550 + }, + { + "epoch": 0.1888504146276018, + "grad_norm": 0.6796879768371582, + "learning_rate": 9.781788092859326e-05, + "loss": 0.8954, + "step": 29560 + }, + { + "epoch": 0.1889143017773405, + "grad_norm": 0.6543946862220764, + "learning_rate": 9.78164145265603e-05, + "loss": 0.9359, + "step": 29570 + }, + { + "epoch": 0.1889781889270792, + "grad_norm": 0.7796209454536438, + "learning_rate": 9.781494764297468e-05, + "loss": 0.7721, + "step": 29580 + }, + { + "epoch": 0.18904207607681792, + "grad_norm": 1.0429221391677856, + "learning_rate": 9.781348027785116e-05, + "loss": 1.3679, + "step": 29590 + }, + { + "epoch": 0.18910596322655662, + "grad_norm": 1.09304940700531, + "learning_rate": 9.781201243120455e-05, + "loss": 1.1277, + "step": 29600 + }, + { + "epoch": 0.1891698503762953, + "grad_norm": 0.9372734427452087, + "learning_rate": 9.781054410304959e-05, + "loss": 0.7567, + "step": 29610 + }, + { + "epoch": 0.189233737526034, + "grad_norm": 0.9679316282272339, + "learning_rate": 9.780907529340111e-05, + "loss": 0.8106, + "step": 29620 + }, + { + "epoch": 0.1892976246757727, + "grad_norm": 1.304903268814087, + "learning_rate": 9.780760600227388e-05, + 
"loss": 0.9488, + "step": 29630 + }, + { + "epoch": 0.18936151182551142, + "grad_norm": 1.0478878021240234, + "learning_rate": 9.780613622968269e-05, + "loss": 0.8575, + "step": 29640 + }, + { + "epoch": 0.18942539897525013, + "grad_norm": 1.2268606424331665, + "learning_rate": 9.780466597564235e-05, + "loss": 1.0457, + "step": 29650 + }, + { + "epoch": 0.18948928612498883, + "grad_norm": 0.8506630659103394, + "learning_rate": 9.780319524016767e-05, + "loss": 0.7606, + "step": 29660 + }, + { + "epoch": 0.1895531732747275, + "grad_norm": 1.1287379264831543, + "learning_rate": 9.780172402327346e-05, + "loss": 1.0102, + "step": 29670 + }, + { + "epoch": 0.18961706042446622, + "grad_norm": 0.9983859062194824, + "learning_rate": 9.780025232497452e-05, + "loss": 0.7572, + "step": 29680 + }, + { + "epoch": 0.18968094757420492, + "grad_norm": 0.6691607236862183, + "learning_rate": 9.77987801452857e-05, + "loss": 1.0665, + "step": 29690 + }, + { + "epoch": 0.18974483472394363, + "grad_norm": 1.1289949417114258, + "learning_rate": 9.779730748422181e-05, + "loss": 0.9657, + "step": 29700 + }, + { + "epoch": 0.18980872187368233, + "grad_norm": 0.8309307098388672, + "learning_rate": 9.779583434179769e-05, + "loss": 1.1482, + "step": 29710 + }, + { + "epoch": 0.18987260902342104, + "grad_norm": 0.9599489569664001, + "learning_rate": 9.779436071802815e-05, + "loss": 0.8744, + "step": 29720 + }, + { + "epoch": 0.18993649617315972, + "grad_norm": 1.1437122821807861, + "learning_rate": 9.779288661292807e-05, + "loss": 0.9947, + "step": 29730 + }, + { + "epoch": 0.19000038332289843, + "grad_norm": 0.6847367882728577, + "learning_rate": 9.779141202651225e-05, + "loss": 0.6783, + "step": 29740 + }, + { + "epoch": 0.19006427047263713, + "grad_norm": 0.7857696413993835, + "learning_rate": 9.778993695879559e-05, + "loss": 0.9785, + "step": 29750 + }, + { + "epoch": 0.19012815762237584, + "grad_norm": 0.6318495273590088, + "learning_rate": 9.778846140979292e-05, + "loss": 0.8373, + "step": 
29760 + }, + { + "epoch": 0.19019204477211454, + "grad_norm": 1.0563832521438599, + "learning_rate": 9.778698537951908e-05, + "loss": 0.7032, + "step": 29770 + }, + { + "epoch": 0.19025593192185325, + "grad_norm": 0.8542010188102722, + "learning_rate": 9.778550886798898e-05, + "loss": 0.9274, + "step": 29780 + }, + { + "epoch": 0.19031981907159193, + "grad_norm": 0.6458016633987427, + "learning_rate": 9.778403187521746e-05, + "loss": 0.8418, + "step": 29790 + }, + { + "epoch": 0.19038370622133063, + "grad_norm": 1.4049769639968872, + "learning_rate": 9.778255440121937e-05, + "loss": 0.9105, + "step": 29800 + }, + { + "epoch": 0.19044759337106934, + "grad_norm": 1.6816697120666504, + "learning_rate": 9.778107644600964e-05, + "loss": 0.9616, + "step": 29810 + }, + { + "epoch": 0.19051148052080805, + "grad_norm": 0.8408365249633789, + "learning_rate": 9.777959800960314e-05, + "loss": 0.9771, + "step": 29820 + }, + { + "epoch": 0.19057536767054675, + "grad_norm": 0.9713007211685181, + "learning_rate": 9.777811909201476e-05, + "loss": 0.8812, + "step": 29830 + }, + { + "epoch": 0.19063925482028546, + "grad_norm": 0.5639253258705139, + "learning_rate": 9.777663969325938e-05, + "loss": 0.9724, + "step": 29840 + }, + { + "epoch": 0.19070314197002414, + "grad_norm": 1.0495178699493408, + "learning_rate": 9.77751598133519e-05, + "loss": 1.005, + "step": 29850 + }, + { + "epoch": 0.19076702911976284, + "grad_norm": 1.3950402736663818, + "learning_rate": 9.777367945230722e-05, + "loss": 0.6716, + "step": 29860 + }, + { + "epoch": 0.19083091626950155, + "grad_norm": 1.0976344347000122, + "learning_rate": 9.777219861014028e-05, + "loss": 0.7201, + "step": 29870 + }, + { + "epoch": 0.19089480341924026, + "grad_norm": 0.6188146471977234, + "learning_rate": 9.777071728686595e-05, + "loss": 0.8153, + "step": 29880 + }, + { + "epoch": 0.19095869056897896, + "grad_norm": 0.748587965965271, + "learning_rate": 9.776923548249919e-05, + "loss": 1.0403, + "step": 29890 + }, + { + "epoch": 
0.19102257771871767, + "grad_norm": 0.6070273518562317, + "learning_rate": 9.776775319705488e-05, + "loss": 0.7215, + "step": 29900 + }, + { + "epoch": 0.19108646486845635, + "grad_norm": 1.0740474462509155, + "learning_rate": 9.776627043054799e-05, + "loss": 0.9513, + "step": 29910 + }, + { + "epoch": 0.19115035201819505, + "grad_norm": 0.5291925072669983, + "learning_rate": 9.776478718299343e-05, + "loss": 0.6963, + "step": 29920 + }, + { + "epoch": 0.19121423916793376, + "grad_norm": 1.0318714380264282, + "learning_rate": 9.776330345440613e-05, + "loss": 0.7995, + "step": 29930 + }, + { + "epoch": 0.19127812631767246, + "grad_norm": 0.8970870971679688, + "learning_rate": 9.776181924480105e-05, + "loss": 0.9622, + "step": 29940 + }, + { + "epoch": 0.19134201346741117, + "grad_norm": 1.631463885307312, + "learning_rate": 9.776033455419313e-05, + "loss": 0.7972, + "step": 29950 + }, + { + "epoch": 0.19140590061714988, + "grad_norm": 1.0540581941604614, + "learning_rate": 9.775884938259732e-05, + "loss": 1.0735, + "step": 29960 + }, + { + "epoch": 0.19146978776688856, + "grad_norm": 2.7128796577453613, + "learning_rate": 9.775736373002858e-05, + "loss": 0.7245, + "step": 29970 + }, + { + "epoch": 0.19153367491662726, + "grad_norm": 3.253152847290039, + "learning_rate": 9.775587759650186e-05, + "loss": 1.0705, + "step": 29980 + }, + { + "epoch": 0.19159756206636597, + "grad_norm": 0.7138085961341858, + "learning_rate": 9.775439098203216e-05, + "loss": 1.0778, + "step": 29990 + }, + { + "epoch": 0.19166144921610467, + "grad_norm": 1.333784580230713, + "learning_rate": 9.775290388663443e-05, + "loss": 0.7873, + "step": 30000 + }, + { + "epoch": 0.19172533636584338, + "grad_norm": 1.4584836959838867, + "learning_rate": 9.775141631032362e-05, + "loss": 0.827, + "step": 30010 + }, + { + "epoch": 0.19178922351558209, + "grad_norm": 0.7264024019241333, + "learning_rate": 9.774992825311476e-05, + "loss": 0.7283, + "step": 30020 + }, + { + "epoch": 0.19185311066532076, + 
"grad_norm": 0.9471032619476318, + "learning_rate": 9.774843971502282e-05, + "loss": 0.8963, + "step": 30030 + }, + { + "epoch": 0.19191699781505947, + "grad_norm": 0.9348069429397583, + "learning_rate": 9.774695069606275e-05, + "loss": 0.9005, + "step": 30040 + }, + { + "epoch": 0.19198088496479818, + "grad_norm": 0.7033948302268982, + "learning_rate": 9.774546119624961e-05, + "loss": 0.7593, + "step": 30050 + }, + { + "epoch": 0.19204477211453688, + "grad_norm": 0.7773811221122742, + "learning_rate": 9.774397121559836e-05, + "loss": 0.9053, + "step": 30060 + }, + { + "epoch": 0.1921086592642756, + "grad_norm": 1.9900609254837036, + "learning_rate": 9.7742480754124e-05, + "loss": 0.9314, + "step": 30070 + }, + { + "epoch": 0.1921725464140143, + "grad_norm": 0.6554052233695984, + "learning_rate": 9.774098981184158e-05, + "loss": 0.6362, + "step": 30080 + }, + { + "epoch": 0.19223643356375297, + "grad_norm": 1.0862607955932617, + "learning_rate": 9.773949838876608e-05, + "loss": 0.7648, + "step": 30090 + }, + { + "epoch": 0.19230032071349168, + "grad_norm": 0.7586400508880615, + "learning_rate": 9.773800648491252e-05, + "loss": 0.769, + "step": 30100 + }, + { + "epoch": 0.19236420786323039, + "grad_norm": 0.8479837775230408, + "learning_rate": 9.773651410029594e-05, + "loss": 0.802, + "step": 30110 + }, + { + "epoch": 0.1924280950129691, + "grad_norm": 0.5918093323707581, + "learning_rate": 9.773502123493139e-05, + "loss": 0.7993, + "step": 30120 + }, + { + "epoch": 0.1924919821627078, + "grad_norm": 1.4376020431518555, + "learning_rate": 9.773352788883385e-05, + "loss": 0.9593, + "step": 30130 + }, + { + "epoch": 0.1925558693124465, + "grad_norm": 0.9727760553359985, + "learning_rate": 9.77320340620184e-05, + "loss": 0.7962, + "step": 30140 + }, + { + "epoch": 0.1926197564621852, + "grad_norm": 0.9183517098426819, + "learning_rate": 9.773053975450009e-05, + "loss": 0.755, + "step": 30150 + }, + { + "epoch": 0.1926836436119239, + "grad_norm": 1.3329063653945923, + 
"learning_rate": 9.772904496629391e-05, + "loss": 0.747, + "step": 30160 + }, + { + "epoch": 0.1927475307616626, + "grad_norm": 0.7893358469009399, + "learning_rate": 9.7727549697415e-05, + "loss": 0.9337, + "step": 30170 + }, + { + "epoch": 0.1928114179114013, + "grad_norm": 1.3940712213516235, + "learning_rate": 9.772605394787834e-05, + "loss": 0.924, + "step": 30180 + }, + { + "epoch": 0.19287530506114, + "grad_norm": 1.1371750831604004, + "learning_rate": 9.772455771769905e-05, + "loss": 0.7126, + "step": 30190 + }, + { + "epoch": 0.1929391922108787, + "grad_norm": 0.8628626465797424, + "learning_rate": 9.772306100689216e-05, + "loss": 0.965, + "step": 30200 + }, + { + "epoch": 0.19300307936061742, + "grad_norm": 0.5869954228401184, + "learning_rate": 9.772156381547277e-05, + "loss": 0.7079, + "step": 30210 + }, + { + "epoch": 0.1930669665103561, + "grad_norm": 0.6862210035324097, + "learning_rate": 9.772006614345594e-05, + "loss": 0.8432, + "step": 30220 + }, + { + "epoch": 0.1931308536600948, + "grad_norm": 0.9875562191009521, + "learning_rate": 9.771856799085678e-05, + "loss": 1.3028, + "step": 30230 + }, + { + "epoch": 0.1931947408098335, + "grad_norm": 1.2262318134307861, + "learning_rate": 9.771706935769034e-05, + "loss": 0.9413, + "step": 30240 + }, + { + "epoch": 0.19325862795957222, + "grad_norm": 1.6821092367172241, + "learning_rate": 9.771557024397173e-05, + "loss": 0.8401, + "step": 30250 + }, + { + "epoch": 0.19332251510931092, + "grad_norm": 0.5990639925003052, + "learning_rate": 9.771407064971605e-05, + "loss": 0.918, + "step": 30260 + }, + { + "epoch": 0.19338640225904963, + "grad_norm": 0.7293832898139954, + "learning_rate": 9.771257057493841e-05, + "loss": 0.8454, + "step": 30270 + }, + { + "epoch": 0.1934502894087883, + "grad_norm": 0.7124828100204468, + "learning_rate": 9.77110700196539e-05, + "loss": 0.7466, + "step": 30280 + }, + { + "epoch": 0.193514176558527, + "grad_norm": 0.7515029311180115, + "learning_rate": 9.770956898387764e-05, + 
"loss": 0.8641, + "step": 30290 + }, + { + "epoch": 0.19357806370826572, + "grad_norm": 0.7060081958770752, + "learning_rate": 9.770806746762473e-05, + "loss": 0.8651, + "step": 30300 + }, + { + "epoch": 0.19364195085800442, + "grad_norm": 0.7407328486442566, + "learning_rate": 9.770656547091033e-05, + "loss": 1.1405, + "step": 30310 + }, + { + "epoch": 0.19370583800774313, + "grad_norm": 1.009606122970581, + "learning_rate": 9.770506299374953e-05, + "loss": 0.9224, + "step": 30320 + }, + { + "epoch": 0.19376972515748184, + "grad_norm": 1.1087229251861572, + "learning_rate": 9.770356003615749e-05, + "loss": 0.9545, + "step": 30330 + }, + { + "epoch": 0.19383361230722052, + "grad_norm": 0.6406879425048828, + "learning_rate": 9.770205659814931e-05, + "loss": 0.9398, + "step": 30340 + }, + { + "epoch": 0.19389749945695922, + "grad_norm": 0.5704166889190674, + "learning_rate": 9.770055267974017e-05, + "loss": 0.6516, + "step": 30350 + }, + { + "epoch": 0.19396138660669793, + "grad_norm": 0.5956087112426758, + "learning_rate": 9.769904828094519e-05, + "loss": 1.0608, + "step": 30360 + }, + { + "epoch": 0.19402527375643663, + "grad_norm": 1.1136138439178467, + "learning_rate": 9.769754340177953e-05, + "loss": 0.7172, + "step": 30370 + }, + { + "epoch": 0.19408916090617534, + "grad_norm": 0.4953550696372986, + "learning_rate": 9.769603804225833e-05, + "loss": 0.9855, + "step": 30380 + }, + { + "epoch": 0.19415304805591405, + "grad_norm": 1.3780313730239868, + "learning_rate": 9.769453220239677e-05, + "loss": 0.8654, + "step": 30390 + }, + { + "epoch": 0.19421693520565272, + "grad_norm": 1.0662996768951416, + "learning_rate": 9.769302588221002e-05, + "loss": 1.1878, + "step": 30400 + }, + { + "epoch": 0.19428082235539143, + "grad_norm": 0.896293044090271, + "learning_rate": 9.769151908171324e-05, + "loss": 0.9222, + "step": 30410 + }, + { + "epoch": 0.19434470950513014, + "grad_norm": 1.046999454498291, + "learning_rate": 9.769001180092159e-05, + "loss": 0.9972, + "step": 
30420 + }, + { + "epoch": 0.19440859665486884, + "grad_norm": 0.9748583436012268, + "learning_rate": 9.768850403985028e-05, + "loss": 0.7333, + "step": 30430 + }, + { + "epoch": 0.19447248380460755, + "grad_norm": 1.3169922828674316, + "learning_rate": 9.768699579851446e-05, + "loss": 0.7077, + "step": 30440 + }, + { + "epoch": 0.19453637095434625, + "grad_norm": 0.8591229319572449, + "learning_rate": 9.768548707692935e-05, + "loss": 0.7176, + "step": 30450 + }, + { + "epoch": 0.19460025810408493, + "grad_norm": 1.1447664499282837, + "learning_rate": 9.768397787511012e-05, + "loss": 0.7956, + "step": 30460 + }, + { + "epoch": 0.19466414525382364, + "grad_norm": 0.8214355111122131, + "learning_rate": 9.768246819307199e-05, + "loss": 0.9318, + "step": 30470 + }, + { + "epoch": 0.19472803240356235, + "grad_norm": 0.6454271078109741, + "learning_rate": 9.768095803083015e-05, + "loss": 0.9187, + "step": 30480 + }, + { + "epoch": 0.19479191955330105, + "grad_norm": 0.8612026572227478, + "learning_rate": 9.767944738839983e-05, + "loss": 0.8895, + "step": 30490 + }, + { + "epoch": 0.19485580670303976, + "grad_norm": 0.7116665244102478, + "learning_rate": 9.76779362657962e-05, + "loss": 0.9473, + "step": 30500 + }, + { + "epoch": 0.19491969385277846, + "grad_norm": 0.4623630940914154, + "learning_rate": 9.767642466303452e-05, + "loss": 1.0248, + "step": 30510 + }, + { + "epoch": 0.19498358100251714, + "grad_norm": 0.7994482517242432, + "learning_rate": 9.767491258013e-05, + "loss": 1.0697, + "step": 30520 + }, + { + "epoch": 0.19504746815225585, + "grad_norm": 0.9058681130409241, + "learning_rate": 9.767340001709785e-05, + "loss": 1.0001, + "step": 30530 + }, + { + "epoch": 0.19511135530199455, + "grad_norm": 0.8972348570823669, + "learning_rate": 9.767188697395333e-05, + "loss": 0.9495, + "step": 30540 + }, + { + "epoch": 0.19517524245173326, + "grad_norm": 0.664193868637085, + "learning_rate": 9.767037345071166e-05, + "loss": 0.9913, + "step": 30550 + }, + { + "epoch": 
0.19523912960147197, + "grad_norm": 1.2621389627456665, + "learning_rate": 9.766885944738808e-05, + "loss": 0.7485, + "step": 30560 + }, + { + "epoch": 0.19530301675121067, + "grad_norm": 0.7015652656555176, + "learning_rate": 9.766734496399786e-05, + "loss": 0.7023, + "step": 30570 + }, + { + "epoch": 0.19536690390094935, + "grad_norm": 1.066769003868103, + "learning_rate": 9.766583000055625e-05, + "loss": 1.0337, + "step": 30580 + }, + { + "epoch": 0.19543079105068806, + "grad_norm": 1.1455520391464233, + "learning_rate": 9.766431455707847e-05, + "loss": 1.0366, + "step": 30590 + }, + { + "epoch": 0.19549467820042676, + "grad_norm": 1.4529062509536743, + "learning_rate": 9.766279863357982e-05, + "loss": 0.9134, + "step": 30600 + }, + { + "epoch": 0.19555856535016547, + "grad_norm": 0.7042234539985657, + "learning_rate": 9.766128223007556e-05, + "loss": 1.032, + "step": 30610 + }, + { + "epoch": 0.19562245249990418, + "grad_norm": 0.7277450561523438, + "learning_rate": 9.765991705652953e-05, + "loss": 0.9731, + "step": 30620 + }, + { + "epoch": 0.19568633964964288, + "grad_norm": 0.7915880084037781, + "learning_rate": 9.765839974105665e-05, + "loss": 1.0449, + "step": 30630 + }, + { + "epoch": 0.19575022679938156, + "grad_norm": 1.1217659711837769, + "learning_rate": 9.765688194562249e-05, + "loss": 0.816, + "step": 30640 + }, + { + "epoch": 0.19581411394912027, + "grad_norm": 0.7037495374679565, + "learning_rate": 9.765536367024229e-05, + "loss": 0.9901, + "step": 30650 + }, + { + "epoch": 0.19587800109885897, + "grad_norm": 0.8996081352233887, + "learning_rate": 9.765384491493132e-05, + "loss": 0.9512, + "step": 30660 + }, + { + "epoch": 0.19594188824859768, + "grad_norm": 0.543251633644104, + "learning_rate": 9.765232567970493e-05, + "loss": 0.8288, + "step": 30670 + }, + { + "epoch": 0.19600577539833638, + "grad_norm": 0.7527588605880737, + "learning_rate": 9.76508059645784e-05, + "loss": 0.9027, + "step": 30680 + }, + { + "epoch": 0.1960696625480751, + 
"grad_norm": 0.8170384764671326, + "learning_rate": 9.764928576956703e-05, + "loss": 0.8716, + "step": 30690 + }, + { + "epoch": 0.19613354969781377, + "grad_norm": 0.8016200661659241, + "learning_rate": 9.764776509468611e-05, + "loss": 0.9099, + "step": 30700 + }, + { + "epoch": 0.19619743684755248, + "grad_norm": 1.191615343093872, + "learning_rate": 9.764624393995098e-05, + "loss": 0.9785, + "step": 30710 + }, + { + "epoch": 0.19626132399729118, + "grad_norm": 1.0004390478134155, + "learning_rate": 9.764472230537697e-05, + "loss": 1.06, + "step": 30720 + }, + { + "epoch": 0.1963252111470299, + "grad_norm": 0.5032203197479248, + "learning_rate": 9.764320019097938e-05, + "loss": 0.7955, + "step": 30730 + }, + { + "epoch": 0.1963890982967686, + "grad_norm": 1.1866439580917358, + "learning_rate": 9.764167759677354e-05, + "loss": 0.7862, + "step": 30740 + }, + { + "epoch": 0.1964529854465073, + "grad_norm": 0.934973955154419, + "learning_rate": 9.764015452277479e-05, + "loss": 0.9502, + "step": 30750 + }, + { + "epoch": 0.19651687259624598, + "grad_norm": 1.0195708274841309, + "learning_rate": 9.763863096899847e-05, + "loss": 1.0983, + "step": 30760 + }, + { + "epoch": 0.19658075974598468, + "grad_norm": 0.8169684410095215, + "learning_rate": 9.763710693545993e-05, + "loss": 0.7333, + "step": 30770 + }, + { + "epoch": 0.1966446468957234, + "grad_norm": 1.0230990648269653, + "learning_rate": 9.763558242217452e-05, + "loss": 1.1088, + "step": 30780 + }, + { + "epoch": 0.1967085340454621, + "grad_norm": 1.095651388168335, + "learning_rate": 9.763405742915756e-05, + "loss": 0.8304, + "step": 30790 + }, + { + "epoch": 0.1967724211952008, + "grad_norm": 0.717144787311554, + "learning_rate": 9.763253195642446e-05, + "loss": 0.9346, + "step": 30800 + }, + { + "epoch": 0.1968363083449395, + "grad_norm": 3.6631853580474854, + "learning_rate": 9.763100600399053e-05, + "loss": 1.0039, + "step": 30810 + }, + { + "epoch": 0.1969001954946782, + "grad_norm": 0.7753827571868896, + 
"learning_rate": 9.762947957187117e-05, + "loss": 0.986, + "step": 30820 + }, + { + "epoch": 0.1969640826444169, + "grad_norm": 1.0646581649780273, + "learning_rate": 9.762795266008175e-05, + "loss": 0.743, + "step": 30830 + }, + { + "epoch": 0.1970279697941556, + "grad_norm": 0.9290790557861328, + "learning_rate": 9.762642526863765e-05, + "loss": 0.9802, + "step": 30840 + }, + { + "epoch": 0.1970918569438943, + "grad_norm": 1.0001217126846313, + "learning_rate": 9.762489739755423e-05, + "loss": 0.8519, + "step": 30850 + }, + { + "epoch": 0.197155744093633, + "grad_norm": 0.9493054151535034, + "learning_rate": 9.76233690468469e-05, + "loss": 0.844, + "step": 30860 + }, + { + "epoch": 0.19721963124337172, + "grad_norm": 0.775419294834137, + "learning_rate": 9.762184021653104e-05, + "loss": 0.6618, + "step": 30870 + }, + { + "epoch": 0.1972835183931104, + "grad_norm": 0.6491733193397522, + "learning_rate": 9.762031090662205e-05, + "loss": 0.8618, + "step": 30880 + }, + { + "epoch": 0.1973474055428491, + "grad_norm": 0.7320391535758972, + "learning_rate": 9.761878111713534e-05, + "loss": 1.1604, + "step": 30890 + }, + { + "epoch": 0.1974112926925878, + "grad_norm": 0.6711703538894653, + "learning_rate": 9.761725084808629e-05, + "loss": 0.9965, + "step": 30900 + }, + { + "epoch": 0.19747517984232651, + "grad_norm": 0.5033368468284607, + "learning_rate": 9.761572009949035e-05, + "loss": 1.0613, + "step": 30910 + }, + { + "epoch": 0.19753906699206522, + "grad_norm": 0.8021765947341919, + "learning_rate": 9.76141888713629e-05, + "loss": 0.8296, + "step": 30920 + }, + { + "epoch": 0.19760295414180393, + "grad_norm": 0.6978395581245422, + "learning_rate": 9.761265716371938e-05, + "loss": 0.8845, + "step": 30930 + }, + { + "epoch": 0.1976668412915426, + "grad_norm": 0.741265594959259, + "learning_rate": 9.761112497657522e-05, + "loss": 0.8021, + "step": 30940 + }, + { + "epoch": 0.1977307284412813, + "grad_norm": 0.6882484555244446, + "learning_rate": 9.760959230994583e-05, 
+ "loss": 0.7808, + "step": 30950 + }, + { + "epoch": 0.19779461559102002, + "grad_norm": 1.2899192571640015, + "learning_rate": 9.760805916384666e-05, + "loss": 0.9258, + "step": 30960 + }, + { + "epoch": 0.19785850274075872, + "grad_norm": 0.7548243999481201, + "learning_rate": 9.760652553829314e-05, + "loss": 0.749, + "step": 30970 + }, + { + "epoch": 0.19792238989049743, + "grad_norm": 0.8977358341217041, + "learning_rate": 9.760499143330075e-05, + "loss": 0.8827, + "step": 30980 + }, + { + "epoch": 0.19798627704023614, + "grad_norm": 0.8583622574806213, + "learning_rate": 9.760345684888489e-05, + "loss": 0.7604, + "step": 30990 + }, + { + "epoch": 0.19805016418997484, + "grad_norm": 0.930568516254425, + "learning_rate": 9.760192178506104e-05, + "loss": 0.8838, + "step": 31000 + }, + { + "epoch": 0.19811405133971352, + "grad_norm": 0.7296523451805115, + "learning_rate": 9.760038624184466e-05, + "loss": 0.997, + "step": 31010 + }, + { + "epoch": 0.19817793848945223, + "grad_norm": 0.5813782215118408, + "learning_rate": 9.75988502192512e-05, + "loss": 0.873, + "step": 31020 + }, + { + "epoch": 0.19824182563919093, + "grad_norm": 1.0174976587295532, + "learning_rate": 9.759731371729614e-05, + "loss": 0.9311, + "step": 31030 + }, + { + "epoch": 0.19830571278892964, + "grad_norm": 0.6261200904846191, + "learning_rate": 9.759577673599497e-05, + "loss": 0.8865, + "step": 31040 + }, + { + "epoch": 0.19836959993866835, + "grad_norm": 0.5916396975517273, + "learning_rate": 9.759423927536316e-05, + "loss": 0.7946, + "step": 31050 + }, + { + "epoch": 0.19843348708840705, + "grad_norm": 1.060449242591858, + "learning_rate": 9.759270133541616e-05, + "loss": 1.2101, + "step": 31060 + }, + { + "epoch": 0.19849737423814573, + "grad_norm": 0.5522297620773315, + "learning_rate": 9.759116291616948e-05, + "loss": 0.86, + "step": 31070 + }, + { + "epoch": 0.19856126138788444, + "grad_norm": 1.0017218589782715, + "learning_rate": 9.758962401763863e-05, + "loss": 0.8776, + "step": 
31080 + }, + { + "epoch": 0.19862514853762314, + "grad_norm": 0.8278487920761108, + "learning_rate": 9.758808463983911e-05, + "loss": 1.0988, + "step": 31090 + }, + { + "epoch": 0.19868903568736185, + "grad_norm": 0.8800287246704102, + "learning_rate": 9.758654478278638e-05, + "loss": 0.8976, + "step": 31100 + }, + { + "epoch": 0.19875292283710055, + "grad_norm": 0.7034065127372742, + "learning_rate": 9.758500444649598e-05, + "loss": 1.0156, + "step": 31110 + }, + { + "epoch": 0.19881680998683926, + "grad_norm": 1.02751886844635, + "learning_rate": 9.758346363098344e-05, + "loss": 0.9064, + "step": 31120 + }, + { + "epoch": 0.19888069713657794, + "grad_norm": 0.8063342571258545, + "learning_rate": 9.758192233626425e-05, + "loss": 0.9177, + "step": 31130 + }, + { + "epoch": 0.19894458428631664, + "grad_norm": 2.721904754638672, + "learning_rate": 9.758038056235393e-05, + "loss": 0.9505, + "step": 31140 + }, + { + "epoch": 0.19900847143605535, + "grad_norm": 1.0083937644958496, + "learning_rate": 9.757883830926801e-05, + "loss": 1.1092, + "step": 31150 + }, + { + "epoch": 0.19907235858579406, + "grad_norm": 0.841985821723938, + "learning_rate": 9.757729557702202e-05, + "loss": 0.7708, + "step": 31160 + }, + { + "epoch": 0.19913624573553276, + "grad_norm": 0.6755800843238831, + "learning_rate": 9.757575236563152e-05, + "loss": 0.7743, + "step": 31170 + }, + { + "epoch": 0.19920013288527147, + "grad_norm": 0.7885231971740723, + "learning_rate": 9.757420867511202e-05, + "loss": 0.9718, + "step": 31180 + }, + { + "epoch": 0.19926402003501015, + "grad_norm": 1.2030565738677979, + "learning_rate": 9.75726645054791e-05, + "loss": 0.9473, + "step": 31190 + }, + { + "epoch": 0.19932790718474885, + "grad_norm": 1.3136283159255981, + "learning_rate": 9.757111985674828e-05, + "loss": 1.0381, + "step": 31200 + }, + { + "epoch": 0.19939179433448756, + "grad_norm": 0.6797472834587097, + "learning_rate": 9.756957472893513e-05, + "loss": 1.0419, + "step": 31210 + }, + { + "epoch": 
0.19945568148422627, + "grad_norm": 0.7219412922859192, + "learning_rate": 9.756802912205522e-05, + "loss": 1.0792, + "step": 31220 + }, + { + "epoch": 0.19951956863396497, + "grad_norm": 0.8325220346450806, + "learning_rate": 9.756648303612409e-05, + "loss": 0.7956, + "step": 31230 + }, + { + "epoch": 0.19958345578370368, + "grad_norm": 0.9289294481277466, + "learning_rate": 9.756493647115734e-05, + "loss": 0.7096, + "step": 31240 + }, + { + "epoch": 0.19964734293344236, + "grad_norm": 0.908420205116272, + "learning_rate": 9.756338942717051e-05, + "loss": 0.8602, + "step": 31250 + }, + { + "epoch": 0.19971123008318106, + "grad_norm": 2.6106882095336914, + "learning_rate": 9.756184190417921e-05, + "loss": 0.9356, + "step": 31260 + }, + { + "epoch": 0.19977511723291977, + "grad_norm": 0.8880581259727478, + "learning_rate": 9.756029390219901e-05, + "loss": 0.7862, + "step": 31270 + }, + { + "epoch": 0.19983900438265847, + "grad_norm": 0.6896887421607971, + "learning_rate": 9.755874542124551e-05, + "loss": 0.9089, + "step": 31280 + }, + { + "epoch": 0.19990289153239718, + "grad_norm": 1.0063308477401733, + "learning_rate": 9.75571964613343e-05, + "loss": 1.0273, + "step": 31290 + }, + { + "epoch": 0.1999667786821359, + "grad_norm": 1.277763843536377, + "learning_rate": 9.755564702248099e-05, + "loss": 1.0963, + "step": 31300 + }, + { + "epoch": 0.20003066583187457, + "grad_norm": 0.6711148023605347, + "learning_rate": 9.755409710470116e-05, + "loss": 1.016, + "step": 31310 + }, + { + "epoch": 0.20009455298161327, + "grad_norm": 1.1493245363235474, + "learning_rate": 9.755254670801042e-05, + "loss": 0.6895, + "step": 31320 + }, + { + "epoch": 0.20015844013135198, + "grad_norm": 0.5734307765960693, + "learning_rate": 9.755099583242442e-05, + "loss": 0.9138, + "step": 31330 + }, + { + "epoch": 0.20022232728109068, + "grad_norm": 0.6204320788383484, + "learning_rate": 9.754944447795874e-05, + "loss": 0.8158, + "step": 31340 + }, + { + "epoch": 0.2002862144308294, + 
"grad_norm": 1.0882511138916016, + "learning_rate": 9.754789264462902e-05, + "loss": 0.7919, + "step": 31350 + }, + { + "epoch": 0.2003501015805681, + "grad_norm": 1.2842504978179932, + "learning_rate": 9.754634033245089e-05, + "loss": 1.1121, + "step": 31360 + }, + { + "epoch": 0.20041398873030677, + "grad_norm": 0.7042865753173828, + "learning_rate": 9.754478754143998e-05, + "loss": 0.8598, + "step": 31370 + }, + { + "epoch": 0.20047787588004548, + "grad_norm": 0.7466055154800415, + "learning_rate": 9.754323427161191e-05, + "loss": 0.7496, + "step": 31380 + }, + { + "epoch": 0.2005417630297842, + "grad_norm": 1.2161649465560913, + "learning_rate": 9.754168052298237e-05, + "loss": 0.9603, + "step": 31390 + }, + { + "epoch": 0.2006056501795229, + "grad_norm": 0.871167778968811, + "learning_rate": 9.754012629556696e-05, + "loss": 1.0315, + "step": 31400 + }, + { + "epoch": 0.2006695373292616, + "grad_norm": 0.853158175945282, + "learning_rate": 9.753857158938135e-05, + "loss": 0.9701, + "step": 31410 + }, + { + "epoch": 0.2007334244790003, + "grad_norm": 0.5134825706481934, + "learning_rate": 9.753701640444121e-05, + "loss": 0.9838, + "step": 31420 + }, + { + "epoch": 0.20079731162873898, + "grad_norm": 0.7412970662117004, + "learning_rate": 9.753546074076217e-05, + "loss": 0.802, + "step": 31430 + }, + { + "epoch": 0.2008611987784777, + "grad_norm": 1.8500874042510986, + "learning_rate": 9.753390459835993e-05, + "loss": 0.711, + "step": 31440 + }, + { + "epoch": 0.2009250859282164, + "grad_norm": 0.685453474521637, + "learning_rate": 9.753234797725015e-05, + "loss": 0.9091, + "step": 31450 + }, + { + "epoch": 0.2009889730779551, + "grad_norm": 1.9982002973556519, + "learning_rate": 9.75307908774485e-05, + "loss": 0.8882, + "step": 31460 + }, + { + "epoch": 0.2010528602276938, + "grad_norm": 1.9767764806747437, + "learning_rate": 9.752923329897066e-05, + "loss": 0.8807, + "step": 31470 + }, + { + "epoch": 0.20111674737743251, + "grad_norm": 0.6557339429855347, + 
"learning_rate": 9.752767524183233e-05, + "loss": 0.8447, + "step": 31480 + }, + { + "epoch": 0.2011806345271712, + "grad_norm": 0.6405972838401794, + "learning_rate": 9.752611670604919e-05, + "loss": 0.8889, + "step": 31490 + }, + { + "epoch": 0.2012445216769099, + "grad_norm": 0.8593305349349976, + "learning_rate": 9.752455769163693e-05, + "loss": 1.1378, + "step": 31500 + }, + { + "epoch": 0.2013084088266486, + "grad_norm": 0.6940191984176636, + "learning_rate": 9.752299819861127e-05, + "loss": 0.9958, + "step": 31510 + }, + { + "epoch": 0.2013722959763873, + "grad_norm": 0.8981072306632996, + "learning_rate": 9.752143822698789e-05, + "loss": 0.8305, + "step": 31520 + }, + { + "epoch": 0.20143618312612602, + "grad_norm": 1.0248847007751465, + "learning_rate": 9.751987777678253e-05, + "loss": 0.91, + "step": 31530 + }, + { + "epoch": 0.20150007027586472, + "grad_norm": 0.8903045654296875, + "learning_rate": 9.751831684801089e-05, + "loss": 0.8491, + "step": 31540 + }, + { + "epoch": 0.2015639574256034, + "grad_norm": 1.1542670726776123, + "learning_rate": 9.75167554406887e-05, + "loss": 0.8601, + "step": 31550 + }, + { + "epoch": 0.2016278445753421, + "grad_norm": 0.7678368091583252, + "learning_rate": 9.751519355483166e-05, + "loss": 0.8247, + "step": 31560 + }, + { + "epoch": 0.20169173172508081, + "grad_norm": 0.9471594095230103, + "learning_rate": 9.75136311904555e-05, + "loss": 0.9038, + "step": 31570 + }, + { + "epoch": 0.20175561887481952, + "grad_norm": 0.8465635180473328, + "learning_rate": 9.7512068347576e-05, + "loss": 0.788, + "step": 31580 + }, + { + "epoch": 0.20181950602455823, + "grad_norm": 0.9624682664871216, + "learning_rate": 9.751050502620885e-05, + "loss": 1.0697, + "step": 31590 + }, + { + "epoch": 0.20188339317429693, + "grad_norm": 0.569759726524353, + "learning_rate": 9.750894122636982e-05, + "loss": 1.0777, + "step": 31600 + }, + { + "epoch": 0.2019472803240356, + "grad_norm": 3.1683318614959717, + "learning_rate": 
9.750737694807464e-05, + "loss": 0.9169, + "step": 31610 + }, + { + "epoch": 0.20201116747377432, + "grad_norm": 0.7441072463989258, + "learning_rate": 9.75058121913391e-05, + "loss": 0.771, + "step": 31620 + }, + { + "epoch": 0.20207505462351302, + "grad_norm": 1.1185020208358765, + "learning_rate": 9.75042469561789e-05, + "loss": 0.8128, + "step": 31630 + }, + { + "epoch": 0.20213894177325173, + "grad_norm": 0.7714232206344604, + "learning_rate": 9.750268124260987e-05, + "loss": 0.8612, + "step": 31640 + }, + { + "epoch": 0.20220282892299044, + "grad_norm": 0.6368833184242249, + "learning_rate": 9.75011150506477e-05, + "loss": 1.076, + "step": 31650 + }, + { + "epoch": 0.20226671607272914, + "grad_norm": 1.164900779724121, + "learning_rate": 9.749954838030824e-05, + "loss": 0.9611, + "step": 31660 + }, + { + "epoch": 0.20233060322246782, + "grad_norm": 0.66245436668396, + "learning_rate": 9.749798123160723e-05, + "loss": 0.8932, + "step": 31670 + }, + { + "epoch": 0.20239449037220653, + "grad_norm": 0.7968323826789856, + "learning_rate": 9.749641360456045e-05, + "loss": 1.0375, + "step": 31680 + }, + { + "epoch": 0.20245837752194523, + "grad_norm": 1.1304694414138794, + "learning_rate": 9.749484549918371e-05, + "loss": 0.843, + "step": 31690 + }, + { + "epoch": 0.20252226467168394, + "grad_norm": 1.4667329788208008, + "learning_rate": 9.749327691549277e-05, + "loss": 0.8078, + "step": 31700 + }, + { + "epoch": 0.20258615182142264, + "grad_norm": 0.8826027512550354, + "learning_rate": 9.749170785350344e-05, + "loss": 1.0263, + "step": 31710 + }, + { + "epoch": 0.20265003897116135, + "grad_norm": 0.7443497180938721, + "learning_rate": 9.749013831323154e-05, + "loss": 0.7889, + "step": 31720 + }, + { + "epoch": 0.20271392612090003, + "grad_norm": 0.5433924198150635, + "learning_rate": 9.748856829469287e-05, + "loss": 0.9073, + "step": 31730 + }, + { + "epoch": 0.20277781327063873, + "grad_norm": 0.5322934985160828, + "learning_rate": 9.74869977979032e-05, + "loss": 
0.7924, + "step": 31740 + }, + { + "epoch": 0.20284170042037744, + "grad_norm": 1.0181642770767212, + "learning_rate": 9.748542682287841e-05, + "loss": 0.7738, + "step": 31750 + }, + { + "epoch": 0.20290558757011615, + "grad_norm": 0.8533402681350708, + "learning_rate": 9.74838553696343e-05, + "loss": 1.1269, + "step": 31760 + }, + { + "epoch": 0.20296947471985485, + "grad_norm": 0.6573584079742432, + "learning_rate": 9.748228343818666e-05, + "loss": 0.9684, + "step": 31770 + }, + { + "epoch": 0.20303336186959356, + "grad_norm": 1.141799807548523, + "learning_rate": 9.748071102855135e-05, + "loss": 1.0159, + "step": 31780 + }, + { + "epoch": 0.20309724901933224, + "grad_norm": 2.4994301795959473, + "learning_rate": 9.747913814074421e-05, + "loss": 0.7334, + "step": 31790 + }, + { + "epoch": 0.20316113616907094, + "grad_norm": 1.0525953769683838, + "learning_rate": 9.747756477478108e-05, + "loss": 0.9094, + "step": 31800 + }, + { + "epoch": 0.20322502331880965, + "grad_norm": 0.6493773460388184, + "learning_rate": 9.747599093067779e-05, + "loss": 0.7929, + "step": 31810 + }, + { + "epoch": 0.20328891046854836, + "grad_norm": 1.622753381729126, + "learning_rate": 9.747441660845021e-05, + "loss": 1.3227, + "step": 31820 + }, + { + "epoch": 0.20335279761828706, + "grad_norm": 0.932706356048584, + "learning_rate": 9.747284180811417e-05, + "loss": 1.2109, + "step": 31830 + }, + { + "epoch": 0.20341668476802577, + "grad_norm": 0.706366240978241, + "learning_rate": 9.747126652968554e-05, + "loss": 1.0372, + "step": 31840 + }, + { + "epoch": 0.20348057191776447, + "grad_norm": 0.5680680871009827, + "learning_rate": 9.74696907731802e-05, + "loss": 0.8115, + "step": 31850 + }, + { + "epoch": 0.20354445906750315, + "grad_norm": 0.7886488437652588, + "learning_rate": 9.7468114538614e-05, + "loss": 0.8516, + "step": 31860 + }, + { + "epoch": 0.20360834621724186, + "grad_norm": 1.467068076133728, + "learning_rate": 9.746653782600284e-05, + "loss": 0.9352, + "step": 31870 + }, + { 
+ "epoch": 0.20367223336698057, + "grad_norm": 0.6054574251174927, + "learning_rate": 9.746496063536254e-05, + "loss": 0.738, + "step": 31880 + }, + { + "epoch": 0.20373612051671927, + "grad_norm": 0.8049781322479248, + "learning_rate": 9.746338296670906e-05, + "loss": 0.9212, + "step": 31890 + }, + { + "epoch": 0.20380000766645798, + "grad_norm": 2.8067591190338135, + "learning_rate": 9.746180482005825e-05, + "loss": 1.1401, + "step": 31900 + }, + { + "epoch": 0.20386389481619668, + "grad_norm": 0.5604707598686218, + "learning_rate": 9.746022619542599e-05, + "loss": 0.7448, + "step": 31910 + }, + { + "epoch": 0.20392778196593536, + "grad_norm": 0.6594801545143127, + "learning_rate": 9.745864709282819e-05, + "loss": 1.0038, + "step": 31920 + }, + { + "epoch": 0.20399166911567407, + "grad_norm": 0.7814098000526428, + "learning_rate": 9.745706751228076e-05, + "loss": 1.0487, + "step": 31930 + }, + { + "epoch": 0.20405555626541277, + "grad_norm": 0.892376184463501, + "learning_rate": 9.745548745379961e-05, + "loss": 1.006, + "step": 31940 + }, + { + "epoch": 0.20411944341515148, + "grad_norm": 0.4663401246070862, + "learning_rate": 9.745390691740064e-05, + "loss": 0.9555, + "step": 31950 + }, + { + "epoch": 0.2041833305648902, + "grad_norm": 0.9874062538146973, + "learning_rate": 9.745232590309978e-05, + "loss": 1.0092, + "step": 31960 + }, + { + "epoch": 0.2042472177146289, + "grad_norm": 0.5330253839492798, + "learning_rate": 9.745074441091294e-05, + "loss": 1.0081, + "step": 31970 + }, + { + "epoch": 0.20431110486436757, + "grad_norm": 1.0687589645385742, + "learning_rate": 9.744916244085606e-05, + "loss": 0.8934, + "step": 31980 + }, + { + "epoch": 0.20437499201410628, + "grad_norm": 0.6077286601066589, + "learning_rate": 9.744757999294506e-05, + "loss": 0.8938, + "step": 31990 + }, + { + "epoch": 0.20443887916384498, + "grad_norm": 0.6717079281806946, + "learning_rate": 9.744599706719588e-05, + "loss": 0.9467, + "step": 32000 + }, + { + "epoch": 
0.2045027663135837, + "grad_norm": 1.032605767250061, + "learning_rate": 9.744441366362447e-05, + "loss": 0.9648, + "step": 32010 + }, + { + "epoch": 0.2045666534633224, + "grad_norm": 0.6703940629959106, + "learning_rate": 9.744282978224677e-05, + "loss": 0.7152, + "step": 32020 + }, + { + "epoch": 0.2046305406130611, + "grad_norm": 1.4983042478561401, + "learning_rate": 9.744124542307871e-05, + "loss": 0.9562, + "step": 32030 + }, + { + "epoch": 0.20469442776279978, + "grad_norm": 0.7340278029441833, + "learning_rate": 9.743966058613629e-05, + "loss": 0.9512, + "step": 32040 + }, + { + "epoch": 0.20475831491253849, + "grad_norm": 2.0036234855651855, + "learning_rate": 9.743807527143544e-05, + "loss": 1.1441, + "step": 32050 + }, + { + "epoch": 0.2048222020622772, + "grad_norm": 0.8495148420333862, + "learning_rate": 9.743648947899214e-05, + "loss": 1.0051, + "step": 32060 + }, + { + "epoch": 0.2048860892120159, + "grad_norm": 1.4452283382415771, + "learning_rate": 9.743490320882234e-05, + "loss": 0.8346, + "step": 32070 + }, + { + "epoch": 0.2049499763617546, + "grad_norm": 0.7870922684669495, + "learning_rate": 9.743331646094202e-05, + "loss": 0.8006, + "step": 32080 + }, + { + "epoch": 0.2050138635114933, + "grad_norm": 0.8627803325653076, + "learning_rate": 9.743172923536718e-05, + "loss": 0.947, + "step": 32090 + }, + { + "epoch": 0.205077750661232, + "grad_norm": 1.4049910306930542, + "learning_rate": 9.74301415321138e-05, + "loss": 1.1061, + "step": 32100 + }, + { + "epoch": 0.2051416378109707, + "grad_norm": 0.6403430700302124, + "learning_rate": 9.742855335119785e-05, + "loss": 0.8099, + "step": 32110 + }, + { + "epoch": 0.2052055249607094, + "grad_norm": 1.0958514213562012, + "learning_rate": 9.742696469263533e-05, + "loss": 0.9353, + "step": 32120 + }, + { + "epoch": 0.2052694121104481, + "grad_norm": 0.828372597694397, + "learning_rate": 9.742537555644225e-05, + "loss": 0.7152, + "step": 32130 + }, + { + "epoch": 0.2053332992601868, + "grad_norm": 
0.743424654006958, + "learning_rate": 9.742378594263461e-05, + "loss": 0.8165, + "step": 32140 + }, + { + "epoch": 0.20539718640992552, + "grad_norm": 1.0300029516220093, + "learning_rate": 9.742219585122843e-05, + "loss": 1.0059, + "step": 32150 + }, + { + "epoch": 0.2054610735596642, + "grad_norm": 0.9428716897964478, + "learning_rate": 9.74206052822397e-05, + "loss": 0.9221, + "step": 32160 + }, + { + "epoch": 0.2055249607094029, + "grad_norm": 0.9042668342590332, + "learning_rate": 9.741901423568446e-05, + "loss": 0.8297, + "step": 32170 + }, + { + "epoch": 0.2055888478591416, + "grad_norm": 1.2374792098999023, + "learning_rate": 9.741742271157872e-05, + "loss": 0.8647, + "step": 32180 + }, + { + "epoch": 0.20565273500888032, + "grad_norm": 0.9123538136482239, + "learning_rate": 9.74158307099385e-05, + "loss": 0.6822, + "step": 32190 + }, + { + "epoch": 0.20571662215861902, + "grad_norm": 0.725796103477478, + "learning_rate": 9.741423823077986e-05, + "loss": 1.1005, + "step": 32200 + }, + { + "epoch": 0.20578050930835773, + "grad_norm": 0.8962036371231079, + "learning_rate": 9.741264527411881e-05, + "loss": 1.0891, + "step": 32210 + }, + { + "epoch": 0.2058443964580964, + "grad_norm": 0.9846658110618591, + "learning_rate": 9.741105183997141e-05, + "loss": 1.0041, + "step": 32220 + }, + { + "epoch": 0.2059082836078351, + "grad_norm": 0.4427562654018402, + "learning_rate": 9.74094579283537e-05, + "loss": 0.8606, + "step": 32230 + }, + { + "epoch": 0.20597217075757382, + "grad_norm": 0.8591815829277039, + "learning_rate": 9.740786353928173e-05, + "loss": 1.0499, + "step": 32240 + }, + { + "epoch": 0.20603605790731253, + "grad_norm": 0.5261662602424622, + "learning_rate": 9.740626867277157e-05, + "loss": 0.9264, + "step": 32250 + }, + { + "epoch": 0.20609994505705123, + "grad_norm": 1.2539498805999756, + "learning_rate": 9.740467332883926e-05, + "loss": 1.0337, + "step": 32260 + }, + { + "epoch": 0.20616383220678994, + "grad_norm": 0.6254390478134155, + 
"learning_rate": 9.740307750750088e-05, + "loss": 1.0999, + "step": 32270 + }, + { + "epoch": 0.20622771935652862, + "grad_norm": 0.6762027144432068, + "learning_rate": 9.740148120877251e-05, + "loss": 0.9724, + "step": 32280 + }, + { + "epoch": 0.20629160650626732, + "grad_norm": 0.9390422105789185, + "learning_rate": 9.73998844326702e-05, + "loss": 0.8626, + "step": 32290 + }, + { + "epoch": 0.20635549365600603, + "grad_norm": 0.8526495695114136, + "learning_rate": 9.739828717921006e-05, + "loss": 0.911, + "step": 32300 + }, + { + "epoch": 0.20641938080574473, + "grad_norm": 1.050434947013855, + "learning_rate": 9.739668944840817e-05, + "loss": 1.0802, + "step": 32310 + }, + { + "epoch": 0.20648326795548344, + "grad_norm": 0.8968641757965088, + "learning_rate": 9.739509124028062e-05, + "loss": 1.0353, + "step": 32320 + }, + { + "epoch": 0.20654715510522215, + "grad_norm": 0.9247165322303772, + "learning_rate": 9.739349255484346e-05, + "loss": 1.1142, + "step": 32330 + }, + { + "epoch": 0.20661104225496082, + "grad_norm": 0.7122106552124023, + "learning_rate": 9.739189339211286e-05, + "loss": 1.0356, + "step": 32340 + }, + { + "epoch": 0.20667492940469953, + "grad_norm": 0.5841015577316284, + "learning_rate": 9.739029375210489e-05, + "loss": 0.9243, + "step": 32350 + }, + { + "epoch": 0.20673881655443824, + "grad_norm": 0.7304105758666992, + "learning_rate": 9.738869363483565e-05, + "loss": 0.8895, + "step": 32360 + }, + { + "epoch": 0.20680270370417694, + "grad_norm": 0.9879099726676941, + "learning_rate": 9.738709304032128e-05, + "loss": 1.0733, + "step": 32370 + }, + { + "epoch": 0.20686659085391565, + "grad_norm": 1.092883586883545, + "learning_rate": 9.738549196857789e-05, + "loss": 0.9595, + "step": 32380 + }, + { + "epoch": 0.20693047800365436, + "grad_norm": 0.8127654194831848, + "learning_rate": 9.738389041962159e-05, + "loss": 0.6739, + "step": 32390 + }, + { + "epoch": 0.20699436515339303, + "grad_norm": 0.60942542552948, + "learning_rate": 
9.738228839346853e-05, + "loss": 0.829, + "step": 32400 + }, + { + "epoch": 0.20705825230313174, + "grad_norm": 1.1465409994125366, + "learning_rate": 9.738068589013483e-05, + "loss": 0.7232, + "step": 32410 + }, + { + "epoch": 0.20712213945287045, + "grad_norm": 0.6177552342414856, + "learning_rate": 9.737908290963663e-05, + "loss": 0.8286, + "step": 32420 + }, + { + "epoch": 0.20718602660260915, + "grad_norm": 0.5419365763664246, + "learning_rate": 9.737747945199009e-05, + "loss": 0.8722, + "step": 32430 + }, + { + "epoch": 0.20724991375234786, + "grad_norm": 1.0209770202636719, + "learning_rate": 9.737587551721132e-05, + "loss": 0.9187, + "step": 32440 + }, + { + "epoch": 0.20731380090208656, + "grad_norm": 0.7830290198326111, + "learning_rate": 9.737427110531652e-05, + "loss": 1.1599, + "step": 32450 + }, + { + "epoch": 0.20737768805182524, + "grad_norm": 1.0259994268417358, + "learning_rate": 9.737266621632182e-05, + "loss": 1.1211, + "step": 32460 + }, + { + "epoch": 0.20744157520156395, + "grad_norm": 0.6848270893096924, + "learning_rate": 9.73710608502434e-05, + "loss": 0.7989, + "step": 32470 + }, + { + "epoch": 0.20750546235130266, + "grad_norm": 0.779099702835083, + "learning_rate": 9.736945500709737e-05, + "loss": 0.9887, + "step": 32480 + }, + { + "epoch": 0.20756934950104136, + "grad_norm": 0.7140209078788757, + "learning_rate": 9.736784868689999e-05, + "loss": 1.075, + "step": 32490 + }, + { + "epoch": 0.20763323665078007, + "grad_norm": 0.7910488247871399, + "learning_rate": 9.736624188966738e-05, + "loss": 1.1467, + "step": 32500 + }, + { + "epoch": 0.20769712380051877, + "grad_norm": 0.8852772116661072, + "learning_rate": 9.736463461541574e-05, + "loss": 1.0504, + "step": 32510 + }, + { + "epoch": 0.20776101095025745, + "grad_norm": 1.6205745935440063, + "learning_rate": 9.736302686416126e-05, + "loss": 0.8582, + "step": 32520 + }, + { + "epoch": 0.20782489809999616, + "grad_norm": 0.9984052777290344, + "learning_rate": 9.736141863592012e-05, + 
"loss": 1.0526, + "step": 32530 + }, + { + "epoch": 0.20788878524973486, + "grad_norm": 0.7698317170143127, + "learning_rate": 9.735980993070852e-05, + "loss": 0.8745, + "step": 32540 + }, + { + "epoch": 0.20795267239947357, + "grad_norm": 1.8012065887451172, + "learning_rate": 9.735820074854265e-05, + "loss": 0.9542, + "step": 32550 + }, + { + "epoch": 0.20801655954921228, + "grad_norm": 0.7188138365745544, + "learning_rate": 9.735659108943876e-05, + "loss": 0.6682, + "step": 32560 + }, + { + "epoch": 0.20808044669895098, + "grad_norm": 0.7604565620422363, + "learning_rate": 9.7354980953413e-05, + "loss": 0.7246, + "step": 32570 + }, + { + "epoch": 0.20814433384868966, + "grad_norm": 0.6722016334533691, + "learning_rate": 9.735337034048162e-05, + "loss": 0.8719, + "step": 32580 + }, + { + "epoch": 0.20820822099842837, + "grad_norm": 0.5613377690315247, + "learning_rate": 9.735175925066082e-05, + "loss": 0.8531, + "step": 32590 + }, + { + "epoch": 0.20827210814816707, + "grad_norm": 1.168945550918579, + "learning_rate": 9.735014768396686e-05, + "loss": 1.1047, + "step": 32600 + }, + { + "epoch": 0.20833599529790578, + "grad_norm": 0.7283167243003845, + "learning_rate": 9.734853564041595e-05, + "loss": 0.7414, + "step": 32610 + }, + { + "epoch": 0.20839988244764449, + "grad_norm": 0.8897091150283813, + "learning_rate": 9.734692312002431e-05, + "loss": 0.9406, + "step": 32620 + }, + { + "epoch": 0.2084637695973832, + "grad_norm": 0.6193281412124634, + "learning_rate": 9.734531012280821e-05, + "loss": 0.8429, + "step": 32630 + }, + { + "epoch": 0.20852765674712187, + "grad_norm": 1.2287752628326416, + "learning_rate": 9.734369664878387e-05, + "loss": 0.9993, + "step": 32640 + }, + { + "epoch": 0.20859154389686058, + "grad_norm": 1.4086371660232544, + "learning_rate": 9.734208269796754e-05, + "loss": 0.6823, + "step": 32650 + }, + { + "epoch": 0.20865543104659928, + "grad_norm": 0.9113640785217285, + "learning_rate": 9.734046827037548e-05, + "loss": 1.1112, + "step": 
32660 + }, + { + "epoch": 0.208719318196338, + "grad_norm": 0.7698211073875427, + "learning_rate": 9.733885336602396e-05, + "loss": 0.7977, + "step": 32670 + }, + { + "epoch": 0.2087832053460767, + "grad_norm": 2.5170323848724365, + "learning_rate": 9.733723798492921e-05, + "loss": 0.8861, + "step": 32680 + }, + { + "epoch": 0.2088470924958154, + "grad_norm": 0.5907607078552246, + "learning_rate": 9.733562212710755e-05, + "loss": 0.8325, + "step": 32690 + }, + { + "epoch": 0.2089109796455541, + "grad_norm": 0.7293870449066162, + "learning_rate": 9.733400579257521e-05, + "loss": 0.8956, + "step": 32700 + }, + { + "epoch": 0.20897486679529279, + "grad_norm": 0.9861850738525391, + "learning_rate": 9.733238898134848e-05, + "loss": 0.8441, + "step": 32710 + }, + { + "epoch": 0.2090387539450315, + "grad_norm": 0.8502741456031799, + "learning_rate": 9.733077169344366e-05, + "loss": 0.7623, + "step": 32720 + }, + { + "epoch": 0.2091026410947702, + "grad_norm": 0.6573517322540283, + "learning_rate": 9.7329153928877e-05, + "loss": 1.134, + "step": 32730 + }, + { + "epoch": 0.2091665282445089, + "grad_norm": 1.0283352136611938, + "learning_rate": 9.732753568766482e-05, + "loss": 1.1342, + "step": 32740 + }, + { + "epoch": 0.2092304153942476, + "grad_norm": 0.9217149019241333, + "learning_rate": 9.732591696982343e-05, + "loss": 0.7505, + "step": 32750 + }, + { + "epoch": 0.20929430254398632, + "grad_norm": 1.5344794988632202, + "learning_rate": 9.732429777536909e-05, + "loss": 0.8524, + "step": 32760 + }, + { + "epoch": 0.209358189693725, + "grad_norm": 0.6569311022758484, + "learning_rate": 9.732267810431814e-05, + "loss": 0.9557, + "step": 32770 + }, + { + "epoch": 0.2094220768434637, + "grad_norm": 1.269944190979004, + "learning_rate": 9.732105795668689e-05, + "loss": 0.8407, + "step": 32780 + }, + { + "epoch": 0.2094859639932024, + "grad_norm": 0.914414644241333, + "learning_rate": 9.731943733249164e-05, + "loss": 0.7725, + "step": 32790 + }, + { + "epoch": 
0.2095498511429411, + "grad_norm": 0.5438032746315002, + "learning_rate": 9.731781623174871e-05, + "loss": 0.9418, + "step": 32800 + }, + { + "epoch": 0.20961373829267982, + "grad_norm": 0.9533820152282715, + "learning_rate": 9.731619465447445e-05, + "loss": 0.7887, + "step": 32810 + }, + { + "epoch": 0.20967762544241852, + "grad_norm": 0.9719078540802002, + "learning_rate": 9.731457260068517e-05, + "loss": 1.0511, + "step": 32820 + }, + { + "epoch": 0.2097415125921572, + "grad_norm": 0.8131768107414246, + "learning_rate": 9.73129500703972e-05, + "loss": 0.7443, + "step": 32830 + }, + { + "epoch": 0.2098053997418959, + "grad_norm": 0.9436559081077576, + "learning_rate": 9.731132706362692e-05, + "loss": 0.7655, + "step": 32840 + }, + { + "epoch": 0.20986928689163462, + "grad_norm": 0.6353892683982849, + "learning_rate": 9.730970358039062e-05, + "loss": 0.9139, + "step": 32850 + }, + { + "epoch": 0.20993317404137332, + "grad_norm": 1.7300466299057007, + "learning_rate": 9.730807962070467e-05, + "loss": 0.8533, + "step": 32860 + }, + { + "epoch": 0.20999706119111203, + "grad_norm": 0.9070175886154175, + "learning_rate": 9.730645518458545e-05, + "loss": 1.0384, + "step": 32870 + }, + { + "epoch": 0.21006094834085073, + "grad_norm": 1.630418300628662, + "learning_rate": 9.73048302720493e-05, + "loss": 1.1833, + "step": 32880 + }, + { + "epoch": 0.2101248354905894, + "grad_norm": 0.6094731092453003, + "learning_rate": 9.730320488311258e-05, + "loss": 0.8528, + "step": 32890 + }, + { + "epoch": 0.21018872264032812, + "grad_norm": 0.9163777828216553, + "learning_rate": 9.730157901779165e-05, + "loss": 0.986, + "step": 32900 + }, + { + "epoch": 0.21025260979006682, + "grad_norm": 0.885759174823761, + "learning_rate": 9.729995267610293e-05, + "loss": 1.0211, + "step": 32910 + }, + { + "epoch": 0.21031649693980553, + "grad_norm": 0.6660359501838684, + "learning_rate": 9.729832585806273e-05, + "loss": 0.8855, + "step": 32920 + }, + { + "epoch": 0.21038038408954424, + 
"grad_norm": 0.9728102087974548, + "learning_rate": 9.729669856368748e-05, + "loss": 0.9548, + "step": 32930 + }, + { + "epoch": 0.21044427123928294, + "grad_norm": 0.8899286985397339, + "learning_rate": 9.729507079299359e-05, + "loss": 0.98, + "step": 32940 + }, + { + "epoch": 0.21050815838902162, + "grad_norm": 0.8630788326263428, + "learning_rate": 9.729344254599738e-05, + "loss": 0.8842, + "step": 32950 + }, + { + "epoch": 0.21057204553876033, + "grad_norm": 1.159555435180664, + "learning_rate": 9.72918138227153e-05, + "loss": 1.0565, + "step": 32960 + }, + { + "epoch": 0.21063593268849903, + "grad_norm": 0.9720593690872192, + "learning_rate": 9.729018462316375e-05, + "loss": 0.8663, + "step": 32970 + }, + { + "epoch": 0.21069981983823774, + "grad_norm": 1.0807291269302368, + "learning_rate": 9.728855494735914e-05, + "loss": 0.7609, + "step": 32980 + }, + { + "epoch": 0.21076370698797645, + "grad_norm": 0.9693974852561951, + "learning_rate": 9.728692479531784e-05, + "loss": 0.9466, + "step": 32990 + }, + { + "epoch": 0.21082759413771515, + "grad_norm": 1.1828261613845825, + "learning_rate": 9.728529416705632e-05, + "loss": 1.17, + "step": 33000 + }, + { + "epoch": 0.21089148128745383, + "grad_norm": 0.8070554733276367, + "learning_rate": 9.728366306259098e-05, + "loss": 0.999, + "step": 33010 + }, + { + "epoch": 0.21095536843719254, + "grad_norm": 0.6054061651229858, + "learning_rate": 9.728203148193824e-05, + "loss": 0.7462, + "step": 33020 + }, + { + "epoch": 0.21101925558693124, + "grad_norm": 0.9334638714790344, + "learning_rate": 9.728039942511453e-05, + "loss": 0.8478, + "step": 33030 + }, + { + "epoch": 0.21108314273666995, + "grad_norm": 0.692486584186554, + "learning_rate": 9.727876689213631e-05, + "loss": 0.9051, + "step": 33040 + }, + { + "epoch": 0.21114702988640865, + "grad_norm": 0.7370048761367798, + "learning_rate": 9.727713388302e-05, + "loss": 1.131, + "step": 33050 + }, + { + "epoch": 0.21121091703614736, + "grad_norm": 0.8169997930526733, + 
"learning_rate": 9.727550039778205e-05, + "loss": 0.762, + "step": 33060 + }, + { + "epoch": 0.21127480418588604, + "grad_norm": 1.1108886003494263, + "learning_rate": 9.727386643643891e-05, + "loss": 0.8818, + "step": 33070 + }, + { + "epoch": 0.21133869133562475, + "grad_norm": 2.2037575244903564, + "learning_rate": 9.727223199900704e-05, + "loss": 0.9574, + "step": 33080 + }, + { + "epoch": 0.21140257848536345, + "grad_norm": 0.820559024810791, + "learning_rate": 9.72705970855029e-05, + "loss": 1.0319, + "step": 33090 + }, + { + "epoch": 0.21146646563510216, + "grad_norm": 0.6320390701293945, + "learning_rate": 9.726896169594295e-05, + "loss": 0.8773, + "step": 33100 + }, + { + "epoch": 0.21153035278484086, + "grad_norm": 0.6292109489440918, + "learning_rate": 9.726732583034365e-05, + "loss": 0.7979, + "step": 33110 + }, + { + "epoch": 0.21159423993457957, + "grad_norm": 1.0046201944351196, + "learning_rate": 9.72656894887215e-05, + "loss": 0.7807, + "step": 33120 + }, + { + "epoch": 0.21165812708431825, + "grad_norm": 0.8816448450088501, + "learning_rate": 9.726405267109297e-05, + "loss": 0.7321, + "step": 33130 + }, + { + "epoch": 0.21172201423405695, + "grad_norm": 0.9356503486633301, + "learning_rate": 9.726241537747454e-05, + "loss": 0.791, + "step": 33140 + }, + { + "epoch": 0.21178590138379566, + "grad_norm": 0.8952210545539856, + "learning_rate": 9.72607776078827e-05, + "loss": 0.9033, + "step": 33150 + }, + { + "epoch": 0.21184978853353437, + "grad_norm": 0.6787972450256348, + "learning_rate": 9.725913936233393e-05, + "loss": 0.8994, + "step": 33160 + }, + { + "epoch": 0.21191367568327307, + "grad_norm": 1.112884759902954, + "learning_rate": 9.725750064084476e-05, + "loss": 0.8439, + "step": 33170 + }, + { + "epoch": 0.21197756283301178, + "grad_norm": 1.08254873752594, + "learning_rate": 9.725586144343166e-05, + "loss": 0.8901, + "step": 33180 + }, + { + "epoch": 0.21204144998275046, + "grad_norm": 0.7427080273628235, + "learning_rate": 
9.725422177011116e-05, + "loss": 0.9528, + "step": 33190 + }, + { + "epoch": 0.21210533713248916, + "grad_norm": 0.6845873594284058, + "learning_rate": 9.725274565723552e-05, + "loss": 1.1284, + "step": 33200 + }, + { + "epoch": 0.21216922428222787, + "grad_norm": 2.6716866493225098, + "learning_rate": 9.725110507973644e-05, + "loss": 0.9867, + "step": 33210 + }, + { + "epoch": 0.21223311143196658, + "grad_norm": 1.6081085205078125, + "learning_rate": 9.724946402637786e-05, + "loss": 0.6687, + "step": 33220 + }, + { + "epoch": 0.21229699858170528, + "grad_norm": 0.7291703820228577, + "learning_rate": 9.724782249717628e-05, + "loss": 0.8611, + "step": 33230 + }, + { + "epoch": 0.212360885731444, + "grad_norm": 0.6999391317367554, + "learning_rate": 9.724618049214828e-05, + "loss": 0.8015, + "step": 33240 + }, + { + "epoch": 0.21242477288118267, + "grad_norm": 0.7499661445617676, + "learning_rate": 9.724453801131035e-05, + "loss": 0.8521, + "step": 33250 + }, + { + "epoch": 0.21248866003092137, + "grad_norm": 1.027510404586792, + "learning_rate": 9.724289505467906e-05, + "loss": 1.0125, + "step": 33260 + }, + { + "epoch": 0.21255254718066008, + "grad_norm": 1.0336750745773315, + "learning_rate": 9.724125162227095e-05, + "loss": 0.8207, + "step": 33270 + }, + { + "epoch": 0.21261643433039878, + "grad_norm": 0.8094274401664734, + "learning_rate": 9.723960771410256e-05, + "loss": 0.7034, + "step": 33280 + }, + { + "epoch": 0.2126803214801375, + "grad_norm": 0.9066417813301086, + "learning_rate": 9.723796333019044e-05, + "loss": 0.8273, + "step": 33290 + }, + { + "epoch": 0.2127442086298762, + "grad_norm": 1.2769392728805542, + "learning_rate": 9.723631847055119e-05, + "loss": 0.792, + "step": 33300 + }, + { + "epoch": 0.21280809577961488, + "grad_norm": 0.751732349395752, + "learning_rate": 9.723467313520133e-05, + "loss": 0.8004, + "step": 33310 + }, + { + "epoch": 0.21287198292935358, + "grad_norm": 0.7040248513221741, + "learning_rate": 9.723302732415745e-05, + 
"loss": 1.0993, + "step": 33320 + }, + { + "epoch": 0.2129358700790923, + "grad_norm": 0.6100977063179016, + "learning_rate": 9.723138103743612e-05, + "loss": 0.7998, + "step": 33330 + }, + { + "epoch": 0.212999757228831, + "grad_norm": 1.0050344467163086, + "learning_rate": 9.722973427505391e-05, + "loss": 0.8967, + "step": 33340 + }, + { + "epoch": 0.2130636443785697, + "grad_norm": 0.5379306674003601, + "learning_rate": 9.722808703702743e-05, + "loss": 0.7652, + "step": 33350 + }, + { + "epoch": 0.2131275315283084, + "grad_norm": 0.6813077330589294, + "learning_rate": 9.722643932337327e-05, + "loss": 1.2678, + "step": 33360 + }, + { + "epoch": 0.21319141867804708, + "grad_norm": 1.1152585744857788, + "learning_rate": 9.722479113410799e-05, + "loss": 0.9101, + "step": 33370 + }, + { + "epoch": 0.2132553058277858, + "grad_norm": 0.8351494073867798, + "learning_rate": 9.722314246924822e-05, + "loss": 0.8285, + "step": 33380 + }, + { + "epoch": 0.2133191929775245, + "grad_norm": 0.7308449149131775, + "learning_rate": 9.722149332881054e-05, + "loss": 1.1201, + "step": 33390 + }, + { + "epoch": 0.2133830801272632, + "grad_norm": 1.078356385231018, + "learning_rate": 9.721984371281158e-05, + "loss": 0.9609, + "step": 33400 + }, + { + "epoch": 0.2134469672770019, + "grad_norm": 1.385568380355835, + "learning_rate": 9.721819362126793e-05, + "loss": 0.9715, + "step": 33410 + }, + { + "epoch": 0.21351085442674061, + "grad_norm": 0.8912048935890198, + "learning_rate": 9.721654305419623e-05, + "loss": 0.7701, + "step": 33420 + }, + { + "epoch": 0.2135747415764793, + "grad_norm": 0.7083896994590759, + "learning_rate": 9.721489201161309e-05, + "loss": 0.8202, + "step": 33430 + }, + { + "epoch": 0.213638628726218, + "grad_norm": 0.6518615484237671, + "learning_rate": 9.721324049353515e-05, + "loss": 0.7974, + "step": 33440 + }, + { + "epoch": 0.2137025158759567, + "grad_norm": 0.7615000605583191, + "learning_rate": 9.721158849997903e-05, + "loss": 0.9024, + "step": 33450 + }, + 
{ + "epoch": 0.2137664030256954, + "grad_norm": 0.6199432611465454, + "learning_rate": 9.720993603096136e-05, + "loss": 1.0076, + "step": 33460 + }, + { + "epoch": 0.21383029017543412, + "grad_norm": 0.6537955403327942, + "learning_rate": 9.720828308649879e-05, + "loss": 0.9644, + "step": 33470 + }, + { + "epoch": 0.21389417732517282, + "grad_norm": 0.8364148139953613, + "learning_rate": 9.720662966660799e-05, + "loss": 0.7734, + "step": 33480 + }, + { + "epoch": 0.21395806447491153, + "grad_norm": 0.8252184391021729, + "learning_rate": 9.720497577130557e-05, + "loss": 0.9241, + "step": 33490 + }, + { + "epoch": 0.2140219516246502, + "grad_norm": 1.0425599813461304, + "learning_rate": 9.72033214006082e-05, + "loss": 0.6872, + "step": 33500 + }, + { + "epoch": 0.21408583877438891, + "grad_norm": 0.7613168358802795, + "learning_rate": 9.720166655453256e-05, + "loss": 0.8292, + "step": 33510 + }, + { + "epoch": 0.21414972592412762, + "grad_norm": 0.7358224391937256, + "learning_rate": 9.72000112330953e-05, + "loss": 0.7993, + "step": 33520 + }, + { + "epoch": 0.21421361307386633, + "grad_norm": 1.4351872205734253, + "learning_rate": 9.71983554363131e-05, + "loss": 0.9427, + "step": 33530 + }, + { + "epoch": 0.21427750022360503, + "grad_norm": 0.9211145043373108, + "learning_rate": 9.719669916420262e-05, + "loss": 0.7403, + "step": 33540 + }, + { + "epoch": 0.21434138737334374, + "grad_norm": 0.5790296792984009, + "learning_rate": 9.719504241678054e-05, + "loss": 0.77, + "step": 33550 + }, + { + "epoch": 0.21440527452308242, + "grad_norm": 1.3659369945526123, + "learning_rate": 9.719338519406358e-05, + "loss": 0.9941, + "step": 33560 + }, + { + "epoch": 0.21446916167282112, + "grad_norm": 0.6189954876899719, + "learning_rate": 9.719172749606838e-05, + "loss": 0.8592, + "step": 33570 + }, + { + "epoch": 0.21453304882255983, + "grad_norm": 0.8214682936668396, + "learning_rate": 9.719006932281167e-05, + "loss": 0.7411, + "step": 33580 + }, + { + "epoch": 
0.21459693597229854, + "grad_norm": 0.5750226974487305, + "learning_rate": 9.718841067431013e-05, + "loss": 0.7238, + "step": 33590 + }, + { + "epoch": 0.21466082312203724, + "grad_norm": 1.5233280658721924, + "learning_rate": 9.718675155058046e-05, + "loss": 0.7061, + "step": 33600 + }, + { + "epoch": 0.21472471027177595, + "grad_norm": 0.5941923260688782, + "learning_rate": 9.718509195163939e-05, + "loss": 1.0065, + "step": 33610 + }, + { + "epoch": 0.21478859742151463, + "grad_norm": 0.8326600790023804, + "learning_rate": 9.718343187750363e-05, + "loss": 0.8198, + "step": 33620 + }, + { + "epoch": 0.21485248457125333, + "grad_norm": 0.6903313994407654, + "learning_rate": 9.718177132818988e-05, + "loss": 0.8067, + "step": 33630 + }, + { + "epoch": 0.21491637172099204, + "grad_norm": 1.647194266319275, + "learning_rate": 9.71801103037149e-05, + "loss": 0.8966, + "step": 33640 + }, + { + "epoch": 0.21498025887073074, + "grad_norm": 0.6679027080535889, + "learning_rate": 9.717844880409537e-05, + "loss": 0.7546, + "step": 33650 + }, + { + "epoch": 0.21504414602046945, + "grad_norm": 0.8270406723022461, + "learning_rate": 9.717678682934803e-05, + "loss": 1.068, + "step": 33660 + }, + { + "epoch": 0.21510803317020816, + "grad_norm": 0.6147032976150513, + "learning_rate": 9.717512437948966e-05, + "loss": 0.747, + "step": 33670 + }, + { + "epoch": 0.21517192031994684, + "grad_norm": 1.2196052074432373, + "learning_rate": 9.717346145453696e-05, + "loss": 0.7214, + "step": 33680 + }, + { + "epoch": 0.21523580746968554, + "grad_norm": 1.0216395854949951, + "learning_rate": 9.717179805450671e-05, + "loss": 0.8437, + "step": 33690 + }, + { + "epoch": 0.21529969461942425, + "grad_norm": 0.7304588556289673, + "learning_rate": 9.717013417941563e-05, + "loss": 0.6288, + "step": 33700 + }, + { + "epoch": 0.21536358176916295, + "grad_norm": 1.711125135421753, + "learning_rate": 9.716846982928049e-05, + "loss": 0.8811, + "step": 33710 + }, + { + "epoch": 0.21542746891890166, + 
"grad_norm": 0.868000864982605, + "learning_rate": 9.716680500411805e-05, + "loss": 0.857, + "step": 33720 + }, + { + "epoch": 0.21549135606864037, + "grad_norm": 0.7319660186767578, + "learning_rate": 9.716513970394509e-05, + "loss": 0.8252, + "step": 33730 + }, + { + "epoch": 0.21555524321837904, + "grad_norm": 0.9054515361785889, + "learning_rate": 9.716347392877836e-05, + "loss": 0.8681, + "step": 33740 + }, + { + "epoch": 0.21561913036811775, + "grad_norm": 1.218607783317566, + "learning_rate": 9.716180767863465e-05, + "loss": 0.9609, + "step": 33750 + }, + { + "epoch": 0.21568301751785646, + "grad_norm": 0.9217560291290283, + "learning_rate": 9.716014095353075e-05, + "loss": 0.8119, + "step": 33760 + }, + { + "epoch": 0.21574690466759516, + "grad_norm": 0.7078598141670227, + "learning_rate": 9.715847375348342e-05, + "loss": 0.9151, + "step": 33770 + }, + { + "epoch": 0.21581079181733387, + "grad_norm": 0.7617483139038086, + "learning_rate": 9.715680607850945e-05, + "loss": 0.9346, + "step": 33780 + }, + { + "epoch": 0.21587467896707258, + "grad_norm": 0.7594091892242432, + "learning_rate": 9.715513792862565e-05, + "loss": 0.9478, + "step": 33790 + }, + { + "epoch": 0.21593856611681125, + "grad_norm": 0.9850571155548096, + "learning_rate": 9.715346930384882e-05, + "loss": 0.7815, + "step": 33800 + }, + { + "epoch": 0.21600245326654996, + "grad_norm": 0.8838279843330383, + "learning_rate": 9.715180020419576e-05, + "loss": 1.0338, + "step": 33810 + }, + { + "epoch": 0.21606634041628867, + "grad_norm": 0.7649998068809509, + "learning_rate": 9.715013062968328e-05, + "loss": 0.839, + "step": 33820 + }, + { + "epoch": 0.21613022756602737, + "grad_norm": 0.8073322176933289, + "learning_rate": 9.71484605803282e-05, + "loss": 1.0359, + "step": 33830 + }, + { + "epoch": 0.21619411471576608, + "grad_norm": 1.914969563484192, + "learning_rate": 9.714679005614733e-05, + "loss": 0.972, + "step": 33840 + }, + { + "epoch": 0.21625800186550478, + "grad_norm": 
0.781913161277771, + "learning_rate": 9.714511905715749e-05, + "loss": 1.2603, + "step": 33850 + }, + { + "epoch": 0.21632188901524346, + "grad_norm": 0.5499342083930969, + "learning_rate": 9.714344758337553e-05, + "loss": 1.0211, + "step": 33860 + }, + { + "epoch": 0.21638577616498217, + "grad_norm": 2.390815496444702, + "learning_rate": 9.714177563481824e-05, + "loss": 1.1886, + "step": 33870 + }, + { + "epoch": 0.21644966331472087, + "grad_norm": 2.6002392768859863, + "learning_rate": 9.71401032115025e-05, + "loss": 1.0595, + "step": 33880 + }, + { + "epoch": 0.21651355046445958, + "grad_norm": 0.8145592212677002, + "learning_rate": 9.713843031344515e-05, + "loss": 0.8558, + "step": 33890 + }, + { + "epoch": 0.2165774376141983, + "grad_norm": 0.7605422139167786, + "learning_rate": 9.713675694066302e-05, + "loss": 0.79, + "step": 33900 + }, + { + "epoch": 0.216641324763937, + "grad_norm": 0.9282397031784058, + "learning_rate": 9.713508309317296e-05, + "loss": 0.8963, + "step": 33910 + }, + { + "epoch": 0.21670521191367567, + "grad_norm": 0.6586880683898926, + "learning_rate": 9.713340877099183e-05, + "loss": 0.7421, + "step": 33920 + }, + { + "epoch": 0.21676909906341438, + "grad_norm": 0.9235056042671204, + "learning_rate": 9.713173397413652e-05, + "loss": 0.8292, + "step": 33930 + }, + { + "epoch": 0.21683298621315308, + "grad_norm": 0.7915987372398376, + "learning_rate": 9.713005870262386e-05, + "loss": 0.8096, + "step": 33940 + }, + { + "epoch": 0.2168968733628918, + "grad_norm": 0.5287061333656311, + "learning_rate": 9.712838295647074e-05, + "loss": 0.7746, + "step": 33950 + }, + { + "epoch": 0.2169607605126305, + "grad_norm": 0.7330449819564819, + "learning_rate": 9.712670673569403e-05, + "loss": 0.9486, + "step": 33960 + }, + { + "epoch": 0.2170246476623692, + "grad_norm": 0.7698398232460022, + "learning_rate": 9.712503004031061e-05, + "loss": 0.9407, + "step": 33970 + }, + { + "epoch": 0.21708853481210788, + "grad_norm": 5.071091651916504, + 
"learning_rate": 9.712335287033739e-05, + "loss": 1.046, + "step": 33980 + }, + { + "epoch": 0.2171524219618466, + "grad_norm": 0.8342990875244141, + "learning_rate": 9.712167522579121e-05, + "loss": 0.7953, + "step": 33990 + }, + { + "epoch": 0.2172163091115853, + "grad_norm": 1.221957802772522, + "learning_rate": 9.7119997106689e-05, + "loss": 0.8411, + "step": 34000 + }, + { + "epoch": 0.217280196261324, + "grad_norm": 1.421647548675537, + "learning_rate": 9.711831851304767e-05, + "loss": 0.8459, + "step": 34010 + }, + { + "epoch": 0.2173440834110627, + "grad_norm": 1.0833210945129395, + "learning_rate": 9.71166394448841e-05, + "loss": 0.9682, + "step": 34020 + }, + { + "epoch": 0.2174079705608014, + "grad_norm": 0.7942554354667664, + "learning_rate": 9.71149599022152e-05, + "loss": 0.9468, + "step": 34030 + }, + { + "epoch": 0.2174718577105401, + "grad_norm": 0.5950953364372253, + "learning_rate": 9.71132798850579e-05, + "loss": 0.9885, + "step": 34040 + }, + { + "epoch": 0.2175357448602788, + "grad_norm": 1.1501030921936035, + "learning_rate": 9.711159939342911e-05, + "loss": 0.7241, + "step": 34050 + }, + { + "epoch": 0.2175996320100175, + "grad_norm": 0.8352699875831604, + "learning_rate": 9.710991842734577e-05, + "loss": 0.9376, + "step": 34060 + }, + { + "epoch": 0.2176635191597562, + "grad_norm": 1.2237290143966675, + "learning_rate": 9.710823698682478e-05, + "loss": 0.8397, + "step": 34070 + }, + { + "epoch": 0.21772740630949491, + "grad_norm": 1.1886348724365234, + "learning_rate": 9.71065550718831e-05, + "loss": 0.8056, + "step": 34080 + }, + { + "epoch": 0.21779129345923362, + "grad_norm": 0.954849362373352, + "learning_rate": 9.710487268253765e-05, + "loss": 0.9837, + "step": 34090 + }, + { + "epoch": 0.2178551806089723, + "grad_norm": 0.7035555243492126, + "learning_rate": 9.710318981880539e-05, + "loss": 0.7794, + "step": 34100 + }, + { + "epoch": 0.217919067758711, + "grad_norm": 1.048746109008789, + "learning_rate": 9.710150648070325e-05, + 
"loss": 0.8262, + "step": 34110 + }, + { + "epoch": 0.2179829549084497, + "grad_norm": 0.8809221386909485, + "learning_rate": 9.70998226682482e-05, + "loss": 1.0516, + "step": 34120 + }, + { + "epoch": 0.21804684205818842, + "grad_norm": 1.0661201477050781, + "learning_rate": 9.709813838145718e-05, + "loss": 1.0833, + "step": 34130 + }, + { + "epoch": 0.21811072920792712, + "grad_norm": 1.1189355850219727, + "learning_rate": 9.709645362034716e-05, + "loss": 1.1234, + "step": 34140 + }, + { + "epoch": 0.21817461635766583, + "grad_norm": 0.872307538986206, + "learning_rate": 9.709476838493511e-05, + "loss": 1.0436, + "step": 34150 + }, + { + "epoch": 0.2182385035074045, + "grad_norm": 0.6649029850959778, + "learning_rate": 9.709308267523801e-05, + "loss": 0.8959, + "step": 34160 + }, + { + "epoch": 0.2183023906571432, + "grad_norm": 0.6744316220283508, + "learning_rate": 9.70913964912728e-05, + "loss": 0.9037, + "step": 34170 + }, + { + "epoch": 0.21836627780688192, + "grad_norm": 1.0881192684173584, + "learning_rate": 9.708970983305652e-05, + "loss": 0.8183, + "step": 34180 + }, + { + "epoch": 0.21843016495662063, + "grad_norm": 0.9044772386550903, + "learning_rate": 9.70880227006061e-05, + "loss": 0.9755, + "step": 34190 + }, + { + "epoch": 0.21849405210635933, + "grad_norm": 0.9986025094985962, + "learning_rate": 9.708633509393856e-05, + "loss": 0.9058, + "step": 34200 + }, + { + "epoch": 0.21855793925609804, + "grad_norm": 0.7025921940803528, + "learning_rate": 9.70846470130709e-05, + "loss": 1.0454, + "step": 34210 + }, + { + "epoch": 0.21862182640583672, + "grad_norm": 0.6166189312934875, + "learning_rate": 9.70829584580201e-05, + "loss": 0.9536, + "step": 34220 + }, + { + "epoch": 0.21868571355557542, + "grad_norm": 1.0105708837509155, + "learning_rate": 9.708126942880318e-05, + "loss": 0.7328, + "step": 34230 + }, + { + "epoch": 0.21874960070531413, + "grad_norm": 0.7658517956733704, + "learning_rate": 9.707957992543714e-05, + "loss": 0.753, + "step": 34240 + 
}, + { + "epoch": 0.21881348785505284, + "grad_norm": 0.8330119252204895, + "learning_rate": 9.707788994793901e-05, + "loss": 0.9129, + "step": 34250 + }, + { + "epoch": 0.21887737500479154, + "grad_norm": 1.216202735900879, + "learning_rate": 9.707619949632578e-05, + "loss": 0.7501, + "step": 34260 + }, + { + "epoch": 0.21894126215453025, + "grad_norm": 0.7274483442306519, + "learning_rate": 9.707450857061452e-05, + "loss": 0.8814, + "step": 34270 + }, + { + "epoch": 0.21900514930426893, + "grad_norm": 0.7238608598709106, + "learning_rate": 9.707281717082222e-05, + "loss": 0.7132, + "step": 34280 + }, + { + "epoch": 0.21906903645400763, + "grad_norm": 0.736379861831665, + "learning_rate": 9.707112529696594e-05, + "loss": 0.8236, + "step": 34290 + }, + { + "epoch": 0.21913292360374634, + "grad_norm": 0.8833523988723755, + "learning_rate": 9.706943294906268e-05, + "loss": 1.0377, + "step": 34300 + }, + { + "epoch": 0.21919681075348504, + "grad_norm": 0.7226671576499939, + "learning_rate": 9.706774012712953e-05, + "loss": 0.9242, + "step": 34310 + }, + { + "epoch": 0.21926069790322375, + "grad_norm": 1.3238605260849, + "learning_rate": 9.706604683118353e-05, + "loss": 0.9551, + "step": 34320 + }, + { + "epoch": 0.21932458505296246, + "grad_norm": 2.324223279953003, + "learning_rate": 9.706435306124169e-05, + "loss": 1.0707, + "step": 34330 + }, + { + "epoch": 0.21938847220270116, + "grad_norm": 0.6457687020301819, + "learning_rate": 9.70626588173211e-05, + "loss": 0.8856, + "step": 34340 + }, + { + "epoch": 0.21945235935243984, + "grad_norm": 0.7554599642753601, + "learning_rate": 9.706096409943883e-05, + "loss": 0.8271, + "step": 34350 + }, + { + "epoch": 0.21951624650217855, + "grad_norm": 1.154531478881836, + "learning_rate": 9.705926890761195e-05, + "loss": 1.2138, + "step": 34360 + }, + { + "epoch": 0.21958013365191725, + "grad_norm": 0.8493779897689819, + "learning_rate": 9.705757324185751e-05, + "loss": 0.7191, + "step": 34370 + }, + { + "epoch": 
0.21964402080165596, + "grad_norm": 1.1541070938110352, + "learning_rate": 9.705587710219259e-05, + "loss": 0.9184, + "step": 34380 + }, + { + "epoch": 0.21970790795139467, + "grad_norm": 2.6271910667419434, + "learning_rate": 9.705418048863429e-05, + "loss": 1.0036, + "step": 34390 + }, + { + "epoch": 0.21977179510113337, + "grad_norm": 0.7804545164108276, + "learning_rate": 9.705248340119968e-05, + "loss": 1.1445, + "step": 34400 + }, + { + "epoch": 0.21983568225087205, + "grad_norm": 0.515604555606842, + "learning_rate": 9.705078583990586e-05, + "loss": 0.729, + "step": 34410 + }, + { + "epoch": 0.21989956940061076, + "grad_norm": 0.9133629202842712, + "learning_rate": 9.704908780476991e-05, + "loss": 1.0537, + "step": 34420 + }, + { + "epoch": 0.21996345655034946, + "grad_norm": 1.274163842201233, + "learning_rate": 9.704738929580896e-05, + "loss": 1.0591, + "step": 34430 + }, + { + "epoch": 0.22002734370008817, + "grad_norm": 0.45899906754493713, + "learning_rate": 9.704569031304009e-05, + "loss": 0.6701, + "step": 34440 + }, + { + "epoch": 0.22009123084982687, + "grad_norm": 0.942436933517456, + "learning_rate": 9.704399085648041e-05, + "loss": 0.9153, + "step": 34450 + }, + { + "epoch": 0.22015511799956558, + "grad_norm": 1.0042204856872559, + "learning_rate": 9.704229092614705e-05, + "loss": 0.8758, + "step": 34460 + }, + { + "epoch": 0.22021900514930426, + "grad_norm": 0.4676646292209625, + "learning_rate": 9.704059052205712e-05, + "loss": 0.7552, + "step": 34470 + }, + { + "epoch": 0.22028289229904296, + "grad_norm": 0.8477068543434143, + "learning_rate": 9.703888964422775e-05, + "loss": 0.8348, + "step": 34480 + }, + { + "epoch": 0.22034677944878167, + "grad_norm": 1.006347417831421, + "learning_rate": 9.703718829267607e-05, + "loss": 0.9339, + "step": 34490 + }, + { + "epoch": 0.22041066659852038, + "grad_norm": 0.8507176637649536, + "learning_rate": 9.703548646741923e-05, + "loss": 0.9948, + "step": 34500 + }, + { + "epoch": 0.22047455374825908, + 
"grad_norm": 0.9493306279182434, + "learning_rate": 9.703378416847431e-05, + "loss": 0.7232, + "step": 34510 + }, + { + "epoch": 0.2205384408979978, + "grad_norm": 0.7349863052368164, + "learning_rate": 9.703208139585851e-05, + "loss": 0.7541, + "step": 34520 + }, + { + "epoch": 0.22060232804773647, + "grad_norm": 0.8959886431694031, + "learning_rate": 9.703037814958898e-05, + "loss": 0.9639, + "step": 34530 + }, + { + "epoch": 0.22066621519747517, + "grad_norm": 0.6771888136863708, + "learning_rate": 9.702867442968283e-05, + "loss": 0.9092, + "step": 34540 + }, + { + "epoch": 0.22073010234721388, + "grad_norm": 0.784125804901123, + "learning_rate": 9.702697023615726e-05, + "loss": 0.8621, + "step": 34550 + }, + { + "epoch": 0.2207939894969526, + "grad_norm": 1.009945273399353, + "learning_rate": 9.70252655690294e-05, + "loss": 0.7527, + "step": 34560 + }, + { + "epoch": 0.2208578766466913, + "grad_norm": 1.0403534173965454, + "learning_rate": 9.702356042831643e-05, + "loss": 1.0779, + "step": 34570 + }, + { + "epoch": 0.22092176379643, + "grad_norm": 0.9144579172134399, + "learning_rate": 9.702185481403555e-05, + "loss": 0.9942, + "step": 34580 + }, + { + "epoch": 0.22098565094616868, + "grad_norm": 1.012250542640686, + "learning_rate": 9.702014872620388e-05, + "loss": 0.8412, + "step": 34590 + }, + { + "epoch": 0.22104953809590738, + "grad_norm": 1.3977776765823364, + "learning_rate": 9.701844216483866e-05, + "loss": 0.9844, + "step": 34600 + }, + { + "epoch": 0.2211134252456461, + "grad_norm": 0.8186967372894287, + "learning_rate": 9.701673512995704e-05, + "loss": 0.8303, + "step": 34610 + }, + { + "epoch": 0.2211773123953848, + "grad_norm": 0.7828638553619385, + "learning_rate": 9.701502762157623e-05, + "loss": 0.9695, + "step": 34620 + }, + { + "epoch": 0.2212411995451235, + "grad_norm": 0.9973053336143494, + "learning_rate": 9.701331963971341e-05, + "loss": 0.8977, + "step": 34630 + }, + { + "epoch": 0.2213050866948622, + "grad_norm": 1.1445131301879883, + 
"learning_rate": 9.70116111843858e-05, + "loss": 0.8871, + "step": 34640 + }, + { + "epoch": 0.22136897384460089, + "grad_norm": 0.8758741617202759, + "learning_rate": 9.700990225561058e-05, + "loss": 0.81, + "step": 34650 + }, + { + "epoch": 0.2214328609943396, + "grad_norm": 0.49622881412506104, + "learning_rate": 9.700819285340497e-05, + "loss": 0.8899, + "step": 34660 + }, + { + "epoch": 0.2214967481440783, + "grad_norm": 0.9389495253562927, + "learning_rate": 9.700648297778621e-05, + "loss": 0.867, + "step": 34670 + }, + { + "epoch": 0.221560635293817, + "grad_norm": 2.2437360286712646, + "learning_rate": 9.700477262877149e-05, + "loss": 1.0428, + "step": 34680 + }, + { + "epoch": 0.2216245224435557, + "grad_norm": 1.3925631046295166, + "learning_rate": 9.700306180637804e-05, + "loss": 0.91, + "step": 34690 + }, + { + "epoch": 0.22168840959329442, + "grad_norm": 1.310964822769165, + "learning_rate": 9.700135051062312e-05, + "loss": 0.8114, + "step": 34700 + }, + { + "epoch": 0.2217522967430331, + "grad_norm": 1.04167902469635, + "learning_rate": 9.699963874152392e-05, + "loss": 0.7845, + "step": 34710 + }, + { + "epoch": 0.2218161838927718, + "grad_norm": 0.9633674621582031, + "learning_rate": 9.699792649909768e-05, + "loss": 0.6929, + "step": 34720 + }, + { + "epoch": 0.2218800710425105, + "grad_norm": 0.6973922252655029, + "learning_rate": 9.699621378336168e-05, + "loss": 0.7923, + "step": 34730 + }, + { + "epoch": 0.2219439581922492, + "grad_norm": 0.6631523370742798, + "learning_rate": 9.699450059433314e-05, + "loss": 0.8096, + "step": 34740 + }, + { + "epoch": 0.22200784534198792, + "grad_norm": 1.064477801322937, + "learning_rate": 9.699278693202933e-05, + "loss": 0.9907, + "step": 34750 + }, + { + "epoch": 0.22207173249172663, + "grad_norm": 1.0626312494277954, + "learning_rate": 9.699107279646751e-05, + "loss": 0.7736, + "step": 34760 + }, + { + "epoch": 0.2221356196414653, + "grad_norm": 0.5820396542549133, + "learning_rate": 9.698935818766493e-05, + 
"loss": 0.7869, + "step": 34770 + }, + { + "epoch": 0.222199506791204, + "grad_norm": 0.7940320372581482, + "learning_rate": 9.698764310563885e-05, + "loss": 0.8672, + "step": 34780 + }, + { + "epoch": 0.22226339394094272, + "grad_norm": 0.9088238477706909, + "learning_rate": 9.698592755040657e-05, + "loss": 0.8374, + "step": 34790 + }, + { + "epoch": 0.22232728109068142, + "grad_norm": 1.1797140836715698, + "learning_rate": 9.698421152198533e-05, + "loss": 1.074, + "step": 34800 + }, + { + "epoch": 0.22239116824042013, + "grad_norm": 0.7393913269042969, + "learning_rate": 9.698249502039243e-05, + "loss": 0.7102, + "step": 34810 + }, + { + "epoch": 0.22245505539015883, + "grad_norm": 1.2401602268218994, + "learning_rate": 9.698077804564519e-05, + "loss": 1.0855, + "step": 34820 + }, + { + "epoch": 0.2225189425398975, + "grad_norm": 0.7187434434890747, + "learning_rate": 9.697906059776085e-05, + "loss": 1.02, + "step": 34830 + }, + { + "epoch": 0.22258282968963622, + "grad_norm": 0.719468355178833, + "learning_rate": 9.697734267675674e-05, + "loss": 0.932, + "step": 34840 + }, + { + "epoch": 0.22264671683937493, + "grad_norm": 0.8819088935852051, + "learning_rate": 9.697562428265012e-05, + "loss": 0.8238, + "step": 34850 + }, + { + "epoch": 0.22271060398911363, + "grad_norm": 0.49491390585899353, + "learning_rate": 9.697390541545834e-05, + "loss": 0.9514, + "step": 34860 + }, + { + "epoch": 0.22277449113885234, + "grad_norm": 1.5479438304901123, + "learning_rate": 9.697218607519871e-05, + "loss": 0.9275, + "step": 34870 + }, + { + "epoch": 0.22283837828859104, + "grad_norm": 0.763923704624176, + "learning_rate": 9.697046626188852e-05, + "loss": 0.8258, + "step": 34880 + }, + { + "epoch": 0.22290226543832972, + "grad_norm": 1.1767523288726807, + "learning_rate": 9.696874597554509e-05, + "loss": 0.7937, + "step": 34890 + }, + { + "epoch": 0.22296615258806843, + "grad_norm": 1.3667820692062378, + "learning_rate": 9.696702521618576e-05, + "loss": 0.8892, + "step": 34900 
+ }, + { + "epoch": 0.22303003973780713, + "grad_norm": 0.7159459590911865, + "learning_rate": 9.696530398382786e-05, + "loss": 0.9855, + "step": 34910 + }, + { + "epoch": 0.22309392688754584, + "grad_norm": 0.6876511573791504, + "learning_rate": 9.69635822784887e-05, + "loss": 1.0461, + "step": 34920 + }, + { + "epoch": 0.22315781403728455, + "grad_norm": 0.6138442158699036, + "learning_rate": 9.696186010018566e-05, + "loss": 0.8192, + "step": 34930 + }, + { + "epoch": 0.22322170118702325, + "grad_norm": 0.6763925552368164, + "learning_rate": 9.696013744893604e-05, + "loss": 0.8746, + "step": 34940 + }, + { + "epoch": 0.22328558833676193, + "grad_norm": 0.807370126247406, + "learning_rate": 9.695841432475723e-05, + "loss": 0.7289, + "step": 34950 + }, + { + "epoch": 0.22334947548650064, + "grad_norm": 0.7103719711303711, + "learning_rate": 9.695669072766655e-05, + "loss": 0.8883, + "step": 34960 + }, + { + "epoch": 0.22341336263623934, + "grad_norm": 0.6593259572982788, + "learning_rate": 9.695496665768138e-05, + "loss": 0.845, + "step": 34970 + }, + { + "epoch": 0.22347724978597805, + "grad_norm": 0.7919392585754395, + "learning_rate": 9.695324211481907e-05, + "loss": 0.7294, + "step": 34980 + }, + { + "epoch": 0.22354113693571676, + "grad_norm": 1.0960744619369507, + "learning_rate": 9.695151709909698e-05, + "loss": 0.8352, + "step": 34990 + }, + { + "epoch": 0.22360502408545546, + "grad_norm": 0.9134578704833984, + "learning_rate": 9.69497916105325e-05, + "loss": 0.8196, + "step": 35000 + }, + { + "epoch": 0.22366891123519414, + "grad_norm": 0.7842540144920349, + "learning_rate": 9.6948065649143e-05, + "loss": 1.0348, + "step": 35010 + }, + { + "epoch": 0.22373279838493285, + "grad_norm": 0.6312137842178345, + "learning_rate": 9.694633921494588e-05, + "loss": 0.62, + "step": 35020 + }, + { + "epoch": 0.22379668553467155, + "grad_norm": 0.6972392797470093, + "learning_rate": 9.69446123079585e-05, + "loss": 0.8615, + "step": 35030 + }, + { + "epoch": 
0.22386057268441026, + "grad_norm": 0.7970590591430664, + "learning_rate": 9.694288492819825e-05, + "loss": 0.925, + "step": 35040 + }, + { + "epoch": 0.22392445983414896, + "grad_norm": 1.2083357572555542, + "learning_rate": 9.694115707568254e-05, + "loss": 0.7092, + "step": 35050 + }, + { + "epoch": 0.22398834698388767, + "grad_norm": 0.585113525390625, + "learning_rate": 9.693942875042878e-05, + "loss": 0.9887, + "step": 35060 + }, + { + "epoch": 0.22405223413362635, + "grad_norm": 1.90079665184021, + "learning_rate": 9.693769995245437e-05, + "loss": 0.9447, + "step": 35070 + }, + { + "epoch": 0.22411612128336506, + "grad_norm": 0.8119843602180481, + "learning_rate": 9.69359706817767e-05, + "loss": 0.7767, + "step": 35080 + }, + { + "epoch": 0.22418000843310376, + "grad_norm": 0.5535334348678589, + "learning_rate": 9.69342409384132e-05, + "loss": 1.0211, + "step": 35090 + }, + { + "epoch": 0.22424389558284247, + "grad_norm": 0.5760706663131714, + "learning_rate": 9.69325107223813e-05, + "loss": 0.7181, + "step": 35100 + }, + { + "epoch": 0.22430778273258117, + "grad_norm": 0.7716420292854309, + "learning_rate": 9.69307800336984e-05, + "loss": 0.9217, + "step": 35110 + }, + { + "epoch": 0.22437166988231988, + "grad_norm": 1.1675033569335938, + "learning_rate": 9.692904887238195e-05, + "loss": 1.1387, + "step": 35120 + }, + { + "epoch": 0.22443555703205856, + "grad_norm": 0.8765130043029785, + "learning_rate": 9.692731723844939e-05, + "loss": 0.7809, + "step": 35130 + }, + { + "epoch": 0.22449944418179726, + "grad_norm": 2.621401786804199, + "learning_rate": 9.692558513191812e-05, + "loss": 0.9963, + "step": 35140 + }, + { + "epoch": 0.22456333133153597, + "grad_norm": 1.051527976989746, + "learning_rate": 9.692385255280564e-05, + "loss": 1.0771, + "step": 35150 + }, + { + "epoch": 0.22462721848127468, + "grad_norm": 0.8392159342765808, + "learning_rate": 9.692211950112936e-05, + "loss": 0.9217, + "step": 35160 + }, + { + "epoch": 0.22469110563101338, + 
"grad_norm": 0.7495473027229309, + "learning_rate": 9.692038597690674e-05, + "loss": 1.0467, + "step": 35170 + }, + { + "epoch": 0.2247549927807521, + "grad_norm": 0.6959127187728882, + "learning_rate": 9.691865198015524e-05, + "loss": 1.0204, + "step": 35180 + }, + { + "epoch": 0.2248188799304908, + "grad_norm": 1.0118756294250488, + "learning_rate": 9.691691751089234e-05, + "loss": 0.9488, + "step": 35190 + }, + { + "epoch": 0.22488276708022947, + "grad_norm": 1.1415350437164307, + "learning_rate": 9.691518256913547e-05, + "loss": 1.2746, + "step": 35200 + }, + { + "epoch": 0.22494665422996818, + "grad_norm": 0.9474114179611206, + "learning_rate": 9.691344715490213e-05, + "loss": 0.9522, + "step": 35210 + }, + { + "epoch": 0.22501054137970689, + "grad_norm": 1.113400936126709, + "learning_rate": 9.69117112682098e-05, + "loss": 0.6583, + "step": 35220 + }, + { + "epoch": 0.2250744285294456, + "grad_norm": 1.0649466514587402, + "learning_rate": 9.690997490907594e-05, + "loss": 0.9882, + "step": 35230 + }, + { + "epoch": 0.2251383156791843, + "grad_norm": 0.7435470819473267, + "learning_rate": 9.690823807751807e-05, + "loss": 0.7211, + "step": 35240 + }, + { + "epoch": 0.225202202828923, + "grad_norm": 1.1411978006362915, + "learning_rate": 9.690650077355364e-05, + "loss": 0.8664, + "step": 35250 + }, + { + "epoch": 0.22526608997866168, + "grad_norm": 0.888380765914917, + "learning_rate": 9.690476299720018e-05, + "loss": 0.9631, + "step": 35260 + }, + { + "epoch": 0.2253299771284004, + "grad_norm": 0.8436518907546997, + "learning_rate": 9.690302474847516e-05, + "loss": 1.0347, + "step": 35270 + }, + { + "epoch": 0.2253938642781391, + "grad_norm": 0.8739194869995117, + "learning_rate": 9.690128602739613e-05, + "loss": 1.0825, + "step": 35280 + }, + { + "epoch": 0.2254577514278778, + "grad_norm": 0.9203752875328064, + "learning_rate": 9.689954683398057e-05, + "loss": 0.8836, + "step": 35290 + }, + { + "epoch": 0.2255216385776165, + "grad_norm": 0.7080454230308533, + 
"learning_rate": 9.6897807168246e-05, + "loss": 0.7775, + "step": 35300 + }, + { + "epoch": 0.2255855257273552, + "grad_norm": 0.8330931067466736, + "learning_rate": 9.689606703020993e-05, + "loss": 0.948, + "step": 35310 + }, + { + "epoch": 0.2256494128770939, + "grad_norm": 0.9416504502296448, + "learning_rate": 9.689432641988988e-05, + "loss": 0.8721, + "step": 35320 + }, + { + "epoch": 0.2257133000268326, + "grad_norm": 0.7803798317909241, + "learning_rate": 9.689258533730341e-05, + "loss": 0.8416, + "step": 35330 + }, + { + "epoch": 0.2257771871765713, + "grad_norm": 0.6242881417274475, + "learning_rate": 9.689084378246804e-05, + "loss": 0.7793, + "step": 35340 + }, + { + "epoch": 0.22584107432631, + "grad_norm": 0.8477808833122253, + "learning_rate": 9.68891017554013e-05, + "loss": 0.84, + "step": 35350 + }, + { + "epoch": 0.22590496147604872, + "grad_norm": 0.8038986325263977, + "learning_rate": 9.688735925612075e-05, + "loss": 0.7162, + "step": 35360 + }, + { + "epoch": 0.22596884862578742, + "grad_norm": 0.6531451940536499, + "learning_rate": 9.688561628464391e-05, + "loss": 0.8058, + "step": 35370 + }, + { + "epoch": 0.2260327357755261, + "grad_norm": 0.8681033849716187, + "learning_rate": 9.688387284098837e-05, + "loss": 1.0791, + "step": 35380 + }, + { + "epoch": 0.2260966229252648, + "grad_norm": 1.3325775861740112, + "learning_rate": 9.688212892517167e-05, + "loss": 0.6875, + "step": 35390 + }, + { + "epoch": 0.2261605100750035, + "grad_norm": 0.5260213017463684, + "learning_rate": 9.688038453721137e-05, + "loss": 0.8236, + "step": 35400 + }, + { + "epoch": 0.22622439722474222, + "grad_norm": 1.0699787139892578, + "learning_rate": 9.687863967712503e-05, + "loss": 0.8972, + "step": 35410 + }, + { + "epoch": 0.22628828437448092, + "grad_norm": 0.6690873503684998, + "learning_rate": 9.687689434493025e-05, + "loss": 0.8042, + "step": 35420 + }, + { + "epoch": 0.22635217152421963, + "grad_norm": 0.6598352789878845, + "learning_rate": 9.687514854064458e-05, 
+ "loss": 1.0096, + "step": 35430 + }, + { + "epoch": 0.2264160586739583, + "grad_norm": 0.839152455329895, + "learning_rate": 9.68735769131643e-05, + "loss": 1.0554, + "step": 35440 + }, + { + "epoch": 0.22647994582369702, + "grad_norm": 1.03608238697052, + "learning_rate": 9.68718302119544e-05, + "loss": 1.0627, + "step": 35450 + }, + { + "epoch": 0.22654383297343572, + "grad_norm": 0.8941081762313843, + "learning_rate": 9.687008303870461e-05, + "loss": 0.663, + "step": 35460 + }, + { + "epoch": 0.22660772012317443, + "grad_norm": 0.5950977802276611, + "learning_rate": 9.686833539343256e-05, + "loss": 0.9636, + "step": 35470 + }, + { + "epoch": 0.22667160727291313, + "grad_norm": 0.5966373085975647, + "learning_rate": 9.686658727615581e-05, + "loss": 0.8674, + "step": 35480 + }, + { + "epoch": 0.22673549442265184, + "grad_norm": 0.8043856620788574, + "learning_rate": 9.686483868689198e-05, + "loss": 0.9328, + "step": 35490 + }, + { + "epoch": 0.22679938157239052, + "grad_norm": 1.025963306427002, + "learning_rate": 9.686308962565869e-05, + "loss": 0.7796, + "step": 35500 + }, + { + "epoch": 0.22686326872212922, + "grad_norm": 0.4956408143043518, + "learning_rate": 9.686134009247354e-05, + "loss": 0.7355, + "step": 35510 + }, + { + "epoch": 0.22692715587186793, + "grad_norm": 0.9197072386741638, + "learning_rate": 9.685959008735414e-05, + "loss": 0.7268, + "step": 35520 + }, + { + "epoch": 0.22699104302160664, + "grad_norm": 0.9792423248291016, + "learning_rate": 9.685783961031814e-05, + "loss": 0.8215, + "step": 35530 + }, + { + "epoch": 0.22705493017134534, + "grad_norm": 1.209794282913208, + "learning_rate": 9.685608866138316e-05, + "loss": 0.7836, + "step": 35540 + }, + { + "epoch": 0.22711881732108405, + "grad_norm": 0.8678392767906189, + "learning_rate": 9.685433724056683e-05, + "loss": 0.862, + "step": 35550 + }, + { + "epoch": 0.22718270447082273, + "grad_norm": 1.0202693939208984, + "learning_rate": 9.685258534788679e-05, + "loss": 0.8804, + "step": 35560 
+ }, + { + "epoch": 0.22724659162056143, + "grad_norm": 0.8867144584655762, + "learning_rate": 9.685083298336068e-05, + "loss": 0.8365, + "step": 35570 + }, + { + "epoch": 0.22731047877030014, + "grad_norm": 0.7046698927879333, + "learning_rate": 9.684908014700616e-05, + "loss": 1.1958, + "step": 35580 + }, + { + "epoch": 0.22737436592003885, + "grad_norm": 0.7776816487312317, + "learning_rate": 9.684732683884085e-05, + "loss": 0.8462, + "step": 35590 + }, + { + "epoch": 0.22743825306977755, + "grad_norm": 0.9116525650024414, + "learning_rate": 9.684557305888245e-05, + "loss": 0.744, + "step": 35600 + }, + { + "epoch": 0.22750214021951626, + "grad_norm": 1.0605876445770264, + "learning_rate": 9.684381880714858e-05, + "loss": 0.9077, + "step": 35610 + }, + { + "epoch": 0.22756602736925494, + "grad_norm": 1.1371787786483765, + "learning_rate": 9.684206408365695e-05, + "loss": 1.1714, + "step": 35620 + }, + { + "epoch": 0.22762991451899364, + "grad_norm": 1.0647424459457397, + "learning_rate": 9.684030888842521e-05, + "loss": 0.8973, + "step": 35630 + }, + { + "epoch": 0.22769380166873235, + "grad_norm": 1.0106362104415894, + "learning_rate": 9.683855322147103e-05, + "loss": 0.7832, + "step": 35640 + }, + { + "epoch": 0.22775768881847105, + "grad_norm": 1.509164571762085, + "learning_rate": 9.68367970828121e-05, + "loss": 1.0245, + "step": 35650 + }, + { + "epoch": 0.22782157596820976, + "grad_norm": 0.6998576521873474, + "learning_rate": 9.68350404724661e-05, + "loss": 0.9198, + "step": 35660 + }, + { + "epoch": 0.22788546311794847, + "grad_norm": 0.7818799614906311, + "learning_rate": 9.683328339045073e-05, + "loss": 0.9013, + "step": 35670 + }, + { + "epoch": 0.22794935026768715, + "grad_norm": 0.9192219972610474, + "learning_rate": 9.683152583678367e-05, + "loss": 0.8992, + "step": 35680 + }, + { + "epoch": 0.22801323741742585, + "grad_norm": 0.7720584273338318, + "learning_rate": 9.682976781148265e-05, + "loss": 1.0002, + "step": 35690 + }, + { + "epoch": 
0.22807712456716456, + "grad_norm": 1.023474097251892, + "learning_rate": 9.682800931456534e-05, + "loss": 0.8645, + "step": 35700 + }, + { + "epoch": 0.22814101171690326, + "grad_norm": 0.7522472143173218, + "learning_rate": 9.682625034604946e-05, + "loss": 0.9877, + "step": 35710 + }, + { + "epoch": 0.22820489886664197, + "grad_norm": 0.7929263710975647, + "learning_rate": 9.682449090595274e-05, + "loss": 0.9654, + "step": 35720 + }, + { + "epoch": 0.22826878601638068, + "grad_norm": 0.8946601152420044, + "learning_rate": 9.682273099429288e-05, + "loss": 1.1321, + "step": 35730 + }, + { + "epoch": 0.22833267316611935, + "grad_norm": 0.884692907333374, + "learning_rate": 9.682097061108761e-05, + "loss": 0.7554, + "step": 35740 + }, + { + "epoch": 0.22839656031585806, + "grad_norm": 0.6156822443008423, + "learning_rate": 9.681920975635467e-05, + "loss": 0.8625, + "step": 35750 + }, + { + "epoch": 0.22846044746559677, + "grad_norm": 0.6044219732284546, + "learning_rate": 9.681744843011177e-05, + "loss": 0.758, + "step": 35760 + }, + { + "epoch": 0.22852433461533547, + "grad_norm": 0.835270345211029, + "learning_rate": 9.681568663237668e-05, + "loss": 0.8325, + "step": 35770 + }, + { + "epoch": 0.22858822176507418, + "grad_norm": 0.9461874961853027, + "learning_rate": 9.68139243631671e-05, + "loss": 0.8916, + "step": 35780 + }, + { + "epoch": 0.22865210891481288, + "grad_norm": 1.3007314205169678, + "learning_rate": 9.681216162250082e-05, + "loss": 1.1537, + "step": 35790 + }, + { + "epoch": 0.22871599606455156, + "grad_norm": 1.0743658542633057, + "learning_rate": 9.681039841039557e-05, + "loss": 0.7409, + "step": 35800 + }, + { + "epoch": 0.22877988321429027, + "grad_norm": 2.3741660118103027, + "learning_rate": 9.680863472686911e-05, + "loss": 0.8093, + "step": 35810 + }, + { + "epoch": 0.22884377036402898, + "grad_norm": 0.9726037383079529, + "learning_rate": 9.68068705719392e-05, + "loss": 1.1677, + "step": 35820 + }, + { + "epoch": 0.22890765751376768, + 
"grad_norm": 0.7922230958938599, + "learning_rate": 9.680510594562362e-05, + "loss": 0.9944, + "step": 35830 + }, + { + "epoch": 0.2289715446635064, + "grad_norm": 0.8513554930686951, + "learning_rate": 9.680334084794011e-05, + "loss": 0.8125, + "step": 35840 + }, + { + "epoch": 0.2290354318132451, + "grad_norm": 1.046993374824524, + "learning_rate": 9.680157527890649e-05, + "loss": 0.9013, + "step": 35850 + }, + { + "epoch": 0.22909931896298377, + "grad_norm": 0.6349254250526428, + "learning_rate": 9.679980923854051e-05, + "loss": 0.903, + "step": 35860 + }, + { + "epoch": 0.22916320611272248, + "grad_norm": 0.4237905740737915, + "learning_rate": 9.679804272685995e-05, + "loss": 0.7127, + "step": 35870 + }, + { + "epoch": 0.22922709326246118, + "grad_norm": 0.7686927914619446, + "learning_rate": 9.679627574388264e-05, + "loss": 0.8212, + "step": 35880 + }, + { + "epoch": 0.2292909804121999, + "grad_norm": 1.274295687675476, + "learning_rate": 9.679450828962633e-05, + "loss": 0.7458, + "step": 35890 + }, + { + "epoch": 0.2293548675619386, + "grad_norm": 0.8231094479560852, + "learning_rate": 9.679274036410884e-05, + "loss": 0.851, + "step": 35900 + }, + { + "epoch": 0.2294187547116773, + "grad_norm": 0.5917838215827942, + "learning_rate": 9.679097196734797e-05, + "loss": 1.0595, + "step": 35910 + }, + { + "epoch": 0.22948264186141598, + "grad_norm": 0.9595643877983093, + "learning_rate": 9.678920309936155e-05, + "loss": 0.8143, + "step": 35920 + }, + { + "epoch": 0.2295465290111547, + "grad_norm": 0.9315831661224365, + "learning_rate": 9.678743376016736e-05, + "loss": 0.8278, + "step": 35930 + }, + { + "epoch": 0.2296104161608934, + "grad_norm": 0.8110885620117188, + "learning_rate": 9.678566394978323e-05, + "loss": 0.9624, + "step": 35940 + }, + { + "epoch": 0.2296743033106321, + "grad_norm": 0.8156410455703735, + "learning_rate": 9.6783893668227e-05, + "loss": 0.7916, + "step": 35950 + }, + { + "epoch": 0.2297381904603708, + "grad_norm": 0.7576091885566711, + 
"learning_rate": 9.678212291551649e-05, + "loss": 1.2787, + "step": 35960 + }, + { + "epoch": 0.2298020776101095, + "grad_norm": 0.6447461843490601, + "learning_rate": 9.678035169166953e-05, + "loss": 0.8515, + "step": 35970 + }, + { + "epoch": 0.2298659647598482, + "grad_norm": 0.6504492163658142, + "learning_rate": 9.677857999670394e-05, + "loss": 1.0268, + "step": 35980 + }, + { + "epoch": 0.2299298519095869, + "grad_norm": 1.1412609815597534, + "learning_rate": 9.677680783063761e-05, + "loss": 1.1179, + "step": 35990 + }, + { + "epoch": 0.2299937390593256, + "grad_norm": 0.7995015382766724, + "learning_rate": 9.677503519348834e-05, + "loss": 0.9593, + "step": 36000 + }, + { + "epoch": 0.2300576262090643, + "grad_norm": 1.159679889678955, + "learning_rate": 9.677326208527399e-05, + "loss": 0.9146, + "step": 36010 + }, + { + "epoch": 0.23012151335880301, + "grad_norm": 0.721098780632019, + "learning_rate": 9.677148850601243e-05, + "loss": 1.0502, + "step": 36020 + }, + { + "epoch": 0.23018540050854172, + "grad_norm": 0.4577333927154541, + "learning_rate": 9.676971445572152e-05, + "loss": 0.9092, + "step": 36030 + }, + { + "epoch": 0.23024928765828043, + "grad_norm": 0.8602834343910217, + "learning_rate": 9.676793993441913e-05, + "loss": 0.7162, + "step": 36040 + }, + { + "epoch": 0.2303131748080191, + "grad_norm": 0.8518884181976318, + "learning_rate": 9.676616494212314e-05, + "loss": 0.9275, + "step": 36050 + }, + { + "epoch": 0.2303770619577578, + "grad_norm": 1.1824616193771362, + "learning_rate": 9.676438947885138e-05, + "loss": 0.6779, + "step": 36060 + }, + { + "epoch": 0.23044094910749652, + "grad_norm": 1.0840277671813965, + "learning_rate": 9.676261354462177e-05, + "loss": 0.8189, + "step": 36070 + }, + { + "epoch": 0.23050483625723522, + "grad_norm": 0.7747464776039124, + "learning_rate": 9.67608371394522e-05, + "loss": 1.0298, + "step": 36080 + }, + { + "epoch": 0.23056872340697393, + "grad_norm": 0.7132411003112793, + "learning_rate": 
9.675906026336053e-05, + "loss": 0.823, + "step": 36090 + }, + { + "epoch": 0.23063261055671264, + "grad_norm": 1.1659483909606934, + "learning_rate": 9.675728291636467e-05, + "loss": 0.8323, + "step": 36100 + }, + { + "epoch": 0.23069649770645131, + "grad_norm": 0.7727037072181702, + "learning_rate": 9.675550509848253e-05, + "loss": 0.8996, + "step": 36110 + }, + { + "epoch": 0.23076038485619002, + "grad_norm": 0.511026918888092, + "learning_rate": 9.6753726809732e-05, + "loss": 0.9119, + "step": 36120 + }, + { + "epoch": 0.23082427200592873, + "grad_norm": 1.2003488540649414, + "learning_rate": 9.6751948050131e-05, + "loss": 0.9831, + "step": 36130 + }, + { + "epoch": 0.23088815915566743, + "grad_norm": 0.9001702070236206, + "learning_rate": 9.675016881969743e-05, + "loss": 1.0382, + "step": 36140 + }, + { + "epoch": 0.23095204630540614, + "grad_norm": 0.8864395618438721, + "learning_rate": 9.674838911844923e-05, + "loss": 0.8401, + "step": 36150 + }, + { + "epoch": 0.23101593345514485, + "grad_norm": 0.8258879780769348, + "learning_rate": 9.674660894640429e-05, + "loss": 0.9833, + "step": 36160 + }, + { + "epoch": 0.23107982060488352, + "grad_norm": 0.8250300884246826, + "learning_rate": 9.674482830358056e-05, + "loss": 0.8936, + "step": 36170 + }, + { + "epoch": 0.23114370775462223, + "grad_norm": 0.9559470415115356, + "learning_rate": 9.674304718999598e-05, + "loss": 1.2631, + "step": 36180 + }, + { + "epoch": 0.23120759490436094, + "grad_norm": 2.168290853500366, + "learning_rate": 9.674126560566846e-05, + "loss": 0.9825, + "step": 36190 + }, + { + "epoch": 0.23127148205409964, + "grad_norm": 0.775067150592804, + "learning_rate": 9.673948355061597e-05, + "loss": 0.8517, + "step": 36200 + }, + { + "epoch": 0.23133536920383835, + "grad_norm": 1.186092495918274, + "learning_rate": 9.673770102485644e-05, + "loss": 0.8136, + "step": 36210 + }, + { + "epoch": 0.23139925635357705, + "grad_norm": 1.8314769268035889, + "learning_rate": 9.673591802840782e-05, + "loss": 
1.0137, + "step": 36220 + }, + { + "epoch": 0.23146314350331573, + "grad_norm": 0.9208132028579712, + "learning_rate": 9.673413456128808e-05, + "loss": 0.8576, + "step": 36230 + }, + { + "epoch": 0.23152703065305444, + "grad_norm": 0.8547564148902893, + "learning_rate": 9.673235062351517e-05, + "loss": 1.1041, + "step": 36240 + }, + { + "epoch": 0.23159091780279314, + "grad_norm": 0.6247135400772095, + "learning_rate": 9.673056621510707e-05, + "loss": 0.8918, + "step": 36250 + }, + { + "epoch": 0.23165480495253185, + "grad_norm": 1.1294952630996704, + "learning_rate": 9.672878133608174e-05, + "loss": 1.1331, + "step": 36260 + }, + { + "epoch": 0.23171869210227056, + "grad_norm": 1.048307180404663, + "learning_rate": 9.672699598645716e-05, + "loss": 1.0438, + "step": 36270 + }, + { + "epoch": 0.23178257925200926, + "grad_norm": 0.8274295926094055, + "learning_rate": 9.672521016625128e-05, + "loss": 1.0533, + "step": 36280 + }, + { + "epoch": 0.23184646640174794, + "grad_norm": 0.6973618268966675, + "learning_rate": 9.672342387548215e-05, + "loss": 1.0061, + "step": 36290 + }, + { + "epoch": 0.23191035355148665, + "grad_norm": 0.6850184798240662, + "learning_rate": 9.672163711416768e-05, + "loss": 0.9715, + "step": 36300 + }, + { + "epoch": 0.23197424070122535, + "grad_norm": 0.9231820702552795, + "learning_rate": 9.671984988232593e-05, + "loss": 0.8866, + "step": 36310 + }, + { + "epoch": 0.23203812785096406, + "grad_norm": 1.0431686639785767, + "learning_rate": 9.671806217997485e-05, + "loss": 0.8008, + "step": 36320 + }, + { + "epoch": 0.23210201500070277, + "grad_norm": 0.5410827398300171, + "learning_rate": 9.67162740071325e-05, + "loss": 0.9067, + "step": 36330 + }, + { + "epoch": 0.23216590215044147, + "grad_norm": 0.6281831860542297, + "learning_rate": 9.671448536381683e-05, + "loss": 0.9372, + "step": 36340 + }, + { + "epoch": 0.23222978930018015, + "grad_norm": 0.8428774476051331, + "learning_rate": 9.671269625004589e-05, + "loss": 0.9881, + "step": 36350 + 
}, + { + "epoch": 0.23229367644991886, + "grad_norm": 0.5876288414001465, + "learning_rate": 9.671090666583769e-05, + "loss": 0.7809, + "step": 36360 + }, + { + "epoch": 0.23235756359965756, + "grad_norm": 0.904808521270752, + "learning_rate": 9.670911661121023e-05, + "loss": 0.9961, + "step": 36370 + }, + { + "epoch": 0.23242145074939627, + "grad_norm": 0.6523864269256592, + "learning_rate": 9.670732608618157e-05, + "loss": 0.9395, + "step": 36380 + }, + { + "epoch": 0.23248533789913498, + "grad_norm": 0.8728864192962646, + "learning_rate": 9.670553509076972e-05, + "loss": 0.7788, + "step": 36390 + }, + { + "epoch": 0.23254922504887368, + "grad_norm": 0.7656633257865906, + "learning_rate": 9.670374362499274e-05, + "loss": 0.9066, + "step": 36400 + }, + { + "epoch": 0.23261311219861236, + "grad_norm": 0.7706246972084045, + "learning_rate": 9.670195168886866e-05, + "loss": 1.1319, + "step": 36410 + }, + { + "epoch": 0.23267699934835107, + "grad_norm": 0.6671524047851562, + "learning_rate": 9.670015928241551e-05, + "loss": 1.1416, + "step": 36420 + }, + { + "epoch": 0.23274088649808977, + "grad_norm": 0.8740767240524292, + "learning_rate": 9.669836640565136e-05, + "loss": 0.9599, + "step": 36430 + }, + { + "epoch": 0.23280477364782848, + "grad_norm": 0.8602978587150574, + "learning_rate": 9.669657305859425e-05, + "loss": 0.7909, + "step": 36440 + }, + { + "epoch": 0.23286866079756718, + "grad_norm": 0.7806286215782166, + "learning_rate": 9.669477924126226e-05, + "loss": 0.7961, + "step": 36450 + }, + { + "epoch": 0.2329325479473059, + "grad_norm": 0.5049117803573608, + "learning_rate": 9.669298495367345e-05, + "loss": 0.742, + "step": 36460 + }, + { + "epoch": 0.23299643509704457, + "grad_norm": 0.719462513923645, + "learning_rate": 9.669119019584589e-05, + "loss": 0.8667, + "step": 36470 + }, + { + "epoch": 0.23306032224678327, + "grad_norm": 0.8203737735748291, + "learning_rate": 9.668939496779763e-05, + "loss": 1.0432, + "step": 36480 + }, + { + "epoch": 
0.23312420939652198, + "grad_norm": 0.7739396691322327, + "learning_rate": 9.668759926954679e-05, + "loss": 0.7717, + "step": 36490 + }, + { + "epoch": 0.2331880965462607, + "grad_norm": 0.5877523422241211, + "learning_rate": 9.668580310111142e-05, + "loss": 0.8282, + "step": 36500 + }, + { + "epoch": 0.2332519836959994, + "grad_norm": 0.7117794156074524, + "learning_rate": 9.668400646250963e-05, + "loss": 0.8262, + "step": 36510 + }, + { + "epoch": 0.2333158708457381, + "grad_norm": 0.6126281023025513, + "learning_rate": 9.668220935375953e-05, + "loss": 0.7877, + "step": 36520 + }, + { + "epoch": 0.23337975799547678, + "grad_norm": 1.1325799226760864, + "learning_rate": 9.668041177487917e-05, + "loss": 1.2053, + "step": 36530 + }, + { + "epoch": 0.23344364514521548, + "grad_norm": 0.8727070689201355, + "learning_rate": 9.667861372588669e-05, + "loss": 0.9724, + "step": 36540 + }, + { + "epoch": 0.2335075322949542, + "grad_norm": 0.87961345911026, + "learning_rate": 9.667681520680017e-05, + "loss": 0.8785, + "step": 36550 + }, + { + "epoch": 0.2335714194446929, + "grad_norm": 0.9073530435562134, + "learning_rate": 9.667501621763777e-05, + "loss": 0.7719, + "step": 36560 + }, + { + "epoch": 0.2336353065944316, + "grad_norm": 0.7770230770111084, + "learning_rate": 9.667321675841754e-05, + "loss": 0.9077, + "step": 36570 + }, + { + "epoch": 0.2336991937441703, + "grad_norm": 1.0296423435211182, + "learning_rate": 9.667141682915765e-05, + "loss": 0.867, + "step": 36580 + }, + { + "epoch": 0.233763080893909, + "grad_norm": 0.7076445817947388, + "learning_rate": 9.666961642987624e-05, + "loss": 0.7565, + "step": 36590 + }, + { + "epoch": 0.2338269680436477, + "grad_norm": 1.4758923053741455, + "learning_rate": 9.66678155605914e-05, + "loss": 1.0654, + "step": 36600 + }, + { + "epoch": 0.2338908551933864, + "grad_norm": 0.8394945859909058, + "learning_rate": 9.666601422132129e-05, + "loss": 0.6541, + "step": 36610 + }, + { + "epoch": 0.2339547423431251, + "grad_norm": 
0.946808934211731, + "learning_rate": 9.666421241208404e-05, + "loss": 0.9308, + "step": 36620 + }, + { + "epoch": 0.2340186294928638, + "grad_norm": 0.8768804669380188, + "learning_rate": 9.666241013289781e-05, + "loss": 0.7125, + "step": 36630 + }, + { + "epoch": 0.23408251664260252, + "grad_norm": 0.9706554412841797, + "learning_rate": 9.666060738378072e-05, + "loss": 0.8804, + "step": 36640 + }, + { + "epoch": 0.2341464037923412, + "grad_norm": 1.6427329778671265, + "learning_rate": 9.665880416475097e-05, + "loss": 0.9644, + "step": 36650 + }, + { + "epoch": 0.2342102909420799, + "grad_norm": 0.792389988899231, + "learning_rate": 9.665700047582667e-05, + "loss": 0.8932, + "step": 36660 + }, + { + "epoch": 0.2342741780918186, + "grad_norm": 0.6772669553756714, + "learning_rate": 9.665519631702605e-05, + "loss": 0.8973, + "step": 36670 + }, + { + "epoch": 0.23433806524155731, + "grad_norm": 0.8175477385520935, + "learning_rate": 9.66533916883672e-05, + "loss": 0.9906, + "step": 36680 + }, + { + "epoch": 0.23440195239129602, + "grad_norm": 1.3049653768539429, + "learning_rate": 9.665158658986835e-05, + "loss": 0.9246, + "step": 36690 + }, + { + "epoch": 0.23446583954103473, + "grad_norm": 0.7505981922149658, + "learning_rate": 9.664978102154766e-05, + "loss": 0.9096, + "step": 36700 + }, + { + "epoch": 0.2345297266907734, + "grad_norm": 0.8786876797676086, + "learning_rate": 9.664797498342333e-05, + "loss": 0.9795, + "step": 36710 + }, + { + "epoch": 0.2345936138405121, + "grad_norm": 1.1042776107788086, + "learning_rate": 9.664616847551354e-05, + "loss": 0.871, + "step": 36720 + }, + { + "epoch": 0.23465750099025082, + "grad_norm": 0.5629504919052124, + "learning_rate": 9.664436149783647e-05, + "loss": 0.7445, + "step": 36730 + }, + { + "epoch": 0.23472138813998952, + "grad_norm": 0.7298271656036377, + "learning_rate": 9.664255405041031e-05, + "loss": 0.9827, + "step": 36740 + }, + { + "epoch": 0.23478527528972823, + "grad_norm": 0.6317089200019836, + 
"learning_rate": 9.66407461332533e-05, + "loss": 0.8363, + "step": 36750 + }, + { + "epoch": 0.23484916243946694, + "grad_norm": 0.8942947387695312, + "learning_rate": 9.663893774638362e-05, + "loss": 0.9289, + "step": 36760 + }, + { + "epoch": 0.2349130495892056, + "grad_norm": 1.3955134153366089, + "learning_rate": 9.663712888981949e-05, + "loss": 0.9013, + "step": 36770 + }, + { + "epoch": 0.23497693673894432, + "grad_norm": 0.84214186668396, + "learning_rate": 9.663531956357912e-05, + "loss": 0.8152, + "step": 36780 + }, + { + "epoch": 0.23504082388868303, + "grad_norm": 0.4366759955883026, + "learning_rate": 9.663350976768074e-05, + "loss": 0.7441, + "step": 36790 + }, + { + "epoch": 0.23510471103842173, + "grad_norm": 0.7600962519645691, + "learning_rate": 9.663169950214257e-05, + "loss": 0.9543, + "step": 36800 + }, + { + "epoch": 0.23516859818816044, + "grad_norm": 1.2092550992965698, + "learning_rate": 9.662988876698285e-05, + "loss": 1.0359, + "step": 36810 + }, + { + "epoch": 0.23523248533789914, + "grad_norm": 0.6062434911727905, + "learning_rate": 9.662807756221981e-05, + "loss": 0.6755, + "step": 36820 + }, + { + "epoch": 0.23529637248763785, + "grad_norm": 0.9666545987129211, + "learning_rate": 9.662626588787168e-05, + "loss": 0.7634, + "step": 36830 + }, + { + "epoch": 0.23536025963737653, + "grad_norm": 0.9782662987709045, + "learning_rate": 9.662445374395672e-05, + "loss": 0.9015, + "step": 36840 + }, + { + "epoch": 0.23542414678711523, + "grad_norm": 0.6901407241821289, + "learning_rate": 9.662264113049318e-05, + "loss": 0.8262, + "step": 36850 + }, + { + "epoch": 0.23548803393685394, + "grad_norm": 0.6084008812904358, + "learning_rate": 9.66208280474993e-05, + "loss": 0.7851, + "step": 36860 + }, + { + "epoch": 0.23555192108659265, + "grad_norm": 1.5526678562164307, + "learning_rate": 9.661901449499336e-05, + "loss": 0.9491, + "step": 36870 + }, + { + "epoch": 0.23561580823633135, + "grad_norm": 0.6443691849708557, + "learning_rate": 
9.66172004729936e-05, + "loss": 0.8368, + "step": 36880 + }, + { + "epoch": 0.23567969538607006, + "grad_norm": 1.0201776027679443, + "learning_rate": 9.661538598151831e-05, + "loss": 0.9269, + "step": 36890 + }, + { + "epoch": 0.23574358253580874, + "grad_norm": 1.2530359029769897, + "learning_rate": 9.661357102058577e-05, + "loss": 0.9521, + "step": 36900 + }, + { + "epoch": 0.23580746968554744, + "grad_norm": 0.675190269947052, + "learning_rate": 9.661175559021423e-05, + "loss": 0.8889, + "step": 36910 + }, + { + "epoch": 0.23587135683528615, + "grad_norm": 1.3392939567565918, + "learning_rate": 9.660993969042197e-05, + "loss": 1.0459, + "step": 36920 + }, + { + "epoch": 0.23593524398502486, + "grad_norm": 0.7173458337783813, + "learning_rate": 9.66081233212273e-05, + "loss": 0.8731, + "step": 36930 + }, + { + "epoch": 0.23599913113476356, + "grad_norm": 1.142118215560913, + "learning_rate": 9.660630648264852e-05, + "loss": 0.9468, + "step": 36940 + }, + { + "epoch": 0.23606301828450227, + "grad_norm": 0.6740077137947083, + "learning_rate": 9.66044891747039e-05, + "loss": 0.8258, + "step": 36950 + }, + { + "epoch": 0.23612690543424095, + "grad_norm": 0.7697812914848328, + "learning_rate": 9.660267139741177e-05, + "loss": 0.9605, + "step": 36960 + }, + { + "epoch": 0.23619079258397965, + "grad_norm": 0.9170047044754028, + "learning_rate": 9.660085315079041e-05, + "loss": 0.8237, + "step": 36970 + }, + { + "epoch": 0.23625467973371836, + "grad_norm": 1.0468403100967407, + "learning_rate": 9.659903443485816e-05, + "loss": 0.7339, + "step": 36980 + }, + { + "epoch": 0.23631856688345707, + "grad_norm": 0.7569143176078796, + "learning_rate": 9.659721524963331e-05, + "loss": 0.9094, + "step": 36990 + }, + { + "epoch": 0.23638245403319577, + "grad_norm": 2.99776291847229, + "learning_rate": 9.659539559513418e-05, + "loss": 0.7063, + "step": 37000 + }, + { + "epoch": 0.23644634118293448, + "grad_norm": 0.6073469519615173, + "learning_rate": 9.659357547137912e-05, + 
"loss": 0.6927, + "step": 37010 + }, + { + "epoch": 0.23651022833267316, + "grad_norm": 0.9018070101737976, + "learning_rate": 9.659175487838643e-05, + "loss": 0.6648, + "step": 37020 + }, + { + "epoch": 0.23657411548241186, + "grad_norm": 1.5573745965957642, + "learning_rate": 9.658993381617447e-05, + "loss": 0.866, + "step": 37030 + }, + { + "epoch": 0.23663800263215057, + "grad_norm": 0.9631299376487732, + "learning_rate": 9.658811228476158e-05, + "loss": 0.8542, + "step": 37040 + }, + { + "epoch": 0.23670188978188927, + "grad_norm": 0.5931088924407959, + "learning_rate": 9.658629028416608e-05, + "loss": 1.0986, + "step": 37050 + }, + { + "epoch": 0.23676577693162798, + "grad_norm": 1.1451070308685303, + "learning_rate": 9.658446781440635e-05, + "loss": 1.026, + "step": 37060 + }, + { + "epoch": 0.2368296640813667, + "grad_norm": 0.9093202352523804, + "learning_rate": 9.65826448755007e-05, + "loss": 1.0764, + "step": 37070 + }, + { + "epoch": 0.23689355123110536, + "grad_norm": 0.6607868075370789, + "learning_rate": 9.658082146746754e-05, + "loss": 0.8786, + "step": 37080 + }, + { + "epoch": 0.23695743838084407, + "grad_norm": 1.8870525360107422, + "learning_rate": 9.65789975903252e-05, + "loss": 0.7795, + "step": 37090 + }, + { + "epoch": 0.23702132553058278, + "grad_norm": 0.9815956354141235, + "learning_rate": 9.657717324409207e-05, + "loss": 0.9482, + "step": 37100 + }, + { + "epoch": 0.23708521268032148, + "grad_norm": 0.7396382689476013, + "learning_rate": 9.65753484287865e-05, + "loss": 1.2419, + "step": 37110 + }, + { + "epoch": 0.2371490998300602, + "grad_norm": 1.3282475471496582, + "learning_rate": 9.657352314442688e-05, + "loss": 1.0154, + "step": 37120 + }, + { + "epoch": 0.2372129869797989, + "grad_norm": 0.5715224742889404, + "learning_rate": 9.65716973910316e-05, + "loss": 0.8772, + "step": 37130 + }, + { + "epoch": 0.23727687412953757, + "grad_norm": 0.646783173084259, + "learning_rate": 9.656987116861902e-05, + "loss": 0.9359, + "step": 37140 + 
}, + { + "epoch": 0.23734076127927628, + "grad_norm": 0.9318345189094543, + "learning_rate": 9.656804447720755e-05, + "loss": 0.7484, + "step": 37150 + }, + { + "epoch": 0.23740464842901499, + "grad_norm": 0.9858495593070984, + "learning_rate": 9.65662173168156e-05, + "loss": 0.9772, + "step": 37160 + }, + { + "epoch": 0.2374685355787537, + "grad_norm": 0.8943020701408386, + "learning_rate": 9.656438968746153e-05, + "loss": 0.9814, + "step": 37170 + }, + { + "epoch": 0.2375324227284924, + "grad_norm": 0.7488458752632141, + "learning_rate": 9.656256158916379e-05, + "loss": 0.8101, + "step": 37180 + }, + { + "epoch": 0.2375963098782311, + "grad_norm": 1.547443151473999, + "learning_rate": 9.656073302194078e-05, + "loss": 0.7346, + "step": 37190 + }, + { + "epoch": 0.23766019702796978, + "grad_norm": 0.7410275340080261, + "learning_rate": 9.655890398581088e-05, + "loss": 0.8636, + "step": 37200 + }, + { + "epoch": 0.2377240841777085, + "grad_norm": 1.3418773412704468, + "learning_rate": 9.655707448079256e-05, + "loss": 0.9634, + "step": 37210 + }, + { + "epoch": 0.2377879713274472, + "grad_norm": 1.0941447019577026, + "learning_rate": 9.655524450690423e-05, + "loss": 1.0396, + "step": 37220 + }, + { + "epoch": 0.2378518584771859, + "grad_norm": 0.6817768216133118, + "learning_rate": 9.65534140641643e-05, + "loss": 1.1446, + "step": 37230 + }, + { + "epoch": 0.2379157456269246, + "grad_norm": 1.0512549877166748, + "learning_rate": 9.65515831525912e-05, + "loss": 0.8289, + "step": 37240 + }, + { + "epoch": 0.2379796327766633, + "grad_norm": 0.6401187777519226, + "learning_rate": 9.654975177220341e-05, + "loss": 1.045, + "step": 37250 + }, + { + "epoch": 0.238043519926402, + "grad_norm": 1.0263795852661133, + "learning_rate": 9.654791992301935e-05, + "loss": 1.0096, + "step": 37260 + }, + { + "epoch": 0.2381074070761407, + "grad_norm": 0.7788522839546204, + "learning_rate": 9.654608760505745e-05, + "loss": 1.0715, + "step": 37270 + }, + { + "epoch": 0.2381712942258794, + 
"grad_norm": 0.7468205094337463, + "learning_rate": 9.654425481833618e-05, + "loss": 1.0105, + "step": 37280 + }, + { + "epoch": 0.2382351813756181, + "grad_norm": 0.6502282619476318, + "learning_rate": 9.6542421562874e-05, + "loss": 0.9838, + "step": 37290 + }, + { + "epoch": 0.23829906852535682, + "grad_norm": 0.6235799193382263, + "learning_rate": 9.654058783868938e-05, + "loss": 0.8914, + "step": 37300 + }, + { + "epoch": 0.23836295567509552, + "grad_norm": 0.6103238463401794, + "learning_rate": 9.653875364580077e-05, + "loss": 0.864, + "step": 37310 + }, + { + "epoch": 0.2384268428248342, + "grad_norm": 0.9452196955680847, + "learning_rate": 9.653691898422666e-05, + "loss": 0.8753, + "step": 37320 + }, + { + "epoch": 0.2384907299745729, + "grad_norm": 0.8040950298309326, + "learning_rate": 9.653508385398549e-05, + "loss": 0.8442, + "step": 37330 + }, + { + "epoch": 0.2385546171243116, + "grad_norm": 1.0032446384429932, + "learning_rate": 9.65332482550958e-05, + "loss": 0.9091, + "step": 37340 + }, + { + "epoch": 0.23861850427405032, + "grad_norm": 1.0294917821884155, + "learning_rate": 9.653141218757602e-05, + "loss": 0.7559, + "step": 37350 + }, + { + "epoch": 0.23868239142378903, + "grad_norm": 0.6536062955856323, + "learning_rate": 9.652957565144465e-05, + "loss": 0.7608, + "step": 37360 + }, + { + "epoch": 0.23874627857352773, + "grad_norm": 0.7073416113853455, + "learning_rate": 9.652773864672022e-05, + "loss": 1.0675, + "step": 37370 + }, + { + "epoch": 0.2388101657232664, + "grad_norm": 0.8172992467880249, + "learning_rate": 9.652590117342122e-05, + "loss": 0.8483, + "step": 37380 + }, + { + "epoch": 0.23887405287300512, + "grad_norm": 0.7354963421821594, + "learning_rate": 9.652406323156613e-05, + "loss": 0.9358, + "step": 37390 + }, + { + "epoch": 0.23893794002274382, + "grad_norm": 0.6794359683990479, + "learning_rate": 9.652222482117347e-05, + "loss": 0.9437, + "step": 37400 + }, + { + "epoch": 0.23900182717248253, + "grad_norm": 0.7249003648757935, 
+ "learning_rate": 9.652038594226177e-05, + "loss": 0.8782, + "step": 37410 + }, + { + "epoch": 0.23906571432222123, + "grad_norm": 0.8355563282966614, + "learning_rate": 9.651854659484954e-05, + "loss": 0.6612, + "step": 37420 + }, + { + "epoch": 0.23912960147195994, + "grad_norm": 0.7103647589683533, + "learning_rate": 9.651670677895529e-05, + "loss": 1.1142, + "step": 37430 + }, + { + "epoch": 0.23919348862169862, + "grad_norm": 0.5884954929351807, + "learning_rate": 9.651486649459755e-05, + "loss": 0.9896, + "step": 37440 + }, + { + "epoch": 0.23925737577143733, + "grad_norm": 0.7389781475067139, + "learning_rate": 9.651302574179489e-05, + "loss": 0.8372, + "step": 37450 + }, + { + "epoch": 0.23932126292117603, + "grad_norm": 0.5792128443717957, + "learning_rate": 9.651118452056582e-05, + "loss": 0.7093, + "step": 37460 + }, + { + "epoch": 0.23938515007091474, + "grad_norm": 0.7699292898178101, + "learning_rate": 9.650934283092887e-05, + "loss": 1.0111, + "step": 37470 + }, + { + "epoch": 0.23944903722065344, + "grad_norm": 0.7070481181144714, + "learning_rate": 9.65075006729026e-05, + "loss": 1.084, + "step": 37480 + }, + { + "epoch": 0.23951292437039215, + "grad_norm": 1.0527695417404175, + "learning_rate": 9.650565804650556e-05, + "loss": 0.8554, + "step": 37490 + }, + { + "epoch": 0.23957681152013083, + "grad_norm": 0.7435452342033386, + "learning_rate": 9.650381495175633e-05, + "loss": 0.8564, + "step": 37500 + }, + { + "epoch": 0.23964069866986953, + "grad_norm": 0.9343265295028687, + "learning_rate": 9.650197138867343e-05, + "loss": 1.2956, + "step": 37510 + }, + { + "epoch": 0.23970458581960824, + "grad_norm": 1.0350561141967773, + "learning_rate": 9.650012735727546e-05, + "loss": 0.9515, + "step": 37520 + }, + { + "epoch": 0.23976847296934695, + "grad_norm": 1.1967248916625977, + "learning_rate": 9.649828285758098e-05, + "loss": 0.9391, + "step": 37530 + }, + { + "epoch": 0.23983236011908565, + "grad_norm": 1.7346086502075195, + "learning_rate": 
9.649643788960856e-05, + "loss": 0.8431, + "step": 37540 + }, + { + "epoch": 0.23989624726882436, + "grad_norm": 0.7352771162986755, + "learning_rate": 9.649459245337679e-05, + "loss": 0.6744, + "step": 37550 + }, + { + "epoch": 0.23996013441856304, + "grad_norm": 0.6544600129127502, + "learning_rate": 9.649293116042181e-05, + "loss": 1.2871, + "step": 37560 + }, + { + "epoch": 0.24002402156830174, + "grad_norm": 0.4782470464706421, + "learning_rate": 9.649108483454848e-05, + "loss": 1.022, + "step": 37570 + }, + { + "epoch": 0.24008790871804045, + "grad_norm": 0.9200822710990906, + "learning_rate": 9.648923804046968e-05, + "loss": 0.737, + "step": 37580 + }, + { + "epoch": 0.24015179586777916, + "grad_norm": 1.8405570983886719, + "learning_rate": 9.648739077820405e-05, + "loss": 0.832, + "step": 37590 + }, + { + "epoch": 0.24021568301751786, + "grad_norm": 1.054779052734375, + "learning_rate": 9.648554304777017e-05, + "loss": 0.8275, + "step": 37600 + }, + { + "epoch": 0.24027957016725657, + "grad_norm": 0.8630744814872742, + "learning_rate": 9.648369484918667e-05, + "loss": 0.8602, + "step": 37610 + }, + { + "epoch": 0.24034345731699525, + "grad_norm": 1.0110766887664795, + "learning_rate": 9.648184618247214e-05, + "loss": 0.8669, + "step": 37620 + }, + { + "epoch": 0.24040734446673395, + "grad_norm": 1.0114331245422363, + "learning_rate": 9.64799970476452e-05, + "loss": 1.0707, + "step": 37630 + }, + { + "epoch": 0.24047123161647266, + "grad_norm": 0.8818547129631042, + "learning_rate": 9.64781474447245e-05, + "loss": 0.9434, + "step": 37640 + }, + { + "epoch": 0.24053511876621136, + "grad_norm": 1.12362802028656, + "learning_rate": 9.647629737372863e-05, + "loss": 1.1379, + "step": 37650 + }, + { + "epoch": 0.24059900591595007, + "grad_norm": 0.696323812007904, + "learning_rate": 9.647444683467623e-05, + "loss": 0.951, + "step": 37660 + }, + { + "epoch": 0.24066289306568878, + "grad_norm": 0.8041189312934875, + "learning_rate": 9.647259582758597e-05, + "loss": 
0.9218, + "step": 37670 + }, + { + "epoch": 0.24072678021542748, + "grad_norm": 0.45444270968437195, + "learning_rate": 9.647074435247644e-05, + "loss": 0.7025, + "step": 37680 + }, + { + "epoch": 0.24079066736516616, + "grad_norm": 0.6444490551948547, + "learning_rate": 9.646889240936632e-05, + "loss": 1.011, + "step": 37690 + }, + { + "epoch": 0.24085455451490487, + "grad_norm": 0.9339631199836731, + "learning_rate": 9.646703999827426e-05, + "loss": 1.0486, + "step": 37700 + }, + { + "epoch": 0.24091844166464357, + "grad_norm": 1.2948579788208008, + "learning_rate": 9.64651871192189e-05, + "loss": 1.2862, + "step": 37710 + }, + { + "epoch": 0.24098232881438228, + "grad_norm": 0.8452892899513245, + "learning_rate": 9.64633337722189e-05, + "loss": 0.8614, + "step": 37720 + }, + { + "epoch": 0.24104621596412099, + "grad_norm": 0.8650469183921814, + "learning_rate": 9.646147995729294e-05, + "loss": 1.0569, + "step": 37730 + }, + { + "epoch": 0.2411101031138597, + "grad_norm": 0.8053631782531738, + "learning_rate": 9.645962567445969e-05, + "loss": 0.9448, + "step": 37740 + }, + { + "epoch": 0.24117399026359837, + "grad_norm": 0.6854358315467834, + "learning_rate": 9.64577709237378e-05, + "loss": 0.9552, + "step": 37750 + }, + { + "epoch": 0.24123787741333708, + "grad_norm": 1.0860304832458496, + "learning_rate": 9.645591570514598e-05, + "loss": 1.111, + "step": 37760 + }, + { + "epoch": 0.24130176456307578, + "grad_norm": 0.7687236070632935, + "learning_rate": 9.64540600187029e-05, + "loss": 0.8652, + "step": 37770 + }, + { + "epoch": 0.2413656517128145, + "grad_norm": 1.2046473026275635, + "learning_rate": 9.645220386442724e-05, + "loss": 0.7453, + "step": 37780 + }, + { + "epoch": 0.2414295388625532, + "grad_norm": 0.9802344441413879, + "learning_rate": 9.64503472423377e-05, + "loss": 0.8819, + "step": 37790 + }, + { + "epoch": 0.2414934260122919, + "grad_norm": 0.7101196646690369, + "learning_rate": 9.644849015245296e-05, + "loss": 0.8814, + "step": 37800 + }, + { 
+ "epoch": 0.24155731316203058, + "grad_norm": 1.215147852897644, + "learning_rate": 9.644663259479177e-05, + "loss": 0.9877, + "step": 37810 + }, + { + "epoch": 0.24162120031176929, + "grad_norm": 0.9594703316688538, + "learning_rate": 9.64447745693728e-05, + "loss": 0.8849, + "step": 37820 + }, + { + "epoch": 0.241685087461508, + "grad_norm": 0.6628295183181763, + "learning_rate": 9.644291607621476e-05, + "loss": 0.6372, + "step": 37830 + }, + { + "epoch": 0.2417489746112467, + "grad_norm": 0.7092610001564026, + "learning_rate": 9.644105711533638e-05, + "loss": 0.9584, + "step": 37840 + }, + { + "epoch": 0.2418128617609854, + "grad_norm": 1.3304320573806763, + "learning_rate": 9.643919768675637e-05, + "loss": 1.075, + "step": 37850 + }, + { + "epoch": 0.2418767489107241, + "grad_norm": 0.8040294051170349, + "learning_rate": 9.643733779049349e-05, + "loss": 0.9873, + "step": 37860 + }, + { + "epoch": 0.2419406360604628, + "grad_norm": 0.6643669009208679, + "learning_rate": 9.643547742656643e-05, + "loss": 0.9092, + "step": 37870 + }, + { + "epoch": 0.2420045232102015, + "grad_norm": 0.8764951229095459, + "learning_rate": 9.643361659499392e-05, + "loss": 0.7729, + "step": 37880 + }, + { + "epoch": 0.2420684103599402, + "grad_norm": 6.183263778686523, + "learning_rate": 9.643175529579475e-05, + "loss": 1.238, + "step": 37890 + }, + { + "epoch": 0.2421322975096789, + "grad_norm": 1.3563274145126343, + "learning_rate": 9.642989352898762e-05, + "loss": 0.8639, + "step": 37900 + }, + { + "epoch": 0.2421961846594176, + "grad_norm": 0.8023094534873962, + "learning_rate": 9.64280312945913e-05, + "loss": 1.1368, + "step": 37910 + }, + { + "epoch": 0.24226007180915632, + "grad_norm": 0.6188109517097473, + "learning_rate": 9.642616859262455e-05, + "loss": 1.0875, + "step": 37920 + }, + { + "epoch": 0.242323958958895, + "grad_norm": 0.7044292688369751, + "learning_rate": 9.64243054231061e-05, + "loss": 0.9899, + "step": 37930 + }, + { + "epoch": 0.2423878461086337, + 
"grad_norm": 0.8876643776893616, + "learning_rate": 9.642244178605473e-05, + "loss": 0.9804, + "step": 37940 + }, + { + "epoch": 0.2424517332583724, + "grad_norm": 0.7421206831932068, + "learning_rate": 9.642057768148922e-05, + "loss": 0.8828, + "step": 37950 + }, + { + "epoch": 0.24251562040811112, + "grad_norm": 0.8457249402999878, + "learning_rate": 9.641871310942832e-05, + "loss": 0.9491, + "step": 37960 + }, + { + "epoch": 0.24257950755784982, + "grad_norm": 1.0700315237045288, + "learning_rate": 9.641684806989084e-05, + "loss": 0.8752, + "step": 37970 + }, + { + "epoch": 0.24264339470758853, + "grad_norm": 0.7216569781303406, + "learning_rate": 9.641498256289552e-05, + "loss": 1.1564, + "step": 37980 + }, + { + "epoch": 0.2427072818573272, + "grad_norm": 0.847780704498291, + "learning_rate": 9.641311658846119e-05, + "loss": 0.8944, + "step": 37990 + }, + { + "epoch": 0.2427711690070659, + "grad_norm": 0.9553901553153992, + "learning_rate": 9.641125014660662e-05, + "loss": 0.9013, + "step": 38000 + }, + { + "epoch": 0.24283505615680462, + "grad_norm": 1.37058424949646, + "learning_rate": 9.64093832373506e-05, + "loss": 0.9315, + "step": 38010 + }, + { + "epoch": 0.24289894330654332, + "grad_norm": 0.7296909689903259, + "learning_rate": 9.640751586071195e-05, + "loss": 0.8648, + "step": 38020 + }, + { + "epoch": 0.24296283045628203, + "grad_norm": 1.1600792407989502, + "learning_rate": 9.640564801670948e-05, + "loss": 0.7834, + "step": 38030 + }, + { + "epoch": 0.24302671760602074, + "grad_norm": 1.0222969055175781, + "learning_rate": 9.640377970536197e-05, + "loss": 1.0175, + "step": 38040 + }, + { + "epoch": 0.24309060475575942, + "grad_norm": 1.1712769269943237, + "learning_rate": 9.640191092668825e-05, + "loss": 1.0173, + "step": 38050 + }, + { + "epoch": 0.24315449190549812, + "grad_norm": 1.4743320941925049, + "learning_rate": 9.640004168070716e-05, + "loss": 1.2532, + "step": 38060 + }, + { + "epoch": 0.24321837905523683, + "grad_norm": 
0.7207898497581482, + "learning_rate": 9.639817196743749e-05, + "loss": 0.9538, + "step": 38070 + }, + { + "epoch": 0.24328226620497553, + "grad_norm": 1.0338420867919922, + "learning_rate": 9.639630178689809e-05, + "loss": 0.8653, + "step": 38080 + }, + { + "epoch": 0.24334615335471424, + "grad_norm": 0.7284950017929077, + "learning_rate": 9.639443113910781e-05, + "loss": 0.7094, + "step": 38090 + }, + { + "epoch": 0.24341004050445295, + "grad_norm": 1.0017796754837036, + "learning_rate": 9.639256002408545e-05, + "loss": 0.7997, + "step": 38100 + }, + { + "epoch": 0.24347392765419162, + "grad_norm": 1.1406546831130981, + "learning_rate": 9.639068844184989e-05, + "loss": 0.9456, + "step": 38110 + }, + { + "epoch": 0.24353781480393033, + "grad_norm": 0.7113826870918274, + "learning_rate": 9.638881639241996e-05, + "loss": 0.8586, + "step": 38120 + }, + { + "epoch": 0.24360170195366904, + "grad_norm": 0.8635872602462769, + "learning_rate": 9.638694387581453e-05, + "loss": 0.8049, + "step": 38130 + }, + { + "epoch": 0.24366558910340774, + "grad_norm": 0.882631242275238, + "learning_rate": 9.638507089205242e-05, + "loss": 0.7798, + "step": 38140 + }, + { + "epoch": 0.24372947625314645, + "grad_norm": 0.9470729231834412, + "learning_rate": 9.638319744115254e-05, + "loss": 0.875, + "step": 38150 + }, + { + "epoch": 0.24379336340288515, + "grad_norm": 0.5995595455169678, + "learning_rate": 9.638132352313371e-05, + "loss": 0.7982, + "step": 38160 + }, + { + "epoch": 0.24385725055262383, + "grad_norm": 0.702936589717865, + "learning_rate": 9.637944913801485e-05, + "loss": 0.8089, + "step": 38170 + }, + { + "epoch": 0.24392113770236254, + "grad_norm": 0.9595651626586914, + "learning_rate": 9.63775742858148e-05, + "loss": 0.9638, + "step": 38180 + }, + { + "epoch": 0.24398502485210125, + "grad_norm": 0.706251323223114, + "learning_rate": 9.637569896655245e-05, + "loss": 0.7911, + "step": 38190 + }, + { + "epoch": 0.24404891200183995, + "grad_norm": 0.6460245251655579, + 
"learning_rate": 9.63740107798937e-05, + "loss": 1.1869, + "step": 38200 + }, + { + "epoch": 0.24411279915157866, + "grad_norm": 0.7056565880775452, + "learning_rate": 9.637213457326503e-05, + "loss": 0.874, + "step": 38210 + }, + { + "epoch": 0.24417668630131736, + "grad_norm": 1.0978292226791382, + "learning_rate": 9.637025789962885e-05, + "loss": 1.059, + "step": 38220 + }, + { + "epoch": 0.24424057345105604, + "grad_norm": 1.170605182647705, + "learning_rate": 9.636838075900405e-05, + "loss": 0.7483, + "step": 38230 + }, + { + "epoch": 0.24430446060079475, + "grad_norm": 0.9686828255653381, + "learning_rate": 9.636650315140955e-05, + "loss": 1.0386, + "step": 38240 + }, + { + "epoch": 0.24436834775053345, + "grad_norm": 0.9862095713615417, + "learning_rate": 9.636462507686425e-05, + "loss": 0.8925, + "step": 38250 + }, + { + "epoch": 0.24443223490027216, + "grad_norm": 0.8100789189338684, + "learning_rate": 9.636274653538707e-05, + "loss": 0.8097, + "step": 38260 + }, + { + "epoch": 0.24449612205001087, + "grad_norm": 0.9674224257469177, + "learning_rate": 9.636086752699691e-05, + "loss": 0.9992, + "step": 38270 + }, + { + "epoch": 0.24456000919974957, + "grad_norm": 0.8364433646202087, + "learning_rate": 9.63589880517127e-05, + "loss": 0.7888, + "step": 38280 + }, + { + "epoch": 0.24462389634948825, + "grad_norm": 0.8828948140144348, + "learning_rate": 9.63571081095534e-05, + "loss": 0.9031, + "step": 38290 + }, + { + "epoch": 0.24468778349922696, + "grad_norm": 0.7281533479690552, + "learning_rate": 9.63552277005379e-05, + "loss": 0.7772, + "step": 38300 + }, + { + "epoch": 0.24475167064896566, + "grad_norm": 1.225823998451233, + "learning_rate": 9.635334682468516e-05, + "loss": 0.7892, + "step": 38310 + }, + { + "epoch": 0.24481555779870437, + "grad_norm": 0.8330084681510925, + "learning_rate": 9.63514654820141e-05, + "loss": 0.8299, + "step": 38320 + }, + { + "epoch": 0.24487944494844308, + "grad_norm": 1.0153292417526245, + "learning_rate": 
9.63495836725437e-05, + "loss": 1.0699, + "step": 38330 + }, + { + "epoch": 0.24494333209818178, + "grad_norm": 1.185373306274414, + "learning_rate": 9.634770139629288e-05, + "loss": 0.9322, + "step": 38340 + }, + { + "epoch": 0.24500721924792046, + "grad_norm": 0.7242724895477295, + "learning_rate": 9.634581865328062e-05, + "loss": 0.7414, + "step": 38350 + }, + { + "epoch": 0.24507110639765917, + "grad_norm": 0.8044312596321106, + "learning_rate": 9.634393544352589e-05, + "loss": 1.1166, + "step": 38360 + }, + { + "epoch": 0.24513499354739787, + "grad_norm": 0.9795138239860535, + "learning_rate": 9.63420517670476e-05, + "loss": 0.7423, + "step": 38370 + }, + { + "epoch": 0.24519888069713658, + "grad_norm": 0.7290918231010437, + "learning_rate": 9.634016762386478e-05, + "loss": 0.9426, + "step": 38380 + }, + { + "epoch": 0.24526276784687528, + "grad_norm": 0.7532154321670532, + "learning_rate": 9.633828301399639e-05, + "loss": 0.9311, + "step": 38390 + }, + { + "epoch": 0.245326654996614, + "grad_norm": 1.0642715692520142, + "learning_rate": 9.633639793746139e-05, + "loss": 0.83, + "step": 38400 + }, + { + "epoch": 0.24539054214635267, + "grad_norm": 0.5684540271759033, + "learning_rate": 9.633451239427877e-05, + "loss": 0.9418, + "step": 38410 + }, + { + "epoch": 0.24545442929609138, + "grad_norm": 0.827085554599762, + "learning_rate": 9.633262638446753e-05, + "loss": 0.8866, + "step": 38420 + }, + { + "epoch": 0.24551831644583008, + "grad_norm": 1.217444896697998, + "learning_rate": 9.633073990804666e-05, + "loss": 0.8359, + "step": 38430 + }, + { + "epoch": 0.2455822035955688, + "grad_norm": 0.964013397693634, + "learning_rate": 9.632885296503515e-05, + "loss": 0.9809, + "step": 38440 + }, + { + "epoch": 0.2456460907453075, + "grad_norm": 0.5672999024391174, + "learning_rate": 9.632696555545203e-05, + "loss": 0.7156, + "step": 38450 + }, + { + "epoch": 0.2457099778950462, + "grad_norm": 0.6509802341461182, + "learning_rate": 9.632507767931626e-05, + "loss": 
0.7118, + "step": 38460 + }, + { + "epoch": 0.24577386504478488, + "grad_norm": 1.3255314826965332, + "learning_rate": 9.63231893366469e-05, + "loss": 0.9581, + "step": 38470 + }, + { + "epoch": 0.24583775219452358, + "grad_norm": 0.7225618958473206, + "learning_rate": 9.632130052746296e-05, + "loss": 0.9634, + "step": 38480 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 1.1634318828582764, + "learning_rate": 9.631941125178343e-05, + "loss": 1.0248, + "step": 38490 + }, + { + "epoch": 0.245965526494001, + "grad_norm": 0.75383460521698, + "learning_rate": 9.631752150962736e-05, + "loss": 0.8206, + "step": 38500 + }, + { + "epoch": 0.2460294136437397, + "grad_norm": 1.0557365417480469, + "learning_rate": 9.631563130101377e-05, + "loss": 0.8587, + "step": 38510 + }, + { + "epoch": 0.2460933007934784, + "grad_norm": 0.8488501906394958, + "learning_rate": 9.631374062596172e-05, + "loss": 0.9565, + "step": 38520 + }, + { + "epoch": 0.24615718794321712, + "grad_norm": 0.7064526081085205, + "learning_rate": 9.631184948449023e-05, + "loss": 0.8974, + "step": 38530 + }, + { + "epoch": 0.2462210750929558, + "grad_norm": 1.123842716217041, + "learning_rate": 9.630995787661834e-05, + "loss": 1.0052, + "step": 38540 + }, + { + "epoch": 0.2462849622426945, + "grad_norm": 0.8303399085998535, + "learning_rate": 9.630806580236512e-05, + "loss": 0.6941, + "step": 38550 + }, + { + "epoch": 0.2463488493924332, + "grad_norm": 0.6929298043251038, + "learning_rate": 9.630617326174962e-05, + "loss": 0.9629, + "step": 38560 + }, + { + "epoch": 0.2464127365421719, + "grad_norm": 0.9439372420310974, + "learning_rate": 9.630428025479088e-05, + "loss": 0.8717, + "step": 38570 + }, + { + "epoch": 0.24647662369191062, + "grad_norm": 0.9138436317443848, + "learning_rate": 9.630238678150799e-05, + "loss": 0.8803, + "step": 38580 + }, + { + "epoch": 0.24654051084164932, + "grad_norm": 0.8482638597488403, + "learning_rate": 9.630049284192001e-05, + "loss": 0.9979, + "step": 38590 + }, + { + 
"epoch": 0.246604397991388, + "grad_norm": 0.779525637626648, + "learning_rate": 9.6298598436046e-05, + "loss": 0.7542, + "step": 38600 + }, + { + "epoch": 0.2466682851411267, + "grad_norm": 0.8518264293670654, + "learning_rate": 9.629670356390505e-05, + "loss": 0.9886, + "step": 38610 + }, + { + "epoch": 0.24673217229086541, + "grad_norm": 1.1973893642425537, + "learning_rate": 9.629480822551627e-05, + "loss": 1.0507, + "step": 38620 + }, + { + "epoch": 0.24679605944060412, + "grad_norm": 0.8994162678718567, + "learning_rate": 9.629291242089869e-05, + "loss": 0.8537, + "step": 38630 + }, + { + "epoch": 0.24685994659034283, + "grad_norm": 0.5772508978843689, + "learning_rate": 9.629101615007145e-05, + "loss": 0.8669, + "step": 38640 + }, + { + "epoch": 0.24692383374008153, + "grad_norm": 0.7069337368011475, + "learning_rate": 9.628911941305361e-05, + "loss": 1.0522, + "step": 38650 + }, + { + "epoch": 0.2469877208898202, + "grad_norm": 0.722184956073761, + "learning_rate": 9.62872222098643e-05, + "loss": 1.1426, + "step": 38660 + }, + { + "epoch": 0.24705160803955892, + "grad_norm": 1.3423835039138794, + "learning_rate": 9.628532454052263e-05, + "loss": 0.7297, + "step": 38670 + }, + { + "epoch": 0.24711549518929762, + "grad_norm": 0.7924500703811646, + "learning_rate": 9.628342640504769e-05, + "loss": 0.7804, + "step": 38680 + }, + { + "epoch": 0.24717938233903633, + "grad_norm": 0.9021787047386169, + "learning_rate": 9.628152780345861e-05, + "loss": 0.8003, + "step": 38690 + }, + { + "epoch": 0.24724326948877504, + "grad_norm": 0.991362452507019, + "learning_rate": 9.62796287357745e-05, + "loss": 0.6662, + "step": 38700 + }, + { + "epoch": 0.24730715663851374, + "grad_norm": 0.8103492856025696, + "learning_rate": 9.62777292020145e-05, + "loss": 0.8117, + "step": 38710 + }, + { + "epoch": 0.24737104378825242, + "grad_norm": 0.6966975331306458, + "learning_rate": 9.627582920219773e-05, + "loss": 0.8408, + "step": 38720 + }, + { + "epoch": 0.24743493093799113, + 
"grad_norm": 0.7350102663040161, + "learning_rate": 9.627392873634332e-05, + "loss": 0.7777, + "step": 38730 + }, + { + "epoch": 0.24749881808772983, + "grad_norm": 0.7632319927215576, + "learning_rate": 9.627202780447041e-05, + "loss": 1.0788, + "step": 38740 + }, + { + "epoch": 0.24756270523746854, + "grad_norm": 0.6083953976631165, + "learning_rate": 9.627012640659816e-05, + "loss": 0.8096, + "step": 38750 + }, + { + "epoch": 0.24762659238720724, + "grad_norm": 0.7463345527648926, + "learning_rate": 9.62682245427457e-05, + "loss": 0.7349, + "step": 38760 + }, + { + "epoch": 0.24769047953694595, + "grad_norm": 0.7767286896705627, + "learning_rate": 9.626632221293219e-05, + "loss": 1.1039, + "step": 38770 + }, + { + "epoch": 0.24775436668668463, + "grad_norm": 1.4006277322769165, + "learning_rate": 9.626441941717678e-05, + "loss": 0.7395, + "step": 38780 + }, + { + "epoch": 0.24781825383642334, + "grad_norm": 0.9676879048347473, + "learning_rate": 9.626251615549867e-05, + "loss": 1.1208, + "step": 38790 + }, + { + "epoch": 0.24788214098616204, + "grad_norm": 0.8642779588699341, + "learning_rate": 9.626061242791699e-05, + "loss": 0.8964, + "step": 38800 + }, + { + "epoch": 0.24794602813590075, + "grad_norm": 1.1887127161026, + "learning_rate": 9.625870823445092e-05, + "loss": 0.8361, + "step": 38810 + }, + { + "epoch": 0.24800991528563945, + "grad_norm": 0.7598790526390076, + "learning_rate": 9.625680357511962e-05, + "loss": 0.9854, + "step": 38820 + }, + { + "epoch": 0.24807380243537816, + "grad_norm": 1.8052411079406738, + "learning_rate": 9.625489844994231e-05, + "loss": 0.867, + "step": 38830 + }, + { + "epoch": 0.24813768958511684, + "grad_norm": 0.9150733947753906, + "learning_rate": 9.625299285893816e-05, + "loss": 0.9337, + "step": 38840 + }, + { + "epoch": 0.24820157673485554, + "grad_norm": 0.8813467621803284, + "learning_rate": 9.625108680212633e-05, + "loss": 0.6746, + "step": 38850 + }, + { + "epoch": 0.24826546388459425, + "grad_norm": 
0.7594160437583923, + "learning_rate": 9.624918027952607e-05, + "loss": 1.0736, + "step": 38860 + }, + { + "epoch": 0.24832935103433296, + "grad_norm": 0.7469112873077393, + "learning_rate": 9.624727329115655e-05, + "loss": 0.8787, + "step": 38870 + }, + { + "epoch": 0.24839323818407166, + "grad_norm": 0.7725271582603455, + "learning_rate": 9.624536583703697e-05, + "loss": 0.768, + "step": 38880 + }, + { + "epoch": 0.24845712533381037, + "grad_norm": 2.877929210662842, + "learning_rate": 9.624345791718656e-05, + "loss": 1.0134, + "step": 38890 + }, + { + "epoch": 0.24852101248354905, + "grad_norm": 0.7709558606147766, + "learning_rate": 9.62415495316245e-05, + "loss": 0.922, + "step": 38900 + }, + { + "epoch": 0.24858489963328775, + "grad_norm": 1.2846732139587402, + "learning_rate": 9.623964068037006e-05, + "loss": 1.2037, + "step": 38910 + }, + { + "epoch": 0.24864878678302646, + "grad_norm": 0.6465185284614563, + "learning_rate": 9.62377313634424e-05, + "loss": 0.9262, + "step": 38920 + }, + { + "epoch": 0.24871267393276517, + "grad_norm": 0.7968388199806213, + "learning_rate": 9.623582158086081e-05, + "loss": 0.7902, + "step": 38930 + }, + { + "epoch": 0.24877656108250387, + "grad_norm": 0.9779314994812012, + "learning_rate": 9.62339113326445e-05, + "loss": 0.9249, + "step": 38940 + }, + { + "epoch": 0.24884044823224258, + "grad_norm": 1.0651602745056152, + "learning_rate": 9.62320006188127e-05, + "loss": 0.8328, + "step": 38950 + }, + { + "epoch": 0.24890433538198126, + "grad_norm": 0.6496372222900391, + "learning_rate": 9.623008943938466e-05, + "loss": 0.8704, + "step": 38960 + }, + { + "epoch": 0.24896822253171996, + "grad_norm": 1.1484968662261963, + "learning_rate": 9.62281777943796e-05, + "loss": 0.8183, + "step": 38970 + }, + { + "epoch": 0.24903210968145867, + "grad_norm": 1.39591383934021, + "learning_rate": 9.622626568381684e-05, + "loss": 1.0141, + "step": 38980 + }, + { + "epoch": 0.24909599683119737, + "grad_norm": 0.6642056107521057, + 
"learning_rate": 9.622435310771556e-05, + "loss": 0.8481, + "step": 38990 + }, + { + "epoch": 0.24915988398093608, + "grad_norm": 1.48539400100708, + "learning_rate": 9.622244006609506e-05, + "loss": 0.8486, + "step": 39000 + }, + { + "epoch": 0.2492237711306748, + "grad_norm": 0.6797450184822083, + "learning_rate": 9.62205265589746e-05, + "loss": 0.8783, + "step": 39010 + }, + { + "epoch": 0.24928765828041347, + "grad_norm": 0.6741315126419067, + "learning_rate": 9.621861258637345e-05, + "loss": 0.8605, + "step": 39020 + }, + { + "epoch": 0.24935154543015217, + "grad_norm": 0.8567194938659668, + "learning_rate": 9.621669814831089e-05, + "loss": 1.1903, + "step": 39030 + }, + { + "epoch": 0.24941543257989088, + "grad_norm": 0.665440022945404, + "learning_rate": 9.62147832448062e-05, + "loss": 0.9027, + "step": 39040 + }, + { + "epoch": 0.24947931972962958, + "grad_norm": 0.7882826924324036, + "learning_rate": 9.621286787587866e-05, + "loss": 0.7315, + "step": 39050 + }, + { + "epoch": 0.2495432068793683, + "grad_norm": 0.6440451145172119, + "learning_rate": 9.621095204154756e-05, + "loss": 1.0849, + "step": 39060 + }, + { + "epoch": 0.249607094029107, + "grad_norm": 0.7116972804069519, + "learning_rate": 9.62090357418322e-05, + "loss": 0.9145, + "step": 39070 + }, + { + "epoch": 0.24967098117884567, + "grad_norm": 0.895585298538208, + "learning_rate": 9.620711897675185e-05, + "loss": 0.9243, + "step": 39080 + }, + { + "epoch": 0.24973486832858438, + "grad_norm": 0.730792224407196, + "learning_rate": 9.620520174632585e-05, + "loss": 0.8114, + "step": 39090 + }, + { + "epoch": 0.2497987554783231, + "grad_norm": 1.1607691049575806, + "learning_rate": 9.620328405057352e-05, + "loss": 0.9737, + "step": 39100 + }, + { + "epoch": 0.2498626426280618, + "grad_norm": 0.927515983581543, + "learning_rate": 9.620136588951412e-05, + "loss": 1.0355, + "step": 39110 + }, + { + "epoch": 0.2499265297778005, + "grad_norm": 1.267722487449646, + "learning_rate": 9.6199447263167e-05, + 
"loss": 1.0208, + "step": 39120 + }, + { + "epoch": 0.2499904169275392, + "grad_norm": 0.9854876399040222, + "learning_rate": 9.619752817155149e-05, + "loss": 1.2538, + "step": 39130 + }, + { + "epoch": 0.2500543040772779, + "grad_norm": 0.8375557065010071, + "learning_rate": 9.61956086146869e-05, + "loss": 0.8444, + "step": 39140 + }, + { + "epoch": 0.2501181912270166, + "grad_norm": 0.7425163388252258, + "learning_rate": 9.619368859259255e-05, + "loss": 0.6912, + "step": 39150 + }, + { + "epoch": 0.2501820783767553, + "grad_norm": 0.9362971782684326, + "learning_rate": 9.61917681052878e-05, + "loss": 1.0123, + "step": 39160 + }, + { + "epoch": 0.250245965526494, + "grad_norm": 0.7394902110099792, + "learning_rate": 9.6189847152792e-05, + "loss": 0.7578, + "step": 39170 + }, + { + "epoch": 0.2503098526762327, + "grad_norm": 2.326955556869507, + "learning_rate": 9.618792573512447e-05, + "loss": 0.9821, + "step": 39180 + }, + { + "epoch": 0.2503737398259714, + "grad_norm": 0.5567811727523804, + "learning_rate": 9.618600385230456e-05, + "loss": 1.543, + "step": 39190 + }, + { + "epoch": 0.2504376269757101, + "grad_norm": 0.8062513470649719, + "learning_rate": 9.618408150435165e-05, + "loss": 1.0742, + "step": 39200 + }, + { + "epoch": 0.2505015141254488, + "grad_norm": 0.8506273031234741, + "learning_rate": 9.618215869128507e-05, + "loss": 0.8167, + "step": 39210 + }, + { + "epoch": 0.2505654012751875, + "grad_norm": 1.3575971126556396, + "learning_rate": 9.61802354131242e-05, + "loss": 1.0804, + "step": 39220 + }, + { + "epoch": 0.2506292884249262, + "grad_norm": 0.6114894151687622, + "learning_rate": 9.617831166988842e-05, + "loss": 0.7265, + "step": 39230 + }, + { + "epoch": 0.2506931755746649, + "grad_norm": 1.1617469787597656, + "learning_rate": 9.617638746159709e-05, + "loss": 1.1414, + "step": 39240 + }, + { + "epoch": 0.2507570627244036, + "grad_norm": 1.004840612411499, + "learning_rate": 9.617446278826958e-05, + "loss": 0.8523, + "step": 39250 + }, + { + 
"epoch": 0.25082094987414233, + "grad_norm": 0.69590824842453, + "learning_rate": 9.617253764992529e-05, + "loss": 0.6603, + "step": 39260 + }, + { + "epoch": 0.25088483702388104, + "grad_norm": 2.461747169494629, + "learning_rate": 9.61706120465836e-05, + "loss": 0.973, + "step": 39270 + }, + { + "epoch": 0.25094872417361974, + "grad_norm": 1.718680500984192, + "learning_rate": 9.616868597826389e-05, + "loss": 0.9792, + "step": 39280 + }, + { + "epoch": 0.2510126113233584, + "grad_norm": 0.9190512299537659, + "learning_rate": 9.616675944498559e-05, + "loss": 1.0048, + "step": 39290 + }, + { + "epoch": 0.2510764984730971, + "grad_norm": 0.6358333230018616, + "learning_rate": 9.616483244676809e-05, + "loss": 0.8014, + "step": 39300 + }, + { + "epoch": 0.2511403856228358, + "grad_norm": 0.8349801301956177, + "learning_rate": 9.616290498363076e-05, + "loss": 0.9359, + "step": 39310 + }, + { + "epoch": 0.2512042727725745, + "grad_norm": 0.715552568435669, + "learning_rate": 9.616097705559306e-05, + "loss": 0.9922, + "step": 39320 + }, + { + "epoch": 0.2512681599223132, + "grad_norm": 0.7606783509254456, + "learning_rate": 9.615904866267438e-05, + "loss": 0.8454, + "step": 39330 + }, + { + "epoch": 0.2513320470720519, + "grad_norm": 0.4417531192302704, + "learning_rate": 9.615711980489415e-05, + "loss": 0.7793, + "step": 39340 + }, + { + "epoch": 0.25139593422179063, + "grad_norm": 0.9691148400306702, + "learning_rate": 9.615519048227178e-05, + "loss": 1.0334, + "step": 39350 + }, + { + "epoch": 0.25145982137152934, + "grad_norm": 0.9279484748840332, + "learning_rate": 9.615326069482673e-05, + "loss": 0.76, + "step": 39360 + }, + { + "epoch": 0.25152370852126804, + "grad_norm": 0.8048676252365112, + "learning_rate": 9.61513304425784e-05, + "loss": 0.7468, + "step": 39370 + }, + { + "epoch": 0.25158759567100675, + "grad_norm": 0.460409939289093, + "learning_rate": 9.614939972554626e-05, + "loss": 0.9653, + "step": 39380 + }, + { + "epoch": 0.25165148282074545, + 
"grad_norm": 0.5549013018608093, + "learning_rate": 9.614746854374972e-05, + "loss": 0.6636, + "step": 39390 + }, + { + "epoch": 0.25171536997048416, + "grad_norm": 0.5946542024612427, + "learning_rate": 9.614553689720827e-05, + "loss": 1.0487, + "step": 39400 + }, + { + "epoch": 0.25177925712022287, + "grad_norm": 0.4577612578868866, + "learning_rate": 9.614360478594133e-05, + "loss": 0.7034, + "step": 39410 + }, + { + "epoch": 0.2518431442699615, + "grad_norm": 1.0320554971694946, + "learning_rate": 9.614167220996838e-05, + "loss": 1.0302, + "step": 39420 + }, + { + "epoch": 0.2519070314197002, + "grad_norm": 0.5677948594093323, + "learning_rate": 9.613973916930887e-05, + "loss": 0.7853, + "step": 39430 + }, + { + "epoch": 0.25197091856943893, + "grad_norm": 0.8020201325416565, + "learning_rate": 9.613780566398227e-05, + "loss": 0.9151, + "step": 39440 + }, + { + "epoch": 0.25203480571917763, + "grad_norm": 0.966617226600647, + "learning_rate": 9.613587169400805e-05, + "loss": 0.8891, + "step": 39450 + }, + { + "epoch": 0.25209869286891634, + "grad_norm": 1.3065134286880493, + "learning_rate": 9.613393725940568e-05, + "loss": 0.8974, + "step": 39460 + }, + { + "epoch": 0.25216258001865505, + "grad_norm": 0.45482340455055237, + "learning_rate": 9.613200236019466e-05, + "loss": 0.8328, + "step": 39470 + }, + { + "epoch": 0.25222646716839375, + "grad_norm": 1.2391636371612549, + "learning_rate": 9.613006699639446e-05, + "loss": 1.1481, + "step": 39480 + }, + { + "epoch": 0.25229035431813246, + "grad_norm": 0.6843194365501404, + "learning_rate": 9.612813116802459e-05, + "loss": 0.9104, + "step": 39490 + }, + { + "epoch": 0.25235424146787117, + "grad_norm": 0.9102997779846191, + "learning_rate": 9.612619487510452e-05, + "loss": 0.9072, + "step": 39500 + }, + { + "epoch": 0.25241812861760987, + "grad_norm": 1.0311905145645142, + "learning_rate": 9.612425811765376e-05, + "loss": 0.7641, + "step": 39510 + }, + { + "epoch": 0.2524820157673486, + "grad_norm": 
0.7546457648277283, + "learning_rate": 9.612232089569183e-05, + "loss": 0.7963, + "step": 39520 + }, + { + "epoch": 0.2525459029170873, + "grad_norm": 0.6303521990776062, + "learning_rate": 9.612038320923822e-05, + "loss": 0.7462, + "step": 39530 + }, + { + "epoch": 0.25260979006682593, + "grad_norm": 0.8027763366699219, + "learning_rate": 9.611844505831245e-05, + "loss": 1.104, + "step": 39540 + }, + { + "epoch": 0.25267367721656464, + "grad_norm": 0.7128822803497314, + "learning_rate": 9.611650644293404e-05, + "loss": 0.8728, + "step": 39550 + }, + { + "epoch": 0.25273756436630335, + "grad_norm": 0.6497736573219299, + "learning_rate": 9.611456736312252e-05, + "loss": 0.7607, + "step": 39560 + }, + { + "epoch": 0.25280145151604205, + "grad_norm": 0.8743992447853088, + "learning_rate": 9.61126278188974e-05, + "loss": 0.7501, + "step": 39570 + }, + { + "epoch": 0.25286533866578076, + "grad_norm": 0.9536669254302979, + "learning_rate": 9.611068781027824e-05, + "loss": 0.9285, + "step": 39580 + }, + { + "epoch": 0.25292922581551947, + "grad_norm": 0.5790122747421265, + "learning_rate": 9.610874733728455e-05, + "loss": 0.9496, + "step": 39590 + }, + { + "epoch": 0.25299311296525817, + "grad_norm": 0.9158995151519775, + "learning_rate": 9.61068063999359e-05, + "loss": 0.6402, + "step": 39600 + }, + { + "epoch": 0.2530570001149969, + "grad_norm": 0.5689387321472168, + "learning_rate": 9.61048649982518e-05, + "loss": 0.8452, + "step": 39610 + }, + { + "epoch": 0.2531208872647356, + "grad_norm": 0.7655090093612671, + "learning_rate": 9.610292313225184e-05, + "loss": 0.8777, + "step": 39620 + }, + { + "epoch": 0.2531847744144743, + "grad_norm": 0.9562214612960815, + "learning_rate": 9.610098080195555e-05, + "loss": 0.8012, + "step": 39630 + }, + { + "epoch": 0.253248661564213, + "grad_norm": 0.9904442429542542, + "learning_rate": 9.609903800738251e-05, + "loss": 0.7636, + "step": 39640 + }, + { + "epoch": 0.2533125487139517, + "grad_norm": 1.2793811559677124, + 
"learning_rate": 9.609709474855226e-05, + "loss": 0.7783, + "step": 39650 + }, + { + "epoch": 0.25337643586369035, + "grad_norm": 1.0388842821121216, + "learning_rate": 9.60951510254844e-05, + "loss": 0.9002, + "step": 39660 + }, + { + "epoch": 0.25344032301342906, + "grad_norm": 0.5834752321243286, + "learning_rate": 9.60932068381985e-05, + "loss": 0.8864, + "step": 39670 + }, + { + "epoch": 0.25350421016316776, + "grad_norm": 0.7528826594352722, + "learning_rate": 9.609126218671411e-05, + "loss": 0.921, + "step": 39680 + }, + { + "epoch": 0.25356809731290647, + "grad_norm": 0.8979531526565552, + "learning_rate": 9.608931707105085e-05, + "loss": 0.7506, + "step": 39690 + }, + { + "epoch": 0.2536319844626452, + "grad_norm": 0.8471142649650574, + "learning_rate": 9.608737149122829e-05, + "loss": 0.7908, + "step": 39700 + }, + { + "epoch": 0.2536958716123839, + "grad_norm": 1.1389135122299194, + "learning_rate": 9.608542544726603e-05, + "loss": 0.6934, + "step": 39710 + }, + { + "epoch": 0.2537597587621226, + "grad_norm": 0.7484509944915771, + "learning_rate": 9.608347893918366e-05, + "loss": 0.8213, + "step": 39720 + }, + { + "epoch": 0.2538236459118613, + "grad_norm": 1.0670719146728516, + "learning_rate": 9.608153196700078e-05, + "loss": 0.8251, + "step": 39730 + }, + { + "epoch": 0.2538875330616, + "grad_norm": 0.6782344579696655, + "learning_rate": 9.607958453073702e-05, + "loss": 1.1657, + "step": 39740 + }, + { + "epoch": 0.2539514202113387, + "grad_norm": 1.3708243370056152, + "learning_rate": 9.607763663041198e-05, + "loss": 0.815, + "step": 39750 + }, + { + "epoch": 0.2540153073610774, + "grad_norm": 0.6719133853912354, + "learning_rate": 9.607568826604528e-05, + "loss": 0.8005, + "step": 39760 + }, + { + "epoch": 0.2540791945108161, + "grad_norm": 0.8270106315612793, + "learning_rate": 9.607373943765652e-05, + "loss": 0.9914, + "step": 39770 + }, + { + "epoch": 0.25414308166055477, + "grad_norm": 0.9956563711166382, + "learning_rate": 
9.607179014526535e-05, + "loss": 0.8496, + "step": 39780 + }, + { + "epoch": 0.2542069688102935, + "grad_norm": 1.2813925743103027, + "learning_rate": 9.60698403888914e-05, + "loss": 0.8953, + "step": 39790 + }, + { + "epoch": 0.2542708559600322, + "grad_norm": 0.9512122273445129, + "learning_rate": 9.60678901685543e-05, + "loss": 1.2809, + "step": 39800 + }, + { + "epoch": 0.2543347431097709, + "grad_norm": 0.9539186954498291, + "learning_rate": 9.60659394842737e-05, + "loss": 1.0062, + "step": 39810 + }, + { + "epoch": 0.2543986302595096, + "grad_norm": 0.963093101978302, + "learning_rate": 9.606398833606923e-05, + "loss": 0.9276, + "step": 39820 + }, + { + "epoch": 0.2544625174092483, + "grad_norm": 0.8398544192314148, + "learning_rate": 9.606203672396055e-05, + "loss": 1.2115, + "step": 39830 + }, + { + "epoch": 0.254526404558987, + "grad_norm": 0.7242417335510254, + "learning_rate": 9.60600846479673e-05, + "loss": 0.7489, + "step": 39840 + }, + { + "epoch": 0.2545902917087257, + "grad_norm": 1.4944490194320679, + "learning_rate": 9.605813210810917e-05, + "loss": 1.3959, + "step": 39850 + }, + { + "epoch": 0.2546541788584644, + "grad_norm": 0.8167847394943237, + "learning_rate": 9.605617910440579e-05, + "loss": 0.8212, + "step": 39860 + }, + { + "epoch": 0.2547180660082031, + "grad_norm": 0.7867516279220581, + "learning_rate": 9.605422563687684e-05, + "loss": 1.1342, + "step": 39870 + }, + { + "epoch": 0.25478195315794183, + "grad_norm": 1.030428171157837, + "learning_rate": 9.605227170554201e-05, + "loss": 1.0195, + "step": 39880 + }, + { + "epoch": 0.25484584030768054, + "grad_norm": 0.6590962409973145, + "learning_rate": 9.605031731042094e-05, + "loss": 0.928, + "step": 39890 + }, + { + "epoch": 0.2549097274574192, + "grad_norm": 1.0177749395370483, + "learning_rate": 9.604836245153334e-05, + "loss": 0.8701, + "step": 39900 + }, + { + "epoch": 0.2549736146071579, + "grad_norm": 0.6760947108268738, + "learning_rate": 9.604640712889891e-05, + "loss": 0.75, + 
"step": 39910 + }, + { + "epoch": 0.2550375017568966, + "grad_norm": 0.6182072758674622, + "learning_rate": 9.604445134253731e-05, + "loss": 0.7936, + "step": 39920 + }, + { + "epoch": 0.2551013889066353, + "grad_norm": 2.0102100372314453, + "learning_rate": 9.604249509246826e-05, + "loss": 0.7663, + "step": 39930 + }, + { + "epoch": 0.255165276056374, + "grad_norm": 0.7477763295173645, + "learning_rate": 9.604053837871145e-05, + "loss": 0.7996, + "step": 39940 + }, + { + "epoch": 0.2552291632061127, + "grad_norm": 0.8741987943649292, + "learning_rate": 9.603858120128658e-05, + "loss": 0.6339, + "step": 39950 + }, + { + "epoch": 0.2552930503558514, + "grad_norm": 0.9012942314147949, + "learning_rate": 9.603662356021337e-05, + "loss": 1.0624, + "step": 39960 + }, + { + "epoch": 0.25535693750559013, + "grad_norm": 0.9270766973495483, + "learning_rate": 9.603466545551155e-05, + "loss": 0.8575, + "step": 39970 + }, + { + "epoch": 0.25542082465532884, + "grad_norm": 1.3213611841201782, + "learning_rate": 9.603270688720081e-05, + "loss": 1.0033, + "step": 39980 + }, + { + "epoch": 0.25548471180506754, + "grad_norm": 1.1009562015533447, + "learning_rate": 9.603074785530088e-05, + "loss": 1.2523, + "step": 39990 + }, + { + "epoch": 0.25554859895480625, + "grad_norm": 0.6423203945159912, + "learning_rate": 9.602878835983151e-05, + "loss": 1.3048, + "step": 40000 + }, + { + "epoch": 0.25561248610454496, + "grad_norm": 0.747044026851654, + "learning_rate": 9.60268284008124e-05, + "loss": 0.8828, + "step": 40010 + }, + { + "epoch": 0.2556763732542836, + "grad_norm": 2.6469714641571045, + "learning_rate": 9.602486797826333e-05, + "loss": 1.0208, + "step": 40020 + }, + { + "epoch": 0.2557402604040223, + "grad_norm": 1.1847596168518066, + "learning_rate": 9.602290709220403e-05, + "loss": 1.0072, + "step": 40030 + }, + { + "epoch": 0.255804147553761, + "grad_norm": 2.8336246013641357, + "learning_rate": 9.602094574265421e-05, + "loss": 1.1539, + "step": 40040 + }, + { + "epoch": 
0.2558680347034997, + "grad_norm": 0.8645704388618469, + "learning_rate": 9.601898392963368e-05, + "loss": 0.9056, + "step": 40050 + }, + { + "epoch": 0.25593192185323843, + "grad_norm": 1.049857258796692, + "learning_rate": 9.601702165316216e-05, + "loss": 0.9048, + "step": 40060 + }, + { + "epoch": 0.25599580900297714, + "grad_norm": 0.4633677005767822, + "learning_rate": 9.601505891325941e-05, + "loss": 0.8304, + "step": 40070 + }, + { + "epoch": 0.25605969615271584, + "grad_norm": 0.9349238872528076, + "learning_rate": 9.601309570994522e-05, + "loss": 0.8373, + "step": 40080 + }, + { + "epoch": 0.25612358330245455, + "grad_norm": 0.8270478844642639, + "learning_rate": 9.601113204323935e-05, + "loss": 0.9072, + "step": 40090 + }, + { + "epoch": 0.25618747045219326, + "grad_norm": 0.9788760542869568, + "learning_rate": 9.600916791316157e-05, + "loss": 0.9037, + "step": 40100 + }, + { + "epoch": 0.25625135760193196, + "grad_norm": 0.9276544451713562, + "learning_rate": 9.600720331973167e-05, + "loss": 0.855, + "step": 40110 + }, + { + "epoch": 0.25631524475167067, + "grad_norm": 0.6395992636680603, + "learning_rate": 9.600523826296943e-05, + "loss": 0.7903, + "step": 40120 + }, + { + "epoch": 0.2563791319014094, + "grad_norm": 0.8007004261016846, + "learning_rate": 9.600327274289464e-05, + "loss": 0.8177, + "step": 40130 + }, + { + "epoch": 0.256443019051148, + "grad_norm": 1.1678056716918945, + "learning_rate": 9.60013067595271e-05, + "loss": 0.8526, + "step": 40140 + }, + { + "epoch": 0.25650690620088673, + "grad_norm": 0.8026590347290039, + "learning_rate": 9.59993403128866e-05, + "loss": 0.9358, + "step": 40150 + }, + { + "epoch": 0.25657079335062544, + "grad_norm": 1.020652174949646, + "learning_rate": 9.599737340299294e-05, + "loss": 1.0027, + "step": 40160 + }, + { + "epoch": 0.25663468050036414, + "grad_norm": 0.8998063206672668, + "learning_rate": 9.599540602986594e-05, + "loss": 0.9003, + "step": 40170 + }, + { + "epoch": 0.25669856765010285, + 
"grad_norm": 1.1373684406280518, + "learning_rate": 9.599343819352542e-05, + "loss": 0.9378, + "step": 40180 + }, + { + "epoch": 0.25676245479984156, + "grad_norm": 0.9886625409126282, + "learning_rate": 9.599146989399117e-05, + "loss": 0.7858, + "step": 40190 + }, + { + "epoch": 0.25682634194958026, + "grad_norm": 0.899255633354187, + "learning_rate": 9.598950113128304e-05, + "loss": 0.829, + "step": 40200 + }, + { + "epoch": 0.25689022909931897, + "grad_norm": 1.3470916748046875, + "learning_rate": 9.598753190542086e-05, + "loss": 1.1775, + "step": 40210 + }, + { + "epoch": 0.2569541162490577, + "grad_norm": 0.6345730423927307, + "learning_rate": 9.598556221642443e-05, + "loss": 0.9636, + "step": 40220 + }, + { + "epoch": 0.2570180033987964, + "grad_norm": 0.9716305136680603, + "learning_rate": 9.598359206431362e-05, + "loss": 0.7459, + "step": 40230 + }, + { + "epoch": 0.2570818905485351, + "grad_norm": 0.5304000377655029, + "learning_rate": 9.598162144910824e-05, + "loss": 1.0725, + "step": 40240 + }, + { + "epoch": 0.2571457776982738, + "grad_norm": 0.5957240462303162, + "learning_rate": 9.597965037082817e-05, + "loss": 0.6653, + "step": 40250 + }, + { + "epoch": 0.2572096648480125, + "grad_norm": 0.4943158030509949, + "learning_rate": 9.597767882949322e-05, + "loss": 0.86, + "step": 40260 + }, + { + "epoch": 0.25727355199775115, + "grad_norm": 0.5531061291694641, + "learning_rate": 9.59757068251233e-05, + "loss": 1.0038, + "step": 40270 + }, + { + "epoch": 0.25733743914748985, + "grad_norm": 0.747992753982544, + "learning_rate": 9.59737343577382e-05, + "loss": 0.7299, + "step": 40280 + }, + { + "epoch": 0.25740132629722856, + "grad_norm": 0.6985594034194946, + "learning_rate": 9.597176142735784e-05, + "loss": 0.7868, + "step": 40290 + }, + { + "epoch": 0.25746521344696727, + "grad_norm": 0.7488262057304382, + "learning_rate": 9.596978803400207e-05, + "loss": 0.8295, + "step": 40300 + }, + { + "epoch": 0.257529100596706, + "grad_norm": 1.0540517568588257, + 
"learning_rate": 9.596781417769076e-05, + "loss": 0.9902, + "step": 40310 + }, + { + "epoch": 0.2575929877464447, + "grad_norm": 0.7886415123939514, + "learning_rate": 9.596583985844381e-05, + "loss": 1.1803, + "step": 40320 + }, + { + "epoch": 0.2576568748961834, + "grad_norm": 0.8558555841445923, + "learning_rate": 9.596386507628108e-05, + "loss": 0.7431, + "step": 40330 + }, + { + "epoch": 0.2577207620459221, + "grad_norm": 1.4910976886749268, + "learning_rate": 9.596188983122246e-05, + "loss": 0.804, + "step": 40340 + }, + { + "epoch": 0.2577846491956608, + "grad_norm": 0.7117838859558105, + "learning_rate": 9.595991412328784e-05, + "loss": 1.1232, + "step": 40350 + }, + { + "epoch": 0.2578485363453995, + "grad_norm": 1.4588252305984497, + "learning_rate": 9.595793795249714e-05, + "loss": 0.8111, + "step": 40360 + }, + { + "epoch": 0.2579124234951382, + "grad_norm": 1.322583794593811, + "learning_rate": 9.595596131887024e-05, + "loss": 0.6423, + "step": 40370 + }, + { + "epoch": 0.2579763106448769, + "grad_norm": 1.1012037992477417, + "learning_rate": 9.595398422242702e-05, + "loss": 1.0775, + "step": 40380 + }, + { + "epoch": 0.25804019779461557, + "grad_norm": 0.7852954864501953, + "learning_rate": 9.595200666318746e-05, + "loss": 0.9674, + "step": 40390 + }, + { + "epoch": 0.2581040849443543, + "grad_norm": 0.9846484661102295, + "learning_rate": 9.595002864117144e-05, + "loss": 1.0256, + "step": 40400 + }, + { + "epoch": 0.258167972094093, + "grad_norm": 0.7954578399658203, + "learning_rate": 9.594805015639887e-05, + "loss": 0.8524, + "step": 40410 + }, + { + "epoch": 0.2582318592438317, + "grad_norm": 1.4870191812515259, + "learning_rate": 9.594607120888968e-05, + "loss": 1.0345, + "step": 40420 + }, + { + "epoch": 0.2582957463935704, + "grad_norm": 0.8756714463233948, + "learning_rate": 9.594409179866382e-05, + "loss": 0.7956, + "step": 40430 + }, + { + "epoch": 0.2583596335433091, + "grad_norm": 0.9294307231903076, + "learning_rate": 
9.594211192574119e-05, + "loss": 1.2266, + "step": 40440 + }, + { + "epoch": 0.2584235206930478, + "grad_norm": 0.6904338598251343, + "learning_rate": 9.594013159014174e-05, + "loss": 1.1333, + "step": 40450 + }, + { + "epoch": 0.2584874078427865, + "grad_norm": 0.9498498439788818, + "learning_rate": 9.593815079188544e-05, + "loss": 0.8015, + "step": 40460 + }, + { + "epoch": 0.2585512949925252, + "grad_norm": 0.686410665512085, + "learning_rate": 9.593616953099222e-05, + "loss": 1.0482, + "step": 40470 + }, + { + "epoch": 0.2586151821422639, + "grad_norm": 0.8870381116867065, + "learning_rate": 9.593418780748203e-05, + "loss": 0.9889, + "step": 40480 + }, + { + "epoch": 0.25867906929200263, + "grad_norm": 0.6513268947601318, + "learning_rate": 9.593220562137481e-05, + "loss": 0.8747, + "step": 40490 + }, + { + "epoch": 0.25874295644174133, + "grad_norm": 0.8651543855667114, + "learning_rate": 9.593022297269056e-05, + "loss": 0.8722, + "step": 40500 + }, + { + "epoch": 0.25880684359148, + "grad_norm": 0.8488206267356873, + "learning_rate": 9.592823986144923e-05, + "loss": 0.8462, + "step": 40510 + }, + { + "epoch": 0.2588707307412187, + "grad_norm": 1.4681724309921265, + "learning_rate": 9.592625628767079e-05, + "loss": 0.8677, + "step": 40520 + }, + { + "epoch": 0.2589346178909574, + "grad_norm": 4.392560958862305, + "learning_rate": 9.592427225137521e-05, + "loss": 0.8277, + "step": 40530 + }, + { + "epoch": 0.2589985050406961, + "grad_norm": 0.7526928782463074, + "learning_rate": 9.59222877525825e-05, + "loss": 0.9594, + "step": 40540 + }, + { + "epoch": 0.2590623921904348, + "grad_norm": 0.8426737785339355, + "learning_rate": 9.59203027913126e-05, + "loss": 0.9697, + "step": 40550 + }, + { + "epoch": 0.2591262793401735, + "grad_norm": 1.0020760297775269, + "learning_rate": 9.591831736758553e-05, + "loss": 1.0801, + "step": 40560 + }, + { + "epoch": 0.2591901664899122, + "grad_norm": 0.746593177318573, + "learning_rate": 9.591633148142129e-05, + "loss": 0.9383, 
+ "step": 40570 + }, + { + "epoch": 0.2592540536396509, + "grad_norm": 0.7140881419181824, + "learning_rate": 9.591434513283986e-05, + "loss": 0.8904, + "step": 40580 + }, + { + "epoch": 0.25931794078938963, + "grad_norm": 0.9984216690063477, + "learning_rate": 9.591235832186125e-05, + "loss": 0.8797, + "step": 40590 + }, + { + "epoch": 0.25938182793912834, + "grad_norm": 0.9869062304496765, + "learning_rate": 9.591037104850546e-05, + "loss": 0.8081, + "step": 40600 + }, + { + "epoch": 0.25944571508886705, + "grad_norm": 1.0706651210784912, + "learning_rate": 9.590838331279255e-05, + "loss": 1.1206, + "step": 40610 + }, + { + "epoch": 0.25950960223860575, + "grad_norm": 1.028882384300232, + "learning_rate": 9.590639511474248e-05, + "loss": 0.9107, + "step": 40620 + }, + { + "epoch": 0.2595734893883444, + "grad_norm": 1.1108314990997314, + "learning_rate": 9.590440645437529e-05, + "loss": 0.8844, + "step": 40630 + }, + { + "epoch": 0.2596373765380831, + "grad_norm": 0.9321774840354919, + "learning_rate": 9.590241733171104e-05, + "loss": 0.8686, + "step": 40640 + }, + { + "epoch": 0.2597012636878218, + "grad_norm": 1.029826283454895, + "learning_rate": 9.59004277467697e-05, + "loss": 1.1136, + "step": 40650 + }, + { + "epoch": 0.2597651508375605, + "grad_norm": 0.908687949180603, + "learning_rate": 9.589843769957138e-05, + "loss": 0.7381, + "step": 40660 + }, + { + "epoch": 0.2598290379872992, + "grad_norm": 0.565542459487915, + "learning_rate": 9.589644719013607e-05, + "loss": 0.9248, + "step": 40670 + }, + { + "epoch": 0.25989292513703793, + "grad_norm": 0.7926294803619385, + "learning_rate": 9.589445621848384e-05, + "loss": 0.8401, + "step": 40680 + }, + { + "epoch": 0.25995681228677664, + "grad_norm": 0.6880931854248047, + "learning_rate": 9.589246478463471e-05, + "loss": 1.0261, + "step": 40690 + }, + { + "epoch": 0.26002069943651535, + "grad_norm": 1.0573099851608276, + "learning_rate": 9.589047288860876e-05, + "loss": 0.7308, + "step": 40700 + }, + { + 
"epoch": 0.26008458658625405, + "grad_norm": 1.2168048620224, + "learning_rate": 9.588848053042605e-05, + "loss": 0.9018, + "step": 40710 + }, + { + "epoch": 0.26014847373599276, + "grad_norm": 0.883126437664032, + "learning_rate": 9.588648771010666e-05, + "loss": 1.0485, + "step": 40720 + }, + { + "epoch": 0.26021236088573146, + "grad_norm": 0.7025336623191833, + "learning_rate": 9.588449442767062e-05, + "loss": 0.8312, + "step": 40730 + }, + { + "epoch": 0.26027624803547017, + "grad_norm": 1.2748278379440308, + "learning_rate": 9.588250068313803e-05, + "loss": 1.1546, + "step": 40740 + }, + { + "epoch": 0.2603401351852088, + "grad_norm": 0.5718688368797302, + "learning_rate": 9.588050647652898e-05, + "loss": 1.0319, + "step": 40750 + }, + { + "epoch": 0.2604040223349475, + "grad_norm": 0.8465229868888855, + "learning_rate": 9.587851180786351e-05, + "loss": 0.9557, + "step": 40760 + }, + { + "epoch": 0.26046790948468623, + "grad_norm": 0.9384807348251343, + "learning_rate": 9.587651667716175e-05, + "loss": 0.9037, + "step": 40770 + }, + { + "epoch": 0.26053179663442494, + "grad_norm": 0.8962541222572327, + "learning_rate": 9.58745210844438e-05, + "loss": 0.782, + "step": 40780 + }, + { + "epoch": 0.26059568378416365, + "grad_norm": 1.1555324792861938, + "learning_rate": 9.58725250297297e-05, + "loss": 0.9214, + "step": 40790 + }, + { + "epoch": 0.26065957093390235, + "grad_norm": 0.5348967909812927, + "learning_rate": 9.587052851303961e-05, + "loss": 0.9402, + "step": 40800 + }, + { + "epoch": 0.26072345808364106, + "grad_norm": 0.9009259343147278, + "learning_rate": 9.586853153439359e-05, + "loss": 0.8024, + "step": 40810 + }, + { + "epoch": 0.26078734523337976, + "grad_norm": 0.7219264507293701, + "learning_rate": 9.58665340938118e-05, + "loss": 0.6523, + "step": 40820 + }, + { + "epoch": 0.26085123238311847, + "grad_norm": 0.5624069571495056, + "learning_rate": 9.586453619131432e-05, + "loss": 0.8817, + "step": 40830 + }, + { + "epoch": 0.2609151195328572, + 
"grad_norm": 0.8910852074623108, + "learning_rate": 9.586253782692129e-05, + "loss": 0.9517, + "step": 40840 + }, + { + "epoch": 0.2609790066825959, + "grad_norm": 0.9586001038551331, + "learning_rate": 9.586053900065282e-05, + "loss": 0.788, + "step": 40850 + }, + { + "epoch": 0.2610428938323346, + "grad_norm": 0.696487307548523, + "learning_rate": 9.585853971252905e-05, + "loss": 0.8417, + "step": 40860 + }, + { + "epoch": 0.26110678098207324, + "grad_norm": 0.5967971682548523, + "learning_rate": 9.585653996257011e-05, + "loss": 0.6904, + "step": 40870 + }, + { + "epoch": 0.26117066813181194, + "grad_norm": 0.7355442643165588, + "learning_rate": 9.585453975079615e-05, + "loss": 0.8139, + "step": 40880 + }, + { + "epoch": 0.26123455528155065, + "grad_norm": 1.0505903959274292, + "learning_rate": 9.585253907722729e-05, + "loss": 0.9211, + "step": 40890 + }, + { + "epoch": 0.26129844243128936, + "grad_norm": 0.8949944376945496, + "learning_rate": 9.58505379418837e-05, + "loss": 0.7938, + "step": 40900 + }, + { + "epoch": 0.26136232958102806, + "grad_norm": 0.9528142809867859, + "learning_rate": 9.584853634478553e-05, + "loss": 1.1102, + "step": 40910 + }, + { + "epoch": 0.26142621673076677, + "grad_norm": 1.1576639413833618, + "learning_rate": 9.584653428595294e-05, + "loss": 1.1977, + "step": 40920 + }, + { + "epoch": 0.2614901038805055, + "grad_norm": 0.895746648311615, + "learning_rate": 9.584453176540607e-05, + "loss": 0.9543, + "step": 40930 + }, + { + "epoch": 0.2615539910302442, + "grad_norm": 0.5793939828872681, + "learning_rate": 9.58425287831651e-05, + "loss": 0.7111, + "step": 40940 + }, + { + "epoch": 0.2616178781799829, + "grad_norm": 0.8070379495620728, + "learning_rate": 9.584052533925023e-05, + "loss": 0.9304, + "step": 40950 + }, + { + "epoch": 0.2616817653297216, + "grad_norm": 0.7948583364486694, + "learning_rate": 9.583852143368159e-05, + "loss": 0.8819, + "step": 40960 + }, + { + "epoch": 0.2617456524794603, + "grad_norm": 0.9934036731719971, + 
"learning_rate": 9.58365170664794e-05, + "loss": 0.9324, + "step": 40970 + }, + { + "epoch": 0.261809539629199, + "grad_norm": 0.47576090693473816, + "learning_rate": 9.583451223766382e-05, + "loss": 0.686, + "step": 40980 + }, + { + "epoch": 0.26187342677893766, + "grad_norm": 0.9381804466247559, + "learning_rate": 9.583250694725505e-05, + "loss": 1.1989, + "step": 40990 + }, + { + "epoch": 0.26193731392867636, + "grad_norm": 0.6319658756256104, + "learning_rate": 9.58305011952733e-05, + "loss": 0.9446, + "step": 41000 + }, + { + "epoch": 0.26200120107841507, + "grad_norm": 0.8187490701675415, + "learning_rate": 9.582849498173873e-05, + "loss": 1.0167, + "step": 41010 + }, + { + "epoch": 0.2620650882281538, + "grad_norm": 0.682817816734314, + "learning_rate": 9.582648830667157e-05, + "loss": 0.8066, + "step": 41020 + }, + { + "epoch": 0.2621289753778925, + "grad_norm": 1.3718403577804565, + "learning_rate": 9.582448117009205e-05, + "loss": 0.6587, + "step": 41030 + }, + { + "epoch": 0.2621928625276312, + "grad_norm": 0.7690182328224182, + "learning_rate": 9.582247357202035e-05, + "loss": 0.8507, + "step": 41040 + }, + { + "epoch": 0.2622567496773699, + "grad_norm": 1.0088491439819336, + "learning_rate": 9.58204655124767e-05, + "loss": 0.907, + "step": 41050 + }, + { + "epoch": 0.2623206368271086, + "grad_norm": 0.912486732006073, + "learning_rate": 9.581845699148132e-05, + "loss": 1.0735, + "step": 41060 + }, + { + "epoch": 0.2623845239768473, + "grad_norm": 0.9121546149253845, + "learning_rate": 9.581644800905442e-05, + "loss": 0.8556, + "step": 41070 + }, + { + "epoch": 0.262448411126586, + "grad_norm": 0.8414210677146912, + "learning_rate": 9.581443856521628e-05, + "loss": 1.1905, + "step": 41080 + }, + { + "epoch": 0.2625122982763247, + "grad_norm": 0.5232017040252686, + "learning_rate": 9.58124286599871e-05, + "loss": 0.8749, + "step": 41090 + }, + { + "epoch": 0.2625761854260634, + "grad_norm": 1.9335732460021973, + "learning_rate": 9.581041829338712e-05, + 
"loss": 0.7256, + "step": 41100 + }, + { + "epoch": 0.26264007257580213, + "grad_norm": 0.7388540506362915, + "learning_rate": 9.58084074654366e-05, + "loss": 0.8657, + "step": 41110 + }, + { + "epoch": 0.2627039597255408, + "grad_norm": 1.9808521270751953, + "learning_rate": 9.580639617615579e-05, + "loss": 0.7139, + "step": 41120 + }, + { + "epoch": 0.2627678468752795, + "grad_norm": 1.5845258235931396, + "learning_rate": 9.580438442556494e-05, + "loss": 0.9972, + "step": 41130 + }, + { + "epoch": 0.2628317340250182, + "grad_norm": 0.8241519331932068, + "learning_rate": 9.580237221368431e-05, + "loss": 0.7273, + "step": 41140 + }, + { + "epoch": 0.2628956211747569, + "grad_norm": 0.8821679353713989, + "learning_rate": 9.580035954053418e-05, + "loss": 0.8493, + "step": 41150 + }, + { + "epoch": 0.2629595083244956, + "grad_norm": 0.7632741332054138, + "learning_rate": 9.57983464061348e-05, + "loss": 1.0987, + "step": 41160 + }, + { + "epoch": 0.2630233954742343, + "grad_norm": 0.6200475692749023, + "learning_rate": 9.579633281050644e-05, + "loss": 0.7774, + "step": 41170 + }, + { + "epoch": 0.263087282623973, + "grad_norm": 0.7186120748519897, + "learning_rate": 9.57943187536694e-05, + "loss": 1.0146, + "step": 41180 + }, + { + "epoch": 0.2631511697737117, + "grad_norm": 1.4124023914337158, + "learning_rate": 9.579230423564395e-05, + "loss": 0.9421, + "step": 41190 + }, + { + "epoch": 0.26321505692345043, + "grad_norm": 1.0203825235366821, + "learning_rate": 9.579028925645038e-05, + "loss": 0.7599, + "step": 41200 + }, + { + "epoch": 0.26327894407318914, + "grad_norm": 0.9980260729789734, + "learning_rate": 9.578827381610899e-05, + "loss": 0.7085, + "step": 41210 + }, + { + "epoch": 0.26334283122292784, + "grad_norm": 0.6271802186965942, + "learning_rate": 9.578625791464006e-05, + "loss": 0.7378, + "step": 41220 + }, + { + "epoch": 0.26340671837266655, + "grad_norm": 0.8588720560073853, + "learning_rate": 9.578424155206392e-05, + "loss": 1.1045, + "step": 41230 + 
}, + { + "epoch": 0.2634706055224052, + "grad_norm": 0.9197202920913696, + "learning_rate": 9.578222472840083e-05, + "loss": 0.7892, + "step": 41240 + }, + { + "epoch": 0.2635344926721439, + "grad_norm": 1.5513139963150024, + "learning_rate": 9.578020744367115e-05, + "loss": 0.9384, + "step": 41250 + }, + { + "epoch": 0.2635983798218826, + "grad_norm": 0.952202320098877, + "learning_rate": 9.577818969789516e-05, + "loss": 1.0154, + "step": 41260 + }, + { + "epoch": 0.2636622669716213, + "grad_norm": 0.7039241790771484, + "learning_rate": 9.577617149109322e-05, + "loss": 0.8493, + "step": 41270 + }, + { + "epoch": 0.26372615412136, + "grad_norm": 0.8046781420707703, + "learning_rate": 9.577415282328561e-05, + "loss": 0.8281, + "step": 41280 + }, + { + "epoch": 0.26379004127109873, + "grad_norm": 1.6643345355987549, + "learning_rate": 9.57721336944927e-05, + "loss": 0.9271, + "step": 41290 + }, + { + "epoch": 0.26385392842083744, + "grad_norm": 0.9182053208351135, + "learning_rate": 9.577011410473477e-05, + "loss": 0.9575, + "step": 41300 + }, + { + "epoch": 0.26391781557057614, + "grad_norm": 1.2427566051483154, + "learning_rate": 9.576809405403222e-05, + "loss": 1.1546, + "step": 41310 + }, + { + "epoch": 0.26398170272031485, + "grad_norm": 1.6159720420837402, + "learning_rate": 9.576607354240536e-05, + "loss": 0.8444, + "step": 41320 + }, + { + "epoch": 0.26404558987005355, + "grad_norm": 1.222095012664795, + "learning_rate": 9.576405256987456e-05, + "loss": 1.0805, + "step": 41330 + }, + { + "epoch": 0.26410947701979226, + "grad_norm": 0.5747536420822144, + "learning_rate": 9.576203113646015e-05, + "loss": 0.9668, + "step": 41340 + }, + { + "epoch": 0.26417336416953097, + "grad_norm": 0.505827009677887, + "learning_rate": 9.576000924218249e-05, + "loss": 0.812, + "step": 41350 + }, + { + "epoch": 0.2642372513192696, + "grad_norm": 0.9075201749801636, + "learning_rate": 9.575798688706196e-05, + "loss": 0.907, + "step": 41360 + }, + { + "epoch": 0.2643011384690083, 
+ "grad_norm": 0.6353416442871094, + "learning_rate": 9.575596407111891e-05, + "loss": 0.9169, + "step": 41370 + }, + { + "epoch": 0.26436502561874703, + "grad_norm": 0.8017897009849548, + "learning_rate": 9.575394079437372e-05, + "loss": 0.8862, + "step": 41380 + }, + { + "epoch": 0.26442891276848574, + "grad_norm": 0.6533048748970032, + "learning_rate": 9.575191705684676e-05, + "loss": 1.004, + "step": 41390 + }, + { + "epoch": 0.26449279991822444, + "grad_norm": 0.6979532837867737, + "learning_rate": 9.574989285855842e-05, + "loss": 0.7255, + "step": 41400 + }, + { + "epoch": 0.26455668706796315, + "grad_norm": 1.3084895610809326, + "learning_rate": 9.574786819952908e-05, + "loss": 0.9328, + "step": 41410 + }, + { + "epoch": 0.26462057421770185, + "grad_norm": 0.8081639409065247, + "learning_rate": 9.574584307977912e-05, + "loss": 1.1026, + "step": 41420 + }, + { + "epoch": 0.26468446136744056, + "grad_norm": 0.6545292735099792, + "learning_rate": 9.574381749932894e-05, + "loss": 0.8194, + "step": 41430 + }, + { + "epoch": 0.26474834851717927, + "grad_norm": 1.0248223543167114, + "learning_rate": 9.574179145819898e-05, + "loss": 0.9576, + "step": 41440 + }, + { + "epoch": 0.26481223566691797, + "grad_norm": 1.235369086265564, + "learning_rate": 9.573976495640958e-05, + "loss": 0.758, + "step": 41450 + }, + { + "epoch": 0.2648761228166567, + "grad_norm": 0.8014651536941528, + "learning_rate": 9.573773799398116e-05, + "loss": 0.7959, + "step": 41460 + }, + { + "epoch": 0.2649400099663954, + "grad_norm": 0.6294002532958984, + "learning_rate": 9.573571057093418e-05, + "loss": 0.9757, + "step": 41470 + }, + { + "epoch": 0.26500389711613404, + "grad_norm": 0.9327560663223267, + "learning_rate": 9.573368268728901e-05, + "loss": 1.1375, + "step": 41480 + }, + { + "epoch": 0.26506778426587274, + "grad_norm": 1.0414352416992188, + "learning_rate": 9.57316543430661e-05, + "loss": 1.0202, + "step": 41490 + }, + { + "epoch": 0.26513167141561145, + "grad_norm": 
1.695721983909607, + "learning_rate": 9.572962553828586e-05, + "loss": 0.8127, + "step": 41500 + }, + { + "epoch": 0.26519555856535015, + "grad_norm": 0.8339122533798218, + "learning_rate": 9.572759627296872e-05, + "loss": 0.7414, + "step": 41510 + }, + { + "epoch": 0.26525944571508886, + "grad_norm": 0.8159180283546448, + "learning_rate": 9.572556654713514e-05, + "loss": 0.811, + "step": 41520 + }, + { + "epoch": 0.26532333286482757, + "grad_norm": 0.9597871899604797, + "learning_rate": 9.572353636080555e-05, + "loss": 1.089, + "step": 41530 + }, + { + "epoch": 0.26538722001456627, + "grad_norm": 0.5867129564285278, + "learning_rate": 9.572150571400038e-05, + "loss": 0.9204, + "step": 41540 + }, + { + "epoch": 0.265451107164305, + "grad_norm": 0.706292450428009, + "learning_rate": 9.57194746067401e-05, + "loss": 0.8317, + "step": 41550 + }, + { + "epoch": 0.2655149943140437, + "grad_norm": 1.7897143363952637, + "learning_rate": 9.571744303904515e-05, + "loss": 0.9645, + "step": 41560 + }, + { + "epoch": 0.2655788814637824, + "grad_norm": 0.9594964981079102, + "learning_rate": 9.571541101093602e-05, + "loss": 0.9729, + "step": 41570 + }, + { + "epoch": 0.2656427686135211, + "grad_norm": 0.7055974006652832, + "learning_rate": 9.571337852243313e-05, + "loss": 0.8736, + "step": 41580 + }, + { + "epoch": 0.2657066557632598, + "grad_norm": 0.8249185681343079, + "learning_rate": 9.571134557355697e-05, + "loss": 0.9665, + "step": 41590 + }, + { + "epoch": 0.26577054291299845, + "grad_norm": 0.5963894128799438, + "learning_rate": 9.570931216432801e-05, + "loss": 0.8331, + "step": 41600 + }, + { + "epoch": 0.26583443006273716, + "grad_norm": 0.5887112617492676, + "learning_rate": 9.570727829476676e-05, + "loss": 0.8705, + "step": 41610 + }, + { + "epoch": 0.26589831721247587, + "grad_norm": 0.7210960388183594, + "learning_rate": 9.570524396489365e-05, + "loss": 0.7763, + "step": 41620 + }, + { + "epoch": 0.26596220436221457, + "grad_norm": 0.8247012495994568, + 
"learning_rate": 9.570320917472919e-05, + "loss": 0.8148, + "step": 41630 + }, + { + "epoch": 0.2660260915119533, + "grad_norm": 0.8511599898338318, + "learning_rate": 9.57011739242939e-05, + "loss": 0.9971, + "step": 41640 + }, + { + "epoch": 0.266089978661692, + "grad_norm": 0.6672869324684143, + "learning_rate": 9.569913821360824e-05, + "loss": 0.9768, + "step": 41650 + }, + { + "epoch": 0.2661538658114307, + "grad_norm": 1.1974848508834839, + "learning_rate": 9.569710204269271e-05, + "loss": 0.991, + "step": 41660 + }, + { + "epoch": 0.2662177529611694, + "grad_norm": 0.8013436198234558, + "learning_rate": 9.569506541156784e-05, + "loss": 0.8786, + "step": 41670 + }, + { + "epoch": 0.2662816401109081, + "grad_norm": 0.7496063113212585, + "learning_rate": 9.569302832025413e-05, + "loss": 0.858, + "step": 41680 + }, + { + "epoch": 0.2663455272606468, + "grad_norm": 0.8424622416496277, + "learning_rate": 9.569099076877208e-05, + "loss": 0.6938, + "step": 41690 + }, + { + "epoch": 0.2664094144103855, + "grad_norm": 0.8976988196372986, + "learning_rate": 9.568895275714225e-05, + "loss": 0.9233, + "step": 41700 + }, + { + "epoch": 0.2664733015601242, + "grad_norm": 1.4451935291290283, + "learning_rate": 9.568691428538512e-05, + "loss": 1.0334, + "step": 41710 + }, + { + "epoch": 0.26653718870986287, + "grad_norm": 1.2237658500671387, + "learning_rate": 9.568487535352124e-05, + "loss": 0.8801, + "step": 41720 + }, + { + "epoch": 0.2666010758596016, + "grad_norm": 2.1000301837921143, + "learning_rate": 9.568283596157115e-05, + "loss": 1.0871, + "step": 41730 + }, + { + "epoch": 0.2666649630093403, + "grad_norm": 0.8955364227294922, + "learning_rate": 9.568079610955539e-05, + "loss": 0.8636, + "step": 41740 + }, + { + "epoch": 0.266728850159079, + "grad_norm": 0.9859808087348938, + "learning_rate": 9.567875579749447e-05, + "loss": 0.9954, + "step": 41750 + }, + { + "epoch": 0.2667927373088177, + "grad_norm": 1.0908890962600708, + "learning_rate": 9.567671502540897e-05, 
+ "loss": 0.8674, + "step": 41760 + }, + { + "epoch": 0.2668566244585564, + "grad_norm": 0.48835402727127075, + "learning_rate": 9.567467379331943e-05, + "loss": 1.2828, + "step": 41770 + }, + { + "epoch": 0.2669205116082951, + "grad_norm": 0.7387278079986572, + "learning_rate": 9.567263210124641e-05, + "loss": 0.9535, + "step": 41780 + }, + { + "epoch": 0.2669843987580338, + "grad_norm": 0.813470184803009, + "learning_rate": 9.567058994921049e-05, + "loss": 0.9851, + "step": 41790 + }, + { + "epoch": 0.2670482859077725, + "grad_norm": 0.8729559779167175, + "learning_rate": 9.566854733723221e-05, + "loss": 0.8694, + "step": 41800 + }, + { + "epoch": 0.2671121730575112, + "grad_norm": 0.9543429613113403, + "learning_rate": 9.566650426533214e-05, + "loss": 0.8436, + "step": 41810 + }, + { + "epoch": 0.26717606020724993, + "grad_norm": 1.1418914794921875, + "learning_rate": 9.566446073353089e-05, + "loss": 0.9252, + "step": 41820 + }, + { + "epoch": 0.26723994735698864, + "grad_norm": 0.9289169907569885, + "learning_rate": 9.566241674184898e-05, + "loss": 0.9306, + "step": 41830 + }, + { + "epoch": 0.2673038345067273, + "grad_norm": 1.3721492290496826, + "learning_rate": 9.566037229030704e-05, + "loss": 1.0788, + "step": 41840 + }, + { + "epoch": 0.267367721656466, + "grad_norm": 0.7210074067115784, + "learning_rate": 9.565832737892566e-05, + "loss": 0.7705, + "step": 41850 + }, + { + "epoch": 0.2674316088062047, + "grad_norm": 0.6854256391525269, + "learning_rate": 9.565628200772542e-05, + "loss": 0.8524, + "step": 41860 + }, + { + "epoch": 0.2674954959559434, + "grad_norm": 1.1195999383926392, + "learning_rate": 9.565423617672691e-05, + "loss": 1.0449, + "step": 41870 + }, + { + "epoch": 0.2675593831056821, + "grad_norm": 1.1100611686706543, + "learning_rate": 9.565218988595077e-05, + "loss": 0.9949, + "step": 41880 + }, + { + "epoch": 0.2676232702554208, + "grad_norm": 0.6192795038223267, + "learning_rate": 9.565014313541756e-05, + "loss": 1.0524, + "step": 41890 + 
}, + { + "epoch": 0.2676871574051595, + "grad_norm": 1.138809084892273, + "learning_rate": 9.564809592514793e-05, + "loss": 1.0164, + "step": 41900 + }, + { + "epoch": 0.26775104455489823, + "grad_norm": 0.6620566844940186, + "learning_rate": 9.564604825516248e-05, + "loss": 0.9091, + "step": 41910 + }, + { + "epoch": 0.26781493170463694, + "grad_norm": 0.8734396696090698, + "learning_rate": 9.564400012548183e-05, + "loss": 1.1338, + "step": 41920 + }, + { + "epoch": 0.26787881885437564, + "grad_norm": 0.5580737590789795, + "learning_rate": 9.56419515361266e-05, + "loss": 0.8139, + "step": 41930 + }, + { + "epoch": 0.26794270600411435, + "grad_norm": 0.8338034152984619, + "learning_rate": 9.563990248711745e-05, + "loss": 0.8791, + "step": 41940 + }, + { + "epoch": 0.26800659315385306, + "grad_norm": 0.557685136795044, + "learning_rate": 9.563785297847501e-05, + "loss": 0.7298, + "step": 41950 + }, + { + "epoch": 0.26807048030359176, + "grad_norm": 0.8459478616714478, + "learning_rate": 9.563580301021988e-05, + "loss": 0.9614, + "step": 41960 + }, + { + "epoch": 0.2681343674533304, + "grad_norm": 0.918144166469574, + "learning_rate": 9.563375258237275e-05, + "loss": 0.9374, + "step": 41970 + }, + { + "epoch": 0.2681982546030691, + "grad_norm": 0.9142857193946838, + "learning_rate": 9.563170169495424e-05, + "loss": 0.8053, + "step": 41980 + }, + { + "epoch": 0.2682621417528078, + "grad_norm": 0.9095722436904907, + "learning_rate": 9.562965034798502e-05, + "loss": 0.9772, + "step": 41990 + }, + { + "epoch": 0.26832602890254653, + "grad_norm": 1.0715844631195068, + "learning_rate": 9.562759854148575e-05, + "loss": 0.8129, + "step": 42000 + }, + { + "epoch": 0.26838991605228524, + "grad_norm": 0.8338362574577332, + "learning_rate": 9.562554627547709e-05, + "loss": 0.9305, + "step": 42010 + }, + { + "epoch": 0.26845380320202394, + "grad_norm": 0.709132969379425, + "learning_rate": 9.562349354997971e-05, + "loss": 0.7656, + "step": 42020 + }, + { + "epoch": 
0.26851769035176265, + "grad_norm": 1.1715177297592163, + "learning_rate": 9.562144036501428e-05, + "loss": 0.9993, + "step": 42030 + }, + { + "epoch": 0.26858157750150136, + "grad_norm": 1.6266652345657349, + "learning_rate": 9.561938672060147e-05, + "loss": 0.8534, + "step": 42040 + }, + { + "epoch": 0.26864546465124006, + "grad_norm": 0.8818448781967163, + "learning_rate": 9.561733261676196e-05, + "loss": 0.8328, + "step": 42050 + }, + { + "epoch": 0.26870935180097877, + "grad_norm": 1.15716552734375, + "learning_rate": 9.561527805351646e-05, + "loss": 0.7874, + "step": 42060 + }, + { + "epoch": 0.2687732389507175, + "grad_norm": 0.913690984249115, + "learning_rate": 9.561322303088565e-05, + "loss": 0.7483, + "step": 42070 + }, + { + "epoch": 0.2688371261004562, + "grad_norm": 0.644812822341919, + "learning_rate": 9.561116754889022e-05, + "loss": 1.011, + "step": 42080 + }, + { + "epoch": 0.26890101325019483, + "grad_norm": 0.7774586081504822, + "learning_rate": 9.560911160755088e-05, + "loss": 1.0542, + "step": 42090 + }, + { + "epoch": 0.26896490039993354, + "grad_norm": 0.4670238494873047, + "learning_rate": 9.56070552068883e-05, + "loss": 0.9087, + "step": 42100 + }, + { + "epoch": 0.26902878754967224, + "grad_norm": 1.1221692562103271, + "learning_rate": 9.560499834692325e-05, + "loss": 0.8704, + "step": 42110 + }, + { + "epoch": 0.26909267469941095, + "grad_norm": 1.4995726346969604, + "learning_rate": 9.56029410276764e-05, + "loss": 0.9453, + "step": 42120 + }, + { + "epoch": 0.26915656184914966, + "grad_norm": 0.8239421844482422, + "learning_rate": 9.56008832491685e-05, + "loss": 0.8527, + "step": 42130 + }, + { + "epoch": 0.26922044899888836, + "grad_norm": 0.7687192559242249, + "learning_rate": 9.559882501142024e-05, + "loss": 1.0542, + "step": 42140 + }, + { + "epoch": 0.26928433614862707, + "grad_norm": 0.8161906003952026, + "learning_rate": 9.559676631445236e-05, + "loss": 0.8796, + "step": 42150 + }, + { + "epoch": 0.2693482232983658, + 
"grad_norm": 0.7608240842819214, + "learning_rate": 9.559470715828559e-05, + "loss": 0.8504, + "step": 42160 + }, + { + "epoch": 0.2694121104481045, + "grad_norm": 1.2632617950439453, + "learning_rate": 9.559264754294068e-05, + "loss": 0.6786, + "step": 42170 + }, + { + "epoch": 0.2694759975978432, + "grad_norm": 1.0786528587341309, + "learning_rate": 9.55907934965501e-05, + "loss": 1.0747, + "step": 42180 + }, + { + "epoch": 0.2695398847475819, + "grad_norm": 0.9386286735534668, + "learning_rate": 9.558873300882385e-05, + "loss": 0.6627, + "step": 42190 + }, + { + "epoch": 0.2696037718973206, + "grad_norm": 0.9134590029716492, + "learning_rate": 9.558667206197964e-05, + "loss": 0.7985, + "step": 42200 + }, + { + "epoch": 0.26966765904705925, + "grad_norm": 0.6581794619560242, + "learning_rate": 9.55846106560382e-05, + "loss": 0.7476, + "step": 42210 + }, + { + "epoch": 0.26973154619679796, + "grad_norm": 0.7346853017807007, + "learning_rate": 9.558254879102028e-05, + "loss": 1.1158, + "step": 42220 + }, + { + "epoch": 0.26979543334653666, + "grad_norm": 0.7923113107681274, + "learning_rate": 9.558048646694668e-05, + "loss": 1.0275, + "step": 42230 + }, + { + "epoch": 0.26985932049627537, + "grad_norm": 0.9055522680282593, + "learning_rate": 9.557842368383813e-05, + "loss": 0.7192, + "step": 42240 + }, + { + "epoch": 0.2699232076460141, + "grad_norm": 0.5908991694450378, + "learning_rate": 9.557636044171542e-05, + "loss": 0.9693, + "step": 42250 + }, + { + "epoch": 0.2699870947957528, + "grad_norm": 0.636661946773529, + "learning_rate": 9.557429674059935e-05, + "loss": 1.0553, + "step": 42260 + }, + { + "epoch": 0.2700509819454915, + "grad_norm": 0.9865610599517822, + "learning_rate": 9.557223258051069e-05, + "loss": 1.0789, + "step": 42270 + }, + { + "epoch": 0.2701148690952302, + "grad_norm": 0.9444893598556519, + "learning_rate": 9.557016796147021e-05, + "loss": 0.8252, + "step": 42280 + }, + { + "epoch": 0.2701787562449689, + "grad_norm": 0.7374017238616943, + 
"learning_rate": 9.556810288349871e-05, + "loss": 0.9914, + "step": 42290 + }, + { + "epoch": 0.2702426433947076, + "grad_norm": 0.772415041923523, + "learning_rate": 9.5566037346617e-05, + "loss": 0.7827, + "step": 42300 + }, + { + "epoch": 0.2703065305444463, + "grad_norm": 1.0962374210357666, + "learning_rate": 9.556397135084587e-05, + "loss": 0.995, + "step": 42310 + }, + { + "epoch": 0.270370417694185, + "grad_norm": 0.7097411751747131, + "learning_rate": 9.556190489620612e-05, + "loss": 0.8302, + "step": 42320 + }, + { + "epoch": 0.27043430484392367, + "grad_norm": 0.7932478785514832, + "learning_rate": 9.555983798271859e-05, + "loss": 0.9678, + "step": 42330 + }, + { + "epoch": 0.2704981919936624, + "grad_norm": 0.6816592812538147, + "learning_rate": 9.555777061040407e-05, + "loss": 1.0183, + "step": 42340 + }, + { + "epoch": 0.2705620791434011, + "grad_norm": 0.6527500152587891, + "learning_rate": 9.555570277928338e-05, + "loss": 0.8971, + "step": 42350 + }, + { + "epoch": 0.2706259662931398, + "grad_norm": 0.6478419899940491, + "learning_rate": 9.555363448937735e-05, + "loss": 0.8146, + "step": 42360 + }, + { + "epoch": 0.2706898534428785, + "grad_norm": 0.5460163354873657, + "learning_rate": 9.555156574070681e-05, + "loss": 0.7972, + "step": 42370 + }, + { + "epoch": 0.2707537405926172, + "grad_norm": 0.5501124262809753, + "learning_rate": 9.554949653329262e-05, + "loss": 0.6, + "step": 42380 + }, + { + "epoch": 0.2708176277423559, + "grad_norm": 0.9783973693847656, + "learning_rate": 9.554742686715557e-05, + "loss": 0.9689, + "step": 42390 + }, + { + "epoch": 0.2708815148920946, + "grad_norm": 1.4098743200302124, + "learning_rate": 9.554535674231652e-05, + "loss": 0.8839, + "step": 42400 + }, + { + "epoch": 0.2709454020418333, + "grad_norm": 0.9374015927314758, + "learning_rate": 9.554328615879636e-05, + "loss": 1.1531, + "step": 42410 + }, + { + "epoch": 0.271009289191572, + "grad_norm": 2.437901258468628, + "learning_rate": 9.554121511661587e-05, + 
"loss": 0.8329, + "step": 42420 + }, + { + "epoch": 0.27107317634131073, + "grad_norm": 0.5805662870407104, + "learning_rate": 9.553914361579597e-05, + "loss": 0.8364, + "step": 42430 + }, + { + "epoch": 0.27113706349104943, + "grad_norm": 0.8254538178443909, + "learning_rate": 9.553707165635747e-05, + "loss": 0.5683, + "step": 42440 + }, + { + "epoch": 0.2712009506407881, + "grad_norm": 1.0397802591323853, + "learning_rate": 9.55349992383213e-05, + "loss": 0.9475, + "step": 42450 + }, + { + "epoch": 0.2712648377905268, + "grad_norm": 1.111701488494873, + "learning_rate": 9.553292636170827e-05, + "loss": 0.9378, + "step": 42460 + }, + { + "epoch": 0.2713287249402655, + "grad_norm": 1.4257961511611938, + "learning_rate": 9.553085302653929e-05, + "loss": 0.828, + "step": 42470 + }, + { + "epoch": 0.2713926120900042, + "grad_norm": 1.0189239978790283, + "learning_rate": 9.552877923283522e-05, + "loss": 1.1691, + "step": 42480 + }, + { + "epoch": 0.2714564992397429, + "grad_norm": 1.3065085411071777, + "learning_rate": 9.552670498061697e-05, + "loss": 1.0535, + "step": 42490 + }, + { + "epoch": 0.2715203863894816, + "grad_norm": 0.6838773488998413, + "learning_rate": 9.55246302699054e-05, + "loss": 0.938, + "step": 42500 + }, + { + "epoch": 0.2715842735392203, + "grad_norm": 0.9010002613067627, + "learning_rate": 9.552255510072142e-05, + "loss": 0.7285, + "step": 42510 + }, + { + "epoch": 0.27164816068895903, + "grad_norm": 0.7863100171089172, + "learning_rate": 9.552047947308593e-05, + "loss": 0.8349, + "step": 42520 + }, + { + "epoch": 0.27171204783869773, + "grad_norm": 1.3251279592514038, + "learning_rate": 9.551840338701983e-05, + "loss": 0.8618, + "step": 42530 + }, + { + "epoch": 0.27177593498843644, + "grad_norm": 1.1294409036636353, + "learning_rate": 9.551632684254405e-05, + "loss": 0.9233, + "step": 42540 + }, + { + "epoch": 0.27183982213817515, + "grad_norm": 1.4269248247146606, + "learning_rate": 9.551424983967946e-05, + "loss": 0.8823, + "step": 42550 + 
}, + { + "epoch": 0.27190370928791385, + "grad_norm": 0.7050525546073914, + "learning_rate": 9.551217237844701e-05, + "loss": 0.8103, + "step": 42560 + }, + { + "epoch": 0.2719675964376525, + "grad_norm": 1.4217504262924194, + "learning_rate": 9.551009445886759e-05, + "loss": 0.7929, + "step": 42570 + }, + { + "epoch": 0.2720314835873912, + "grad_norm": 0.6999850869178772, + "learning_rate": 9.550801608096216e-05, + "loss": 1.3094, + "step": 42580 + }, + { + "epoch": 0.2720953707371299, + "grad_norm": 0.5494612455368042, + "learning_rate": 9.550593724475163e-05, + "loss": 0.9256, + "step": 42590 + }, + { + "epoch": 0.2721592578868686, + "grad_norm": 0.5456877946853638, + "learning_rate": 9.550385795025696e-05, + "loss": 0.8309, + "step": 42600 + }, + { + "epoch": 0.27222314503660733, + "grad_norm": 0.6689077615737915, + "learning_rate": 9.550177819749905e-05, + "loss": 1.158, + "step": 42610 + }, + { + "epoch": 0.27228703218634603, + "grad_norm": 0.811871349811554, + "learning_rate": 9.54996979864989e-05, + "loss": 0.6157, + "step": 42620 + }, + { + "epoch": 0.27235091933608474, + "grad_norm": 0.5832274556159973, + "learning_rate": 9.549761731727741e-05, + "loss": 0.9875, + "step": 42630 + }, + { + "epoch": 0.27241480648582345, + "grad_norm": 0.874345064163208, + "learning_rate": 9.549553618985556e-05, + "loss": 0.8906, + "step": 42640 + }, + { + "epoch": 0.27247869363556215, + "grad_norm": 2.064990282058716, + "learning_rate": 9.54934546042543e-05, + "loss": 0.8612, + "step": 42650 + }, + { + "epoch": 0.27254258078530086, + "grad_norm": 0.5960216522216797, + "learning_rate": 9.549137256049459e-05, + "loss": 0.8631, + "step": 42660 + }, + { + "epoch": 0.27260646793503956, + "grad_norm": 1.0062336921691895, + "learning_rate": 9.548929005859739e-05, + "loss": 0.786, + "step": 42670 + }, + { + "epoch": 0.27267035508477827, + "grad_norm": 0.9856522679328918, + "learning_rate": 9.548720709858371e-05, + "loss": 0.8347, + "step": 42680 + }, + { + "epoch": 
0.272734242234517, + "grad_norm": 1.2544548511505127, + "learning_rate": 9.548512368047448e-05, + "loss": 1.0405, + "step": 42690 + }, + { + "epoch": 0.2727981293842556, + "grad_norm": 0.45234474539756775, + "learning_rate": 9.548303980429072e-05, + "loss": 0.8274, + "step": 42700 + }, + { + "epoch": 0.27286201653399433, + "grad_norm": 0.7926174402236938, + "learning_rate": 9.54809554700534e-05, + "loss": 0.9903, + "step": 42710 + }, + { + "epoch": 0.27292590368373304, + "grad_norm": 0.9858782291412354, + "learning_rate": 9.547887067778352e-05, + "loss": 0.8354, + "step": 42720 + }, + { + "epoch": 0.27298979083347175, + "grad_norm": 0.5488384962081909, + "learning_rate": 9.547678542750204e-05, + "loss": 0.9663, + "step": 42730 + }, + { + "epoch": 0.27305367798321045, + "grad_norm": 0.7685027122497559, + "learning_rate": 9.547469971923001e-05, + "loss": 0.8943, + "step": 42740 + }, + { + "epoch": 0.27311756513294916, + "grad_norm": 0.910285472869873, + "learning_rate": 9.54726135529884e-05, + "loss": 0.7713, + "step": 42750 + }, + { + "epoch": 0.27318145228268786, + "grad_norm": 1.956381916999817, + "learning_rate": 9.547052692879825e-05, + "loss": 0.7784, + "step": 42760 + }, + { + "epoch": 0.27324533943242657, + "grad_norm": 0.7288005352020264, + "learning_rate": 9.546843984668055e-05, + "loss": 0.8306, + "step": 42770 + }, + { + "epoch": 0.2733092265821653, + "grad_norm": 0.8818903565406799, + "learning_rate": 9.54663523066563e-05, + "loss": 0.8758, + "step": 42780 + }, + { + "epoch": 0.273373113731904, + "grad_norm": 1.380008339881897, + "learning_rate": 9.546426430874658e-05, + "loss": 0.8951, + "step": 42790 + }, + { + "epoch": 0.2734370008816427, + "grad_norm": 1.8795280456542969, + "learning_rate": 9.546217585297236e-05, + "loss": 0.8414, + "step": 42800 + }, + { + "epoch": 0.2735008880313814, + "grad_norm": 1.4341343641281128, + "learning_rate": 9.546008693935473e-05, + "loss": 0.7366, + "step": 42810 + }, + { + "epoch": 0.27356477518112005, + "grad_norm": 
1.095348596572876, + "learning_rate": 9.545799756791467e-05, + "loss": 0.8262, + "step": 42820 + }, + { + "epoch": 0.27362866233085875, + "grad_norm": 0.8619642853736877, + "learning_rate": 9.545590773867325e-05, + "loss": 0.9742, + "step": 42830 + }, + { + "epoch": 0.27369254948059746, + "grad_norm": 0.8517597317695618, + "learning_rate": 9.545381745165154e-05, + "loss": 0.9214, + "step": 42840 + }, + { + "epoch": 0.27375643663033616, + "grad_norm": 0.5588153600692749, + "learning_rate": 9.545172670687053e-05, + "loss": 0.7431, + "step": 42850 + }, + { + "epoch": 0.27382032378007487, + "grad_norm": 0.8848083019256592, + "learning_rate": 9.544963550435133e-05, + "loss": 0.7999, + "step": 42860 + }, + { + "epoch": 0.2738842109298136, + "grad_norm": 0.7159634232521057, + "learning_rate": 9.544754384411499e-05, + "loss": 0.9701, + "step": 42870 + }, + { + "epoch": 0.2739480980795523, + "grad_norm": 0.8267273306846619, + "learning_rate": 9.544545172618255e-05, + "loss": 0.8219, + "step": 42880 + }, + { + "epoch": 0.274011985229291, + "grad_norm": 1.47396981716156, + "learning_rate": 9.54433591505751e-05, + "loss": 1.1159, + "step": 42890 + }, + { + "epoch": 0.2740758723790297, + "grad_norm": 0.6588977575302124, + "learning_rate": 9.54412661173137e-05, + "loss": 0.9557, + "step": 42900 + }, + { + "epoch": 0.2741397595287684, + "grad_norm": 0.9745913743972778, + "learning_rate": 9.543917262641944e-05, + "loss": 0.9112, + "step": 42910 + }, + { + "epoch": 0.2742036466785071, + "grad_norm": 0.8897466063499451, + "learning_rate": 9.543707867791342e-05, + "loss": 0.9167, + "step": 42920 + }, + { + "epoch": 0.2742675338282458, + "grad_norm": 0.7316814064979553, + "learning_rate": 9.543498427181669e-05, + "loss": 0.8055, + "step": 42930 + }, + { + "epoch": 0.27433142097798446, + "grad_norm": 1.263555645942688, + "learning_rate": 9.543288940815036e-05, + "loss": 0.9486, + "step": 42940 + }, + { + "epoch": 0.27439530812772317, + "grad_norm": 0.6425541639328003, + 
"learning_rate": 9.543079408693554e-05, + "loss": 0.7572, + "step": 42950 + }, + { + "epoch": 0.2744591952774619, + "grad_norm": 0.5203328728675842, + "learning_rate": 9.542869830819332e-05, + "loss": 0.7523, + "step": 42960 + }, + { + "epoch": 0.2745230824272006, + "grad_norm": 0.5347851514816284, + "learning_rate": 9.542660207194481e-05, + "loss": 0.8578, + "step": 42970 + }, + { + "epoch": 0.2745869695769393, + "grad_norm": 0.863381564617157, + "learning_rate": 9.542450537821111e-05, + "loss": 0.87, + "step": 42980 + }, + { + "epoch": 0.274650856726678, + "grad_norm": 0.608485996723175, + "learning_rate": 9.542240822701333e-05, + "loss": 1.1265, + "step": 42990 + }, + { + "epoch": 0.2747147438764167, + "grad_norm": 0.8107671737670898, + "learning_rate": 9.542031061837262e-05, + "loss": 1.0411, + "step": 43000 + }, + { + "epoch": 0.2747786310261554, + "grad_norm": 0.8316226601600647, + "learning_rate": 9.541821255231009e-05, + "loss": 0.7935, + "step": 43010 + }, + { + "epoch": 0.2748425181758941, + "grad_norm": 0.741036593914032, + "learning_rate": 9.541611402884685e-05, + "loss": 0.9842, + "step": 43020 + }, + { + "epoch": 0.2749064053256328, + "grad_norm": 1.501214861869812, + "learning_rate": 9.541401504800407e-05, + "loss": 1.4551, + "step": 43030 + }, + { + "epoch": 0.2749702924753715, + "grad_norm": 0.6897192001342773, + "learning_rate": 9.541191560980287e-05, + "loss": 0.7029, + "step": 43040 + }, + { + "epoch": 0.27503417962511023, + "grad_norm": 0.555748701095581, + "learning_rate": 9.540981571426437e-05, + "loss": 0.8156, + "step": 43050 + }, + { + "epoch": 0.2750980667748489, + "grad_norm": 0.5355345010757446, + "learning_rate": 9.540771536140976e-05, + "loss": 1.1097, + "step": 43060 + }, + { + "epoch": 0.2751619539245876, + "grad_norm": 0.839185357093811, + "learning_rate": 9.540561455126018e-05, + "loss": 0.7823, + "step": 43070 + }, + { + "epoch": 0.2752258410743263, + "grad_norm": 1.0240132808685303, + "learning_rate": 9.540351328383676e-05, + 
"loss": 0.825, + "step": 43080 + }, + { + "epoch": 0.275289728224065, + "grad_norm": 0.6717436909675598, + "learning_rate": 9.54014115591607e-05, + "loss": 0.8841, + "step": 43090 + }, + { + "epoch": 0.2753536153738037, + "grad_norm": 0.7767571806907654, + "learning_rate": 9.539930937725313e-05, + "loss": 0.8338, + "step": 43100 + }, + { + "epoch": 0.2754175025235424, + "grad_norm": 0.9243952035903931, + "learning_rate": 9.539720673813526e-05, + "loss": 0.8565, + "step": 43110 + }, + { + "epoch": 0.2754813896732811, + "grad_norm": 0.7475656270980835, + "learning_rate": 9.539510364182822e-05, + "loss": 0.8373, + "step": 43120 + }, + { + "epoch": 0.2755452768230198, + "grad_norm": 0.7929537296295166, + "learning_rate": 9.539300008835323e-05, + "loss": 1.1913, + "step": 43130 + }, + { + "epoch": 0.27560916397275853, + "grad_norm": 0.9647955298423767, + "learning_rate": 9.539089607773145e-05, + "loss": 0.8969, + "step": 43140 + }, + { + "epoch": 0.27567305112249724, + "grad_norm": 0.7929497957229614, + "learning_rate": 9.538879160998408e-05, + "loss": 0.8129, + "step": 43150 + }, + { + "epoch": 0.27573693827223594, + "grad_norm": 0.7888079881668091, + "learning_rate": 9.538668668513232e-05, + "loss": 1.0804, + "step": 43160 + }, + { + "epoch": 0.27580082542197465, + "grad_norm": 1.1037369966506958, + "learning_rate": 9.538458130319736e-05, + "loss": 0.7396, + "step": 43170 + }, + { + "epoch": 0.2758647125717133, + "grad_norm": 1.0101234912872314, + "learning_rate": 9.538247546420038e-05, + "loss": 0.899, + "step": 43180 + }, + { + "epoch": 0.275928599721452, + "grad_norm": 1.06256902217865, + "learning_rate": 9.538036916816264e-05, + "loss": 0.8693, + "step": 43190 + }, + { + "epoch": 0.2759924868711907, + "grad_norm": 1.1790313720703125, + "learning_rate": 9.53782624151053e-05, + "loss": 0.8821, + "step": 43200 + }, + { + "epoch": 0.2760563740209294, + "grad_norm": 1.0005087852478027, + "learning_rate": 9.537615520504961e-05, + "loss": 0.8263, + "step": 43210 + }, + { 
+ "epoch": 0.2761202611706681, + "grad_norm": 0.965392529964447, + "learning_rate": 9.537404753801679e-05, + "loss": 0.7032, + "step": 43220 + }, + { + "epoch": 0.27618414832040683, + "grad_norm": 1.0671719312667847, + "learning_rate": 9.537193941402805e-05, + "loss": 0.6795, + "step": 43230 + }, + { + "epoch": 0.27624803547014554, + "grad_norm": 0.8210242390632629, + "learning_rate": 9.536983083310463e-05, + "loss": 0.8035, + "step": 43240 + }, + { + "epoch": 0.27631192261988424, + "grad_norm": 1.3356382846832275, + "learning_rate": 9.536772179526774e-05, + "loss": 0.8635, + "step": 43250 + }, + { + "epoch": 0.27637580976962295, + "grad_norm": 0.6056402325630188, + "learning_rate": 9.536561230053866e-05, + "loss": 1.1843, + "step": 43260 + }, + { + "epoch": 0.27643969691936165, + "grad_norm": 0.674022912979126, + "learning_rate": 9.536350234893863e-05, + "loss": 1.2542, + "step": 43270 + }, + { + "epoch": 0.27650358406910036, + "grad_norm": 1.186140537261963, + "learning_rate": 9.536139194048888e-05, + "loss": 0.884, + "step": 43280 + }, + { + "epoch": 0.27656747121883907, + "grad_norm": 0.8582965731620789, + "learning_rate": 9.535928107521067e-05, + "loss": 0.7672, + "step": 43290 + }, + { + "epoch": 0.2766313583685777, + "grad_norm": 0.8430611491203308, + "learning_rate": 9.535716975312524e-05, + "loss": 0.8817, + "step": 43300 + }, + { + "epoch": 0.2766952455183164, + "grad_norm": 0.9493967294692993, + "learning_rate": 9.535505797425388e-05, + "loss": 0.9554, + "step": 43310 + }, + { + "epoch": 0.27675913266805513, + "grad_norm": 0.9920264482498169, + "learning_rate": 9.535294573861786e-05, + "loss": 0.9554, + "step": 43320 + }, + { + "epoch": 0.27682301981779384, + "grad_norm": 0.8129369616508484, + "learning_rate": 9.535083304623844e-05, + "loss": 0.839, + "step": 43330 + }, + { + "epoch": 0.27688690696753254, + "grad_norm": 1.0524061918258667, + "learning_rate": 9.534871989713688e-05, + "loss": 0.8456, + "step": 43340 + }, + { + "epoch": 0.27695079411727125, 
+ "grad_norm": 0.5314822793006897, + "learning_rate": 9.53466062913345e-05, + "loss": 0.8378, + "step": 43350 + }, + { + "epoch": 0.27701468126700995, + "grad_norm": 0.574448823928833, + "learning_rate": 9.534449222885254e-05, + "loss": 0.8479, + "step": 43360 + }, + { + "epoch": 0.27707856841674866, + "grad_norm": 0.622386634349823, + "learning_rate": 9.534237770971233e-05, + "loss": 1.2532, + "step": 43370 + }, + { + "epoch": 0.27714245556648737, + "grad_norm": 0.7160522937774658, + "learning_rate": 9.534026273393515e-05, + "loss": 0.8913, + "step": 43380 + }, + { + "epoch": 0.2772063427162261, + "grad_norm": 0.8195508122444153, + "learning_rate": 9.533814730154229e-05, + "loss": 0.8407, + "step": 43390 + }, + { + "epoch": 0.2772702298659648, + "grad_norm": 2.375558614730835, + "learning_rate": 9.533603141255508e-05, + "loss": 0.8774, + "step": 43400 + }, + { + "epoch": 0.2773341170157035, + "grad_norm": 1.008970022201538, + "learning_rate": 9.533391506699481e-05, + "loss": 0.8729, + "step": 43410 + }, + { + "epoch": 0.27739800416544214, + "grad_norm": 1.062031865119934, + "learning_rate": 9.533179826488278e-05, + "loss": 0.7358, + "step": 43420 + }, + { + "epoch": 0.27746189131518084, + "grad_norm": 1.110970139503479, + "learning_rate": 9.532968100624034e-05, + "loss": 0.9176, + "step": 43430 + }, + { + "epoch": 0.27752577846491955, + "grad_norm": 0.7017518877983093, + "learning_rate": 9.532756329108879e-05, + "loss": 0.7531, + "step": 43440 + }, + { + "epoch": 0.27758966561465825, + "grad_norm": 0.7108795046806335, + "learning_rate": 9.532544511944945e-05, + "loss": 1.2516, + "step": 43450 + }, + { + "epoch": 0.27765355276439696, + "grad_norm": 1.1942083835601807, + "learning_rate": 9.532332649134368e-05, + "loss": 0.8122, + "step": 43460 + }, + { + "epoch": 0.27771743991413567, + "grad_norm": 1.6900832653045654, + "learning_rate": 9.53212074067928e-05, + "loss": 0.9352, + "step": 43470 + }, + { + "epoch": 0.2777813270638744, + "grad_norm": 0.6773011088371277, 
+ "learning_rate": 9.531908786581816e-05, + "loss": 0.7606, + "step": 43480 + }, + { + "epoch": 0.2778452142136131, + "grad_norm": 0.9404903054237366, + "learning_rate": 9.53169678684411e-05, + "loss": 0.8783, + "step": 43490 + }, + { + "epoch": 0.2779091013633518, + "grad_norm": 0.9946535229682922, + "learning_rate": 9.531484741468296e-05, + "loss": 0.8917, + "step": 43500 + }, + { + "epoch": 0.2779729885130905, + "grad_norm": 0.9458416104316711, + "learning_rate": 9.531272650456508e-05, + "loss": 1.1044, + "step": 43510 + }, + { + "epoch": 0.2780368756628292, + "grad_norm": 0.5456779599189758, + "learning_rate": 9.531060513810887e-05, + "loss": 0.7003, + "step": 43520 + }, + { + "epoch": 0.2781007628125679, + "grad_norm": 0.7244671583175659, + "learning_rate": 9.530848331533569e-05, + "loss": 0.8803, + "step": 43530 + }, + { + "epoch": 0.2781646499623066, + "grad_norm": 0.7101406455039978, + "learning_rate": 9.530636103626684e-05, + "loss": 0.9651, + "step": 43540 + }, + { + "epoch": 0.27822853711204526, + "grad_norm": 0.8346617221832275, + "learning_rate": 9.530423830092376e-05, + "loss": 1.1236, + "step": 43550 + }, + { + "epoch": 0.27829242426178397, + "grad_norm": 1.0931973457336426, + "learning_rate": 9.530211510932781e-05, + "loss": 1.1875, + "step": 43560 + }, + { + "epoch": 0.27835631141152267, + "grad_norm": 0.7076020240783691, + "learning_rate": 9.529999146150037e-05, + "loss": 0.9529, + "step": 43570 + }, + { + "epoch": 0.2784201985612614, + "grad_norm": 0.8255470991134644, + "learning_rate": 9.529786735746281e-05, + "loss": 0.9156, + "step": 43580 + }, + { + "epoch": 0.2784840857110001, + "grad_norm": 1.8320345878601074, + "learning_rate": 9.529574279723655e-05, + "loss": 1.1068, + "step": 43590 + }, + { + "epoch": 0.2785479728607388, + "grad_norm": 0.8600050806999207, + "learning_rate": 9.529361778084297e-05, + "loss": 0.7782, + "step": 43600 + }, + { + "epoch": 0.2786118600104775, + "grad_norm": 0.5279654860496521, + "learning_rate": 
9.529149230830348e-05, + "loss": 0.6292, + "step": 43610 + }, + { + "epoch": 0.2786757471602162, + "grad_norm": 1.0602446794509888, + "learning_rate": 9.528936637963948e-05, + "loss": 0.8949, + "step": 43620 + }, + { + "epoch": 0.2787396343099549, + "grad_norm": 0.687105655670166, + "learning_rate": 9.528723999487236e-05, + "loss": 0.9358, + "step": 43630 + }, + { + "epoch": 0.2788035214596936, + "grad_norm": 0.6915990114212036, + "learning_rate": 9.528511315402358e-05, + "loss": 1.011, + "step": 43640 + }, + { + "epoch": 0.2788674086094323, + "grad_norm": 1.5329688787460327, + "learning_rate": 9.528298585711453e-05, + "loss": 0.8036, + "step": 43650 + }, + { + "epoch": 0.278931295759171, + "grad_norm": 0.8263123035430908, + "learning_rate": 9.528085810416663e-05, + "loss": 0.8864, + "step": 43660 + }, + { + "epoch": 0.2789951829089097, + "grad_norm": 1.231269121170044, + "learning_rate": 9.52787298952013e-05, + "loss": 0.8268, + "step": 43670 + }, + { + "epoch": 0.2790590700586484, + "grad_norm": 0.7919381260871887, + "learning_rate": 9.527660123024e-05, + "loss": 1.0224, + "step": 43680 + }, + { + "epoch": 0.2791229572083871, + "grad_norm": 0.850471556186676, + "learning_rate": 9.527447210930417e-05, + "loss": 0.7467, + "step": 43690 + }, + { + "epoch": 0.2791868443581258, + "grad_norm": 1.2980278730392456, + "learning_rate": 9.527234253241522e-05, + "loss": 0.7765, + "step": 43700 + }, + { + "epoch": 0.2792507315078645, + "grad_norm": 1.044519066810608, + "learning_rate": 9.527021249959462e-05, + "loss": 1.0149, + "step": 43710 + }, + { + "epoch": 0.2793146186576032, + "grad_norm": 0.9248097538948059, + "learning_rate": 9.526808201086382e-05, + "loss": 1.0461, + "step": 43720 + }, + { + "epoch": 0.2793785058073419, + "grad_norm": 0.7465457320213318, + "learning_rate": 9.526595106624428e-05, + "loss": 0.8779, + "step": 43730 + }, + { + "epoch": 0.2794423929570806, + "grad_norm": 1.1304700374603271, + "learning_rate": 9.526381966575744e-05, + "loss": 0.9451, + 
"step": 43740 + }, + { + "epoch": 0.2795062801068193, + "grad_norm": 0.8045159578323364, + "learning_rate": 9.526168780942477e-05, + "loss": 0.851, + "step": 43750 + }, + { + "epoch": 0.27957016725655803, + "grad_norm": 0.8568775057792664, + "learning_rate": 9.525955549726776e-05, + "loss": 1.2149, + "step": 43760 + }, + { + "epoch": 0.27963405440629674, + "grad_norm": 0.8600241541862488, + "learning_rate": 9.525742272930787e-05, + "loss": 1.2311, + "step": 43770 + }, + { + "epoch": 0.27969794155603545, + "grad_norm": 0.9878765940666199, + "learning_rate": 9.525528950556657e-05, + "loss": 0.7023, + "step": 43780 + }, + { + "epoch": 0.2797618287057741, + "grad_norm": 1.1209038496017456, + "learning_rate": 9.525315582606537e-05, + "loss": 0.7451, + "step": 43790 + }, + { + "epoch": 0.2798257158555128, + "grad_norm": 0.8326137065887451, + "learning_rate": 9.525102169082573e-05, + "loss": 1.139, + "step": 43800 + }, + { + "epoch": 0.2798896030052515, + "grad_norm": 0.7747043371200562, + "learning_rate": 9.524888709986914e-05, + "loss": 1.1918, + "step": 43810 + }, + { + "epoch": 0.2799534901549902, + "grad_norm": 0.7023658752441406, + "learning_rate": 9.524675205321713e-05, + "loss": 1.1406, + "step": 43820 + }, + { + "epoch": 0.2800173773047289, + "grad_norm": 0.9086745977401733, + "learning_rate": 9.524461655089119e-05, + "loss": 1.0247, + "step": 43830 + }, + { + "epoch": 0.2800812644544676, + "grad_norm": 0.850036084651947, + "learning_rate": 9.52424805929128e-05, + "loss": 0.9891, + "step": 43840 + }, + { + "epoch": 0.28014515160420633, + "grad_norm": 0.7269537448883057, + "learning_rate": 9.52403441793035e-05, + "loss": 0.917, + "step": 43850 + }, + { + "epoch": 0.28020903875394504, + "grad_norm": 0.6691316962242126, + "learning_rate": 9.523820731008479e-05, + "loss": 0.8787, + "step": 43860 + }, + { + "epoch": 0.28027292590368375, + "grad_norm": 0.9154207110404968, + "learning_rate": 9.52360699852782e-05, + "loss": 1.1603, + "step": 43870 + }, + { + "epoch": 
0.28033681305342245, + "grad_norm": 0.8188737034797668, + "learning_rate": 9.523393220490526e-05, + "loss": 1.0814, + "step": 43880 + }, + { + "epoch": 0.28040070020316116, + "grad_norm": 0.873671293258667, + "learning_rate": 9.523179396898748e-05, + "loss": 0.8897, + "step": 43890 + }, + { + "epoch": 0.28046458735289986, + "grad_norm": 1.1229972839355469, + "learning_rate": 9.52296552775464e-05, + "loss": 0.9826, + "step": 43900 + }, + { + "epoch": 0.2805284745026385, + "grad_norm": 0.664357602596283, + "learning_rate": 9.522751613060356e-05, + "loss": 0.8484, + "step": 43910 + }, + { + "epoch": 0.2805923616523772, + "grad_norm": 0.6779937148094177, + "learning_rate": 9.522537652818051e-05, + "loss": 0.8464, + "step": 43920 + }, + { + "epoch": 0.2806562488021159, + "grad_norm": 0.7764692306518555, + "learning_rate": 9.522323647029879e-05, + "loss": 1.005, + "step": 43930 + }, + { + "epoch": 0.28072013595185463, + "grad_norm": 0.6373327970504761, + "learning_rate": 9.522109595697997e-05, + "loss": 0.7112, + "step": 43940 + }, + { + "epoch": 0.28078402310159334, + "grad_norm": 1.3876614570617676, + "learning_rate": 9.521895498824558e-05, + "loss": 0.7226, + "step": 43950 + }, + { + "epoch": 0.28084791025133204, + "grad_norm": 1.1054093837738037, + "learning_rate": 9.521681356411718e-05, + "loss": 0.8765, + "step": 43960 + }, + { + "epoch": 0.28091179740107075, + "grad_norm": 0.7157889008522034, + "learning_rate": 9.521467168461637e-05, + "loss": 0.9107, + "step": 43970 + }, + { + "epoch": 0.28097568455080946, + "grad_norm": 0.7243021726608276, + "learning_rate": 9.521252934976469e-05, + "loss": 0.6519, + "step": 43980 + }, + { + "epoch": 0.28103957170054816, + "grad_norm": 1.0799028873443604, + "learning_rate": 9.521038655958373e-05, + "loss": 0.9358, + "step": 43990 + }, + { + "epoch": 0.28110345885028687, + "grad_norm": 0.8918923139572144, + "learning_rate": 9.520824331409506e-05, + "loss": 1.0831, + "step": 44000 + }, + { + "epoch": 0.2811673460000256, + 
"grad_norm": 0.6121041774749756, + "learning_rate": 9.520609961332027e-05, + "loss": 0.9407, + "step": 44010 + }, + { + "epoch": 0.2812312331497643, + "grad_norm": 0.8242114186286926, + "learning_rate": 9.520395545728096e-05, + "loss": 0.6712, + "step": 44020 + }, + { + "epoch": 0.28129512029950293, + "grad_norm": 0.8655091524124146, + "learning_rate": 9.52018108459987e-05, + "loss": 0.8633, + "step": 44030 + }, + { + "epoch": 0.28135900744924164, + "grad_norm": 0.7352428436279297, + "learning_rate": 9.51996657794951e-05, + "loss": 1.0033, + "step": 44040 + }, + { + "epoch": 0.28142289459898034, + "grad_norm": 1.2268953323364258, + "learning_rate": 9.519752025779177e-05, + "loss": 1.0435, + "step": 44050 + }, + { + "epoch": 0.28148678174871905, + "grad_norm": 0.766212522983551, + "learning_rate": 9.51953742809103e-05, + "loss": 0.8992, + "step": 44060 + }, + { + "epoch": 0.28155066889845776, + "grad_norm": 0.9193620681762695, + "learning_rate": 9.51932278488723e-05, + "loss": 0.9638, + "step": 44070 + }, + { + "epoch": 0.28161455604819646, + "grad_norm": 0.7695852518081665, + "learning_rate": 9.519108096169943e-05, + "loss": 0.875, + "step": 44080 + }, + { + "epoch": 0.28167844319793517, + "grad_norm": 0.7172717452049255, + "learning_rate": 9.518893361941326e-05, + "loss": 1.106, + "step": 44090 + }, + { + "epoch": 0.2817423303476739, + "grad_norm": 0.7750954031944275, + "learning_rate": 9.518678582203542e-05, + "loss": 1.0095, + "step": 44100 + }, + { + "epoch": 0.2818062174974126, + "grad_norm": 1.0420329570770264, + "learning_rate": 9.518463756958758e-05, + "loss": 0.6998, + "step": 44110 + }, + { + "epoch": 0.2818701046471513, + "grad_norm": 1.6733392477035522, + "learning_rate": 9.518248886209134e-05, + "loss": 0.733, + "step": 44120 + }, + { + "epoch": 0.28193399179689, + "grad_norm": 0.9372019171714783, + "learning_rate": 9.518033969956834e-05, + "loss": 0.8278, + "step": 44130 + }, + { + "epoch": 0.2819978789466287, + "grad_norm": 0.6449085474014282, + 
"learning_rate": 9.517819008204025e-05, + "loss": 0.8307, + "step": 44140 + }, + { + "epoch": 0.28206176609636735, + "grad_norm": 1.0656044483184814, + "learning_rate": 9.517604000952869e-05, + "loss": 0.7596, + "step": 44150 + }, + { + "epoch": 0.28212565324610606, + "grad_norm": 1.2960087060928345, + "learning_rate": 9.517388948205532e-05, + "loss": 0.7277, + "step": 44160 + }, + { + "epoch": 0.28218954039584476, + "grad_norm": 0.7332021594047546, + "learning_rate": 9.517173849964181e-05, + "loss": 0.9438, + "step": 44170 + }, + { + "epoch": 0.28225342754558347, + "grad_norm": 1.0289632081985474, + "learning_rate": 9.516958706230981e-05, + "loss": 1.1855, + "step": 44180 + }, + { + "epoch": 0.2823173146953222, + "grad_norm": 0.628814697265625, + "learning_rate": 9.516743517008099e-05, + "loss": 1.0023, + "step": 44190 + }, + { + "epoch": 0.2823812018450609, + "grad_norm": 0.8182355165481567, + "learning_rate": 9.516528282297703e-05, + "loss": 0.924, + "step": 44200 + }, + { + "epoch": 0.2824450889947996, + "grad_norm": 0.7950728535652161, + "learning_rate": 9.51631300210196e-05, + "loss": 0.6805, + "step": 44210 + }, + { + "epoch": 0.2825089761445383, + "grad_norm": 0.902574360370636, + "learning_rate": 9.516097676423037e-05, + "loss": 0.9775, + "step": 44220 + }, + { + "epoch": 0.282572863294277, + "grad_norm": 0.8031740784645081, + "learning_rate": 9.515882305263104e-05, + "loss": 1.0566, + "step": 44230 + }, + { + "epoch": 0.2826367504440157, + "grad_norm": 0.8170803189277649, + "learning_rate": 9.515666888624329e-05, + "loss": 0.9942, + "step": 44240 + }, + { + "epoch": 0.2827006375937544, + "grad_norm": 0.6290708184242249, + "learning_rate": 9.515451426508882e-05, + "loss": 0.7682, + "step": 44250 + }, + { + "epoch": 0.2827645247434931, + "grad_norm": 0.9795184135437012, + "learning_rate": 9.515235918918932e-05, + "loss": 0.7583, + "step": 44260 + }, + { + "epoch": 0.28282841189323177, + "grad_norm": 0.7491911053657532, + "learning_rate": 
9.515020365856651e-05, + "loss": 0.9229, + "step": 44270 + }, + { + "epoch": 0.2828922990429705, + "grad_norm": 1.1259350776672363, + "learning_rate": 9.51480476732421e-05, + "loss": 1.0074, + "step": 44280 + }, + { + "epoch": 0.2829561861927092, + "grad_norm": 1.4809681177139282, + "learning_rate": 9.514589123323777e-05, + "loss": 0.7639, + "step": 44290 + }, + { + "epoch": 0.2830200733424479, + "grad_norm": 1.034775733947754, + "learning_rate": 9.514373433857527e-05, + "loss": 0.9323, + "step": 44300 + }, + { + "epoch": 0.2830839604921866, + "grad_norm": 0.6214499473571777, + "learning_rate": 9.51415769892763e-05, + "loss": 0.7838, + "step": 44310 + }, + { + "epoch": 0.2831478476419253, + "grad_norm": 3.2163054943084717, + "learning_rate": 9.51394191853626e-05, + "loss": 1.0207, + "step": 44320 + }, + { + "epoch": 0.283211734791664, + "grad_norm": 0.6460834741592407, + "learning_rate": 9.513726092685591e-05, + "loss": 1.1491, + "step": 44330 + }, + { + "epoch": 0.2832756219414027, + "grad_norm": 0.7531580924987793, + "learning_rate": 9.513510221377793e-05, + "loss": 0.7412, + "step": 44340 + }, + { + "epoch": 0.2833395090911414, + "grad_norm": 1.0614440441131592, + "learning_rate": 9.513294304615044e-05, + "loss": 0.8966, + "step": 44350 + }, + { + "epoch": 0.2834033962408801, + "grad_norm": 0.6449925303459167, + "learning_rate": 9.513078342399517e-05, + "loss": 0.9988, + "step": 44360 + }, + { + "epoch": 0.28346728339061883, + "grad_norm": 0.7040312886238098, + "learning_rate": 9.512862334733386e-05, + "loss": 0.8929, + "step": 44370 + }, + { + "epoch": 0.28353117054035754, + "grad_norm": 1.03850257396698, + "learning_rate": 9.512646281618828e-05, + "loss": 0.5284, + "step": 44380 + }, + { + "epoch": 0.28359505769009624, + "grad_norm": 1.0077382326126099, + "learning_rate": 9.512430183058016e-05, + "loss": 0.8976, + "step": 44390 + }, + { + "epoch": 0.2836589448398349, + "grad_norm": 1.8333910703659058, + "learning_rate": 9.512214039053131e-05, + "loss": 0.9276, 
+ "step": 44400 + }, + { + "epoch": 0.2837228319895736, + "grad_norm": 1.7474950551986694, + "learning_rate": 9.511997849606344e-05, + "loss": 0.9906, + "step": 44410 + }, + { + "epoch": 0.2837867191393123, + "grad_norm": 0.6216913461685181, + "learning_rate": 9.511781614719838e-05, + "loss": 0.8697, + "step": 44420 + }, + { + "epoch": 0.283850606289051, + "grad_norm": 0.8137566447257996, + "learning_rate": 9.511565334395786e-05, + "loss": 1.1427, + "step": 44430 + }, + { + "epoch": 0.2839144934387897, + "grad_norm": 0.8825230598449707, + "learning_rate": 9.51134900863637e-05, + "loss": 0.8255, + "step": 44440 + }, + { + "epoch": 0.2839783805885284, + "grad_norm": 1.51393461227417, + "learning_rate": 9.511132637443765e-05, + "loss": 0.9725, + "step": 44450 + }, + { + "epoch": 0.28404226773826713, + "grad_norm": 1.1853009462356567, + "learning_rate": 9.510916220820152e-05, + "loss": 0.9037, + "step": 44460 + }, + { + "epoch": 0.28410615488800584, + "grad_norm": 1.493323802947998, + "learning_rate": 9.510699758767709e-05, + "loss": 0.9787, + "step": 44470 + }, + { + "epoch": 0.28417004203774454, + "grad_norm": 0.8198840022087097, + "learning_rate": 9.510483251288619e-05, + "loss": 0.8874, + "step": 44480 + }, + { + "epoch": 0.28423392918748325, + "grad_norm": 0.7507383227348328, + "learning_rate": 9.51026669838506e-05, + "loss": 1.0607, + "step": 44490 + }, + { + "epoch": 0.28429781633722195, + "grad_norm": 0.698621928691864, + "learning_rate": 9.510050100059214e-05, + "loss": 0.9481, + "step": 44500 + }, + { + "epoch": 0.28436170348696066, + "grad_norm": 1.1146284341812134, + "learning_rate": 9.50983345631326e-05, + "loss": 1.0293, + "step": 44510 + }, + { + "epoch": 0.2844255906366993, + "grad_norm": 0.7054140567779541, + "learning_rate": 9.509616767149383e-05, + "loss": 0.8634, + "step": 44520 + }, + { + "epoch": 0.284489477786438, + "grad_norm": 0.7089869976043701, + "learning_rate": 9.509400032569763e-05, + "loss": 1.0544, + "step": 44530 + }, + { + "epoch": 
0.2845533649361767, + "grad_norm": 0.817032516002655, + "learning_rate": 9.509183252576583e-05, + "loss": 0.8768, + "step": 44540 + }, + { + "epoch": 0.28461725208591543, + "grad_norm": 0.9883630275726318, + "learning_rate": 9.508966427172028e-05, + "loss": 0.8952, + "step": 44550 + }, + { + "epoch": 0.28468113923565413, + "grad_norm": 0.8523919582366943, + "learning_rate": 9.50874955635828e-05, + "loss": 0.9583, + "step": 44560 + }, + { + "epoch": 0.28474502638539284, + "grad_norm": 0.5503199696540833, + "learning_rate": 9.508532640137522e-05, + "loss": 0.986, + "step": 44570 + }, + { + "epoch": 0.28480891353513155, + "grad_norm": 0.6643738150596619, + "learning_rate": 9.50831567851194e-05, + "loss": 0.9774, + "step": 44580 + }, + { + "epoch": 0.28487280068487025, + "grad_norm": 0.8332018256187439, + "learning_rate": 9.50809867148372e-05, + "loss": 0.8159, + "step": 44590 + }, + { + "epoch": 0.28493668783460896, + "grad_norm": 0.8186637759208679, + "learning_rate": 9.507881619055046e-05, + "loss": 0.8421, + "step": 44600 + }, + { + "epoch": 0.28500057498434767, + "grad_norm": 0.5218867659568787, + "learning_rate": 9.507664521228106e-05, + "loss": 0.6236, + "step": 44610 + }, + { + "epoch": 0.28506446213408637, + "grad_norm": 0.8833245038986206, + "learning_rate": 9.507447378005083e-05, + "loss": 0.7893, + "step": 44620 + }, + { + "epoch": 0.2851283492838251, + "grad_norm": 0.9100516438484192, + "learning_rate": 9.507230189388164e-05, + "loss": 0.8338, + "step": 44630 + }, + { + "epoch": 0.28519223643356373, + "grad_norm": 1.299561858177185, + "learning_rate": 9.50701295537954e-05, + "loss": 0.7506, + "step": 44640 + }, + { + "epoch": 0.28525612358330243, + "grad_norm": 0.8499222993850708, + "learning_rate": 9.506795675981394e-05, + "loss": 0.8838, + "step": 44650 + }, + { + "epoch": 0.28532001073304114, + "grad_norm": 0.7215855121612549, + "learning_rate": 9.506578351195918e-05, + "loss": 1.1284, + "step": 44660 + }, + { + "epoch": 0.28538389788277985, + 
"grad_norm": 0.9105709791183472, + "learning_rate": 9.5063609810253e-05, + "loss": 0.7879, + "step": 44670 + }, + { + "epoch": 0.28544778503251855, + "grad_norm": 1.2917571067810059, + "learning_rate": 9.506165309069255e-05, + "loss": 1.0344, + "step": 44680 + }, + { + "epoch": 0.28551167218225726, + "grad_norm": 0.7351425886154175, + "learning_rate": 9.505947852672896e-05, + "loss": 0.9027, + "step": 44690 + }, + { + "epoch": 0.28557555933199597, + "grad_norm": 1.4496943950653076, + "learning_rate": 9.505730350897745e-05, + "loss": 0.7425, + "step": 44700 + }, + { + "epoch": 0.28563944648173467, + "grad_norm": 0.9381955862045288, + "learning_rate": 9.505512803745991e-05, + "loss": 1.0261, + "step": 44710 + }, + { + "epoch": 0.2857033336314734, + "grad_norm": 1.0335243940353394, + "learning_rate": 9.505295211219824e-05, + "loss": 1.0252, + "step": 44720 + }, + { + "epoch": 0.2857672207812121, + "grad_norm": 0.9310586452484131, + "learning_rate": 9.505077573321438e-05, + "loss": 0.947, + "step": 44730 + }, + { + "epoch": 0.2858311079309508, + "grad_norm": 1.818731665611267, + "learning_rate": 9.504859890053023e-05, + "loss": 0.9851, + "step": 44740 + }, + { + "epoch": 0.2858949950806895, + "grad_norm": 2.0556581020355225, + "learning_rate": 9.504642161416773e-05, + "loss": 0.7945, + "step": 44750 + }, + { + "epoch": 0.28595888223042815, + "grad_norm": 0.9168753027915955, + "learning_rate": 9.504424387414876e-05, + "loss": 0.7114, + "step": 44760 + }, + { + "epoch": 0.28602276938016685, + "grad_norm": 0.7781484723091125, + "learning_rate": 9.504206568049532e-05, + "loss": 0.8891, + "step": 44770 + }, + { + "epoch": 0.28608665652990556, + "grad_norm": 2.3256382942199707, + "learning_rate": 9.503988703322928e-05, + "loss": 0.8655, + "step": 44780 + }, + { + "epoch": 0.28615054367964426, + "grad_norm": 0.826259970664978, + "learning_rate": 9.503770793237263e-05, + "loss": 0.9931, + "step": 44790 + }, + { + "epoch": 0.28621443082938297, + "grad_norm": 1.0867620706558228, 
+ "learning_rate": 9.50355283779473e-05, + "loss": 0.9931, + "step": 44800 + }, + { + "epoch": 0.2862783179791217, + "grad_norm": 0.6833561658859253, + "learning_rate": 9.503334836997524e-05, + "loss": 0.7724, + "step": 44810 + }, + { + "epoch": 0.2863422051288604, + "grad_norm": 0.8544519543647766, + "learning_rate": 9.503116790847839e-05, + "loss": 0.8207, + "step": 44820 + }, + { + "epoch": 0.2864060922785991, + "grad_norm": 0.5061067342758179, + "learning_rate": 9.502898699347873e-05, + "loss": 0.8357, + "step": 44830 + }, + { + "epoch": 0.2864699794283378, + "grad_norm": 0.91792231798172, + "learning_rate": 9.502680562499821e-05, + "loss": 1.0274, + "step": 44840 + }, + { + "epoch": 0.2865338665780765, + "grad_norm": 0.8766928911209106, + "learning_rate": 9.502462380305881e-05, + "loss": 0.8878, + "step": 44850 + }, + { + "epoch": 0.2865977537278152, + "grad_norm": 0.5164894461631775, + "learning_rate": 9.50224415276825e-05, + "loss": 0.961, + "step": 44860 + }, + { + "epoch": 0.2866616408775539, + "grad_norm": 0.7407921552658081, + "learning_rate": 9.502025879889125e-05, + "loss": 0.8303, + "step": 44870 + }, + { + "epoch": 0.28672552802729256, + "grad_norm": 0.8378937244415283, + "learning_rate": 9.501807561670703e-05, + "loss": 0.831, + "step": 44880 + }, + { + "epoch": 0.28678941517703127, + "grad_norm": 2.5379931926727295, + "learning_rate": 9.501589198115186e-05, + "loss": 0.8583, + "step": 44890 + }, + { + "epoch": 0.28685330232677, + "grad_norm": 0.8692481517791748, + "learning_rate": 9.501370789224772e-05, + "loss": 0.8469, + "step": 44900 + }, + { + "epoch": 0.2869171894765087, + "grad_norm": 0.7590452432632446, + "learning_rate": 9.501152335001658e-05, + "loss": 0.7567, + "step": 44910 + }, + { + "epoch": 0.2869810766262474, + "grad_norm": 0.6347168684005737, + "learning_rate": 9.500933835448047e-05, + "loss": 0.898, + "step": 44920 + }, + { + "epoch": 0.2870449637759861, + "grad_norm": 0.975532054901123, + "learning_rate": 9.500715290566138e-05, + 
"loss": 1.1958, + "step": 44930 + }, + { + "epoch": 0.2871088509257248, + "grad_norm": 0.670184850692749, + "learning_rate": 9.500496700358132e-05, + "loss": 0.9185, + "step": 44940 + }, + { + "epoch": 0.2871727380754635, + "grad_norm": 0.5493016839027405, + "learning_rate": 9.500278064826232e-05, + "loss": 0.9177, + "step": 44950 + }, + { + "epoch": 0.2872366252252022, + "grad_norm": 1.360520839691162, + "learning_rate": 9.500059383972638e-05, + "loss": 0.9026, + "step": 44960 + }, + { + "epoch": 0.2873005123749409, + "grad_norm": 0.6873490214347839, + "learning_rate": 9.499840657799553e-05, + "loss": 0.7064, + "step": 44970 + }, + { + "epoch": 0.2873643995246796, + "grad_norm": 0.814471423625946, + "learning_rate": 9.49962188630918e-05, + "loss": 0.8996, + "step": 44980 + }, + { + "epoch": 0.28742828667441833, + "grad_norm": 0.7156900763511658, + "learning_rate": 9.49940306950372e-05, + "loss": 0.841, + "step": 44990 + }, + { + "epoch": 0.287492173824157, + "grad_norm": 0.6915486454963684, + "learning_rate": 9.499184207385381e-05, + "loss": 0.8996, + "step": 45000 + }, + { + "epoch": 0.2875560609738957, + "grad_norm": 1.0259060859680176, + "learning_rate": 9.498965299956364e-05, + "loss": 0.9954, + "step": 45010 + }, + { + "epoch": 0.2876199481236344, + "grad_norm": 0.5235810875892639, + "learning_rate": 9.498746347218873e-05, + "loss": 1.1643, + "step": 45020 + }, + { + "epoch": 0.2876838352733731, + "grad_norm": 0.7001626491546631, + "learning_rate": 9.498527349175115e-05, + "loss": 1.0269, + "step": 45030 + }, + { + "epoch": 0.2877477224231118, + "grad_norm": 1.0902423858642578, + "learning_rate": 9.498308305827294e-05, + "loss": 0.9768, + "step": 45040 + }, + { + "epoch": 0.2878116095728505, + "grad_norm": 0.6482483744621277, + "learning_rate": 9.49808921717762e-05, + "loss": 1.0191, + "step": 45050 + }, + { + "epoch": 0.2878754967225892, + "grad_norm": 1.0290073156356812, + "learning_rate": 9.497870083228292e-05, + "loss": 1.0096, + "step": 45060 + }, + { + 
"epoch": 0.2879393838723279, + "grad_norm": 0.8370404243469238, + "learning_rate": 9.497650903981524e-05, + "loss": 1.0161, + "step": 45070 + }, + { + "epoch": 0.28800327102206663, + "grad_norm": 0.8315509557723999, + "learning_rate": 9.497431679439519e-05, + "loss": 0.7909, + "step": 45080 + }, + { + "epoch": 0.28806715817180534, + "grad_norm": 0.7571452856063843, + "learning_rate": 9.497212409604487e-05, + "loss": 0.9372, + "step": 45090 + }, + { + "epoch": 0.28813104532154404, + "grad_norm": 0.9375543594360352, + "learning_rate": 9.496993094478634e-05, + "loss": 0.6588, + "step": 45100 + }, + { + "epoch": 0.28819493247128275, + "grad_norm": 0.8192710876464844, + "learning_rate": 9.496773734064171e-05, + "loss": 0.9545, + "step": 45110 + }, + { + "epoch": 0.2882588196210214, + "grad_norm": 0.8890470862388611, + "learning_rate": 9.496554328363307e-05, + "loss": 0.9824, + "step": 45120 + }, + { + "epoch": 0.2883227067707601, + "grad_norm": 0.8460478186607361, + "learning_rate": 9.49633487737825e-05, + "loss": 0.66, + "step": 45130 + }, + { + "epoch": 0.2883865939204988, + "grad_norm": 1.1381182670593262, + "learning_rate": 9.496115381111211e-05, + "loss": 0.788, + "step": 45140 + }, + { + "epoch": 0.2884504810702375, + "grad_norm": 0.7069154977798462, + "learning_rate": 9.495895839564401e-05, + "loss": 0.7456, + "step": 45150 + }, + { + "epoch": 0.2885143682199762, + "grad_norm": 0.9826921820640564, + "learning_rate": 9.495676252740029e-05, + "loss": 1.0517, + "step": 45160 + }, + { + "epoch": 0.28857825536971493, + "grad_norm": 0.9633061289787292, + "learning_rate": 9.495456620640308e-05, + "loss": 1.0595, + "step": 45170 + }, + { + "epoch": 0.28864214251945364, + "grad_norm": 0.8172567486763, + "learning_rate": 9.495236943267451e-05, + "loss": 0.676, + "step": 45180 + }, + { + "epoch": 0.28870602966919234, + "grad_norm": 0.9763637185096741, + "learning_rate": 9.495017220623669e-05, + "loss": 0.7928, + "step": 45190 + }, + { + "epoch": 0.28876991681893105, + 
"grad_norm": 1.778890609741211, + "learning_rate": 9.494797452711174e-05, + "loss": 0.699, + "step": 45200 + }, + { + "epoch": 0.28883380396866976, + "grad_norm": 0.7678098678588867, + "learning_rate": 9.49457763953218e-05, + "loss": 0.8654, + "step": 45210 + }, + { + "epoch": 0.28889769111840846, + "grad_norm": 0.7432067394256592, + "learning_rate": 9.494357781088901e-05, + "loss": 0.7378, + "step": 45220 + }, + { + "epoch": 0.28896157826814717, + "grad_norm": 0.9834187030792236, + "learning_rate": 9.494137877383551e-05, + "loss": 0.9317, + "step": 45230 + }, + { + "epoch": 0.2890254654178859, + "grad_norm": 0.6653081178665161, + "learning_rate": 9.493917928418345e-05, + "loss": 0.7968, + "step": 45240 + }, + { + "epoch": 0.2890893525676245, + "grad_norm": 0.7635434865951538, + "learning_rate": 9.493697934195499e-05, + "loss": 0.8611, + "step": 45250 + }, + { + "epoch": 0.28915323971736323, + "grad_norm": 0.9517902135848999, + "learning_rate": 9.493477894717224e-05, + "loss": 0.7536, + "step": 45260 + }, + { + "epoch": 0.28921712686710194, + "grad_norm": 0.7886881232261658, + "learning_rate": 9.49325780998574e-05, + "loss": 0.9113, + "step": 45270 + }, + { + "epoch": 0.28928101401684064, + "grad_norm": 0.7776336669921875, + "learning_rate": 9.493037680003264e-05, + "loss": 0.8193, + "step": 45280 + }, + { + "epoch": 0.28934490116657935, + "grad_norm": 0.8685764670372009, + "learning_rate": 9.492817504772012e-05, + "loss": 0.9521, + "step": 45290 + }, + { + "epoch": 0.28940878831631806, + "grad_norm": 1.247955322265625, + "learning_rate": 9.492597284294198e-05, + "loss": 0.9216, + "step": 45300 + }, + { + "epoch": 0.28947267546605676, + "grad_norm": 0.9125822186470032, + "learning_rate": 9.492377018572046e-05, + "loss": 1.003, + "step": 45310 + }, + { + "epoch": 0.28953656261579547, + "grad_norm": 0.7521454095840454, + "learning_rate": 9.492156707607769e-05, + "loss": 0.806, + "step": 45320 + }, + { + "epoch": 0.2896004497655342, + "grad_norm": 0.8048921823501587, 
+ "learning_rate": 9.491936351403588e-05, + "loss": 1.0567, + "step": 45330 + }, + { + "epoch": 0.2896643369152729, + "grad_norm": 0.6293105483055115, + "learning_rate": 9.491715949961721e-05, + "loss": 0.6753, + "step": 45340 + }, + { + "epoch": 0.2897282240650116, + "grad_norm": 0.7665662169456482, + "learning_rate": 9.491495503284391e-05, + "loss": 1.1162, + "step": 45350 + }, + { + "epoch": 0.2897921112147503, + "grad_norm": 1.389918327331543, + "learning_rate": 9.491275011373813e-05, + "loss": 0.8464, + "step": 45360 + }, + { + "epoch": 0.28985599836448894, + "grad_norm": 0.5303570628166199, + "learning_rate": 9.491054474232212e-05, + "loss": 0.8697, + "step": 45370 + }, + { + "epoch": 0.28991988551422765, + "grad_norm": 0.7896818518638611, + "learning_rate": 9.490833891861806e-05, + "loss": 0.8274, + "step": 45380 + }, + { + "epoch": 0.28998377266396635, + "grad_norm": 1.085740566253662, + "learning_rate": 9.490613264264818e-05, + "loss": 1.2138, + "step": 45390 + }, + { + "epoch": 0.29004765981370506, + "grad_norm": 0.9836480617523193, + "learning_rate": 9.490392591443469e-05, + "loss": 1.0133, + "step": 45400 + }, + { + "epoch": 0.29011154696344377, + "grad_norm": 1.2857035398483276, + "learning_rate": 9.490171873399982e-05, + "loss": 0.7627, + "step": 45410 + }, + { + "epoch": 0.2901754341131825, + "grad_norm": 0.7839180827140808, + "learning_rate": 9.489951110136581e-05, + "loss": 0.8626, + "step": 45420 + }, + { + "epoch": 0.2902393212629212, + "grad_norm": 0.6946144104003906, + "learning_rate": 9.489730301655486e-05, + "loss": 0.8252, + "step": 45430 + }, + { + "epoch": 0.2903032084126599, + "grad_norm": 0.7816669344902039, + "learning_rate": 9.489509447958924e-05, + "loss": 0.8103, + "step": 45440 + }, + { + "epoch": 0.2903670955623986, + "grad_norm": 1.0374782085418701, + "learning_rate": 9.489288549049118e-05, + "loss": 0.9284, + "step": 45450 + }, + { + "epoch": 0.2904309827121373, + "grad_norm": 1.0329042673110962, + "learning_rate": 
9.489067604928293e-05, + "loss": 1.0252, + "step": 45460 + }, + { + "epoch": 0.290494869861876, + "grad_norm": 1.062635898590088, + "learning_rate": 9.488846615598671e-05, + "loss": 0.8388, + "step": 45470 + }, + { + "epoch": 0.2905587570116147, + "grad_norm": 1.2873570919036865, + "learning_rate": 9.488625581062483e-05, + "loss": 0.8721, + "step": 45480 + }, + { + "epoch": 0.29062264416135336, + "grad_norm": 1.4806243181228638, + "learning_rate": 9.48840450132195e-05, + "loss": 1.1295, + "step": 45490 + }, + { + "epoch": 0.29068653131109207, + "grad_norm": 0.9083405137062073, + "learning_rate": 9.488183376379302e-05, + "loss": 0.782, + "step": 45500 + }, + { + "epoch": 0.2907504184608308, + "grad_norm": 0.43349987268447876, + "learning_rate": 9.487962206236765e-05, + "loss": 0.9368, + "step": 45510 + }, + { + "epoch": 0.2908143056105695, + "grad_norm": 0.6599463224411011, + "learning_rate": 9.487740990896564e-05, + "loss": 0.7841, + "step": 45520 + }, + { + "epoch": 0.2908781927603082, + "grad_norm": 0.6311991810798645, + "learning_rate": 9.48751973036093e-05, + "loss": 0.9138, + "step": 45530 + }, + { + "epoch": 0.2909420799100469, + "grad_norm": 0.5348168015480042, + "learning_rate": 9.487298424632089e-05, + "loss": 0.7043, + "step": 45540 + }, + { + "epoch": 0.2910059670597856, + "grad_norm": 0.8502787947654724, + "learning_rate": 9.487077073712273e-05, + "loss": 1.0872, + "step": 45550 + }, + { + "epoch": 0.2910698542095243, + "grad_norm": 0.8174751400947571, + "learning_rate": 9.486855677603707e-05, + "loss": 0.9294, + "step": 45560 + }, + { + "epoch": 0.291133741359263, + "grad_norm": 0.7692357897758484, + "learning_rate": 9.486634236308624e-05, + "loss": 0.9752, + "step": 45570 + }, + { + "epoch": 0.2911976285090017, + "grad_norm": 0.43835484981536865, + "learning_rate": 9.486412749829251e-05, + "loss": 0.8376, + "step": 45580 + }, + { + "epoch": 0.2912615156587404, + "grad_norm": 1.0766656398773193, + "learning_rate": 9.486191218167823e-05, + "loss": 0.95, 
+ "step": 45590 + }, + { + "epoch": 0.29132540280847913, + "grad_norm": 0.7746816277503967, + "learning_rate": 9.485969641326566e-05, + "loss": 1.114, + "step": 45600 + }, + { + "epoch": 0.2913892899582178, + "grad_norm": 0.8561303019523621, + "learning_rate": 9.485748019307716e-05, + "loss": 1.125, + "step": 45610 + }, + { + "epoch": 0.2914531771079565, + "grad_norm": 0.7320988774299622, + "learning_rate": 9.4855263521135e-05, + "loss": 0.8814, + "step": 45620 + }, + { + "epoch": 0.2915170642576952, + "grad_norm": 0.9033998847007751, + "learning_rate": 9.485304639746155e-05, + "loss": 0.8253, + "step": 45630 + }, + { + "epoch": 0.2915809514074339, + "grad_norm": 1.71013605594635, + "learning_rate": 9.485082882207911e-05, + "loss": 0.632, + "step": 45640 + }, + { + "epoch": 0.2916448385571726, + "grad_norm": 0.5615208148956299, + "learning_rate": 9.484861079501003e-05, + "loss": 0.7828, + "step": 45650 + }, + { + "epoch": 0.2917087257069113, + "grad_norm": 1.236778974533081, + "learning_rate": 9.484639231627664e-05, + "loss": 0.9695, + "step": 45660 + }, + { + "epoch": 0.29177261285665, + "grad_norm": 0.9685949683189392, + "learning_rate": 9.484417338590127e-05, + "loss": 0.6323, + "step": 45670 + }, + { + "epoch": 0.2918365000063887, + "grad_norm": 1.1921647787094116, + "learning_rate": 9.484195400390629e-05, + "loss": 1.1795, + "step": 45680 + }, + { + "epoch": 0.2919003871561274, + "grad_norm": 1.2076773643493652, + "learning_rate": 9.483973417031404e-05, + "loss": 0.9146, + "step": 45690 + }, + { + "epoch": 0.29196427430586613, + "grad_norm": 1.2398484945297241, + "learning_rate": 9.483751388514685e-05, + "loss": 0.9077, + "step": 45700 + }, + { + "epoch": 0.29202816145560484, + "grad_norm": 0.9221826195716858, + "learning_rate": 9.483529314842715e-05, + "loss": 0.6419, + "step": 45710 + }, + { + "epoch": 0.29209204860534355, + "grad_norm": 0.9597351551055908, + "learning_rate": 9.483307196017722e-05, + "loss": 0.8304, + "step": 45720 + }, + { + "epoch": 
0.2921559357550822, + "grad_norm": 0.8210084438323975, + "learning_rate": 9.483085032041949e-05, + "loss": 0.5964, + "step": 45730 + }, + { + "epoch": 0.2922198229048209, + "grad_norm": 0.8063592910766602, + "learning_rate": 9.48286282291763e-05, + "loss": 1.1034, + "step": 45740 + }, + { + "epoch": 0.2922837100545596, + "grad_norm": 1.1515041589736938, + "learning_rate": 9.482640568647006e-05, + "loss": 1.1599, + "step": 45750 + }, + { + "epoch": 0.2923475972042983, + "grad_norm": 0.5701981782913208, + "learning_rate": 9.482418269232311e-05, + "loss": 0.8986, + "step": 45760 + }, + { + "epoch": 0.292411484354037, + "grad_norm": 1.5599360466003418, + "learning_rate": 9.482195924675789e-05, + "loss": 1.0177, + "step": 45770 + }, + { + "epoch": 0.2924753715037757, + "grad_norm": 0.7682929635047913, + "learning_rate": 9.481973534979674e-05, + "loss": 1.104, + "step": 45780 + }, + { + "epoch": 0.29253925865351443, + "grad_norm": 0.5851641297340393, + "learning_rate": 9.481751100146209e-05, + "loss": 1.0031, + "step": 45790 + }, + { + "epoch": 0.29260314580325314, + "grad_norm": 0.8387541174888611, + "learning_rate": 9.481528620177633e-05, + "loss": 0.868, + "step": 45800 + }, + { + "epoch": 0.29266703295299185, + "grad_norm": 0.9967763423919678, + "learning_rate": 9.481306095076188e-05, + "loss": 0.7826, + "step": 45810 + }, + { + "epoch": 0.29273092010273055, + "grad_norm": 0.5963412523269653, + "learning_rate": 9.481083524844113e-05, + "loss": 0.8424, + "step": 45820 + }, + { + "epoch": 0.29279480725246926, + "grad_norm": 0.5736973285675049, + "learning_rate": 9.480860909483649e-05, + "loss": 0.8342, + "step": 45830 + }, + { + "epoch": 0.29285869440220796, + "grad_norm": 1.3335403203964233, + "learning_rate": 9.480638248997039e-05, + "loss": 1.086, + "step": 45840 + }, + { + "epoch": 0.2929225815519466, + "grad_norm": 0.5996566414833069, + "learning_rate": 9.480415543386528e-05, + "loss": 0.8788, + "step": 45850 + }, + { + "epoch": 0.2929864687016853, + "grad_norm": 
0.8202914595603943, + "learning_rate": 9.480192792654355e-05, + "loss": 0.9448, + "step": 45860 + }, + { + "epoch": 0.293050355851424, + "grad_norm": 1.0648213624954224, + "learning_rate": 9.479969996802763e-05, + "loss": 1.1049, + "step": 45870 + }, + { + "epoch": 0.29311424300116273, + "grad_norm": 0.8735106587409973, + "learning_rate": 9.479747155833999e-05, + "loss": 0.7924, + "step": 45880 + }, + { + "epoch": 0.29317813015090144, + "grad_norm": 0.7611486315727234, + "learning_rate": 9.479524269750306e-05, + "loss": 0.7604, + "step": 45890 + }, + { + "epoch": 0.29324201730064015, + "grad_norm": 1.0947805643081665, + "learning_rate": 9.479301338553927e-05, + "loss": 0.8645, + "step": 45900 + }, + { + "epoch": 0.29330590445037885, + "grad_norm": 1.0103657245635986, + "learning_rate": 9.479078362247109e-05, + "loss": 1.0796, + "step": 45910 + }, + { + "epoch": 0.29336979160011756, + "grad_norm": 0.4850558042526245, + "learning_rate": 9.478855340832097e-05, + "loss": 0.9143, + "step": 45920 + }, + { + "epoch": 0.29343367874985626, + "grad_norm": 1.094140648841858, + "learning_rate": 9.478632274311137e-05, + "loss": 0.8126, + "step": 45930 + }, + { + "epoch": 0.29349756589959497, + "grad_norm": 0.43484973907470703, + "learning_rate": 9.478409162686475e-05, + "loss": 0.8784, + "step": 45940 + }, + { + "epoch": 0.2935614530493337, + "grad_norm": 0.7616863250732422, + "learning_rate": 9.478186005960359e-05, + "loss": 0.9753, + "step": 45950 + }, + { + "epoch": 0.2936253401990724, + "grad_norm": 0.625148594379425, + "learning_rate": 9.477962804135037e-05, + "loss": 0.7565, + "step": 45960 + }, + { + "epoch": 0.29368922734881103, + "grad_norm": 0.5480995178222656, + "learning_rate": 9.477739557212753e-05, + "loss": 0.9026, + "step": 45970 + }, + { + "epoch": 0.29375311449854974, + "grad_norm": 2.2931747436523438, + "learning_rate": 9.477516265195759e-05, + "loss": 0.7589, + "step": 45980 + }, + { + "epoch": 0.29381700164828845, + "grad_norm": 0.734191358089447, + 
"learning_rate": 9.477292928086303e-05, + "loss": 0.9262, + "step": 45990 + }, + { + "epoch": 0.29388088879802715, + "grad_norm": 0.8451043367385864, + "learning_rate": 9.477069545886633e-05, + "loss": 0.9716, + "step": 46000 + }, + { + "epoch": 0.29394477594776586, + "grad_norm": 0.7809146046638489, + "learning_rate": 9.476846118599e-05, + "loss": 1.0449, + "step": 46010 + }, + { + "epoch": 0.29400866309750456, + "grad_norm": 0.930077314376831, + "learning_rate": 9.476622646225653e-05, + "loss": 0.7448, + "step": 46020 + }, + { + "epoch": 0.29407255024724327, + "grad_norm": 0.7774382829666138, + "learning_rate": 9.476399128768845e-05, + "loss": 0.9692, + "step": 46030 + }, + { + "epoch": 0.294136437396982, + "grad_norm": 0.678877592086792, + "learning_rate": 9.476175566230822e-05, + "loss": 0.8851, + "step": 46040 + }, + { + "epoch": 0.2942003245467207, + "grad_norm": 0.7753483653068542, + "learning_rate": 9.475951958613842e-05, + "loss": 0.7935, + "step": 46050 + }, + { + "epoch": 0.2942642116964594, + "grad_norm": 1.3318039178848267, + "learning_rate": 9.475728305920151e-05, + "loss": 0.9516, + "step": 46060 + }, + { + "epoch": 0.2943280988461981, + "grad_norm": 3.3444128036499023, + "learning_rate": 9.475504608152005e-05, + "loss": 0.9883, + "step": 46070 + }, + { + "epoch": 0.2943919859959368, + "grad_norm": 1.5764853954315186, + "learning_rate": 9.475280865311656e-05, + "loss": 0.8666, + "step": 46080 + }, + { + "epoch": 0.2944558731456755, + "grad_norm": 1.4029241800308228, + "learning_rate": 9.475057077401356e-05, + "loss": 0.9805, + "step": 46090 + }, + { + "epoch": 0.29451976029541416, + "grad_norm": 0.5334384441375732, + "learning_rate": 9.47483324442336e-05, + "loss": 0.8014, + "step": 46100 + }, + { + "epoch": 0.29458364744515286, + "grad_norm": 0.8421732783317566, + "learning_rate": 9.474609366379923e-05, + "loss": 0.8272, + "step": 46110 + }, + { + "epoch": 0.29464753459489157, + "grad_norm": 0.5735695362091064, + "learning_rate": 
9.474385443273296e-05, + "loss": 0.9271, + "step": 46120 + }, + { + "epoch": 0.2947114217446303, + "grad_norm": 0.8526939749717712, + "learning_rate": 9.47416147510574e-05, + "loss": 0.9938, + "step": 46130 + }, + { + "epoch": 0.294775308894369, + "grad_norm": 0.6834962964057922, + "learning_rate": 9.473937461879505e-05, + "loss": 1.1168, + "step": 46140 + }, + { + "epoch": 0.2948391960441077, + "grad_norm": 1.1148639917373657, + "learning_rate": 9.47371340359685e-05, + "loss": 0.9541, + "step": 46150 + }, + { + "epoch": 0.2949030831938464, + "grad_norm": 0.8598116040229797, + "learning_rate": 9.47348930026003e-05, + "loss": 0.9374, + "step": 46160 + }, + { + "epoch": 0.2949669703435851, + "grad_norm": 0.6423646211624146, + "learning_rate": 9.473265151871304e-05, + "loss": 0.8231, + "step": 46170 + }, + { + "epoch": 0.2950308574933238, + "grad_norm": 2.0000832080841064, + "learning_rate": 9.473040958432927e-05, + "loss": 0.8936, + "step": 46180 + }, + { + "epoch": 0.2950947446430625, + "grad_norm": 1.143376350402832, + "learning_rate": 9.472816719947159e-05, + "loss": 0.6661, + "step": 46190 + }, + { + "epoch": 0.2951586317928012, + "grad_norm": 0.7327792048454285, + "learning_rate": 9.472592436416255e-05, + "loss": 0.8819, + "step": 46200 + }, + { + "epoch": 0.2952225189425399, + "grad_norm": 0.8125030994415283, + "learning_rate": 9.472368107842477e-05, + "loss": 0.9795, + "step": 46210 + }, + { + "epoch": 0.2952864060922786, + "grad_norm": 0.8501039743423462, + "learning_rate": 9.472143734228083e-05, + "loss": 1.1246, + "step": 46220 + }, + { + "epoch": 0.2953502932420173, + "grad_norm": 0.4900776743888855, + "learning_rate": 9.471919315575333e-05, + "loss": 0.6896, + "step": 46230 + }, + { + "epoch": 0.295414180391756, + "grad_norm": 1.3538086414337158, + "learning_rate": 9.471694851886487e-05, + "loss": 0.961, + "step": 46240 + }, + { + "epoch": 0.2954780675414947, + "grad_norm": 0.9380719661712646, + "learning_rate": 9.471470343163804e-05, + "loss": 1.1836, + 
"step": 46250 + }, + { + "epoch": 0.2955419546912334, + "grad_norm": 0.9986345767974854, + "learning_rate": 9.471245789409548e-05, + "loss": 0.8949, + "step": 46260 + }, + { + "epoch": 0.2956058418409721, + "grad_norm": 0.35391414165496826, + "learning_rate": 9.471021190625977e-05, + "loss": 0.8161, + "step": 46270 + }, + { + "epoch": 0.2956697289907108, + "grad_norm": 0.7981874942779541, + "learning_rate": 9.470796546815354e-05, + "loss": 0.9282, + "step": 46280 + }, + { + "epoch": 0.2957336161404495, + "grad_norm": 0.6027029752731323, + "learning_rate": 9.470571857979945e-05, + "loss": 0.9214, + "step": 46290 + }, + { + "epoch": 0.2957975032901882, + "grad_norm": 0.973746657371521, + "learning_rate": 9.470347124122008e-05, + "loss": 0.8735, + "step": 46300 + }, + { + "epoch": 0.29586139043992693, + "grad_norm": 0.944004476070404, + "learning_rate": 9.470122345243809e-05, + "loss": 0.9898, + "step": 46310 + }, + { + "epoch": 0.29592527758966564, + "grad_norm": 0.9042976498603821, + "learning_rate": 9.469897521347609e-05, + "loss": 1.0455, + "step": 46320 + }, + { + "epoch": 0.29598916473940434, + "grad_norm": 0.7813184857368469, + "learning_rate": 9.469672652435675e-05, + "loss": 0.879, + "step": 46330 + }, + { + "epoch": 0.296053051889143, + "grad_norm": 1.1560348272323608, + "learning_rate": 9.469447738510269e-05, + "loss": 0.8168, + "step": 46340 + }, + { + "epoch": 0.2961169390388817, + "grad_norm": 0.8251795768737793, + "learning_rate": 9.46922277957366e-05, + "loss": 0.9762, + "step": 46350 + }, + { + "epoch": 0.2961808261886204, + "grad_norm": 1.086754560470581, + "learning_rate": 9.46899777562811e-05, + "loss": 0.9877, + "step": 46360 + }, + { + "epoch": 0.2962447133383591, + "grad_norm": 1.2580642700195312, + "learning_rate": 9.468772726675887e-05, + "loss": 0.808, + "step": 46370 + }, + { + "epoch": 0.2963086004880978, + "grad_norm": 0.946445107460022, + "learning_rate": 9.468547632719255e-05, + "loss": 0.7862, + "step": 46380 + }, + { + "epoch": 
0.2963724876378365, + "grad_norm": 1.1934231519699097, + "learning_rate": 9.468322493760484e-05, + "loss": 1.1795, + "step": 46390 + }, + { + "epoch": 0.29643637478757523, + "grad_norm": 0.9049299955368042, + "learning_rate": 9.46809730980184e-05, + "loss": 0.8541, + "step": 46400 + }, + { + "epoch": 0.29650026193731394, + "grad_norm": 1.3336893320083618, + "learning_rate": 9.46787208084559e-05, + "loss": 1.0398, + "step": 46410 + }, + { + "epoch": 0.29656414908705264, + "grad_norm": 0.9916601181030273, + "learning_rate": 9.467646806894001e-05, + "loss": 0.8907, + "step": 46420 + }, + { + "epoch": 0.29662803623679135, + "grad_norm": 0.9866839051246643, + "learning_rate": 9.467421487949347e-05, + "loss": 1.1556, + "step": 46430 + }, + { + "epoch": 0.29669192338653005, + "grad_norm": 0.7323521971702576, + "learning_rate": 9.467196124013893e-05, + "loss": 0.9656, + "step": 46440 + }, + { + "epoch": 0.29675581053626876, + "grad_norm": 1.1069689989089966, + "learning_rate": 9.466970715089907e-05, + "loss": 0.6297, + "step": 46450 + }, + { + "epoch": 0.2968196976860074, + "grad_norm": 0.5628019571304321, + "learning_rate": 9.466745261179664e-05, + "loss": 0.806, + "step": 46460 + }, + { + "epoch": 0.2968835848357461, + "grad_norm": 1.0032429695129395, + "learning_rate": 9.466519762285431e-05, + "loss": 0.9214, + "step": 46470 + }, + { + "epoch": 0.2969474719854848, + "grad_norm": 0.7182255983352661, + "learning_rate": 9.466294218409479e-05, + "loss": 0.9303, + "step": 46480 + }, + { + "epoch": 0.29701135913522353, + "grad_norm": 1.8324652910232544, + "learning_rate": 9.466068629554082e-05, + "loss": 1.1856, + "step": 46490 + }, + { + "epoch": 0.29707524628496224, + "grad_norm": 0.7303147315979004, + "learning_rate": 9.46584299572151e-05, + "loss": 0.7481, + "step": 46500 + }, + { + "epoch": 0.29713913343470094, + "grad_norm": 2.1872732639312744, + "learning_rate": 9.465617316914033e-05, + "loss": 0.7029, + "step": 46510 + }, + { + "epoch": 0.29720302058443965, + 
"grad_norm": 1.0133579969406128, + "learning_rate": 9.46539159313393e-05, + "loss": 0.8906, + "step": 46520 + }, + { + "epoch": 0.29726690773417835, + "grad_norm": 0.7005990147590637, + "learning_rate": 9.465165824383468e-05, + "loss": 0.725, + "step": 46530 + }, + { + "epoch": 0.29733079488391706, + "grad_norm": 1.0312696695327759, + "learning_rate": 9.464940010664925e-05, + "loss": 0.9634, + "step": 46540 + }, + { + "epoch": 0.29739468203365577, + "grad_norm": 0.36605343222618103, + "learning_rate": 9.464714151980571e-05, + "loss": 0.7082, + "step": 46550 + }, + { + "epoch": 0.2974585691833945, + "grad_norm": 1.1739599704742432, + "learning_rate": 9.464488248332685e-05, + "loss": 1.4234, + "step": 46560 + }, + { + "epoch": 0.2975224563331332, + "grad_norm": 0.8871263265609741, + "learning_rate": 9.464262299723539e-05, + "loss": 0.7826, + "step": 46570 + }, + { + "epoch": 0.29758634348287183, + "grad_norm": 0.652490496635437, + "learning_rate": 9.46403630615541e-05, + "loss": 0.9345, + "step": 46580 + }, + { + "epoch": 0.29765023063261054, + "grad_norm": 0.8714577555656433, + "learning_rate": 9.463810267630573e-05, + "loss": 1.096, + "step": 46590 + }, + { + "epoch": 0.29771411778234924, + "grad_norm": 0.48764970898628235, + "learning_rate": 9.463584184151305e-05, + "loss": 0.9833, + "step": 46600 + }, + { + "epoch": 0.29777800493208795, + "grad_norm": 0.5805774331092834, + "learning_rate": 9.463358055719883e-05, + "loss": 0.6249, + "step": 46610 + }, + { + "epoch": 0.29784189208182665, + "grad_norm": 1.5289901494979858, + "learning_rate": 9.463131882338583e-05, + "loss": 1.0411, + "step": 46620 + }, + { + "epoch": 0.29790577923156536, + "grad_norm": 0.9983165264129639, + "learning_rate": 9.462905664009685e-05, + "loss": 0.9297, + "step": 46630 + }, + { + "epoch": 0.29796966638130407, + "grad_norm": 0.5943264961242676, + "learning_rate": 9.462679400735466e-05, + "loss": 1.0207, + "step": 46640 + }, + { + "epoch": 0.29803355353104277, + "grad_norm": 
0.9419231414794922, + "learning_rate": 9.462453092518204e-05, + "loss": 0.8072, + "step": 46650 + }, + { + "epoch": 0.2980974406807815, + "grad_norm": 0.6155195832252502, + "learning_rate": 9.46222673936018e-05, + "loss": 0.7672, + "step": 46660 + }, + { + "epoch": 0.2981613278305202, + "grad_norm": 0.5580214858055115, + "learning_rate": 9.462000341263671e-05, + "loss": 0.9204, + "step": 46670 + }, + { + "epoch": 0.2982252149802589, + "grad_norm": 1.5612927675247192, + "learning_rate": 9.46177389823096e-05, + "loss": 1.2065, + "step": 46680 + }, + { + "epoch": 0.2982891021299976, + "grad_norm": 1.112136960029602, + "learning_rate": 9.461547410264324e-05, + "loss": 0.7437, + "step": 46690 + }, + { + "epoch": 0.29835298927973625, + "grad_norm": 0.646933913230896, + "learning_rate": 9.461320877366047e-05, + "loss": 0.8512, + "step": 46700 + }, + { + "epoch": 0.29841687642947495, + "grad_norm": 0.7455711960792542, + "learning_rate": 9.461094299538408e-05, + "loss": 1.0632, + "step": 46710 + }, + { + "epoch": 0.29848076357921366, + "grad_norm": 1.7939437627792358, + "learning_rate": 9.460867676783691e-05, + "loss": 0.9103, + "step": 46720 + }, + { + "epoch": 0.29854465072895237, + "grad_norm": 0.8458738923072815, + "learning_rate": 9.460641009104177e-05, + "loss": 0.8318, + "step": 46730 + }, + { + "epoch": 0.29860853787869107, + "grad_norm": 1.1365669965744019, + "learning_rate": 9.460414296502149e-05, + "loss": 0.94, + "step": 46740 + }, + { + "epoch": 0.2986724250284298, + "grad_norm": 0.8920236229896545, + "learning_rate": 9.46018753897989e-05, + "loss": 0.8648, + "step": 46750 + }, + { + "epoch": 0.2987363121781685, + "grad_norm": 1.0041251182556152, + "learning_rate": 9.459960736539683e-05, + "loss": 0.6963, + "step": 46760 + }, + { + "epoch": 0.2988001993279072, + "grad_norm": 0.6039364337921143, + "learning_rate": 9.459733889183815e-05, + "loss": 0.8719, + "step": 46770 + }, + { + "epoch": 0.2988640864776459, + "grad_norm": 1.0676556825637817, + "learning_rate": 
9.459506996914568e-05, + "loss": 0.7705, + "step": 46780 + }, + { + "epoch": 0.2989279736273846, + "grad_norm": 1.1080639362335205, + "learning_rate": 9.459280059734226e-05, + "loss": 0.9965, + "step": 46790 + }, + { + "epoch": 0.2989918607771233, + "grad_norm": 1.0551854372024536, + "learning_rate": 9.459053077645077e-05, + "loss": 0.8556, + "step": 46800 + }, + { + "epoch": 0.299055747926862, + "grad_norm": 0.8783060908317566, + "learning_rate": 9.458826050649407e-05, + "loss": 1.329, + "step": 46810 + }, + { + "epoch": 0.29911963507660067, + "grad_norm": 0.8185521960258484, + "learning_rate": 9.4585989787495e-05, + "loss": 1.0131, + "step": 46820 + }, + { + "epoch": 0.29918352222633937, + "grad_norm": 0.5932868719100952, + "learning_rate": 9.458371861947645e-05, + "loss": 1.0617, + "step": 46830 + }, + { + "epoch": 0.2992474093760781, + "grad_norm": 1.5922162532806396, + "learning_rate": 9.458144700246127e-05, + "loss": 0.8565, + "step": 46840 + }, + { + "epoch": 0.2993112965258168, + "grad_norm": 0.603920042514801, + "learning_rate": 9.457917493647235e-05, + "loss": 0.7725, + "step": 46850 + }, + { + "epoch": 0.2993751836755555, + "grad_norm": 0.9906972646713257, + "learning_rate": 9.457690242153258e-05, + "loss": 1.0442, + "step": 46860 + }, + { + "epoch": 0.2994390708252942, + "grad_norm": 0.756675124168396, + "learning_rate": 9.457462945766484e-05, + "loss": 0.8007, + "step": 46870 + }, + { + "epoch": 0.2995029579750329, + "grad_norm": 0.6027681827545166, + "learning_rate": 9.4572356044892e-05, + "loss": 1.1144, + "step": 46880 + }, + { + "epoch": 0.2995668451247716, + "grad_norm": 0.542198896408081, + "learning_rate": 9.457008218323699e-05, + "loss": 0.9496, + "step": 46890 + }, + { + "epoch": 0.2996307322745103, + "grad_norm": 1.004642367362976, + "learning_rate": 9.45678078727227e-05, + "loss": 0.8916, + "step": 46900 + }, + { + "epoch": 0.299694619424249, + "grad_norm": 0.5443822741508484, + "learning_rate": 9.456553311337202e-05, + "loss": 1.0492, + 
"step": 46910 + }, + { + "epoch": 0.2997585065739877, + "grad_norm": 0.562498927116394, + "learning_rate": 9.456325790520789e-05, + "loss": 1.0578, + "step": 46920 + }, + { + "epoch": 0.29982239372372643, + "grad_norm": 0.7859065532684326, + "learning_rate": 9.456098224825316e-05, + "loss": 1.046, + "step": 46930 + }, + { + "epoch": 0.29988628087346514, + "grad_norm": 0.7627017498016357, + "learning_rate": 9.455870614253081e-05, + "loss": 0.8198, + "step": 46940 + }, + { + "epoch": 0.2999501680232038, + "grad_norm": 0.6485910415649414, + "learning_rate": 9.455642958806374e-05, + "loss": 0.9887, + "step": 46950 + }, + { + "epoch": 0.3000140551729425, + "grad_norm": 1.4447276592254639, + "learning_rate": 9.455415258487487e-05, + "loss": 0.7989, + "step": 46960 + }, + { + "epoch": 0.3000779423226812, + "grad_norm": 0.9059609770774841, + "learning_rate": 9.455187513298714e-05, + "loss": 0.9545, + "step": 46970 + }, + { + "epoch": 0.3001418294724199, + "grad_norm": 1.1355173587799072, + "learning_rate": 9.454959723242349e-05, + "loss": 0.825, + "step": 46980 + }, + { + "epoch": 0.3002057166221586, + "grad_norm": 1.0973711013793945, + "learning_rate": 9.454731888320684e-05, + "loss": 0.9209, + "step": 46990 + }, + { + "epoch": 0.3002696037718973, + "grad_norm": 0.9574286341667175, + "learning_rate": 9.454504008536017e-05, + "loss": 0.8564, + "step": 47000 + }, + { + "epoch": 0.300333490921636, + "grad_norm": 1.527851939201355, + "learning_rate": 9.454276083890641e-05, + "loss": 1.3292, + "step": 47010 + }, + { + "epoch": 0.30039737807137473, + "grad_norm": 0.8139092326164246, + "learning_rate": 9.454048114386848e-05, + "loss": 0.8496, + "step": 47020 + }, + { + "epoch": 0.30046126522111344, + "grad_norm": 1.5546993017196655, + "learning_rate": 9.453820100026942e-05, + "loss": 1.1378, + "step": 47030 + }, + { + "epoch": 0.30052515237085214, + "grad_norm": 0.7080782055854797, + "learning_rate": 9.45359204081321e-05, + "loss": 0.7672, + "step": 47040 + }, + { + "epoch": 
0.30058903952059085, + "grad_norm": 0.6548307538032532, + "learning_rate": 9.453363936747957e-05, + "loss": 1.0312, + "step": 47050 + }, + { + "epoch": 0.30065292667032956, + "grad_norm": 0.640304684638977, + "learning_rate": 9.453135787833473e-05, + "loss": 0.9947, + "step": 47060 + }, + { + "epoch": 0.3007168138200682, + "grad_norm": 0.9930755496025085, + "learning_rate": 9.452907594072062e-05, + "loss": 0.7912, + "step": 47070 + }, + { + "epoch": 0.3007807009698069, + "grad_norm": 0.8189347386360168, + "learning_rate": 9.452679355466018e-05, + "loss": 1.1204, + "step": 47080 + }, + { + "epoch": 0.3008445881195456, + "grad_norm": 0.8146405220031738, + "learning_rate": 9.45245107201764e-05, + "loss": 1.0947, + "step": 47090 + }, + { + "epoch": 0.3009084752692843, + "grad_norm": 0.9201721549034119, + "learning_rate": 9.45222274372923e-05, + "loss": 0.7746, + "step": 47100 + }, + { + "epoch": 0.30097236241902303, + "grad_norm": 0.7247973680496216, + "learning_rate": 9.451994370603084e-05, + "loss": 0.8073, + "step": 47110 + }, + { + "epoch": 0.30103624956876174, + "grad_norm": 3.4702842235565186, + "learning_rate": 9.451765952641502e-05, + "loss": 0.8341, + "step": 47120 + }, + { + "epoch": 0.30110013671850044, + "grad_norm": 2.2234277725219727, + "learning_rate": 9.451537489846787e-05, + "loss": 0.8992, + "step": 47130 + }, + { + "epoch": 0.30116402386823915, + "grad_norm": 0.8516297340393066, + "learning_rate": 9.451308982221238e-05, + "loss": 1.1016, + "step": 47140 + }, + { + "epoch": 0.30122791101797786, + "grad_norm": 0.6456612348556519, + "learning_rate": 9.451080429767157e-05, + "loss": 0.878, + "step": 47150 + }, + { + "epoch": 0.30129179816771656, + "grad_norm": 1.235134482383728, + "learning_rate": 9.450851832486844e-05, + "loss": 0.8274, + "step": 47160 + }, + { + "epoch": 0.30135568531745527, + "grad_norm": 1.2965903282165527, + "learning_rate": 9.450623190382604e-05, + "loss": 1.0011, + "step": 47170 + }, + { + "epoch": 0.301419572467194, + 
"grad_norm": 0.6325692534446716, + "learning_rate": 9.450394503456739e-05, + "loss": 0.8392, + "step": 47180 + }, + { + "epoch": 0.3014834596169326, + "grad_norm": 0.9124320149421692, + "learning_rate": 9.45016577171155e-05, + "loss": 1.0633, + "step": 47190 + }, + { + "epoch": 0.30154734676667133, + "grad_norm": 0.5959859490394592, + "learning_rate": 9.44993699514934e-05, + "loss": 0.993, + "step": 47200 + }, + { + "epoch": 0.30161123391641004, + "grad_norm": 0.5984769463539124, + "learning_rate": 9.449708173772417e-05, + "loss": 0.8204, + "step": 47210 + }, + { + "epoch": 0.30167512106614874, + "grad_norm": 1.2055346965789795, + "learning_rate": 9.449479307583082e-05, + "loss": 1.1527, + "step": 47220 + }, + { + "epoch": 0.30173900821588745, + "grad_norm": 0.8771409392356873, + "learning_rate": 9.449250396583642e-05, + "loss": 0.7836, + "step": 47230 + }, + { + "epoch": 0.30180289536562616, + "grad_norm": 1.1012285947799683, + "learning_rate": 9.4490214407764e-05, + "loss": 0.8842, + "step": 47240 + }, + { + "epoch": 0.30186678251536486, + "grad_norm": 2.031371831893921, + "learning_rate": 9.448792440163664e-05, + "loss": 0.8747, + "step": 47250 + }, + { + "epoch": 0.30193066966510357, + "grad_norm": 1.0956002473831177, + "learning_rate": 9.44856339474774e-05, + "loss": 0.9346, + "step": 47260 + }, + { + "epoch": 0.3019945568148423, + "grad_norm": 1.060286521911621, + "learning_rate": 9.448334304530932e-05, + "loss": 0.9462, + "step": 47270 + }, + { + "epoch": 0.302058443964581, + "grad_norm": 0.9044703841209412, + "learning_rate": 9.448105169515551e-05, + "loss": 0.8297, + "step": 47280 + }, + { + "epoch": 0.3021223311143197, + "grad_norm": 0.7445279955863953, + "learning_rate": 9.447875989703902e-05, + "loss": 0.9671, + "step": 47290 + }, + { + "epoch": 0.3021862182640584, + "grad_norm": 0.9023739099502563, + "learning_rate": 9.447646765098294e-05, + "loss": 0.9307, + "step": 47300 + }, + { + "epoch": 0.30225010541379704, + "grad_norm": 1.161218523979187, + 
"learning_rate": 9.447417495701036e-05, + "loss": 0.8708, + "step": 47310 + }, + { + "epoch": 0.30231399256353575, + "grad_norm": 0.9403544068336487, + "learning_rate": 9.447188181514437e-05, + "loss": 1.0087, + "step": 47320 + }, + { + "epoch": 0.30237787971327446, + "grad_norm": 1.1664180755615234, + "learning_rate": 9.446958822540803e-05, + "loss": 0.9059, + "step": 47330 + }, + { + "epoch": 0.30244176686301316, + "grad_norm": 1.02223539352417, + "learning_rate": 9.446729418782448e-05, + "loss": 1.0916, + "step": 47340 + }, + { + "epoch": 0.30250565401275187, + "grad_norm": 0.7959775924682617, + "learning_rate": 9.446499970241682e-05, + "loss": 0.9342, + "step": 47350 + }, + { + "epoch": 0.3025695411624906, + "grad_norm": 0.938345730304718, + "learning_rate": 9.446270476920813e-05, + "loss": 1.0355, + "step": 47360 + }, + { + "epoch": 0.3026334283122293, + "grad_norm": 0.8003197908401489, + "learning_rate": 9.446040938822154e-05, + "loss": 0.6568, + "step": 47370 + }, + { + "epoch": 0.302697315461968, + "grad_norm": 1.0039125680923462, + "learning_rate": 9.445811355948016e-05, + "loss": 0.7738, + "step": 47380 + }, + { + "epoch": 0.3027612026117067, + "grad_norm": 0.7357406616210938, + "learning_rate": 9.44558172830071e-05, + "loss": 0.9009, + "step": 47390 + }, + { + "epoch": 0.3028250897614454, + "grad_norm": 0.9000012874603271, + "learning_rate": 9.445352055882552e-05, + "loss": 0.6797, + "step": 47400 + }, + { + "epoch": 0.3028889769111841, + "grad_norm": 0.7020642161369324, + "learning_rate": 9.445122338695853e-05, + "loss": 1.0587, + "step": 47410 + }, + { + "epoch": 0.3029528640609228, + "grad_norm": 0.9596700668334961, + "learning_rate": 9.444892576742927e-05, + "loss": 0.8238, + "step": 47420 + }, + { + "epoch": 0.30301675121066146, + "grad_norm": 0.9670388698577881, + "learning_rate": 9.444662770026087e-05, + "loss": 1.0262, + "step": 47430 + }, + { + "epoch": 0.30308063836040017, + "grad_norm": 0.6952800154685974, + "learning_rate": 
9.444432918547648e-05, + "loss": 0.6862, + "step": 47440 + }, + { + "epoch": 0.3031445255101389, + "grad_norm": 1.1551501750946045, + "learning_rate": 9.444203022309923e-05, + "loss": 0.8036, + "step": 47450 + }, + { + "epoch": 0.3032084126598776, + "grad_norm": 0.7595165371894836, + "learning_rate": 9.44397308131523e-05, + "loss": 0.7412, + "step": 47460 + }, + { + "epoch": 0.3032722998096163, + "grad_norm": 1.8203938007354736, + "learning_rate": 9.443743095565882e-05, + "loss": 0.8287, + "step": 47470 + }, + { + "epoch": 0.303336186959355, + "grad_norm": 0.9286074042320251, + "learning_rate": 9.443513065064198e-05, + "loss": 0.7826, + "step": 47480 + }, + { + "epoch": 0.3034000741090937, + "grad_norm": 0.6284856200218201, + "learning_rate": 9.443282989812495e-05, + "loss": 0.8863, + "step": 47490 + }, + { + "epoch": 0.3034639612588324, + "grad_norm": 0.5591778755187988, + "learning_rate": 9.443052869813085e-05, + "loss": 0.8473, + "step": 47500 + }, + { + "epoch": 0.3035278484085711, + "grad_norm": 0.854895293712616, + "learning_rate": 9.44282270506829e-05, + "loss": 0.9199, + "step": 47510 + }, + { + "epoch": 0.3035917355583098, + "grad_norm": 0.6344938278198242, + "learning_rate": 9.442592495580427e-05, + "loss": 0.8679, + "step": 47520 + }, + { + "epoch": 0.3036556227080485, + "grad_norm": 0.9397995471954346, + "learning_rate": 9.442362241351815e-05, + "loss": 0.8879, + "step": 47530 + }, + { + "epoch": 0.30371950985778723, + "grad_norm": 0.9806991219520569, + "learning_rate": 9.442131942384769e-05, + "loss": 1.0208, + "step": 47540 + }, + { + "epoch": 0.3037833970075259, + "grad_norm": 0.7757532000541687, + "learning_rate": 9.441901598681615e-05, + "loss": 1.0113, + "step": 47550 + }, + { + "epoch": 0.3038472841572646, + "grad_norm": 1.0031111240386963, + "learning_rate": 9.441671210244667e-05, + "loss": 0.8251, + "step": 47560 + }, + { + "epoch": 0.3039111713070033, + "grad_norm": 0.7999134659767151, + "learning_rate": 9.441440777076248e-05, + "loss": 0.836, 
+ "step": 47570 + }, + { + "epoch": 0.303975058456742, + "grad_norm": 1.0625855922698975, + "learning_rate": 9.441210299178677e-05, + "loss": 0.87, + "step": 47580 + }, + { + "epoch": 0.3040389456064807, + "grad_norm": 1.3165303468704224, + "learning_rate": 9.440979776554278e-05, + "loss": 0.7388, + "step": 47590 + }, + { + "epoch": 0.3041028327562194, + "grad_norm": 1.166496992111206, + "learning_rate": 9.44074920920537e-05, + "loss": 0.7745, + "step": 47600 + }, + { + "epoch": 0.3041667199059581, + "grad_norm": 0.9159298539161682, + "learning_rate": 9.440518597134275e-05, + "loss": 0.932, + "step": 47610 + }, + { + "epoch": 0.3042306070556968, + "grad_norm": 1.0953913927078247, + "learning_rate": 9.440287940343317e-05, + "loss": 0.9399, + "step": 47620 + }, + { + "epoch": 0.30429449420543553, + "grad_norm": 0.7524245977401733, + "learning_rate": 9.440057238834816e-05, + "loss": 0.7907, + "step": 47630 + }, + { + "epoch": 0.30435838135517423, + "grad_norm": 0.6022171974182129, + "learning_rate": 9.4398264926111e-05, + "loss": 0.9259, + "step": 47640 + }, + { + "epoch": 0.30442226850491294, + "grad_norm": 0.8347755074501038, + "learning_rate": 9.439595701674488e-05, + "loss": 1.0379, + "step": 47650 + }, + { + "epoch": 0.30448615565465165, + "grad_norm": 0.7987663149833679, + "learning_rate": 9.43936486602731e-05, + "loss": 0.926, + "step": 47660 + }, + { + "epoch": 0.3045500428043903, + "grad_norm": 0.762823760509491, + "learning_rate": 9.439133985671884e-05, + "loss": 1.031, + "step": 47670 + }, + { + "epoch": 0.304613929954129, + "grad_norm": 5.046191692352295, + "learning_rate": 9.438903060610539e-05, + "loss": 0.8839, + "step": 47680 + }, + { + "epoch": 0.3046778171038677, + "grad_norm": 0.8193703293800354, + "learning_rate": 9.438672090845599e-05, + "loss": 0.9656, + "step": 47690 + }, + { + "epoch": 0.3047417042536064, + "grad_norm": 1.1891075372695923, + "learning_rate": 9.438441076379395e-05, + "loss": 0.8131, + "step": 47700 + }, + { + "epoch": 
0.3048055914033451, + "grad_norm": 0.6901410222053528, + "learning_rate": 9.438210017214245e-05, + "loss": 0.9052, + "step": 47710 + }, + { + "epoch": 0.30486947855308383, + "grad_norm": 0.8975858092308044, + "learning_rate": 9.437978913352483e-05, + "loss": 0.9107, + "step": 47720 + }, + { + "epoch": 0.30493336570282253, + "grad_norm": 0.71076500415802, + "learning_rate": 9.437747764796432e-05, + "loss": 0.9106, + "step": 47730 + }, + { + "epoch": 0.30499725285256124, + "grad_norm": 0.6818621158599854, + "learning_rate": 9.437516571548424e-05, + "loss": 1.0038, + "step": 47740 + }, + { + "epoch": 0.30506114000229995, + "grad_norm": 1.3536254167556763, + "learning_rate": 9.437285333610784e-05, + "loss": 1.0431, + "step": 47750 + }, + { + "epoch": 0.30512502715203865, + "grad_norm": 0.7278540730476379, + "learning_rate": 9.437054050985842e-05, + "loss": 0.67, + "step": 47760 + }, + { + "epoch": 0.30518891430177736, + "grad_norm": 0.8322495222091675, + "learning_rate": 9.436822723675926e-05, + "loss": 0.8593, + "step": 47770 + }, + { + "epoch": 0.30525280145151606, + "grad_norm": 0.8993383646011353, + "learning_rate": 9.436591351683368e-05, + "loss": 0.9672, + "step": 47780 + }, + { + "epoch": 0.30531668860125477, + "grad_norm": 0.9851189851760864, + "learning_rate": 9.436359935010498e-05, + "loss": 0.775, + "step": 47790 + }, + { + "epoch": 0.3053805757509934, + "grad_norm": 0.7736380696296692, + "learning_rate": 9.436128473659644e-05, + "loss": 0.838, + "step": 47800 + }, + { + "epoch": 0.3054444629007321, + "grad_norm": 0.7815120816230774, + "learning_rate": 9.43589696763314e-05, + "loss": 0.6829, + "step": 47810 + }, + { + "epoch": 0.30550835005047083, + "grad_norm": 0.9725918769836426, + "learning_rate": 9.435665416933315e-05, + "loss": 0.7912, + "step": 47820 + }, + { + "epoch": 0.30557223720020954, + "grad_norm": 0.9616202712059021, + "learning_rate": 9.4354338215625e-05, + "loss": 0.8527, + "step": 47830 + }, + { + "epoch": 0.30563612434994825, + "grad_norm": 
0.7812166810035706, + "learning_rate": 9.435202181523031e-05, + "loss": 0.5296, + "step": 47840 + }, + { + "epoch": 0.30570001149968695, + "grad_norm": 0.5943650603294373, + "learning_rate": 9.43497049681724e-05, + "loss": 0.982, + "step": 47850 + }, + { + "epoch": 0.30576389864942566, + "grad_norm": 1.0416995286941528, + "learning_rate": 9.434738767447458e-05, + "loss": 1.0534, + "step": 47860 + }, + { + "epoch": 0.30582778579916436, + "grad_norm": 0.5415847301483154, + "learning_rate": 9.43450699341602e-05, + "loss": 0.939, + "step": 47870 + }, + { + "epoch": 0.30589167294890307, + "grad_norm": 1.0292586088180542, + "learning_rate": 9.43427517472526e-05, + "loss": 1.0046, + "step": 47880 + }, + { + "epoch": 0.3059555600986418, + "grad_norm": 0.9215097427368164, + "learning_rate": 9.434043311377513e-05, + "loss": 0.977, + "step": 47890 + }, + { + "epoch": 0.3060194472483805, + "grad_norm": 0.45443835854530334, + "learning_rate": 9.433811403375114e-05, + "loss": 0.8409, + "step": 47900 + }, + { + "epoch": 0.3060833343981192, + "grad_norm": 1.0631471872329712, + "learning_rate": 9.433579450720398e-05, + "loss": 0.9141, + "step": 47910 + }, + { + "epoch": 0.30614722154785784, + "grad_norm": 1.0696526765823364, + "learning_rate": 9.433347453415702e-05, + "loss": 1.0058, + "step": 47920 + }, + { + "epoch": 0.30621110869759655, + "grad_norm": 1.055720329284668, + "learning_rate": 9.433115411463361e-05, + "loss": 0.7988, + "step": 47930 + }, + { + "epoch": 0.30627499584733525, + "grad_norm": 0.7248536944389343, + "learning_rate": 9.432883324865713e-05, + "loss": 0.9809, + "step": 47940 + }, + { + "epoch": 0.30633888299707396, + "grad_norm": 0.7505012154579163, + "learning_rate": 9.432651193625095e-05, + "loss": 1.0935, + "step": 47950 + }, + { + "epoch": 0.30640277014681266, + "grad_norm": 1.8567789793014526, + "learning_rate": 9.432419017743845e-05, + "loss": 0.8664, + "step": 47960 + }, + { + "epoch": 0.30646665729655137, + "grad_norm": 0.6086595058441162, + 
"learning_rate": 9.432186797224301e-05, + "loss": 1.1033, + "step": 47970 + }, + { + "epoch": 0.3065305444462901, + "grad_norm": 1.0013806819915771, + "learning_rate": 9.431954532068801e-05, + "loss": 0.8595, + "step": 47980 + }, + { + "epoch": 0.3065944315960288, + "grad_norm": 0.9920240640640259, + "learning_rate": 9.431722222279684e-05, + "loss": 0.9719, + "step": 47990 + }, + { + "epoch": 0.3066583187457675, + "grad_norm": 1.207126498222351, + "learning_rate": 9.43148986785929e-05, + "loss": 0.8834, + "step": 48000 + }, + { + "epoch": 0.3067222058955062, + "grad_norm": 0.6420753598213196, + "learning_rate": 9.431257468809961e-05, + "loss": 1.1191, + "step": 48010 + }, + { + "epoch": 0.3067860930452449, + "grad_norm": 0.9169009327888489, + "learning_rate": 9.431025025134036e-05, + "loss": 1.0277, + "step": 48020 + }, + { + "epoch": 0.3068499801949836, + "grad_norm": 1.1910425424575806, + "learning_rate": 9.430792536833855e-05, + "loss": 0.7912, + "step": 48030 + }, + { + "epoch": 0.30691386734472226, + "grad_norm": 1.0869101285934448, + "learning_rate": 9.43056000391176e-05, + "loss": 1.1964, + "step": 48040 + }, + { + "epoch": 0.30697775449446096, + "grad_norm": 0.8634042739868164, + "learning_rate": 9.430327426370091e-05, + "loss": 0.7144, + "step": 48050 + }, + { + "epoch": 0.30704164164419967, + "grad_norm": 1.0796973705291748, + "learning_rate": 9.430094804211195e-05, + "loss": 0.8565, + "step": 48060 + }, + { + "epoch": 0.3071055287939384, + "grad_norm": 0.871731698513031, + "learning_rate": 9.42986213743741e-05, + "loss": 0.9328, + "step": 48070 + }, + { + "epoch": 0.3071694159436771, + "grad_norm": 1.139355182647705, + "learning_rate": 9.429629426051081e-05, + "loss": 0.8468, + "step": 48080 + }, + { + "epoch": 0.3072333030934158, + "grad_norm": 0.9230227470397949, + "learning_rate": 9.429396670054551e-05, + "loss": 1.0648, + "step": 48090 + }, + { + "epoch": 0.3072971902431545, + "grad_norm": 0.9664996862411499, + "learning_rate": 9.429163869450166e-05, 
+ "loss": 0.9543, + "step": 48100 + }, + { + "epoch": 0.3073610773928932, + "grad_norm": 0.7057569026947021, + "learning_rate": 9.428931024240267e-05, + "loss": 0.978, + "step": 48110 + }, + { + "epoch": 0.3074249645426319, + "grad_norm": 0.6868560314178467, + "learning_rate": 9.428698134427202e-05, + "loss": 0.8156, + "step": 48120 + }, + { + "epoch": 0.3074888516923706, + "grad_norm": 1.3215396404266357, + "learning_rate": 9.428465200013317e-05, + "loss": 1.176, + "step": 48130 + }, + { + "epoch": 0.3075527388421093, + "grad_norm": 0.767733633518219, + "learning_rate": 9.428232221000954e-05, + "loss": 1.0589, + "step": 48140 + }, + { + "epoch": 0.307616625991848, + "grad_norm": 0.9023085832595825, + "learning_rate": 9.427999197392463e-05, + "loss": 0.861, + "step": 48150 + }, + { + "epoch": 0.3076805131415867, + "grad_norm": 0.7275156378746033, + "learning_rate": 9.427766129190189e-05, + "loss": 0.9598, + "step": 48160 + }, + { + "epoch": 0.3077444002913254, + "grad_norm": 1.1125576496124268, + "learning_rate": 9.427533016396479e-05, + "loss": 0.7333, + "step": 48170 + }, + { + "epoch": 0.3078082874410641, + "grad_norm": 2.008270025253296, + "learning_rate": 9.427299859013682e-05, + "loss": 1.134, + "step": 48180 + }, + { + "epoch": 0.3078721745908028, + "grad_norm": 0.5112677216529846, + "learning_rate": 9.427066657044144e-05, + "loss": 0.7755, + "step": 48190 + }, + { + "epoch": 0.3079360617405415, + "grad_norm": 0.8975897431373596, + "learning_rate": 9.426833410490215e-05, + "loss": 0.791, + "step": 48200 + }, + { + "epoch": 0.3079999488902802, + "grad_norm": 0.7356785535812378, + "learning_rate": 9.426600119354245e-05, + "loss": 0.8472, + "step": 48210 + }, + { + "epoch": 0.3080638360400189, + "grad_norm": 0.8324338793754578, + "learning_rate": 9.426366783638582e-05, + "loss": 0.9064, + "step": 48220 + }, + { + "epoch": 0.3081277231897576, + "grad_norm": 0.9921901226043701, + "learning_rate": 9.426133403345576e-05, + "loss": 0.9901, + "step": 48230 + }, + { + 
"epoch": 0.3081916103394963, + "grad_norm": 1.4877877235412598, + "learning_rate": 9.425899978477577e-05, + "loss": 0.7397, + "step": 48240 + }, + { + "epoch": 0.30825549748923503, + "grad_norm": 1.276802897453308, + "learning_rate": 9.425666509036936e-05, + "loss": 0.9455, + "step": 48250 + }, + { + "epoch": 0.30831938463897374, + "grad_norm": 1.7324568033218384, + "learning_rate": 9.425432995026005e-05, + "loss": 0.8014, + "step": 48260 + }, + { + "epoch": 0.30838327178871244, + "grad_norm": 1.0053337812423706, + "learning_rate": 9.425199436447135e-05, + "loss": 1.1581, + "step": 48270 + }, + { + "epoch": 0.3084471589384511, + "grad_norm": 1.1063930988311768, + "learning_rate": 9.424965833302679e-05, + "loss": 1.1048, + "step": 48280 + }, + { + "epoch": 0.3085110460881898, + "grad_norm": 1.1341273784637451, + "learning_rate": 9.424732185594989e-05, + "loss": 1.0374, + "step": 48290 + }, + { + "epoch": 0.3085749332379285, + "grad_norm": 0.6320347785949707, + "learning_rate": 9.424498493326417e-05, + "loss": 0.9549, + "step": 48300 + }, + { + "epoch": 0.3086388203876672, + "grad_norm": 0.48150819540023804, + "learning_rate": 9.424264756499317e-05, + "loss": 0.8902, + "step": 48310 + }, + { + "epoch": 0.3087027075374059, + "grad_norm": 0.6222485899925232, + "learning_rate": 9.424030975116045e-05, + "loss": 0.8407, + "step": 48320 + }, + { + "epoch": 0.3087665946871446, + "grad_norm": 0.6241324543952942, + "learning_rate": 9.423797149178952e-05, + "loss": 0.8781, + "step": 48330 + }, + { + "epoch": 0.30883048183688333, + "grad_norm": 0.8804172277450562, + "learning_rate": 9.423563278690397e-05, + "loss": 1.0311, + "step": 48340 + }, + { + "epoch": 0.30889436898662204, + "grad_norm": 1.6444405317306519, + "learning_rate": 9.423329363652731e-05, + "loss": 0.7865, + "step": 48350 + }, + { + "epoch": 0.30895825613636074, + "grad_norm": 0.8753210306167603, + "learning_rate": 9.423095404068312e-05, + "loss": 1.0464, + "step": 48360 + }, + { + "epoch": 0.30902214328609945, 
+ "grad_norm": 0.8216173052787781, + "learning_rate": 9.422861399939495e-05, + "loss": 0.8443, + "step": 48370 + }, + { + "epoch": 0.30908603043583815, + "grad_norm": 1.1134603023529053, + "learning_rate": 9.422627351268638e-05, + "loss": 1.1639, + "step": 48380 + }, + { + "epoch": 0.30914991758557686, + "grad_norm": 0.8974233269691467, + "learning_rate": 9.422393258058098e-05, + "loss": 0.8378, + "step": 48390 + }, + { + "epoch": 0.3092138047353155, + "grad_norm": 0.8469827175140381, + "learning_rate": 9.422159120310232e-05, + "loss": 0.8669, + "step": 48400 + }, + { + "epoch": 0.3092776918850542, + "grad_norm": 0.701692521572113, + "learning_rate": 9.421924938027397e-05, + "loss": 0.9278, + "step": 48410 + }, + { + "epoch": 0.3093415790347929, + "grad_norm": 0.7484457492828369, + "learning_rate": 9.421690711211952e-05, + "loss": 0.9045, + "step": 48420 + }, + { + "epoch": 0.30940546618453163, + "grad_norm": 0.7951037883758545, + "learning_rate": 9.421456439866257e-05, + "loss": 0.7149, + "step": 48430 + }, + { + "epoch": 0.30946935333427034, + "grad_norm": 0.6220124363899231, + "learning_rate": 9.421222123992671e-05, + "loss": 0.8828, + "step": 48440 + }, + { + "epoch": 0.30953324048400904, + "grad_norm": 0.8480835556983948, + "learning_rate": 9.420987763593554e-05, + "loss": 0.8722, + "step": 48450 + }, + { + "epoch": 0.30959712763374775, + "grad_norm": 0.8057517409324646, + "learning_rate": 9.420753358671264e-05, + "loss": 1.0229, + "step": 48460 + }, + { + "epoch": 0.30966101478348645, + "grad_norm": 0.7954405546188354, + "learning_rate": 9.420518909228164e-05, + "loss": 0.7711, + "step": 48470 + }, + { + "epoch": 0.30972490193322516, + "grad_norm": 1.0141898393630981, + "learning_rate": 9.420284415266613e-05, + "loss": 0.7272, + "step": 48480 + }, + { + "epoch": 0.30978878908296387, + "grad_norm": 0.8430118560791016, + "learning_rate": 9.420049876788974e-05, + "loss": 0.9584, + "step": 48490 + }, + { + "epoch": 0.3098526762327026, + "grad_norm": 
0.703395426273346, + "learning_rate": 9.419815293797611e-05, + "loss": 0.7518, + "step": 48500 + }, + { + "epoch": 0.3099165633824413, + "grad_norm": 0.6851431727409363, + "learning_rate": 9.419580666294883e-05, + "loss": 0.6678, + "step": 48510 + }, + { + "epoch": 0.30998045053217993, + "grad_norm": 0.6793634295463562, + "learning_rate": 9.419345994283153e-05, + "loss": 0.902, + "step": 48520 + }, + { + "epoch": 0.31004433768191864, + "grad_norm": 0.7479285597801208, + "learning_rate": 9.419111277764788e-05, + "loss": 0.8441, + "step": 48530 + }, + { + "epoch": 0.31010822483165734, + "grad_norm": 1.3613170385360718, + "learning_rate": 9.418876516742148e-05, + "loss": 1.0158, + "step": 48540 + }, + { + "epoch": 0.31017211198139605, + "grad_norm": 1.1104499101638794, + "learning_rate": 9.418665193772571e-05, + "loss": 1.0577, + "step": 48550 + }, + { + "epoch": 0.31023599913113475, + "grad_norm": 1.6109135150909424, + "learning_rate": 9.418430348198326e-05, + "loss": 0.8694, + "step": 48560 + }, + { + "epoch": 0.31029988628087346, + "grad_norm": 1.1885582208633423, + "learning_rate": 9.418195458126664e-05, + "loss": 0.8233, + "step": 48570 + }, + { + "epoch": 0.31036377343061217, + "grad_norm": 1.3938848972320557, + "learning_rate": 9.41796052355995e-05, + "loss": 0.824, + "step": 48580 + }, + { + "epoch": 0.3104276605803509, + "grad_norm": 0.72528076171875, + "learning_rate": 9.417725544500552e-05, + "loss": 0.912, + "step": 48590 + }, + { + "epoch": 0.3104915477300896, + "grad_norm": 0.7403073310852051, + "learning_rate": 9.417490520950838e-05, + "loss": 0.8565, + "step": 48600 + }, + { + "epoch": 0.3105554348798283, + "grad_norm": 1.263451337814331, + "learning_rate": 9.417255452913171e-05, + "loss": 0.9227, + "step": 48610 + }, + { + "epoch": 0.310619322029567, + "grad_norm": 1.1956753730773926, + "learning_rate": 9.417020340389922e-05, + "loss": 0.9026, + "step": 48620 + }, + { + "epoch": 0.3106832091793057, + "grad_norm": 0.985579252243042, + "learning_rate": 
9.416785183383454e-05, + "loss": 0.9717, + "step": 48630 + }, + { + "epoch": 0.3107470963290444, + "grad_norm": 1.1130268573760986, + "learning_rate": 9.416549981896141e-05, + "loss": 0.6567, + "step": 48640 + }, + { + "epoch": 0.31081098347878305, + "grad_norm": 0.6500244736671448, + "learning_rate": 9.416314735930347e-05, + "loss": 1.0039, + "step": 48650 + }, + { + "epoch": 0.31087487062852176, + "grad_norm": 1.0111130475997925, + "learning_rate": 9.416079445488444e-05, + "loss": 1.0116, + "step": 48660 + }, + { + "epoch": 0.31093875777826047, + "grad_norm": 0.8683274984359741, + "learning_rate": 9.4158441105728e-05, + "loss": 0.9201, + "step": 48670 + }, + { + "epoch": 0.3110026449279992, + "grad_norm": 0.9501914381980896, + "learning_rate": 9.415608731185786e-05, + "loss": 1.0651, + "step": 48680 + }, + { + "epoch": 0.3110665320777379, + "grad_norm": 1.045398473739624, + "learning_rate": 9.415373307329771e-05, + "loss": 1.1339, + "step": 48690 + }, + { + "epoch": 0.3111304192274766, + "grad_norm": 0.6909173727035522, + "learning_rate": 9.415137839007127e-05, + "loss": 0.8049, + "step": 48700 + }, + { + "epoch": 0.3111943063772153, + "grad_norm": 0.929655909538269, + "learning_rate": 9.414902326220225e-05, + "loss": 0.8028, + "step": 48710 + }, + { + "epoch": 0.311258193526954, + "grad_norm": 0.9743746519088745, + "learning_rate": 9.414666768971438e-05, + "loss": 0.8393, + "step": 48720 + }, + { + "epoch": 0.3113220806766927, + "grad_norm": 0.9275550246238708, + "learning_rate": 9.414431167263139e-05, + "loss": 0.9987, + "step": 48730 + }, + { + "epoch": 0.3113859678264314, + "grad_norm": 0.9921037554740906, + "learning_rate": 9.414195521097697e-05, + "loss": 0.87, + "step": 48740 + }, + { + "epoch": 0.3114498549761701, + "grad_norm": 0.6033504605293274, + "learning_rate": 9.413959830477488e-05, + "loss": 1.0159, + "step": 48750 + }, + { + "epoch": 0.3115137421259088, + "grad_norm": 0.709073543548584, + "learning_rate": 9.413724095404884e-05, + "loss": 0.8106, 
+ "step": 48760 + }, + { + "epoch": 0.31157762927564747, + "grad_norm": 0.8493859171867371, + "learning_rate": 9.413488315882261e-05, + "loss": 0.7047, + "step": 48770 + }, + { + "epoch": 0.3116415164253862, + "grad_norm": 1.3952586650848389, + "learning_rate": 9.413252491911993e-05, + "loss": 0.8132, + "step": 48780 + }, + { + "epoch": 0.3117054035751249, + "grad_norm": 1.9814708232879639, + "learning_rate": 9.413016623496452e-05, + "loss": 0.709, + "step": 48790 + }, + { + "epoch": 0.3117692907248636, + "grad_norm": 0.6453489661216736, + "learning_rate": 9.412780710638017e-05, + "loss": 0.6557, + "step": 48800 + }, + { + "epoch": 0.3118331778746023, + "grad_norm": 0.954566478729248, + "learning_rate": 9.412544753339063e-05, + "loss": 0.9015, + "step": 48810 + }, + { + "epoch": 0.311897065024341, + "grad_norm": 1.347273588180542, + "learning_rate": 9.412308751601967e-05, + "loss": 0.7975, + "step": 48820 + }, + { + "epoch": 0.3119609521740797, + "grad_norm": 1.4250802993774414, + "learning_rate": 9.412072705429103e-05, + "loss": 0.9073, + "step": 48830 + }, + { + "epoch": 0.3120248393238184, + "grad_norm": 0.8338466286659241, + "learning_rate": 9.41183661482285e-05, + "loss": 1.0889, + "step": 48840 + }, + { + "epoch": 0.3120887264735571, + "grad_norm": 0.8401825428009033, + "learning_rate": 9.411600479785586e-05, + "loss": 0.9772, + "step": 48850 + }, + { + "epoch": 0.3121526136232958, + "grad_norm": 0.7773457169532776, + "learning_rate": 9.411364300319688e-05, + "loss": 0.6319, + "step": 48860 + }, + { + "epoch": 0.31221650077303453, + "grad_norm": 0.6651679873466492, + "learning_rate": 9.411128076427536e-05, + "loss": 0.7987, + "step": 48870 + }, + { + "epoch": 0.31228038792277324, + "grad_norm": 1.0372742414474487, + "learning_rate": 9.410891808111508e-05, + "loss": 0.8707, + "step": 48880 + }, + { + "epoch": 0.3123442750725119, + "grad_norm": 1.1569018363952637, + "learning_rate": 9.410655495373983e-05, + "loss": 0.875, + "step": 48890 + }, + { + "epoch": 
0.3124081622222506, + "grad_norm": 0.7065810561180115, + "learning_rate": 9.41041913821734e-05, + "loss": 1.1122, + "step": 48900 + }, + { + "epoch": 0.3124720493719893, + "grad_norm": 0.7592551112174988, + "learning_rate": 9.410182736643964e-05, + "loss": 0.9027, + "step": 48910 + }, + { + "epoch": 0.312535936521728, + "grad_norm": 0.6045570969581604, + "learning_rate": 9.40994629065623e-05, + "loss": 1.0845, + "step": 48920 + }, + { + "epoch": 0.3125998236714667, + "grad_norm": 1.0101828575134277, + "learning_rate": 9.409709800256523e-05, + "loss": 0.7681, + "step": 48930 + }, + { + "epoch": 0.3126637108212054, + "grad_norm": 0.5947315692901611, + "learning_rate": 9.409473265447224e-05, + "loss": 0.8502, + "step": 48940 + }, + { + "epoch": 0.3127275979709441, + "grad_norm": 0.9828523397445679, + "learning_rate": 9.409236686230713e-05, + "loss": 0.8704, + "step": 48950 + }, + { + "epoch": 0.31279148512068283, + "grad_norm": 0.731641948223114, + "learning_rate": 9.409000062609374e-05, + "loss": 1.0281, + "step": 48960 + }, + { + "epoch": 0.31285537227042154, + "grad_norm": 0.9199566841125488, + "learning_rate": 9.408763394585592e-05, + "loss": 0.95, + "step": 48970 + }, + { + "epoch": 0.31291925942016025, + "grad_norm": 1.1518223285675049, + "learning_rate": 9.408526682161746e-05, + "loss": 0.8526, + "step": 48980 + }, + { + "epoch": 0.31298314656989895, + "grad_norm": 0.8787213563919067, + "learning_rate": 9.408289925340224e-05, + "loss": 0.9255, + "step": 48990 + }, + { + "epoch": 0.31304703371963766, + "grad_norm": 0.9983569979667664, + "learning_rate": 9.408053124123408e-05, + "loss": 1.1062, + "step": 49000 + }, + { + "epoch": 0.3131109208693763, + "grad_norm": 1.392701268196106, + "learning_rate": 9.407816278513683e-05, + "loss": 0.6401, + "step": 49010 + }, + { + "epoch": 0.313174808019115, + "grad_norm": 0.5991133451461792, + "learning_rate": 9.407579388513434e-05, + "loss": 0.8792, + "step": 49020 + }, + { + "epoch": 0.3132386951688537, + "grad_norm": 
0.7458668947219849, + "learning_rate": 9.40734245412505e-05, + "loss": 1.1298, + "step": 49030 + }, + { + "epoch": 0.3133025823185924, + "grad_norm": 1.072922706604004, + "learning_rate": 9.407105475350914e-05, + "loss": 0.8286, + "step": 49040 + }, + { + "epoch": 0.31336646946833113, + "grad_norm": 2.292825698852539, + "learning_rate": 9.406868452193411e-05, + "loss": 1.0391, + "step": 49050 + }, + { + "epoch": 0.31343035661806984, + "grad_norm": 0.8459042906761169, + "learning_rate": 9.406631384654934e-05, + "loss": 0.8123, + "step": 49060 + }, + { + "epoch": 0.31349424376780854, + "grad_norm": 1.0259894132614136, + "learning_rate": 9.406394272737863e-05, + "loss": 0.8378, + "step": 49070 + }, + { + "epoch": 0.31355813091754725, + "grad_norm": 0.6724763512611389, + "learning_rate": 9.406157116444592e-05, + "loss": 0.7612, + "step": 49080 + }, + { + "epoch": 0.31362201806728596, + "grad_norm": 0.6208413243293762, + "learning_rate": 9.405919915777506e-05, + "loss": 0.8256, + "step": 49090 + }, + { + "epoch": 0.31368590521702466, + "grad_norm": 0.5845530033111572, + "learning_rate": 9.405682670738995e-05, + "loss": 0.8746, + "step": 49100 + }, + { + "epoch": 0.31374979236676337, + "grad_norm": 1.0131624937057495, + "learning_rate": 9.405445381331449e-05, + "loss": 0.7601, + "step": 49110 + }, + { + "epoch": 0.3138136795165021, + "grad_norm": 0.8902072310447693, + "learning_rate": 9.405208047557255e-05, + "loss": 1.0531, + "step": 49120 + }, + { + "epoch": 0.3138775666662407, + "grad_norm": 1.196207880973816, + "learning_rate": 9.404970669418804e-05, + "loss": 1.023, + "step": 49130 + }, + { + "epoch": 0.31394145381597943, + "grad_norm": 0.9805600643157959, + "learning_rate": 9.404733246918489e-05, + "loss": 0.7877, + "step": 49140 + }, + { + "epoch": 0.31400534096571814, + "grad_norm": 0.7303978204727173, + "learning_rate": 9.404495780058701e-05, + "loss": 0.8739, + "step": 49150 + }, + { + "epoch": 0.31406922811545684, + "grad_norm": 0.754725456237793, + 
"learning_rate": 9.404258268841827e-05, + "loss": 0.8963, + "step": 49160 + }, + { + "epoch": 0.31413311526519555, + "grad_norm": 0.8432728052139282, + "learning_rate": 9.404020713270265e-05, + "loss": 0.8278, + "step": 49170 + }, + { + "epoch": 0.31419700241493426, + "grad_norm": 1.7309197187423706, + "learning_rate": 9.403783113346402e-05, + "loss": 0.7055, + "step": 49180 + }, + { + "epoch": 0.31426088956467296, + "grad_norm": 1.0485713481903076, + "learning_rate": 9.403545469072636e-05, + "loss": 1.0848, + "step": 49190 + }, + { + "epoch": 0.31432477671441167, + "grad_norm": 1.0555837154388428, + "learning_rate": 9.403307780451356e-05, + "loss": 0.9272, + "step": 49200 + }, + { + "epoch": 0.3143886638641504, + "grad_norm": 0.642327070236206, + "learning_rate": 9.403070047484957e-05, + "loss": 1.0311, + "step": 49210 + }, + { + "epoch": 0.3144525510138891, + "grad_norm": 1.0337790250778198, + "learning_rate": 9.402832270175833e-05, + "loss": 0.6988, + "step": 49220 + }, + { + "epoch": 0.3145164381636278, + "grad_norm": 0.6804295182228088, + "learning_rate": 9.40259444852638e-05, + "loss": 0.9084, + "step": 49230 + }, + { + "epoch": 0.3145803253133665, + "grad_norm": 0.9639153480529785, + "learning_rate": 9.402356582538991e-05, + "loss": 0.9531, + "step": 49240 + }, + { + "epoch": 0.31464421246310514, + "grad_norm": 1.2512791156768799, + "learning_rate": 9.402118672216064e-05, + "loss": 0.8388, + "step": 49250 + }, + { + "epoch": 0.31470809961284385, + "grad_norm": 1.065996527671814, + "learning_rate": 9.401880717559993e-05, + "loss": 1.0074, + "step": 49260 + }, + { + "epoch": 0.31477198676258256, + "grad_norm": 0.7025091052055359, + "learning_rate": 9.401642718573175e-05, + "loss": 0.9072, + "step": 49270 + }, + { + "epoch": 0.31483587391232126, + "grad_norm": 0.6397770047187805, + "learning_rate": 9.401404675258006e-05, + "loss": 0.9634, + "step": 49280 + }, + { + "epoch": 0.31489976106205997, + "grad_norm": 0.938177764415741, + "learning_rate": 
9.401166587616885e-05, + "loss": 1.015, + "step": 49290 + }, + { + "epoch": 0.3149636482117987, + "grad_norm": 0.6089216470718384, + "learning_rate": 9.400928455652209e-05, + "loss": 0.8248, + "step": 49300 + }, + { + "epoch": 0.3150275353615374, + "grad_norm": 0.5597997307777405, + "learning_rate": 9.400690279366377e-05, + "loss": 0.7582, + "step": 49310 + }, + { + "epoch": 0.3150914225112761, + "grad_norm": 1.5323448181152344, + "learning_rate": 9.400452058761784e-05, + "loss": 0.8959, + "step": 49320 + }, + { + "epoch": 0.3151553096610148, + "grad_norm": 0.6819522976875305, + "learning_rate": 9.400213793840835e-05, + "loss": 0.8379, + "step": 49330 + }, + { + "epoch": 0.3152191968107535, + "grad_norm": 1.2610814571380615, + "learning_rate": 9.399975484605925e-05, + "loss": 1.0078, + "step": 49340 + }, + { + "epoch": 0.3152830839604922, + "grad_norm": 0.9983810782432556, + "learning_rate": 9.399737131059454e-05, + "loss": 0.9125, + "step": 49350 + }, + { + "epoch": 0.3153469711102309, + "grad_norm": 0.8259413838386536, + "learning_rate": 9.399498733203827e-05, + "loss": 0.921, + "step": 49360 + }, + { + "epoch": 0.31541085825996956, + "grad_norm": 1.0133284330368042, + "learning_rate": 9.399260291041439e-05, + "loss": 0.9468, + "step": 49370 + }, + { + "epoch": 0.31547474540970827, + "grad_norm": 0.9398267269134521, + "learning_rate": 9.399021804574694e-05, + "loss": 0.8747, + "step": 49380 + }, + { + "epoch": 0.315538632559447, + "grad_norm": 0.7082047462463379, + "learning_rate": 9.398783273805995e-05, + "loss": 0.8867, + "step": 49390 + }, + { + "epoch": 0.3156025197091857, + "grad_norm": 0.8589842915534973, + "learning_rate": 9.398544698737743e-05, + "loss": 0.713, + "step": 49400 + }, + { + "epoch": 0.3156664068589244, + "grad_norm": 1.1471587419509888, + "learning_rate": 9.398306079372339e-05, + "loss": 0.9788, + "step": 49410 + }, + { + "epoch": 0.3157302940086631, + "grad_norm": 0.49813616275787354, + "learning_rate": 9.398067415712188e-05, + "loss": 
0.7366, + "step": 49420 + }, + { + "epoch": 0.3157941811584018, + "grad_norm": 1.774720311164856, + "learning_rate": 9.397828707759695e-05, + "loss": 0.685, + "step": 49430 + }, + { + "epoch": 0.3158580683081405, + "grad_norm": 1.121861219406128, + "learning_rate": 9.397589955517261e-05, + "loss": 0.9251, + "step": 49440 + }, + { + "epoch": 0.3159219554578792, + "grad_norm": 0.8228227496147156, + "learning_rate": 9.397351158987293e-05, + "loss": 0.9838, + "step": 49450 + }, + { + "epoch": 0.3159858426076179, + "grad_norm": 0.9918887615203857, + "learning_rate": 9.397112318172192e-05, + "loss": 0.8684, + "step": 49460 + }, + { + "epoch": 0.3160497297573566, + "grad_norm": 0.8907731175422668, + "learning_rate": 9.396873433074367e-05, + "loss": 0.8603, + "step": 49470 + }, + { + "epoch": 0.31611361690709533, + "grad_norm": 0.9690269827842712, + "learning_rate": 9.396634503696225e-05, + "loss": 0.931, + "step": 49480 + }, + { + "epoch": 0.31617750405683404, + "grad_norm": 0.7284924983978271, + "learning_rate": 9.396395530040167e-05, + "loss": 0.8224, + "step": 49490 + }, + { + "epoch": 0.3162413912065727, + "grad_norm": 0.8061665296554565, + "learning_rate": 9.396156512108603e-05, + "loss": 0.8418, + "step": 49500 + }, + { + "epoch": 0.3163052783563114, + "grad_norm": 0.7680371999740601, + "learning_rate": 9.39591744990394e-05, + "loss": 0.5664, + "step": 49510 + }, + { + "epoch": 0.3163691655060501, + "grad_norm": 0.8830310702323914, + "learning_rate": 9.395678343428586e-05, + "loss": 0.7123, + "step": 49520 + }, + { + "epoch": 0.3164330526557888, + "grad_norm": 0.9915767908096313, + "learning_rate": 9.395439192684947e-05, + "loss": 0.9406, + "step": 49530 + }, + { + "epoch": 0.3164969398055275, + "grad_norm": 1.3470020294189453, + "learning_rate": 9.395199997675435e-05, + "loss": 0.8661, + "step": 49540 + }, + { + "epoch": 0.3165608269552662, + "grad_norm": 0.934509813785553, + "learning_rate": 9.394960758402455e-05, + "loss": 1.0339, + "step": 49550 + }, + { + 
"epoch": 0.3166247141050049, + "grad_norm": 0.5762020349502563, + "learning_rate": 9.394721474868418e-05, + "loss": 0.9045, + "step": 49560 + }, + { + "epoch": 0.31668860125474363, + "grad_norm": 0.8978357315063477, + "learning_rate": 9.394482147075734e-05, + "loss": 0.8767, + "step": 49570 + }, + { + "epoch": 0.31675248840448234, + "grad_norm": 0.7672891020774841, + "learning_rate": 9.394242775026812e-05, + "loss": 0.9131, + "step": 49580 + }, + { + "epoch": 0.31681637555422104, + "grad_norm": 0.9509188532829285, + "learning_rate": 9.394003358724067e-05, + "loss": 0.9731, + "step": 49590 + }, + { + "epoch": 0.31688026270395975, + "grad_norm": 0.6911554336547852, + "learning_rate": 9.393763898169903e-05, + "loss": 0.7586, + "step": 49600 + }, + { + "epoch": 0.31694414985369845, + "grad_norm": 0.9574739933013916, + "learning_rate": 9.393524393366737e-05, + "loss": 0.9106, + "step": 49610 + }, + { + "epoch": 0.3170080370034371, + "grad_norm": 0.7374638915061951, + "learning_rate": 9.393284844316979e-05, + "loss": 0.7652, + "step": 49620 + }, + { + "epoch": 0.3170719241531758, + "grad_norm": 0.971748948097229, + "learning_rate": 9.393045251023042e-05, + "loss": 0.9203, + "step": 49630 + }, + { + "epoch": 0.3171358113029145, + "grad_norm": 0.8712042570114136, + "learning_rate": 9.392805613487339e-05, + "loss": 0.6788, + "step": 49640 + }, + { + "epoch": 0.3171996984526532, + "grad_norm": 0.782733142375946, + "learning_rate": 9.392565931712282e-05, + "loss": 0.936, + "step": 49650 + }, + { + "epoch": 0.31726358560239193, + "grad_norm": 0.6605043411254883, + "learning_rate": 9.392326205700288e-05, + "loss": 0.8659, + "step": 49660 + }, + { + "epoch": 0.31732747275213063, + "grad_norm": 0.831889808177948, + "learning_rate": 9.392086435453769e-05, + "loss": 0.8596, + "step": 49670 + }, + { + "epoch": 0.31739135990186934, + "grad_norm": 0.6261256337165833, + "learning_rate": 9.391846620975139e-05, + "loss": 0.9768, + "step": 49680 + }, + { + "epoch": 0.31745524705160805, + 
"grad_norm": 0.8194597959518433, + "learning_rate": 9.391606762266814e-05, + "loss": 0.7694, + "step": 49690 + }, + { + "epoch": 0.31751913420134675, + "grad_norm": 0.9790688157081604, + "learning_rate": 9.39136685933121e-05, + "loss": 1.0057, + "step": 49700 + }, + { + "epoch": 0.31758302135108546, + "grad_norm": 0.8990379571914673, + "learning_rate": 9.391126912170742e-05, + "loss": 0.9249, + "step": 49710 + }, + { + "epoch": 0.31764690850082417, + "grad_norm": 0.9338327050209045, + "learning_rate": 9.390886920787828e-05, + "loss": 0.99, + "step": 49720 + }, + { + "epoch": 0.31771079565056287, + "grad_norm": 0.7837555408477783, + "learning_rate": 9.390646885184884e-05, + "loss": 0.966, + "step": 49730 + }, + { + "epoch": 0.3177746828003015, + "grad_norm": 0.7973876595497131, + "learning_rate": 9.390406805364327e-05, + "loss": 0.9153, + "step": 49740 + }, + { + "epoch": 0.31783856995004023, + "grad_norm": 0.8525822758674622, + "learning_rate": 9.390166681328575e-05, + "loss": 0.8138, + "step": 49750 + }, + { + "epoch": 0.31790245709977893, + "grad_norm": 1.3615291118621826, + "learning_rate": 9.389926513080047e-05, + "loss": 0.887, + "step": 49760 + }, + { + "epoch": 0.31796634424951764, + "grad_norm": 1.1276623010635376, + "learning_rate": 9.389686300621162e-05, + "loss": 0.9642, + "step": 49770 + }, + { + "epoch": 0.31803023139925635, + "grad_norm": 0.7759473323822021, + "learning_rate": 9.389446043954336e-05, + "loss": 0.7954, + "step": 49780 + }, + { + "epoch": 0.31809411854899505, + "grad_norm": 0.9552074074745178, + "learning_rate": 9.389205743081992e-05, + "loss": 0.8348, + "step": 49790 + }, + { + "epoch": 0.31815800569873376, + "grad_norm": 0.6165658831596375, + "learning_rate": 9.388965398006549e-05, + "loss": 0.9181, + "step": 49800 + }, + { + "epoch": 0.31822189284847247, + "grad_norm": 0.7126082181930542, + "learning_rate": 9.388725008730428e-05, + "loss": 1.1795, + "step": 49810 + }, + { + "epoch": 0.31828577999821117, + "grad_norm": 
0.8872388005256653, + "learning_rate": 9.388484575256049e-05, + "loss": 0.9206, + "step": 49820 + }, + { + "epoch": 0.3183496671479499, + "grad_norm": 0.8513489365577698, + "learning_rate": 9.388244097585835e-05, + "loss": 1.0587, + "step": 49830 + }, + { + "epoch": 0.3184135542976886, + "grad_norm": 0.6428020596504211, + "learning_rate": 9.388003575722204e-05, + "loss": 0.7731, + "step": 49840 + }, + { + "epoch": 0.3184774414474273, + "grad_norm": 1.0599162578582764, + "learning_rate": 9.387763009667583e-05, + "loss": 0.8681, + "step": 49850 + }, + { + "epoch": 0.31854132859716594, + "grad_norm": 0.6444383859634399, + "learning_rate": 9.387522399424391e-05, + "loss": 0.79, + "step": 49860 + }, + { + "epoch": 0.31860521574690465, + "grad_norm": 0.6530598998069763, + "learning_rate": 9.387281744995053e-05, + "loss": 1.1558, + "step": 49870 + }, + { + "epoch": 0.31866910289664335, + "grad_norm": 0.7804102897644043, + "learning_rate": 9.387041046381994e-05, + "loss": 0.9238, + "step": 49880 + }, + { + "epoch": 0.31873299004638206, + "grad_norm": 0.8872278332710266, + "learning_rate": 9.386800303587635e-05, + "loss": 0.7599, + "step": 49890 + }, + { + "epoch": 0.31879687719612076, + "grad_norm": 0.5427918434143066, + "learning_rate": 9.386559516614401e-05, + "loss": 0.8869, + "step": 49900 + }, + { + "epoch": 0.31886076434585947, + "grad_norm": 0.8014676570892334, + "learning_rate": 9.386318685464719e-05, + "loss": 0.885, + "step": 49910 + }, + { + "epoch": 0.3189246514955982, + "grad_norm": 0.7238976359367371, + "learning_rate": 9.386077810141013e-05, + "loss": 0.8402, + "step": 49920 + }, + { + "epoch": 0.3189885386453369, + "grad_norm": 1.0209499597549438, + "learning_rate": 9.385836890645708e-05, + "loss": 0.7001, + "step": 49930 + }, + { + "epoch": 0.3190524257950756, + "grad_norm": 0.978877067565918, + "learning_rate": 9.385595926981232e-05, + "loss": 0.8989, + "step": 49940 + }, + { + "epoch": 0.3191163129448143, + "grad_norm": 1.336305856704712, + 
"learning_rate": 9.385354919150011e-05, + "loss": 1.0186, + "step": 49950 + }, + { + "epoch": 0.319180200094553, + "grad_norm": 0.9015640616416931, + "learning_rate": 9.385113867154473e-05, + "loss": 0.9387, + "step": 49960 + }, + { + "epoch": 0.3192440872442917, + "grad_norm": 1.013190507888794, + "learning_rate": 9.384872770997043e-05, + "loss": 1.1722, + "step": 49970 + }, + { + "epoch": 0.31930797439403036, + "grad_norm": 1.667493224143982, + "learning_rate": 9.384631630680152e-05, + "loss": 0.9188, + "step": 49980 + }, + { + "epoch": 0.31937186154376906, + "grad_norm": 0.7360647320747375, + "learning_rate": 9.384390446206226e-05, + "loss": 0.8126, + "step": 49990 + }, + { + "epoch": 0.31943574869350777, + "grad_norm": 1.0479674339294434, + "learning_rate": 9.384149217577695e-05, + "loss": 0.8874, + "step": 50000 + }, + { + "epoch": 0.3194996358432465, + "grad_norm": 0.7030632495880127, + "learning_rate": 9.38390794479699e-05, + "loss": 1.0218, + "step": 50010 + }, + { + "epoch": 0.3195635229929852, + "grad_norm": 0.8451624512672424, + "learning_rate": 9.383666627866539e-05, + "loss": 0.8403, + "step": 50020 + }, + { + "epoch": 0.3196274101427239, + "grad_norm": 0.5618483424186707, + "learning_rate": 9.383425266788772e-05, + "loss": 0.7409, + "step": 50030 + }, + { + "epoch": 0.3196912972924626, + "grad_norm": 0.7848330736160278, + "learning_rate": 9.383183861566121e-05, + "loss": 0.9107, + "step": 50040 + }, + { + "epoch": 0.3197551844422013, + "grad_norm": 0.7237081527709961, + "learning_rate": 9.382942412201016e-05, + "loss": 0.6871, + "step": 50050 + }, + { + "epoch": 0.31981907159194, + "grad_norm": 1.093504786491394, + "learning_rate": 9.382700918695889e-05, + "loss": 0.8252, + "step": 50060 + }, + { + "epoch": 0.3198829587416787, + "grad_norm": 0.733311653137207, + "learning_rate": 9.382459381053173e-05, + "loss": 0.7627, + "step": 50070 + }, + { + "epoch": 0.3199468458914174, + "grad_norm": 0.8467728495597839, + "learning_rate": 9.3822177992753e-05, + 
"loss": 0.8656, + "step": 50080 + }, + { + "epoch": 0.3200107330411561, + "grad_norm": 1.3215608596801758, + "learning_rate": 9.381976173364702e-05, + "loss": 0.7438, + "step": 50090 + }, + { + "epoch": 0.3200746201908948, + "grad_norm": 0.7115028500556946, + "learning_rate": 9.381734503323812e-05, + "loss": 1.1696, + "step": 50100 + }, + { + "epoch": 0.3201385073406335, + "grad_norm": 0.7195971608161926, + "learning_rate": 9.381492789155066e-05, + "loss": 0.7597, + "step": 50110 + }, + { + "epoch": 0.3202023944903722, + "grad_norm": 0.7704067230224609, + "learning_rate": 9.381251030860896e-05, + "loss": 0.8963, + "step": 50120 + }, + { + "epoch": 0.3202662816401109, + "grad_norm": 0.7331963181495667, + "learning_rate": 9.381009228443737e-05, + "loss": 0.7832, + "step": 50130 + }, + { + "epoch": 0.3203301687898496, + "grad_norm": 0.8845292329788208, + "learning_rate": 9.380767381906029e-05, + "loss": 0.8378, + "step": 50140 + }, + { + "epoch": 0.3203940559395883, + "grad_norm": 0.714884340763092, + "learning_rate": 9.380525491250201e-05, + "loss": 0.9028, + "step": 50150 + }, + { + "epoch": 0.320457943089327, + "grad_norm": 0.6993659138679504, + "learning_rate": 9.380283556478691e-05, + "loss": 1.1563, + "step": 50160 + }, + { + "epoch": 0.3205218302390657, + "grad_norm": 1.0421202182769775, + "learning_rate": 9.380041577593937e-05, + "loss": 0.9022, + "step": 50170 + }, + { + "epoch": 0.3205857173888044, + "grad_norm": 1.632599115371704, + "learning_rate": 9.379799554598374e-05, + "loss": 0.8848, + "step": 50180 + }, + { + "epoch": 0.32064960453854313, + "grad_norm": 0.9513861536979675, + "learning_rate": 9.379557487494442e-05, + "loss": 1.0339, + "step": 50190 + }, + { + "epoch": 0.32071349168828184, + "grad_norm": 0.803616464138031, + "learning_rate": 9.379315376284576e-05, + "loss": 0.852, + "step": 50200 + }, + { + "epoch": 0.32077737883802054, + "grad_norm": 0.938457727432251, + "learning_rate": 9.379073220971215e-05, + "loss": 0.9651, + "step": 50210 + }, + 
{ + "epoch": 0.3208412659877592, + "grad_norm": 1.7576130628585815, + "learning_rate": 9.3788310215568e-05, + "loss": 0.8734, + "step": 50220 + }, + { + "epoch": 0.3209051531374979, + "grad_norm": 0.8921381831169128, + "learning_rate": 9.378588778043766e-05, + "loss": 0.9362, + "step": 50230 + }, + { + "epoch": 0.3209690402872366, + "grad_norm": 0.6942434906959534, + "learning_rate": 9.378346490434558e-05, + "loss": 0.8118, + "step": 50240 + }, + { + "epoch": 0.3210329274369753, + "grad_norm": 0.9267338514328003, + "learning_rate": 9.378104158731611e-05, + "loss": 0.8815, + "step": 50250 + }, + { + "epoch": 0.321096814586714, + "grad_norm": 0.8614099621772766, + "learning_rate": 9.377861782937369e-05, + "loss": 0.8871, + "step": 50260 + }, + { + "epoch": 0.3211607017364527, + "grad_norm": 0.944468080997467, + "learning_rate": 9.37761936305427e-05, + "loss": 1.0129, + "step": 50270 + }, + { + "epoch": 0.32122458888619143, + "grad_norm": 1.3810163736343384, + "learning_rate": 9.377376899084757e-05, + "loss": 0.6677, + "step": 50280 + }, + { + "epoch": 0.32128847603593014, + "grad_norm": 0.8450731039047241, + "learning_rate": 9.377134391031272e-05, + "loss": 1.0518, + "step": 50290 + }, + { + "epoch": 0.32135236318566884, + "grad_norm": 1.011494755744934, + "learning_rate": 9.376891838896258e-05, + "loss": 0.7787, + "step": 50300 + }, + { + "epoch": 0.32141625033540755, + "grad_norm": 0.8098331093788147, + "learning_rate": 9.376649242682154e-05, + "loss": 0.9035, + "step": 50310 + }, + { + "epoch": 0.32148013748514626, + "grad_norm": 0.7809407711029053, + "learning_rate": 9.376406602391407e-05, + "loss": 0.8507, + "step": 50320 + }, + { + "epoch": 0.32154402463488496, + "grad_norm": 1.0858904123306274, + "learning_rate": 9.376163918026461e-05, + "loss": 1.109, + "step": 50330 + }, + { + "epoch": 0.32160791178462367, + "grad_norm": 1.2475218772888184, + "learning_rate": 9.375921189589756e-05, + "loss": 0.7929, + "step": 50340 + }, + { + "epoch": 0.3216717989343623, + 
"grad_norm": 0.9990367293357849, + "learning_rate": 9.375678417083741e-05, + "loss": 0.9663, + "step": 50350 + }, + { + "epoch": 0.321735686084101, + "grad_norm": 0.7379415035247803, + "learning_rate": 9.375435600510858e-05, + "loss": 0.8639, + "step": 50360 + }, + { + "epoch": 0.32179957323383973, + "grad_norm": 0.7513096928596497, + "learning_rate": 9.375192739873553e-05, + "loss": 0.9025, + "step": 50370 + }, + { + "epoch": 0.32186346038357844, + "grad_norm": 1.1147832870483398, + "learning_rate": 9.374949835174273e-05, + "loss": 1.0522, + "step": 50380 + }, + { + "epoch": 0.32192734753331714, + "grad_norm": 1.0712778568267822, + "learning_rate": 9.374706886415462e-05, + "loss": 0.8807, + "step": 50390 + }, + { + "epoch": 0.32199123468305585, + "grad_norm": 1.0778201818466187, + "learning_rate": 9.374463893599568e-05, + "loss": 1.0531, + "step": 50400 + }, + { + "epoch": 0.32205512183279456, + "grad_norm": 0.7178623676300049, + "learning_rate": 9.374220856729039e-05, + "loss": 0.7271, + "step": 50410 + }, + { + "epoch": 0.32211900898253326, + "grad_norm": 0.9530071020126343, + "learning_rate": 9.373977775806321e-05, + "loss": 0.7916, + "step": 50420 + }, + { + "epoch": 0.32218289613227197, + "grad_norm": 0.8260478377342224, + "learning_rate": 9.373734650833862e-05, + "loss": 0.9391, + "step": 50430 + }, + { + "epoch": 0.3222467832820107, + "grad_norm": 0.9254810810089111, + "learning_rate": 9.373491481814114e-05, + "loss": 1.0402, + "step": 50440 + }, + { + "epoch": 0.3223106704317494, + "grad_norm": 1.080959439277649, + "learning_rate": 9.373248268749521e-05, + "loss": 0.8513, + "step": 50450 + }, + { + "epoch": 0.3223745575814881, + "grad_norm": 1.447561264038086, + "learning_rate": 9.373005011642534e-05, + "loss": 0.8074, + "step": 50460 + }, + { + "epoch": 0.32243844473122674, + "grad_norm": 0.7180037498474121, + "learning_rate": 9.372761710495605e-05, + "loss": 0.9142, + "step": 50470 + }, + { + "epoch": 0.32250233188096544, + "grad_norm": 
0.8241226077079773, + "learning_rate": 9.372518365311183e-05, + "loss": 0.7491, + "step": 50480 + }, + { + "epoch": 0.32256621903070415, + "grad_norm": 0.647534966468811, + "learning_rate": 9.372274976091718e-05, + "loss": 0.988, + "step": 50490 + }, + { + "epoch": 0.32263010618044285, + "grad_norm": 0.953140139579773, + "learning_rate": 9.372031542839658e-05, + "loss": 1.0637, + "step": 50500 + }, + { + "epoch": 0.32269399333018156, + "grad_norm": 1.1263874769210815, + "learning_rate": 9.371788065557463e-05, + "loss": 0.8773, + "step": 50510 + }, + { + "epoch": 0.32275788047992027, + "grad_norm": 0.9044056534767151, + "learning_rate": 9.371544544247577e-05, + "loss": 0.8032, + "step": 50520 + }, + { + "epoch": 0.322821767629659, + "grad_norm": 0.7417857646942139, + "learning_rate": 9.371300978912456e-05, + "loss": 0.8008, + "step": 50530 + }, + { + "epoch": 0.3228856547793977, + "grad_norm": 0.6616377830505371, + "learning_rate": 9.371057369554552e-05, + "loss": 0.9739, + "step": 50540 + }, + { + "epoch": 0.3229495419291364, + "grad_norm": 0.9354878664016724, + "learning_rate": 9.370813716176321e-05, + "loss": 0.9562, + "step": 50550 + }, + { + "epoch": 0.3230134290788751, + "grad_norm": 0.9551581740379333, + "learning_rate": 9.370570018780213e-05, + "loss": 0.9345, + "step": 50560 + }, + { + "epoch": 0.3230773162286138, + "grad_norm": 0.7437955737113953, + "learning_rate": 9.370326277368684e-05, + "loss": 0.8398, + "step": 50570 + }, + { + "epoch": 0.3231412033783525, + "grad_norm": 0.7190837264060974, + "learning_rate": 9.370082491944188e-05, + "loss": 0.978, + "step": 50580 + }, + { + "epoch": 0.32320509052809115, + "grad_norm": 1.749056100845337, + "learning_rate": 9.36983866250918e-05, + "loss": 1.241, + "step": 50590 + }, + { + "epoch": 0.32326897767782986, + "grad_norm": 0.8506900072097778, + "learning_rate": 9.369594789066119e-05, + "loss": 1.1666, + "step": 50600 + }, + { + "epoch": 0.32333286482756857, + "grad_norm": 0.8432791829109192, + 
"learning_rate": 9.369350871617454e-05, + "loss": 1.1055, + "step": 50610 + }, + { + "epoch": 0.3233967519773073, + "grad_norm": 0.8245983123779297, + "learning_rate": 9.36910691016565e-05, + "loss": 0.8282, + "step": 50620 + }, + { + "epoch": 0.323460639127046, + "grad_norm": 0.7552667856216431, + "learning_rate": 9.368862904713158e-05, + "loss": 0.8247, + "step": 50630 + }, + { + "epoch": 0.3235245262767847, + "grad_norm": 1.2741241455078125, + "learning_rate": 9.368618855262437e-05, + "loss": 0.8029, + "step": 50640 + }, + { + "epoch": 0.3235884134265234, + "grad_norm": 0.47164657711982727, + "learning_rate": 9.368374761815943e-05, + "loss": 1.1075, + "step": 50650 + }, + { + "epoch": 0.3236523005762621, + "grad_norm": 0.9931887984275818, + "learning_rate": 9.368130624376139e-05, + "loss": 0.7937, + "step": 50660 + }, + { + "epoch": 0.3237161877260008, + "grad_norm": 0.6441866755485535, + "learning_rate": 9.36788644294548e-05, + "loss": 0.748, + "step": 50670 + }, + { + "epoch": 0.3237800748757395, + "grad_norm": 1.0310367345809937, + "learning_rate": 9.367642217526423e-05, + "loss": 0.7168, + "step": 50680 + }, + { + "epoch": 0.3238439620254782, + "grad_norm": 1.0950703620910645, + "learning_rate": 9.367397948121433e-05, + "loss": 0.722, + "step": 50690 + }, + { + "epoch": 0.3239078491752169, + "grad_norm": 0.7349279522895813, + "learning_rate": 9.367153634732966e-05, + "loss": 0.9114, + "step": 50700 + }, + { + "epoch": 0.3239717363249556, + "grad_norm": 0.6076371073722839, + "learning_rate": 9.366909277363484e-05, + "loss": 0.9256, + "step": 50710 + }, + { + "epoch": 0.3240356234746943, + "grad_norm": 0.8350464105606079, + "learning_rate": 9.366664876015448e-05, + "loss": 0.7421, + "step": 50720 + }, + { + "epoch": 0.324099510624433, + "grad_norm": 0.6974174976348877, + "learning_rate": 9.36642043069132e-05, + "loss": 0.9282, + "step": 50730 + }, + { + "epoch": 0.3241633977741717, + "grad_norm": 0.8667670488357544, + "learning_rate": 9.36617594139356e-05, + 
"loss": 1.0113, + "step": 50740 + }, + { + "epoch": 0.3242272849239104, + "grad_norm": 0.8927708864212036, + "learning_rate": 9.365931408124631e-05, + "loss": 1.0419, + "step": 50750 + }, + { + "epoch": 0.3242911720736491, + "grad_norm": 0.6961429119110107, + "learning_rate": 9.365686830886995e-05, + "loss": 0.9287, + "step": 50760 + }, + { + "epoch": 0.3243550592233878, + "grad_norm": 0.638645350933075, + "learning_rate": 9.365442209683116e-05, + "loss": 0.8599, + "step": 50770 + }, + { + "epoch": 0.3244189463731265, + "grad_norm": 1.2358146905899048, + "learning_rate": 9.365197544515456e-05, + "loss": 0.7256, + "step": 50780 + }, + { + "epoch": 0.3244828335228652, + "grad_norm": 0.6738364696502686, + "learning_rate": 9.364952835386482e-05, + "loss": 1.0181, + "step": 50790 + }, + { + "epoch": 0.32454672067260393, + "grad_norm": 1.1015185117721558, + "learning_rate": 9.364708082298656e-05, + "loss": 0.6716, + "step": 50800 + }, + { + "epoch": 0.32461060782234263, + "grad_norm": 0.45196080207824707, + "learning_rate": 9.364463285254446e-05, + "loss": 0.8246, + "step": 50810 + }, + { + "epoch": 0.32467449497208134, + "grad_norm": 0.5970223546028137, + "learning_rate": 9.364218444256312e-05, + "loss": 1.08, + "step": 50820 + }, + { + "epoch": 0.32473838212182, + "grad_norm": 0.6557901501655579, + "learning_rate": 9.363973559306724e-05, + "loss": 0.8581, + "step": 50830 + }, + { + "epoch": 0.3248022692715587, + "grad_norm": 1.3220319747924805, + "learning_rate": 9.363728630408146e-05, + "loss": 0.8748, + "step": 50840 + }, + { + "epoch": 0.3248661564212974, + "grad_norm": 0.6746674180030823, + "learning_rate": 9.363483657563046e-05, + "loss": 0.9018, + "step": 50850 + }, + { + "epoch": 0.3249300435710361, + "grad_norm": 0.8742882013320923, + "learning_rate": 9.363238640773891e-05, + "loss": 0.8107, + "step": 50860 + }, + { + "epoch": 0.3249939307207748, + "grad_norm": 0.8248798847198486, + "learning_rate": 9.362993580043148e-05, + "loss": 0.9382, + "step": 50870 + }, 
+ { + "epoch": 0.3250578178705135, + "grad_norm": 0.7204467058181763, + "learning_rate": 9.362748475373284e-05, + "loss": 1.0355, + "step": 50880 + }, + { + "epoch": 0.3251217050202522, + "grad_norm": 2.2558605670928955, + "learning_rate": 9.36250332676677e-05, + "loss": 1.1596, + "step": 50890 + }, + { + "epoch": 0.32518559216999093, + "grad_norm": 0.6045457124710083, + "learning_rate": 9.362258134226074e-05, + "loss": 0.905, + "step": 50900 + }, + { + "epoch": 0.32524947931972964, + "grad_norm": 0.6380605101585388, + "learning_rate": 9.362012897753662e-05, + "loss": 0.969, + "step": 50910 + }, + { + "epoch": 0.32531336646946835, + "grad_norm": 1.1410925388336182, + "learning_rate": 9.361767617352008e-05, + "loss": 0.7254, + "step": 50920 + }, + { + "epoch": 0.32537725361920705, + "grad_norm": 1.1019296646118164, + "learning_rate": 9.361522293023581e-05, + "loss": 0.9216, + "step": 50930 + }, + { + "epoch": 0.32544114076894576, + "grad_norm": 1.174682378768921, + "learning_rate": 9.361276924770853e-05, + "loss": 0.8368, + "step": 50940 + }, + { + "epoch": 0.3255050279186844, + "grad_norm": 0.5832127928733826, + "learning_rate": 9.36103151259629e-05, + "loss": 0.7741, + "step": 50950 + }, + { + "epoch": 0.3255689150684231, + "grad_norm": 0.9537053108215332, + "learning_rate": 9.360786056502367e-05, + "loss": 0.9866, + "step": 50960 + }, + { + "epoch": 0.3256328022181618, + "grad_norm": 1.0229930877685547, + "learning_rate": 9.360540556491558e-05, + "loss": 0.8135, + "step": 50970 + }, + { + "epoch": 0.3256966893679005, + "grad_norm": 1.187549352645874, + "learning_rate": 9.360295012566332e-05, + "loss": 0.9386, + "step": 50980 + }, + { + "epoch": 0.32576057651763923, + "grad_norm": 1.09153151512146, + "learning_rate": 9.360049424729162e-05, + "loss": 0.8265, + "step": 50990 + }, + { + "epoch": 0.32582446366737794, + "grad_norm": 0.7309248447418213, + "learning_rate": 9.359803792982525e-05, + "loss": 0.8341, + "step": 51000 + }, + { + "epoch": 0.32588835081711665, + 
"grad_norm": 0.45413196086883545, + "learning_rate": 9.359558117328891e-05, + "loss": 0.9438, + "step": 51010 + }, + { + "epoch": 0.32595223796685535, + "grad_norm": 0.8382761478424072, + "learning_rate": 9.359312397770733e-05, + "loss": 0.8004, + "step": 51020 + }, + { + "epoch": 0.32601612511659406, + "grad_norm": 0.8672879338264465, + "learning_rate": 9.359066634310529e-05, + "loss": 0.9812, + "step": 51030 + }, + { + "epoch": 0.32608001226633276, + "grad_norm": 1.0492173433303833, + "learning_rate": 9.358820826950754e-05, + "loss": 0.6533, + "step": 51040 + }, + { + "epoch": 0.32614389941607147, + "grad_norm": 0.9843623042106628, + "learning_rate": 9.358574975693882e-05, + "loss": 0.8744, + "step": 51050 + }, + { + "epoch": 0.3262077865658102, + "grad_norm": 0.9067795872688293, + "learning_rate": 9.358329080542389e-05, + "loss": 0.8468, + "step": 51060 + }, + { + "epoch": 0.3262716737155489, + "grad_norm": 0.6147485971450806, + "learning_rate": 9.358083141498751e-05, + "loss": 0.9103, + "step": 51070 + }, + { + "epoch": 0.32633556086528753, + "grad_norm": 0.8605659008026123, + "learning_rate": 9.357837158565446e-05, + "loss": 0.7366, + "step": 51080 + }, + { + "epoch": 0.32639944801502624, + "grad_norm": 0.6923385858535767, + "learning_rate": 9.357591131744952e-05, + "loss": 1.0148, + "step": 51090 + }, + { + "epoch": 0.32646333516476495, + "grad_norm": 0.8151227235794067, + "learning_rate": 9.357345061039745e-05, + "loss": 0.9406, + "step": 51100 + }, + { + "epoch": 0.32652722231450365, + "grad_norm": 0.7649495005607605, + "learning_rate": 9.357098946452301e-05, + "loss": 0.8339, + "step": 51110 + }, + { + "epoch": 0.32659110946424236, + "grad_norm": 1.2730705738067627, + "learning_rate": 9.356852787985105e-05, + "loss": 1.0306, + "step": 51120 + }, + { + "epoch": 0.32665499661398106, + "grad_norm": 0.6461998820304871, + "learning_rate": 9.35660658564063e-05, + "loss": 0.7584, + "step": 51130 + }, + { + "epoch": 0.32671888376371977, + "grad_norm": 
0.999034583568573, + "learning_rate": 9.356360339421357e-05, + "loss": 0.9405, + "step": 51140 + }, + { + "epoch": 0.3267827709134585, + "grad_norm": 1.0120235681533813, + "learning_rate": 9.356114049329767e-05, + "loss": 0.8825, + "step": 51150 + }, + { + "epoch": 0.3268466580631972, + "grad_norm": 0.9906069040298462, + "learning_rate": 9.35586771536834e-05, + "loss": 0.8241, + "step": 51160 + }, + { + "epoch": 0.3269105452129359, + "grad_norm": 0.6129631996154785, + "learning_rate": 9.355621337539558e-05, + "loss": 0.8977, + "step": 51170 + }, + { + "epoch": 0.3269744323626746, + "grad_norm": 0.9414940476417542, + "learning_rate": 9.3553749158459e-05, + "loss": 0.9349, + "step": 51180 + }, + { + "epoch": 0.3270383195124133, + "grad_norm": 0.7236936092376709, + "learning_rate": 9.35512845028985e-05, + "loss": 0.8081, + "step": 51190 + }, + { + "epoch": 0.32710220666215195, + "grad_norm": 1.0342822074890137, + "learning_rate": 9.354881940873888e-05, + "loss": 0.8802, + "step": 51200 + }, + { + "epoch": 0.32716609381189066, + "grad_norm": 0.7799363136291504, + "learning_rate": 9.354635387600497e-05, + "loss": 0.8126, + "step": 51210 + }, + { + "epoch": 0.32722998096162936, + "grad_norm": 0.5983444452285767, + "learning_rate": 9.35438879047216e-05, + "loss": 0.909, + "step": 51220 + }, + { + "epoch": 0.32729386811136807, + "grad_norm": 0.9435120820999146, + "learning_rate": 9.35414214949136e-05, + "loss": 0.8125, + "step": 51230 + }, + { + "epoch": 0.3273577552611068, + "grad_norm": 0.7243615388870239, + "learning_rate": 9.353895464660585e-05, + "loss": 0.9006, + "step": 51240 + }, + { + "epoch": 0.3274216424108455, + "grad_norm": 0.7836112976074219, + "learning_rate": 9.353648735982312e-05, + "loss": 0.8456, + "step": 51250 + }, + { + "epoch": 0.3274855295605842, + "grad_norm": 0.8060475587844849, + "learning_rate": 9.353401963459032e-05, + "loss": 0.8903, + "step": 51260 + }, + { + "epoch": 0.3275494167103229, + "grad_norm": 0.49010974168777466, + "learning_rate": 
9.353155147093228e-05, + "loss": 0.7631, + "step": 51270 + }, + { + "epoch": 0.3276133038600616, + "grad_norm": 1.089375615119934, + "learning_rate": 9.352908286887385e-05, + "loss": 0.8264, + "step": 51280 + }, + { + "epoch": 0.3276771910098003, + "grad_norm": 0.7374083399772644, + "learning_rate": 9.35266138284399e-05, + "loss": 0.7873, + "step": 51290 + }, + { + "epoch": 0.327741078159539, + "grad_norm": 0.8920226693153381, + "learning_rate": 9.352414434965531e-05, + "loss": 0.8032, + "step": 51300 + }, + { + "epoch": 0.3278049653092777, + "grad_norm": 0.8124125599861145, + "learning_rate": 9.35216744325449e-05, + "loss": 0.7542, + "step": 51310 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.7306677103042603, + "learning_rate": 9.35192040771336e-05, + "loss": 0.7191, + "step": 51320 + }, + { + "epoch": 0.3279327396087551, + "grad_norm": 1.0035959482192993, + "learning_rate": 9.351673328344626e-05, + "loss": 0.8754, + "step": 51330 + }, + { + "epoch": 0.3279966267584938, + "grad_norm": 1.0333647727966309, + "learning_rate": 9.351426205150777e-05, + "loss": 1.091, + "step": 51340 + }, + { + "epoch": 0.3280605139082325, + "grad_norm": 0.8857297897338867, + "learning_rate": 9.351179038134301e-05, + "loss": 0.8794, + "step": 51350 + }, + { + "epoch": 0.3281244010579712, + "grad_norm": 0.7112529873847961, + "learning_rate": 9.350931827297689e-05, + "loss": 0.9096, + "step": 51360 + }, + { + "epoch": 0.3281882882077099, + "grad_norm": 1.19603431224823, + "learning_rate": 9.350684572643427e-05, + "loss": 1.207, + "step": 51370 + }, + { + "epoch": 0.3282521753574486, + "grad_norm": 0.8583690524101257, + "learning_rate": 9.350437274174009e-05, + "loss": 0.9503, + "step": 51380 + }, + { + "epoch": 0.3283160625071873, + "grad_norm": 1.2817461490631104, + "learning_rate": 9.350189931891925e-05, + "loss": 0.8277, + "step": 51390 + }, + { + "epoch": 0.328379949656926, + "grad_norm": 1.0474090576171875, + "learning_rate": 9.349942545799664e-05, + "loss": 0.6875, + 
"step": 51400 + }, + { + "epoch": 0.3284438368066647, + "grad_norm": 2.7098753452301025, + "learning_rate": 9.349695115899717e-05, + "loss": 1.1116, + "step": 51410 + }, + { + "epoch": 0.32850772395640343, + "grad_norm": 0.9274638295173645, + "learning_rate": 9.349447642194578e-05, + "loss": 0.9108, + "step": 51420 + }, + { + "epoch": 0.32857161110614214, + "grad_norm": 0.9244821667671204, + "learning_rate": 9.34920012468674e-05, + "loss": 1.0516, + "step": 51430 + }, + { + "epoch": 0.3286354982558808, + "grad_norm": 0.7755671143531799, + "learning_rate": 9.348952563378693e-05, + "loss": 0.9229, + "step": 51440 + }, + { + "epoch": 0.3286993854056195, + "grad_norm": 0.8693557381629944, + "learning_rate": 9.348704958272931e-05, + "loss": 1.0101, + "step": 51450 + }, + { + "epoch": 0.3287632725553582, + "grad_norm": 1.1465810537338257, + "learning_rate": 9.348457309371948e-05, + "loss": 0.6397, + "step": 51460 + }, + { + "epoch": 0.3288271597050969, + "grad_norm": 1.0010018348693848, + "learning_rate": 9.348209616678238e-05, + "loss": 0.6065, + "step": 51470 + }, + { + "epoch": 0.3288910468548356, + "grad_norm": 0.7339697480201721, + "learning_rate": 9.347961880194296e-05, + "loss": 0.8538, + "step": 51480 + }, + { + "epoch": 0.3289549340045743, + "grad_norm": 0.5943730473518372, + "learning_rate": 9.347714099922616e-05, + "loss": 0.7904, + "step": 51490 + }, + { + "epoch": 0.329018821154313, + "grad_norm": 0.6852931380271912, + "learning_rate": 9.347466275865694e-05, + "loss": 0.8648, + "step": 51500 + }, + { + "epoch": 0.32908270830405173, + "grad_norm": 0.9251651167869568, + "learning_rate": 9.347218408026025e-05, + "loss": 0.9211, + "step": 51510 + }, + { + "epoch": 0.32914659545379044, + "grad_norm": 0.7798057794570923, + "learning_rate": 9.346970496406105e-05, + "loss": 0.6644, + "step": 51520 + }, + { + "epoch": 0.32921048260352914, + "grad_norm": 0.7447190880775452, + "learning_rate": 9.346722541008432e-05, + "loss": 0.7424, + "step": 51530 + }, + { + "epoch": 
0.32927436975326785, + "grad_norm": 0.6202836036682129, + "learning_rate": 9.346474541835504e-05, + "loss": 1.0315, + "step": 51540 + }, + { + "epoch": 0.32933825690300655, + "grad_norm": 0.8937282562255859, + "learning_rate": 9.346226498889817e-05, + "loss": 0.8993, + "step": 51550 + }, + { + "epoch": 0.3294021440527452, + "grad_norm": 0.7935136556625366, + "learning_rate": 9.345978412173866e-05, + "loss": 0.8778, + "step": 51560 + }, + { + "epoch": 0.3294660312024839, + "grad_norm": 0.5474129319190979, + "learning_rate": 9.345730281690156e-05, + "loss": 0.9576, + "step": 51570 + }, + { + "epoch": 0.3295299183522226, + "grad_norm": 0.7357134819030762, + "learning_rate": 9.345482107441182e-05, + "loss": 0.7586, + "step": 51580 + }, + { + "epoch": 0.3295938055019613, + "grad_norm": 1.1119588613510132, + "learning_rate": 9.345233889429442e-05, + "loss": 0.9351, + "step": 51590 + }, + { + "epoch": 0.32965769265170003, + "grad_norm": 1.2632428407669067, + "learning_rate": 9.344985627657439e-05, + "loss": 0.7127, + "step": 51600 + }, + { + "epoch": 0.32972157980143874, + "grad_norm": 0.9762067198753357, + "learning_rate": 9.344737322127671e-05, + "loss": 1.2152, + "step": 51610 + }, + { + "epoch": 0.32978546695117744, + "grad_norm": 0.6457685232162476, + "learning_rate": 9.34448897284264e-05, + "loss": 0.7824, + "step": 51620 + }, + { + "epoch": 0.32984935410091615, + "grad_norm": 0.8155549764633179, + "learning_rate": 9.344240579804846e-05, + "loss": 1.0339, + "step": 51630 + }, + { + "epoch": 0.32991324125065485, + "grad_norm": 0.7448208332061768, + "learning_rate": 9.343992143016791e-05, + "loss": 0.8618, + "step": 51640 + }, + { + "epoch": 0.32997712840039356, + "grad_norm": 1.0054072141647339, + "learning_rate": 9.343743662480977e-05, + "loss": 1.0246, + "step": 51650 + }, + { + "epoch": 0.33004101555013227, + "grad_norm": 0.7192854285240173, + "learning_rate": 9.343495138199907e-05, + "loss": 0.8324, + "step": 51660 + }, + { + "epoch": 0.330104902699871, + 
"grad_norm": 1.023016095161438, + "learning_rate": 9.343246570176083e-05, + "loss": 0.9012, + "step": 51670 + }, + { + "epoch": 0.3301687898496096, + "grad_norm": 0.9066780209541321, + "learning_rate": 9.34299795841201e-05, + "loss": 0.7939, + "step": 51680 + }, + { + "epoch": 0.33023267699934833, + "grad_norm": 0.9442875385284424, + "learning_rate": 9.342749302910188e-05, + "loss": 0.8326, + "step": 51690 + }, + { + "epoch": 0.33029656414908704, + "grad_norm": 0.7106100916862488, + "learning_rate": 9.342500603673125e-05, + "loss": 0.9531, + "step": 51700 + }, + { + "epoch": 0.33036045129882574, + "grad_norm": 1.2981065511703491, + "learning_rate": 9.342251860703324e-05, + "loss": 0.9319, + "step": 51710 + }, + { + "epoch": 0.33042433844856445, + "grad_norm": 0.6678254008293152, + "learning_rate": 9.34200307400329e-05, + "loss": 0.7245, + "step": 51720 + }, + { + "epoch": 0.33048822559830315, + "grad_norm": 0.716221809387207, + "learning_rate": 9.341754243575528e-05, + "loss": 0.9938, + "step": 51730 + }, + { + "epoch": 0.33055211274804186, + "grad_norm": 0.9788273572921753, + "learning_rate": 9.341505369422546e-05, + "loss": 0.9796, + "step": 51740 + }, + { + "epoch": 0.33061599989778057, + "grad_norm": 0.6634423136711121, + "learning_rate": 9.341256451546848e-05, + "loss": 0.9807, + "step": 51750 + }, + { + "epoch": 0.33067988704751927, + "grad_norm": 1.2706855535507202, + "learning_rate": 9.341007489950942e-05, + "loss": 0.6877, + "step": 51760 + }, + { + "epoch": 0.330743774197258, + "grad_norm": 2.7023708820343018, + "learning_rate": 9.340758484637334e-05, + "loss": 0.7223, + "step": 51770 + }, + { + "epoch": 0.3308076613469967, + "grad_norm": 0.543978214263916, + "learning_rate": 9.340509435608534e-05, + "loss": 0.9397, + "step": 51780 + }, + { + "epoch": 0.3308715484967354, + "grad_norm": 0.8344589471817017, + "learning_rate": 9.340260342867049e-05, + "loss": 0.884, + "step": 51790 + }, + { + "epoch": 0.33093543564647404, + "grad_norm": 0.6055552959442139, + 
"learning_rate": 9.340011206415386e-05, + "loss": 0.6553, + "step": 51800 + }, + { + "epoch": 0.33099932279621275, + "grad_norm": 1.077162265777588, + "learning_rate": 9.339762026256058e-05, + "loss": 0.7583, + "step": 51810 + }, + { + "epoch": 0.33106320994595145, + "grad_norm": 1.1166653633117676, + "learning_rate": 9.33951280239157e-05, + "loss": 0.8469, + "step": 51820 + }, + { + "epoch": 0.33112709709569016, + "grad_norm": 0.7520068287849426, + "learning_rate": 9.339263534824436e-05, + "loss": 0.8934, + "step": 51830 + }, + { + "epoch": 0.33119098424542887, + "grad_norm": 0.8291226029396057, + "learning_rate": 9.339014223557163e-05, + "loss": 0.9665, + "step": 51840 + }, + { + "epoch": 0.33125487139516757, + "grad_norm": 0.8782137036323547, + "learning_rate": 9.338764868592262e-05, + "loss": 0.8251, + "step": 51850 + }, + { + "epoch": 0.3313187585449063, + "grad_norm": 0.6978154182434082, + "learning_rate": 9.338515469932246e-05, + "loss": 0.7853, + "step": 51860 + }, + { + "epoch": 0.331382645694645, + "grad_norm": 0.7604345679283142, + "learning_rate": 9.338266027579626e-05, + "loss": 0.8233, + "step": 51870 + }, + { + "epoch": 0.3314465328443837, + "grad_norm": 0.9156827926635742, + "learning_rate": 9.338016541536914e-05, + "loss": 0.7708, + "step": 51880 + }, + { + "epoch": 0.3315104199941224, + "grad_norm": 0.6973231434822083, + "learning_rate": 9.337767011806622e-05, + "loss": 0.82, + "step": 51890 + }, + { + "epoch": 0.3315743071438611, + "grad_norm": 0.7553335428237915, + "learning_rate": 9.337517438391263e-05, + "loss": 0.9323, + "step": 51900 + }, + { + "epoch": 0.3316381942935998, + "grad_norm": 1.0353292226791382, + "learning_rate": 9.337267821293351e-05, + "loss": 1.0569, + "step": 51910 + }, + { + "epoch": 0.3317020814433385, + "grad_norm": 0.6070806980133057, + "learning_rate": 9.3370181605154e-05, + "loss": 0.7626, + "step": 51920 + }, + { + "epoch": 0.33176596859307717, + "grad_norm": 1.3806092739105225, + "learning_rate": 
9.336768456059925e-05, + "loss": 0.8368, + "step": 51930 + }, + { + "epoch": 0.33182985574281587, + "grad_norm": 0.8327397108078003, + "learning_rate": 9.33651870792944e-05, + "loss": 0.8303, + "step": 51940 + }, + { + "epoch": 0.3318937428925546, + "grad_norm": 0.916780948638916, + "learning_rate": 9.33626891612646e-05, + "loss": 0.8053, + "step": 51950 + }, + { + "epoch": 0.3319576300422933, + "grad_norm": 0.7326523065567017, + "learning_rate": 9.3360190806535e-05, + "loss": 0.8407, + "step": 51960 + }, + { + "epoch": 0.332021517192032, + "grad_norm": 1.0814404487609863, + "learning_rate": 9.335769201513075e-05, + "loss": 1.0026, + "step": 51970 + }, + { + "epoch": 0.3320854043417707, + "grad_norm": 0.9184064865112305, + "learning_rate": 9.335519278707705e-05, + "loss": 1.1877, + "step": 51980 + }, + { + "epoch": 0.3321492914915094, + "grad_norm": 0.7729029655456543, + "learning_rate": 9.335269312239904e-05, + "loss": 1.0875, + "step": 51990 + }, + { + "epoch": 0.3322131786412481, + "grad_norm": 1.2618939876556396, + "learning_rate": 9.335019302112193e-05, + "loss": 0.9594, + "step": 52000 + }, + { + "epoch": 0.3322770657909868, + "grad_norm": 0.6286314725875854, + "learning_rate": 9.334769248327085e-05, + "loss": 0.8619, + "step": 52010 + }, + { + "epoch": 0.3323409529407255, + "grad_norm": 1.7984399795532227, + "learning_rate": 9.334519150887103e-05, + "loss": 0.9147, + "step": 52020 + }, + { + "epoch": 0.3324048400904642, + "grad_norm": 1.0144270658493042, + "learning_rate": 9.33426900979476e-05, + "loss": 0.8399, + "step": 52030 + }, + { + "epoch": 0.33246872724020293, + "grad_norm": 1.9516681432724, + "learning_rate": 9.33401882505258e-05, + "loss": 0.8073, + "step": 52040 + }, + { + "epoch": 0.3325326143899416, + "grad_norm": 0.5465503931045532, + "learning_rate": 9.333768596663082e-05, + "loss": 0.8589, + "step": 52050 + }, + { + "epoch": 0.3325965015396803, + "grad_norm": 0.9213358759880066, + "learning_rate": 9.333518324628783e-05, + "loss": 0.6787, + 
"step": 52060 + }, + { + "epoch": 0.332660388689419, + "grad_norm": 0.7872808575630188, + "learning_rate": 9.333268008952206e-05, + "loss": 0.8307, + "step": 52070 + }, + { + "epoch": 0.3327242758391577, + "grad_norm": 0.9161990284919739, + "learning_rate": 9.333017649635871e-05, + "loss": 1.1748, + "step": 52080 + }, + { + "epoch": 0.3327881629888964, + "grad_norm": 0.7564883232116699, + "learning_rate": 9.332767246682301e-05, + "loss": 0.735, + "step": 52090 + }, + { + "epoch": 0.3328520501386351, + "grad_norm": 0.7654510140419006, + "learning_rate": 9.332516800094015e-05, + "loss": 0.5545, + "step": 52100 + }, + { + "epoch": 0.3329159372883738, + "grad_norm": 0.5725670456886292, + "learning_rate": 9.332266309873538e-05, + "loss": 1.294, + "step": 52110 + }, + { + "epoch": 0.3329798244381125, + "grad_norm": 1.0625219345092773, + "learning_rate": 9.332015776023391e-05, + "loss": 0.9125, + "step": 52120 + }, + { + "epoch": 0.33304371158785123, + "grad_norm": 0.9181973934173584, + "learning_rate": 9.331765198546097e-05, + "loss": 0.8822, + "step": 52130 + }, + { + "epoch": 0.33310759873758994, + "grad_norm": 1.5193865299224854, + "learning_rate": 9.33151457744418e-05, + "loss": 0.8638, + "step": 52140 + }, + { + "epoch": 0.33317148588732864, + "grad_norm": 1.0229812860488892, + "learning_rate": 9.331263912720165e-05, + "loss": 0.8326, + "step": 52150 + }, + { + "epoch": 0.33323537303706735, + "grad_norm": 1.2124236822128296, + "learning_rate": 9.331013204376573e-05, + "loss": 0.7525, + "step": 52160 + }, + { + "epoch": 0.333299260186806, + "grad_norm": 0.8149605393409729, + "learning_rate": 9.330762452415934e-05, + "loss": 0.9571, + "step": 52170 + }, + { + "epoch": 0.3333631473365447, + "grad_norm": 1.2210596799850464, + "learning_rate": 9.330511656840768e-05, + "loss": 1.0235, + "step": 52180 + }, + { + "epoch": 0.3334270344862834, + "grad_norm": 0.6121252775192261, + "learning_rate": 9.330260817653604e-05, + "loss": 0.8165, + "step": 52190 + }, + { + "epoch": 
0.3334909216360221, + "grad_norm": 0.770204484462738, + "learning_rate": 9.330009934856967e-05, + "loss": 1.0588, + "step": 52200 + }, + { + "epoch": 0.3335548087857608, + "grad_norm": 0.6882258057594299, + "learning_rate": 9.329759008453385e-05, + "loss": 0.7148, + "step": 52210 + }, + { + "epoch": 0.33361869593549953, + "grad_norm": 1.0399905443191528, + "learning_rate": 9.329508038445382e-05, + "loss": 0.7462, + "step": 52220 + }, + { + "epoch": 0.33368258308523824, + "grad_norm": 1.3440240621566772, + "learning_rate": 9.32925702483549e-05, + "loss": 0.8031, + "step": 52230 + }, + { + "epoch": 0.33374647023497694, + "grad_norm": 0.5900636911392212, + "learning_rate": 9.329005967626234e-05, + "loss": 0.8395, + "step": 52240 + }, + { + "epoch": 0.33381035738471565, + "grad_norm": 0.8768534660339355, + "learning_rate": 9.328754866820142e-05, + "loss": 1.068, + "step": 52250 + }, + { + "epoch": 0.33387424453445436, + "grad_norm": 1.3895585536956787, + "learning_rate": 9.328503722419744e-05, + "loss": 0.9927, + "step": 52260 + }, + { + "epoch": 0.33393813168419306, + "grad_norm": 0.9054799675941467, + "learning_rate": 9.328252534427568e-05, + "loss": 0.7075, + "step": 52270 + }, + { + "epoch": 0.33400201883393177, + "grad_norm": 0.9763078689575195, + "learning_rate": 9.328001302846145e-05, + "loss": 1.0979, + "step": 52280 + }, + { + "epoch": 0.3340659059836704, + "grad_norm": 0.9339504241943359, + "learning_rate": 9.327750027678005e-05, + "loss": 0.8496, + "step": 52290 + }, + { + "epoch": 0.3341297931334091, + "grad_norm": 1.0181572437286377, + "learning_rate": 9.327498708925677e-05, + "loss": 0.9902, + "step": 52300 + }, + { + "epoch": 0.33419368028314783, + "grad_norm": 0.7839872241020203, + "learning_rate": 9.327247346591694e-05, + "loss": 0.9367, + "step": 52310 + }, + { + "epoch": 0.33425756743288654, + "grad_norm": 0.5195721387863159, + "learning_rate": 9.326995940678587e-05, + "loss": 0.8934, + "step": 52320 + }, + { + "epoch": 0.33432145458262524, + 
"grad_norm": 0.8356695175170898, + "learning_rate": 9.326744491188888e-05, + "loss": 0.9758, + "step": 52330 + }, + { + "epoch": 0.33438534173236395, + "grad_norm": 1.7605299949645996, + "learning_rate": 9.326492998125128e-05, + "loss": 0.7481, + "step": 52340 + }, + { + "epoch": 0.33444922888210266, + "grad_norm": 1.1781415939331055, + "learning_rate": 9.326241461489839e-05, + "loss": 0.8391, + "step": 52350 + }, + { + "epoch": 0.33451311603184136, + "grad_norm": 1.0908046960830688, + "learning_rate": 9.325989881285559e-05, + "loss": 0.6456, + "step": 52360 + }, + { + "epoch": 0.33457700318158007, + "grad_norm": 0.6815122961997986, + "learning_rate": 9.325738257514816e-05, + "loss": 0.7305, + "step": 52370 + }, + { + "epoch": 0.3346408903313188, + "grad_norm": 0.8093428611755371, + "learning_rate": 9.325486590180149e-05, + "loss": 0.778, + "step": 52380 + }, + { + "epoch": 0.3347047774810575, + "grad_norm": 0.8250554800033569, + "learning_rate": 9.325234879284086e-05, + "loss": 0.6891, + "step": 52390 + }, + { + "epoch": 0.3347686646307962, + "grad_norm": 0.8145758509635925, + "learning_rate": 9.324983124829169e-05, + "loss": 0.925, + "step": 52400 + }, + { + "epoch": 0.33483255178053484, + "grad_norm": 0.7351551651954651, + "learning_rate": 9.324731326817928e-05, + "loss": 1.0542, + "step": 52410 + }, + { + "epoch": 0.33489643893027354, + "grad_norm": 0.9697402119636536, + "learning_rate": 9.324479485252904e-05, + "loss": 0.9113, + "step": 52420 + }, + { + "epoch": 0.33496032608001225, + "grad_norm": 0.8043109774589539, + "learning_rate": 9.324227600136628e-05, + "loss": 0.9284, + "step": 52430 + }, + { + "epoch": 0.33502421322975096, + "grad_norm": 0.6603406667709351, + "learning_rate": 9.32397567147164e-05, + "loss": 0.9909, + "step": 52440 + }, + { + "epoch": 0.33508810037948966, + "grad_norm": 0.5201127529144287, + "learning_rate": 9.323723699260476e-05, + "loss": 0.7742, + "step": 52450 + }, + { + "epoch": 0.33515198752922837, + "grad_norm": 
1.1055912971496582, + "learning_rate": 9.323471683505674e-05, + "loss": 0.8968, + "step": 52460 + }, + { + "epoch": 0.3352158746789671, + "grad_norm": 1.6916980743408203, + "learning_rate": 9.323219624209772e-05, + "loss": 0.8835, + "step": 52470 + }, + { + "epoch": 0.3352797618287058, + "grad_norm": 0.889218270778656, + "learning_rate": 9.322967521375307e-05, + "loss": 0.8445, + "step": 52480 + }, + { + "epoch": 0.3353436489784445, + "grad_norm": 0.9384592175483704, + "learning_rate": 9.32271537500482e-05, + "loss": 0.701, + "step": 52490 + }, + { + "epoch": 0.3354075361281832, + "grad_norm": 1.5930566787719727, + "learning_rate": 9.322463185100849e-05, + "loss": 0.8175, + "step": 52500 + }, + { + "epoch": 0.3354714232779219, + "grad_norm": 0.6644344925880432, + "learning_rate": 9.322210951665935e-05, + "loss": 1.0025, + "step": 52510 + }, + { + "epoch": 0.3355353104276606, + "grad_norm": 0.9203514456748962, + "learning_rate": 9.321958674702617e-05, + "loss": 0.7474, + "step": 52520 + }, + { + "epoch": 0.33559919757739926, + "grad_norm": 0.5946767330169678, + "learning_rate": 9.321706354213438e-05, + "loss": 0.7824, + "step": 52530 + }, + { + "epoch": 0.33566308472713796, + "grad_norm": 0.7163698077201843, + "learning_rate": 9.321453990200935e-05, + "loss": 0.778, + "step": 52540 + }, + { + "epoch": 0.33572697187687667, + "grad_norm": 0.8819127678871155, + "learning_rate": 9.321201582667653e-05, + "loss": 0.8468, + "step": 52550 + }, + { + "epoch": 0.3357908590266154, + "grad_norm": 0.8515467643737793, + "learning_rate": 9.320949131616132e-05, + "loss": 0.9673, + "step": 52560 + }, + { + "epoch": 0.3358547461763541, + "grad_norm": 0.7722886204719543, + "learning_rate": 9.320696637048915e-05, + "loss": 0.8924, + "step": 52570 + }, + { + "epoch": 0.3359186333260928, + "grad_norm": 0.7204701900482178, + "learning_rate": 9.320444098968545e-05, + "loss": 0.9221, + "step": 52580 + }, + { + "epoch": 0.3359825204758315, + "grad_norm": 1.010270595550537, + "learning_rate": 
9.320191517377566e-05, + "loss": 1.2194, + "step": 52590 + }, + { + "epoch": 0.3360464076255702, + "grad_norm": 0.7149573564529419, + "learning_rate": 9.319938892278519e-05, + "loss": 0.9444, + "step": 52600 + }, + { + "epoch": 0.3361102947753089, + "grad_norm": 0.908594012260437, + "learning_rate": 9.31968622367395e-05, + "loss": 0.8676, + "step": 52610 + }, + { + "epoch": 0.3361741819250476, + "grad_norm": 0.9813511371612549, + "learning_rate": 9.319433511566406e-05, + "loss": 1.0841, + "step": 52620 + }, + { + "epoch": 0.3362380690747863, + "grad_norm": 0.6576645374298096, + "learning_rate": 9.31918075595843e-05, + "loss": 0.968, + "step": 52630 + }, + { + "epoch": 0.336301956224525, + "grad_norm": 0.9117244482040405, + "learning_rate": 9.318927956852566e-05, + "loss": 0.9304, + "step": 52640 + }, + { + "epoch": 0.3363658433742637, + "grad_norm": 0.8400249481201172, + "learning_rate": 9.318675114251361e-05, + "loss": 1.0644, + "step": 52650 + }, + { + "epoch": 0.3364297305240024, + "grad_norm": 0.950006365776062, + "learning_rate": 9.31842222815736e-05, + "loss": 0.8254, + "step": 52660 + }, + { + "epoch": 0.3364936176737411, + "grad_norm": 0.9387775659561157, + "learning_rate": 9.318169298573112e-05, + "loss": 0.9178, + "step": 52670 + }, + { + "epoch": 0.3365575048234798, + "grad_norm": 0.5320703387260437, + "learning_rate": 9.317916325501165e-05, + "loss": 0.5815, + "step": 52680 + }, + { + "epoch": 0.3366213919732185, + "grad_norm": 1.047491192817688, + "learning_rate": 9.317663308944064e-05, + "loss": 1.2488, + "step": 52690 + }, + { + "epoch": 0.3366852791229572, + "grad_norm": 2.0185956954956055, + "learning_rate": 9.317410248904358e-05, + "loss": 0.8944, + "step": 52700 + }, + { + "epoch": 0.3367491662726959, + "grad_norm": 0.8261764049530029, + "learning_rate": 9.317157145384596e-05, + "loss": 0.818, + "step": 52710 + }, + { + "epoch": 0.3368130534224346, + "grad_norm": 0.9799476265907288, + "learning_rate": 9.316903998387326e-05, + "loss": 0.7601, + 
"step": 52720 + }, + { + "epoch": 0.3368769405721733, + "grad_norm": 0.8541726469993591, + "learning_rate": 9.3166508079151e-05, + "loss": 0.9305, + "step": 52730 + }, + { + "epoch": 0.33694082772191203, + "grad_norm": 0.7811595797538757, + "learning_rate": 9.316397573970464e-05, + "loss": 0.82, + "step": 52740 + }, + { + "epoch": 0.33700471487165073, + "grad_norm": 0.7470584511756897, + "learning_rate": 9.316144296555971e-05, + "loss": 0.8631, + "step": 52750 + }, + { + "epoch": 0.33706860202138944, + "grad_norm": 0.8616728782653809, + "learning_rate": 9.315890975674169e-05, + "loss": 0.8319, + "step": 52760 + }, + { + "epoch": 0.33713248917112815, + "grad_norm": 0.6505323052406311, + "learning_rate": 9.315637611327614e-05, + "loss": 0.9409, + "step": 52770 + }, + { + "epoch": 0.3371963763208668, + "grad_norm": 1.1408954858779907, + "learning_rate": 9.315384203518853e-05, + "loss": 0.8325, + "step": 52780 + }, + { + "epoch": 0.3372602634706055, + "grad_norm": 0.8268606066703796, + "learning_rate": 9.31513075225044e-05, + "loss": 0.947, + "step": 52790 + }, + { + "epoch": 0.3373241506203442, + "grad_norm": 1.4688328504562378, + "learning_rate": 9.314877257524928e-05, + "loss": 0.9942, + "step": 52800 + }, + { + "epoch": 0.3373880377700829, + "grad_norm": 0.8979589343070984, + "learning_rate": 9.314623719344869e-05, + "loss": 0.8571, + "step": 52810 + }, + { + "epoch": 0.3374519249198216, + "grad_norm": 0.6567512154579163, + "learning_rate": 9.314370137712816e-05, + "loss": 0.6655, + "step": 52820 + }, + { + "epoch": 0.33751581206956033, + "grad_norm": 0.8439179062843323, + "learning_rate": 9.314116512631324e-05, + "loss": 0.8662, + "step": 52830 + }, + { + "epoch": 0.33757969921929903, + "grad_norm": 0.7378790378570557, + "learning_rate": 9.313862844102946e-05, + "loss": 0.8929, + "step": 52840 + }, + { + "epoch": 0.33764358636903774, + "grad_norm": 0.5747960209846497, + "learning_rate": 9.313609132130235e-05, + "loss": 0.8469, + "step": 52850 + }, + { + "epoch": 
0.33770747351877645, + "grad_norm": 2.188962459564209, + "learning_rate": 9.313355376715751e-05, + "loss": 0.7715, + "step": 52860 + }, + { + "epoch": 0.33777136066851515, + "grad_norm": 1.6319129467010498, + "learning_rate": 9.313101577862046e-05, + "loss": 0.851, + "step": 52870 + }, + { + "epoch": 0.33783524781825386, + "grad_norm": 0.8833417892456055, + "learning_rate": 9.312847735571676e-05, + "loss": 0.9841, + "step": 52880 + }, + { + "epoch": 0.33789913496799256, + "grad_norm": 0.771787703037262, + "learning_rate": 9.312593849847198e-05, + "loss": 0.6755, + "step": 52890 + }, + { + "epoch": 0.3379630221177312, + "grad_norm": 1.289760947227478, + "learning_rate": 9.31233992069117e-05, + "loss": 1.1848, + "step": 52900 + }, + { + "epoch": 0.3380269092674699, + "grad_norm": 0.8547393083572388, + "learning_rate": 9.312085948106148e-05, + "loss": 1.0867, + "step": 52910 + }, + { + "epoch": 0.3380907964172086, + "grad_norm": 1.357723593711853, + "learning_rate": 9.311831932094691e-05, + "loss": 0.821, + "step": 52920 + }, + { + "epoch": 0.33815468356694733, + "grad_norm": 0.9254101514816284, + "learning_rate": 9.311577872659355e-05, + "loss": 1.188, + "step": 52930 + }, + { + "epoch": 0.33821857071668604, + "grad_norm": 0.9655906558036804, + "learning_rate": 9.311323769802701e-05, + "loss": 1.1519, + "step": 52940 + }, + { + "epoch": 0.33828245786642475, + "grad_norm": 0.9837827682495117, + "learning_rate": 9.311069623527285e-05, + "loss": 0.9612, + "step": 52950 + }, + { + "epoch": 0.33834634501616345, + "grad_norm": 0.7545758485794067, + "learning_rate": 9.310840854758487e-05, + "loss": 1.1672, + "step": 52960 + }, + { + "epoch": 0.33841023216590216, + "grad_norm": 0.9971650838851929, + "learning_rate": 9.31058662599448e-05, + "loss": 0.9516, + "step": 52970 + }, + { + "epoch": 0.33847411931564086, + "grad_norm": 0.8151521682739258, + "learning_rate": 9.310332353819136e-05, + "loss": 0.6755, + "step": 52980 + }, + { + "epoch": 0.33853800646537957, + "grad_norm": 
1.5848335027694702, + "learning_rate": 9.310078038235014e-05, + "loss": 0.8026, + "step": 52990 + }, + { + "epoch": 0.3386018936151183, + "grad_norm": 1.3594563007354736, + "learning_rate": 9.30982367924468e-05, + "loss": 0.6692, + "step": 53000 + }, + { + "epoch": 0.338665780764857, + "grad_norm": 1.4335222244262695, + "learning_rate": 9.309569276850692e-05, + "loss": 0.8874, + "step": 53010 + }, + { + "epoch": 0.33872966791459563, + "grad_norm": 1.4923986196517944, + "learning_rate": 9.309314831055615e-05, + "loss": 1.0218, + "step": 53020 + }, + { + "epoch": 0.33879355506433434, + "grad_norm": 0.6935365796089172, + "learning_rate": 9.309060341862008e-05, + "loss": 0.8023, + "step": 53030 + }, + { + "epoch": 0.33885744221407305, + "grad_norm": 1.7542939186096191, + "learning_rate": 9.308805809272434e-05, + "loss": 0.7334, + "step": 53040 + }, + { + "epoch": 0.33892132936381175, + "grad_norm": 2.078371286392212, + "learning_rate": 9.30855123328946e-05, + "loss": 0.9812, + "step": 53050 + }, + { + "epoch": 0.33898521651355046, + "grad_norm": 0.6690249443054199, + "learning_rate": 9.308296613915647e-05, + "loss": 0.9794, + "step": 53060 + }, + { + "epoch": 0.33904910366328916, + "grad_norm": 0.8142697215080261, + "learning_rate": 9.30804195115356e-05, + "loss": 0.9776, + "step": 53070 + }, + { + "epoch": 0.33911299081302787, + "grad_norm": 0.7654648423194885, + "learning_rate": 9.307787245005764e-05, + "loss": 0.842, + "step": 53080 + }, + { + "epoch": 0.3391768779627666, + "grad_norm": 0.5504037141799927, + "learning_rate": 9.307532495474822e-05, + "loss": 0.7776, + "step": 53090 + }, + { + "epoch": 0.3392407651125053, + "grad_norm": 1.0997267961502075, + "learning_rate": 9.307277702563302e-05, + "loss": 0.685, + "step": 53100 + }, + { + "epoch": 0.339304652262244, + "grad_norm": 0.9791783690452576, + "learning_rate": 9.307022866273771e-05, + "loss": 1.1581, + "step": 53110 + }, + { + "epoch": 0.3393685394119827, + "grad_norm": 0.6219057440757751, + 
"learning_rate": 9.306767986608791e-05, + "loss": 0.9069, + "step": 53120 + }, + { + "epoch": 0.3394324265617214, + "grad_norm": 0.5955463647842407, + "learning_rate": 9.306513063570933e-05, + "loss": 1.112, + "step": 53130 + }, + { + "epoch": 0.33949631371146005, + "grad_norm": 0.7455695867538452, + "learning_rate": 9.306258097162763e-05, + "loss": 0.7857, + "step": 53140 + }, + { + "epoch": 0.33956020086119876, + "grad_norm": 0.9764438271522522, + "learning_rate": 9.306003087386848e-05, + "loss": 0.9552, + "step": 53150 + }, + { + "epoch": 0.33962408801093746, + "grad_norm": 0.6675849556922913, + "learning_rate": 9.305748034245756e-05, + "loss": 0.7883, + "step": 53160 + }, + { + "epoch": 0.33968797516067617, + "grad_norm": 0.9111708998680115, + "learning_rate": 9.305492937742057e-05, + "loss": 0.8918, + "step": 53170 + }, + { + "epoch": 0.3397518623104149, + "grad_norm": 2.7284460067749023, + "learning_rate": 9.30523779787832e-05, + "loss": 0.8114, + "step": 53180 + }, + { + "epoch": 0.3398157494601536, + "grad_norm": 0.586710512638092, + "learning_rate": 9.304982614657114e-05, + "loss": 0.856, + "step": 53190 + }, + { + "epoch": 0.3398796366098923, + "grad_norm": 0.644350528717041, + "learning_rate": 9.304727388081007e-05, + "loss": 0.8175, + "step": 53200 + }, + { + "epoch": 0.339943523759631, + "grad_norm": 0.6203905940055847, + "learning_rate": 9.304472118152572e-05, + "loss": 1.0128, + "step": 53210 + }, + { + "epoch": 0.3400074109093697, + "grad_norm": 0.840505063533783, + "learning_rate": 9.304216804874379e-05, + "loss": 0.8672, + "step": 53220 + }, + { + "epoch": 0.3400712980591084, + "grad_norm": 0.750717282295227, + "learning_rate": 9.303961448248998e-05, + "loss": 0.8607, + "step": 53230 + }, + { + "epoch": 0.3401351852088471, + "grad_norm": 0.7886949181556702, + "learning_rate": 9.303706048279004e-05, + "loss": 1.2132, + "step": 53240 + }, + { + "epoch": 0.3401990723585858, + "grad_norm": 0.9253231883049011, + "learning_rate": 9.303450604966966e-05, 
+ "loss": 1.0289, + "step": 53250 + }, + { + "epoch": 0.34026295950832447, + "grad_norm": 1.0587670803070068, + "learning_rate": 9.303195118315455e-05, + "loss": 0.9249, + "step": 53260 + }, + { + "epoch": 0.3403268466580632, + "grad_norm": 1.1579573154449463, + "learning_rate": 9.302939588327048e-05, + "loss": 0.8702, + "step": 53270 + }, + { + "epoch": 0.3403907338078019, + "grad_norm": 1.4637956619262695, + "learning_rate": 9.302684015004318e-05, + "loss": 0.8417, + "step": 53280 + }, + { + "epoch": 0.3404546209575406, + "grad_norm": 2.425816774368286, + "learning_rate": 9.302428398349836e-05, + "loss": 0.8657, + "step": 53290 + }, + { + "epoch": 0.3405185081072793, + "grad_norm": 0.530267596244812, + "learning_rate": 9.30217273836618e-05, + "loss": 0.9432, + "step": 53300 + }, + { + "epoch": 0.340582395257018, + "grad_norm": 1.081075668334961, + "learning_rate": 9.30191703505592e-05, + "loss": 1.262, + "step": 53310 + }, + { + "epoch": 0.3406462824067567, + "grad_norm": 0.7147884964942932, + "learning_rate": 9.301661288421636e-05, + "loss": 0.8376, + "step": 53320 + }, + { + "epoch": 0.3407101695564954, + "grad_norm": 0.8092734217643738, + "learning_rate": 9.301405498465901e-05, + "loss": 0.8306, + "step": 53330 + }, + { + "epoch": 0.3407740567062341, + "grad_norm": 1.257656216621399, + "learning_rate": 9.30114966519129e-05, + "loss": 0.6576, + "step": 53340 + }, + { + "epoch": 0.3408379438559728, + "grad_norm": 0.7588216066360474, + "learning_rate": 9.30089378860038e-05, + "loss": 0.8001, + "step": 53350 + }, + { + "epoch": 0.34090183100571153, + "grad_norm": 2.2834153175354004, + "learning_rate": 9.300637868695752e-05, + "loss": 0.7371, + "step": 53360 + }, + { + "epoch": 0.34096571815545024, + "grad_norm": 1.2148463726043701, + "learning_rate": 9.300381905479978e-05, + "loss": 0.7611, + "step": 53370 + }, + { + "epoch": 0.3410296053051889, + "grad_norm": 0.7011250853538513, + "learning_rate": 9.300125898955639e-05, + "loss": 0.7491, + "step": 53380 + }, + { 
+ "epoch": 0.3410934924549276, + "grad_norm": 0.9669275879859924, + "learning_rate": 9.299869849125311e-05, + "loss": 0.9306, + "step": 53390 + }, + { + "epoch": 0.3411573796046663, + "grad_norm": 0.8897387981414795, + "learning_rate": 9.299613755991573e-05, + "loss": 1.0307, + "step": 53400 + }, + { + "epoch": 0.341221266754405, + "grad_norm": 0.9630199670791626, + "learning_rate": 9.299357619557005e-05, + "loss": 1.1292, + "step": 53410 + }, + { + "epoch": 0.3412851539041437, + "grad_norm": 0.8969447016716003, + "learning_rate": 9.299101439824188e-05, + "loss": 0.9029, + "step": 53420 + }, + { + "epoch": 0.3413490410538824, + "grad_norm": 1.110783338546753, + "learning_rate": 9.298845216795699e-05, + "loss": 0.9651, + "step": 53430 + }, + { + "epoch": 0.3414129282036211, + "grad_norm": 0.8235384225845337, + "learning_rate": 9.29858895047412e-05, + "loss": 0.5702, + "step": 53440 + }, + { + "epoch": 0.34147681535335983, + "grad_norm": 1.1357210874557495, + "learning_rate": 9.298332640862032e-05, + "loss": 0.8345, + "step": 53450 + }, + { + "epoch": 0.34154070250309854, + "grad_norm": 0.7951391935348511, + "learning_rate": 9.298076287962016e-05, + "loss": 0.7113, + "step": 53460 + }, + { + "epoch": 0.34160458965283724, + "grad_norm": 0.9098735451698303, + "learning_rate": 9.297819891776651e-05, + "loss": 0.9365, + "step": 53470 + }, + { + "epoch": 0.34166847680257595, + "grad_norm": 0.6273751854896545, + "learning_rate": 9.297563452308525e-05, + "loss": 0.7352, + "step": 53480 + }, + { + "epoch": 0.34173236395231466, + "grad_norm": 0.4580266773700714, + "learning_rate": 9.297306969560213e-05, + "loss": 0.9588, + "step": 53490 + }, + { + "epoch": 0.3417962511020533, + "grad_norm": 1.1689975261688232, + "learning_rate": 9.297050443534305e-05, + "loss": 0.7314, + "step": 53500 + }, + { + "epoch": 0.341860138251792, + "grad_norm": 0.8858540058135986, + "learning_rate": 9.29679387423338e-05, + "loss": 0.8758, + "step": 53510 + }, + { + "epoch": 0.3419240254015307, + 
"grad_norm": 0.7352036833763123, + "learning_rate": 9.296537261660026e-05, + "loss": 0.9193, + "step": 53520 + }, + { + "epoch": 0.3419879125512694, + "grad_norm": 1.1787981986999512, + "learning_rate": 9.296280605816823e-05, + "loss": 0.853, + "step": 53530 + }, + { + "epoch": 0.34205179970100813, + "grad_norm": 0.8490791320800781, + "learning_rate": 9.296023906706357e-05, + "loss": 1.2468, + "step": 53540 + }, + { + "epoch": 0.34211568685074684, + "grad_norm": 1.1873284578323364, + "learning_rate": 9.295767164331215e-05, + "loss": 1.1106, + "step": 53550 + }, + { + "epoch": 0.34217957400048554, + "grad_norm": 1.3740506172180176, + "learning_rate": 9.29551037869398e-05, + "loss": 0.8165, + "step": 53560 + }, + { + "epoch": 0.34224346115022425, + "grad_norm": 1.074511170387268, + "learning_rate": 9.295253549797241e-05, + "loss": 0.8433, + "step": 53570 + }, + { + "epoch": 0.34230734829996295, + "grad_norm": 0.9406700134277344, + "learning_rate": 9.294996677643581e-05, + "loss": 0.844, + "step": 53580 + }, + { + "epoch": 0.34237123544970166, + "grad_norm": 0.9452252984046936, + "learning_rate": 9.294739762235589e-05, + "loss": 0.7768, + "step": 53590 + }, + { + "epoch": 0.34243512259944037, + "grad_norm": 0.8128929734230042, + "learning_rate": 9.294482803575853e-05, + "loss": 0.622, + "step": 53600 + }, + { + "epoch": 0.3424990097491791, + "grad_norm": 0.825412392616272, + "learning_rate": 9.294225801666959e-05, + "loss": 1.0291, + "step": 53610 + }, + { + "epoch": 0.3425628968989178, + "grad_norm": 1.06623113155365, + "learning_rate": 9.293968756511496e-05, + "loss": 1.0841, + "step": 53620 + }, + { + "epoch": 0.34262678404865643, + "grad_norm": 0.981828510761261, + "learning_rate": 9.293711668112054e-05, + "loss": 0.8458, + "step": 53630 + }, + { + "epoch": 0.34269067119839514, + "grad_norm": 1.0561970472335815, + "learning_rate": 9.29345453647122e-05, + "loss": 0.7624, + "step": 53640 + }, + { + "epoch": 0.34275455834813384, + "grad_norm": 0.7628150582313538, + 
"learning_rate": 9.293197361591586e-05, + "loss": 0.8328, + "step": 53650 + }, + { + "epoch": 0.34281844549787255, + "grad_norm": 0.9464593529701233, + "learning_rate": 9.292940143475737e-05, + "loss": 0.9501, + "step": 53660 + }, + { + "epoch": 0.34288233264761125, + "grad_norm": 2.0435502529144287, + "learning_rate": 9.292682882126272e-05, + "loss": 0.864, + "step": 53670 + }, + { + "epoch": 0.34294621979734996, + "grad_norm": 1.0263941287994385, + "learning_rate": 9.292425577545772e-05, + "loss": 0.8141, + "step": 53680 + }, + { + "epoch": 0.34301010694708867, + "grad_norm": 0.7042751908302307, + "learning_rate": 9.292168229736836e-05, + "loss": 0.7909, + "step": 53690 + }, + { + "epoch": 0.3430739940968274, + "grad_norm": 1.1945339441299438, + "learning_rate": 9.29191083870205e-05, + "loss": 0.9799, + "step": 53700 + }, + { + "epoch": 0.3431378812465661, + "grad_norm": 0.965678870677948, + "learning_rate": 9.29165340444401e-05, + "loss": 0.8093, + "step": 53710 + }, + { + "epoch": 0.3432017683963048, + "grad_norm": 2.425915241241455, + "learning_rate": 9.291395926965307e-05, + "loss": 1.0032, + "step": 53720 + }, + { + "epoch": 0.3432656555460435, + "grad_norm": 0.5332554578781128, + "learning_rate": 9.291138406268536e-05, + "loss": 0.7977, + "step": 53730 + }, + { + "epoch": 0.3433295426957822, + "grad_norm": 2.8045296669006348, + "learning_rate": 9.290880842356287e-05, + "loss": 0.9274, + "step": 53740 + }, + { + "epoch": 0.34339342984552085, + "grad_norm": 0.7845577597618103, + "learning_rate": 9.290623235231157e-05, + "loss": 1.0535, + "step": 53750 + }, + { + "epoch": 0.34345731699525955, + "grad_norm": 0.9177809953689575, + "learning_rate": 9.290365584895739e-05, + "loss": 0.9278, + "step": 53760 + }, + { + "epoch": 0.34352120414499826, + "grad_norm": 0.9220765829086304, + "learning_rate": 9.290107891352628e-05, + "loss": 0.945, + "step": 53770 + }, + { + "epoch": 0.34358509129473697, + "grad_norm": 0.8571166396141052, + "learning_rate": 
9.289850154604417e-05, + "loss": 0.903, + "step": 53780 + }, + { + "epoch": 0.3436489784444757, + "grad_norm": 0.8738123178482056, + "learning_rate": 9.289592374653708e-05, + "loss": 0.9928, + "step": 53790 + }, + { + "epoch": 0.3437128655942144, + "grad_norm": 0.7225977778434753, + "learning_rate": 9.28933455150309e-05, + "loss": 0.8828, + "step": 53800 + }, + { + "epoch": 0.3437767527439531, + "grad_norm": 1.3303672075271606, + "learning_rate": 9.289076685155162e-05, + "loss": 1.0604, + "step": 53810 + }, + { + "epoch": 0.3438406398936918, + "grad_norm": 0.8628764152526855, + "learning_rate": 9.28881877561252e-05, + "loss": 1.1439, + "step": 53820 + }, + { + "epoch": 0.3439045270434305, + "grad_norm": 0.6281081438064575, + "learning_rate": 9.288560822877765e-05, + "loss": 0.9286, + "step": 53830 + }, + { + "epoch": 0.3439684141931692, + "grad_norm": 0.6044685244560242, + "learning_rate": 9.288302826953492e-05, + "loss": 1.1626, + "step": 53840 + }, + { + "epoch": 0.3440323013429079, + "grad_norm": 0.847324788570404, + "learning_rate": 9.288044787842298e-05, + "loss": 0.7661, + "step": 53850 + }, + { + "epoch": 0.3440961884926466, + "grad_norm": 0.9134111404418945, + "learning_rate": 9.287786705546785e-05, + "loss": 0.7944, + "step": 53860 + }, + { + "epoch": 0.34416007564238527, + "grad_norm": 1.3941556215286255, + "learning_rate": 9.287528580069551e-05, + "loss": 1.2369, + "step": 53870 + }, + { + "epoch": 0.34422396279212397, + "grad_norm": 0.8589109182357788, + "learning_rate": 9.287270411413194e-05, + "loss": 0.6585, + "step": 53880 + }, + { + "epoch": 0.3442878499418627, + "grad_norm": 0.49347206950187683, + "learning_rate": 9.287012199580315e-05, + "loss": 0.8574, + "step": 53890 + }, + { + "epoch": 0.3443517370916014, + "grad_norm": 0.9737316370010376, + "learning_rate": 9.286753944573514e-05, + "loss": 0.6949, + "step": 53900 + }, + { + "epoch": 0.3444156242413401, + "grad_norm": 1.0737287998199463, + "learning_rate": 9.286495646395392e-05, + "loss": 
0.9367, + "step": 53910 + }, + { + "epoch": 0.3444795113910788, + "grad_norm": 0.9133766293525696, + "learning_rate": 9.28623730504855e-05, + "loss": 0.7619, + "step": 53920 + }, + { + "epoch": 0.3445433985408175, + "grad_norm": 0.784355640411377, + "learning_rate": 9.285978920535592e-05, + "loss": 1.0223, + "step": 53930 + }, + { + "epoch": 0.3446072856905562, + "grad_norm": 0.7285311818122864, + "learning_rate": 9.285720492859118e-05, + "loss": 0.9259, + "step": 53940 + }, + { + "epoch": 0.3446711728402949, + "grad_norm": 0.8762960433959961, + "learning_rate": 9.28546202202173e-05, + "loss": 0.8949, + "step": 53950 + }, + { + "epoch": 0.3447350599900336, + "grad_norm": 0.8869500756263733, + "learning_rate": 9.285203508026032e-05, + "loss": 0.8522, + "step": 53960 + }, + { + "epoch": 0.3447989471397723, + "grad_norm": 0.7807958722114563, + "learning_rate": 9.284944950874628e-05, + "loss": 1.0629, + "step": 53970 + }, + { + "epoch": 0.34486283428951103, + "grad_norm": 2.027085542678833, + "learning_rate": 9.284686350570121e-05, + "loss": 0.9566, + "step": 53980 + }, + { + "epoch": 0.3449267214392497, + "grad_norm": 1.2319154739379883, + "learning_rate": 9.284427707115116e-05, + "loss": 0.718, + "step": 53990 + }, + { + "epoch": 0.3449906085889884, + "grad_norm": 0.5686825513839722, + "learning_rate": 9.284169020512217e-05, + "loss": 0.7659, + "step": 54000 + }, + { + "epoch": 0.3450544957387271, + "grad_norm": 0.9346210956573486, + "learning_rate": 9.283910290764029e-05, + "loss": 0.8805, + "step": 54010 + }, + { + "epoch": 0.3451183828884658, + "grad_norm": 1.0254408121109009, + "learning_rate": 9.28365151787316e-05, + "loss": 1.0447, + "step": 54020 + }, + { + "epoch": 0.3451822700382045, + "grad_norm": 1.1026064157485962, + "learning_rate": 9.283392701842213e-05, + "loss": 1.11, + "step": 54030 + }, + { + "epoch": 0.3452461571879432, + "grad_norm": 1.1791328191757202, + "learning_rate": 9.283133842673797e-05, + "loss": 0.9846, + "step": 54040 + }, + { + "epoch": 
0.3453100443376819, + "grad_norm": 0.6459341049194336, + "learning_rate": 9.282874940370517e-05, + "loss": 0.8446, + "step": 54050 + }, + { + "epoch": 0.3453739314874206, + "grad_norm": 0.7654846906661987, + "learning_rate": 9.282615994934982e-05, + "loss": 1.1735, + "step": 54060 + }, + { + "epoch": 0.34543781863715933, + "grad_norm": 1.2747883796691895, + "learning_rate": 9.282357006369798e-05, + "loss": 0.9468, + "step": 54070 + }, + { + "epoch": 0.34550170578689804, + "grad_norm": 0.5862970352172852, + "learning_rate": 9.282097974677574e-05, + "loss": 0.5708, + "step": 54080 + }, + { + "epoch": 0.34556559293663675, + "grad_norm": 1.1748859882354736, + "learning_rate": 9.28183889986092e-05, + "loss": 0.9329, + "step": 54090 + }, + { + "epoch": 0.34562948008637545, + "grad_norm": 0.7171411514282227, + "learning_rate": 9.281579781922442e-05, + "loss": 1.0105, + "step": 54100 + }, + { + "epoch": 0.3456933672361141, + "grad_norm": 0.8102126717567444, + "learning_rate": 9.281320620864754e-05, + "loss": 0.8918, + "step": 54110 + }, + { + "epoch": 0.3457572543858528, + "grad_norm": 1.1540294885635376, + "learning_rate": 9.281061416690462e-05, + "loss": 0.794, + "step": 54120 + }, + { + "epoch": 0.3458211415355915, + "grad_norm": 0.4848040044307709, + "learning_rate": 9.280802169402178e-05, + "loss": 0.7435, + "step": 54130 + }, + { + "epoch": 0.3458850286853302, + "grad_norm": 1.207207202911377, + "learning_rate": 9.280542879002512e-05, + "loss": 0.9234, + "step": 54140 + }, + { + "epoch": 0.3459489158350689, + "grad_norm": 0.7210013270378113, + "learning_rate": 9.280283545494077e-05, + "loss": 0.942, + "step": 54150 + }, + { + "epoch": 0.34601280298480763, + "grad_norm": 2.0840461254119873, + "learning_rate": 9.280024168879482e-05, + "loss": 0.828, + "step": 54160 + }, + { + "epoch": 0.34607669013454634, + "grad_norm": 0.8813756704330444, + "learning_rate": 9.279764749161344e-05, + "loss": 0.8051, + "step": 54170 + }, + { + "epoch": 0.34614057728428504, + "grad_norm": 
0.606823205947876, + "learning_rate": 9.27950528634227e-05, + "loss": 1.0424, + "step": 54180 + }, + { + "epoch": 0.34620446443402375, + "grad_norm": 0.9201170206069946, + "learning_rate": 9.279245780424876e-05, + "loss": 0.772, + "step": 54190 + }, + { + "epoch": 0.34626835158376246, + "grad_norm": 0.9958915710449219, + "learning_rate": 9.278986231411776e-05, + "loss": 0.6918, + "step": 54200 + }, + { + "epoch": 0.34633223873350116, + "grad_norm": 0.7050455212593079, + "learning_rate": 9.278726639305581e-05, + "loss": 0.7851, + "step": 54210 + }, + { + "epoch": 0.34639612588323987, + "grad_norm": 0.9351766109466553, + "learning_rate": 9.27846700410891e-05, + "loss": 0.796, + "step": 54220 + }, + { + "epoch": 0.3464600130329785, + "grad_norm": 2.2169976234436035, + "learning_rate": 9.278207325824373e-05, + "loss": 0.9384, + "step": 54230 + }, + { + "epoch": 0.3465239001827172, + "grad_norm": 1.0841212272644043, + "learning_rate": 9.277947604454587e-05, + "loss": 0.996, + "step": 54240 + }, + { + "epoch": 0.34658778733245593, + "grad_norm": 1.3543506860733032, + "learning_rate": 9.277687840002167e-05, + "loss": 0.7683, + "step": 54250 + }, + { + "epoch": 0.34665167448219464, + "grad_norm": 0.7349464297294617, + "learning_rate": 9.277428032469731e-05, + "loss": 0.9666, + "step": 54260 + }, + { + "epoch": 0.34671556163193334, + "grad_norm": 1.0820789337158203, + "learning_rate": 9.277168181859893e-05, + "loss": 0.6931, + "step": 54270 + }, + { + "epoch": 0.34677944878167205, + "grad_norm": 0.6938410997390747, + "learning_rate": 9.276908288175272e-05, + "loss": 0.8715, + "step": 54280 + }, + { + "epoch": 0.34684333593141076, + "grad_norm": 0.7121148109436035, + "learning_rate": 9.276648351418484e-05, + "loss": 0.8973, + "step": 54290 + }, + { + "epoch": 0.34690722308114946, + "grad_norm": 0.8023224472999573, + "learning_rate": 9.276388371592149e-05, + "loss": 0.8985, + "step": 54300 + }, + { + "epoch": 0.34697111023088817, + "grad_norm": 1.1684279441833496, + 
"learning_rate": 9.276128348698881e-05, + "loss": 0.8147, + "step": 54310 + }, + { + "epoch": 0.3470349973806269, + "grad_norm": 1.3102762699127197, + "learning_rate": 9.275868282741303e-05, + "loss": 0.8545, + "step": 54320 + }, + { + "epoch": 0.3470988845303656, + "grad_norm": 0.8226547837257385, + "learning_rate": 9.27560817372203e-05, + "loss": 0.6908, + "step": 54330 + }, + { + "epoch": 0.3471627716801043, + "grad_norm": 1.1584205627441406, + "learning_rate": 9.275348021643686e-05, + "loss": 0.8704, + "step": 54340 + }, + { + "epoch": 0.34722665882984294, + "grad_norm": 0.852271556854248, + "learning_rate": 9.275087826508887e-05, + "loss": 0.8696, + "step": 54350 + }, + { + "epoch": 0.34729054597958164, + "grad_norm": 2.0320937633514404, + "learning_rate": 9.274827588320257e-05, + "loss": 0.6919, + "step": 54360 + }, + { + "epoch": 0.34735443312932035, + "grad_norm": 0.7250359058380127, + "learning_rate": 9.274567307080412e-05, + "loss": 0.8589, + "step": 54370 + }, + { + "epoch": 0.34741832027905906, + "grad_norm": 1.2491799592971802, + "learning_rate": 9.27430698279198e-05, + "loss": 0.8009, + "step": 54380 + }, + { + "epoch": 0.34748220742879776, + "grad_norm": 0.9660385251045227, + "learning_rate": 9.274046615457577e-05, + "loss": 1.0643, + "step": 54390 + }, + { + "epoch": 0.34754609457853647, + "grad_norm": 0.9620506167411804, + "learning_rate": 9.273786205079826e-05, + "loss": 0.8099, + "step": 54400 + }, + { + "epoch": 0.3476099817282752, + "grad_norm": 0.6800320744514465, + "learning_rate": 9.273525751661353e-05, + "loss": 1.039, + "step": 54410 + }, + { + "epoch": 0.3476738688780139, + "grad_norm": 0.6459980010986328, + "learning_rate": 9.273265255204778e-05, + "loss": 0.94, + "step": 54420 + }, + { + "epoch": 0.3477377560277526, + "grad_norm": 0.5387960076332092, + "learning_rate": 9.273004715712723e-05, + "loss": 0.9139, + "step": 54430 + }, + { + "epoch": 0.3478016431774913, + "grad_norm": 0.9442420601844788, + "learning_rate": 
9.272744133187816e-05, + "loss": 0.813, + "step": 54440 + }, + { + "epoch": 0.34786553032723, + "grad_norm": 0.6634787321090698, + "learning_rate": 9.272483507632676e-05, + "loss": 0.6832, + "step": 54450 + }, + { + "epoch": 0.3479294174769687, + "grad_norm": 0.7288976311683655, + "learning_rate": 9.272222839049933e-05, + "loss": 0.873, + "step": 54460 + }, + { + "epoch": 0.3479933046267074, + "grad_norm": 1.1111667156219482, + "learning_rate": 9.27196212744221e-05, + "loss": 1.1114, + "step": 54470 + }, + { + "epoch": 0.34805719177644606, + "grad_norm": 1.427985668182373, + "learning_rate": 9.271701372812134e-05, + "loss": 0.8317, + "step": 54480 + }, + { + "epoch": 0.34812107892618477, + "grad_norm": 0.5816881656646729, + "learning_rate": 9.271440575162328e-05, + "loss": 0.9996, + "step": 54490 + }, + { + "epoch": 0.3481849660759235, + "grad_norm": 1.1159511804580688, + "learning_rate": 9.27117973449542e-05, + "loss": 0.8265, + "step": 54500 + }, + { + "epoch": 0.3482488532256622, + "grad_norm": 1.1096454858779907, + "learning_rate": 9.270918850814037e-05, + "loss": 0.9171, + "step": 54510 + }, + { + "epoch": 0.3483127403754009, + "grad_norm": 0.5924681425094604, + "learning_rate": 9.270657924120808e-05, + "loss": 0.9957, + "step": 54520 + }, + { + "epoch": 0.3483766275251396, + "grad_norm": 3.2207529544830322, + "learning_rate": 9.270396954418357e-05, + "loss": 1.1604, + "step": 54530 + }, + { + "epoch": 0.3484405146748783, + "grad_norm": 1.6686917543411255, + "learning_rate": 9.270135941709315e-05, + "loss": 0.7374, + "step": 54540 + }, + { + "epoch": 0.348504401824617, + "grad_norm": 0.8895770907402039, + "learning_rate": 9.26987488599631e-05, + "loss": 0.8122, + "step": 54550 + }, + { + "epoch": 0.3485682889743557, + "grad_norm": 0.9306029081344604, + "learning_rate": 9.26961378728197e-05, + "loss": 0.7176, + "step": 54560 + }, + { + "epoch": 0.3486321761240944, + "grad_norm": 1.2075837850570679, + "learning_rate": 9.269352645568927e-05, + "loss": 0.8263, + 
"step": 54570 + }, + { + "epoch": 0.3486960632738331, + "grad_norm": 0.7406107187271118, + "learning_rate": 9.269091460859807e-05, + "loss": 0.8766, + "step": 54580 + }, + { + "epoch": 0.34875995042357183, + "grad_norm": 0.717327892780304, + "learning_rate": 9.268830233157245e-05, + "loss": 1.2179, + "step": 54590 + }, + { + "epoch": 0.3488238375733105, + "grad_norm": 0.9631721377372742, + "learning_rate": 9.268568962463868e-05, + "loss": 0.9515, + "step": 54600 + }, + { + "epoch": 0.3488877247230492, + "grad_norm": 0.9041351675987244, + "learning_rate": 9.26830764878231e-05, + "loss": 1.1696, + "step": 54610 + }, + { + "epoch": 0.3489516118727879, + "grad_norm": 0.8273685574531555, + "learning_rate": 9.2680462921152e-05, + "loss": 1.1219, + "step": 54620 + }, + { + "epoch": 0.3490154990225266, + "grad_norm": 0.5111979842185974, + "learning_rate": 9.267784892465172e-05, + "loss": 0.8967, + "step": 54630 + }, + { + "epoch": 0.3490793861722653, + "grad_norm": 0.8033791184425354, + "learning_rate": 9.267523449834858e-05, + "loss": 0.8957, + "step": 54640 + }, + { + "epoch": 0.349143273322004, + "grad_norm": 0.8571832776069641, + "learning_rate": 9.267261964226892e-05, + "loss": 1.0502, + "step": 54650 + }, + { + "epoch": 0.3492071604717427, + "grad_norm": 0.4170287549495697, + "learning_rate": 9.267000435643904e-05, + "loss": 0.8696, + "step": 54660 + }, + { + "epoch": 0.3492710476214814, + "grad_norm": 0.687233567237854, + "learning_rate": 9.266738864088533e-05, + "loss": 0.787, + "step": 54670 + }, + { + "epoch": 0.34933493477122013, + "grad_norm": 0.8800210356712341, + "learning_rate": 9.266477249563408e-05, + "loss": 0.8221, + "step": 54680 + }, + { + "epoch": 0.34939882192095884, + "grad_norm": 0.8803540468215942, + "learning_rate": 9.266215592071167e-05, + "loss": 1.0652, + "step": 54690 + }, + { + "epoch": 0.34946270907069754, + "grad_norm": 0.5410533547401428, + "learning_rate": 9.265953891614445e-05, + "loss": 0.9378, + "step": 54700 + }, + { + "epoch": 
0.34952659622043625, + "grad_norm": 0.5955383777618408, + "learning_rate": 9.265692148195875e-05, + "loss": 0.8833, + "step": 54710 + }, + { + "epoch": 0.3495904833701749, + "grad_norm": 0.633705735206604, + "learning_rate": 9.265430361818096e-05, + "loss": 1.004, + "step": 54720 + }, + { + "epoch": 0.3496543705199136, + "grad_norm": 0.9979560971260071, + "learning_rate": 9.265168532483744e-05, + "loss": 0.8923, + "step": 54730 + }, + { + "epoch": 0.3497182576696523, + "grad_norm": 0.5315431952476501, + "learning_rate": 9.264906660195453e-05, + "loss": 0.7914, + "step": 54740 + }, + { + "epoch": 0.349782144819391, + "grad_norm": 1.4878370761871338, + "learning_rate": 9.264644744955863e-05, + "loss": 1.0878, + "step": 54750 + }, + { + "epoch": 0.3498460319691297, + "grad_norm": 0.9964064359664917, + "learning_rate": 9.264382786767612e-05, + "loss": 1.1167, + "step": 54760 + }, + { + "epoch": 0.34990991911886843, + "grad_norm": 0.8638894557952881, + "learning_rate": 9.264120785633335e-05, + "loss": 0.7903, + "step": 54770 + }, + { + "epoch": 0.34997380626860713, + "grad_norm": 0.7577997446060181, + "learning_rate": 9.263858741555674e-05, + "loss": 0.86, + "step": 54780 + }, + { + "epoch": 0.35003769341834584, + "grad_norm": 0.9834237694740295, + "learning_rate": 9.263596654537265e-05, + "loss": 0.8051, + "step": 54790 + }, + { + "epoch": 0.35010158056808455, + "grad_norm": 0.9026603102684021, + "learning_rate": 9.263334524580751e-05, + "loss": 0.9596, + "step": 54800 + }, + { + "epoch": 0.35016546771782325, + "grad_norm": 1.9557400941848755, + "learning_rate": 9.26307235168877e-05, + "loss": 0.9289, + "step": 54810 + }, + { + "epoch": 0.35022935486756196, + "grad_norm": 0.5680462718009949, + "learning_rate": 9.262810135863962e-05, + "loss": 1.1719, + "step": 54820 + }, + { + "epoch": 0.35029324201730067, + "grad_norm": 1.077825665473938, + "learning_rate": 9.26254787710897e-05, + "loss": 0.8691, + "step": 54830 + }, + { + "epoch": 0.3503571291670393, + "grad_norm": 
1.1171085834503174, + "learning_rate": 9.262285575426431e-05, + "loss": 0.7501, + "step": 54840 + }, + { + "epoch": 0.350421016316778, + "grad_norm": 0.8400352001190186, + "learning_rate": 9.262023230818987e-05, + "loss": 0.8568, + "step": 54850 + }, + { + "epoch": 0.35048490346651673, + "grad_norm": 0.8587310910224915, + "learning_rate": 9.261760843289284e-05, + "loss": 1.1126, + "step": 54860 + }, + { + "epoch": 0.35054879061625543, + "grad_norm": 0.979992687702179, + "learning_rate": 9.261498412839963e-05, + "loss": 0.7968, + "step": 54870 + }, + { + "epoch": 0.35061267776599414, + "grad_norm": 1.4268198013305664, + "learning_rate": 9.261235939473665e-05, + "loss": 0.9709, + "step": 54880 + }, + { + "epoch": 0.35067656491573285, + "grad_norm": 0.8531477451324463, + "learning_rate": 9.260973423193036e-05, + "loss": 1.1384, + "step": 54890 + }, + { + "epoch": 0.35074045206547155, + "grad_norm": 0.8192383050918579, + "learning_rate": 9.260710864000718e-05, + "loss": 0.9567, + "step": 54900 + }, + { + "epoch": 0.35080433921521026, + "grad_norm": 0.6545119881629944, + "learning_rate": 9.260448261899355e-05, + "loss": 0.8276, + "step": 54910 + }, + { + "epoch": 0.35086822636494897, + "grad_norm": 1.2469779253005981, + "learning_rate": 9.260185616891592e-05, + "loss": 0.9101, + "step": 54920 + }, + { + "epoch": 0.35093211351468767, + "grad_norm": 1.0227653980255127, + "learning_rate": 9.259922928980075e-05, + "loss": 1.0185, + "step": 54930 + }, + { + "epoch": 0.3509960006644264, + "grad_norm": 0.8625701665878296, + "learning_rate": 9.259660198167449e-05, + "loss": 1.0336, + "step": 54940 + }, + { + "epoch": 0.3510598878141651, + "grad_norm": 1.399640679359436, + "learning_rate": 9.259397424456359e-05, + "loss": 0.9261, + "step": 54950 + }, + { + "epoch": 0.35112377496390373, + "grad_norm": 1.6561399698257446, + "learning_rate": 9.259134607849451e-05, + "loss": 0.8661, + "step": 54960 + }, + { + "epoch": 0.35118766211364244, + "grad_norm": 0.7466694712638855, + 
"learning_rate": 9.258871748349375e-05, + "loss": 0.8944, + "step": 54970 + }, + { + "epoch": 0.35125154926338115, + "grad_norm": 2.0133652687072754, + "learning_rate": 9.258608845958774e-05, + "loss": 0.7284, + "step": 54980 + }, + { + "epoch": 0.35131543641311985, + "grad_norm": 0.8402307629585266, + "learning_rate": 9.258345900680299e-05, + "loss": 1.1441, + "step": 54990 + }, + { + "epoch": 0.35137932356285856, + "grad_norm": 0.6770734190940857, + "learning_rate": 9.258082912516597e-05, + "loss": 0.9305, + "step": 55000 + }, + { + "epoch": 0.35144321071259726, + "grad_norm": 1.0294511318206787, + "learning_rate": 9.257819881470315e-05, + "loss": 0.7655, + "step": 55010 + }, + { + "epoch": 0.35150709786233597, + "grad_norm": 0.6236374974250793, + "learning_rate": 9.257556807544106e-05, + "loss": 0.6974, + "step": 55020 + }, + { + "epoch": 0.3515709850120747, + "grad_norm": 0.7847385406494141, + "learning_rate": 9.257293690740614e-05, + "loss": 1.0462, + "step": 55030 + }, + { + "epoch": 0.3516348721618134, + "grad_norm": 0.6366947293281555, + "learning_rate": 9.257030531062492e-05, + "loss": 0.9091, + "step": 55040 + }, + { + "epoch": 0.3516987593115521, + "grad_norm": 0.9689487218856812, + "learning_rate": 9.25676732851239e-05, + "loss": 1.14, + "step": 55050 + }, + { + "epoch": 0.3517626464612908, + "grad_norm": 0.7967630624771118, + "learning_rate": 9.256504083092959e-05, + "loss": 0.7999, + "step": 55060 + }, + { + "epoch": 0.3518265336110295, + "grad_norm": 0.6108505725860596, + "learning_rate": 9.256240794806847e-05, + "loss": 1.1205, + "step": 55070 + }, + { + "epoch": 0.35189042076076815, + "grad_norm": 1.0797632932662964, + "learning_rate": 9.25597746365671e-05, + "loss": 0.8593, + "step": 55080 + }, + { + "epoch": 0.35195430791050686, + "grad_norm": 0.7324128150939941, + "learning_rate": 9.255714089645198e-05, + "loss": 0.9101, + "step": 55090 + }, + { + "epoch": 0.35201819506024556, + "grad_norm": 0.6534935235977173, + "learning_rate": 
9.255450672774964e-05, + "loss": 1.2862, + "step": 55100 + }, + { + "epoch": 0.35208208220998427, + "grad_norm": 0.7674654722213745, + "learning_rate": 9.255187213048658e-05, + "loss": 1.0429, + "step": 55110 + }, + { + "epoch": 0.352145969359723, + "grad_norm": 0.8261142373085022, + "learning_rate": 9.254923710468937e-05, + "loss": 0.8614, + "step": 55120 + }, + { + "epoch": 0.3522098565094617, + "grad_norm": 1.2243504524230957, + "learning_rate": 9.254660165038453e-05, + "loss": 0.9836, + "step": 55130 + }, + { + "epoch": 0.3522737436592004, + "grad_norm": 0.9247923493385315, + "learning_rate": 9.254396576759861e-05, + "loss": 0.9118, + "step": 55140 + }, + { + "epoch": 0.3523376308089391, + "grad_norm": 1.049172043800354, + "learning_rate": 9.254132945635814e-05, + "loss": 1.1066, + "step": 55150 + }, + { + "epoch": 0.3524015179586778, + "grad_norm": 0.9203839302062988, + "learning_rate": 9.253869271668967e-05, + "loss": 1.0225, + "step": 55160 + }, + { + "epoch": 0.3524654051084165, + "grad_norm": 0.4765165448188782, + "learning_rate": 9.253605554861978e-05, + "loss": 0.7226, + "step": 55170 + }, + { + "epoch": 0.3525292922581552, + "grad_norm": 1.0200433731079102, + "learning_rate": 9.2533417952175e-05, + "loss": 0.9903, + "step": 55180 + }, + { + "epoch": 0.3525931794078939, + "grad_norm": 1.3597415685653687, + "learning_rate": 9.253077992738192e-05, + "loss": 0.7764, + "step": 55190 + }, + { + "epoch": 0.35265706655763257, + "grad_norm": 0.7081646919250488, + "learning_rate": 9.252814147426708e-05, + "loss": 0.9052, + "step": 55200 + }, + { + "epoch": 0.3527209537073713, + "grad_norm": 0.5674062967300415, + "learning_rate": 9.252550259285707e-05, + "loss": 0.8937, + "step": 55210 + }, + { + "epoch": 0.35278484085711, + "grad_norm": 0.8797856569290161, + "learning_rate": 9.252286328317846e-05, + "loss": 0.6981, + "step": 55220 + }, + { + "epoch": 0.3528487280068487, + "grad_norm": 0.6591719388961792, + "learning_rate": 9.252022354525783e-05, + "loss": 0.7734, 
+ "step": 55230 + }, + { + "epoch": 0.3529126151565874, + "grad_norm": 0.9455986022949219, + "learning_rate": 9.251758337912174e-05, + "loss": 0.7539, + "step": 55240 + }, + { + "epoch": 0.3529765023063261, + "grad_norm": 0.6497638821601868, + "learning_rate": 9.251494278479682e-05, + "loss": 0.8169, + "step": 55250 + }, + { + "epoch": 0.3530403894560648, + "grad_norm": 0.9514163136482239, + "learning_rate": 9.251230176230965e-05, + "loss": 1.2422, + "step": 55260 + }, + { + "epoch": 0.3531042766058035, + "grad_norm": 1.0354559421539307, + "learning_rate": 9.250966031168682e-05, + "loss": 0.8663, + "step": 55270 + }, + { + "epoch": 0.3531681637555422, + "grad_norm": 0.6657097935676575, + "learning_rate": 9.250701843295492e-05, + "loss": 1.169, + "step": 55280 + }, + { + "epoch": 0.3532320509052809, + "grad_norm": 0.6656765937805176, + "learning_rate": 9.25043761261406e-05, + "loss": 0.722, + "step": 55290 + }, + { + "epoch": 0.35329593805501963, + "grad_norm": 0.7539229989051819, + "learning_rate": 9.250173339127042e-05, + "loss": 0.8882, + "step": 55300 + }, + { + "epoch": 0.35335982520475834, + "grad_norm": 2.1997039318084717, + "learning_rate": 9.249909022837102e-05, + "loss": 0.8417, + "step": 55310 + }, + { + "epoch": 0.35342371235449704, + "grad_norm": 0.5912847518920898, + "learning_rate": 9.249644663746901e-05, + "loss": 0.8431, + "step": 55320 + }, + { + "epoch": 0.3534875995042357, + "grad_norm": 1.1441770792007446, + "learning_rate": 9.249380261859103e-05, + "loss": 0.6843, + "step": 55330 + }, + { + "epoch": 0.3535514866539744, + "grad_norm": 1.2015843391418457, + "learning_rate": 9.249115817176368e-05, + "loss": 0.825, + "step": 55340 + }, + { + "epoch": 0.3536153738037131, + "grad_norm": 0.9341386556625366, + "learning_rate": 9.248851329701362e-05, + "loss": 1.0235, + "step": 55350 + }, + { + "epoch": 0.3536792609534518, + "grad_norm": 0.8819360733032227, + "learning_rate": 9.248586799436747e-05, + "loss": 0.8604, + "step": 55360 + }, + { + "epoch": 
0.3537431481031905, + "grad_norm": 0.8615573048591614, + "learning_rate": 9.248322226385187e-05, + "loss": 0.9667, + "step": 55370 + }, + { + "epoch": 0.3538070352529292, + "grad_norm": 1.185778021812439, + "learning_rate": 9.248057610549348e-05, + "loss": 1.0003, + "step": 55380 + }, + { + "epoch": 0.35387092240266793, + "grad_norm": 0.9160196781158447, + "learning_rate": 9.247792951931893e-05, + "loss": 0.8687, + "step": 55390 + }, + { + "epoch": 0.35393480955240664, + "grad_norm": 0.6795194745063782, + "learning_rate": 9.247528250535487e-05, + "loss": 0.7333, + "step": 55400 + }, + { + "epoch": 0.35399869670214534, + "grad_norm": 0.5489585399627686, + "learning_rate": 9.247263506362798e-05, + "loss": 0.8638, + "step": 55410 + }, + { + "epoch": 0.35406258385188405, + "grad_norm": 1.2006055116653442, + "learning_rate": 9.246998719416491e-05, + "loss": 0.9143, + "step": 55420 + }, + { + "epoch": 0.35412647100162276, + "grad_norm": 1.0024096965789795, + "learning_rate": 9.246733889699233e-05, + "loss": 0.9047, + "step": 55430 + }, + { + "epoch": 0.35419035815136146, + "grad_norm": 0.5763610005378723, + "learning_rate": 9.24646901721369e-05, + "loss": 0.9754, + "step": 55440 + }, + { + "epoch": 0.3542542453011001, + "grad_norm": 1.1366212368011475, + "learning_rate": 9.24620410196253e-05, + "loss": 0.891, + "step": 55450 + }, + { + "epoch": 0.3543181324508388, + "grad_norm": 1.1361256837844849, + "learning_rate": 9.245939143948424e-05, + "loss": 1.0441, + "step": 55460 + }, + { + "epoch": 0.3543820196005775, + "grad_norm": 0.7863855361938477, + "learning_rate": 9.245674143174034e-05, + "loss": 0.7866, + "step": 55470 + }, + { + "epoch": 0.35444590675031623, + "grad_norm": 0.8668807744979858, + "learning_rate": 9.245409099642033e-05, + "loss": 0.9319, + "step": 55480 + }, + { + "epoch": 0.35450979390005494, + "grad_norm": 0.6587684750556946, + "learning_rate": 9.245144013355092e-05, + "loss": 0.8019, + "step": 55490 + }, + { + "epoch": 0.35457368104979364, + 
"grad_norm": 1.1338073015213013, + "learning_rate": 9.244878884315876e-05, + "loss": 0.8598, + "step": 55500 + }, + { + "epoch": 0.35463756819953235, + "grad_norm": 0.4027159512042999, + "learning_rate": 9.244613712527057e-05, + "loss": 0.7706, + "step": 55510 + }, + { + "epoch": 0.35470145534927106, + "grad_norm": 1.0326690673828125, + "learning_rate": 9.244348497991306e-05, + "loss": 0.9883, + "step": 55520 + }, + { + "epoch": 0.35476534249900976, + "grad_norm": 2.621795415878296, + "learning_rate": 9.244083240711297e-05, + "loss": 1.087, + "step": 55530 + }, + { + "epoch": 0.35482922964874847, + "grad_norm": 0.8886315822601318, + "learning_rate": 9.243817940689694e-05, + "loss": 0.7566, + "step": 55540 + }, + { + "epoch": 0.3548931167984872, + "grad_norm": 0.7971783876419067, + "learning_rate": 9.243552597929174e-05, + "loss": 0.7039, + "step": 55550 + }, + { + "epoch": 0.3549570039482259, + "grad_norm": 0.7734363675117493, + "learning_rate": 9.243287212432409e-05, + "loss": 0.9843, + "step": 55560 + }, + { + "epoch": 0.35502089109796453, + "grad_norm": 0.9685491919517517, + "learning_rate": 9.24302178420207e-05, + "loss": 0.9583, + "step": 55570 + }, + { + "epoch": 0.35508477824770324, + "grad_norm": 1.15921950340271, + "learning_rate": 9.242756313240833e-05, + "loss": 0.7942, + "step": 55580 + }, + { + "epoch": 0.35514866539744194, + "grad_norm": 1.1534897089004517, + "learning_rate": 9.242490799551366e-05, + "loss": 0.8079, + "step": 55590 + }, + { + "epoch": 0.35521255254718065, + "grad_norm": 0.9609005451202393, + "learning_rate": 9.242225243136348e-05, + "loss": 0.9695, + "step": 55600 + }, + { + "epoch": 0.35527643969691935, + "grad_norm": 0.6478775143623352, + "learning_rate": 9.241959643998453e-05, + "loss": 0.8381, + "step": 55610 + }, + { + "epoch": 0.35534032684665806, + "grad_norm": 0.9925094246864319, + "learning_rate": 9.241694002140354e-05, + "loss": 0.6593, + "step": 55620 + }, + { + "epoch": 0.35540421399639677, + "grad_norm": 
0.9142459630966187, + "learning_rate": 9.241428317564725e-05, + "loss": 0.934, + "step": 55630 + }, + { + "epoch": 0.3554681011461355, + "grad_norm": 0.6951974034309387, + "learning_rate": 9.241162590274244e-05, + "loss": 0.9468, + "step": 55640 + }, + { + "epoch": 0.3555319882958742, + "grad_norm": 0.8623539209365845, + "learning_rate": 9.240896820271588e-05, + "loss": 0.8084, + "step": 55650 + }, + { + "epoch": 0.3555958754456129, + "grad_norm": 0.7138127684593201, + "learning_rate": 9.240631007559432e-05, + "loss": 0.8162, + "step": 55660 + }, + { + "epoch": 0.3556597625953516, + "grad_norm": 0.8145920634269714, + "learning_rate": 9.240365152140451e-05, + "loss": 1.0244, + "step": 55670 + }, + { + "epoch": 0.3557236497450903, + "grad_norm": 0.9237201809883118, + "learning_rate": 9.240099254017327e-05, + "loss": 0.8636, + "step": 55680 + }, + { + "epoch": 0.35578753689482895, + "grad_norm": 0.9301193356513977, + "learning_rate": 9.239833313192734e-05, + "loss": 1.1658, + "step": 55690 + }, + { + "epoch": 0.35585142404456765, + "grad_norm": 0.6827517151832581, + "learning_rate": 9.239567329669352e-05, + "loss": 1.1023, + "step": 55700 + }, + { + "epoch": 0.35591531119430636, + "grad_norm": 1.0909185409545898, + "learning_rate": 9.239301303449859e-05, + "loss": 0.8033, + "step": 55710 + }, + { + "epoch": 0.35597919834404507, + "grad_norm": 0.4835173189640045, + "learning_rate": 9.239035234536934e-05, + "loss": 0.8785, + "step": 55720 + }, + { + "epoch": 0.3560430854937838, + "grad_norm": 0.862131655216217, + "learning_rate": 9.238769122933257e-05, + "loss": 1.0392, + "step": 55730 + }, + { + "epoch": 0.3561069726435225, + "grad_norm": 1.5188207626342773, + "learning_rate": 9.238502968641509e-05, + "loss": 1.1016, + "step": 55740 + }, + { + "epoch": 0.3561708597932612, + "grad_norm": 0.6719252467155457, + "learning_rate": 9.238236771664369e-05, + "loss": 1.0367, + "step": 55750 + }, + { + "epoch": 0.3562347469429999, + "grad_norm": 0.8751115798950195, + 
"learning_rate": 9.237970532004516e-05, + "loss": 0.9716, + "step": 55760 + }, + { + "epoch": 0.3562986340927386, + "grad_norm": 0.8691346049308777, + "learning_rate": 9.237704249664637e-05, + "loss": 0.8428, + "step": 55770 + }, + { + "epoch": 0.3563625212424773, + "grad_norm": 0.7232783436775208, + "learning_rate": 9.237437924647408e-05, + "loss": 0.7021, + "step": 55780 + }, + { + "epoch": 0.356426408392216, + "grad_norm": 1.281238317489624, + "learning_rate": 9.237171556955513e-05, + "loss": 0.9095, + "step": 55790 + }, + { + "epoch": 0.3564902955419547, + "grad_norm": 1.1289631128311157, + "learning_rate": 9.236905146591635e-05, + "loss": 0.9427, + "step": 55800 + }, + { + "epoch": 0.35655418269169337, + "grad_norm": 0.8392340540885925, + "learning_rate": 9.236638693558456e-05, + "loss": 0.7125, + "step": 55810 + }, + { + "epoch": 0.3566180698414321, + "grad_norm": 1.3441346883773804, + "learning_rate": 9.23637219785866e-05, + "loss": 0.8185, + "step": 55820 + }, + { + "epoch": 0.3566819569911708, + "grad_norm": 0.7084068059921265, + "learning_rate": 9.236105659494933e-05, + "loss": 0.8048, + "step": 55830 + }, + { + "epoch": 0.3567458441409095, + "grad_norm": 0.8866279125213623, + "learning_rate": 9.235839078469956e-05, + "loss": 1.0885, + "step": 55840 + }, + { + "epoch": 0.3568097312906482, + "grad_norm": 0.9575055837631226, + "learning_rate": 9.235572454786414e-05, + "loss": 0.8621, + "step": 55850 + }, + { + "epoch": 0.3568736184403869, + "grad_norm": 0.7449828386306763, + "learning_rate": 9.235305788446995e-05, + "loss": 0.902, + "step": 55860 + }, + { + "epoch": 0.3569375055901256, + "grad_norm": 0.5956260561943054, + "learning_rate": 9.235039079454382e-05, + "loss": 0.9419, + "step": 55870 + }, + { + "epoch": 0.3570013927398643, + "grad_norm": 0.7238242030143738, + "learning_rate": 9.23477232781126e-05, + "loss": 1.07, + "step": 55880 + }, + { + "epoch": 0.357065279889603, + "grad_norm": 1.0870457887649536, + "learning_rate": 9.234505533520319e-05, + 
"loss": 0.9432, + "step": 55890 + }, + { + "epoch": 0.3571291670393417, + "grad_norm": 0.9857404232025146, + "learning_rate": 9.234238696584244e-05, + "loss": 1.1723, + "step": 55900 + }, + { + "epoch": 0.35719305418908043, + "grad_norm": 0.9271001815795898, + "learning_rate": 9.233971817005722e-05, + "loss": 0.9523, + "step": 55910 + }, + { + "epoch": 0.35725694133881913, + "grad_norm": 1.4764975309371948, + "learning_rate": 9.23370489478744e-05, + "loss": 1.0909, + "step": 55920 + }, + { + "epoch": 0.3573208284885578, + "grad_norm": 1.0458935499191284, + "learning_rate": 9.233437929932087e-05, + "loss": 0.8501, + "step": 55930 + }, + { + "epoch": 0.3573847156382965, + "grad_norm": 0.8124297857284546, + "learning_rate": 9.233170922442353e-05, + "loss": 1.0442, + "step": 55940 + }, + { + "epoch": 0.3574486027880352, + "grad_norm": 1.1618013381958008, + "learning_rate": 9.232903872320924e-05, + "loss": 1.0649, + "step": 55950 + }, + { + "epoch": 0.3575124899377739, + "grad_norm": 0.8419058918952942, + "learning_rate": 9.232636779570491e-05, + "loss": 0.8909, + "step": 55960 + }, + { + "epoch": 0.3575763770875126, + "grad_norm": 0.9706215262413025, + "learning_rate": 9.232369644193746e-05, + "loss": 0.9632, + "step": 55970 + }, + { + "epoch": 0.3576402642372513, + "grad_norm": 1.0121309757232666, + "learning_rate": 9.232102466193375e-05, + "loss": 0.7683, + "step": 55980 + }, + { + "epoch": 0.35770415138699, + "grad_norm": 0.8496699929237366, + "learning_rate": 9.231835245572072e-05, + "loss": 0.7359, + "step": 55990 + }, + { + "epoch": 0.3577680385367287, + "grad_norm": 0.6330521702766418, + "learning_rate": 9.231567982332528e-05, + "loss": 0.7402, + "step": 56000 + }, + { + "epoch": 0.35783192568646743, + "grad_norm": 0.8351934552192688, + "learning_rate": 9.23130067647743e-05, + "loss": 0.8587, + "step": 56010 + }, + { + "epoch": 0.35789581283620614, + "grad_norm": 0.8029147386550903, + "learning_rate": 9.231033328009477e-05, + "loss": 0.7748, + "step": 56020 + }, 
+ { + "epoch": 0.35795969998594485, + "grad_norm": 0.6627703905105591, + "learning_rate": 9.230765936931355e-05, + "loss": 0.8785, + "step": 56030 + }, + { + "epoch": 0.35802358713568355, + "grad_norm": 1.2232366800308228, + "learning_rate": 9.230498503245764e-05, + "loss": 0.8073, + "step": 56040 + }, + { + "epoch": 0.3580874742854222, + "grad_norm": 0.783330500125885, + "learning_rate": 9.23023102695539e-05, + "loss": 1.0156, + "step": 56050 + }, + { + "epoch": 0.3581513614351609, + "grad_norm": 0.8045901656150818, + "learning_rate": 9.229963508062931e-05, + "loss": 0.9699, + "step": 56060 + }, + { + "epoch": 0.3582152485848996, + "grad_norm": 1.1084731817245483, + "learning_rate": 9.229695946571079e-05, + "loss": 1.0626, + "step": 56070 + }, + { + "epoch": 0.3582791357346383, + "grad_norm": 0.9448803067207336, + "learning_rate": 9.229428342482531e-05, + "loss": 0.873, + "step": 56080 + }, + { + "epoch": 0.358343022884377, + "grad_norm": 1.1229417324066162, + "learning_rate": 9.229160695799981e-05, + "loss": 0.9604, + "step": 56090 + }, + { + "epoch": 0.35840691003411573, + "grad_norm": 0.8668258190155029, + "learning_rate": 9.228893006526122e-05, + "loss": 1.0267, + "step": 56100 + }, + { + "epoch": 0.35847079718385444, + "grad_norm": 1.0289602279663086, + "learning_rate": 9.228625274663653e-05, + "loss": 0.8669, + "step": 56110 + }, + { + "epoch": 0.35853468433359315, + "grad_norm": 0.9492294192314148, + "learning_rate": 9.22835750021527e-05, + "loss": 0.8236, + "step": 56120 + }, + { + "epoch": 0.35859857148333185, + "grad_norm": 0.9482596516609192, + "learning_rate": 9.228116466802996e-05, + "loss": 0.9974, + "step": 56130 + }, + { + "epoch": 0.35866245863307056, + "grad_norm": 0.8407812118530273, + "learning_rate": 9.227848611448803e-05, + "loss": 0.8215, + "step": 56140 + }, + { + "epoch": 0.35872634578280926, + "grad_norm": 0.8675394654273987, + "learning_rate": 9.227580713516519e-05, + "loss": 1.0991, + "step": 56150 + }, + { + "epoch": 
0.35879023293254797, + "grad_norm": 0.9417056441307068, + "learning_rate": 9.227312773008838e-05, + "loss": 1.1657, + "step": 56160 + }, + { + "epoch": 0.3588541200822867, + "grad_norm": 0.6891525983810425, + "learning_rate": 9.22704478992846e-05, + "loss": 0.8083, + "step": 56170 + }, + { + "epoch": 0.3589180072320253, + "grad_norm": 0.8754307627677917, + "learning_rate": 9.226776764278087e-05, + "loss": 0.8525, + "step": 56180 + }, + { + "epoch": 0.35898189438176403, + "grad_norm": 0.7058795094490051, + "learning_rate": 9.226508696060412e-05, + "loss": 0.9577, + "step": 56190 + }, + { + "epoch": 0.35904578153150274, + "grad_norm": 0.7938945889472961, + "learning_rate": 9.22624058527814e-05, + "loss": 0.9518, + "step": 56200 + }, + { + "epoch": 0.35910966868124145, + "grad_norm": 1.1744211912155151, + "learning_rate": 9.225972431933968e-05, + "loss": 0.9626, + "step": 56210 + }, + { + "epoch": 0.35917355583098015, + "grad_norm": 0.6415241360664368, + "learning_rate": 9.225704236030597e-05, + "loss": 0.8535, + "step": 56220 + }, + { + "epoch": 0.35923744298071886, + "grad_norm": 0.8061168789863586, + "learning_rate": 9.225435997570731e-05, + "loss": 0.9465, + "step": 56230 + }, + { + "epoch": 0.35930133013045756, + "grad_norm": 1.0841360092163086, + "learning_rate": 9.225167716557066e-05, + "loss": 0.8539, + "step": 56240 + }, + { + "epoch": 0.35936521728019627, + "grad_norm": 0.8104168772697449, + "learning_rate": 9.22489939299231e-05, + "loss": 0.9817, + "step": 56250 + }, + { + "epoch": 0.359429104429935, + "grad_norm": 0.5234248042106628, + "learning_rate": 9.22463102687916e-05, + "loss": 0.9769, + "step": 56260 + }, + { + "epoch": 0.3594929915796737, + "grad_norm": 0.9442692995071411, + "learning_rate": 9.224362618220321e-05, + "loss": 0.7631, + "step": 56270 + }, + { + "epoch": 0.3595568787294124, + "grad_norm": 0.7581874132156372, + "learning_rate": 9.224094167018496e-05, + "loss": 0.9655, + "step": 56280 + }, + { + "epoch": 0.3596207658791511, + 
"grad_norm": 0.9377095699310303, + "learning_rate": 9.223825673276387e-05, + "loss": 0.8839, + "step": 56290 + }, + { + "epoch": 0.35968465302888974, + "grad_norm": 1.3251482248306274, + "learning_rate": 9.2235571369967e-05, + "loss": 1.07, + "step": 56300 + }, + { + "epoch": 0.35974854017862845, + "grad_norm": 0.6887118220329285, + "learning_rate": 9.223288558182141e-05, + "loss": 0.8927, + "step": 56310 + }, + { + "epoch": 0.35981242732836716, + "grad_norm": 1.181915044784546, + "learning_rate": 9.22301993683541e-05, + "loss": 0.8667, + "step": 56320 + }, + { + "epoch": 0.35987631447810586, + "grad_norm": 0.9099196195602417, + "learning_rate": 9.222751272959216e-05, + "loss": 0.853, + "step": 56330 + }, + { + "epoch": 0.35994020162784457, + "grad_norm": 0.717306911945343, + "learning_rate": 9.222482566556263e-05, + "loss": 0.6981, + "step": 56340 + }, + { + "epoch": 0.3600040887775833, + "grad_norm": 0.9583873748779297, + "learning_rate": 9.222213817629258e-05, + "loss": 0.9945, + "step": 56350 + }, + { + "epoch": 0.360067975927322, + "grad_norm": 0.770458459854126, + "learning_rate": 9.221945026180907e-05, + "loss": 1.0296, + "step": 56360 + }, + { + "epoch": 0.3601318630770607, + "grad_norm": 1.2659205198287964, + "learning_rate": 9.221676192213918e-05, + "loss": 1.0096, + "step": 56370 + }, + { + "epoch": 0.3601957502267994, + "grad_norm": 1.0208081007003784, + "learning_rate": 9.221407315730997e-05, + "loss": 1.0415, + "step": 56380 + }, + { + "epoch": 0.3602596373765381, + "grad_norm": 0.8561046719551086, + "learning_rate": 9.22113839673485e-05, + "loss": 1.052, + "step": 56390 + }, + { + "epoch": 0.3603235245262768, + "grad_norm": 0.8812417984008789, + "learning_rate": 9.22086943522819e-05, + "loss": 0.9035, + "step": 56400 + }, + { + "epoch": 0.3603874116760155, + "grad_norm": 0.9357554316520691, + "learning_rate": 9.220600431213721e-05, + "loss": 1.052, + "step": 56410 + }, + { + "epoch": 0.36045129882575416, + "grad_norm": 0.5159763693809509, + 
"learning_rate": 9.220331384694157e-05, + "loss": 0.857, + "step": 56420 + }, + { + "epoch": 0.36051518597549287, + "grad_norm": 0.6961538791656494, + "learning_rate": 9.220062295672203e-05, + "loss": 0.7773, + "step": 56430 + }, + { + "epoch": 0.3605790731252316, + "grad_norm": 0.8022356033325195, + "learning_rate": 9.219793164150572e-05, + "loss": 0.9277, + "step": 56440 + }, + { + "epoch": 0.3606429602749703, + "grad_norm": 0.7829380631446838, + "learning_rate": 9.219523990131972e-05, + "loss": 1.2579, + "step": 56450 + }, + { + "epoch": 0.360706847424709, + "grad_norm": 0.705920934677124, + "learning_rate": 9.219254773619118e-05, + "loss": 0.7642, + "step": 56460 + }, + { + "epoch": 0.3607707345744477, + "grad_norm": 0.6244139075279236, + "learning_rate": 9.218985514614715e-05, + "loss": 0.8506, + "step": 56470 + }, + { + "epoch": 0.3608346217241864, + "grad_norm": 0.9292709231376648, + "learning_rate": 9.218716213121479e-05, + "loss": 0.8007, + "step": 56480 + }, + { + "epoch": 0.3608985088739251, + "grad_norm": 1.1093422174453735, + "learning_rate": 9.218446869142121e-05, + "loss": 0.948, + "step": 56490 + }, + { + "epoch": 0.3609623960236638, + "grad_norm": 0.9102591872215271, + "learning_rate": 9.218177482679354e-05, + "loss": 0.9274, + "step": 56500 + }, + { + "epoch": 0.3610262831734025, + "grad_norm": 0.8324138522148132, + "learning_rate": 9.217908053735889e-05, + "loss": 0.7481, + "step": 56510 + }, + { + "epoch": 0.3610901703231412, + "grad_norm": 0.6961225867271423, + "learning_rate": 9.217638582314442e-05, + "loss": 0.9775, + "step": 56520 + }, + { + "epoch": 0.36115405747287993, + "grad_norm": 0.9226143956184387, + "learning_rate": 9.217369068417726e-05, + "loss": 0.849, + "step": 56530 + }, + { + "epoch": 0.3612179446226186, + "grad_norm": 0.7031887769699097, + "learning_rate": 9.217099512048454e-05, + "loss": 0.8807, + "step": 56540 + }, + { + "epoch": 0.3612818317723573, + "grad_norm": 1.840198278427124, + "learning_rate": 9.216829913209342e-05, 
+ "loss": 0.8067, + "step": 56550 + }, + { + "epoch": 0.361345718922096, + "grad_norm": 0.6737743020057678, + "learning_rate": 9.216560271903105e-05, + "loss": 0.992, + "step": 56560 + }, + { + "epoch": 0.3614096060718347, + "grad_norm": 1.441496729850769, + "learning_rate": 9.216290588132457e-05, + "loss": 0.9385, + "step": 56570 + }, + { + "epoch": 0.3614734932215734, + "grad_norm": 0.6251863837242126, + "learning_rate": 9.216020861900117e-05, + "loss": 0.6162, + "step": 56580 + }, + { + "epoch": 0.3615373803713121, + "grad_norm": 1.0875921249389648, + "learning_rate": 9.215751093208798e-05, + "loss": 0.631, + "step": 56590 + }, + { + "epoch": 0.3616012675210508, + "grad_norm": 0.7786641716957092, + "learning_rate": 9.215481282061221e-05, + "loss": 1.1934, + "step": 56600 + }, + { + "epoch": 0.3616651546707895, + "grad_norm": 0.6881313323974609, + "learning_rate": 9.215211428460098e-05, + "loss": 1.0602, + "step": 56610 + }, + { + "epoch": 0.36172904182052823, + "grad_norm": 1.173574686050415, + "learning_rate": 9.21494153240815e-05, + "loss": 0.8231, + "step": 56620 + }, + { + "epoch": 0.36179292897026694, + "grad_norm": 0.581283450126648, + "learning_rate": 9.214671593908092e-05, + "loss": 0.6751, + "step": 56630 + }, + { + "epoch": 0.36185681612000564, + "grad_norm": 0.7389190196990967, + "learning_rate": 9.214401612962649e-05, + "loss": 0.7668, + "step": 56640 + }, + { + "epoch": 0.36192070326974435, + "grad_norm": 0.6907786130905151, + "learning_rate": 9.214131589574534e-05, + "loss": 0.8037, + "step": 56650 + }, + { + "epoch": 0.361984590419483, + "grad_norm": 0.8607721328735352, + "learning_rate": 9.213861523746467e-05, + "loss": 0.8867, + "step": 56660 + }, + { + "epoch": 0.3620484775692217, + "grad_norm": 1.4309948682785034, + "learning_rate": 9.213591415481172e-05, + "loss": 0.9099, + "step": 56670 + }, + { + "epoch": 0.3621123647189604, + "grad_norm": 0.4009925425052643, + "learning_rate": 9.213321264781363e-05, + "loss": 0.6807, + "step": 56680 + }, + 
{ + "epoch": 0.3621762518686991, + "grad_norm": 1.2572836875915527, + "learning_rate": 9.213051071649766e-05, + "loss": 0.9303, + "step": 56690 + }, + { + "epoch": 0.3622401390184378, + "grad_norm": 0.9721736311912537, + "learning_rate": 9.212780836089098e-05, + "loss": 0.8034, + "step": 56700 + }, + { + "epoch": 0.36230402616817653, + "grad_norm": 0.7228071093559265, + "learning_rate": 9.212510558102083e-05, + "loss": 1.0872, + "step": 56710 + }, + { + "epoch": 0.36236791331791524, + "grad_norm": 1.043264389038086, + "learning_rate": 9.212240237691443e-05, + "loss": 0.8663, + "step": 56720 + }, + { + "epoch": 0.36243180046765394, + "grad_norm": 0.8477384448051453, + "learning_rate": 9.211969874859898e-05, + "loss": 1.0247, + "step": 56730 + }, + { + "epoch": 0.36249568761739265, + "grad_norm": 1.0123629570007324, + "learning_rate": 9.211699469610174e-05, + "loss": 0.886, + "step": 56740 + }, + { + "epoch": 0.36255957476713135, + "grad_norm": 1.2255103588104248, + "learning_rate": 9.211429021944993e-05, + "loss": 0.8577, + "step": 56750 + }, + { + "epoch": 0.36262346191687006, + "grad_norm": 1.0467153787612915, + "learning_rate": 9.211158531867078e-05, + "loss": 1.0881, + "step": 56760 + }, + { + "epoch": 0.36268734906660877, + "grad_norm": 0.876595139503479, + "learning_rate": 9.210887999379153e-05, + "loss": 1.1139, + "step": 56770 + }, + { + "epoch": 0.3627512362163474, + "grad_norm": 0.9058437943458557, + "learning_rate": 9.210617424483943e-05, + "loss": 1.2917, + "step": 56780 + }, + { + "epoch": 0.3628151233660861, + "grad_norm": 0.755662202835083, + "learning_rate": 9.210346807184174e-05, + "loss": 0.899, + "step": 56790 + }, + { + "epoch": 0.36287901051582483, + "grad_norm": 0.5830435156822205, + "learning_rate": 9.210076147482567e-05, + "loss": 0.748, + "step": 56800 + }, + { + "epoch": 0.36294289766556354, + "grad_norm": 0.7086238861083984, + "learning_rate": 9.209805445381854e-05, + "loss": 1.1404, + "step": 56810 + }, + { + "epoch": 0.36300678481530224, 
+ "grad_norm": 0.8161737322807312, + "learning_rate": 9.209534700884758e-05, + "loss": 0.6793, + "step": 56820 + }, + { + "epoch": 0.36307067196504095, + "grad_norm": 0.9982635974884033, + "learning_rate": 9.209263913994004e-05, + "loss": 0.865, + "step": 56830 + }, + { + "epoch": 0.36313455911477965, + "grad_norm": 0.8346578478813171, + "learning_rate": 9.208993084712322e-05, + "loss": 1.001, + "step": 56840 + }, + { + "epoch": 0.36319844626451836, + "grad_norm": 0.7267649173736572, + "learning_rate": 9.20872221304244e-05, + "loss": 0.6267, + "step": 56850 + }, + { + "epoch": 0.36326233341425707, + "grad_norm": 0.8349143266677856, + "learning_rate": 9.208451298987082e-05, + "loss": 0.9017, + "step": 56860 + }, + { + "epoch": 0.36332622056399577, + "grad_norm": 4.069967269897461, + "learning_rate": 9.20818034254898e-05, + "loss": 0.9907, + "step": 56870 + }, + { + "epoch": 0.3633901077137345, + "grad_norm": 1.0109018087387085, + "learning_rate": 9.20790934373086e-05, + "loss": 0.7675, + "step": 56880 + }, + { + "epoch": 0.3634539948634732, + "grad_norm": 0.7021664977073669, + "learning_rate": 9.207638302535452e-05, + "loss": 0.8808, + "step": 56890 + }, + { + "epoch": 0.36351788201321183, + "grad_norm": 1.4066033363342285, + "learning_rate": 9.207367218965487e-05, + "loss": 1.1123, + "step": 56900 + }, + { + "epoch": 0.36358176916295054, + "grad_norm": 0.5992746353149414, + "learning_rate": 9.207096093023694e-05, + "loss": 0.7128, + "step": 56910 + }, + { + "epoch": 0.36364565631268925, + "grad_norm": 0.9940372109413147, + "learning_rate": 9.206824924712805e-05, + "loss": 1.0003, + "step": 56920 + }, + { + "epoch": 0.36370954346242795, + "grad_norm": 0.7933813333511353, + "learning_rate": 9.206553714035549e-05, + "loss": 0.9643, + "step": 56930 + }, + { + "epoch": 0.36377343061216666, + "grad_norm": 0.7373024225234985, + "learning_rate": 9.206282460994657e-05, + "loss": 0.8773, + "step": 56940 + }, + { + "epoch": 0.36383731776190537, + "grad_norm": 
1.27448570728302, + "learning_rate": 9.206011165592863e-05, + "loss": 0.909, + "step": 56950 + }, + { + "epoch": 0.36390120491164407, + "grad_norm": 0.7734085917472839, + "learning_rate": 9.205739827832895e-05, + "loss": 0.9389, + "step": 56960 + }, + { + "epoch": 0.3639650920613828, + "grad_norm": 0.5840640068054199, + "learning_rate": 9.205468447717491e-05, + "loss": 1.01, + "step": 56970 + }, + { + "epoch": 0.3640289792111215, + "grad_norm": 1.308883547782898, + "learning_rate": 9.205197025249382e-05, + "loss": 0.8717, + "step": 56980 + }, + { + "epoch": 0.3640928663608602, + "grad_norm": 0.5307298898696899, + "learning_rate": 9.2049255604313e-05, + "loss": 0.998, + "step": 56990 + }, + { + "epoch": 0.3641567535105989, + "grad_norm": 0.6630975604057312, + "learning_rate": 9.20465405326598e-05, + "loss": 0.9668, + "step": 57000 + }, + { + "epoch": 0.3642206406603376, + "grad_norm": 0.6394637823104858, + "learning_rate": 9.204382503756154e-05, + "loss": 0.8324, + "step": 57010 + }, + { + "epoch": 0.3642845278100763, + "grad_norm": 1.0292775630950928, + "learning_rate": 9.204110911904562e-05, + "loss": 0.8907, + "step": 57020 + }, + { + "epoch": 0.36434841495981496, + "grad_norm": 1.187157392501831, + "learning_rate": 9.203839277713935e-05, + "loss": 1.0058, + "step": 57030 + }, + { + "epoch": 0.36441230210955367, + "grad_norm": 1.1334859132766724, + "learning_rate": 9.20356760118701e-05, + "loss": 0.9049, + "step": 57040 + }, + { + "epoch": 0.36447618925929237, + "grad_norm": 1.5810905694961548, + "learning_rate": 9.203295882326521e-05, + "loss": 0.8885, + "step": 57050 + }, + { + "epoch": 0.3645400764090311, + "grad_norm": 0.981046736240387, + "learning_rate": 9.203024121135209e-05, + "loss": 0.8166, + "step": 57060 + }, + { + "epoch": 0.3646039635587698, + "grad_norm": 0.9694949388504028, + "learning_rate": 9.202752317615805e-05, + "loss": 0.7219, + "step": 57070 + }, + { + "epoch": 0.3646678507085085, + "grad_norm": 1.3031917810440063, + "learning_rate": 
9.202480471771052e-05, + "loss": 0.9798, + "step": 57080 + }, + { + "epoch": 0.3647317378582472, + "grad_norm": 0.9253140687942505, + "learning_rate": 9.202208583603683e-05, + "loss": 0.7253, + "step": 57090 + }, + { + "epoch": 0.3647956250079859, + "grad_norm": 1.0539807081222534, + "learning_rate": 9.201936653116439e-05, + "loss": 0.8563, + "step": 57100 + }, + { + "epoch": 0.3648595121577246, + "grad_norm": 0.8437415361404419, + "learning_rate": 9.201664680312057e-05, + "loss": 0.955, + "step": 57110 + }, + { + "epoch": 0.3649233993074633, + "grad_norm": 0.7053326368331909, + "learning_rate": 9.201392665193276e-05, + "loss": 0.8577, + "step": 57120 + }, + { + "epoch": 0.364987286457202, + "grad_norm": 0.5430055856704712, + "learning_rate": 9.201120607762837e-05, + "loss": 0.8196, + "step": 57130 + }, + { + "epoch": 0.3650511736069407, + "grad_norm": 0.6964545845985413, + "learning_rate": 9.20084850802348e-05, + "loss": 0.8877, + "step": 57140 + }, + { + "epoch": 0.3651150607566794, + "grad_norm": 2.833962917327881, + "learning_rate": 9.200576365977943e-05, + "loss": 1.1258, + "step": 57150 + }, + { + "epoch": 0.3651789479064181, + "grad_norm": 0.9289480447769165, + "learning_rate": 9.200304181628968e-05, + "loss": 0.8065, + "step": 57160 + }, + { + "epoch": 0.3652428350561568, + "grad_norm": 0.6666757464408875, + "learning_rate": 9.200031954979297e-05, + "loss": 0.876, + "step": 57170 + }, + { + "epoch": 0.3653067222058955, + "grad_norm": 0.9867071509361267, + "learning_rate": 9.19975968603167e-05, + "loss": 0.9151, + "step": 57180 + }, + { + "epoch": 0.3653706093556342, + "grad_norm": 0.6142376661300659, + "learning_rate": 9.19948737478883e-05, + "loss": 0.7852, + "step": 57190 + }, + { + "epoch": 0.3654344965053729, + "grad_norm": 1.6434037685394287, + "learning_rate": 9.199215021253518e-05, + "loss": 0.8127, + "step": 57200 + }, + { + "epoch": 0.3654983836551116, + "grad_norm": 1.2232186794281006, + "learning_rate": 9.198942625428479e-05, + "loss": 0.9223, + 
"step": 57210 + }, + { + "epoch": 0.3655622708048503, + "grad_norm": 1.112564206123352, + "learning_rate": 9.198670187316456e-05, + "loss": 1.0382, + "step": 57220 + }, + { + "epoch": 0.365626157954589, + "grad_norm": 0.8125051259994507, + "learning_rate": 9.19839770692019e-05, + "loss": 0.8196, + "step": 57230 + }, + { + "epoch": 0.36569004510432773, + "grad_norm": 3.3364717960357666, + "learning_rate": 9.198125184242427e-05, + "loss": 1.0401, + "step": 57240 + }, + { + "epoch": 0.36575393225406644, + "grad_norm": 0.8038178086280823, + "learning_rate": 9.197852619285913e-05, + "loss": 1.2333, + "step": 57250 + }, + { + "epoch": 0.36581781940380514, + "grad_norm": 0.9946600198745728, + "learning_rate": 9.19758001205339e-05, + "loss": 0.8022, + "step": 57260 + }, + { + "epoch": 0.3658817065535438, + "grad_norm": 2.188892126083374, + "learning_rate": 9.197307362547607e-05, + "loss": 0.8886, + "step": 57270 + }, + { + "epoch": 0.3659455937032825, + "grad_norm": 0.699150025844574, + "learning_rate": 9.197034670771306e-05, + "loss": 0.8193, + "step": 57280 + }, + { + "epoch": 0.3660094808530212, + "grad_norm": 0.9230877757072449, + "learning_rate": 9.196761936727235e-05, + "loss": 0.9072, + "step": 57290 + }, + { + "epoch": 0.3660733680027599, + "grad_norm": 1.229096531867981, + "learning_rate": 9.19648916041814e-05, + "loss": 0.883, + "step": 57300 + }, + { + "epoch": 0.3661372551524986, + "grad_norm": 0.6960686445236206, + "learning_rate": 9.196216341846771e-05, + "loss": 1.1022, + "step": 57310 + }, + { + "epoch": 0.3662011423022373, + "grad_norm": 0.7202252745628357, + "learning_rate": 9.195943481015872e-05, + "loss": 0.9708, + "step": 57320 + }, + { + "epoch": 0.36626502945197603, + "grad_norm": 0.6151859760284424, + "learning_rate": 9.19567057792819e-05, + "loss": 1.1355, + "step": 57330 + }, + { + "epoch": 0.36632891660171474, + "grad_norm": 0.6116877198219299, + "learning_rate": 9.195397632586478e-05, + "loss": 0.7314, + "step": 57340 + }, + { + "epoch": 
0.36639280375145344, + "grad_norm": 1.559106707572937, + "learning_rate": 9.195124644993483e-05, + "loss": 0.9246, + "step": 57350 + }, + { + "epoch": 0.36645669090119215, + "grad_norm": 0.8441659808158875, + "learning_rate": 9.194851615151951e-05, + "loss": 0.8061, + "step": 57360 + }, + { + "epoch": 0.36652057805093086, + "grad_norm": 0.8084182143211365, + "learning_rate": 9.194578543064635e-05, + "loss": 1.0054, + "step": 57370 + }, + { + "epoch": 0.36658446520066956, + "grad_norm": 0.9725624918937683, + "learning_rate": 9.194305428734285e-05, + "loss": 0.9369, + "step": 57380 + }, + { + "epoch": 0.3666483523504082, + "grad_norm": 0.7019644975662231, + "learning_rate": 9.19403227216365e-05, + "loss": 1.0316, + "step": 57390 + }, + { + "epoch": 0.3667122395001469, + "grad_norm": 0.84947669506073, + "learning_rate": 9.193759073355482e-05, + "loss": 1.0048, + "step": 57400 + }, + { + "epoch": 0.3667761266498856, + "grad_norm": 0.8308162689208984, + "learning_rate": 9.193485832312532e-05, + "loss": 1.2343, + "step": 57410 + }, + { + "epoch": 0.36684001379962433, + "grad_norm": 1.0776206254959106, + "learning_rate": 9.193212549037551e-05, + "loss": 1.2088, + "step": 57420 + }, + { + "epoch": 0.36690390094936304, + "grad_norm": 1.5936800241470337, + "learning_rate": 9.192939223533292e-05, + "loss": 1.0616, + "step": 57430 + }, + { + "epoch": 0.36696778809910174, + "grad_norm": 0.9751461148262024, + "learning_rate": 9.192665855802509e-05, + "loss": 0.9944, + "step": 57440 + }, + { + "epoch": 0.36703167524884045, + "grad_norm": 0.893570601940155, + "learning_rate": 9.192392445847953e-05, + "loss": 0.6853, + "step": 57450 + }, + { + "epoch": 0.36709556239857916, + "grad_norm": 1.0084480047225952, + "learning_rate": 9.192118993672378e-05, + "loss": 1.1445, + "step": 57460 + }, + { + "epoch": 0.36715944954831786, + "grad_norm": 0.9631890654563904, + "learning_rate": 9.191845499278539e-05, + "loss": 0.9025, + "step": 57470 + }, + { + "epoch": 0.36722333669805657, + 
"grad_norm": 1.809931993484497, + "learning_rate": 9.191571962669187e-05, + "loss": 0.8336, + "step": 57480 + }, + { + "epoch": 0.3672872238477953, + "grad_norm": 1.0318517684936523, + "learning_rate": 9.191298383847083e-05, + "loss": 0.9237, + "step": 57490 + }, + { + "epoch": 0.367351110997534, + "grad_norm": 1.242576003074646, + "learning_rate": 9.191024762814975e-05, + "loss": 0.9736, + "step": 57500 + }, + { + "epoch": 0.36741499814727263, + "grad_norm": 0.8778398036956787, + "learning_rate": 9.190751099575623e-05, + "loss": 0.7765, + "step": 57510 + }, + { + "epoch": 0.36747888529701134, + "grad_norm": 1.108216643333435, + "learning_rate": 9.19047739413178e-05, + "loss": 1.0051, + "step": 57520 + }, + { + "epoch": 0.36754277244675004, + "grad_norm": 0.9173517227172852, + "learning_rate": 9.190203646486206e-05, + "loss": 0.9958, + "step": 57530 + }, + { + "epoch": 0.36760665959648875, + "grad_norm": 0.8545486330986023, + "learning_rate": 9.189929856641657e-05, + "loss": 0.8174, + "step": 57540 + }, + { + "epoch": 0.36767054674622746, + "grad_norm": 0.8391945362091064, + "learning_rate": 9.18965602460089e-05, + "loss": 0.9733, + "step": 57550 + }, + { + "epoch": 0.36773443389596616, + "grad_norm": 0.6733419895172119, + "learning_rate": 9.189382150366662e-05, + "loss": 0.8057, + "step": 57560 + }, + { + "epoch": 0.36779832104570487, + "grad_norm": 2.302520513534546, + "learning_rate": 9.189108233941729e-05, + "loss": 0.9927, + "step": 57570 + }, + { + "epoch": 0.3678622081954436, + "grad_norm": 0.8115237355232239, + "learning_rate": 9.188834275328853e-05, + "loss": 0.9236, + "step": 57580 + }, + { + "epoch": 0.3679260953451823, + "grad_norm": 1.0810778141021729, + "learning_rate": 9.188560274530793e-05, + "loss": 1.1711, + "step": 57590 + }, + { + "epoch": 0.367989982494921, + "grad_norm": 0.9964002966880798, + "learning_rate": 9.188286231550307e-05, + "loss": 1.013, + "step": 57600 + }, + { + "epoch": 0.3680538696446597, + "grad_norm": 1.2044520378112793, + 
"learning_rate": 9.188012146390155e-05, + "loss": 0.798, + "step": 57610 + }, + { + "epoch": 0.3681177567943984, + "grad_norm": 2.0826616287231445, + "learning_rate": 9.187738019053098e-05, + "loss": 0.7468, + "step": 57620 + }, + { + "epoch": 0.36818164394413705, + "grad_norm": 0.8267672657966614, + "learning_rate": 9.187463849541895e-05, + "loss": 0.911, + "step": 57630 + }, + { + "epoch": 0.36824553109387576, + "grad_norm": 0.7479827404022217, + "learning_rate": 9.18718963785931e-05, + "loss": 0.7645, + "step": 57640 + }, + { + "epoch": 0.36830941824361446, + "grad_norm": 0.7282874584197998, + "learning_rate": 9.186915384008103e-05, + "loss": 1.1665, + "step": 57650 + }, + { + "epoch": 0.36837330539335317, + "grad_norm": 0.6607038974761963, + "learning_rate": 9.186641087991034e-05, + "loss": 0.9505, + "step": 57660 + }, + { + "epoch": 0.3684371925430919, + "grad_norm": 0.7457965612411499, + "learning_rate": 9.186366749810869e-05, + "loss": 0.7571, + "step": 57670 + }, + { + "epoch": 0.3685010796928306, + "grad_norm": 0.9892933368682861, + "learning_rate": 9.186092369470368e-05, + "loss": 0.8002, + "step": 57680 + }, + { + "epoch": 0.3685649668425693, + "grad_norm": 1.134438395500183, + "learning_rate": 9.185817946972296e-05, + "loss": 0.9039, + "step": 57690 + }, + { + "epoch": 0.368628853992308, + "grad_norm": 0.7737533450126648, + "learning_rate": 9.185543482319417e-05, + "loss": 0.8303, + "step": 57700 + }, + { + "epoch": 0.3686927411420467, + "grad_norm": 2.6117820739746094, + "learning_rate": 9.185268975514491e-05, + "loss": 0.9566, + "step": 57710 + }, + { + "epoch": 0.3687566282917854, + "grad_norm": 1.5219961404800415, + "learning_rate": 9.184994426560289e-05, + "loss": 0.976, + "step": 57720 + }, + { + "epoch": 0.3688205154415241, + "grad_norm": 0.796924889087677, + "learning_rate": 9.18471983545957e-05, + "loss": 0.8359, + "step": 57730 + }, + { + "epoch": 0.3688844025912628, + "grad_norm": 0.5662297606468201, + "learning_rate": 9.184445202215104e-05, 
+ "loss": 0.9518, + "step": 57740 + }, + { + "epoch": 0.36894828974100147, + "grad_norm": 1.4038177728652954, + "learning_rate": 9.184170526829654e-05, + "loss": 0.8367, + "step": 57750 + }, + { + "epoch": 0.3690121768907402, + "grad_norm": 1.051730990409851, + "learning_rate": 9.183895809305987e-05, + "loss": 0.8319, + "step": 57760 + }, + { + "epoch": 0.3690760640404789, + "grad_norm": 0.6114339232444763, + "learning_rate": 9.183621049646869e-05, + "loss": 0.8821, + "step": 57770 + }, + { + "epoch": 0.3691399511902176, + "grad_norm": 0.7710915803909302, + "learning_rate": 9.18334624785507e-05, + "loss": 0.7208, + "step": 57780 + }, + { + "epoch": 0.3692038383399563, + "grad_norm": 1.5859193801879883, + "learning_rate": 9.183071403933353e-05, + "loss": 0.8121, + "step": 57790 + }, + { + "epoch": 0.369267725489695, + "grad_norm": 0.8052014708518982, + "learning_rate": 9.182796517884487e-05, + "loss": 0.9727, + "step": 57800 + }, + { + "epoch": 0.3693316126394337, + "grad_norm": 0.7919948101043701, + "learning_rate": 9.182521589711244e-05, + "loss": 1.0669, + "step": 57810 + }, + { + "epoch": 0.3693954997891724, + "grad_norm": 0.9116694927215576, + "learning_rate": 9.182246619416388e-05, + "loss": 0.7669, + "step": 57820 + }, + { + "epoch": 0.3694593869389111, + "grad_norm": 0.7370694875717163, + "learning_rate": 9.181971607002693e-05, + "loss": 0.6573, + "step": 57830 + }, + { + "epoch": 0.3695232740886498, + "grad_norm": 2.3321750164031982, + "learning_rate": 9.181696552472924e-05, + "loss": 1.0031, + "step": 57840 + }, + { + "epoch": 0.36958716123838853, + "grad_norm": 1.1017699241638184, + "learning_rate": 9.181421455829852e-05, + "loss": 0.9181, + "step": 57850 + }, + { + "epoch": 0.36965104838812723, + "grad_norm": 1.479238510131836, + "learning_rate": 9.181146317076252e-05, + "loss": 1.0418, + "step": 57860 + }, + { + "epoch": 0.36971493553786594, + "grad_norm": 0.8015438914299011, + "learning_rate": 9.180871136214889e-05, + "loss": 0.8837, + "step": 57870 + 
}, + { + "epoch": 0.3697788226876046, + "grad_norm": 0.7428931593894958, + "learning_rate": 9.180595913248537e-05, + "loss": 0.9252, + "step": 57880 + }, + { + "epoch": 0.3698427098373433, + "grad_norm": 1.2738107442855835, + "learning_rate": 9.180320648179968e-05, + "loss": 0.8249, + "step": 57890 + }, + { + "epoch": 0.369906596987082, + "grad_norm": 1.0015136003494263, + "learning_rate": 9.180045341011953e-05, + "loss": 0.9307, + "step": 57900 + }, + { + "epoch": 0.3699704841368207, + "grad_norm": 0.7193623185157776, + "learning_rate": 9.179769991747264e-05, + "loss": 0.8081, + "step": 57910 + }, + { + "epoch": 0.3700343712865594, + "grad_norm": 0.964747428894043, + "learning_rate": 9.179494600388677e-05, + "loss": 0.9367, + "step": 57920 + }, + { + "epoch": 0.3700982584362981, + "grad_norm": 0.8497000932693481, + "learning_rate": 9.179219166938963e-05, + "loss": 0.7509, + "step": 57930 + }, + { + "epoch": 0.37016214558603683, + "grad_norm": 1.6816493272781372, + "learning_rate": 9.178943691400896e-05, + "loss": 0.7834, + "step": 57940 + }, + { + "epoch": 0.37022603273577553, + "grad_norm": 0.8694002032279968, + "learning_rate": 9.178668173777252e-05, + "loss": 0.9374, + "step": 57950 + }, + { + "epoch": 0.37028991988551424, + "grad_norm": 0.6682251691818237, + "learning_rate": 9.178392614070803e-05, + "loss": 0.9475, + "step": 57960 + }, + { + "epoch": 0.37035380703525295, + "grad_norm": 0.610185980796814, + "learning_rate": 9.178117012284326e-05, + "loss": 0.8925, + "step": 57970 + }, + { + "epoch": 0.37041769418499165, + "grad_norm": 0.5272064805030823, + "learning_rate": 9.177841368420596e-05, + "loss": 0.8726, + "step": 57980 + }, + { + "epoch": 0.37048158133473036, + "grad_norm": 0.878194272518158, + "learning_rate": 9.17756568248239e-05, + "loss": 0.8307, + "step": 57990 + }, + { + "epoch": 0.370545468484469, + "grad_norm": 0.5838222503662109, + "learning_rate": 9.177289954472483e-05, + "loss": 1.0879, + "step": 58000 + }, + { + "epoch": 
0.3706093556342077, + "grad_norm": 1.0944530963897705, + "learning_rate": 9.177014184393654e-05, + "loss": 0.8774, + "step": 58010 + }, + { + "epoch": 0.3706732427839464, + "grad_norm": 0.8681952953338623, + "learning_rate": 9.176738372248675e-05, + "loss": 0.9085, + "step": 58020 + }, + { + "epoch": 0.37073712993368513, + "grad_norm": 1.0131874084472656, + "learning_rate": 9.176462518040328e-05, + "loss": 1.0068, + "step": 58030 + }, + { + "epoch": 0.37080101708342383, + "grad_norm": 1.0025804042816162, + "learning_rate": 9.176186621771392e-05, + "loss": 0.8304, + "step": 58040 + }, + { + "epoch": 0.37086490423316254, + "grad_norm": 0.5216399431228638, + "learning_rate": 9.175910683444641e-05, + "loss": 1.0596, + "step": 58050 + }, + { + "epoch": 0.37092879138290125, + "grad_norm": 1.1650744676589966, + "learning_rate": 9.17563470306286e-05, + "loss": 0.9596, + "step": 58060 + }, + { + "epoch": 0.37099267853263995, + "grad_norm": 0.738498866558075, + "learning_rate": 9.175358680628825e-05, + "loss": 1.0937, + "step": 58070 + }, + { + "epoch": 0.37105656568237866, + "grad_norm": 1.8002482652664185, + "learning_rate": 9.175082616145314e-05, + "loss": 1.0585, + "step": 58080 + }, + { + "epoch": 0.37112045283211736, + "grad_norm": 0.9968917369842529, + "learning_rate": 9.17480650961511e-05, + "loss": 0.8607, + "step": 58090 + }, + { + "epoch": 0.37118433998185607, + "grad_norm": 0.6830025911331177, + "learning_rate": 9.174530361040992e-05, + "loss": 0.909, + "step": 58100 + }, + { + "epoch": 0.3712482271315948, + "grad_norm": 0.8409507870674133, + "learning_rate": 9.174254170425742e-05, + "loss": 0.7824, + "step": 58110 + }, + { + "epoch": 0.3713121142813334, + "grad_norm": 2.4945926666259766, + "learning_rate": 9.173977937772143e-05, + "loss": 1.0359, + "step": 58120 + }, + { + "epoch": 0.37137600143107213, + "grad_norm": 2.378359317779541, + "learning_rate": 9.173701663082972e-05, + "loss": 0.9768, + "step": 58130 + }, + { + "epoch": 0.37143988858081084, + 
"grad_norm": 1.1033055782318115, + "learning_rate": 9.173425346361017e-05, + "loss": 0.8963, + "step": 58140 + }, + { + "epoch": 0.37150377573054955, + "grad_norm": 0.982449471950531, + "learning_rate": 9.173148987609057e-05, + "loss": 0.9571, + "step": 58150 + }, + { + "epoch": 0.37156766288028825, + "grad_norm": 0.8938246965408325, + "learning_rate": 9.172872586829878e-05, + "loss": 1.0901, + "step": 58160 + }, + { + "epoch": 0.37163155003002696, + "grad_norm": 1.218361258506775, + "learning_rate": 9.17259614402626e-05, + "loss": 0.9947, + "step": 58170 + }, + { + "epoch": 0.37169543717976566, + "grad_norm": 0.7889940142631531, + "learning_rate": 9.17231965920099e-05, + "loss": 0.8841, + "step": 58180 + }, + { + "epoch": 0.37175932432950437, + "grad_norm": 0.9611823558807373, + "learning_rate": 9.17204313235685e-05, + "loss": 0.8782, + "step": 58190 + }, + { + "epoch": 0.3718232114792431, + "grad_norm": 1.1690157651901245, + "learning_rate": 9.171766563496628e-05, + "loss": 0.7884, + "step": 58200 + }, + { + "epoch": 0.3718870986289818, + "grad_norm": 1.155748963356018, + "learning_rate": 9.171489952623109e-05, + "loss": 1.0516, + "step": 58210 + }, + { + "epoch": 0.3719509857787205, + "grad_norm": 0.927507758140564, + "learning_rate": 9.171213299739075e-05, + "loss": 1.0492, + "step": 58220 + }, + { + "epoch": 0.3720148729284592, + "grad_norm": 1.9883320331573486, + "learning_rate": 9.170936604847315e-05, + "loss": 1.0933, + "step": 58230 + }, + { + "epoch": 0.37207876007819785, + "grad_norm": 0.8213433623313904, + "learning_rate": 9.170659867950615e-05, + "loss": 0.8121, + "step": 58240 + }, + { + "epoch": 0.37214264722793655, + "grad_norm": 1.8754318952560425, + "learning_rate": 9.170383089051762e-05, + "loss": 1.0397, + "step": 58250 + }, + { + "epoch": 0.37220653437767526, + "grad_norm": 1.1018773317337036, + "learning_rate": 9.170106268153543e-05, + "loss": 0.9177, + "step": 58260 + }, + { + "epoch": 0.37227042152741396, + "grad_norm": 0.6226816773414612, + 
"learning_rate": 9.169829405258747e-05, + "loss": 0.8247, + "step": 58270 + }, + { + "epoch": 0.37233430867715267, + "grad_norm": 0.615023136138916, + "learning_rate": 9.169552500370161e-05, + "loss": 1.0718, + "step": 58280 + }, + { + "epoch": 0.3723981958268914, + "grad_norm": 0.7454681396484375, + "learning_rate": 9.169275553490573e-05, + "loss": 0.9678, + "step": 58290 + }, + { + "epoch": 0.3724620829766301, + "grad_norm": 0.9580934047698975, + "learning_rate": 9.168998564622774e-05, + "loss": 1.2206, + "step": 58300 + }, + { + "epoch": 0.3725259701263688, + "grad_norm": 1.0605610609054565, + "learning_rate": 9.168721533769556e-05, + "loss": 0.8984, + "step": 58310 + }, + { + "epoch": 0.3725898572761075, + "grad_norm": 1.1907299757003784, + "learning_rate": 9.168444460933702e-05, + "loss": 0.8531, + "step": 58320 + }, + { + "epoch": 0.3726537444258462, + "grad_norm": 0.995368480682373, + "learning_rate": 9.168167346118006e-05, + "loss": 0.6946, + "step": 58330 + }, + { + "epoch": 0.3727176315755849, + "grad_norm": 1.1580376625061035, + "learning_rate": 9.167890189325261e-05, + "loss": 0.7377, + "step": 58340 + }, + { + "epoch": 0.3727815187253236, + "grad_norm": 1.3204131126403809, + "learning_rate": 9.167612990558254e-05, + "loss": 0.8134, + "step": 58350 + }, + { + "epoch": 0.37284540587506226, + "grad_norm": 1.4517935514450073, + "learning_rate": 9.167335749819781e-05, + "loss": 0.7879, + "step": 58360 + }, + { + "epoch": 0.37290929302480097, + "grad_norm": 0.6076644659042358, + "learning_rate": 9.167058467112629e-05, + "loss": 0.9626, + "step": 58370 + }, + { + "epoch": 0.3729731801745397, + "grad_norm": 0.8815622925758362, + "learning_rate": 9.166781142439595e-05, + "loss": 0.8204, + "step": 58380 + }, + { + "epoch": 0.3730370673242784, + "grad_norm": 1.5476758480072021, + "learning_rate": 9.16650377580347e-05, + "loss": 1.0324, + "step": 58390 + }, + { + "epoch": 0.3731009544740171, + "grad_norm": 1.1092950105667114, + "learning_rate": 
9.166226367207047e-05, + "loss": 0.7715, + "step": 58400 + }, + { + "epoch": 0.3731648416237558, + "grad_norm": 0.8686773777008057, + "learning_rate": 9.16594891665312e-05, + "loss": 0.8466, + "step": 58410 + }, + { + "epoch": 0.3732287287734945, + "grad_norm": 0.948353111743927, + "learning_rate": 9.165671424144484e-05, + "loss": 0.7348, + "step": 58420 + }, + { + "epoch": 0.3732926159232332, + "grad_norm": 0.8880228400230408, + "learning_rate": 9.165393889683933e-05, + "loss": 0.8305, + "step": 58430 + }, + { + "epoch": 0.3733565030729719, + "grad_norm": 0.6419790983200073, + "learning_rate": 9.165116313274262e-05, + "loss": 0.8744, + "step": 58440 + }, + { + "epoch": 0.3734203902227106, + "grad_norm": 0.6578751802444458, + "learning_rate": 9.164838694918266e-05, + "loss": 1.0893, + "step": 58450 + }, + { + "epoch": 0.3734842773724493, + "grad_norm": 0.6613552570343018, + "learning_rate": 9.16456103461874e-05, + "loss": 1.0446, + "step": 58460 + }, + { + "epoch": 0.37354816452218803, + "grad_norm": 0.7612963318824768, + "learning_rate": 9.164283332378483e-05, + "loss": 0.9673, + "step": 58470 + }, + { + "epoch": 0.3736120516719267, + "grad_norm": 1.8602917194366455, + "learning_rate": 9.16400558820029e-05, + "loss": 1.1563, + "step": 58480 + }, + { + "epoch": 0.3736759388216654, + "grad_norm": 0.7156874537467957, + "learning_rate": 9.163755582585293e-05, + "loss": 0.9562, + "step": 58490 + }, + { + "epoch": 0.3737398259714041, + "grad_norm": 0.8070692420005798, + "learning_rate": 9.163477758732727e-05, + "loss": 0.8344, + "step": 58500 + }, + { + "epoch": 0.3738037131211428, + "grad_norm": 0.8404533267021179, + "learning_rate": 9.163199892950341e-05, + "loss": 0.9861, + "step": 58510 + }, + { + "epoch": 0.3738676002708815, + "grad_norm": 0.8748318552970886, + "learning_rate": 9.162921985240928e-05, + "loss": 0.9779, + "step": 58520 + }, + { + "epoch": 0.3739314874206202, + "grad_norm": 0.8599054217338562, + "learning_rate": 9.16264403560729e-05, + "loss": 0.8124, 
+ "step": 58530 + }, + { + "epoch": 0.3739953745703589, + "grad_norm": 0.7923135161399841, + "learning_rate": 9.162366044052226e-05, + "loss": 0.6135, + "step": 58540 + }, + { + "epoch": 0.3740592617200976, + "grad_norm": 0.6415694952011108, + "learning_rate": 9.162088010578535e-05, + "loss": 0.9293, + "step": 58550 + }, + { + "epoch": 0.37412314886983633, + "grad_norm": 2.256666898727417, + "learning_rate": 9.161809935189016e-05, + "loss": 1.1138, + "step": 58560 + }, + { + "epoch": 0.37418703601957504, + "grad_norm": 1.2693225145339966, + "learning_rate": 9.161531817886471e-05, + "loss": 0.6599, + "step": 58570 + }, + { + "epoch": 0.37425092316931374, + "grad_norm": 0.8467420935630798, + "learning_rate": 9.1612536586737e-05, + "loss": 0.6876, + "step": 58580 + }, + { + "epoch": 0.37431481031905245, + "grad_norm": 0.9001184701919556, + "learning_rate": 9.160975457553504e-05, + "loss": 0.5682, + "step": 58590 + }, + { + "epoch": 0.37437869746879116, + "grad_norm": 0.6269614696502686, + "learning_rate": 9.160697214528687e-05, + "loss": 1.0431, + "step": 58600 + }, + { + "epoch": 0.3744425846185298, + "grad_norm": 1.413830280303955, + "learning_rate": 9.160418929602048e-05, + "loss": 0.7761, + "step": 58610 + }, + { + "epoch": 0.3745064717682685, + "grad_norm": 2.2682693004608154, + "learning_rate": 9.160140602776392e-05, + "loss": 1.1893, + "step": 58620 + }, + { + "epoch": 0.3745703589180072, + "grad_norm": 0.5779352188110352, + "learning_rate": 9.159862234054521e-05, + "loss": 1.035, + "step": 58630 + }, + { + "epoch": 0.3746342460677459, + "grad_norm": 0.7203439474105835, + "learning_rate": 9.15958382343924e-05, + "loss": 0.9331, + "step": 58640 + }, + { + "epoch": 0.37469813321748463, + "grad_norm": 0.8126745223999023, + "learning_rate": 9.159305370933349e-05, + "loss": 0.7504, + "step": 58650 + }, + { + "epoch": 0.37476202036722334, + "grad_norm": 0.7604427337646484, + "learning_rate": 9.159026876539656e-05, + "loss": 0.8239, + "step": 58660 + }, + { + "epoch": 
0.37482590751696204, + "grad_norm": 0.9764753580093384, + "learning_rate": 9.158748340260962e-05, + "loss": 0.8887, + "step": 58670 + }, + { + "epoch": 0.37488979466670075, + "grad_norm": 1.0595623254776, + "learning_rate": 9.158469762100077e-05, + "loss": 0.9124, + "step": 58680 + }, + { + "epoch": 0.37495368181643945, + "grad_norm": 0.8522325158119202, + "learning_rate": 9.158191142059803e-05, + "loss": 0.8533, + "step": 58690 + }, + { + "epoch": 0.37501756896617816, + "grad_norm": 1.0041230916976929, + "learning_rate": 9.157912480142947e-05, + "loss": 0.9559, + "step": 58700 + }, + { + "epoch": 0.37508145611591687, + "grad_norm": 1.3694050312042236, + "learning_rate": 9.157633776352314e-05, + "loss": 0.905, + "step": 58710 + }, + { + "epoch": 0.3751453432656556, + "grad_norm": 0.4370633363723755, + "learning_rate": 9.157355030690714e-05, + "loss": 0.7518, + "step": 58720 + }, + { + "epoch": 0.3752092304153942, + "grad_norm": 0.8439906239509583, + "learning_rate": 9.157076243160951e-05, + "loss": 0.9578, + "step": 58730 + }, + { + "epoch": 0.37527311756513293, + "grad_norm": 0.7714953422546387, + "learning_rate": 9.156797413765834e-05, + "loss": 0.9042, + "step": 58740 + }, + { + "epoch": 0.37533700471487164, + "grad_norm": 0.7417599558830261, + "learning_rate": 9.156518542508172e-05, + "loss": 0.8571, + "step": 58750 + }, + { + "epoch": 0.37540089186461034, + "grad_norm": 1.951737642288208, + "learning_rate": 9.15623962939077e-05, + "loss": 0.9094, + "step": 58760 + }, + { + "epoch": 0.37546477901434905, + "grad_norm": 0.8249172568321228, + "learning_rate": 9.155960674416441e-05, + "loss": 0.7664, + "step": 58770 + }, + { + "epoch": 0.37552866616408775, + "grad_norm": 0.667812705039978, + "learning_rate": 9.155681677587992e-05, + "loss": 0.708, + "step": 58780 + }, + { + "epoch": 0.37559255331382646, + "grad_norm": 0.6393797993659973, + "learning_rate": 9.155402638908235e-05, + "loss": 0.8337, + "step": 58790 + }, + { + "epoch": 0.37565644046356517, + 
"grad_norm": 0.7899972200393677, + "learning_rate": 9.155123558379976e-05, + "loss": 1.0715, + "step": 58800 + }, + { + "epoch": 0.3757203276133039, + "grad_norm": 2.3867976665496826, + "learning_rate": 9.154844436006029e-05, + "loss": 0.9635, + "step": 58810 + }, + { + "epoch": 0.3757842147630426, + "grad_norm": 0.7886314392089844, + "learning_rate": 9.154565271789206e-05, + "loss": 0.8288, + "step": 58820 + }, + { + "epoch": 0.3758481019127813, + "grad_norm": 0.6438289880752563, + "learning_rate": 9.154286065732313e-05, + "loss": 0.683, + "step": 58830 + }, + { + "epoch": 0.37591198906252, + "grad_norm": 0.8149610161781311, + "learning_rate": 9.154006817838168e-05, + "loss": 0.9502, + "step": 58840 + }, + { + "epoch": 0.37597587621225864, + "grad_norm": 1.0395874977111816, + "learning_rate": 9.15372752810958e-05, + "loss": 0.6418, + "step": 58850 + }, + { + "epoch": 0.37603976336199735, + "grad_norm": 1.5722790956497192, + "learning_rate": 9.153448196549362e-05, + "loss": 0.927, + "step": 58860 + }, + { + "epoch": 0.37610365051173605, + "grad_norm": 1.1867657899856567, + "learning_rate": 9.153168823160327e-05, + "loss": 0.7479, + "step": 58870 + }, + { + "epoch": 0.37616753766147476, + "grad_norm": 0.9400370121002197, + "learning_rate": 9.15288940794529e-05, + "loss": 0.8849, + "step": 58880 + }, + { + "epoch": 0.37623142481121347, + "grad_norm": 0.6055128574371338, + "learning_rate": 9.152609950907062e-05, + "loss": 0.8318, + "step": 58890 + }, + { + "epoch": 0.3762953119609522, + "grad_norm": 0.8164952993392944, + "learning_rate": 9.152330452048462e-05, + "loss": 0.9452, + "step": 58900 + }, + { + "epoch": 0.3763591991106909, + "grad_norm": 0.4781966507434845, + "learning_rate": 9.152050911372301e-05, + "loss": 1.0144, + "step": 58910 + }, + { + "epoch": 0.3764230862604296, + "grad_norm": 0.7525957822799683, + "learning_rate": 9.151771328881394e-05, + "loss": 1.0175, + "step": 58920 + }, + { + "epoch": 0.3764869734101683, + "grad_norm": 0.9770300388336182, + 
"learning_rate": 9.151491704578559e-05, + "loss": 1.0909, + "step": 58930 + }, + { + "epoch": 0.376550860559907, + "grad_norm": 0.8200979232788086, + "learning_rate": 9.151212038466612e-05, + "loss": 0.6905, + "step": 58940 + }, + { + "epoch": 0.3766147477096457, + "grad_norm": 0.8204917907714844, + "learning_rate": 9.150932330548367e-05, + "loss": 0.9003, + "step": 58950 + }, + { + "epoch": 0.3766786348593844, + "grad_norm": 0.7505319714546204, + "learning_rate": 9.150652580826642e-05, + "loss": 1.0317, + "step": 58960 + }, + { + "epoch": 0.37674252200912306, + "grad_norm": 0.782026469707489, + "learning_rate": 9.150372789304256e-05, + "loss": 0.8431, + "step": 58970 + }, + { + "epoch": 0.37680640915886177, + "grad_norm": 2.76662278175354, + "learning_rate": 9.150092955984025e-05, + "loss": 1.0264, + "step": 58980 + }, + { + "epoch": 0.37687029630860047, + "grad_norm": 0.9735605716705322, + "learning_rate": 9.149813080868766e-05, + "loss": 1.0035, + "step": 58990 + }, + { + "epoch": 0.3769341834583392, + "grad_norm": 0.6053544282913208, + "learning_rate": 9.149533163961302e-05, + "loss": 0.8895, + "step": 59000 + }, + { + "epoch": 0.3769980706080779, + "grad_norm": 1.281782865524292, + "learning_rate": 9.149253205264448e-05, + "loss": 1.1018, + "step": 59010 + }, + { + "epoch": 0.3770619577578166, + "grad_norm": 0.6073563694953918, + "learning_rate": 9.148973204781023e-05, + "loss": 1.0346, + "step": 59020 + }, + { + "epoch": 0.3771258449075553, + "grad_norm": 0.5802990198135376, + "learning_rate": 9.148693162513851e-05, + "loss": 0.8453, + "step": 59030 + }, + { + "epoch": 0.377189732057294, + "grad_norm": 0.9088721871376038, + "learning_rate": 9.148413078465747e-05, + "loss": 1.0229, + "step": 59040 + }, + { + "epoch": 0.3772536192070327, + "grad_norm": 0.8357219099998474, + "learning_rate": 9.148132952639536e-05, + "loss": 1.133, + "step": 59050 + }, + { + "epoch": 0.3773175063567714, + "grad_norm": 0.7745949029922485, + "learning_rate": 9.147852785038038e-05, 
+ "loss": 0.6222, + "step": 59060 + }, + { + "epoch": 0.3773813935065101, + "grad_norm": 0.5787645578384399, + "learning_rate": 9.147572575664074e-05, + "loss": 0.8277, + "step": 59070 + }, + { + "epoch": 0.3774452806562488, + "grad_norm": 0.5599297881126404, + "learning_rate": 9.147292324520466e-05, + "loss": 0.8404, + "step": 59080 + }, + { + "epoch": 0.3775091678059875, + "grad_norm": 0.6565321087837219, + "learning_rate": 9.147012031610035e-05, + "loss": 0.937, + "step": 59090 + }, + { + "epoch": 0.3775730549557262, + "grad_norm": 0.8938694000244141, + "learning_rate": 9.146731696935606e-05, + "loss": 1.0061, + "step": 59100 + }, + { + "epoch": 0.3776369421054649, + "grad_norm": 0.8118715286254883, + "learning_rate": 9.146451320500001e-05, + "loss": 0.9974, + "step": 59110 + }, + { + "epoch": 0.3777008292552036, + "grad_norm": 0.7012856006622314, + "learning_rate": 9.146170902306045e-05, + "loss": 1.0306, + "step": 59120 + }, + { + "epoch": 0.3777647164049423, + "grad_norm": 0.6307138204574585, + "learning_rate": 9.145890442356561e-05, + "loss": 0.685, + "step": 59130 + }, + { + "epoch": 0.377828603554681, + "grad_norm": 0.793086588382721, + "learning_rate": 9.145609940654373e-05, + "loss": 1.0748, + "step": 59140 + }, + { + "epoch": 0.3778924907044197, + "grad_norm": 1.0463335514068604, + "learning_rate": 9.145329397202307e-05, + "loss": 0.9517, + "step": 59150 + }, + { + "epoch": 0.3779563778541584, + "grad_norm": 0.8374640345573425, + "learning_rate": 9.145048812003186e-05, + "loss": 0.8408, + "step": 59160 + }, + { + "epoch": 0.3780202650038971, + "grad_norm": 0.8713728189468384, + "learning_rate": 9.144768185059838e-05, + "loss": 1.1013, + "step": 59170 + }, + { + "epoch": 0.37808415215363583, + "grad_norm": 0.706382691860199, + "learning_rate": 9.14448751637509e-05, + "loss": 0.8862, + "step": 59180 + }, + { + "epoch": 0.37814803930337454, + "grad_norm": 0.464167058467865, + "learning_rate": 9.144206805951767e-05, + "loss": 0.6612, + "step": 59190 + }, + 
{ + "epoch": 0.37821192645311325, + "grad_norm": 0.7974499464035034, + "learning_rate": 9.143926053792696e-05, + "loss": 1.1017, + "step": 59200 + }, + { + "epoch": 0.3782758136028519, + "grad_norm": 1.0677493810653687, + "learning_rate": 9.143645259900704e-05, + "loss": 1.0395, + "step": 59210 + }, + { + "epoch": 0.3783397007525906, + "grad_norm": 0.6200050711631775, + "learning_rate": 9.14336442427862e-05, + "loss": 0.8772, + "step": 59220 + }, + { + "epoch": 0.3784035879023293, + "grad_norm": 0.8041068315505981, + "learning_rate": 9.143083546929272e-05, + "loss": 0.8241, + "step": 59230 + }, + { + "epoch": 0.378467475052068, + "grad_norm": 1.050399661064148, + "learning_rate": 9.142802627855487e-05, + "loss": 0.7528, + "step": 59240 + }, + { + "epoch": 0.3785313622018067, + "grad_norm": 0.5964527726173401, + "learning_rate": 9.142521667060098e-05, + "loss": 0.8251, + "step": 59250 + }, + { + "epoch": 0.3785952493515454, + "grad_norm": 0.813062310218811, + "learning_rate": 9.14224066454593e-05, + "loss": 1.0259, + "step": 59260 + }, + { + "epoch": 0.37865913650128413, + "grad_norm": 0.8622726798057556, + "learning_rate": 9.141959620315816e-05, + "loss": 0.8479, + "step": 59270 + }, + { + "epoch": 0.37872302365102284, + "grad_norm": 0.59121173620224, + "learning_rate": 9.141678534372584e-05, + "loss": 0.6244, + "step": 59280 + }, + { + "epoch": 0.37878691080076154, + "grad_norm": 0.7073304653167725, + "learning_rate": 9.141397406719066e-05, + "loss": 0.7587, + "step": 59290 + }, + { + "epoch": 0.37885079795050025, + "grad_norm": 0.5923304557800293, + "learning_rate": 9.141116237358095e-05, + "loss": 0.9219, + "step": 59300 + }, + { + "epoch": 0.37891468510023896, + "grad_norm": 0.909243106842041, + "learning_rate": 9.1408350262925e-05, + "loss": 0.721, + "step": 59310 + }, + { + "epoch": 0.37897857224997766, + "grad_norm": 0.40945374965667725, + "learning_rate": 9.140553773525114e-05, + "loss": 0.9946, + "step": 59320 + }, + { + "epoch": 0.3790424593997163, + 
"grad_norm": 1.5487751960754395, + "learning_rate": 9.14027247905877e-05, + "loss": 1.0129, + "step": 59330 + }, + { + "epoch": 0.379106346549455, + "grad_norm": 0.946149468421936, + "learning_rate": 9.1399911428963e-05, + "loss": 1.0866, + "step": 59340 + }, + { + "epoch": 0.3791702336991937, + "grad_norm": 1.2105820178985596, + "learning_rate": 9.139709765040537e-05, + "loss": 1.1053, + "step": 59350 + }, + { + "epoch": 0.37923412084893243, + "grad_norm": 0.9297011494636536, + "learning_rate": 9.139428345494316e-05, + "loss": 1.0082, + "step": 59360 + }, + { + "epoch": 0.37929800799867114, + "grad_norm": 1.3490419387817383, + "learning_rate": 9.139146884260469e-05, + "loss": 0.7593, + "step": 59370 + }, + { + "epoch": 0.37936189514840984, + "grad_norm": 1.027013897895813, + "learning_rate": 9.138865381341835e-05, + "loss": 0.8555, + "step": 59380 + }, + { + "epoch": 0.37942578229814855, + "grad_norm": 0.7934104800224304, + "learning_rate": 9.138583836741243e-05, + "loss": 0.7812, + "step": 59390 + }, + { + "epoch": 0.37948966944788726, + "grad_norm": 0.702707052230835, + "learning_rate": 9.138302250461532e-05, + "loss": 0.9684, + "step": 59400 + }, + { + "epoch": 0.37955355659762596, + "grad_norm": 0.6672869920730591, + "learning_rate": 9.138020622505539e-05, + "loss": 0.7703, + "step": 59410 + }, + { + "epoch": 0.37961744374736467, + "grad_norm": 0.811865508556366, + "learning_rate": 9.137738952876096e-05, + "loss": 0.7615, + "step": 59420 + }, + { + "epoch": 0.3796813308971034, + "grad_norm": 1.041718602180481, + "learning_rate": 9.137457241576044e-05, + "loss": 0.8087, + "step": 59430 + }, + { + "epoch": 0.3797452180468421, + "grad_norm": 0.9935733079910278, + "learning_rate": 9.137175488608217e-05, + "loss": 0.8609, + "step": 59440 + }, + { + "epoch": 0.3798091051965808, + "grad_norm": 0.6558438539505005, + "learning_rate": 9.136893693975455e-05, + "loss": 1.1521, + "step": 59450 + }, + { + "epoch": 0.37987299234631944, + "grad_norm": 1.0106873512268066, + 
"learning_rate": 9.136611857680593e-05, + "loss": 0.8439, + "step": 59460 + }, + { + "epoch": 0.37993687949605814, + "grad_norm": 0.8947387337684631, + "learning_rate": 9.136329979726472e-05, + "loss": 0.9528, + "step": 59470 + }, + { + "epoch": 0.38000076664579685, + "grad_norm": 1.6661876440048218, + "learning_rate": 9.13604806011593e-05, + "loss": 0.7804, + "step": 59480 + }, + { + "epoch": 0.38006465379553556, + "grad_norm": 0.7552819848060608, + "learning_rate": 9.135766098851803e-05, + "loss": 0.8697, + "step": 59490 + }, + { + "epoch": 0.38012854094527426, + "grad_norm": 1.3484975099563599, + "learning_rate": 9.135484095936937e-05, + "loss": 0.7785, + "step": 59500 + }, + { + "epoch": 0.38019242809501297, + "grad_norm": 0.9297848343849182, + "learning_rate": 9.135202051374167e-05, + "loss": 0.695, + "step": 59510 + }, + { + "epoch": 0.3802563152447517, + "grad_norm": 0.8916332125663757, + "learning_rate": 9.134919965166335e-05, + "loss": 0.8245, + "step": 59520 + }, + { + "epoch": 0.3803202023944904, + "grad_norm": 1.1042640209197998, + "learning_rate": 9.13463783731628e-05, + "loss": 0.8783, + "step": 59530 + }, + { + "epoch": 0.3803840895442291, + "grad_norm": 0.8340087532997131, + "learning_rate": 9.134355667826847e-05, + "loss": 0.7602, + "step": 59540 + }, + { + "epoch": 0.3804479766939678, + "grad_norm": 1.1028873920440674, + "learning_rate": 9.134073456700876e-05, + "loss": 0.9535, + "step": 59550 + }, + { + "epoch": 0.3805118638437065, + "grad_norm": 1.2923681735992432, + "learning_rate": 9.133791203941207e-05, + "loss": 0.9221, + "step": 59560 + }, + { + "epoch": 0.3805757509934452, + "grad_norm": 1.8344556093215942, + "learning_rate": 9.133508909550686e-05, + "loss": 1.1256, + "step": 59570 + }, + { + "epoch": 0.38063963814318386, + "grad_norm": 0.9875249862670898, + "learning_rate": 9.133226573532154e-05, + "loss": 0.7142, + "step": 59580 + }, + { + "epoch": 0.38070352529292256, + "grad_norm": 0.9586598873138428, + "learning_rate": 
9.132944195888455e-05, + "loss": 0.7369, + "step": 59590 + }, + { + "epoch": 0.38076741244266127, + "grad_norm": 0.8368619084358215, + "learning_rate": 9.132661776622431e-05, + "loss": 0.7057, + "step": 59600 + }, + { + "epoch": 0.3808312995924, + "grad_norm": 0.9391975998878479, + "learning_rate": 9.132379315736928e-05, + "loss": 0.9706, + "step": 59610 + }, + { + "epoch": 0.3808951867421387, + "grad_norm": 0.6700417995452881, + "learning_rate": 9.132096813234792e-05, + "loss": 1.1595, + "step": 59620 + }, + { + "epoch": 0.3809590738918774, + "grad_norm": 4.27878475189209, + "learning_rate": 9.131814269118864e-05, + "loss": 1.0109, + "step": 59630 + }, + { + "epoch": 0.3810229610416161, + "grad_norm": 0.9258844256401062, + "learning_rate": 9.131531683391993e-05, + "loss": 0.979, + "step": 59640 + }, + { + "epoch": 0.3810868481913548, + "grad_norm": 4.915820121765137, + "learning_rate": 9.131249056057023e-05, + "loss": 1.0458, + "step": 59650 + }, + { + "epoch": 0.3811507353410935, + "grad_norm": 2.258350133895874, + "learning_rate": 9.130966387116802e-05, + "loss": 0.7549, + "step": 59660 + }, + { + "epoch": 0.3812146224908322, + "grad_norm": 0.5593277812004089, + "learning_rate": 9.130683676574175e-05, + "loss": 0.8745, + "step": 59670 + }, + { + "epoch": 0.3812785096405709, + "grad_norm": 0.8787796497344971, + "learning_rate": 9.13040092443199e-05, + "loss": 0.8045, + "step": 59680 + }, + { + "epoch": 0.3813423967903096, + "grad_norm": 0.9920330047607422, + "learning_rate": 9.130118130693095e-05, + "loss": 0.9066, + "step": 59690 + }, + { + "epoch": 0.3814062839400483, + "grad_norm": 2.229135513305664, + "learning_rate": 9.129835295360336e-05, + "loss": 0.8905, + "step": 59700 + }, + { + "epoch": 0.381470171089787, + "grad_norm": 0.8204028010368347, + "learning_rate": 9.129552418436563e-05, + "loss": 1.0525, + "step": 59710 + }, + { + "epoch": 0.3815340582395257, + "grad_norm": 0.8208606243133545, + "learning_rate": 9.129269499924626e-05, + "loss": 0.8469, + 
"step": 59720 + }, + { + "epoch": 0.3815979453892644, + "grad_norm": 0.8647171854972839, + "learning_rate": 9.128986539827371e-05, + "loss": 0.8889, + "step": 59730 + }, + { + "epoch": 0.3816618325390031, + "grad_norm": 1.4753942489624023, + "learning_rate": 9.128703538147651e-05, + "loss": 0.9241, + "step": 59740 + }, + { + "epoch": 0.3817257196887418, + "grad_norm": 6.072597503662109, + "learning_rate": 9.128420494888313e-05, + "loss": 1.3249, + "step": 59750 + }, + { + "epoch": 0.3817896068384805, + "grad_norm": 1.8557090759277344, + "learning_rate": 9.128137410052211e-05, + "loss": 1.0087, + "step": 59760 + }, + { + "epoch": 0.3818534939882192, + "grad_norm": 0.828505277633667, + "learning_rate": 9.127854283642192e-05, + "loss": 0.8843, + "step": 59770 + }, + { + "epoch": 0.3819173811379579, + "grad_norm": 0.6272063851356506, + "learning_rate": 9.127571115661111e-05, + "loss": 0.9136, + "step": 59780 + }, + { + "epoch": 0.38198126828769663, + "grad_norm": 0.683825671672821, + "learning_rate": 9.127287906111817e-05, + "loss": 0.9161, + "step": 59790 + }, + { + "epoch": 0.38204515543743534, + "grad_norm": 0.7828848958015442, + "learning_rate": 9.127004654997163e-05, + "loss": 0.8366, + "step": 59800 + }, + { + "epoch": 0.38210904258717404, + "grad_norm": 2.599881410598755, + "learning_rate": 9.126721362320003e-05, + "loss": 1.0435, + "step": 59810 + }, + { + "epoch": 0.3821729297369127, + "grad_norm": 1.0187602043151855, + "learning_rate": 9.126438028083186e-05, + "loss": 0.9667, + "step": 59820 + }, + { + "epoch": 0.3822368168866514, + "grad_norm": 1.3073110580444336, + "learning_rate": 9.126154652289571e-05, + "loss": 0.7698, + "step": 59830 + }, + { + "epoch": 0.3823007040363901, + "grad_norm": 0.6932925581932068, + "learning_rate": 9.125871234942008e-05, + "loss": 0.6695, + "step": 59840 + }, + { + "epoch": 0.3823645911861288, + "grad_norm": 1.466614842414856, + "learning_rate": 9.125587776043352e-05, + "loss": 1.0159, + "step": 59850 + }, + { + "epoch": 
0.3824284783358675, + "grad_norm": 0.5515915155410767, + "learning_rate": 9.125304275596458e-05, + "loss": 1.0273, + "step": 59860 + }, + { + "epoch": 0.3824923654856062, + "grad_norm": 0.6064876914024353, + "learning_rate": 9.125020733604182e-05, + "loss": 0.8891, + "step": 59870 + }, + { + "epoch": 0.38255625263534493, + "grad_norm": 0.8917511105537415, + "learning_rate": 9.124737150069378e-05, + "loss": 1.1068, + "step": 59880 + }, + { + "epoch": 0.38262013978508363, + "grad_norm": 0.7151978611946106, + "learning_rate": 9.1244535249949e-05, + "loss": 0.8862, + "step": 59890 + }, + { + "epoch": 0.38268402693482234, + "grad_norm": 0.8112443089485168, + "learning_rate": 9.124169858383611e-05, + "loss": 0.897, + "step": 59900 + }, + { + "epoch": 0.38274791408456105, + "grad_norm": 1.089768886566162, + "learning_rate": 9.123886150238361e-05, + "loss": 0.9832, + "step": 59910 + }, + { + "epoch": 0.38281180123429975, + "grad_norm": 0.7794529795646667, + "learning_rate": 9.12360240056201e-05, + "loss": 0.8341, + "step": 59920 + }, + { + "epoch": 0.38287568838403846, + "grad_norm": 0.5675161480903625, + "learning_rate": 9.123318609357417e-05, + "loss": 0.9027, + "step": 59930 + }, + { + "epoch": 0.3829395755337771, + "grad_norm": 0.8330199718475342, + "learning_rate": 9.123034776627437e-05, + "loss": 0.9739, + "step": 59940 + }, + { + "epoch": 0.3830034626835158, + "grad_norm": 1.6454709768295288, + "learning_rate": 9.12275090237493e-05, + "loss": 0.977, + "step": 59950 + }, + { + "epoch": 0.3830673498332545, + "grad_norm": 0.8024013042449951, + "learning_rate": 9.122466986602756e-05, + "loss": 0.9452, + "step": 59960 + }, + { + "epoch": 0.38313123698299323, + "grad_norm": 1.1360933780670166, + "learning_rate": 9.122183029313771e-05, + "loss": 1.0236, + "step": 59970 + }, + { + "epoch": 0.38319512413273193, + "grad_norm": 0.7337785959243774, + "learning_rate": 9.121899030510839e-05, + "loss": 0.9299, + "step": 59980 + }, + { + "epoch": 0.38325901128247064, + "grad_norm": 
0.8636689782142639, + "learning_rate": 9.121614990196816e-05, + "loss": 0.7671, + "step": 59990 + }, + { + "epoch": 0.38332289843220935, + "grad_norm": 1.2737140655517578, + "learning_rate": 9.121330908374564e-05, + "loss": 0.8175, + "step": 60000 + }, + { + "epoch": 0.38338678558194805, + "grad_norm": 0.6086975336074829, + "learning_rate": 9.121046785046945e-05, + "loss": 1.1958, + "step": 60010 + }, + { + "epoch": 0.38345067273168676, + "grad_norm": 0.7334972023963928, + "learning_rate": 9.12076262021682e-05, + "loss": 0.8533, + "step": 60020 + }, + { + "epoch": 0.38351455988142547, + "grad_norm": 0.67818683385849, + "learning_rate": 9.12047841388705e-05, + "loss": 0.7376, + "step": 60030 + }, + { + "epoch": 0.38357844703116417, + "grad_norm": 0.5810967683792114, + "learning_rate": 9.120194166060498e-05, + "loss": 0.8313, + "step": 60040 + }, + { + "epoch": 0.3836423341809029, + "grad_norm": 0.7271260619163513, + "learning_rate": 9.119909876740027e-05, + "loss": 0.8529, + "step": 60050 + }, + { + "epoch": 0.38370622133064153, + "grad_norm": 1.0164223909378052, + "learning_rate": 9.119625545928499e-05, + "loss": 0.919, + "step": 60060 + }, + { + "epoch": 0.38377010848038023, + "grad_norm": 1.4784969091415405, + "learning_rate": 9.119341173628777e-05, + "loss": 0.9259, + "step": 60070 + }, + { + "epoch": 0.38383399563011894, + "grad_norm": 0.8718630075454712, + "learning_rate": 9.119056759843724e-05, + "loss": 0.918, + "step": 60080 + }, + { + "epoch": 0.38389788277985765, + "grad_norm": 0.9398227334022522, + "learning_rate": 9.118772304576209e-05, + "loss": 1.0287, + "step": 60090 + }, + { + "epoch": 0.38396176992959635, + "grad_norm": 0.7162007689476013, + "learning_rate": 9.118487807829093e-05, + "loss": 0.8178, + "step": 60100 + }, + { + "epoch": 0.38402565707933506, + "grad_norm": 1.4307546615600586, + "learning_rate": 9.118203269605242e-05, + "loss": 0.8535, + "step": 60110 + }, + { + "epoch": 0.38408954422907376, + "grad_norm": 1.0519388914108276, + 
"learning_rate": 9.11791868990752e-05, + "loss": 1.2414, + "step": 60120 + }, + { + "epoch": 0.38415343137881247, + "grad_norm": 0.8539866805076599, + "learning_rate": 9.117634068738794e-05, + "loss": 0.8189, + "step": 60130 + }, + { + "epoch": 0.3842173185285512, + "grad_norm": 0.8897231221199036, + "learning_rate": 9.117349406101931e-05, + "loss": 1.0583, + "step": 60140 + }, + { + "epoch": 0.3842812056782899, + "grad_norm": 0.9356622099876404, + "learning_rate": 9.117064701999797e-05, + "loss": 0.8774, + "step": 60150 + }, + { + "epoch": 0.3843450928280286, + "grad_norm": 0.934384822845459, + "learning_rate": 9.116779956435262e-05, + "loss": 1.0653, + "step": 60160 + }, + { + "epoch": 0.3844089799777673, + "grad_norm": 0.5904353857040405, + "learning_rate": 9.11649516941119e-05, + "loss": 0.8482, + "step": 60170 + }, + { + "epoch": 0.38447286712750595, + "grad_norm": 0.840069055557251, + "learning_rate": 9.116210340930451e-05, + "loss": 1.0966, + "step": 60180 + }, + { + "epoch": 0.38453675427724465, + "grad_norm": 2.140904188156128, + "learning_rate": 9.115925470995912e-05, + "loss": 0.9313, + "step": 60190 + }, + { + "epoch": 0.38460064142698336, + "grad_norm": 1.6145496368408203, + "learning_rate": 9.115640559610444e-05, + "loss": 0.9065, + "step": 60200 + }, + { + "epoch": 0.38466452857672206, + "grad_norm": 0.8971934914588928, + "learning_rate": 9.115355606776913e-05, + "loss": 0.7211, + "step": 60210 + }, + { + "epoch": 0.38472841572646077, + "grad_norm": 0.740960419178009, + "learning_rate": 9.115070612498192e-05, + "loss": 0.7915, + "step": 60220 + }, + { + "epoch": 0.3847923028761995, + "grad_norm": 1.029941201210022, + "learning_rate": 9.114785576777149e-05, + "loss": 0.9746, + "step": 60230 + }, + { + "epoch": 0.3848561900259382, + "grad_norm": 1.0357356071472168, + "learning_rate": 9.114500499616656e-05, + "loss": 0.7439, + "step": 60240 + }, + { + "epoch": 0.3849200771756769, + "grad_norm": 0.823661208152771, + "learning_rate": 
9.114215381019584e-05, + "loss": 0.8409, + "step": 60250 + }, + { + "epoch": 0.3849839643254156, + "grad_norm": 1.5275285243988037, + "learning_rate": 9.113930220988804e-05, + "loss": 0.6833, + "step": 60260 + }, + { + "epoch": 0.3850478514751543, + "grad_norm": 0.8406334519386292, + "learning_rate": 9.113645019527187e-05, + "loss": 0.941, + "step": 60270 + }, + { + "epoch": 0.385111738624893, + "grad_norm": 1.2402430772781372, + "learning_rate": 9.113359776637604e-05, + "loss": 0.823, + "step": 60280 + }, + { + "epoch": 0.3851756257746317, + "grad_norm": 0.8033724427223206, + "learning_rate": 9.113074492322933e-05, + "loss": 1.0329, + "step": 60290 + }, + { + "epoch": 0.3852395129243704, + "grad_norm": 0.7544481158256531, + "learning_rate": 9.112789166586041e-05, + "loss": 0.7707, + "step": 60300 + }, + { + "epoch": 0.38530340007410907, + "grad_norm": 1.0110443830490112, + "learning_rate": 9.112503799429805e-05, + "loss": 0.8752, + "step": 60310 + }, + { + "epoch": 0.3853672872238478, + "grad_norm": 0.9389250874519348, + "learning_rate": 9.112218390857098e-05, + "loss": 1.004, + "step": 60320 + }, + { + "epoch": 0.3854311743735865, + "grad_norm": 0.7335034608840942, + "learning_rate": 9.111932940870793e-05, + "loss": 0.9463, + "step": 60330 + }, + { + "epoch": 0.3854950615233252, + "grad_norm": 0.9130538105964661, + "learning_rate": 9.111647449473766e-05, + "loss": 0.8286, + "step": 60340 + }, + { + "epoch": 0.3855589486730639, + "grad_norm": 1.8311418294906616, + "learning_rate": 9.111361916668894e-05, + "loss": 0.9905, + "step": 60350 + }, + { + "epoch": 0.3856228358228026, + "grad_norm": 0.7370787858963013, + "learning_rate": 9.111076342459051e-05, + "loss": 1.105, + "step": 60360 + }, + { + "epoch": 0.3856867229725413, + "grad_norm": 0.8268787860870361, + "learning_rate": 9.110790726847109e-05, + "loss": 0.779, + "step": 60370 + }, + { + "epoch": 0.38575061012228, + "grad_norm": 0.7258269190788269, + "learning_rate": 9.110505069835952e-05, + "loss": 0.8981, + 
"step": 60380 + }, + { + "epoch": 0.3858144972720187, + "grad_norm": 1.1114614009857178, + "learning_rate": 9.11021937142845e-05, + "loss": 0.9508, + "step": 60390 + }, + { + "epoch": 0.3858783844217574, + "grad_norm": 0.6973649263381958, + "learning_rate": 9.109933631627485e-05, + "loss": 0.9868, + "step": 60400 + }, + { + "epoch": 0.38594227157149613, + "grad_norm": 0.8535771369934082, + "learning_rate": 9.109647850435931e-05, + "loss": 0.9278, + "step": 60410 + }, + { + "epoch": 0.38600615872123484, + "grad_norm": 0.9913718104362488, + "learning_rate": 9.10936202785667e-05, + "loss": 0.9525, + "step": 60420 + }, + { + "epoch": 0.3860700458709735, + "grad_norm": 0.9371497631072998, + "learning_rate": 9.109076163892577e-05, + "loss": 0.9669, + "step": 60430 + }, + { + "epoch": 0.3861339330207122, + "grad_norm": 0.6546643972396851, + "learning_rate": 9.108790258546533e-05, + "loss": 0.6787, + "step": 60440 + }, + { + "epoch": 0.3861978201704509, + "grad_norm": 0.8154623508453369, + "learning_rate": 9.108504311821416e-05, + "loss": 1.0956, + "step": 60450 + }, + { + "epoch": 0.3862617073201896, + "grad_norm": 0.5797396898269653, + "learning_rate": 9.108218323720104e-05, + "loss": 1.2229, + "step": 60460 + }, + { + "epoch": 0.3863255944699283, + "grad_norm": 1.2264608144760132, + "learning_rate": 9.107932294245483e-05, + "loss": 0.9712, + "step": 60470 + }, + { + "epoch": 0.386389481619667, + "grad_norm": 0.9331986904144287, + "learning_rate": 9.107646223400428e-05, + "loss": 0.8631, + "step": 60480 + }, + { + "epoch": 0.3864533687694057, + "grad_norm": 1.17788827419281, + "learning_rate": 9.107360111187821e-05, + "loss": 0.7527, + "step": 60490 + }, + { + "epoch": 0.38651725591914443, + "grad_norm": 0.9666171073913574, + "learning_rate": 9.107073957610546e-05, + "loss": 0.745, + "step": 60500 + }, + { + "epoch": 0.38658114306888314, + "grad_norm": 0.7744701504707336, + "learning_rate": 9.106787762671483e-05, + "loss": 0.9245, + "step": 60510 + }, + { + "epoch": 
0.38664503021862184, + "grad_norm": 0.7567153573036194, + "learning_rate": 9.106501526373514e-05, + "loss": 0.8483, + "step": 60520 + }, + { + "epoch": 0.38670891736836055, + "grad_norm": 1.0141370296478271, + "learning_rate": 9.106215248719522e-05, + "loss": 0.8139, + "step": 60530 + }, + { + "epoch": 0.38677280451809926, + "grad_norm": 0.924473762512207, + "learning_rate": 9.10592892971239e-05, + "loss": 1.1207, + "step": 60540 + }, + { + "epoch": 0.3868366916678379, + "grad_norm": 0.6461699604988098, + "learning_rate": 9.105642569355002e-05, + "loss": 1.1942, + "step": 60550 + }, + { + "epoch": 0.3869005788175766, + "grad_norm": 0.7070831060409546, + "learning_rate": 9.105356167650241e-05, + "loss": 0.7269, + "step": 60560 + }, + { + "epoch": 0.3869644659673153, + "grad_norm": 1.24761962890625, + "learning_rate": 9.105069724600992e-05, + "loss": 0.9219, + "step": 60570 + }, + { + "epoch": 0.387028353117054, + "grad_norm": 0.9694204330444336, + "learning_rate": 9.104783240210137e-05, + "loss": 1.1463, + "step": 60580 + }, + { + "epoch": 0.38709224026679273, + "grad_norm": 0.7237581014633179, + "learning_rate": 9.104496714480567e-05, + "loss": 1.1098, + "step": 60590 + }, + { + "epoch": 0.38715612741653144, + "grad_norm": 0.9114017486572266, + "learning_rate": 9.104210147415163e-05, + "loss": 1.0888, + "step": 60600 + }, + { + "epoch": 0.38722001456627014, + "grad_norm": 0.5623325705528259, + "learning_rate": 9.103923539016813e-05, + "loss": 0.9529, + "step": 60610 + }, + { + "epoch": 0.38728390171600885, + "grad_norm": 0.6232447624206543, + "learning_rate": 9.1036368892884e-05, + "loss": 1.0587, + "step": 60620 + }, + { + "epoch": 0.38734778886574756, + "grad_norm": 0.9023538827896118, + "learning_rate": 9.103350198232816e-05, + "loss": 1.0181, + "step": 60630 + }, + { + "epoch": 0.38741167601548626, + "grad_norm": 1.999245047569275, + "learning_rate": 9.103063465852945e-05, + "loss": 0.9449, + "step": 60640 + }, + { + "epoch": 0.38747556316522497, + "grad_norm": 
1.0726778507232666, + "learning_rate": 9.102776692151675e-05, + "loss": 0.8554, + "step": 60650 + }, + { + "epoch": 0.3875394503149637, + "grad_norm": 0.9312451481819153, + "learning_rate": 9.102489877131894e-05, + "loss": 0.8106, + "step": 60660 + }, + { + "epoch": 0.3876033374647023, + "grad_norm": 0.7528103590011597, + "learning_rate": 9.102203020796491e-05, + "loss": 0.9015, + "step": 60670 + }, + { + "epoch": 0.38766722461444103, + "grad_norm": 0.6276060342788696, + "learning_rate": 9.101916123148356e-05, + "loss": 0.8222, + "step": 60680 + }, + { + "epoch": 0.38773111176417974, + "grad_norm": 0.818074107170105, + "learning_rate": 9.101629184190375e-05, + "loss": 1.1241, + "step": 60690 + }, + { + "epoch": 0.38779499891391844, + "grad_norm": 0.8359874486923218, + "learning_rate": 9.10134220392544e-05, + "loss": 0.9222, + "step": 60700 + }, + { + "epoch": 0.38785888606365715, + "grad_norm": 0.846093475818634, + "learning_rate": 9.101055182356442e-05, + "loss": 0.9757, + "step": 60710 + }, + { + "epoch": 0.38792277321339586, + "grad_norm": 0.7747712731361389, + "learning_rate": 9.100768119486269e-05, + "loss": 0.7789, + "step": 60720 + }, + { + "epoch": 0.38798666036313456, + "grad_norm": 1.336980938911438, + "learning_rate": 9.100481015317814e-05, + "loss": 1.1395, + "step": 60730 + }, + { + "epoch": 0.38805054751287327, + "grad_norm": 1.1585602760314941, + "learning_rate": 9.100193869853968e-05, + "loss": 1.0321, + "step": 60740 + }, + { + "epoch": 0.388114434662612, + "grad_norm": 0.9213445782661438, + "learning_rate": 9.099906683097623e-05, + "loss": 0.9182, + "step": 60750 + }, + { + "epoch": 0.3881783218123507, + "grad_norm": 0.7520207166671753, + "learning_rate": 9.09961945505167e-05, + "loss": 0.6615, + "step": 60760 + }, + { + "epoch": 0.3882422089620894, + "grad_norm": 1.0059177875518799, + "learning_rate": 9.099332185719003e-05, + "loss": 0.7059, + "step": 60770 + }, + { + "epoch": 0.3883060961118281, + "grad_norm": 1.6132454872131348, + 
"learning_rate": 9.099044875102513e-05, + "loss": 1.1878, + "step": 60780 + }, + { + "epoch": 0.38836998326156674, + "grad_norm": 0.8192178010940552, + "learning_rate": 9.098757523205097e-05, + "loss": 1.0932, + "step": 60790 + }, + { + "epoch": 0.38843387041130545, + "grad_norm": 0.9005227088928223, + "learning_rate": 9.098470130029645e-05, + "loss": 0.762, + "step": 60800 + }, + { + "epoch": 0.38849775756104415, + "grad_norm": 0.7836887240409851, + "learning_rate": 9.098182695579054e-05, + "loss": 0.8244, + "step": 60810 + }, + { + "epoch": 0.38856164471078286, + "grad_norm": 0.7896131277084351, + "learning_rate": 9.097895219856218e-05, + "loss": 0.7864, + "step": 60820 + }, + { + "epoch": 0.38862553186052157, + "grad_norm": 1.6993827819824219, + "learning_rate": 9.09760770286403e-05, + "loss": 0.7552, + "step": 60830 + }, + { + "epoch": 0.3886894190102603, + "grad_norm": 0.8872599601745605, + "learning_rate": 9.09732014460539e-05, + "loss": 1.1259, + "step": 60840 + }, + { + "epoch": 0.388753306159999, + "grad_norm": 0.8446595072746277, + "learning_rate": 9.097032545083191e-05, + "loss": 0.7728, + "step": 60850 + }, + { + "epoch": 0.3888171933097377, + "grad_norm": 0.7190898656845093, + "learning_rate": 9.09674490430033e-05, + "loss": 1.0357, + "step": 60860 + }, + { + "epoch": 0.3888810804594764, + "grad_norm": 0.8590859770774841, + "learning_rate": 9.096457222259702e-05, + "loss": 0.7801, + "step": 60870 + }, + { + "epoch": 0.3889449676092151, + "grad_norm": 0.994317889213562, + "learning_rate": 9.096169498964206e-05, + "loss": 0.9578, + "step": 60880 + }, + { + "epoch": 0.3890088547589538, + "grad_norm": 1.0959383249282837, + "learning_rate": 9.095881734416742e-05, + "loss": 0.7354, + "step": 60890 + }, + { + "epoch": 0.3890727419086925, + "grad_norm": 1.1300466060638428, + "learning_rate": 9.095593928620203e-05, + "loss": 1.2792, + "step": 60900 + }, + { + "epoch": 0.38913662905843116, + "grad_norm": 0.9118770360946655, + "learning_rate": 
9.095306081577491e-05, + "loss": 0.8323, + "step": 60910 + }, + { + "epoch": 0.38920051620816987, + "grad_norm": 0.5770663022994995, + "learning_rate": 9.095018193291504e-05, + "loss": 0.7813, + "step": 60920 + }, + { + "epoch": 0.3892644033579086, + "grad_norm": 1.2142269611358643, + "learning_rate": 9.094730263765141e-05, + "loss": 0.6744, + "step": 60930 + }, + { + "epoch": 0.3893282905076473, + "grad_norm": 0.6319569945335388, + "learning_rate": 9.094442293001301e-05, + "loss": 0.7512, + "step": 60940 + }, + { + "epoch": 0.389392177657386, + "grad_norm": 0.9332210421562195, + "learning_rate": 9.094154281002884e-05, + "loss": 0.9045, + "step": 60950 + }, + { + "epoch": 0.3894560648071247, + "grad_norm": 0.786271870136261, + "learning_rate": 9.093866227772794e-05, + "loss": 1.1151, + "step": 60960 + }, + { + "epoch": 0.3895199519568634, + "grad_norm": 0.8566588163375854, + "learning_rate": 9.093578133313928e-05, + "loss": 0.7992, + "step": 60970 + }, + { + "epoch": 0.3895838391066021, + "grad_norm": 0.7604480385780334, + "learning_rate": 9.093289997629188e-05, + "loss": 0.924, + "step": 60980 + }, + { + "epoch": 0.3896477262563408, + "grad_norm": 1.0149980783462524, + "learning_rate": 9.093001820721479e-05, + "loss": 1.0535, + "step": 60990 + }, + { + "epoch": 0.3897116134060795, + "grad_norm": 1.085911512374878, + "learning_rate": 9.092713602593699e-05, + "loss": 0.7629, + "step": 61000 + }, + { + "epoch": 0.3897755005558182, + "grad_norm": 1.1118038892745972, + "learning_rate": 9.092425343248753e-05, + "loss": 0.8315, + "step": 61010 + }, + { + "epoch": 0.38983938770555693, + "grad_norm": 0.49953410029411316, + "learning_rate": 9.092137042689542e-05, + "loss": 0.8272, + "step": 61020 + }, + { + "epoch": 0.3899032748552956, + "grad_norm": 0.703426718711853, + "learning_rate": 9.091848700918973e-05, + "loss": 0.8759, + "step": 61030 + }, + { + "epoch": 0.3899671620050343, + "grad_norm": 1.1554392576217651, + "learning_rate": 9.091560317939946e-05, + "loss": 
0.9506, + "step": 61040 + }, + { + "epoch": 0.390031049154773, + "grad_norm": 0.745389997959137, + "learning_rate": 9.091271893755367e-05, + "loss": 0.7726, + "step": 61050 + }, + { + "epoch": 0.3900949363045117, + "grad_norm": 0.6152491569519043, + "learning_rate": 9.090983428368141e-05, + "loss": 0.896, + "step": 61060 + }, + { + "epoch": 0.3901588234542504, + "grad_norm": 2.3798322677612305, + "learning_rate": 9.09069492178117e-05, + "loss": 0.9652, + "step": 61070 + }, + { + "epoch": 0.3902227106039891, + "grad_norm": 0.8589335680007935, + "learning_rate": 9.090435230629522e-05, + "loss": 0.772, + "step": 61080 + }, + { + "epoch": 0.3902865977537278, + "grad_norm": 0.7509768009185791, + "learning_rate": 9.090146645771047e-05, + "loss": 0.9196, + "step": 61090 + }, + { + "epoch": 0.3903504849034665, + "grad_norm": 0.9738487005233765, + "learning_rate": 9.089858019721258e-05, + "loss": 0.8836, + "step": 61100 + }, + { + "epoch": 0.3904143720532052, + "grad_norm": 2.166499137878418, + "learning_rate": 9.089569352483061e-05, + "loss": 1.2521, + "step": 61110 + }, + { + "epoch": 0.39047825920294393, + "grad_norm": 0.9337096214294434, + "learning_rate": 9.089280644059361e-05, + "loss": 0.8933, + "step": 61120 + }, + { + "epoch": 0.39054214635268264, + "grad_norm": 1.1011388301849365, + "learning_rate": 9.088991894453069e-05, + "loss": 0.7827, + "step": 61130 + }, + { + "epoch": 0.39060603350242135, + "grad_norm": 1.5726940631866455, + "learning_rate": 9.08870310366709e-05, + "loss": 1.0105, + "step": 61140 + }, + { + "epoch": 0.39066992065216005, + "grad_norm": 0.6980756521224976, + "learning_rate": 9.088414271704334e-05, + "loss": 1.1352, + "step": 61150 + }, + { + "epoch": 0.3907338078018987, + "grad_norm": 0.9901998043060303, + "learning_rate": 9.088125398567708e-05, + "loss": 0.8634, + "step": 61160 + }, + { + "epoch": 0.3907976949516374, + "grad_norm": 0.7848410606384277, + "learning_rate": 9.087836484260125e-05, + "loss": 0.968, + "step": 61170 + }, + { + 
"epoch": 0.3908615821013761, + "grad_norm": 2.4346492290496826, + "learning_rate": 9.08754752878449e-05, + "loss": 0.8797, + "step": 61180 + }, + { + "epoch": 0.3909254692511148, + "grad_norm": 0.5621653199195862, + "learning_rate": 9.087258532143716e-05, + "loss": 0.8708, + "step": 61190 + }, + { + "epoch": 0.3909893564008535, + "grad_norm": 0.6077272891998291, + "learning_rate": 9.086969494340714e-05, + "loss": 0.9137, + "step": 61200 + }, + { + "epoch": 0.39105324355059223, + "grad_norm": 1.679137945175171, + "learning_rate": 9.08668041537839e-05, + "loss": 1.0507, + "step": 61210 + }, + { + "epoch": 0.39111713070033094, + "grad_norm": 0.7337985634803772, + "learning_rate": 9.086391295259662e-05, + "loss": 0.7978, + "step": 61220 + }, + { + "epoch": 0.39118101785006965, + "grad_norm": 0.8496336340904236, + "learning_rate": 9.086102133987436e-05, + "loss": 1.0827, + "step": 61230 + }, + { + "epoch": 0.39124490499980835, + "grad_norm": 1.5202635526657104, + "learning_rate": 9.085812931564627e-05, + "loss": 0.7946, + "step": 61240 + }, + { + "epoch": 0.39130879214954706, + "grad_norm": 1.236046314239502, + "learning_rate": 9.085523687994148e-05, + "loss": 0.6731, + "step": 61250 + }, + { + "epoch": 0.39137267929928576, + "grad_norm": 0.6897780895233154, + "learning_rate": 9.085234403278912e-05, + "loss": 0.9761, + "step": 61260 + }, + { + "epoch": 0.39143656644902447, + "grad_norm": 2.731182098388672, + "learning_rate": 9.08494507742183e-05, + "loss": 0.6203, + "step": 61270 + }, + { + "epoch": 0.3915004535987631, + "grad_norm": 1.2654629945755005, + "learning_rate": 9.084655710425817e-05, + "loss": 0.9412, + "step": 61280 + }, + { + "epoch": 0.3915643407485018, + "grad_norm": 0.9175102114677429, + "learning_rate": 9.084366302293787e-05, + "loss": 0.7672, + "step": 61290 + }, + { + "epoch": 0.39162822789824053, + "grad_norm": 0.8177767395973206, + "learning_rate": 9.084076853028656e-05, + "loss": 0.9016, + "step": 61300 + }, + { + "epoch": 0.39169211504797924, + 
"grad_norm": 0.6506124138832092, + "learning_rate": 9.083787362633336e-05, + "loss": 0.7544, + "step": 61310 + }, + { + "epoch": 0.39175600219771795, + "grad_norm": 0.7509700655937195, + "learning_rate": 9.083497831110745e-05, + "loss": 0.6952, + "step": 61320 + }, + { + "epoch": 0.39181988934745665, + "grad_norm": 0.8444516658782959, + "learning_rate": 9.0832082584638e-05, + "loss": 0.7465, + "step": 61330 + }, + { + "epoch": 0.39188377649719536, + "grad_norm": 0.7589380145072937, + "learning_rate": 9.082918644695413e-05, + "loss": 0.7664, + "step": 61340 + }, + { + "epoch": 0.39194766364693406, + "grad_norm": 4.895397186279297, + "learning_rate": 9.082628989808504e-05, + "loss": 1.0256, + "step": 61350 + }, + { + "epoch": 0.39201155079667277, + "grad_norm": 0.7632289528846741, + "learning_rate": 9.082339293805988e-05, + "loss": 1.2474, + "step": 61360 + }, + { + "epoch": 0.3920754379464115, + "grad_norm": 0.765465259552002, + "learning_rate": 9.082049556690786e-05, + "loss": 0.9572, + "step": 61370 + }, + { + "epoch": 0.3921393250961502, + "grad_norm": 0.6176701188087463, + "learning_rate": 9.081759778465811e-05, + "loss": 0.8701, + "step": 61380 + }, + { + "epoch": 0.3922032122458889, + "grad_norm": 1.1706359386444092, + "learning_rate": 9.081469959133986e-05, + "loss": 0.8748, + "step": 61390 + }, + { + "epoch": 0.39226709939562754, + "grad_norm": 1.032160758972168, + "learning_rate": 9.081180098698225e-05, + "loss": 0.8505, + "step": 61400 + }, + { + "epoch": 0.39233098654536624, + "grad_norm": 1.0283243656158447, + "learning_rate": 9.080890197161452e-05, + "loss": 0.7096, + "step": 61410 + }, + { + "epoch": 0.39239487369510495, + "grad_norm": 1.100449800491333, + "learning_rate": 9.080600254526583e-05, + "loss": 0.9363, + "step": 61420 + }, + { + "epoch": 0.39245876084484366, + "grad_norm": 1.9551182985305786, + "learning_rate": 9.080310270796539e-05, + "loss": 0.795, + "step": 61430 + }, + { + "epoch": 0.39252264799458236, + "grad_norm": 1.056577205657959, + 
"learning_rate": 9.080020245974241e-05, + "loss": 0.8075, + "step": 61440 + }, + { + "epoch": 0.39258653514432107, + "grad_norm": 0.6849813461303711, + "learning_rate": 9.07973018006261e-05, + "loss": 0.9542, + "step": 61450 + }, + { + "epoch": 0.3926504222940598, + "grad_norm": 0.8313121199607849, + "learning_rate": 9.079440073064567e-05, + "loss": 1.0857, + "step": 61460 + }, + { + "epoch": 0.3927143094437985, + "grad_norm": 0.7464626431465149, + "learning_rate": 9.079149924983031e-05, + "loss": 0.6962, + "step": 61470 + }, + { + "epoch": 0.3927781965935372, + "grad_norm": 1.59227454662323, + "learning_rate": 9.078859735820928e-05, + "loss": 0.8309, + "step": 61480 + }, + { + "epoch": 0.3928420837432759, + "grad_norm": 0.6378403306007385, + "learning_rate": 9.078569505581178e-05, + "loss": 0.7235, + "step": 61490 + }, + { + "epoch": 0.3929059708930146, + "grad_norm": 0.43592649698257446, + "learning_rate": 9.078279234266705e-05, + "loss": 0.8301, + "step": 61500 + }, + { + "epoch": 0.3929698580427533, + "grad_norm": 0.9196266531944275, + "learning_rate": 9.077988921880431e-05, + "loss": 0.8455, + "step": 61510 + }, + { + "epoch": 0.39303374519249196, + "grad_norm": 0.7270370721817017, + "learning_rate": 9.077698568425283e-05, + "loss": 0.8118, + "step": 61520 + }, + { + "epoch": 0.39309763234223066, + "grad_norm": 0.693191647529602, + "learning_rate": 9.07740817390418e-05, + "loss": 0.9402, + "step": 61530 + }, + { + "epoch": 0.39316151949196937, + "grad_norm": 0.7091450691223145, + "learning_rate": 9.077117738320051e-05, + "loss": 0.7799, + "step": 61540 + }, + { + "epoch": 0.3932254066417081, + "grad_norm": 1.02108633518219, + "learning_rate": 9.07682726167582e-05, + "loss": 0.9665, + "step": 61550 + }, + { + "epoch": 0.3932892937914468, + "grad_norm": 1.1987274885177612, + "learning_rate": 9.07653674397441e-05, + "loss": 0.9019, + "step": 61560 + }, + { + "epoch": 0.3933531809411855, + "grad_norm": 0.7170557379722595, + "learning_rate": 9.076246185218747e-05, 
+ "loss": 0.9895, + "step": 61570 + }, + { + "epoch": 0.3934170680909242, + "grad_norm": 1.2851723432540894, + "learning_rate": 9.07595558541176e-05, + "loss": 0.8972, + "step": 61580 + }, + { + "epoch": 0.3934809552406629, + "grad_norm": 0.7113538384437561, + "learning_rate": 9.075664944556374e-05, + "loss": 0.8101, + "step": 61590 + }, + { + "epoch": 0.3935448423904016, + "grad_norm": 1.113052487373352, + "learning_rate": 9.075374262655516e-05, + "loss": 0.8718, + "step": 61600 + }, + { + "epoch": 0.3936087295401403, + "grad_norm": 0.9161044955253601, + "learning_rate": 9.075083539712113e-05, + "loss": 0.8209, + "step": 61610 + }, + { + "epoch": 0.393672616689879, + "grad_norm": 0.9524838328361511, + "learning_rate": 9.074792775729096e-05, + "loss": 0.9234, + "step": 61620 + }, + { + "epoch": 0.3937365038396177, + "grad_norm": 1.2486933469772339, + "learning_rate": 9.074501970709385e-05, + "loss": 0.7753, + "step": 61630 + }, + { + "epoch": 0.3938003909893564, + "grad_norm": 0.8280370831489563, + "learning_rate": 9.07421112465592e-05, + "loss": 1.0375, + "step": 61640 + }, + { + "epoch": 0.3938642781390951, + "grad_norm": 0.9013057947158813, + "learning_rate": 9.07392023757162e-05, + "loss": 0.9043, + "step": 61650 + }, + { + "epoch": 0.3939281652888338, + "grad_norm": 0.9092079401016235, + "learning_rate": 9.073629309459422e-05, + "loss": 0.9026, + "step": 61660 + }, + { + "epoch": 0.3939920524385725, + "grad_norm": 1.4664134979248047, + "learning_rate": 9.07333834032225e-05, + "loss": 0.6136, + "step": 61670 + }, + { + "epoch": 0.3940559395883112, + "grad_norm": 1.106016755104065, + "learning_rate": 9.07304733016304e-05, + "loss": 1.0314, + "step": 61680 + }, + { + "epoch": 0.3941198267380499, + "grad_norm": 0.8790785670280457, + "learning_rate": 9.072756278984717e-05, + "loss": 1.0497, + "step": 61690 + }, + { + "epoch": 0.3941837138877886, + "grad_norm": 1.431808590888977, + "learning_rate": 9.072465186790215e-05, + "loss": 0.9975, + "step": 61700 + }, + { + 
"epoch": 0.3942476010375273, + "grad_norm": 0.8433964252471924, + "learning_rate": 9.072174053582468e-05, + "loss": 0.6958, + "step": 61710 + }, + { + "epoch": 0.394311488187266, + "grad_norm": 0.829806923866272, + "learning_rate": 9.071882879364402e-05, + "loss": 1.0986, + "step": 61720 + }, + { + "epoch": 0.39437537533700473, + "grad_norm": 0.8924597501754761, + "learning_rate": 9.071591664138954e-05, + "loss": 0.9314, + "step": 61730 + }, + { + "epoch": 0.39443926248674344, + "grad_norm": 0.7619827389717102, + "learning_rate": 9.071300407909056e-05, + "loss": 0.9549, + "step": 61740 + }, + { + "epoch": 0.39450314963648214, + "grad_norm": 0.6050899028778076, + "learning_rate": 9.07100911067764e-05, + "loss": 0.9636, + "step": 61750 + }, + { + "epoch": 0.3945670367862208, + "grad_norm": 1.1481192111968994, + "learning_rate": 9.070717772447641e-05, + "loss": 0.743, + "step": 61760 + }, + { + "epoch": 0.3946309239359595, + "grad_norm": 1.505147099494934, + "learning_rate": 9.070426393221993e-05, + "loss": 0.7202, + "step": 61770 + }, + { + "epoch": 0.3946948110856982, + "grad_norm": 1.0512402057647705, + "learning_rate": 9.070134973003628e-05, + "loss": 0.8743, + "step": 61780 + }, + { + "epoch": 0.3947586982354369, + "grad_norm": 0.7054274082183838, + "learning_rate": 9.069843511795484e-05, + "loss": 0.9366, + "step": 61790 + }, + { + "epoch": 0.3948225853851756, + "grad_norm": 0.6536909937858582, + "learning_rate": 9.069552009600494e-05, + "loss": 0.7258, + "step": 61800 + }, + { + "epoch": 0.3948864725349143, + "grad_norm": 0.7718044519424438, + "learning_rate": 9.069260466421596e-05, + "loss": 1.0622, + "step": 61810 + }, + { + "epoch": 0.39495035968465303, + "grad_norm": 0.991255521774292, + "learning_rate": 9.068968882261723e-05, + "loss": 1.0272, + "step": 61820 + }, + { + "epoch": 0.39501424683439174, + "grad_norm": 5.583859443664551, + "learning_rate": 9.068677257123815e-05, + "loss": 0.8138, + "step": 61830 + }, + { + "epoch": 0.39507813398413044, + 
"grad_norm": 3.004866123199463, + "learning_rate": 9.068385591010805e-05, + "loss": 1.1612, + "step": 61840 + }, + { + "epoch": 0.39514202113386915, + "grad_norm": 0.7518250346183777, + "learning_rate": 9.068093883925633e-05, + "loss": 0.8184, + "step": 61850 + }, + { + "epoch": 0.39520590828360785, + "grad_norm": 1.5399583578109741, + "learning_rate": 9.067802135871237e-05, + "loss": 0.8756, + "step": 61860 + }, + { + "epoch": 0.39526979543334656, + "grad_norm": 2.1497974395751953, + "learning_rate": 9.067510346850554e-05, + "loss": 1.1971, + "step": 61870 + }, + { + "epoch": 0.3953336825830852, + "grad_norm": 0.8201958537101746, + "learning_rate": 9.067218516866523e-05, + "loss": 1.0288, + "step": 61880 + }, + { + "epoch": 0.3953975697328239, + "grad_norm": 1.203514575958252, + "learning_rate": 9.066926645922084e-05, + "loss": 1.0717, + "step": 61890 + }, + { + "epoch": 0.3954614568825626, + "grad_norm": 0.8252068161964417, + "learning_rate": 9.066634734020174e-05, + "loss": 0.8844, + "step": 61900 + }, + { + "epoch": 0.39552534403230133, + "grad_norm": 0.7639890313148499, + "learning_rate": 9.066342781163733e-05, + "loss": 0.907, + "step": 61910 + }, + { + "epoch": 0.39558923118204004, + "grad_norm": 0.8897015452384949, + "learning_rate": 9.066050787355704e-05, + "loss": 0.7727, + "step": 61920 + }, + { + "epoch": 0.39565311833177874, + "grad_norm": 0.7301774024963379, + "learning_rate": 9.065758752599026e-05, + "loss": 0.9699, + "step": 61930 + }, + { + "epoch": 0.39571700548151745, + "grad_norm": 1.1246193647384644, + "learning_rate": 9.065466676896639e-05, + "loss": 0.7621, + "step": 61940 + }, + { + "epoch": 0.39578089263125615, + "grad_norm": 0.7929351329803467, + "learning_rate": 9.065174560251487e-05, + "loss": 0.8905, + "step": 61950 + }, + { + "epoch": 0.39584477978099486, + "grad_norm": 1.0358200073242188, + "learning_rate": 9.064882402666508e-05, + "loss": 0.7801, + "step": 61960 + }, + { + "epoch": 0.39590866693073357, + "grad_norm": 
0.7412776947021484, + "learning_rate": 9.064590204144647e-05, + "loss": 0.9169, + "step": 61970 + }, + { + "epoch": 0.39597255408047227, + "grad_norm": 0.7912229895591736, + "learning_rate": 9.064297964688848e-05, + "loss": 0.7336, + "step": 61980 + }, + { + "epoch": 0.396036441230211, + "grad_norm": 0.7048013210296631, + "learning_rate": 9.064005684302051e-05, + "loss": 0.7669, + "step": 61990 + }, + { + "epoch": 0.3961003283799497, + "grad_norm": 0.594420850276947, + "learning_rate": 9.063713362987201e-05, + "loss": 0.858, + "step": 62000 + }, + { + "epoch": 0.39616421552968833, + "grad_norm": 0.6770893931388855, + "learning_rate": 9.063421000747243e-05, + "loss": 0.931, + "step": 62010 + }, + { + "epoch": 0.39622810267942704, + "grad_norm": 0.9604712128639221, + "learning_rate": 9.06312859758512e-05, + "loss": 0.8397, + "step": 62020 + }, + { + "epoch": 0.39629198982916575, + "grad_norm": 0.693006157875061, + "learning_rate": 9.062836153503775e-05, + "loss": 0.9519, + "step": 62030 + }, + { + "epoch": 0.39635587697890445, + "grad_norm": 0.6312511563301086, + "learning_rate": 9.062543668506156e-05, + "loss": 0.9113, + "step": 62040 + }, + { + "epoch": 0.39641976412864316, + "grad_norm": 0.7017596364021301, + "learning_rate": 9.062251142595208e-05, + "loss": 0.6917, + "step": 62050 + }, + { + "epoch": 0.39648365127838187, + "grad_norm": 0.5928127765655518, + "learning_rate": 9.061958575773876e-05, + "loss": 0.9722, + "step": 62060 + }, + { + "epoch": 0.39654753842812057, + "grad_norm": 1.742937445640564, + "learning_rate": 9.06166596804511e-05, + "loss": 0.9624, + "step": 62070 + }, + { + "epoch": 0.3966114255778593, + "grad_norm": 0.7170588374137878, + "learning_rate": 9.06137331941185e-05, + "loss": 1.2726, + "step": 62080 + }, + { + "epoch": 0.396675312727598, + "grad_norm": 1.1127740144729614, + "learning_rate": 9.06108062987705e-05, + "loss": 1.0994, + "step": 62090 + }, + { + "epoch": 0.3967391998773367, + "grad_norm": 1.1925806999206543, + "learning_rate": 
9.060787899443652e-05, + "loss": 0.8594, + "step": 62100 + }, + { + "epoch": 0.3968030870270754, + "grad_norm": 0.9429210424423218, + "learning_rate": 9.060495128114607e-05, + "loss": 0.8756, + "step": 62110 + }, + { + "epoch": 0.3968669741768141, + "grad_norm": 0.5963910222053528, + "learning_rate": 9.060202315892866e-05, + "loss": 0.9714, + "step": 62120 + }, + { + "epoch": 0.39693086132655275, + "grad_norm": 1.1170175075531006, + "learning_rate": 9.059909462781373e-05, + "loss": 0.8631, + "step": 62130 + }, + { + "epoch": 0.39699474847629146, + "grad_norm": 0.8226674199104309, + "learning_rate": 9.05961656878308e-05, + "loss": 0.8723, + "step": 62140 + }, + { + "epoch": 0.39705863562603017, + "grad_norm": 0.8132166862487793, + "learning_rate": 9.059323633900936e-05, + "loss": 0.8302, + "step": 62150 + }, + { + "epoch": 0.39712252277576887, + "grad_norm": 1.661969542503357, + "learning_rate": 9.059030658137892e-05, + "loss": 0.6955, + "step": 62160 + }, + { + "epoch": 0.3971864099255076, + "grad_norm": 1.1347659826278687, + "learning_rate": 9.058737641496896e-05, + "loss": 0.8491, + "step": 62170 + }, + { + "epoch": 0.3972502970752463, + "grad_norm": 0.8600234985351562, + "learning_rate": 9.058444583980901e-05, + "loss": 0.8166, + "step": 62180 + }, + { + "epoch": 0.397314184224985, + "grad_norm": 1.037380337715149, + "learning_rate": 9.058151485592858e-05, + "loss": 1.0938, + "step": 62190 + }, + { + "epoch": 0.3973780713747237, + "grad_norm": 0.8460977673530579, + "learning_rate": 9.057858346335719e-05, + "loss": 0.7327, + "step": 62200 + }, + { + "epoch": 0.3974419585244624, + "grad_norm": 1.0163371562957764, + "learning_rate": 9.057565166212436e-05, + "loss": 0.8889, + "step": 62210 + }, + { + "epoch": 0.3975058456742011, + "grad_norm": 0.8460572361946106, + "learning_rate": 9.057271945225962e-05, + "loss": 0.9403, + "step": 62220 + }, + { + "epoch": 0.3975697328239398, + "grad_norm": 0.8837966322898865, + "learning_rate": 9.056978683379249e-05, + "loss": 
0.8961, + "step": 62230 + }, + { + "epoch": 0.3976336199736785, + "grad_norm": 2.452587127685547, + "learning_rate": 9.056685380675251e-05, + "loss": 0.7814, + "step": 62240 + }, + { + "epoch": 0.39769750712341717, + "grad_norm": 1.0550041198730469, + "learning_rate": 9.056392037116922e-05, + "loss": 0.944, + "step": 62250 + }, + { + "epoch": 0.3977613942731559, + "grad_norm": 0.6195732951164246, + "learning_rate": 9.056098652707215e-05, + "loss": 0.8217, + "step": 62260 + }, + { + "epoch": 0.3978252814228946, + "grad_norm": 0.9907122254371643, + "learning_rate": 9.055805227449086e-05, + "loss": 0.9645, + "step": 62270 + }, + { + "epoch": 0.3978891685726333, + "grad_norm": 0.8403761386871338, + "learning_rate": 9.05551176134549e-05, + "loss": 0.8914, + "step": 62280 + }, + { + "epoch": 0.397953055722372, + "grad_norm": 0.880027174949646, + "learning_rate": 9.055218254399382e-05, + "loss": 0.8041, + "step": 62290 + }, + { + "epoch": 0.3980169428721107, + "grad_norm": 1.1135274171829224, + "learning_rate": 9.054924706613716e-05, + "loss": 0.9497, + "step": 62300 + }, + { + "epoch": 0.3980808300218494, + "grad_norm": 0.6633336544036865, + "learning_rate": 9.054631117991453e-05, + "loss": 0.9701, + "step": 62310 + }, + { + "epoch": 0.3981447171715881, + "grad_norm": 0.9444105625152588, + "learning_rate": 9.054337488535546e-05, + "loss": 0.7332, + "step": 62320 + }, + { + "epoch": 0.3982086043213268, + "grad_norm": 1.4530045986175537, + "learning_rate": 9.054043818248952e-05, + "loss": 0.7233, + "step": 62330 + }, + { + "epoch": 0.3982724914710655, + "grad_norm": 0.8733393549919128, + "learning_rate": 9.053750107134631e-05, + "loss": 0.9144, + "step": 62340 + }, + { + "epoch": 0.39833637862080423, + "grad_norm": 0.6644203662872314, + "learning_rate": 9.053456355195537e-05, + "loss": 0.874, + "step": 62350 + }, + { + "epoch": 0.39840026577054294, + "grad_norm": 0.9878085851669312, + "learning_rate": 9.053162562434633e-05, + "loss": 1.2423, + "step": 62360 + }, + { + 
"epoch": 0.3984641529202816, + "grad_norm": 1.3864879608154297, + "learning_rate": 9.052868728854876e-05, + "loss": 0.9242, + "step": 62370 + }, + { + "epoch": 0.3985280400700203, + "grad_norm": 0.777885377407074, + "learning_rate": 9.052574854459223e-05, + "loss": 0.9149, + "step": 62380 + }, + { + "epoch": 0.398591927219759, + "grad_norm": 0.8761781454086304, + "learning_rate": 9.052280939250636e-05, + "loss": 1.0996, + "step": 62390 + }, + { + "epoch": 0.3986558143694977, + "grad_norm": 0.5955054759979248, + "learning_rate": 9.051986983232073e-05, + "loss": 0.8387, + "step": 62400 + }, + { + "epoch": 0.3987197015192364, + "grad_norm": 0.9722325205802917, + "learning_rate": 9.051692986406496e-05, + "loss": 1.0697, + "step": 62410 + }, + { + "epoch": 0.3987835886689751, + "grad_norm": 0.6643452048301697, + "learning_rate": 9.051398948776868e-05, + "loss": 1.2275, + "step": 62420 + }, + { + "epoch": 0.3988474758187138, + "grad_norm": 1.9424246549606323, + "learning_rate": 9.051104870346146e-05, + "loss": 0.8924, + "step": 62430 + }, + { + "epoch": 0.39891136296845253, + "grad_norm": 0.771974503993988, + "learning_rate": 9.050810751117292e-05, + "loss": 0.7818, + "step": 62440 + }, + { + "epoch": 0.39897525011819124, + "grad_norm": 1.0370486974716187, + "learning_rate": 9.05051659109327e-05, + "loss": 0.9449, + "step": 62450 + }, + { + "epoch": 0.39903913726792994, + "grad_norm": 1.289140224456787, + "learning_rate": 9.050222390277041e-05, + "loss": 0.7505, + "step": 62460 + }, + { + "epoch": 0.39910302441766865, + "grad_norm": 0.7696613669395447, + "learning_rate": 9.049928148671569e-05, + "loss": 0.9424, + "step": 62470 + }, + { + "epoch": 0.39916691156740736, + "grad_norm": 2.606376886367798, + "learning_rate": 9.049633866279819e-05, + "loss": 0.9175, + "step": 62480 + }, + { + "epoch": 0.399230798717146, + "grad_norm": 0.9909952282905579, + "learning_rate": 9.049339543104751e-05, + "loss": 0.6305, + "step": 62490 + }, + { + "epoch": 0.3992946858668847, + 
"grad_norm": 0.9084514379501343, + "learning_rate": 9.04904517914933e-05, + "loss": 0.8609, + "step": 62500 + }, + { + "epoch": 0.3993585730166234, + "grad_norm": 1.2386984825134277, + "learning_rate": 9.048750774416521e-05, + "loss": 0.863, + "step": 62510 + }, + { + "epoch": 0.3994224601663621, + "grad_norm": 1.0682573318481445, + "learning_rate": 9.04845632890929e-05, + "loss": 0.9319, + "step": 62520 + }, + { + "epoch": 0.39948634731610083, + "grad_norm": 0.7610236406326294, + "learning_rate": 9.048161842630602e-05, + "loss": 0.7901, + "step": 62530 + }, + { + "epoch": 0.39955023446583954, + "grad_norm": 0.8096383810043335, + "learning_rate": 9.04786731558342e-05, + "loss": 0.8542, + "step": 62540 + }, + { + "epoch": 0.39961412161557824, + "grad_norm": 0.6308041214942932, + "learning_rate": 9.047572747770713e-05, + "loss": 0.9005, + "step": 62550 + }, + { + "epoch": 0.39967800876531695, + "grad_norm": 1.0608285665512085, + "learning_rate": 9.047278139195447e-05, + "loss": 0.9082, + "step": 62560 + }, + { + "epoch": 0.39974189591505566, + "grad_norm": 1.0790623426437378, + "learning_rate": 9.04698348986059e-05, + "loss": 0.9912, + "step": 62570 + }, + { + "epoch": 0.39980578306479436, + "grad_norm": 0.5858973860740662, + "learning_rate": 9.046688799769107e-05, + "loss": 0.7241, + "step": 62580 + }, + { + "epoch": 0.39986967021453307, + "grad_norm": 0.7396795153617859, + "learning_rate": 9.046394068923967e-05, + "loss": 0.8767, + "step": 62590 + }, + { + "epoch": 0.3999335573642718, + "grad_norm": 0.5871313214302063, + "learning_rate": 9.046099297328138e-05, + "loss": 0.6491, + "step": 62600 + }, + { + "epoch": 0.3999974445140104, + "grad_norm": 1.0462760925292969, + "learning_rate": 9.045804484984588e-05, + "loss": 0.9854, + "step": 62610 + }, + { + "epoch": 0.40006133166374913, + "grad_norm": 1.0738905668258667, + "learning_rate": 9.045509631896287e-05, + "loss": 0.8662, + "step": 62620 + }, + { + "epoch": 0.40012521881348784, + "grad_norm": 0.7057567834854126, 
+ "learning_rate": 9.045214738066206e-05, + "loss": 0.736, + "step": 62630 + }, + { + "epoch": 0.40018910596322654, + "grad_norm": 0.9611753225326538, + "learning_rate": 9.044919803497312e-05, + "loss": 0.794, + "step": 62640 + }, + { + "epoch": 0.40025299311296525, + "grad_norm": 0.9139066934585571, + "learning_rate": 9.044624828192573e-05, + "loss": 0.7416, + "step": 62650 + }, + { + "epoch": 0.40031688026270396, + "grad_norm": 0.7299910187721252, + "learning_rate": 9.044329812154966e-05, + "loss": 1.1855, + "step": 62660 + }, + { + "epoch": 0.40038076741244266, + "grad_norm": 1.0329594612121582, + "learning_rate": 9.04403475538746e-05, + "loss": 0.7538, + "step": 62670 + }, + { + "epoch": 0.40044465456218137, + "grad_norm": 0.8256815671920776, + "learning_rate": 9.043739657893025e-05, + "loss": 0.8794, + "step": 62680 + }, + { + "epoch": 0.4005085417119201, + "grad_norm": 0.7287086248397827, + "learning_rate": 9.043444519674631e-05, + "loss": 1.0395, + "step": 62690 + }, + { + "epoch": 0.4005724288616588, + "grad_norm": 0.7934675216674805, + "learning_rate": 9.043149340735253e-05, + "loss": 0.7567, + "step": 62700 + }, + { + "epoch": 0.4006363160113975, + "grad_norm": 0.5604273676872253, + "learning_rate": 9.042854121077865e-05, + "loss": 1.0449, + "step": 62710 + }, + { + "epoch": 0.4007002031611362, + "grad_norm": 1.5315760374069214, + "learning_rate": 9.042558860705436e-05, + "loss": 1.1271, + "step": 62720 + }, + { + "epoch": 0.40076409031087484, + "grad_norm": 1.0402827262878418, + "learning_rate": 9.042263559620945e-05, + "loss": 0.8949, + "step": 62730 + }, + { + "epoch": 0.40082797746061355, + "grad_norm": 0.6324530243873596, + "learning_rate": 9.041968217827363e-05, + "loss": 0.7596, + "step": 62740 + }, + { + "epoch": 0.40089186461035226, + "grad_norm": 0.6953981518745422, + "learning_rate": 9.041672835327661e-05, + "loss": 0.6683, + "step": 62750 + }, + { + "epoch": 0.40095575176009096, + "grad_norm": 0.8645387887954712, + "learning_rate": 
9.04137741212482e-05, + "loss": 1.1533, + "step": 62760 + }, + { + "epoch": 0.40101963890982967, + "grad_norm": 0.8752760291099548, + "learning_rate": 9.04108194822181e-05, + "loss": 0.9688, + "step": 62770 + }, + { + "epoch": 0.4010835260595684, + "grad_norm": 1.0620766878128052, + "learning_rate": 9.040786443621609e-05, + "loss": 0.9483, + "step": 62780 + }, + { + "epoch": 0.4011474132093071, + "grad_norm": 0.9805885553359985, + "learning_rate": 9.040490898327194e-05, + "loss": 0.9889, + "step": 62790 + }, + { + "epoch": 0.4012113003590458, + "grad_norm": 1.2980453968048096, + "learning_rate": 9.04019531234154e-05, + "loss": 1.0088, + "step": 62800 + }, + { + "epoch": 0.4012751875087845, + "grad_norm": 0.6901305913925171, + "learning_rate": 9.039899685667624e-05, + "loss": 0.841, + "step": 62810 + }, + { + "epoch": 0.4013390746585232, + "grad_norm": 0.6811827421188354, + "learning_rate": 9.039604018308423e-05, + "loss": 0.7313, + "step": 62820 + }, + { + "epoch": 0.4014029618082619, + "grad_norm": 1.0031507015228271, + "learning_rate": 9.039308310266914e-05, + "loss": 0.9193, + "step": 62830 + }, + { + "epoch": 0.4014668489580006, + "grad_norm": 0.66957688331604, + "learning_rate": 9.039012561546076e-05, + "loss": 0.9917, + "step": 62840 + }, + { + "epoch": 0.4015307361077393, + "grad_norm": 1.3045806884765625, + "learning_rate": 9.038716772148888e-05, + "loss": 0.8695, + "step": 62850 + }, + { + "epoch": 0.40159462325747797, + "grad_norm": 0.8219857811927795, + "learning_rate": 9.038420942078327e-05, + "loss": 0.913, + "step": 62860 + }, + { + "epoch": 0.4016585104072167, + "grad_norm": 1.7274596691131592, + "learning_rate": 9.038125071337374e-05, + "loss": 0.9524, + "step": 62870 + }, + { + "epoch": 0.4017223975569554, + "grad_norm": 1.0028507709503174, + "learning_rate": 9.037829159929008e-05, + "loss": 0.8358, + "step": 62880 + }, + { + "epoch": 0.4017862847066941, + "grad_norm": 1.3326009511947632, + "learning_rate": 9.03753320785621e-05, + "loss": 1.1543, + 
"step": 62890 + }, + { + "epoch": 0.4018501718564328, + "grad_norm": 0.4937954246997833, + "learning_rate": 9.037237215121958e-05, + "loss": 1.0826, + "step": 62900 + }, + { + "epoch": 0.4019140590061715, + "grad_norm": 1.1819350719451904, + "learning_rate": 9.036941181729236e-05, + "loss": 0.8164, + "step": 62910 + }, + { + "epoch": 0.4019779461559102, + "grad_norm": 0.710355281829834, + "learning_rate": 9.036645107681023e-05, + "loss": 0.8995, + "step": 62920 + }, + { + "epoch": 0.4020418333056489, + "grad_norm": 0.6797157526016235, + "learning_rate": 9.036348992980301e-05, + "loss": 0.9323, + "step": 62930 + }, + { + "epoch": 0.4021057204553876, + "grad_norm": 0.6142218112945557, + "learning_rate": 9.036052837630054e-05, + "loss": 1.0316, + "step": 62940 + }, + { + "epoch": 0.4021696076051263, + "grad_norm": 0.5623320937156677, + "learning_rate": 9.035756641633264e-05, + "loss": 1.0354, + "step": 62950 + }, + { + "epoch": 0.40223349475486503, + "grad_norm": 0.6780490279197693, + "learning_rate": 9.03546040499291e-05, + "loss": 0.8209, + "step": 62960 + }, + { + "epoch": 0.40229738190460373, + "grad_norm": 0.8299171328544617, + "learning_rate": 9.035164127711981e-05, + "loss": 0.9596, + "step": 62970 + }, + { + "epoch": 0.4023612690543424, + "grad_norm": 0.6555722951889038, + "learning_rate": 9.03486780979346e-05, + "loss": 1.0825, + "step": 62980 + }, + { + "epoch": 0.4024251562040811, + "grad_norm": 1.324913501739502, + "learning_rate": 9.034571451240325e-05, + "loss": 0.9062, + "step": 62990 + }, + { + "epoch": 0.4024890433538198, + "grad_norm": 1.155165672302246, + "learning_rate": 9.034275052055568e-05, + "loss": 0.9358, + "step": 63000 + }, + { + "epoch": 0.4025529305035585, + "grad_norm": 0.9214060306549072, + "learning_rate": 9.03397861224217e-05, + "loss": 0.6413, + "step": 63010 + }, + { + "epoch": 0.4026168176532972, + "grad_norm": 0.9040579199790955, + "learning_rate": 9.033682131803119e-05, + "loss": 0.9746, + "step": 63020 + }, + { + "epoch": 
0.4026807048030359, + "grad_norm": 0.9403018355369568, + "learning_rate": 9.033385610741398e-05, + "loss": 0.8279, + "step": 63030 + }, + { + "epoch": 0.4027445919527746, + "grad_norm": 0.9676703810691833, + "learning_rate": 9.033089049059996e-05, + "loss": 1.2033, + "step": 63040 + }, + { + "epoch": 0.40280847910251333, + "grad_norm": 3.454418420791626, + "learning_rate": 9.032792446761896e-05, + "loss": 0.8704, + "step": 63050 + }, + { + "epoch": 0.40287236625225203, + "grad_norm": 3.037147283554077, + "learning_rate": 9.032495803850088e-05, + "loss": 0.8457, + "step": 63060 + }, + { + "epoch": 0.40293625340199074, + "grad_norm": 0.6617047190666199, + "learning_rate": 9.032199120327558e-05, + "loss": 0.8883, + "step": 63070 + }, + { + "epoch": 0.40300014055172945, + "grad_norm": 0.987816333770752, + "learning_rate": 9.031902396197296e-05, + "loss": 0.791, + "step": 63080 + }, + { + "epoch": 0.40306402770146815, + "grad_norm": 0.7392496466636658, + "learning_rate": 9.031605631462288e-05, + "loss": 0.8478, + "step": 63090 + }, + { + "epoch": 0.4031279148512068, + "grad_norm": 1.0428085327148438, + "learning_rate": 9.031308826125524e-05, + "loss": 0.9056, + "step": 63100 + }, + { + "epoch": 0.4031918020009455, + "grad_norm": 1.2448642253875732, + "learning_rate": 9.031011980189992e-05, + "loss": 0.8957, + "step": 63110 + }, + { + "epoch": 0.4032556891506842, + "grad_norm": 0.9294689893722534, + "learning_rate": 9.030715093658681e-05, + "loss": 0.6793, + "step": 63120 + }, + { + "epoch": 0.4033195763004229, + "grad_norm": 1.0956008434295654, + "learning_rate": 9.030418166534585e-05, + "loss": 1.106, + "step": 63130 + }, + { + "epoch": 0.40338346345016163, + "grad_norm": 0.8035675883293152, + "learning_rate": 9.030121198820688e-05, + "loss": 0.8668, + "step": 63140 + }, + { + "epoch": 0.40344735059990033, + "grad_norm": 1.7835280895233154, + "learning_rate": 9.029824190519986e-05, + "loss": 1.0177, + "step": 63150 + }, + { + "epoch": 0.40351123774963904, + 
"grad_norm": 1.057437777519226, + "learning_rate": 9.029527141635467e-05, + "loss": 0.8812, + "step": 63160 + }, + { + "epoch": 0.40357512489937775, + "grad_norm": 0.8696292638778687, + "learning_rate": 9.029230052170123e-05, + "loss": 0.8662, + "step": 63170 + }, + { + "epoch": 0.40363901204911645, + "grad_norm": 0.9994892477989197, + "learning_rate": 9.02893292212695e-05, + "loss": 1.1962, + "step": 63180 + }, + { + "epoch": 0.40370289919885516, + "grad_norm": 1.2672826051712036, + "learning_rate": 9.028635751508933e-05, + "loss": 0.95, + "step": 63190 + }, + { + "epoch": 0.40376678634859386, + "grad_norm": 0.6766789555549622, + "learning_rate": 9.02833854031907e-05, + "loss": 0.7107, + "step": 63200 + }, + { + "epoch": 0.40383067349833257, + "grad_norm": 1.4297183752059937, + "learning_rate": 9.028041288560354e-05, + "loss": 0.9061, + "step": 63210 + }, + { + "epoch": 0.4038945606480712, + "grad_norm": 0.7802338004112244, + "learning_rate": 9.027743996235775e-05, + "loss": 0.858, + "step": 63220 + }, + { + "epoch": 0.4039584477978099, + "grad_norm": 1.5008245706558228, + "learning_rate": 9.027446663348333e-05, + "loss": 1.1954, + "step": 63230 + }, + { + "epoch": 0.40402233494754863, + "grad_norm": 0.9021018743515015, + "learning_rate": 9.027149289901016e-05, + "loss": 0.8044, + "step": 63240 + }, + { + "epoch": 0.40408622209728734, + "grad_norm": 0.7308499217033386, + "learning_rate": 9.026851875896822e-05, + "loss": 1.0717, + "step": 63250 + }, + { + "epoch": 0.40415010924702605, + "grad_norm": 0.8657183051109314, + "learning_rate": 9.026554421338748e-05, + "loss": 1.0214, + "step": 63260 + }, + { + "epoch": 0.40421399639676475, + "grad_norm": 0.9111654162406921, + "learning_rate": 9.026256926229786e-05, + "loss": 0.863, + "step": 63270 + }, + { + "epoch": 0.40427788354650346, + "grad_norm": 0.9648974537849426, + "learning_rate": 9.025959390572933e-05, + "loss": 0.7586, + "step": 63280 + }, + { + "epoch": 0.40434177069624216, + "grad_norm": 0.8858680725097656, 
+ "learning_rate": 9.025661814371187e-05, + "loss": 1.1628, + "step": 63290 + }, + { + "epoch": 0.40440565784598087, + "grad_norm": 0.7507526874542236, + "learning_rate": 9.025364197627543e-05, + "loss": 0.7555, + "step": 63300 + }, + { + "epoch": 0.4044695449957196, + "grad_norm": 0.7680438160896301, + "learning_rate": 9.025066540345e-05, + "loss": 1.0276, + "step": 63310 + }, + { + "epoch": 0.4045334321454583, + "grad_norm": 1.0163004398345947, + "learning_rate": 9.024768842526554e-05, + "loss": 0.9563, + "step": 63320 + }, + { + "epoch": 0.404597319295197, + "grad_norm": 0.7688309550285339, + "learning_rate": 9.024471104175203e-05, + "loss": 0.9156, + "step": 63330 + }, + { + "epoch": 0.40466120644493564, + "grad_norm": 1.04693603515625, + "learning_rate": 9.024173325293949e-05, + "loss": 0.8006, + "step": 63340 + }, + { + "epoch": 0.40472509359467435, + "grad_norm": 0.6474732160568237, + "learning_rate": 9.023875505885786e-05, + "loss": 0.5947, + "step": 63350 + }, + { + "epoch": 0.40478898074441305, + "grad_norm": 0.6486718058586121, + "learning_rate": 9.023577645953718e-05, + "loss": 0.8847, + "step": 63360 + }, + { + "epoch": 0.40485286789415176, + "grad_norm": 0.8913542628288269, + "learning_rate": 9.023279745500738e-05, + "loss": 1.0172, + "step": 63370 + }, + { + "epoch": 0.40491675504389046, + "grad_norm": 0.629562497138977, + "learning_rate": 9.022981804529853e-05, + "loss": 0.9124, + "step": 63380 + }, + { + "epoch": 0.40498064219362917, + "grad_norm": 0.6215500831604004, + "learning_rate": 9.022683823044061e-05, + "loss": 0.7679, + "step": 63390 + }, + { + "epoch": 0.4050445293433679, + "grad_norm": 1.2027158737182617, + "learning_rate": 9.022385801046363e-05, + "loss": 0.9516, + "step": 63400 + }, + { + "epoch": 0.4051084164931066, + "grad_norm": 1.0377594232559204, + "learning_rate": 9.02208773853976e-05, + "loss": 0.791, + "step": 63410 + }, + { + "epoch": 0.4051723036428453, + "grad_norm": 0.5519084334373474, + "learning_rate": 
9.021789635527252e-05, + "loss": 1.0029, + "step": 63420 + }, + { + "epoch": 0.405236190792584, + "grad_norm": 0.8896793723106384, + "learning_rate": 9.021491492011844e-05, + "loss": 0.9838, + "step": 63430 + }, + { + "epoch": 0.4053000779423227, + "grad_norm": 0.8860163688659668, + "learning_rate": 9.021193307996538e-05, + "loss": 1.2256, + "step": 63440 + }, + { + "epoch": 0.4053639650920614, + "grad_norm": 1.1644326448440552, + "learning_rate": 9.020895083484337e-05, + "loss": 0.8291, + "step": 63450 + }, + { + "epoch": 0.40542785224180006, + "grad_norm": 0.8265649676322937, + "learning_rate": 9.020596818478244e-05, + "loss": 1.0556, + "step": 63460 + }, + { + "epoch": 0.40549173939153876, + "grad_norm": 1.3576620817184448, + "learning_rate": 9.020298512981262e-05, + "loss": 0.9018, + "step": 63470 + }, + { + "epoch": 0.40555562654127747, + "grad_norm": 0.8418384194374084, + "learning_rate": 9.020000166996397e-05, + "loss": 0.949, + "step": 63480 + }, + { + "epoch": 0.4056195136910162, + "grad_norm": 0.9804365634918213, + "learning_rate": 9.01970178052665e-05, + "loss": 1.0751, + "step": 63490 + }, + { + "epoch": 0.4056834008407549, + "grad_norm": 1.0201619863510132, + "learning_rate": 9.01940335357503e-05, + "loss": 0.9828, + "step": 63500 + }, + { + "epoch": 0.4057472879904936, + "grad_norm": 0.6420082449913025, + "learning_rate": 9.019104886144543e-05, + "loss": 0.7166, + "step": 63510 + }, + { + "epoch": 0.4058111751402323, + "grad_norm": 0.6462534070014954, + "learning_rate": 9.01880637823819e-05, + "loss": 0.883, + "step": 63520 + }, + { + "epoch": 0.405875062289971, + "grad_norm": 0.44123539328575134, + "learning_rate": 9.018507829858981e-05, + "loss": 0.8291, + "step": 63530 + }, + { + "epoch": 0.4059389494397097, + "grad_norm": 0.8670223951339722, + "learning_rate": 9.018209241009921e-05, + "loss": 1.0204, + "step": 63540 + }, + { + "epoch": 0.4060028365894484, + "grad_norm": 1.4019068479537964, + "learning_rate": 9.017910611694018e-05, + "loss": 
0.6407, + "step": 63550 + }, + { + "epoch": 0.4060667237391871, + "grad_norm": 0.8712426424026489, + "learning_rate": 9.01761194191428e-05, + "loss": 0.7192, + "step": 63560 + }, + { + "epoch": 0.4061306108889258, + "grad_norm": 1.283201813697815, + "learning_rate": 9.017313231673714e-05, + "loss": 0.9573, + "step": 63570 + }, + { + "epoch": 0.4061944980386645, + "grad_norm": 0.5912864208221436, + "learning_rate": 9.017014480975327e-05, + "loss": 0.6543, + "step": 63580 + }, + { + "epoch": 0.4062583851884032, + "grad_norm": 0.8346386551856995, + "learning_rate": 9.01671568982213e-05, + "loss": 0.9464, + "step": 63590 + }, + { + "epoch": 0.4063222723381419, + "grad_norm": 0.7532115578651428, + "learning_rate": 9.016416858217131e-05, + "loss": 0.7063, + "step": 63600 + }, + { + "epoch": 0.4063861594878806, + "grad_norm": 0.8477169275283813, + "learning_rate": 9.016117986163339e-05, + "loss": 1.3564, + "step": 63610 + }, + { + "epoch": 0.4064500466376193, + "grad_norm": 0.8860613703727722, + "learning_rate": 9.015819073663765e-05, + "loss": 0.9798, + "step": 63620 + }, + { + "epoch": 0.406513933787358, + "grad_norm": 1.1103487014770508, + "learning_rate": 9.015520120721419e-05, + "loss": 1.222, + "step": 63630 + }, + { + "epoch": 0.4065778209370967, + "grad_norm": 0.9043728709220886, + "learning_rate": 9.015221127339311e-05, + "loss": 1.0145, + "step": 63640 + }, + { + "epoch": 0.4066417080868354, + "grad_norm": 1.244024634361267, + "learning_rate": 9.01492209352045e-05, + "loss": 0.8443, + "step": 63650 + }, + { + "epoch": 0.4067055952365741, + "grad_norm": 0.982548713684082, + "learning_rate": 9.014623019267853e-05, + "loss": 0.9263, + "step": 63660 + }, + { + "epoch": 0.40676948238631283, + "grad_norm": 0.7706730961799622, + "learning_rate": 9.01432390458453e-05, + "loss": 1.0515, + "step": 63670 + }, + { + "epoch": 0.40683336953605154, + "grad_norm": 1.0974234342575073, + "learning_rate": 9.014024749473491e-05, + "loss": 0.8391, + "step": 63680 + }, + { + "epoch": 
0.40689725668579024, + "grad_norm": 0.8479081392288208, + "learning_rate": 9.01372555393775e-05, + "loss": 1.1546, + "step": 63690 + }, + { + "epoch": 0.40696114383552895, + "grad_norm": 0.672386884689331, + "learning_rate": 9.01342631798032e-05, + "loss": 0.7925, + "step": 63700 + }, + { + "epoch": 0.4070250309852676, + "grad_norm": 0.6552578806877136, + "learning_rate": 9.013127041604217e-05, + "loss": 0.8039, + "step": 63710 + }, + { + "epoch": 0.4070889181350063, + "grad_norm": 0.7471428513526917, + "learning_rate": 9.01282772481245e-05, + "loss": 0.8832, + "step": 63720 + }, + { + "epoch": 0.407152805284745, + "grad_norm": 0.871847927570343, + "learning_rate": 9.012528367608037e-05, + "loss": 0.87, + "step": 63730 + }, + { + "epoch": 0.4072166924344837, + "grad_norm": 0.8684267401695251, + "learning_rate": 9.012228969993992e-05, + "loss": 1.0718, + "step": 63740 + }, + { + "epoch": 0.4072805795842224, + "grad_norm": 1.1163896322250366, + "learning_rate": 9.01192953197333e-05, + "loss": 0.8171, + "step": 63750 + }, + { + "epoch": 0.40734446673396113, + "grad_norm": 0.8885432481765747, + "learning_rate": 9.011630053549069e-05, + "loss": 0.7901, + "step": 63760 + }, + { + "epoch": 0.40740835388369984, + "grad_norm": 0.9396836161613464, + "learning_rate": 9.011330534724221e-05, + "loss": 0.8013, + "step": 63770 + }, + { + "epoch": 0.40747224103343854, + "grad_norm": 1.0267233848571777, + "learning_rate": 9.011030975501804e-05, + "loss": 0.8062, + "step": 63780 + }, + { + "epoch": 0.40753612818317725, + "grad_norm": 0.8495022654533386, + "learning_rate": 9.010731375884835e-05, + "loss": 0.7481, + "step": 63790 + }, + { + "epoch": 0.40760001533291595, + "grad_norm": 0.9368879199028015, + "learning_rate": 9.010431735876332e-05, + "loss": 1.0601, + "step": 63800 + }, + { + "epoch": 0.40766390248265466, + "grad_norm": 0.8383790254592896, + "learning_rate": 9.010132055479313e-05, + "loss": 0.7068, + "step": 63810 + }, + { + "epoch": 0.40772778963239337, + "grad_norm": 
0.7061938643455505, + "learning_rate": 9.009832334696792e-05, + "loss": 1.0569, + "step": 63820 + }, + { + "epoch": 0.407791676782132, + "grad_norm": 0.8540278673171997, + "learning_rate": 9.009532573531793e-05, + "loss": 1.0359, + "step": 63830 + }, + { + "epoch": 0.4078555639318707, + "grad_norm": 1.0595225095748901, + "learning_rate": 9.009232771987331e-05, + "loss": 0.8767, + "step": 63840 + }, + { + "epoch": 0.40791945108160943, + "grad_norm": 0.6768961548805237, + "learning_rate": 9.008932930066428e-05, + "loss": 1.0288, + "step": 63850 + }, + { + "epoch": 0.40798333823134814, + "grad_norm": 0.642488420009613, + "learning_rate": 9.0086330477721e-05, + "loss": 0.9504, + "step": 63860 + }, + { + "epoch": 0.40804722538108684, + "grad_norm": 0.9785758852958679, + "learning_rate": 9.008333125107371e-05, + "loss": 0.9287, + "step": 63870 + }, + { + "epoch": 0.40811111253082555, + "grad_norm": 0.949464738368988, + "learning_rate": 9.008033162075259e-05, + "loss": 0.9448, + "step": 63880 + }, + { + "epoch": 0.40817499968056425, + "grad_norm": 1.017958164215088, + "learning_rate": 9.007733158678787e-05, + "loss": 0.8016, + "step": 63890 + }, + { + "epoch": 0.40823888683030296, + "grad_norm": 3.1285202503204346, + "learning_rate": 9.007433114920972e-05, + "loss": 0.9056, + "step": 63900 + }, + { + "epoch": 0.40830277398004167, + "grad_norm": 0.8148319721221924, + "learning_rate": 9.00713303080484e-05, + "loss": 1.0595, + "step": 63910 + }, + { + "epoch": 0.4083666611297804, + "grad_norm": 0.773764967918396, + "learning_rate": 9.006832906333411e-05, + "loss": 0.8172, + "step": 63920 + }, + { + "epoch": 0.4084305482795191, + "grad_norm": 0.8310877084732056, + "learning_rate": 9.00653274150971e-05, + "loss": 0.9826, + "step": 63930 + }, + { + "epoch": 0.4084944354292578, + "grad_norm": 0.904560923576355, + "learning_rate": 9.006232536336756e-05, + "loss": 1.1305, + "step": 63940 + }, + { + "epoch": 0.40855832257899644, + "grad_norm": 0.9716793298721313, + "learning_rate": 
9.005932290817576e-05, + "loss": 0.7498, + "step": 63950 + }, + { + "epoch": 0.40862220972873514, + "grad_norm": 1.2290844917297363, + "learning_rate": 9.005632004955192e-05, + "loss": 0.8095, + "step": 63960 + }, + { + "epoch": 0.40868609687847385, + "grad_norm": 0.5113322734832764, + "learning_rate": 9.005331678752629e-05, + "loss": 0.9543, + "step": 63970 + }, + { + "epoch": 0.40874998402821255, + "grad_norm": 0.6806952357292175, + "learning_rate": 9.00503131221291e-05, + "loss": 1.0935, + "step": 63980 + }, + { + "epoch": 0.40881387117795126, + "grad_norm": 1.0288503170013428, + "learning_rate": 9.00473090533906e-05, + "loss": 0.8507, + "step": 63990 + }, + { + "epoch": 0.40887775832768997, + "grad_norm": 0.6882049441337585, + "learning_rate": 9.004430458134107e-05, + "loss": 1.0034, + "step": 64000 + }, + { + "epoch": 0.4089416454774287, + "grad_norm": 1.0259922742843628, + "learning_rate": 9.004129970601074e-05, + "loss": 1.11, + "step": 64010 + }, + { + "epoch": 0.4090055326271674, + "grad_norm": 0.7232376337051392, + "learning_rate": 9.003829442742989e-05, + "loss": 1.0349, + "step": 64020 + }, + { + "epoch": 0.4090694197769061, + "grad_norm": 0.7843562960624695, + "learning_rate": 9.003528874562875e-05, + "loss": 0.8303, + "step": 64030 + }, + { + "epoch": 0.4091333069266448, + "grad_norm": 0.7340204119682312, + "learning_rate": 9.003228266063765e-05, + "loss": 0.8455, + "step": 64040 + }, + { + "epoch": 0.4091971940763835, + "grad_norm": 0.618037760257721, + "learning_rate": 9.00292761724868e-05, + "loss": 0.8546, + "step": 64050 + }, + { + "epoch": 0.4092610812261222, + "grad_norm": 0.4729726314544678, + "learning_rate": 9.002626928120654e-05, + "loss": 0.8056, + "step": 64060 + }, + { + "epoch": 0.40932496837586085, + "grad_norm": 4.078080654144287, + "learning_rate": 9.002326198682712e-05, + "loss": 1.038, + "step": 64070 + }, + { + "epoch": 0.40938885552559956, + "grad_norm": 0.8431742787361145, + "learning_rate": 9.002025428937879e-05, + "loss": 
0.9887, + "step": 64080 + }, + { + "epoch": 0.40945274267533827, + "grad_norm": 1.1777523756027222, + "learning_rate": 9.00172461888919e-05, + "loss": 0.729, + "step": 64090 + }, + { + "epoch": 0.40951662982507697, + "grad_norm": 0.9393613338470459, + "learning_rate": 9.001423768539672e-05, + "loss": 0.9377, + "step": 64100 + }, + { + "epoch": 0.4095805169748157, + "grad_norm": 0.6365240812301636, + "learning_rate": 9.001122877892356e-05, + "loss": 0.6823, + "step": 64110 + }, + { + "epoch": 0.4096444041245544, + "grad_norm": 0.9078708291053772, + "learning_rate": 9.00082194695027e-05, + "loss": 0.8199, + "step": 64120 + }, + { + "epoch": 0.4097082912742931, + "grad_norm": 2.3477323055267334, + "learning_rate": 9.000520975716445e-05, + "loss": 0.955, + "step": 64130 + }, + { + "epoch": 0.4097721784240318, + "grad_norm": 0.5948389768600464, + "learning_rate": 9.000219964193914e-05, + "loss": 0.8302, + "step": 64140 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.6397002935409546, + "learning_rate": 8.999918912385708e-05, + "loss": 0.8583, + "step": 64150 + }, + { + "epoch": 0.4098999527235092, + "grad_norm": 1.9273061752319336, + "learning_rate": 8.999617820294857e-05, + "loss": 0.9524, + "step": 64160 + }, + { + "epoch": 0.4099638398732479, + "grad_norm": 0.8526675701141357, + "learning_rate": 8.999316687924395e-05, + "loss": 0.9583, + "step": 64170 + }, + { + "epoch": 0.4100277270229866, + "grad_norm": 0.9683032035827637, + "learning_rate": 8.999015515277352e-05, + "loss": 1.1449, + "step": 64180 + }, + { + "epoch": 0.41009161417272527, + "grad_norm": 0.6847209930419922, + "learning_rate": 8.998714302356766e-05, + "loss": 0.8517, + "step": 64190 + }, + { + "epoch": 0.410155501322464, + "grad_norm": 0.6209728717803955, + "learning_rate": 8.998413049165666e-05, + "loss": 0.8522, + "step": 64200 + }, + { + "epoch": 0.4102193884722027, + "grad_norm": 0.6670452952384949, + "learning_rate": 8.998111755707088e-05, + "loss": 0.9287, + "step": 64210 + }, + { + 
"epoch": 0.4102832756219414, + "grad_norm": 1.0811097621917725, + "learning_rate": 8.997810421984065e-05, + "loss": 0.9053, + "step": 64220 + }, + { + "epoch": 0.4103471627716801, + "grad_norm": 0.8516783118247986, + "learning_rate": 8.997509047999634e-05, + "loss": 0.9318, + "step": 64230 + }, + { + "epoch": 0.4104110499214188, + "grad_norm": 0.880136251449585, + "learning_rate": 8.997207633756828e-05, + "loss": 0.8013, + "step": 64240 + }, + { + "epoch": 0.4104749370711575, + "grad_norm": 0.804787278175354, + "learning_rate": 8.996906179258681e-05, + "loss": 1.0225, + "step": 64250 + }, + { + "epoch": 0.4105388242208962, + "grad_norm": 0.7522739171981812, + "learning_rate": 8.996604684508234e-05, + "loss": 0.9695, + "step": 64260 + }, + { + "epoch": 0.4106027113706349, + "grad_norm": 0.702925443649292, + "learning_rate": 8.996303149508518e-05, + "loss": 0.7292, + "step": 64270 + }, + { + "epoch": 0.4106665985203736, + "grad_norm": 1.3208075761795044, + "learning_rate": 8.996001574262574e-05, + "loss": 0.7754, + "step": 64280 + }, + { + "epoch": 0.41073048567011233, + "grad_norm": 0.8626548051834106, + "learning_rate": 8.995699958773435e-05, + "loss": 0.8081, + "step": 64290 + }, + { + "epoch": 0.41079437281985104, + "grad_norm": 0.8688634037971497, + "learning_rate": 8.995398303044142e-05, + "loss": 0.8649, + "step": 64300 + }, + { + "epoch": 0.4108582599695897, + "grad_norm": 0.9219092130661011, + "learning_rate": 8.995096607077731e-05, + "loss": 0.8441, + "step": 64310 + }, + { + "epoch": 0.4109221471193284, + "grad_norm": 1.0303871631622314, + "learning_rate": 8.994794870877241e-05, + "loss": 0.9238, + "step": 64320 + }, + { + "epoch": 0.4109860342690671, + "grad_norm": 0.938761830329895, + "learning_rate": 8.994493094445711e-05, + "loss": 0.6409, + "step": 64330 + }, + { + "epoch": 0.4110499214188058, + "grad_norm": 0.8566682934761047, + "learning_rate": 8.99419127778618e-05, + "loss": 0.9025, + "step": 64340 + }, + { + "epoch": 0.4111138085685445, + 
"grad_norm": 0.8266015648841858, + "learning_rate": 8.993889420901687e-05, + "loss": 0.811, + "step": 64350 + }, + { + "epoch": 0.4111776957182832, + "grad_norm": 0.9890789985656738, + "learning_rate": 8.993587523795271e-05, + "loss": 0.921, + "step": 64360 + }, + { + "epoch": 0.4112415828680219, + "grad_norm": 0.8247410655021667, + "learning_rate": 8.993285586469976e-05, + "loss": 1.0017, + "step": 64370 + }, + { + "epoch": 0.41130547001776063, + "grad_norm": 1.1178898811340332, + "learning_rate": 8.992983608928839e-05, + "loss": 0.9229, + "step": 64380 + }, + { + "epoch": 0.41136935716749934, + "grad_norm": 0.6633570194244385, + "learning_rate": 8.992681591174903e-05, + "loss": 0.7906, + "step": 64390 + }, + { + "epoch": 0.41143324431723804, + "grad_norm": 0.9946048855781555, + "learning_rate": 8.99237953321121e-05, + "loss": 0.8584, + "step": 64400 + }, + { + "epoch": 0.41149713146697675, + "grad_norm": 1.1637663841247559, + "learning_rate": 8.992077435040799e-05, + "loss": 0.7197, + "step": 64410 + }, + { + "epoch": 0.41156101861671546, + "grad_norm": 0.9917969703674316, + "learning_rate": 8.991775296666717e-05, + "loss": 0.9235, + "step": 64420 + }, + { + "epoch": 0.4116249057664541, + "grad_norm": 0.6718196868896484, + "learning_rate": 8.991473118092003e-05, + "loss": 0.891, + "step": 64430 + }, + { + "epoch": 0.4116887929161928, + "grad_norm": 0.6841692924499512, + "learning_rate": 8.991170899319702e-05, + "loss": 1.2224, + "step": 64440 + }, + { + "epoch": 0.4117526800659315, + "grad_norm": 0.8956950306892395, + "learning_rate": 8.990868640352857e-05, + "loss": 0.7681, + "step": 64450 + }, + { + "epoch": 0.4118165672156702, + "grad_norm": 0.8539284467697144, + "learning_rate": 8.990566341194513e-05, + "loss": 0.9718, + "step": 64460 + }, + { + "epoch": 0.41188045436540893, + "grad_norm": 1.2791920900344849, + "learning_rate": 8.990294237590787e-05, + "loss": 0.885, + "step": 64470 + }, + { + "epoch": 0.41194434151514764, + "grad_norm": 0.7224549651145935, + 
"learning_rate": 8.989991862076981e-05, + "loss": 0.8548, + "step": 64480 + }, + { + "epoch": 0.41200822866488634, + "grad_norm": 0.8494675159454346, + "learning_rate": 8.989689446380503e-05, + "loss": 1.2577, + "step": 64490 + }, + { + "epoch": 0.41207211581462505, + "grad_norm": 1.3127714395523071, + "learning_rate": 8.989386990504402e-05, + "loss": 0.9295, + "step": 64500 + }, + { + "epoch": 0.41213600296436376, + "grad_norm": 1.0279886722564697, + "learning_rate": 8.989084494451725e-05, + "loss": 1.096, + "step": 64510 + }, + { + "epoch": 0.41219989011410246, + "grad_norm": 1.2837331295013428, + "learning_rate": 8.988781958225515e-05, + "loss": 0.744, + "step": 64520 + }, + { + "epoch": 0.41226377726384117, + "grad_norm": 0.7983502745628357, + "learning_rate": 8.988479381828817e-05, + "loss": 0.7822, + "step": 64530 + }, + { + "epoch": 0.4123276644135799, + "grad_norm": 1.4588627815246582, + "learning_rate": 8.988176765264684e-05, + "loss": 0.9902, + "step": 64540 + }, + { + "epoch": 0.4123915515633186, + "grad_norm": 0.8244463205337524, + "learning_rate": 8.98787410853616e-05, + "loss": 0.7243, + "step": 64550 + }, + { + "epoch": 0.41245543871305723, + "grad_norm": 0.6158170104026794, + "learning_rate": 8.987571411646292e-05, + "loss": 0.9385, + "step": 64560 + }, + { + "epoch": 0.41251932586279594, + "grad_norm": 1.447361707687378, + "learning_rate": 8.987268674598133e-05, + "loss": 0.8133, + "step": 64570 + }, + { + "epoch": 0.41258321301253464, + "grad_norm": 0.8146925568580627, + "learning_rate": 8.986965897394728e-05, + "loss": 0.8676, + "step": 64580 + }, + { + "epoch": 0.41264710016227335, + "grad_norm": 0.798591136932373, + "learning_rate": 8.986663080039126e-05, + "loss": 0.9362, + "step": 64590 + }, + { + "epoch": 0.41271098731201206, + "grad_norm": 0.6926212906837463, + "learning_rate": 8.986360222534377e-05, + "loss": 0.7352, + "step": 64600 + }, + { + "epoch": 0.41277487446175076, + "grad_norm": 0.8038867115974426, + "learning_rate": 
8.986057324883535e-05, + "loss": 1.2618, + "step": 64610 + }, + { + "epoch": 0.41283876161148947, + "grad_norm": 1.9613544940948486, + "learning_rate": 8.985754387089647e-05, + "loss": 0.7561, + "step": 64620 + }, + { + "epoch": 0.4129026487612282, + "grad_norm": 0.8577353358268738, + "learning_rate": 8.985451409155762e-05, + "loss": 0.8224, + "step": 64630 + }, + { + "epoch": 0.4129665359109669, + "grad_norm": 0.5765991806983948, + "learning_rate": 8.985148391084934e-05, + "loss": 0.8005, + "step": 64640 + }, + { + "epoch": 0.4130304230607056, + "grad_norm": 0.4820258319377899, + "learning_rate": 8.984845332880213e-05, + "loss": 0.8121, + "step": 64650 + }, + { + "epoch": 0.4130943102104443, + "grad_norm": 1.0253220796585083, + "learning_rate": 8.984542234544656e-05, + "loss": 0.667, + "step": 64660 + }, + { + "epoch": 0.413158197360183, + "grad_norm": 0.6785098910331726, + "learning_rate": 8.984239096081308e-05, + "loss": 0.9457, + "step": 64670 + }, + { + "epoch": 0.41322208450992165, + "grad_norm": 0.9571899771690369, + "learning_rate": 8.983935917493227e-05, + "loss": 1.1104, + "step": 64680 + }, + { + "epoch": 0.41328597165966036, + "grad_norm": 0.9146550893783569, + "learning_rate": 8.983632698783463e-05, + "loss": 1.0036, + "step": 64690 + }, + { + "epoch": 0.41334985880939906, + "grad_norm": 1.2323453426361084, + "learning_rate": 8.983329439955075e-05, + "loss": 0.8704, + "step": 64700 + }, + { + "epoch": 0.41341374595913777, + "grad_norm": 1.2397221326828003, + "learning_rate": 8.98302614101111e-05, + "loss": 0.9207, + "step": 64710 + }, + { + "epoch": 0.4134776331088765, + "grad_norm": 0.7488781213760376, + "learning_rate": 8.982722801954627e-05, + "loss": 0.7842, + "step": 64720 + }, + { + "epoch": 0.4135415202586152, + "grad_norm": 2.860487461090088, + "learning_rate": 8.98241942278868e-05, + "loss": 0.7993, + "step": 64730 + }, + { + "epoch": 0.4136054074083539, + "grad_norm": 0.9510349035263062, + "learning_rate": 8.982116003516324e-05, + "loss": 
0.9748, + "step": 64740 + }, + { + "epoch": 0.4136692945580926, + "grad_norm": 0.7281776666641235, + "learning_rate": 8.981812544140615e-05, + "loss": 0.8138, + "step": 64750 + }, + { + "epoch": 0.4137331817078313, + "grad_norm": 1.0613374710083008, + "learning_rate": 8.981509044664608e-05, + "loss": 0.7931, + "step": 64760 + }, + { + "epoch": 0.41379706885757, + "grad_norm": 0.5681692361831665, + "learning_rate": 8.981205505091363e-05, + "loss": 0.7882, + "step": 64770 + }, + { + "epoch": 0.4138609560073087, + "grad_norm": 0.9813446998596191, + "learning_rate": 8.980901925423932e-05, + "loss": 0.8314, + "step": 64780 + }, + { + "epoch": 0.4139248431570474, + "grad_norm": 0.9293444156646729, + "learning_rate": 8.980598305665375e-05, + "loss": 0.6781, + "step": 64790 + }, + { + "epoch": 0.41398873030678607, + "grad_norm": 0.8885220289230347, + "learning_rate": 8.98029464581875e-05, + "loss": 0.8954, + "step": 64800 + }, + { + "epoch": 0.4140526174565248, + "grad_norm": 0.7418517470359802, + "learning_rate": 8.979990945887114e-05, + "loss": 0.8223, + "step": 64810 + }, + { + "epoch": 0.4141165046062635, + "grad_norm": 0.9399496912956238, + "learning_rate": 8.979687205873526e-05, + "loss": 0.9294, + "step": 64820 + }, + { + "epoch": 0.4141803917560022, + "grad_norm": 0.7863808274269104, + "learning_rate": 8.979383425781046e-05, + "loss": 0.7421, + "step": 64830 + }, + { + "epoch": 0.4142442789057409, + "grad_norm": 0.8157973289489746, + "learning_rate": 8.97907960561273e-05, + "loss": 0.7436, + "step": 64840 + }, + { + "epoch": 0.4143081660554796, + "grad_norm": 0.8442412614822388, + "learning_rate": 8.978775745371642e-05, + "loss": 0.8952, + "step": 64850 + }, + { + "epoch": 0.4143720532052183, + "grad_norm": 1.1635627746582031, + "learning_rate": 8.978471845060838e-05, + "loss": 0.9926, + "step": 64860 + }, + { + "epoch": 0.414435940354957, + "grad_norm": 0.975397527217865, + "learning_rate": 8.978167904683383e-05, + "loss": 0.814, + "step": 64870 + }, + { + 
"epoch": 0.4144998275046957, + "grad_norm": 0.6589390635490417, + "learning_rate": 8.977863924242335e-05, + "loss": 0.8215, + "step": 64880 + }, + { + "epoch": 0.4145637146544344, + "grad_norm": 1.0602787733078003, + "learning_rate": 8.977559903740756e-05, + "loss": 0.7723, + "step": 64890 + }, + { + "epoch": 0.41462760180417313, + "grad_norm": 0.7909052968025208, + "learning_rate": 8.977255843181707e-05, + "loss": 0.9256, + "step": 64900 + }, + { + "epoch": 0.41469148895391184, + "grad_norm": 3.674098491668701, + "learning_rate": 8.976951742568249e-05, + "loss": 0.822, + "step": 64910 + }, + { + "epoch": 0.4147553761036505, + "grad_norm": 1.0264021158218384, + "learning_rate": 8.97664760190345e-05, + "loss": 0.9519, + "step": 64920 + }, + { + "epoch": 0.4148192632533892, + "grad_norm": 0.8971536755561829, + "learning_rate": 8.976343421190367e-05, + "loss": 0.8374, + "step": 64930 + }, + { + "epoch": 0.4148831504031279, + "grad_norm": 0.4284789562225342, + "learning_rate": 8.976039200432067e-05, + "loss": 0.7519, + "step": 64940 + }, + { + "epoch": 0.4149470375528666, + "grad_norm": 0.5948058366775513, + "learning_rate": 8.975734939631612e-05, + "loss": 0.847, + "step": 64950 + }, + { + "epoch": 0.4150109247026053, + "grad_norm": 1.361910104751587, + "learning_rate": 8.975430638792066e-05, + "loss": 0.8689, + "step": 64960 + }, + { + "epoch": 0.415074811852344, + "grad_norm": 1.295660376548767, + "learning_rate": 8.975126297916495e-05, + "loss": 0.8311, + "step": 64970 + }, + { + "epoch": 0.4151386990020827, + "grad_norm": 1.1786941289901733, + "learning_rate": 8.974821917007962e-05, + "loss": 1.0239, + "step": 64980 + }, + { + "epoch": 0.41520258615182143, + "grad_norm": 1.0737582445144653, + "learning_rate": 8.974517496069536e-05, + "loss": 0.9261, + "step": 64990 + }, + { + "epoch": 0.41526647330156014, + "grad_norm": 0.853832483291626, + "learning_rate": 8.97421303510428e-05, + "loss": 0.9344, + "step": 65000 + }, + { + "epoch": 0.41533036045129884, + 
"grad_norm": 2.251110553741455, + "learning_rate": 8.973908534115259e-05, + "loss": 1.0998, + "step": 65010 + }, + { + "epoch": 0.41539424760103755, + "grad_norm": 0.8067615628242493, + "learning_rate": 8.973603993105542e-05, + "loss": 0.9716, + "step": 65020 + }, + { + "epoch": 0.41545813475077625, + "grad_norm": 0.5543524622917175, + "learning_rate": 8.973299412078194e-05, + "loss": 0.7536, + "step": 65030 + }, + { + "epoch": 0.4155220219005149, + "grad_norm": 1.8256611824035645, + "learning_rate": 8.972994791036284e-05, + "loss": 1.0459, + "step": 65040 + }, + { + "epoch": 0.4155859090502536, + "grad_norm": 0.8897950053215027, + "learning_rate": 8.97269012998288e-05, + "loss": 0.9819, + "step": 65050 + }, + { + "epoch": 0.4156497961999923, + "grad_norm": 0.9131116271018982, + "learning_rate": 8.97238542892105e-05, + "loss": 0.8402, + "step": 65060 + }, + { + "epoch": 0.415713683349731, + "grad_norm": 1.1356046199798584, + "learning_rate": 8.972080687853861e-05, + "loss": 0.841, + "step": 65070 + }, + { + "epoch": 0.41577757049946973, + "grad_norm": 0.8197834491729736, + "learning_rate": 8.971775906784383e-05, + "loss": 0.7874, + "step": 65080 + }, + { + "epoch": 0.41584145764920843, + "grad_norm": 0.6989075541496277, + "learning_rate": 8.971471085715686e-05, + "loss": 0.8665, + "step": 65090 + }, + { + "epoch": 0.41590534479894714, + "grad_norm": 0.732832670211792, + "learning_rate": 8.97116622465084e-05, + "loss": 0.6657, + "step": 65100 + }, + { + "epoch": 0.41596923194868585, + "grad_norm": 0.8469944000244141, + "learning_rate": 8.970861323592913e-05, + "loss": 0.8977, + "step": 65110 + }, + { + "epoch": 0.41603311909842455, + "grad_norm": 1.300403118133545, + "learning_rate": 8.970556382544978e-05, + "loss": 1.0034, + "step": 65120 + }, + { + "epoch": 0.41609700624816326, + "grad_norm": 1.2970237731933594, + "learning_rate": 8.970251401510107e-05, + "loss": 1.0144, + "step": 65130 + }, + { + "epoch": 0.41616089339790197, + "grad_norm": 1.0652505159378052, + 
"learning_rate": 8.969946380491367e-05, + "loss": 0.7254, + "step": 65140 + }, + { + "epoch": 0.41622478054764067, + "grad_norm": 0.9304187297821045, + "learning_rate": 8.969641319491833e-05, + "loss": 0.884, + "step": 65150 + }, + { + "epoch": 0.4162886676973793, + "grad_norm": 0.8894677758216858, + "learning_rate": 8.969336218514579e-05, + "loss": 1.1695, + "step": 65160 + }, + { + "epoch": 0.41635255484711803, + "grad_norm": 0.7384070158004761, + "learning_rate": 8.969031077562673e-05, + "loss": 0.8618, + "step": 65170 + }, + { + "epoch": 0.41641644199685673, + "grad_norm": 0.8503040671348572, + "learning_rate": 8.968725896639189e-05, + "loss": 1.1173, + "step": 65180 + }, + { + "epoch": 0.41648032914659544, + "grad_norm": 1.213909387588501, + "learning_rate": 8.968420675747204e-05, + "loss": 0.5525, + "step": 65190 + }, + { + "epoch": 0.41654421629633415, + "grad_norm": 0.8109204769134521, + "learning_rate": 8.968115414889791e-05, + "loss": 0.8147, + "step": 65200 + }, + { + "epoch": 0.41660810344607285, + "grad_norm": 0.9055116772651672, + "learning_rate": 8.967810114070022e-05, + "loss": 0.8597, + "step": 65210 + }, + { + "epoch": 0.41667199059581156, + "grad_norm": 0.7332736849784851, + "learning_rate": 8.96750477329097e-05, + "loss": 0.8135, + "step": 65220 + }, + { + "epoch": 0.41673587774555026, + "grad_norm": 1.6133145093917847, + "learning_rate": 8.967199392555714e-05, + "loss": 0.5944, + "step": 65230 + }, + { + "epoch": 0.41679976489528897, + "grad_norm": 1.0285025835037231, + "learning_rate": 8.966893971867329e-05, + "loss": 1.2345, + "step": 65240 + }, + { + "epoch": 0.4168636520450277, + "grad_norm": 0.7970749139785767, + "learning_rate": 8.966588511228888e-05, + "loss": 0.9716, + "step": 65250 + }, + { + "epoch": 0.4169275391947664, + "grad_norm": 0.9936865568161011, + "learning_rate": 8.96628301064347e-05, + "loss": 0.6991, + "step": 65260 + }, + { + "epoch": 0.4169914263445051, + "grad_norm": 0.6631901264190674, + "learning_rate": 
8.965977470114151e-05, + "loss": 1.0921, + "step": 65270 + }, + { + "epoch": 0.41705531349424374, + "grad_norm": 1.0194664001464844, + "learning_rate": 8.965671889644007e-05, + "loss": 0.8967, + "step": 65280 + }, + { + "epoch": 0.41711920064398245, + "grad_norm": 1.145621657371521, + "learning_rate": 8.965366269236117e-05, + "loss": 0.9233, + "step": 65290 + }, + { + "epoch": 0.41718308779372115, + "grad_norm": 0.7853092551231384, + "learning_rate": 8.965060608893559e-05, + "loss": 0.8627, + "step": 65300 + }, + { + "epoch": 0.41724697494345986, + "grad_norm": 0.7077251672744751, + "learning_rate": 8.96475490861941e-05, + "loss": 1.0426, + "step": 65310 + }, + { + "epoch": 0.41731086209319856, + "grad_norm": 0.9070340394973755, + "learning_rate": 8.964449168416749e-05, + "loss": 0.9206, + "step": 65320 + }, + { + "epoch": 0.41737474924293727, + "grad_norm": 1.0521044731140137, + "learning_rate": 8.964143388288653e-05, + "loss": 0.886, + "step": 65330 + }, + { + "epoch": 0.417438636392676, + "grad_norm": 0.46310827136039734, + "learning_rate": 8.963837568238205e-05, + "loss": 0.8873, + "step": 65340 + }, + { + "epoch": 0.4175025235424147, + "grad_norm": 0.7745985388755798, + "learning_rate": 8.963531708268485e-05, + "loss": 0.9885, + "step": 65350 + }, + { + "epoch": 0.4175664106921534, + "grad_norm": 0.7876570820808411, + "learning_rate": 8.96322580838257e-05, + "loss": 0.6772, + "step": 65360 + }, + { + "epoch": 0.4176302978418921, + "grad_norm": 1.1240822076797485, + "learning_rate": 8.962919868583544e-05, + "loss": 0.9992, + "step": 65370 + }, + { + "epoch": 0.4176941849916308, + "grad_norm": 1.488118290901184, + "learning_rate": 8.962613888874485e-05, + "loss": 1.1016, + "step": 65380 + }, + { + "epoch": 0.4177580721413695, + "grad_norm": 1.2619564533233643, + "learning_rate": 8.962307869258476e-05, + "loss": 1.1059, + "step": 65390 + }, + { + "epoch": 0.4178219592911082, + "grad_norm": 3.796415328979492, + "learning_rate": 8.962001809738599e-05, + "loss": 
1.0094, + "step": 65400 + }, + { + "epoch": 0.41788584644084686, + "grad_norm": 0.6639039516448975, + "learning_rate": 8.961695710317936e-05, + "loss": 0.9859, + "step": 65410 + }, + { + "epoch": 0.41794973359058557, + "grad_norm": 1.1306976079940796, + "learning_rate": 8.961389570999573e-05, + "loss": 0.6482, + "step": 65420 + }, + { + "epoch": 0.4180136207403243, + "grad_norm": 1.1172826290130615, + "learning_rate": 8.961083391786585e-05, + "loss": 0.9486, + "step": 65430 + }, + { + "epoch": 0.418077507890063, + "grad_norm": 0.6498112678527832, + "learning_rate": 8.960777172682063e-05, + "loss": 0.7618, + "step": 65440 + }, + { + "epoch": 0.4181413950398017, + "grad_norm": 0.7367339134216309, + "learning_rate": 8.960470913689088e-05, + "loss": 0.9973, + "step": 65450 + }, + { + "epoch": 0.4182052821895404, + "grad_norm": 0.7694912552833557, + "learning_rate": 8.960164614810744e-05, + "loss": 0.8996, + "step": 65460 + }, + { + "epoch": 0.4182691693392791, + "grad_norm": 0.9336798191070557, + "learning_rate": 8.959858276050118e-05, + "loss": 0.9093, + "step": 65470 + }, + { + "epoch": 0.4183330564890178, + "grad_norm": 0.7847772836685181, + "learning_rate": 8.959551897410292e-05, + "loss": 0.7734, + "step": 65480 + }, + { + "epoch": 0.4183969436387565, + "grad_norm": 0.8503950834274292, + "learning_rate": 8.959245478894353e-05, + "loss": 0.9816, + "step": 65490 + }, + { + "epoch": 0.4184608307884952, + "grad_norm": 0.9452194571495056, + "learning_rate": 8.958939020505388e-05, + "loss": 0.9682, + "step": 65500 + }, + { + "epoch": 0.4185247179382339, + "grad_norm": 0.7661617398262024, + "learning_rate": 8.95863252224648e-05, + "loss": 0.6985, + "step": 65510 + }, + { + "epoch": 0.41858860508797263, + "grad_norm": 0.4529026448726654, + "learning_rate": 8.958325984120718e-05, + "loss": 0.6564, + "step": 65520 + }, + { + "epoch": 0.4186524922377113, + "grad_norm": 2.9925787448883057, + "learning_rate": 8.958019406131191e-05, + "loss": 0.6851, + "step": 65530 + }, + { + 
"epoch": 0.41871637938745, + "grad_norm": 0.726306676864624, + "learning_rate": 8.957712788280982e-05, + "loss": 0.8031, + "step": 65540 + }, + { + "epoch": 0.4187802665371887, + "grad_norm": 0.6993584036827087, + "learning_rate": 8.957406130573183e-05, + "loss": 0.8313, + "step": 65550 + }, + { + "epoch": 0.4188441536869274, + "grad_norm": 0.9798833131790161, + "learning_rate": 8.957099433010881e-05, + "loss": 0.9016, + "step": 65560 + }, + { + "epoch": 0.4189080408366661, + "grad_norm": 0.7420501708984375, + "learning_rate": 8.956792695597163e-05, + "loss": 0.6753, + "step": 65570 + }, + { + "epoch": 0.4189719279864048, + "grad_norm": 0.7620697617530823, + "learning_rate": 8.95648591833512e-05, + "loss": 0.8782, + "step": 65580 + }, + { + "epoch": 0.4190358151361435, + "grad_norm": 1.2457002401351929, + "learning_rate": 8.956179101227842e-05, + "loss": 1.1031, + "step": 65590 + }, + { + "epoch": 0.4190997022858822, + "grad_norm": 1.005566120147705, + "learning_rate": 8.955872244278416e-05, + "loss": 1.1262, + "step": 65600 + }, + { + "epoch": 0.41916358943562093, + "grad_norm": 1.014225959777832, + "learning_rate": 8.955565347489935e-05, + "loss": 0.9578, + "step": 65610 + }, + { + "epoch": 0.41922747658535964, + "grad_norm": 1.1588691473007202, + "learning_rate": 8.955258410865488e-05, + "loss": 1.1571, + "step": 65620 + }, + { + "epoch": 0.41929136373509834, + "grad_norm": 0.5882399678230286, + "learning_rate": 8.954951434408168e-05, + "loss": 0.8187, + "step": 65630 + }, + { + "epoch": 0.41935525088483705, + "grad_norm": 0.5177319645881653, + "learning_rate": 8.954644418121065e-05, + "loss": 0.9707, + "step": 65640 + }, + { + "epoch": 0.4194191380345757, + "grad_norm": 1.1925745010375977, + "learning_rate": 8.954337362007273e-05, + "loss": 0.9326, + "step": 65650 + }, + { + "epoch": 0.4194830251843144, + "grad_norm": 7.771919250488281, + "learning_rate": 8.954030266069882e-05, + "loss": 0.9108, + "step": 65660 + }, + { + "epoch": 0.4195469123340531, + 
"grad_norm": 0.7512636184692383, + "learning_rate": 8.953723130311984e-05, + "loss": 0.7775, + "step": 65670 + }, + { + "epoch": 0.4196107994837918, + "grad_norm": 1.3332923650741577, + "learning_rate": 8.953415954736675e-05, + "loss": 1.0754, + "step": 65680 + }, + { + "epoch": 0.4196746866335305, + "grad_norm": 0.5721650719642639, + "learning_rate": 8.953108739347047e-05, + "loss": 0.9334, + "step": 65690 + }, + { + "epoch": 0.41973857378326923, + "grad_norm": 0.9686694741249084, + "learning_rate": 8.952801484146194e-05, + "loss": 1.0833, + "step": 65700 + }, + { + "epoch": 0.41980246093300794, + "grad_norm": 0.7527593374252319, + "learning_rate": 8.95249418913721e-05, + "loss": 0.9329, + "step": 65710 + }, + { + "epoch": 0.41986634808274664, + "grad_norm": 0.8230323195457458, + "learning_rate": 8.95218685432319e-05, + "loss": 1.1346, + "step": 65720 + }, + { + "epoch": 0.41993023523248535, + "grad_norm": 2.3285200595855713, + "learning_rate": 8.95187947970723e-05, + "loss": 1.1473, + "step": 65730 + }, + { + "epoch": 0.41999412238222406, + "grad_norm": 0.7577224969863892, + "learning_rate": 8.951572065292424e-05, + "loss": 0.9537, + "step": 65740 + }, + { + "epoch": 0.42005800953196276, + "grad_norm": 0.5602964758872986, + "learning_rate": 8.95126461108187e-05, + "loss": 0.9735, + "step": 65750 + }, + { + "epoch": 0.42012189668170147, + "grad_norm": 0.63779217004776, + "learning_rate": 8.950957117078662e-05, + "loss": 0.7456, + "step": 65760 + }, + { + "epoch": 0.4201857838314401, + "grad_norm": 1.0952467918395996, + "learning_rate": 8.950649583285898e-05, + "loss": 0.906, + "step": 65770 + }, + { + "epoch": 0.4202496709811788, + "grad_norm": 0.687418520450592, + "learning_rate": 8.950342009706675e-05, + "loss": 0.7934, + "step": 65780 + }, + { + "epoch": 0.42031355813091753, + "grad_norm": 1.1235476732254028, + "learning_rate": 8.95003439634409e-05, + "loss": 0.9785, + "step": 65790 + }, + { + "epoch": 0.42037744528065624, + "grad_norm": 0.9413420557975769, + 
"learning_rate": 8.949726743201242e-05, + "loss": 0.8127, + "step": 65800 + }, + { + "epoch": 0.42044133243039494, + "grad_norm": 0.9101980924606323, + "learning_rate": 8.949419050281228e-05, + "loss": 1.1065, + "step": 65810 + }, + { + "epoch": 0.42050521958013365, + "grad_norm": 0.6493207812309265, + "learning_rate": 8.94911131758715e-05, + "loss": 0.7957, + "step": 65820 + }, + { + "epoch": 0.42056910672987236, + "grad_norm": 0.753649890422821, + "learning_rate": 8.9488035451221e-05, + "loss": 1.1482, + "step": 65830 + }, + { + "epoch": 0.42063299387961106, + "grad_norm": 0.9392029047012329, + "learning_rate": 8.948495732889185e-05, + "loss": 0.8979, + "step": 65840 + }, + { + "epoch": 0.42069688102934977, + "grad_norm": 1.159693717956543, + "learning_rate": 8.948187880891501e-05, + "loss": 0.9243, + "step": 65850 + }, + { + "epoch": 0.4207607681790885, + "grad_norm": 0.7073017358779907, + "learning_rate": 8.947879989132151e-05, + "loss": 0.7319, + "step": 65860 + }, + { + "epoch": 0.4208246553288272, + "grad_norm": 0.5303799510002136, + "learning_rate": 8.947572057614231e-05, + "loss": 0.7607, + "step": 65870 + }, + { + "epoch": 0.4208885424785659, + "grad_norm": 0.734471321105957, + "learning_rate": 8.947264086340847e-05, + "loss": 0.8881, + "step": 65880 + }, + { + "epoch": 0.42095242962830454, + "grad_norm": 0.990151047706604, + "learning_rate": 8.9469560753151e-05, + "loss": 1.0643, + "step": 65890 + }, + { + "epoch": 0.42101631677804324, + "grad_norm": 0.6194562315940857, + "learning_rate": 8.94664802454009e-05, + "loss": 1.0016, + "step": 65900 + }, + { + "epoch": 0.42108020392778195, + "grad_norm": 0.8033568263053894, + "learning_rate": 8.946339934018919e-05, + "loss": 0.9705, + "step": 65910 + }, + { + "epoch": 0.42114409107752065, + "grad_norm": 0.9492495656013489, + "learning_rate": 8.946031803754693e-05, + "loss": 0.7755, + "step": 65920 + }, + { + "epoch": 0.42120797822725936, + "grad_norm": 0.7201482057571411, + "learning_rate": 
8.945723633750512e-05, + "loss": 0.9378, + "step": 65930 + }, + { + "epoch": 0.42127186537699807, + "grad_norm": 0.9046865701675415, + "learning_rate": 8.945415424009478e-05, + "loss": 0.9119, + "step": 65940 + }, + { + "epoch": 0.4213357525267368, + "grad_norm": 0.8343170881271362, + "learning_rate": 8.945107174534699e-05, + "loss": 0.9199, + "step": 65950 + }, + { + "epoch": 0.4213996396764755, + "grad_norm": 1.330496072769165, + "learning_rate": 8.94479888532928e-05, + "loss": 0.7923, + "step": 65960 + }, + { + "epoch": 0.4214635268262142, + "grad_norm": 0.7059889435768127, + "learning_rate": 8.94449055639632e-05, + "loss": 0.9541, + "step": 65970 + }, + { + "epoch": 0.4215274139759529, + "grad_norm": 0.8712106347084045, + "learning_rate": 8.944182187738929e-05, + "loss": 1.1176, + "step": 65980 + }, + { + "epoch": 0.4215913011256916, + "grad_norm": 0.9229239821434021, + "learning_rate": 8.943873779360213e-05, + "loss": 0.7503, + "step": 65990 + }, + { + "epoch": 0.4216551882754303, + "grad_norm": 1.1028259992599487, + "learning_rate": 8.943565331263274e-05, + "loss": 1.1007, + "step": 66000 + }, + { + "epoch": 0.42171907542516895, + "grad_norm": 0.8142592906951904, + "learning_rate": 8.943256843451221e-05, + "loss": 0.9841, + "step": 66010 + }, + { + "epoch": 0.42178296257490766, + "grad_norm": 1.184031367301941, + "learning_rate": 8.94294831592716e-05, + "loss": 0.9047, + "step": 66020 + }, + { + "epoch": 0.42184684972464637, + "grad_norm": 1.0467089414596558, + "learning_rate": 8.9426397486942e-05, + "loss": 1.0965, + "step": 66030 + }, + { + "epoch": 0.4219107368743851, + "grad_norm": 2.1014811992645264, + "learning_rate": 8.942331141755445e-05, + "loss": 0.7972, + "step": 66040 + }, + { + "epoch": 0.4219746240241238, + "grad_norm": 0.9660546183586121, + "learning_rate": 8.942022495114004e-05, + "loss": 0.8632, + "step": 66050 + }, + { + "epoch": 0.4220385111738625, + "grad_norm": 1.027685523033142, + "learning_rate": 8.941713808772986e-05, + "loss": 0.9732, 
+ "step": 66060 + }, + { + "epoch": 0.4221023983236012, + "grad_norm": 1.0367803573608398, + "learning_rate": 8.941405082735503e-05, + "loss": 0.8984, + "step": 66070 + }, + { + "epoch": 0.4221662854733399, + "grad_norm": 0.9707443714141846, + "learning_rate": 8.941096317004658e-05, + "loss": 0.8234, + "step": 66080 + }, + { + "epoch": 0.4222301726230786, + "grad_norm": 0.9538044929504395, + "learning_rate": 8.940787511583567e-05, + "loss": 0.8863, + "step": 66090 + }, + { + "epoch": 0.4222940597728173, + "grad_norm": 0.920036256313324, + "learning_rate": 8.940478666475333e-05, + "loss": 0.9216, + "step": 66100 + }, + { + "epoch": 0.422357946922556, + "grad_norm": 1.042989730834961, + "learning_rate": 8.94016978168307e-05, + "loss": 0.9341, + "step": 66110 + }, + { + "epoch": 0.4224218340722947, + "grad_norm": 0.8969932794570923, + "learning_rate": 8.93986085720989e-05, + "loss": 0.9116, + "step": 66120 + }, + { + "epoch": 0.42248572122203343, + "grad_norm": 1.1264333724975586, + "learning_rate": 8.939551893058902e-05, + "loss": 0.8614, + "step": 66130 + }, + { + "epoch": 0.4225496083717721, + "grad_norm": 0.9502818584442139, + "learning_rate": 8.939242889233219e-05, + "loss": 0.7416, + "step": 66140 + }, + { + "epoch": 0.4226134955215108, + "grad_norm": 1.023710012435913, + "learning_rate": 8.93893384573595e-05, + "loss": 0.6771, + "step": 66150 + }, + { + "epoch": 0.4226773826712495, + "grad_norm": 0.9767735004425049, + "learning_rate": 8.938624762570213e-05, + "loss": 0.8853, + "step": 66160 + }, + { + "epoch": 0.4227412698209882, + "grad_norm": 0.9584904909133911, + "learning_rate": 8.938315639739115e-05, + "loss": 0.8563, + "step": 66170 + }, + { + "epoch": 0.4228051569707269, + "grad_norm": 0.7890828847885132, + "learning_rate": 8.938006477245773e-05, + "loss": 0.6442, + "step": 66180 + }, + { + "epoch": 0.4228690441204656, + "grad_norm": 0.7645769119262695, + "learning_rate": 8.937697275093298e-05, + "loss": 0.5916, + "step": 66190 + }, + { + "epoch": 
0.4229329312702043, + "grad_norm": 1.5121580362319946, + "learning_rate": 8.937388033284804e-05, + "loss": 0.8618, + "step": 66200 + }, + { + "epoch": 0.422996818419943, + "grad_norm": 0.7250730395317078, + "learning_rate": 8.937078751823406e-05, + "loss": 0.7188, + "step": 66210 + }, + { + "epoch": 0.4230607055696817, + "grad_norm": 1.0403555631637573, + "learning_rate": 8.93676943071222e-05, + "loss": 0.7553, + "step": 66220 + }, + { + "epoch": 0.42312459271942043, + "grad_norm": 0.6218414902687073, + "learning_rate": 8.93646006995436e-05, + "loss": 0.921, + "step": 66230 + }, + { + "epoch": 0.42318847986915914, + "grad_norm": 0.6752848029136658, + "learning_rate": 8.93615066955294e-05, + "loss": 0.9711, + "step": 66240 + }, + { + "epoch": 0.42325236701889785, + "grad_norm": 0.8991808295249939, + "learning_rate": 8.935841229511079e-05, + "loss": 0.8345, + "step": 66250 + }, + { + "epoch": 0.4233162541686365, + "grad_norm": 0.5273988246917725, + "learning_rate": 8.935531749831892e-05, + "loss": 1.0576, + "step": 66260 + }, + { + "epoch": 0.4233801413183752, + "grad_norm": 0.6460761427879333, + "learning_rate": 8.935222230518496e-05, + "loss": 0.7446, + "step": 66270 + }, + { + "epoch": 0.4234440284681139, + "grad_norm": 0.8064502477645874, + "learning_rate": 8.934912671574007e-05, + "loss": 0.9758, + "step": 66280 + }, + { + "epoch": 0.4235079156178526, + "grad_norm": 1.1486152410507202, + "learning_rate": 8.934603073001542e-05, + "loss": 0.9056, + "step": 66290 + }, + { + "epoch": 0.4235718027675913, + "grad_norm": 0.8460824489593506, + "learning_rate": 8.934293434804221e-05, + "loss": 1.0032, + "step": 66300 + }, + { + "epoch": 0.42363568991733, + "grad_norm": 1.1093133687973022, + "learning_rate": 8.933983756985163e-05, + "loss": 0.9909, + "step": 66310 + }, + { + "epoch": 0.42369957706706873, + "grad_norm": 0.9142333269119263, + "learning_rate": 8.933674039547484e-05, + "loss": 0.7974, + "step": 66320 + }, + { + "epoch": 0.42376346421680744, + "grad_norm": 
1.893848180770874, + "learning_rate": 8.933364282494304e-05, + "loss": 0.8881, + "step": 66330 + }, + { + "epoch": 0.42382735136654615, + "grad_norm": 0.6643238663673401, + "learning_rate": 8.933054485828742e-05, + "loss": 0.6162, + "step": 66340 + }, + { + "epoch": 0.42389123851628485, + "grad_norm": 0.853939950466156, + "learning_rate": 8.932744649553921e-05, + "loss": 0.8599, + "step": 66350 + }, + { + "epoch": 0.42395512566602356, + "grad_norm": 0.6014842987060547, + "learning_rate": 8.932434773672958e-05, + "loss": 0.7083, + "step": 66360 + }, + { + "epoch": 0.42401901281576226, + "grad_norm": 0.7437098622322083, + "learning_rate": 8.932124858188975e-05, + "loss": 0.8959, + "step": 66370 + }, + { + "epoch": 0.4240828999655009, + "grad_norm": 0.731093168258667, + "learning_rate": 8.931814903105092e-05, + "loss": 0.7875, + "step": 66380 + }, + { + "epoch": 0.4241467871152396, + "grad_norm": 0.7877000570297241, + "learning_rate": 8.931504908424431e-05, + "loss": 0.8525, + "step": 66390 + }, + { + "epoch": 0.4242106742649783, + "grad_norm": 0.929058849811554, + "learning_rate": 8.931194874150116e-05, + "loss": 0.7897, + "step": 66400 + }, + { + "epoch": 0.42427456141471703, + "grad_norm": 1.1878929138183594, + "learning_rate": 8.930884800285266e-05, + "loss": 0.9998, + "step": 66410 + }, + { + "epoch": 0.42433844856445574, + "grad_norm": 0.5432239174842834, + "learning_rate": 8.930574686833008e-05, + "loss": 0.8411, + "step": 66420 + }, + { + "epoch": 0.42440233571419445, + "grad_norm": 0.9339645504951477, + "learning_rate": 8.930264533796459e-05, + "loss": 0.9499, + "step": 66430 + }, + { + "epoch": 0.42446622286393315, + "grad_norm": 0.762392520904541, + "learning_rate": 8.929954341178749e-05, + "loss": 0.8893, + "step": 66440 + }, + { + "epoch": 0.42453011001367186, + "grad_norm": 0.6582323908805847, + "learning_rate": 8.929644108982998e-05, + "loss": 0.8409, + "step": 66450 + }, + { + "epoch": 0.42459399716341056, + "grad_norm": 0.8481007218360901, + 
"learning_rate": 8.92933383721233e-05, + "loss": 0.9136, + "step": 66460 + }, + { + "epoch": 0.42465788431314927, + "grad_norm": 0.6672992706298828, + "learning_rate": 8.929023525869872e-05, + "loss": 0.8445, + "step": 66470 + }, + { + "epoch": 0.424721771462888, + "grad_norm": 1.0847039222717285, + "learning_rate": 8.928713174958748e-05, + "loss": 0.9611, + "step": 66480 + }, + { + "epoch": 0.4247856586126267, + "grad_norm": 0.881767213344574, + "learning_rate": 8.928402784482084e-05, + "loss": 0.9177, + "step": 66490 + }, + { + "epoch": 0.42484954576236533, + "grad_norm": 1.421280026435852, + "learning_rate": 8.928123399227131e-05, + "loss": 0.852, + "step": 66500 + }, + { + "epoch": 0.42491343291210404, + "grad_norm": 0.6856515407562256, + "learning_rate": 8.927812933584552e-05, + "loss": 1.2407, + "step": 66510 + }, + { + "epoch": 0.42497732006184274, + "grad_norm": 0.8232982754707336, + "learning_rate": 8.927502428385498e-05, + "loss": 1.111, + "step": 66520 + }, + { + "epoch": 0.42504120721158145, + "grad_norm": 0.7256129384040833, + "learning_rate": 8.927191883633097e-05, + "loss": 0.9756, + "step": 66530 + }, + { + "epoch": 0.42510509436132016, + "grad_norm": 0.5605076551437378, + "learning_rate": 8.926881299330476e-05, + "loss": 0.8828, + "step": 66540 + }, + { + "epoch": 0.42516898151105886, + "grad_norm": 1.3462023735046387, + "learning_rate": 8.926570675480764e-05, + "loss": 0.9569, + "step": 66550 + }, + { + "epoch": 0.42523286866079757, + "grad_norm": 1.0285893678665161, + "learning_rate": 8.926260012087087e-05, + "loss": 0.9012, + "step": 66560 + }, + { + "epoch": 0.4252967558105363, + "grad_norm": 1.9040067195892334, + "learning_rate": 8.925949309152577e-05, + "loss": 1.0781, + "step": 66570 + }, + { + "epoch": 0.425360642960275, + "grad_norm": 0.7241610288619995, + "learning_rate": 8.925638566680359e-05, + "loss": 0.7973, + "step": 66580 + }, + { + "epoch": 0.4254245301100137, + "grad_norm": 1.2173702716827393, + "learning_rate": 
8.925327784673564e-05, + "loss": 1.047, + "step": 66590 + }, + { + "epoch": 0.4254884172597524, + "grad_norm": 0.8426626920700073, + "learning_rate": 8.925016963135324e-05, + "loss": 0.78, + "step": 66600 + }, + { + "epoch": 0.4255523044094911, + "grad_norm": 1.3263126611709595, + "learning_rate": 8.924706102068767e-05, + "loss": 0.7994, + "step": 66610 + }, + { + "epoch": 0.42561619155922975, + "grad_norm": 1.6536914110183716, + "learning_rate": 8.924395201477025e-05, + "loss": 0.8917, + "step": 66620 + }, + { + "epoch": 0.42568007870896846, + "grad_norm": 0.7909001111984253, + "learning_rate": 8.924084261363228e-05, + "loss": 0.7676, + "step": 66630 + }, + { + "epoch": 0.42574396585870716, + "grad_norm": 0.40761637687683105, + "learning_rate": 8.923773281730505e-05, + "loss": 0.7697, + "step": 66640 + }, + { + "epoch": 0.42580785300844587, + "grad_norm": 0.8846787810325623, + "learning_rate": 8.923462262581994e-05, + "loss": 0.8687, + "step": 66650 + }, + { + "epoch": 0.4258717401581846, + "grad_norm": 0.4814871847629547, + "learning_rate": 8.923151203920822e-05, + "loss": 0.6312, + "step": 66660 + }, + { + "epoch": 0.4259356273079233, + "grad_norm": 0.6910040378570557, + "learning_rate": 8.922840105750124e-05, + "loss": 0.8927, + "step": 66670 + }, + { + "epoch": 0.425999514457662, + "grad_norm": 1.046462893486023, + "learning_rate": 8.922528968073032e-05, + "loss": 0.7882, + "step": 66680 + }, + { + "epoch": 0.4260634016074007, + "grad_norm": 1.2014803886413574, + "learning_rate": 8.92221779089268e-05, + "loss": 0.9165, + "step": 66690 + }, + { + "epoch": 0.4261272887571394, + "grad_norm": 0.9146868586540222, + "learning_rate": 8.921906574212202e-05, + "loss": 1.0733, + "step": 66700 + }, + { + "epoch": 0.4261911759068781, + "grad_norm": 0.8694166541099548, + "learning_rate": 8.92159531803473e-05, + "loss": 0.9912, + "step": 66710 + }, + { + "epoch": 0.4262550630566168, + "grad_norm": 0.904970645904541, + "learning_rate": 8.92131515370767e-05, + "loss": 1.1384, 
+ "step": 66720 + }, + { + "epoch": 0.4263189502063555, + "grad_norm": 1.0510960817337036, + "learning_rate": 8.92100382249455e-05, + "loss": 0.8948, + "step": 66730 + }, + { + "epoch": 0.42638283735609417, + "grad_norm": 0.8128019571304321, + "learning_rate": 8.920692451793531e-05, + "loss": 0.7585, + "step": 66740 + }, + { + "epoch": 0.4264467245058329, + "grad_norm": 0.7644541263580322, + "learning_rate": 8.920381041607746e-05, + "loss": 1.0066, + "step": 66750 + }, + { + "epoch": 0.4265106116555716, + "grad_norm": 0.6716737151145935, + "learning_rate": 8.920069591940332e-05, + "loss": 0.7818, + "step": 66760 + }, + { + "epoch": 0.4265744988053103, + "grad_norm": 0.5078364610671997, + "learning_rate": 8.919758102794427e-05, + "loss": 1.0828, + "step": 66770 + }, + { + "epoch": 0.426638385955049, + "grad_norm": 1.3749090433120728, + "learning_rate": 8.919446574173165e-05, + "loss": 0.7222, + "step": 66780 + }, + { + "epoch": 0.4267022731047877, + "grad_norm": 0.9173924922943115, + "learning_rate": 8.919135006079686e-05, + "loss": 0.9544, + "step": 66790 + }, + { + "epoch": 0.4267661602545264, + "grad_norm": 2.012134552001953, + "learning_rate": 8.918823398517127e-05, + "loss": 0.805, + "step": 66800 + }, + { + "epoch": 0.4268300474042651, + "grad_norm": 1.9749096632003784, + "learning_rate": 8.918511751488627e-05, + "loss": 0.9767, + "step": 66810 + }, + { + "epoch": 0.4268939345540038, + "grad_norm": 1.5169198513031006, + "learning_rate": 8.918200064997324e-05, + "loss": 1.0532, + "step": 66820 + }, + { + "epoch": 0.4269578217037425, + "grad_norm": 0.8941536545753479, + "learning_rate": 8.917888339046354e-05, + "loss": 0.9049, + "step": 66830 + }, + { + "epoch": 0.42702170885348123, + "grad_norm": 0.7928354144096375, + "learning_rate": 8.917576573638862e-05, + "loss": 0.7091, + "step": 66840 + }, + { + "epoch": 0.42708559600321994, + "grad_norm": 0.7303599119186401, + "learning_rate": 8.917264768777983e-05, + "loss": 0.9175, + "step": 66850 + }, + { + "epoch": 
0.4271494831529586, + "grad_norm": 0.6727188229560852, + "learning_rate": 8.91695292446686e-05, + "loss": 0.9231, + "step": 66860 + }, + { + "epoch": 0.4272133703026973, + "grad_norm": 0.5279465913772583, + "learning_rate": 8.91664104070863e-05, + "loss": 0.7118, + "step": 66870 + }, + { + "epoch": 0.427277257452436, + "grad_norm": 0.7597615122795105, + "learning_rate": 8.916329117506439e-05, + "loss": 0.9777, + "step": 66880 + }, + { + "epoch": 0.4273411446021747, + "grad_norm": 1.0109660625457764, + "learning_rate": 8.916017154863425e-05, + "loss": 0.8774, + "step": 66890 + }, + { + "epoch": 0.4274050317519134, + "grad_norm": 0.7892023324966431, + "learning_rate": 8.91570515278273e-05, + "loss": 1.0988, + "step": 66900 + }, + { + "epoch": 0.4274689189016521, + "grad_norm": 0.7960211038589478, + "learning_rate": 8.915393111267496e-05, + "loss": 0.8625, + "step": 66910 + }, + { + "epoch": 0.4275328060513908, + "grad_norm": 0.9462035894393921, + "learning_rate": 8.915081030320867e-05, + "loss": 0.9255, + "step": 66920 + }, + { + "epoch": 0.42759669320112953, + "grad_norm": 0.6910783648490906, + "learning_rate": 8.914768909945985e-05, + "loss": 0.695, + "step": 66930 + }, + { + "epoch": 0.42766058035086824, + "grad_norm": 0.8994881510734558, + "learning_rate": 8.914456750145991e-05, + "loss": 0.9296, + "step": 66940 + }, + { + "epoch": 0.42772446750060694, + "grad_norm": 0.7058424949645996, + "learning_rate": 8.914144550924034e-05, + "loss": 0.9154, + "step": 66950 + }, + { + "epoch": 0.42778835465034565, + "grad_norm": 0.9544531106948853, + "learning_rate": 8.913832312283254e-05, + "loss": 1.0751, + "step": 66960 + }, + { + "epoch": 0.42785224180008435, + "grad_norm": 1.3822722434997559, + "learning_rate": 8.913520034226797e-05, + "loss": 0.9816, + "step": 66970 + }, + { + "epoch": 0.42791612894982306, + "grad_norm": 0.7017986178398132, + "learning_rate": 8.913207716757807e-05, + "loss": 0.7602, + "step": 66980 + }, + { + "epoch": 0.4279800160995617, + "grad_norm": 
0.4731631577014923, + "learning_rate": 8.912895359879431e-05, + "loss": 1.0528, + "step": 66990 + }, + { + "epoch": 0.4280439032493004, + "grad_norm": 0.7982561588287354, + "learning_rate": 8.912582963594813e-05, + "loss": 0.831, + "step": 67000 + }, + { + "epoch": 0.4281077903990391, + "grad_norm": 1.1041196584701538, + "learning_rate": 8.912270527907099e-05, + "loss": 0.8662, + "step": 67010 + }, + { + "epoch": 0.42817167754877783, + "grad_norm": 1.0753505229949951, + "learning_rate": 8.911958052819436e-05, + "loss": 0.8874, + "step": 67020 + }, + { + "epoch": 0.42823556469851654, + "grad_norm": 1.0988258123397827, + "learning_rate": 8.911645538334971e-05, + "loss": 1.1105, + "step": 67030 + }, + { + "epoch": 0.42829945184825524, + "grad_norm": 3.2240488529205322, + "learning_rate": 8.911332984456854e-05, + "loss": 0.8623, + "step": 67040 + }, + { + "epoch": 0.42836333899799395, + "grad_norm": 0.5571461915969849, + "learning_rate": 8.911020391188229e-05, + "loss": 0.8196, + "step": 67050 + }, + { + "epoch": 0.42842722614773265, + "grad_norm": 1.0421580076217651, + "learning_rate": 8.910707758532244e-05, + "loss": 0.8394, + "step": 67060 + }, + { + "epoch": 0.42849111329747136, + "grad_norm": 1.025112509727478, + "learning_rate": 8.91039508649205e-05, + "loss": 0.7116, + "step": 67070 + }, + { + "epoch": 0.42855500044721007, + "grad_norm": 1.0207161903381348, + "learning_rate": 8.910082375070792e-05, + "loss": 1.3015, + "step": 67080 + }, + { + "epoch": 0.42861888759694877, + "grad_norm": 0.5158314108848572, + "learning_rate": 8.909769624271625e-05, + "loss": 0.873, + "step": 67090 + }, + { + "epoch": 0.4286827747466875, + "grad_norm": 2.0511646270751953, + "learning_rate": 8.909456834097693e-05, + "loss": 0.7935, + "step": 67100 + }, + { + "epoch": 0.42874666189642613, + "grad_norm": 0.5745459198951721, + "learning_rate": 8.909144004552148e-05, + "loss": 1.0678, + "step": 67110 + }, + { + "epoch": 0.42881054904616484, + "grad_norm": 1.3202085494995117, + 
"learning_rate": 8.908831135638143e-05, + "loss": 0.6992, + "step": 67120 + }, + { + "epoch": 0.42887443619590354, + "grad_norm": 1.3148435354232788, + "learning_rate": 8.908518227358826e-05, + "loss": 0.7438, + "step": 67130 + }, + { + "epoch": 0.42893832334564225, + "grad_norm": 1.0374261140823364, + "learning_rate": 8.908205279717349e-05, + "loss": 0.9165, + "step": 67140 + }, + { + "epoch": 0.42900221049538095, + "grad_norm": 1.0254135131835938, + "learning_rate": 8.907892292716864e-05, + "loss": 1.087, + "step": 67150 + }, + { + "epoch": 0.42906609764511966, + "grad_norm": 0.7220088839530945, + "learning_rate": 8.907579266360523e-05, + "loss": 1.0477, + "step": 67160 + }, + { + "epoch": 0.42912998479485837, + "grad_norm": 0.8464503884315491, + "learning_rate": 8.907266200651478e-05, + "loss": 0.9686, + "step": 67170 + }, + { + "epoch": 0.42919387194459707, + "grad_norm": 0.6220828890800476, + "learning_rate": 8.906953095592882e-05, + "loss": 0.7929, + "step": 67180 + }, + { + "epoch": 0.4292577590943358, + "grad_norm": 0.8189134001731873, + "learning_rate": 8.906639951187889e-05, + "loss": 0.9221, + "step": 67190 + }, + { + "epoch": 0.4293216462440745, + "grad_norm": 0.879299521446228, + "learning_rate": 8.906326767439651e-05, + "loss": 0.8583, + "step": 67200 + }, + { + "epoch": 0.4293855333938132, + "grad_norm": 0.9902644157409668, + "learning_rate": 8.906013544351323e-05, + "loss": 0.8649, + "step": 67210 + }, + { + "epoch": 0.4294494205435519, + "grad_norm": 0.727925717830658, + "learning_rate": 8.905700281926061e-05, + "loss": 0.8093, + "step": 67220 + }, + { + "epoch": 0.42951330769329055, + "grad_norm": 0.6252252459526062, + "learning_rate": 8.905386980167016e-05, + "loss": 0.7309, + "step": 67230 + }, + { + "epoch": 0.42957719484302925, + "grad_norm": 1.9642329216003418, + "learning_rate": 8.905073639077347e-05, + "loss": 0.9235, + "step": 67240 + }, + { + "epoch": 0.42964108199276796, + "grad_norm": 0.7746663689613342, + "learning_rate": 
8.904760258660208e-05, + "loss": 0.9314, + "step": 67250 + }, + { + "epoch": 0.42970496914250667, + "grad_norm": 0.423170804977417, + "learning_rate": 8.904446838918754e-05, + "loss": 0.9009, + "step": 67260 + }, + { + "epoch": 0.42976885629224537, + "grad_norm": 1.2594034671783447, + "learning_rate": 8.904133379856143e-05, + "loss": 0.9342, + "step": 67270 + }, + { + "epoch": 0.4298327434419841, + "grad_norm": 0.9244500994682312, + "learning_rate": 8.903819881475532e-05, + "loss": 0.9128, + "step": 67280 + }, + { + "epoch": 0.4298966305917228, + "grad_norm": 0.9682210683822632, + "learning_rate": 8.903506343780077e-05, + "loss": 0.8821, + "step": 67290 + }, + { + "epoch": 0.4299605177414615, + "grad_norm": 1.104791283607483, + "learning_rate": 8.903192766772936e-05, + "loss": 1.0183, + "step": 67300 + }, + { + "epoch": 0.4300244048912002, + "grad_norm": 1.1504932641983032, + "learning_rate": 8.902879150457269e-05, + "loss": 0.7472, + "step": 67310 + }, + { + "epoch": 0.4300882920409389, + "grad_norm": 0.5592100024223328, + "learning_rate": 8.90256549483623e-05, + "loss": 0.8123, + "step": 67320 + }, + { + "epoch": 0.4301521791906776, + "grad_norm": 1.0708913803100586, + "learning_rate": 8.902251799912981e-05, + "loss": 0.7882, + "step": 67330 + }, + { + "epoch": 0.4302160663404163, + "grad_norm": 0.6294905543327332, + "learning_rate": 8.90193806569068e-05, + "loss": 0.8449, + "step": 67340 + }, + { + "epoch": 0.43027995349015496, + "grad_norm": 1.0562630891799927, + "learning_rate": 8.901624292172488e-05, + "loss": 1.2612, + "step": 67350 + }, + { + "epoch": 0.43034384063989367, + "grad_norm": 0.6391942501068115, + "learning_rate": 8.901310479361564e-05, + "loss": 0.9626, + "step": 67360 + }, + { + "epoch": 0.4304077277896324, + "grad_norm": 0.8884569406509399, + "learning_rate": 8.900996627261067e-05, + "loss": 0.9499, + "step": 67370 + }, + { + "epoch": 0.4304716149393711, + "grad_norm": 1.3086752891540527, + "learning_rate": 8.90068273587416e-05, + "loss": 
0.9847, + "step": 67380 + }, + { + "epoch": 0.4305355020891098, + "grad_norm": 0.8015036582946777, + "learning_rate": 8.900368805204003e-05, + "loss": 0.9094, + "step": 67390 + }, + { + "epoch": 0.4305993892388485, + "grad_norm": 0.5839217901229858, + "learning_rate": 8.900054835253758e-05, + "loss": 0.9917, + "step": 67400 + }, + { + "epoch": 0.4306632763885872, + "grad_norm": 1.5205440521240234, + "learning_rate": 8.899740826026587e-05, + "loss": 0.7, + "step": 67410 + }, + { + "epoch": 0.4307271635383259, + "grad_norm": 0.9681718349456787, + "learning_rate": 8.899426777525653e-05, + "loss": 0.7742, + "step": 67420 + }, + { + "epoch": 0.4307910506880646, + "grad_norm": 0.8119606375694275, + "learning_rate": 8.899112689754117e-05, + "loss": 0.8792, + "step": 67430 + }, + { + "epoch": 0.4308549378378033, + "grad_norm": 0.8435991406440735, + "learning_rate": 8.898798562715142e-05, + "loss": 1.0099, + "step": 67440 + }, + { + "epoch": 0.430918824987542, + "grad_norm": 0.4675378203392029, + "learning_rate": 8.898484396411894e-05, + "loss": 0.8346, + "step": 67450 + }, + { + "epoch": 0.43098271213728073, + "grad_norm": 0.8612586855888367, + "learning_rate": 8.898170190847535e-05, + "loss": 0.7461, + "step": 67460 + }, + { + "epoch": 0.4310465992870194, + "grad_norm": 0.769745409488678, + "learning_rate": 8.897855946025228e-05, + "loss": 0.9233, + "step": 67470 + }, + { + "epoch": 0.4311104864367581, + "grad_norm": 1.4678987264633179, + "learning_rate": 8.897541661948142e-05, + "loss": 0.7533, + "step": 67480 + }, + { + "epoch": 0.4311743735864968, + "grad_norm": 1.0737018585205078, + "learning_rate": 8.897227338619438e-05, + "loss": 0.6886, + "step": 67490 + }, + { + "epoch": 0.4312382607362355, + "grad_norm": 0.6413093209266663, + "learning_rate": 8.896912976042285e-05, + "loss": 0.9434, + "step": 67500 + }, + { + "epoch": 0.4313021478859742, + "grad_norm": 0.8239714503288269, + "learning_rate": 8.896598574219845e-05, + "loss": 0.904, + "step": 67510 + }, + { + 
"epoch": 0.4313660350357129, + "grad_norm": 0.8861196041107178, + "learning_rate": 8.896284133155288e-05, + "loss": 0.909, + "step": 67520 + }, + { + "epoch": 0.4314299221854516, + "grad_norm": 0.7210700511932373, + "learning_rate": 8.895969652851778e-05, + "loss": 0.9084, + "step": 67530 + }, + { + "epoch": 0.4314938093351903, + "grad_norm": 1.172956943511963, + "learning_rate": 8.895655133312483e-05, + "loss": 0.9011, + "step": 67540 + }, + { + "epoch": 0.43155769648492903, + "grad_norm": 0.9112328886985779, + "learning_rate": 8.895340574540571e-05, + "loss": 0.7824, + "step": 67550 + }, + { + "epoch": 0.43162158363466774, + "grad_norm": 1.0518110990524292, + "learning_rate": 8.895025976539209e-05, + "loss": 1.2023, + "step": 67560 + }, + { + "epoch": 0.43168547078440644, + "grad_norm": 0.8246524930000305, + "learning_rate": 8.894711339311567e-05, + "loss": 1.0688, + "step": 67570 + }, + { + "epoch": 0.43174935793414515, + "grad_norm": 0.9622389078140259, + "learning_rate": 8.894396662860811e-05, + "loss": 0.6852, + "step": 67580 + }, + { + "epoch": 0.4318132450838838, + "grad_norm": 0.7277495265007019, + "learning_rate": 8.894081947190112e-05, + "loss": 0.8892, + "step": 67590 + }, + { + "epoch": 0.4318771322336225, + "grad_norm": 0.5220545530319214, + "learning_rate": 8.893767192302639e-05, + "loss": 0.7688, + "step": 67600 + }, + { + "epoch": 0.4319410193833612, + "grad_norm": 0.7757664322853088, + "learning_rate": 8.893452398201561e-05, + "loss": 0.9166, + "step": 67610 + }, + { + "epoch": 0.4320049065330999, + "grad_norm": 1.4959086179733276, + "learning_rate": 8.89313756489005e-05, + "loss": 1.0149, + "step": 67620 + }, + { + "epoch": 0.4320687936828386, + "grad_norm": 1.3954126834869385, + "learning_rate": 8.892822692371277e-05, + "loss": 0.802, + "step": 67630 + }, + { + "epoch": 0.43213268083257733, + "grad_norm": 0.7430549263954163, + "learning_rate": 8.89250778064841e-05, + "loss": 0.8956, + "step": 67640 + }, + { + "epoch": 0.43219656798231604, + 
"grad_norm": 1.650481939315796, + "learning_rate": 8.892192829724621e-05, + "loss": 1.1669, + "step": 67650 + }, + { + "epoch": 0.43226045513205474, + "grad_norm": 1.7691149711608887, + "learning_rate": 8.891877839603085e-05, + "loss": 1.1042, + "step": 67660 + }, + { + "epoch": 0.43232434228179345, + "grad_norm": 0.6639987230300903, + "learning_rate": 8.891562810286971e-05, + "loss": 0.8842, + "step": 67670 + }, + { + "epoch": 0.43238822943153216, + "grad_norm": 0.9073365926742554, + "learning_rate": 8.891247741779454e-05, + "loss": 1.1714, + "step": 67680 + }, + { + "epoch": 0.43245211658127086, + "grad_norm": 1.1349682807922363, + "learning_rate": 8.890932634083704e-05, + "loss": 0.9899, + "step": 67690 + }, + { + "epoch": 0.43251600373100957, + "grad_norm": 0.7573813796043396, + "learning_rate": 8.890617487202899e-05, + "loss": 0.8316, + "step": 67700 + }, + { + "epoch": 0.4325798908807482, + "grad_norm": 0.7431557178497314, + "learning_rate": 8.890302301140208e-05, + "loss": 0.8598, + "step": 67710 + }, + { + "epoch": 0.4326437780304869, + "grad_norm": 0.6789889931678772, + "learning_rate": 8.889987075898807e-05, + "loss": 1.1971, + "step": 67720 + }, + { + "epoch": 0.43270766518022563, + "grad_norm": 0.5719479322433472, + "learning_rate": 8.889671811481872e-05, + "loss": 0.6596, + "step": 67730 + }, + { + "epoch": 0.43277155232996434, + "grad_norm": 0.8801824450492859, + "learning_rate": 8.889356507892575e-05, + "loss": 0.8168, + "step": 67740 + }, + { + "epoch": 0.43283543947970304, + "grad_norm": 1.7005552053451538, + "learning_rate": 8.889041165134096e-05, + "loss": 0.8598, + "step": 67750 + }, + { + "epoch": 0.43289932662944175, + "grad_norm": 0.5636479258537292, + "learning_rate": 8.888725783209606e-05, + "loss": 0.7868, + "step": 67760 + }, + { + "epoch": 0.43296321377918046, + "grad_norm": 0.9649848937988281, + "learning_rate": 8.888410362122283e-05, + "loss": 0.8729, + "step": 67770 + }, + { + "epoch": 0.43302710092891916, + "grad_norm": 
1.2120856046676636, + "learning_rate": 8.888094901875303e-05, + "loss": 1.1061, + "step": 67780 + }, + { + "epoch": 0.43309098807865787, + "grad_norm": 1.1897577047348022, + "learning_rate": 8.887779402471846e-05, + "loss": 0.8963, + "step": 67790 + }, + { + "epoch": 0.4331548752283966, + "grad_norm": 0.8927859663963318, + "learning_rate": 8.887463863915087e-05, + "loss": 0.985, + "step": 67800 + }, + { + "epoch": 0.4332187623781353, + "grad_norm": 1.1183792352676392, + "learning_rate": 8.887148286208202e-05, + "loss": 1.0094, + "step": 67810 + }, + { + "epoch": 0.433282649527874, + "grad_norm": 1.071887731552124, + "learning_rate": 8.886832669354372e-05, + "loss": 0.8359, + "step": 67820 + }, + { + "epoch": 0.4333465366776127, + "grad_norm": 0.6402618885040283, + "learning_rate": 8.886517013356774e-05, + "loss": 1.0026, + "step": 67830 + }, + { + "epoch": 0.43341042382735134, + "grad_norm": 1.0560641288757324, + "learning_rate": 8.886201318218587e-05, + "loss": 0.7045, + "step": 67840 + }, + { + "epoch": 0.43347431097709005, + "grad_norm": 0.9585883021354675, + "learning_rate": 8.88588558394299e-05, + "loss": 0.9572, + "step": 67850 + }, + { + "epoch": 0.43353819812682876, + "grad_norm": 0.7981050610542297, + "learning_rate": 8.885569810533166e-05, + "loss": 0.7819, + "step": 67860 + }, + { + "epoch": 0.43360208527656746, + "grad_norm": 1.467461347579956, + "learning_rate": 8.88525399799229e-05, + "loss": 0.9183, + "step": 67870 + }, + { + "epoch": 0.43366597242630617, + "grad_norm": 0.9360789060592651, + "learning_rate": 8.884938146323546e-05, + "loss": 1.0038, + "step": 67880 + }, + { + "epoch": 0.4337298595760449, + "grad_norm": 1.3165303468704224, + "learning_rate": 8.884622255530116e-05, + "loss": 0.8743, + "step": 67890 + }, + { + "epoch": 0.4337937467257836, + "grad_norm": 1.1677271127700806, + "learning_rate": 8.884306325615174e-05, + "loss": 1.0382, + "step": 67900 + }, + { + "epoch": 0.4338576338755223, + "grad_norm": 1.1823782920837402, + 
"learning_rate": 8.883990356581911e-05, + "loss": 0.8917, + "step": 67910 + }, + { + "epoch": 0.433921521025261, + "grad_norm": 0.8433313369750977, + "learning_rate": 8.883674348433504e-05, + "loss": 0.8236, + "step": 67920 + }, + { + "epoch": 0.4339854081749997, + "grad_norm": 1.1049748659133911, + "learning_rate": 8.883358301173138e-05, + "loss": 0.7639, + "step": 67930 + }, + { + "epoch": 0.4340492953247384, + "grad_norm": 0.8020467162132263, + "learning_rate": 8.883042214803991e-05, + "loss": 0.9805, + "step": 67940 + }, + { + "epoch": 0.4341131824744771, + "grad_norm": 0.5183336734771729, + "learning_rate": 8.882726089329252e-05, + "loss": 0.9406, + "step": 67950 + }, + { + "epoch": 0.43417706962421576, + "grad_norm": 0.9549485445022583, + "learning_rate": 8.882409924752102e-05, + "loss": 0.7904, + "step": 67960 + }, + { + "epoch": 0.43424095677395447, + "grad_norm": 0.9031966924667358, + "learning_rate": 8.882093721075724e-05, + "loss": 0.6085, + "step": 67970 + }, + { + "epoch": 0.4343048439236932, + "grad_norm": 0.7417629957199097, + "learning_rate": 8.881777478303306e-05, + "loss": 0.979, + "step": 67980 + }, + { + "epoch": 0.4343687310734319, + "grad_norm": 0.742239236831665, + "learning_rate": 8.881461196438027e-05, + "loss": 1.0707, + "step": 67990 + }, + { + "epoch": 0.4344326182231706, + "grad_norm": 1.0497804880142212, + "learning_rate": 8.88114487548308e-05, + "loss": 0.8717, + "step": 68000 + }, + { + "epoch": 0.4344965053729093, + "grad_norm": 0.7527285814285278, + "learning_rate": 8.880828515441643e-05, + "loss": 1.0762, + "step": 68010 + }, + { + "epoch": 0.434560392522648, + "grad_norm": 0.8218625783920288, + "learning_rate": 8.880512116316908e-05, + "loss": 1.0556, + "step": 68020 + }, + { + "epoch": 0.4346242796723867, + "grad_norm": 1.8415364027023315, + "learning_rate": 8.880195678112058e-05, + "loss": 1.1582, + "step": 68030 + }, + { + "epoch": 0.4346881668221254, + "grad_norm": 0.6465769410133362, + "learning_rate": 8.87987920083028e-05, 
+ "loss": 1.0762, + "step": 68040 + }, + { + "epoch": 0.4347520539718641, + "grad_norm": 0.6471286416053772, + "learning_rate": 8.879562684474762e-05, + "loss": 1.2511, + "step": 68050 + }, + { + "epoch": 0.4348159411216028, + "grad_norm": 0.6721779704093933, + "learning_rate": 8.879246129048693e-05, + "loss": 0.8825, + "step": 68060 + }, + { + "epoch": 0.43487982827134153, + "grad_norm": 0.8682761788368225, + "learning_rate": 8.878929534555259e-05, + "loss": 1.0418, + "step": 68070 + }, + { + "epoch": 0.4349437154210802, + "grad_norm": 0.7083001732826233, + "learning_rate": 8.878612900997648e-05, + "loss": 0.9285, + "step": 68080 + }, + { + "epoch": 0.4350076025708189, + "grad_norm": 0.7909469604492188, + "learning_rate": 8.878296228379048e-05, + "loss": 0.9, + "step": 68090 + }, + { + "epoch": 0.4350714897205576, + "grad_norm": 0.7747198939323425, + "learning_rate": 8.877979516702651e-05, + "loss": 0.7877, + "step": 68100 + }, + { + "epoch": 0.4351353768702963, + "grad_norm": 1.1311992406845093, + "learning_rate": 8.877662765971646e-05, + "loss": 0.9031, + "step": 68110 + }, + { + "epoch": 0.435199264020035, + "grad_norm": 0.8452590107917786, + "learning_rate": 8.877345976189223e-05, + "loss": 0.8362, + "step": 68120 + }, + { + "epoch": 0.4352631511697737, + "grad_norm": 1.3919566869735718, + "learning_rate": 8.877029147358571e-05, + "loss": 0.8168, + "step": 68130 + }, + { + "epoch": 0.4353270383195124, + "grad_norm": 1.0793455839157104, + "learning_rate": 8.87671227948288e-05, + "loss": 0.6657, + "step": 68140 + }, + { + "epoch": 0.4353909254692511, + "grad_norm": 0.6547946929931641, + "learning_rate": 8.876395372565344e-05, + "loss": 1.1194, + "step": 68150 + }, + { + "epoch": 0.43545481261898983, + "grad_norm": 1.289340615272522, + "learning_rate": 8.876078426609153e-05, + "loss": 1.0495, + "step": 68160 + }, + { + "epoch": 0.43551869976872853, + "grad_norm": 1.287331223487854, + "learning_rate": 8.875761441617498e-05, + "loss": 0.8023, + "step": 68170 + }, + 
{ + "epoch": 0.43558258691846724, + "grad_norm": 1.054658055305481, + "learning_rate": 8.875444417593574e-05, + "loss": 0.8072, + "step": 68180 + }, + { + "epoch": 0.43564647406820595, + "grad_norm": 1.5471371412277222, + "learning_rate": 8.87512735454057e-05, + "loss": 0.8984, + "step": 68190 + }, + { + "epoch": 0.4357103612179446, + "grad_norm": 0.9853270649909973, + "learning_rate": 8.874810252461683e-05, + "loss": 0.8457, + "step": 68200 + }, + { + "epoch": 0.4357742483676833, + "grad_norm": 0.8379093408584595, + "learning_rate": 8.874493111360103e-05, + "loss": 1.0092, + "step": 68210 + }, + { + "epoch": 0.435838135517422, + "grad_norm": 0.6254721879959106, + "learning_rate": 8.874175931239026e-05, + "loss": 0.792, + "step": 68220 + }, + { + "epoch": 0.4359020226671607, + "grad_norm": 0.5673577189445496, + "learning_rate": 8.873858712101645e-05, + "loss": 0.7041, + "step": 68230 + }, + { + "epoch": 0.4359659098168994, + "grad_norm": 0.8581469058990479, + "learning_rate": 8.873541453951157e-05, + "loss": 1.118, + "step": 68240 + }, + { + "epoch": 0.43602979696663813, + "grad_norm": 0.7700116634368896, + "learning_rate": 8.873224156790754e-05, + "loss": 0.9587, + "step": 68250 + }, + { + "epoch": 0.43609368411637683, + "grad_norm": 1.4901466369628906, + "learning_rate": 8.872906820623634e-05, + "loss": 0.9082, + "step": 68260 + }, + { + "epoch": 0.43615757126611554, + "grad_norm": 1.1333754062652588, + "learning_rate": 8.872589445452991e-05, + "loss": 0.8202, + "step": 68270 + }, + { + "epoch": 0.43622145841585425, + "grad_norm": 0.4992083013057709, + "learning_rate": 8.872272031282022e-05, + "loss": 0.8428, + "step": 68280 + }, + { + "epoch": 0.43628534556559295, + "grad_norm": 0.7288440465927124, + "learning_rate": 8.871954578113925e-05, + "loss": 0.7839, + "step": 68290 + }, + { + "epoch": 0.43634923271533166, + "grad_norm": 1.4860522747039795, + "learning_rate": 8.871637085951894e-05, + "loss": 0.9678, + "step": 68300 + }, + { + "epoch": 0.43641311986507036, 
+ "grad_norm": 0.8923503756523132, + "learning_rate": 8.87131955479913e-05, + "loss": 0.9322, + "step": 68310 + }, + { + "epoch": 0.436477007014809, + "grad_norm": 1.1527504920959473, + "learning_rate": 8.871001984658826e-05, + "loss": 1.0341, + "step": 68320 + }, + { + "epoch": 0.4365408941645477, + "grad_norm": 0.9049966931343079, + "learning_rate": 8.870684375534185e-05, + "loss": 1.0123, + "step": 68330 + }, + { + "epoch": 0.4366047813142864, + "grad_norm": 0.6281135678291321, + "learning_rate": 8.870366727428404e-05, + "loss": 0.9563, + "step": 68340 + }, + { + "epoch": 0.43666866846402513, + "grad_norm": 0.6897270679473877, + "learning_rate": 8.870049040344682e-05, + "loss": 0.8434, + "step": 68350 + }, + { + "epoch": 0.43673255561376384, + "grad_norm": 1.3322041034698486, + "learning_rate": 8.869731314286215e-05, + "loss": 1.0403, + "step": 68360 + }, + { + "epoch": 0.43679644276350255, + "grad_norm": 0.9318044781684875, + "learning_rate": 8.869413549256209e-05, + "loss": 0.8422, + "step": 68370 + }, + { + "epoch": 0.43686032991324125, + "grad_norm": 0.8586065769195557, + "learning_rate": 8.86909574525786e-05, + "loss": 0.8878, + "step": 68380 + }, + { + "epoch": 0.43692421706297996, + "grad_norm": 1.9818271398544312, + "learning_rate": 8.86877790229437e-05, + "loss": 0.7168, + "step": 68390 + }, + { + "epoch": 0.43698810421271866, + "grad_norm": 0.7556184530258179, + "learning_rate": 8.868460020368941e-05, + "loss": 1.0074, + "step": 68400 + }, + { + "epoch": 0.43705199136245737, + "grad_norm": 0.5547859072685242, + "learning_rate": 8.868142099484771e-05, + "loss": 0.9824, + "step": 68410 + }, + { + "epoch": 0.4371158785121961, + "grad_norm": 0.8270901441574097, + "learning_rate": 8.867824139645063e-05, + "loss": 0.6677, + "step": 68420 + }, + { + "epoch": 0.4371797656619348, + "grad_norm": 0.7375511527061462, + "learning_rate": 8.867506140853021e-05, + "loss": 0.8542, + "step": 68430 + }, + { + "epoch": 0.43724365281167343, + "grad_norm": 
0.879522979259491, + "learning_rate": 8.867188103111845e-05, + "loss": 0.8551, + "step": 68440 + }, + { + "epoch": 0.43730753996141214, + "grad_norm": 1.1079013347625732, + "learning_rate": 8.866870026424741e-05, + "loss": 1.1122, + "step": 68450 + }, + { + "epoch": 0.43737142711115085, + "grad_norm": 1.25412917137146, + "learning_rate": 8.86655191079491e-05, + "loss": 0.8394, + "step": 68460 + }, + { + "epoch": 0.43743531426088955, + "grad_norm": 0.7833040952682495, + "learning_rate": 8.866233756225555e-05, + "loss": 0.8275, + "step": 68470 + }, + { + "epoch": 0.43749920141062826, + "grad_norm": 1.0346733331680298, + "learning_rate": 8.865915562719882e-05, + "loss": 0.8503, + "step": 68480 + }, + { + "epoch": 0.43756308856036696, + "grad_norm": 0.9302981495857239, + "learning_rate": 8.865597330281096e-05, + "loss": 0.7965, + "step": 68490 + }, + { + "epoch": 0.43762697571010567, + "grad_norm": 0.8941460251808167, + "learning_rate": 8.8652790589124e-05, + "loss": 0.7823, + "step": 68500 + }, + { + "epoch": 0.4376908628598444, + "grad_norm": 0.7403380870819092, + "learning_rate": 8.864960748617e-05, + "loss": 0.9164, + "step": 68510 + }, + { + "epoch": 0.4377547500095831, + "grad_norm": 1.2985106706619263, + "learning_rate": 8.8646423993981e-05, + "loss": 0.9005, + "step": 68520 + }, + { + "epoch": 0.4378186371593218, + "grad_norm": 0.5682730078697205, + "learning_rate": 8.864324011258908e-05, + "loss": 0.8248, + "step": 68530 + }, + { + "epoch": 0.4378825243090605, + "grad_norm": 1.3618555068969727, + "learning_rate": 8.864005584202632e-05, + "loss": 1.1664, + "step": 68540 + }, + { + "epoch": 0.4379464114587992, + "grad_norm": 0.6019179224967957, + "learning_rate": 8.863687118232475e-05, + "loss": 0.8097, + "step": 68550 + }, + { + "epoch": 0.43801029860853785, + "grad_norm": 1.4094189405441284, + "learning_rate": 8.863368613351648e-05, + "loss": 0.7467, + "step": 68560 + }, + { + "epoch": 0.43807418575827656, + "grad_norm": 1.509199857711792, + "learning_rate": 
8.863050069563355e-05, + "loss": 0.9534, + "step": 68570 + }, + { + "epoch": 0.43813807290801526, + "grad_norm": 1.1251524686813354, + "learning_rate": 8.862731486870808e-05, + "loss": 0.9511, + "step": 68580 + }, + { + "epoch": 0.43820196005775397, + "grad_norm": 0.9050050973892212, + "learning_rate": 8.862412865277211e-05, + "loss": 0.9554, + "step": 68590 + }, + { + "epoch": 0.4382658472074927, + "grad_norm": 0.6649369597434998, + "learning_rate": 8.862094204785776e-05, + "loss": 0.8778, + "step": 68600 + }, + { + "epoch": 0.4383297343572314, + "grad_norm": 0.7536949515342712, + "learning_rate": 8.86177550539971e-05, + "loss": 0.8764, + "step": 68610 + }, + { + "epoch": 0.4383936215069701, + "grad_norm": 0.898378312587738, + "learning_rate": 8.861456767122226e-05, + "loss": 1.0107, + "step": 68620 + }, + { + "epoch": 0.4384575086567088, + "grad_norm": 2.269949436187744, + "learning_rate": 8.861137989956529e-05, + "loss": 0.7672, + "step": 68630 + }, + { + "epoch": 0.4385213958064475, + "grad_norm": 6.667402267456055, + "learning_rate": 8.860819173905835e-05, + "loss": 0.7432, + "step": 68640 + }, + { + "epoch": 0.4385852829561862, + "grad_norm": 0.7865056395530701, + "learning_rate": 8.860500318973351e-05, + "loss": 0.9813, + "step": 68650 + }, + { + "epoch": 0.4386491701059249, + "grad_norm": 2.528974771499634, + "learning_rate": 8.860181425162287e-05, + "loss": 0.8215, + "step": 68660 + }, + { + "epoch": 0.4387130572556636, + "grad_norm": 0.5087980031967163, + "learning_rate": 8.859862492475858e-05, + "loss": 1.2262, + "step": 68670 + }, + { + "epoch": 0.4387769444054023, + "grad_norm": 1.1823939085006714, + "learning_rate": 8.859543520917275e-05, + "loss": 0.7388, + "step": 68680 + }, + { + "epoch": 0.438840831555141, + "grad_norm": 0.7431660294532776, + "learning_rate": 8.859224510489747e-05, + "loss": 0.6863, + "step": 68690 + }, + { + "epoch": 0.4389047187048797, + "grad_norm": 1.038490653038025, + "learning_rate": 8.858905461196492e-05, + "loss": 0.853, + 
"step": 68700 + }, + { + "epoch": 0.4389686058546184, + "grad_norm": 0.9958590269088745, + "learning_rate": 8.85858637304072e-05, + "loss": 0.8119, + "step": 68710 + }, + { + "epoch": 0.4390324930043571, + "grad_norm": 0.6802636384963989, + "learning_rate": 8.858267246025645e-05, + "loss": 1.0443, + "step": 68720 + }, + { + "epoch": 0.4390963801540958, + "grad_norm": 1.3269374370574951, + "learning_rate": 8.857948080154481e-05, + "loss": 0.8071, + "step": 68730 + }, + { + "epoch": 0.4391602673038345, + "grad_norm": 0.7724654078483582, + "learning_rate": 8.857628875430444e-05, + "loss": 0.7978, + "step": 68740 + }, + { + "epoch": 0.4392241544535732, + "grad_norm": 0.7245962023735046, + "learning_rate": 8.857309631856745e-05, + "loss": 0.8891, + "step": 68750 + }, + { + "epoch": 0.4392880416033119, + "grad_norm": 0.9760708808898926, + "learning_rate": 8.8569903494366e-05, + "loss": 0.9933, + "step": 68760 + }, + { + "epoch": 0.4393519287530506, + "grad_norm": 0.828381359577179, + "learning_rate": 8.856671028173227e-05, + "loss": 0.949, + "step": 68770 + }, + { + "epoch": 0.43941581590278933, + "grad_norm": 0.948192834854126, + "learning_rate": 8.85635166806984e-05, + "loss": 0.785, + "step": 68780 + }, + { + "epoch": 0.43947970305252804, + "grad_norm": 1.3965764045715332, + "learning_rate": 8.856032269129655e-05, + "loss": 0.6816, + "step": 68790 + }, + { + "epoch": 0.43954359020226674, + "grad_norm": 0.6188552975654602, + "learning_rate": 8.85571283135589e-05, + "loss": 0.7448, + "step": 68800 + }, + { + "epoch": 0.4396074773520054, + "grad_norm": 0.7305311560630798, + "learning_rate": 8.85539335475176e-05, + "loss": 0.8134, + "step": 68810 + }, + { + "epoch": 0.4396713645017441, + "grad_norm": 0.6679476499557495, + "learning_rate": 8.855073839320484e-05, + "loss": 0.8225, + "step": 68820 + }, + { + "epoch": 0.4397352516514828, + "grad_norm": 0.41290047764778137, + "learning_rate": 8.85475428506528e-05, + "loss": 0.8303, + "step": 68830 + }, + { + "epoch": 
0.4397991388012215, + "grad_norm": 1.389434814453125, + "learning_rate": 8.854434691989365e-05, + "loss": 1.0117, + "step": 68840 + }, + { + "epoch": 0.4398630259509602, + "grad_norm": 0.970970869064331, + "learning_rate": 8.854115060095958e-05, + "loss": 0.9634, + "step": 68850 + }, + { + "epoch": 0.4399269131006989, + "grad_norm": 0.8621498346328735, + "learning_rate": 8.853795389388277e-05, + "loss": 0.7216, + "step": 68860 + }, + { + "epoch": 0.43999080025043763, + "grad_norm": 0.8945342898368835, + "learning_rate": 8.853475679869545e-05, + "loss": 0.924, + "step": 68870 + }, + { + "epoch": 0.44005468740017634, + "grad_norm": 0.7587364315986633, + "learning_rate": 8.853155931542978e-05, + "loss": 0.9817, + "step": 68880 + }, + { + "epoch": 0.44011857454991504, + "grad_norm": 0.9429205656051636, + "learning_rate": 8.852836144411795e-05, + "loss": 1.1741, + "step": 68890 + }, + { + "epoch": 0.44018246169965375, + "grad_norm": 0.9457645416259766, + "learning_rate": 8.852516318479223e-05, + "loss": 0.8122, + "step": 68900 + }, + { + "epoch": 0.44024634884939245, + "grad_norm": 0.8908385038375854, + "learning_rate": 8.852196453748476e-05, + "loss": 1.0426, + "step": 68910 + }, + { + "epoch": 0.44031023599913116, + "grad_norm": 1.4087450504302979, + "learning_rate": 8.851876550222779e-05, + "loss": 0.6433, + "step": 68920 + }, + { + "epoch": 0.4403741231488698, + "grad_norm": 0.8311522006988525, + "learning_rate": 8.851556607905351e-05, + "loss": 0.8959, + "step": 68930 + }, + { + "epoch": 0.4404380102986085, + "grad_norm": 0.6747666597366333, + "learning_rate": 8.851236626799419e-05, + "loss": 1.1469, + "step": 68940 + }, + { + "epoch": 0.4405018974483472, + "grad_norm": 0.8693909049034119, + "learning_rate": 8.850916606908199e-05, + "loss": 0.7576, + "step": 68950 + }, + { + "epoch": 0.44056578459808593, + "grad_norm": 0.6947962045669556, + "learning_rate": 8.85059654823492e-05, + "loss": 0.9861, + "step": 68960 + }, + { + "epoch": 0.44062967174782464, + 
"grad_norm": 4.578150749206543, + "learning_rate": 8.850276450782802e-05, + "loss": 1.0223, + "step": 68970 + }, + { + "epoch": 0.44069355889756334, + "grad_norm": 0.8332919478416443, + "learning_rate": 8.849956314555068e-05, + "loss": 0.7311, + "step": 68980 + }, + { + "epoch": 0.44075744604730205, + "grad_norm": 0.8713606595993042, + "learning_rate": 8.849636139554945e-05, + "loss": 0.8488, + "step": 68990 + }, + { + "epoch": 0.44082133319704075, + "grad_norm": 0.8420679569244385, + "learning_rate": 8.849315925785654e-05, + "loss": 0.9619, + "step": 69000 + }, + { + "epoch": 0.44088522034677946, + "grad_norm": 0.9233155846595764, + "learning_rate": 8.848995673250421e-05, + "loss": 0.9395, + "step": 69010 + }, + { + "epoch": 0.44094910749651817, + "grad_norm": 1.0304968357086182, + "learning_rate": 8.848675381952474e-05, + "loss": 0.7857, + "step": 69020 + }, + { + "epoch": 0.4410129946462569, + "grad_norm": 1.142500638961792, + "learning_rate": 8.848355051895035e-05, + "loss": 0.7173, + "step": 69030 + }, + { + "epoch": 0.4410768817959956, + "grad_norm": 1.1199169158935547, + "learning_rate": 8.848034683081332e-05, + "loss": 0.7658, + "step": 69040 + }, + { + "epoch": 0.44114076894573423, + "grad_norm": 0.6068952679634094, + "learning_rate": 8.84771427551459e-05, + "loss": 0.7444, + "step": 69050 + }, + { + "epoch": 0.44120465609547294, + "grad_norm": 1.1909863948822021, + "learning_rate": 8.847393829198036e-05, + "loss": 1.017, + "step": 69060 + }, + { + "epoch": 0.44126854324521164, + "grad_norm": 0.84711092710495, + "learning_rate": 8.847073344134898e-05, + "loss": 0.7326, + "step": 69070 + }, + { + "epoch": 0.44133243039495035, + "grad_norm": 1.1196755170822144, + "learning_rate": 8.846752820328403e-05, + "loss": 0.9662, + "step": 69080 + }, + { + "epoch": 0.44139631754468905, + "grad_norm": 0.9795490503311157, + "learning_rate": 8.846432257781781e-05, + "loss": 0.976, + "step": 69090 + }, + { + "epoch": 0.44146020469442776, + "grad_norm": 0.7674742341041565, 
+ "learning_rate": 8.846111656498257e-05, + "loss": 0.9718, + "step": 69100 + }, + { + "epoch": 0.44152409184416647, + "grad_norm": 0.8170384764671326, + "learning_rate": 8.845791016481062e-05, + "loss": 0.9278, + "step": 69110 + }, + { + "epoch": 0.4415879789939052, + "grad_norm": 0.8551295399665833, + "learning_rate": 8.845470337733423e-05, + "loss": 0.9096, + "step": 69120 + }, + { + "epoch": 0.4416518661436439, + "grad_norm": 1.4373359680175781, + "learning_rate": 8.845149620258573e-05, + "loss": 1.2263, + "step": 69130 + }, + { + "epoch": 0.4417157532933826, + "grad_norm": 0.746088981628418, + "learning_rate": 8.844828864059738e-05, + "loss": 0.9052, + "step": 69140 + }, + { + "epoch": 0.4417796404431213, + "grad_norm": 0.6683810949325562, + "learning_rate": 8.84450806914015e-05, + "loss": 0.7266, + "step": 69150 + }, + { + "epoch": 0.44184352759286, + "grad_norm": 0.9795920848846436, + "learning_rate": 8.84418723550304e-05, + "loss": 1.1524, + "step": 69160 + }, + { + "epoch": 0.44190741474259865, + "grad_norm": 0.9631989002227783, + "learning_rate": 8.843866363151641e-05, + "loss": 1.059, + "step": 69170 + }, + { + "epoch": 0.44197130189233735, + "grad_norm": 0.7739669680595398, + "learning_rate": 8.84354545208918e-05, + "loss": 0.7757, + "step": 69180 + }, + { + "epoch": 0.44203518904207606, + "grad_norm": 0.5001319646835327, + "learning_rate": 8.843224502318892e-05, + "loss": 0.8741, + "step": 69190 + }, + { + "epoch": 0.44209907619181477, + "grad_norm": 0.739460825920105, + "learning_rate": 8.842903513844008e-05, + "loss": 0.9077, + "step": 69200 + }, + { + "epoch": 0.44216296334155347, + "grad_norm": 0.810375452041626, + "learning_rate": 8.842582486667762e-05, + "loss": 0.7811, + "step": 69210 + }, + { + "epoch": 0.4422268504912922, + "grad_norm": 1.088107705116272, + "learning_rate": 8.842261420793385e-05, + "loss": 0.8338, + "step": 69220 + }, + { + "epoch": 0.4422907376410309, + "grad_norm": 0.6136099100112915, + "learning_rate": 
8.841940316224111e-05, + "loss": 0.6255, + "step": 69230 + }, + { + "epoch": 0.4423546247907696, + "grad_norm": 0.7968172430992126, + "learning_rate": 8.841619172963175e-05, + "loss": 0.8622, + "step": 69240 + }, + { + "epoch": 0.4424185119405083, + "grad_norm": 0.8373786211013794, + "learning_rate": 8.84129799101381e-05, + "loss": 0.9056, + "step": 69250 + }, + { + "epoch": 0.442482399090247, + "grad_norm": 0.47702914476394653, + "learning_rate": 8.840976770379252e-05, + "loss": 0.7324, + "step": 69260 + }, + { + "epoch": 0.4425462862399857, + "grad_norm": 0.8604845404624939, + "learning_rate": 8.840655511062734e-05, + "loss": 0.82, + "step": 69270 + }, + { + "epoch": 0.4426101733897244, + "grad_norm": 1.151459813117981, + "learning_rate": 8.840334213067493e-05, + "loss": 1.1619, + "step": 69280 + }, + { + "epoch": 0.44267406053946307, + "grad_norm": 1.4183622598648071, + "learning_rate": 8.840012876396765e-05, + "loss": 0.9147, + "step": 69290 + }, + { + "epoch": 0.44273794768920177, + "grad_norm": 0.7213853001594543, + "learning_rate": 8.839691501053784e-05, + "loss": 0.9717, + "step": 69300 + }, + { + "epoch": 0.4428018348389405, + "grad_norm": 0.8650780320167542, + "learning_rate": 8.839370087041787e-05, + "loss": 1.0401, + "step": 69310 + }, + { + "epoch": 0.4428657219886792, + "grad_norm": 0.9786863923072815, + "learning_rate": 8.839048634364014e-05, + "loss": 0.8482, + "step": 69320 + }, + { + "epoch": 0.4429296091384179, + "grad_norm": 0.907888650894165, + "learning_rate": 8.838727143023698e-05, + "loss": 1.0272, + "step": 69330 + }, + { + "epoch": 0.4429934962881566, + "grad_norm": 0.5181243419647217, + "learning_rate": 8.83840561302408e-05, + "loss": 1.1205, + "step": 69340 + }, + { + "epoch": 0.4430573834378953, + "grad_norm": 1.089030146598816, + "learning_rate": 8.838084044368396e-05, + "loss": 0.9977, + "step": 69350 + }, + { + "epoch": 0.443121270587634, + "grad_norm": 0.7841888070106506, + "learning_rate": 8.837762437059884e-05, + "loss": 0.9291, + 
"step": 69360 + }, + { + "epoch": 0.4431851577373727, + "grad_norm": 0.706368088722229, + "learning_rate": 8.837440791101787e-05, + "loss": 0.8566, + "step": 69370 + }, + { + "epoch": 0.4432490448871114, + "grad_norm": 0.8301064968109131, + "learning_rate": 8.83711910649734e-05, + "loss": 0.8693, + "step": 69380 + }, + { + "epoch": 0.4433129320368501, + "grad_norm": 0.95965576171875, + "learning_rate": 8.836797383249784e-05, + "loss": 0.8735, + "step": 69390 + }, + { + "epoch": 0.44337681918658883, + "grad_norm": 0.644489586353302, + "learning_rate": 8.836475621362359e-05, + "loss": 1.1971, + "step": 69400 + }, + { + "epoch": 0.4434407063363275, + "grad_norm": 0.834976851940155, + "learning_rate": 8.836153820838304e-05, + "loss": 0.9157, + "step": 69410 + }, + { + "epoch": 0.4435045934860662, + "grad_norm": 0.6156612634658813, + "learning_rate": 8.835831981680864e-05, + "loss": 0.7013, + "step": 69420 + }, + { + "epoch": 0.4435684806358049, + "grad_norm": 0.5868956446647644, + "learning_rate": 8.835510103893276e-05, + "loss": 1.0189, + "step": 69430 + }, + { + "epoch": 0.4436323677855436, + "grad_norm": 1.2473644018173218, + "learning_rate": 8.835188187478782e-05, + "loss": 0.7598, + "step": 69440 + }, + { + "epoch": 0.4436962549352823, + "grad_norm": 1.8413316011428833, + "learning_rate": 8.834866232440627e-05, + "loss": 0.7408, + "step": 69450 + }, + { + "epoch": 0.443760142085021, + "grad_norm": 1.211452603340149, + "learning_rate": 8.83454423878205e-05, + "loss": 0.9004, + "step": 69460 + }, + { + "epoch": 0.4438240292347597, + "grad_norm": 1.3288507461547852, + "learning_rate": 8.834222206506297e-05, + "loss": 0.9584, + "step": 69470 + }, + { + "epoch": 0.4438879163844984, + "grad_norm": 1.8809562921524048, + "learning_rate": 8.833900135616608e-05, + "loss": 0.7489, + "step": 69480 + }, + { + "epoch": 0.44395180353423713, + "grad_norm": 0.8909973502159119, + "learning_rate": 8.833578026116228e-05, + "loss": 0.6701, + "step": 69490 + }, + { + "epoch": 
0.44401569068397584, + "grad_norm": 0.9204776287078857, + "learning_rate": 8.833255878008402e-05, + "loss": 0.8157, + "step": 69500 + }, + { + "epoch": 0.44407957783371454, + "grad_norm": 0.722482442855835, + "learning_rate": 8.832933691296371e-05, + "loss": 1.1273, + "step": 69510 + }, + { + "epoch": 0.44414346498345325, + "grad_norm": 0.8715541958808899, + "learning_rate": 8.832611465983383e-05, + "loss": 1.2145, + "step": 69520 + }, + { + "epoch": 0.44420735213319196, + "grad_norm": 0.9637245535850525, + "learning_rate": 8.832289202072681e-05, + "loss": 0.973, + "step": 69530 + }, + { + "epoch": 0.4442712392829306, + "grad_norm": 0.8205868005752563, + "learning_rate": 8.831966899567512e-05, + "loss": 0.7592, + "step": 69540 + }, + { + "epoch": 0.4443351264326693, + "grad_norm": 1.58009672164917, + "learning_rate": 8.831644558471122e-05, + "loss": 1.2691, + "step": 69550 + }, + { + "epoch": 0.444399013582408, + "grad_norm": 1.08955717086792, + "learning_rate": 8.831322178786754e-05, + "loss": 0.9777, + "step": 69560 + }, + { + "epoch": 0.4444629007321467, + "grad_norm": 0.9413936138153076, + "learning_rate": 8.830999760517659e-05, + "loss": 1.029, + "step": 69570 + }, + { + "epoch": 0.44452678788188543, + "grad_norm": 1.7037255764007568, + "learning_rate": 8.830677303667081e-05, + "loss": 1.2211, + "step": 69580 + }, + { + "epoch": 0.44459067503162414, + "grad_norm": 1.0269652605056763, + "learning_rate": 8.83035480823827e-05, + "loss": 1.3183, + "step": 69590 + }, + { + "epoch": 0.44465456218136284, + "grad_norm": 0.8793505430221558, + "learning_rate": 8.830032274234472e-05, + "loss": 0.9107, + "step": 69600 + }, + { + "epoch": 0.44471844933110155, + "grad_norm": 1.2114499807357788, + "learning_rate": 8.829709701658934e-05, + "loss": 0.9675, + "step": 69610 + }, + { + "epoch": 0.44478233648084026, + "grad_norm": 1.1938707828521729, + "learning_rate": 8.82938709051491e-05, + "loss": 0.85, + "step": 69620 + }, + { + "epoch": 0.44484622363057896, + "grad_norm": 
1.2485358715057373, + "learning_rate": 8.829064440805641e-05, + "loss": 0.8547, + "step": 69630 + }, + { + "epoch": 0.44491011078031767, + "grad_norm": 0.7239115238189697, + "learning_rate": 8.828741752534382e-05, + "loss": 1.1611, + "step": 69640 + }, + { + "epoch": 0.4449739979300564, + "grad_norm": 0.9881543517112732, + "learning_rate": 8.82841902570438e-05, + "loss": 0.668, + "step": 69650 + }, + { + "epoch": 0.445037885079795, + "grad_norm": 1.0397281646728516, + "learning_rate": 8.828096260318888e-05, + "loss": 0.7762, + "step": 69660 + }, + { + "epoch": 0.44510177222953373, + "grad_norm": 1.2399822473526, + "learning_rate": 8.827773456381155e-05, + "loss": 0.7973, + "step": 69670 + }, + { + "epoch": 0.44516565937927244, + "grad_norm": 0.8497468829154968, + "learning_rate": 8.82745061389443e-05, + "loss": 0.9985, + "step": 69680 + }, + { + "epoch": 0.44522954652901114, + "grad_norm": 0.73412024974823, + "learning_rate": 8.827127732861967e-05, + "loss": 0.812, + "step": 69690 + }, + { + "epoch": 0.44529343367874985, + "grad_norm": 0.7889323234558105, + "learning_rate": 8.826804813287017e-05, + "loss": 0.9489, + "step": 69700 + }, + { + "epoch": 0.44535732082848856, + "grad_norm": 0.7215690612792969, + "learning_rate": 8.826481855172832e-05, + "loss": 1.0469, + "step": 69710 + }, + { + "epoch": 0.44542120797822726, + "grad_norm": 1.6253806352615356, + "learning_rate": 8.826158858522665e-05, + "loss": 0.7258, + "step": 69720 + }, + { + "epoch": 0.44548509512796597, + "grad_norm": 1.0504227876663208, + "learning_rate": 8.825835823339768e-05, + "loss": 0.9111, + "step": 69730 + }, + { + "epoch": 0.4455489822777047, + "grad_norm": 0.9772189855575562, + "learning_rate": 8.825512749627393e-05, + "loss": 0.9676, + "step": 69740 + }, + { + "epoch": 0.4456128694274434, + "grad_norm": 0.7481646537780762, + "learning_rate": 8.825189637388795e-05, + "loss": 0.9435, + "step": 69750 + }, + { + "epoch": 0.4456767565771821, + "grad_norm": 0.6458262801170349, + "learning_rate": 
8.824866486627231e-05, + "loss": 0.9124, + "step": 69760 + }, + { + "epoch": 0.4457406437269208, + "grad_norm": 0.9859530925750732, + "learning_rate": 8.824543297345949e-05, + "loss": 1.0758, + "step": 69770 + }, + { + "epoch": 0.44580453087665944, + "grad_norm": 0.8648393750190735, + "learning_rate": 8.82422006954821e-05, + "loss": 1.0258, + "step": 69780 + }, + { + "epoch": 0.44586841802639815, + "grad_norm": 2.013597249984741, + "learning_rate": 8.823896803237264e-05, + "loss": 0.9565, + "step": 69790 + }, + { + "epoch": 0.44593230517613686, + "grad_norm": 0.8398522138595581, + "learning_rate": 8.823573498416371e-05, + "loss": 0.8652, + "step": 69800 + }, + { + "epoch": 0.44599619232587556, + "grad_norm": 0.751560389995575, + "learning_rate": 8.823250155088785e-05, + "loss": 0.948, + "step": 69810 + }, + { + "epoch": 0.44606007947561427, + "grad_norm": 0.7580850124359131, + "learning_rate": 8.82292677325776e-05, + "loss": 0.8804, + "step": 69820 + }, + { + "epoch": 0.446123966625353, + "grad_norm": 2.6924216747283936, + "learning_rate": 8.822603352926558e-05, + "loss": 0.814, + "step": 69830 + }, + { + "epoch": 0.4461878537750917, + "grad_norm": 1.0442085266113281, + "learning_rate": 8.82227989409843e-05, + "loss": 0.9705, + "step": 69840 + }, + { + "epoch": 0.4462517409248304, + "grad_norm": 0.6417388319969177, + "learning_rate": 8.821956396776641e-05, + "loss": 0.8304, + "step": 69850 + }, + { + "epoch": 0.4463156280745691, + "grad_norm": 0.49614080786705017, + "learning_rate": 8.821632860964442e-05, + "loss": 1.1193, + "step": 69860 + }, + { + "epoch": 0.4463795152243078, + "grad_norm": 0.6962358355522156, + "learning_rate": 8.821309286665094e-05, + "loss": 1.02, + "step": 69870 + }, + { + "epoch": 0.4464434023740465, + "grad_norm": 0.9865720868110657, + "learning_rate": 8.820985673881857e-05, + "loss": 1.191, + "step": 69880 + }, + { + "epoch": 0.4465072895237852, + "grad_norm": 0.9626466631889343, + "learning_rate": 8.820662022617987e-05, + "loss": 0.9506, 
+ "step": 69890 + }, + { + "epoch": 0.44657117667352386, + "grad_norm": 0.40864917635917664, + "learning_rate": 8.820338332876745e-05, + "loss": 0.9994, + "step": 69900 + }, + { + "epoch": 0.44663506382326257, + "grad_norm": 0.5569325089454651, + "learning_rate": 8.82001460466139e-05, + "loss": 0.8532, + "step": 69910 + }, + { + "epoch": 0.4466989509730013, + "grad_norm": 0.6157374978065491, + "learning_rate": 8.819690837975185e-05, + "loss": 0.834, + "step": 69920 + }, + { + "epoch": 0.44676283812274, + "grad_norm": 0.9512416124343872, + "learning_rate": 8.819367032821389e-05, + "loss": 0.7586, + "step": 69930 + }, + { + "epoch": 0.4468267252724787, + "grad_norm": 0.6513834595680237, + "learning_rate": 8.819043189203262e-05, + "loss": 1.0077, + "step": 69940 + }, + { + "epoch": 0.4468906124222174, + "grad_norm": 0.6988425254821777, + "learning_rate": 8.818719307124066e-05, + "loss": 0.9777, + "step": 69950 + }, + { + "epoch": 0.4469544995719561, + "grad_norm": 0.7577906250953674, + "learning_rate": 8.818395386587064e-05, + "loss": 0.8364, + "step": 69960 + }, + { + "epoch": 0.4470183867216948, + "grad_norm": 3.7700507640838623, + "learning_rate": 8.818071427595515e-05, + "loss": 0.9155, + "step": 69970 + }, + { + "epoch": 0.4470822738714335, + "grad_norm": 0.9014910459518433, + "learning_rate": 8.817747430152687e-05, + "loss": 0.9437, + "step": 69980 + }, + { + "epoch": 0.4471461610211722, + "grad_norm": 0.7106698155403137, + "learning_rate": 8.817423394261837e-05, + "loss": 1.2196, + "step": 69990 + }, + { + "epoch": 0.4472100481709109, + "grad_norm": 0.7741692066192627, + "learning_rate": 8.817099319926231e-05, + "loss": 0.8265, + "step": 70000 + }, + { + "epoch": 0.44727393532064963, + "grad_norm": 0.8128407001495361, + "learning_rate": 8.816775207149133e-05, + "loss": 0.7937, + "step": 70010 + }, + { + "epoch": 0.4473378224703883, + "grad_norm": 1.0812875032424927, + "learning_rate": 8.816451055933807e-05, + "loss": 0.9699, + "step": 70020 + }, + { + "epoch": 
0.447401709620127, + "grad_norm": 0.8170537948608398, + "learning_rate": 8.816126866283515e-05, + "loss": 0.7516, + "step": 70030 + }, + { + "epoch": 0.4474655967698657, + "grad_norm": 0.8234254121780396, + "learning_rate": 8.815802638201527e-05, + "loss": 0.7975, + "step": 70040 + }, + { + "epoch": 0.4475294839196044, + "grad_norm": 0.5763027667999268, + "learning_rate": 8.815478371691104e-05, + "loss": 0.8927, + "step": 70050 + }, + { + "epoch": 0.4475933710693431, + "grad_norm": 0.6996818780899048, + "learning_rate": 8.815154066755514e-05, + "loss": 0.7487, + "step": 70060 + }, + { + "epoch": 0.4476572582190818, + "grad_norm": 1.1514983177185059, + "learning_rate": 8.814829723398021e-05, + "loss": 0.7932, + "step": 70070 + }, + { + "epoch": 0.4477211453688205, + "grad_norm": 1.56476628780365, + "learning_rate": 8.814505341621892e-05, + "loss": 1.0774, + "step": 70080 + }, + { + "epoch": 0.4477850325185592, + "grad_norm": 1.2454763650894165, + "learning_rate": 8.814180921430395e-05, + "loss": 0.7339, + "step": 70090 + }, + { + "epoch": 0.44784891966829793, + "grad_norm": 0.7148693799972534, + "learning_rate": 8.813856462826794e-05, + "loss": 0.8958, + "step": 70100 + }, + { + "epoch": 0.44791280681803664, + "grad_norm": 1.7030229568481445, + "learning_rate": 8.813531965814363e-05, + "loss": 0.9031, + "step": 70110 + }, + { + "epoch": 0.44797669396777534, + "grad_norm": 0.7845126986503601, + "learning_rate": 8.813207430396365e-05, + "loss": 0.8665, + "step": 70120 + }, + { + "epoch": 0.44804058111751405, + "grad_norm": 0.6932292580604553, + "learning_rate": 8.812882856576066e-05, + "loss": 0.9553, + "step": 70130 + }, + { + "epoch": 0.4481044682672527, + "grad_norm": 1.0173585414886475, + "learning_rate": 8.812558244356742e-05, + "loss": 1.1723, + "step": 70140 + }, + { + "epoch": 0.4481683554169914, + "grad_norm": 0.7353670597076416, + "learning_rate": 8.812233593741655e-05, + "loss": 0.8626, + "step": 70150 + }, + { + "epoch": 0.4482322425667301, + "grad_norm": 
0.4959295392036438, + "learning_rate": 8.811908904734079e-05, + "loss": 0.9257, + "step": 70160 + }, + { + "epoch": 0.4482961297164688, + "grad_norm": 0.9090648889541626, + "learning_rate": 8.811584177337281e-05, + "loss": 0.6679, + "step": 70170 + }, + { + "epoch": 0.4483600168662075, + "grad_norm": 0.840734601020813, + "learning_rate": 8.811259411554536e-05, + "loss": 0.7846, + "step": 70180 + }, + { + "epoch": 0.44842390401594623, + "grad_norm": 0.8319433927536011, + "learning_rate": 8.81093460738911e-05, + "loss": 0.9433, + "step": 70190 + }, + { + "epoch": 0.44848779116568493, + "grad_norm": 0.5484992265701294, + "learning_rate": 8.810609764844276e-05, + "loss": 0.8511, + "step": 70200 + }, + { + "epoch": 0.44855167831542364, + "grad_norm": 0.8629337549209595, + "learning_rate": 8.810284883923304e-05, + "loss": 1.033, + "step": 70210 + }, + { + "epoch": 0.44861556546516235, + "grad_norm": 0.8372594118118286, + "learning_rate": 8.809959964629467e-05, + "loss": 0.6458, + "step": 70220 + }, + { + "epoch": 0.44867945261490105, + "grad_norm": 0.6603564620018005, + "learning_rate": 8.809635006966037e-05, + "loss": 0.9905, + "step": 70230 + }, + { + "epoch": 0.44874333976463976, + "grad_norm": 0.7497221231460571, + "learning_rate": 8.809310010936288e-05, + "loss": 0.9827, + "step": 70240 + }, + { + "epoch": 0.44880722691437847, + "grad_norm": 0.6426061987876892, + "learning_rate": 8.80898497654349e-05, + "loss": 0.9913, + "step": 70250 + }, + { + "epoch": 0.4488711140641171, + "grad_norm": 1.1607120037078857, + "learning_rate": 8.808659903790919e-05, + "loss": 0.9618, + "step": 70260 + }, + { + "epoch": 0.4489350012138558, + "grad_norm": 0.6175957322120667, + "learning_rate": 8.808334792681848e-05, + "loss": 0.7507, + "step": 70270 + }, + { + "epoch": 0.44899888836359453, + "grad_norm": 0.96190345287323, + "learning_rate": 8.80800964321955e-05, + "loss": 0.7085, + "step": 70280 + }, + { + "epoch": 0.44906277551333323, + "grad_norm": 2.022925615310669, + 
"learning_rate": 8.807684455407301e-05, + "loss": 0.997, + "step": 70290 + }, + { + "epoch": 0.44912666266307194, + "grad_norm": 0.8769704103469849, + "learning_rate": 8.807359229248376e-05, + "loss": 1.0706, + "step": 70300 + }, + { + "epoch": 0.44919054981281065, + "grad_norm": 0.8224455118179321, + "learning_rate": 8.80703396474605e-05, + "loss": 1.1076, + "step": 70310 + }, + { + "epoch": 0.44925443696254935, + "grad_norm": 0.5599127411842346, + "learning_rate": 8.806708661903598e-05, + "loss": 0.7477, + "step": 70320 + }, + { + "epoch": 0.44931832411228806, + "grad_norm": 0.8950123190879822, + "learning_rate": 8.806383320724295e-05, + "loss": 0.9558, + "step": 70330 + }, + { + "epoch": 0.44938221126202677, + "grad_norm": 0.7584883570671082, + "learning_rate": 8.80605794121142e-05, + "loss": 1.051, + "step": 70340 + }, + { + "epoch": 0.44944609841176547, + "grad_norm": 0.7890920042991638, + "learning_rate": 8.805732523368249e-05, + "loss": 0.747, + "step": 70350 + }, + { + "epoch": 0.4495099855615042, + "grad_norm": 0.7396231889724731, + "learning_rate": 8.805407067198059e-05, + "loss": 0.9456, + "step": 70360 + }, + { + "epoch": 0.4495738727112429, + "grad_norm": 1.3219010829925537, + "learning_rate": 8.805081572704128e-05, + "loss": 0.785, + "step": 70370 + }, + { + "epoch": 0.4496377598609816, + "grad_norm": 0.5966509580612183, + "learning_rate": 8.804756039889735e-05, + "loss": 1.1855, + "step": 70380 + }, + { + "epoch": 0.44970164701072024, + "grad_norm": 0.9530605673789978, + "learning_rate": 8.804430468758153e-05, + "loss": 0.8681, + "step": 70390 + }, + { + "epoch": 0.44976553416045895, + "grad_norm": 0.8958638310432434, + "learning_rate": 8.804104859312668e-05, + "loss": 1.0291, + "step": 70400 + }, + { + "epoch": 0.44982942131019765, + "grad_norm": 0.8644607663154602, + "learning_rate": 8.803779211556555e-05, + "loss": 0.7294, + "step": 70410 + }, + { + "epoch": 0.44989330845993636, + "grad_norm": 0.8499729037284851, + "learning_rate": 
8.803453525493096e-05, + "loss": 0.8267, + "step": 70420 + }, + { + "epoch": 0.44995719560967506, + "grad_norm": 0.8581739068031311, + "learning_rate": 8.803127801125568e-05, + "loss": 0.7154, + "step": 70430 + }, + { + "epoch": 0.45002108275941377, + "grad_norm": 0.8350471258163452, + "learning_rate": 8.802802038457253e-05, + "loss": 0.9061, + "step": 70440 + }, + { + "epoch": 0.4500849699091525, + "grad_norm": 0.8576902747154236, + "learning_rate": 8.802476237491433e-05, + "loss": 0.9597, + "step": 70450 + }, + { + "epoch": 0.4501488570588912, + "grad_norm": 0.8955521583557129, + "learning_rate": 8.802150398231387e-05, + "loss": 1.0067, + "step": 70460 + }, + { + "epoch": 0.4502127442086299, + "grad_norm": 0.8049098253250122, + "learning_rate": 8.801824520680397e-05, + "loss": 1.002, + "step": 70470 + }, + { + "epoch": 0.4502766313583686, + "grad_norm": 0.8177332878112793, + "learning_rate": 8.801498604841745e-05, + "loss": 0.9605, + "step": 70480 + }, + { + "epoch": 0.4503405185081073, + "grad_norm": 0.5897266864776611, + "learning_rate": 8.801172650718711e-05, + "loss": 0.9324, + "step": 70490 + }, + { + "epoch": 0.450404405657846, + "grad_norm": 0.7611057758331299, + "learning_rate": 8.800846658314583e-05, + "loss": 1.1378, + "step": 70500 + }, + { + "epoch": 0.45046829280758466, + "grad_norm": 0.9269735813140869, + "learning_rate": 8.80052062763264e-05, + "loss": 0.7767, + "step": 70510 + }, + { + "epoch": 0.45053217995732336, + "grad_norm": 0.7874916791915894, + "learning_rate": 8.800194558676167e-05, + "loss": 1.0075, + "step": 70520 + }, + { + "epoch": 0.45059606710706207, + "grad_norm": 0.5338902473449707, + "learning_rate": 8.799868451448446e-05, + "loss": 0.7581, + "step": 70530 + }, + { + "epoch": 0.4506599542568008, + "grad_norm": 0.6649864315986633, + "learning_rate": 8.799542305952764e-05, + "loss": 1.309, + "step": 70540 + }, + { + "epoch": 0.4507238414065395, + "grad_norm": 0.5478102564811707, + "learning_rate": 8.799216122192402e-05, + "loss": 
0.9979, + "step": 70550 + }, + { + "epoch": 0.4507877285562782, + "grad_norm": 0.48872268199920654, + "learning_rate": 8.798889900170648e-05, + "loss": 0.862, + "step": 70560 + }, + { + "epoch": 0.4508516157060169, + "grad_norm": 1.0504260063171387, + "learning_rate": 8.798563639890786e-05, + "loss": 0.8303, + "step": 70570 + }, + { + "epoch": 0.4509155028557556, + "grad_norm": 0.7641623616218567, + "learning_rate": 8.798237341356102e-05, + "loss": 0.8984, + "step": 70580 + }, + { + "epoch": 0.4509793900054943, + "grad_norm": 0.8579826951026917, + "learning_rate": 8.797911004569882e-05, + "loss": 0.8908, + "step": 70590 + }, + { + "epoch": 0.451043277155233, + "grad_norm": 1.6578333377838135, + "learning_rate": 8.797584629535412e-05, + "loss": 1.0401, + "step": 70600 + }, + { + "epoch": 0.4511071643049717, + "grad_norm": 0.901781439781189, + "learning_rate": 8.79725821625598e-05, + "loss": 1.1365, + "step": 70610 + }, + { + "epoch": 0.4511710514547104, + "grad_norm": 1.3552802801132202, + "learning_rate": 8.796931764734873e-05, + "loss": 0.9429, + "step": 70620 + }, + { + "epoch": 0.4512349386044491, + "grad_norm": 0.5758177638053894, + "learning_rate": 8.796605274975377e-05, + "loss": 0.9782, + "step": 70630 + }, + { + "epoch": 0.4512988257541878, + "grad_norm": 0.6553876996040344, + "learning_rate": 8.796278746980782e-05, + "loss": 0.9537, + "step": 70640 + }, + { + "epoch": 0.4513627129039265, + "grad_norm": 0.6024998426437378, + "learning_rate": 8.795952180754376e-05, + "loss": 0.9083, + "step": 70650 + }, + { + "epoch": 0.4514266000536652, + "grad_norm": 0.521595299243927, + "learning_rate": 8.795625576299447e-05, + "loss": 0.8758, + "step": 70660 + }, + { + "epoch": 0.4514904872034039, + "grad_norm": 0.9571405053138733, + "learning_rate": 8.795298933619284e-05, + "loss": 1.0593, + "step": 70670 + }, + { + "epoch": 0.4515543743531426, + "grad_norm": 1.07502281665802, + "learning_rate": 8.79497225271718e-05, + "loss": 0.8501, + "step": 70680 + }, + { + "epoch": 
0.4516182615028813, + "grad_norm": 1.2445697784423828, + "learning_rate": 8.794645533596422e-05, + "loss": 0.8585, + "step": 70690 + }, + { + "epoch": 0.45168214865262, + "grad_norm": 0.8725454211235046, + "learning_rate": 8.794318776260299e-05, + "loss": 1.109, + "step": 70700 + }, + { + "epoch": 0.4517460358023587, + "grad_norm": 0.6738957762718201, + "learning_rate": 8.793991980712103e-05, + "loss": 0.8554, + "step": 70710 + }, + { + "epoch": 0.45180992295209743, + "grad_norm": 0.6663877964019775, + "learning_rate": 8.793665146955127e-05, + "loss": 0.8995, + "step": 70720 + }, + { + "epoch": 0.45187381010183614, + "grad_norm": 1.05771005153656, + "learning_rate": 8.79333827499266e-05, + "loss": 0.8408, + "step": 70730 + }, + { + "epoch": 0.45193769725157484, + "grad_norm": 0.8179357051849365, + "learning_rate": 8.793011364827995e-05, + "loss": 0.9386, + "step": 70740 + }, + { + "epoch": 0.4520015844013135, + "grad_norm": 0.8579227328300476, + "learning_rate": 8.792684416464425e-05, + "loss": 0.9987, + "step": 70750 + }, + { + "epoch": 0.4520654715510522, + "grad_norm": 1.311963438987732, + "learning_rate": 8.79235742990524e-05, + "loss": 0.8236, + "step": 70760 + }, + { + "epoch": 0.4521293587007909, + "grad_norm": 1.2034355401992798, + "learning_rate": 8.792030405153736e-05, + "loss": 1.0315, + "step": 70770 + }, + { + "epoch": 0.4521932458505296, + "grad_norm": 0.9574033617973328, + "learning_rate": 8.791703342213205e-05, + "loss": 0.9112, + "step": 70780 + }, + { + "epoch": 0.4522571330002683, + "grad_norm": 0.6111446022987366, + "learning_rate": 8.791376241086942e-05, + "loss": 0.8271, + "step": 70790 + }, + { + "epoch": 0.452321020150007, + "grad_norm": 0.5406328439712524, + "learning_rate": 8.791049101778239e-05, + "loss": 0.6997, + "step": 70800 + }, + { + "epoch": 0.45238490729974573, + "grad_norm": 0.4767141044139862, + "learning_rate": 8.790721924290393e-05, + "loss": 0.8203, + "step": 70810 + }, + { + "epoch": 0.45244879444948444, + "grad_norm": 
1.9145870208740234, + "learning_rate": 8.790394708626697e-05, + "loss": 1.0273, + "step": 70820 + }, + { + "epoch": 0.45251268159922314, + "grad_norm": 1.5563242435455322, + "learning_rate": 8.790067454790447e-05, + "loss": 1.1937, + "step": 70830 + }, + { + "epoch": 0.45257656874896185, + "grad_norm": 0.6038081645965576, + "learning_rate": 8.789740162784939e-05, + "loss": 0.9115, + "step": 70840 + }, + { + "epoch": 0.45264045589870056, + "grad_norm": 0.8606191873550415, + "learning_rate": 8.789412832613468e-05, + "loss": 0.8494, + "step": 70850 + }, + { + "epoch": 0.45270434304843926, + "grad_norm": 0.9102177023887634, + "learning_rate": 8.789085464279334e-05, + "loss": 0.9009, + "step": 70860 + }, + { + "epoch": 0.4527682301981779, + "grad_norm": 0.8726232051849365, + "learning_rate": 8.788758057785828e-05, + "loss": 0.9052, + "step": 70870 + }, + { + "epoch": 0.4528321173479166, + "grad_norm": 0.8285619020462036, + "learning_rate": 8.788430613136254e-05, + "loss": 0.9528, + "step": 70880 + }, + { + "epoch": 0.4528960044976553, + "grad_norm": 1.2340794801712036, + "learning_rate": 8.788103130333905e-05, + "loss": 0.8517, + "step": 70890 + }, + { + "epoch": 0.45295989164739403, + "grad_norm": 0.5685308575630188, + "learning_rate": 8.787775609382078e-05, + "loss": 0.9504, + "step": 70900 + }, + { + "epoch": 0.45302377879713274, + "grad_norm": 0.7877033948898315, + "learning_rate": 8.787448050284077e-05, + "loss": 0.8238, + "step": 70910 + }, + { + "epoch": 0.45308766594687144, + "grad_norm": 1.047734260559082, + "learning_rate": 8.787120453043196e-05, + "loss": 1.0679, + "step": 70920 + }, + { + "epoch": 0.45315155309661015, + "grad_norm": 0.5385513305664062, + "learning_rate": 8.786792817662737e-05, + "loss": 0.8655, + "step": 70930 + }, + { + "epoch": 0.45321544024634886, + "grad_norm": 0.9814597964286804, + "learning_rate": 8.786465144145996e-05, + "loss": 0.9863, + "step": 70940 + }, + { + "epoch": 0.45327932739608756, + "grad_norm": 0.7968815565109253, + 
"learning_rate": 8.786137432496278e-05, + "loss": 0.8118, + "step": 70950 + }, + { + "epoch": 0.45334321454582627, + "grad_norm": 1.0466378927230835, + "learning_rate": 8.785809682716879e-05, + "loss": 0.8782, + "step": 70960 + }, + { + "epoch": 0.453407101695565, + "grad_norm": 1.5503062009811401, + "learning_rate": 8.7854818948111e-05, + "loss": 1.0405, + "step": 70970 + }, + { + "epoch": 0.4534709888453037, + "grad_norm": 0.6179012060165405, + "learning_rate": 8.785154068782246e-05, + "loss": 0.8444, + "step": 70980 + }, + { + "epoch": 0.45353487599504233, + "grad_norm": 1.3943589925765991, + "learning_rate": 8.784826204633614e-05, + "loss": 0.8734, + "step": 70990 + }, + { + "epoch": 0.45359876314478104, + "grad_norm": 0.8613284230232239, + "learning_rate": 8.784498302368508e-05, + "loss": 0.7613, + "step": 71000 + }, + { + "epoch": 0.45366265029451974, + "grad_norm": 1.1512913703918457, + "learning_rate": 8.784170361990232e-05, + "loss": 0.7816, + "step": 71010 + }, + { + "epoch": 0.45372653744425845, + "grad_norm": 0.8666269779205322, + "learning_rate": 8.783842383502084e-05, + "loss": 1.043, + "step": 71020 + }, + { + "epoch": 0.45379042459399715, + "grad_norm": 0.900255560874939, + "learning_rate": 8.783514366907371e-05, + "loss": 0.9663, + "step": 71030 + }, + { + "epoch": 0.45385431174373586, + "grad_norm": 1.041473150253296, + "learning_rate": 8.783186312209395e-05, + "loss": 1.0225, + "step": 71040 + }, + { + "epoch": 0.45391819889347457, + "grad_norm": 1.203635811805725, + "learning_rate": 8.78285821941146e-05, + "loss": 0.8201, + "step": 71050 + }, + { + "epoch": 0.4539820860432133, + "grad_norm": 1.2860292196273804, + "learning_rate": 8.782530088516869e-05, + "loss": 1.0791, + "step": 71060 + }, + { + "epoch": 0.454045973192952, + "grad_norm": 0.9229752421379089, + "learning_rate": 8.782201919528929e-05, + "loss": 0.9097, + "step": 71070 + }, + { + "epoch": 0.4541098603426907, + "grad_norm": 0.5410824418067932, + "learning_rate": 8.7819065348727e-05, 
+ "loss": 1.1974, + "step": 71080 + }, + { + "epoch": 0.4541737474924294, + "grad_norm": 1.527845025062561, + "learning_rate": 8.7815782935165e-05, + "loss": 0.9155, + "step": 71090 + }, + { + "epoch": 0.4542376346421681, + "grad_norm": 0.6132227182388306, + "learning_rate": 8.781250014076534e-05, + "loss": 0.9264, + "step": 71100 + }, + { + "epoch": 0.45430152179190675, + "grad_norm": 1.0122579336166382, + "learning_rate": 8.78092169655611e-05, + "loss": 1.126, + "step": 71110 + }, + { + "epoch": 0.45436540894164545, + "grad_norm": 0.9451808333396912, + "learning_rate": 8.780593340958535e-05, + "loss": 1.0451, + "step": 71120 + }, + { + "epoch": 0.45442929609138416, + "grad_norm": 1.5981924533843994, + "learning_rate": 8.780264947287111e-05, + "loss": 1.1555, + "step": 71130 + }, + { + "epoch": 0.45449318324112287, + "grad_norm": 0.8764825463294983, + "learning_rate": 8.779936515545151e-05, + "loss": 0.8937, + "step": 71140 + }, + { + "epoch": 0.4545570703908616, + "grad_norm": 0.6347659230232239, + "learning_rate": 8.779608045735959e-05, + "loss": 1.2468, + "step": 71150 + }, + { + "epoch": 0.4546209575406003, + "grad_norm": 0.9502388834953308, + "learning_rate": 8.779279537862844e-05, + "loss": 0.791, + "step": 71160 + }, + { + "epoch": 0.454684844690339, + "grad_norm": 0.9341233968734741, + "learning_rate": 8.778950991929114e-05, + "loss": 0.9172, + "step": 71170 + }, + { + "epoch": 0.4547487318400777, + "grad_norm": 0.7763635516166687, + "learning_rate": 8.77862240793808e-05, + "loss": 0.9442, + "step": 71180 + }, + { + "epoch": 0.4548126189898164, + "grad_norm": 1.2328989505767822, + "learning_rate": 8.778293785893048e-05, + "loss": 0.7446, + "step": 71190 + }, + { + "epoch": 0.4548765061395551, + "grad_norm": 0.6243307590484619, + "learning_rate": 8.777965125797329e-05, + "loss": 0.8242, + "step": 71200 + }, + { + "epoch": 0.4549403932892938, + "grad_norm": 0.7185580134391785, + "learning_rate": 8.777636427654234e-05, + "loss": 1.0433, + "step": 71210 + }, + 
{ + "epoch": 0.4550042804390325, + "grad_norm": 0.7410394549369812, + "learning_rate": 8.777307691467072e-05, + "loss": 0.9533, + "step": 71220 + }, + { + "epoch": 0.4550681675887712, + "grad_norm": 0.8406373858451843, + "learning_rate": 8.776978917239153e-05, + "loss": 0.9858, + "step": 71230 + }, + { + "epoch": 0.4551320547385099, + "grad_norm": 1.1634323596954346, + "learning_rate": 8.776650104973789e-05, + "loss": 0.8353, + "step": 71240 + }, + { + "epoch": 0.4551959418882486, + "grad_norm": 0.847737729549408, + "learning_rate": 8.776321254674291e-05, + "loss": 0.6618, + "step": 71250 + }, + { + "epoch": 0.4552598290379873, + "grad_norm": 0.853600025177002, + "learning_rate": 8.77599236634397e-05, + "loss": 1.011, + "step": 71260 + }, + { + "epoch": 0.455323716187726, + "grad_norm": 0.6608572602272034, + "learning_rate": 8.77566343998614e-05, + "loss": 0.988, + "step": 71270 + }, + { + "epoch": 0.4553876033374647, + "grad_norm": 0.8262060284614563, + "learning_rate": 8.775334475604114e-05, + "loss": 1.2176, + "step": 71280 + }, + { + "epoch": 0.4554514904872034, + "grad_norm": 0.7335585355758667, + "learning_rate": 8.775005473201202e-05, + "loss": 0.9556, + "step": 71290 + }, + { + "epoch": 0.4555153776369421, + "grad_norm": 1.2570284605026245, + "learning_rate": 8.774676432780719e-05, + "loss": 1.0209, + "step": 71300 + }, + { + "epoch": 0.4555792647866808, + "grad_norm": 0.6992619037628174, + "learning_rate": 8.774347354345979e-05, + "loss": 0.8241, + "step": 71310 + }, + { + "epoch": 0.4556431519364195, + "grad_norm": 1.3729963302612305, + "learning_rate": 8.774018237900297e-05, + "loss": 0.9433, + "step": 71320 + }, + { + "epoch": 0.4557070390861582, + "grad_norm": 0.9504528045654297, + "learning_rate": 8.773689083446986e-05, + "loss": 0.8976, + "step": 71330 + }, + { + "epoch": 0.45577092623589693, + "grad_norm": 0.9891476035118103, + "learning_rate": 8.773359890989361e-05, + "loss": 0.975, + "step": 71340 + }, + { + "epoch": 0.45583481338563564, + 
"grad_norm": 1.9540566205978394, + "learning_rate": 8.773030660530736e-05, + "loss": 1.0329, + "step": 71350 + }, + { + "epoch": 0.4558987005353743, + "grad_norm": 0.9334406852722168, + "learning_rate": 8.77270139207443e-05, + "loss": 0.8753, + "step": 71360 + }, + { + "epoch": 0.455962587685113, + "grad_norm": 0.9712562561035156, + "learning_rate": 8.772372085623756e-05, + "loss": 1.0013, + "step": 71370 + }, + { + "epoch": 0.4560264748348517, + "grad_norm": 1.0713132619857788, + "learning_rate": 8.772042741182034e-05, + "loss": 0.9794, + "step": 71380 + }, + { + "epoch": 0.4560903619845904, + "grad_norm": 1.1477205753326416, + "learning_rate": 8.771713358752575e-05, + "loss": 0.9437, + "step": 71390 + }, + { + "epoch": 0.4561542491343291, + "grad_norm": 0.9125179052352905, + "learning_rate": 8.771383938338702e-05, + "loss": 1.0213, + "step": 71400 + }, + { + "epoch": 0.4562181362840678, + "grad_norm": 0.7508702278137207, + "learning_rate": 8.771054479943728e-05, + "loss": 1.0244, + "step": 71410 + }, + { + "epoch": 0.4562820234338065, + "grad_norm": 0.7568835616111755, + "learning_rate": 8.770724983570974e-05, + "loss": 1.0994, + "step": 71420 + }, + { + "epoch": 0.45634591058354523, + "grad_norm": 1.1442028284072876, + "learning_rate": 8.770395449223758e-05, + "loss": 0.768, + "step": 71430 + }, + { + "epoch": 0.45640979773328394, + "grad_norm": 1.0177857875823975, + "learning_rate": 8.770065876905396e-05, + "loss": 0.762, + "step": 71440 + }, + { + "epoch": 0.45647368488302265, + "grad_norm": 0.8974249362945557, + "learning_rate": 8.76973626661921e-05, + "loss": 0.8523, + "step": 71450 + }, + { + "epoch": 0.45653757203276135, + "grad_norm": 1.9021356105804443, + "learning_rate": 8.769406618368519e-05, + "loss": 1.068, + "step": 71460 + }, + { + "epoch": 0.45660145918250006, + "grad_norm": 0.6922752857208252, + "learning_rate": 8.769076932156642e-05, + "loss": 0.9142, + "step": 71470 + }, + { + "epoch": 0.4566653463322387, + "grad_norm": 0.853961706161499, + 
"learning_rate": 8.7687472079869e-05, + "loss": 0.8104, + "step": 71480 + }, + { + "epoch": 0.4567292334819774, + "grad_norm": 0.6722909212112427, + "learning_rate": 8.768417445862613e-05, + "loss": 0.8424, + "step": 71490 + }, + { + "epoch": 0.4567931206317161, + "grad_norm": 0.8289230465888977, + "learning_rate": 8.768087645787102e-05, + "loss": 1.0402, + "step": 71500 + }, + { + "epoch": 0.4568570077814548, + "grad_norm": 0.9513195157051086, + "learning_rate": 8.767757807763687e-05, + "loss": 0.8345, + "step": 71510 + }, + { + "epoch": 0.45692089493119353, + "grad_norm": 1.016177773475647, + "learning_rate": 8.767427931795694e-05, + "loss": 1.0672, + "step": 71520 + }, + { + "epoch": 0.45698478208093224, + "grad_norm": 0.815597653388977, + "learning_rate": 8.767098017886442e-05, + "loss": 0.9043, + "step": 71530 + }, + { + "epoch": 0.45704866923067095, + "grad_norm": 0.6468181610107422, + "learning_rate": 8.766768066039252e-05, + "loss": 0.7424, + "step": 71540 + }, + { + "epoch": 0.45711255638040965, + "grad_norm": 1.0520371198654175, + "learning_rate": 8.76643807625745e-05, + "loss": 0.7597, + "step": 71550 + }, + { + "epoch": 0.45717644353014836, + "grad_norm": 0.770429790019989, + "learning_rate": 8.766108048544359e-05, + "loss": 0.9429, + "step": 71560 + }, + { + "epoch": 0.45724033067988706, + "grad_norm": 1.4733335971832275, + "learning_rate": 8.7657779829033e-05, + "loss": 0.9293, + "step": 71570 + }, + { + "epoch": 0.45730421782962577, + "grad_norm": 0.8638445138931274, + "learning_rate": 8.765447879337601e-05, + "loss": 0.9278, + "step": 71580 + }, + { + "epoch": 0.4573681049793645, + "grad_norm": 1.1189749240875244, + "learning_rate": 8.765117737850584e-05, + "loss": 0.9357, + "step": 71590 + }, + { + "epoch": 0.4574319921291031, + "grad_norm": 0.82524174451828, + "learning_rate": 8.764787558445573e-05, + "loss": 1.1374, + "step": 71600 + }, + { + "epoch": 0.45749587927884183, + "grad_norm": 1.3126778602600098, + "learning_rate": 
8.764457341125894e-05, + "loss": 1.1194, + "step": 71610 + }, + { + "epoch": 0.45755976642858054, + "grad_norm": 0.6606014370918274, + "learning_rate": 8.764127085894874e-05, + "loss": 0.815, + "step": 71620 + }, + { + "epoch": 0.45762365357831924, + "grad_norm": 0.8767343163490295, + "learning_rate": 8.763796792755836e-05, + "loss": 1.0984, + "step": 71630 + }, + { + "epoch": 0.45768754072805795, + "grad_norm": 0.6078357100486755, + "learning_rate": 8.763466461712108e-05, + "loss": 0.9051, + "step": 71640 + }, + { + "epoch": 0.45775142787779666, + "grad_norm": 0.7068758010864258, + "learning_rate": 8.763136092767019e-05, + "loss": 0.8346, + "step": 71650 + }, + { + "epoch": 0.45781531502753536, + "grad_norm": 1.372742772102356, + "learning_rate": 8.762805685923894e-05, + "loss": 0.9978, + "step": 71660 + }, + { + "epoch": 0.45787920217727407, + "grad_norm": 0.9161148071289062, + "learning_rate": 8.762475241186059e-05, + "loss": 1.0414, + "step": 71670 + }, + { + "epoch": 0.4579430893270128, + "grad_norm": 0.8733229637145996, + "learning_rate": 8.762144758556846e-05, + "loss": 0.5721, + "step": 71680 + }, + { + "epoch": 0.4580069764767515, + "grad_norm": 0.966275691986084, + "learning_rate": 8.761814238039576e-05, + "loss": 1.0169, + "step": 71690 + }, + { + "epoch": 0.4580708636264902, + "grad_norm": 0.8296000957489014, + "learning_rate": 8.761483679637585e-05, + "loss": 1.0115, + "step": 71700 + }, + { + "epoch": 0.4581347507762289, + "grad_norm": 0.718637228012085, + "learning_rate": 8.761153083354198e-05, + "loss": 0.9809, + "step": 71710 + }, + { + "epoch": 0.45819863792596754, + "grad_norm": 0.6848533153533936, + "learning_rate": 8.760822449192747e-05, + "loss": 0.9013, + "step": 71720 + }, + { + "epoch": 0.45826252507570625, + "grad_norm": 0.9018274545669556, + "learning_rate": 8.760491777156561e-05, + "loss": 1.2462, + "step": 71730 + }, + { + "epoch": 0.45832641222544496, + "grad_norm": 0.8693473935127258, + "learning_rate": 8.760161067248968e-05, + 
"loss": 1.1541, + "step": 71740 + }, + { + "epoch": 0.45839029937518366, + "grad_norm": 0.7893520593643188, + "learning_rate": 8.759830319473302e-05, + "loss": 0.7432, + "step": 71750 + }, + { + "epoch": 0.45845418652492237, + "grad_norm": 0.704264223575592, + "learning_rate": 8.759499533832889e-05, + "loss": 0.653, + "step": 71760 + }, + { + "epoch": 0.4585180736746611, + "grad_norm": 0.7048154473304749, + "learning_rate": 8.759168710331064e-05, + "loss": 0.7997, + "step": 71770 + }, + { + "epoch": 0.4585819608243998, + "grad_norm": 0.891446590423584, + "learning_rate": 8.75883784897116e-05, + "loss": 0.8512, + "step": 71780 + }, + { + "epoch": 0.4586458479741385, + "grad_norm": 0.7560920715332031, + "learning_rate": 8.758506949756505e-05, + "loss": 1.0229, + "step": 71790 + }, + { + "epoch": 0.4587097351238772, + "grad_norm": 0.9192051887512207, + "learning_rate": 8.758176012690433e-05, + "loss": 0.8684, + "step": 71800 + }, + { + "epoch": 0.4587736222736159, + "grad_norm": 0.9770452380180359, + "learning_rate": 8.757845037776279e-05, + "loss": 0.7901, + "step": 71810 + }, + { + "epoch": 0.4588375094233546, + "grad_norm": 0.8596158623695374, + "learning_rate": 8.757514025017374e-05, + "loss": 1.098, + "step": 71820 + }, + { + "epoch": 0.4589013965730933, + "grad_norm": 0.8267128467559814, + "learning_rate": 8.757182974417051e-05, + "loss": 1.2681, + "step": 71830 + }, + { + "epoch": 0.45896528372283196, + "grad_norm": 0.850165069103241, + "learning_rate": 8.756851885978646e-05, + "loss": 1.0255, + "step": 71840 + }, + { + "epoch": 0.45902917087257067, + "grad_norm": 0.8577935695648193, + "learning_rate": 8.756520759705494e-05, + "loss": 0.729, + "step": 71850 + }, + { + "epoch": 0.4590930580223094, + "grad_norm": 0.9524192214012146, + "learning_rate": 8.756189595600924e-05, + "loss": 0.7808, + "step": 71860 + }, + { + "epoch": 0.4591569451720481, + "grad_norm": 0.7128868699073792, + "learning_rate": 8.755858393668278e-05, + "loss": 0.9852, + "step": 71870 + }, + 
{ + "epoch": 0.4592208323217868, + "grad_norm": 0.7015582919120789, + "learning_rate": 8.755527153910888e-05, + "loss": 1.0348, + "step": 71880 + }, + { + "epoch": 0.4592847194715255, + "grad_norm": 0.7809120416641235, + "learning_rate": 8.755195876332092e-05, + "loss": 0.9461, + "step": 71890 + }, + { + "epoch": 0.4593486066212642, + "grad_norm": 1.345109224319458, + "learning_rate": 8.754864560935223e-05, + "loss": 0.7244, + "step": 71900 + }, + { + "epoch": 0.4594124937710029, + "grad_norm": 1.41587233543396, + "learning_rate": 8.75453320772362e-05, + "loss": 1.0676, + "step": 71910 + }, + { + "epoch": 0.4594763809207416, + "grad_norm": 0.6541619300842285, + "learning_rate": 8.754201816700619e-05, + "loss": 0.7355, + "step": 71920 + }, + { + "epoch": 0.4595402680704803, + "grad_norm": 0.9459201693534851, + "learning_rate": 8.753870387869558e-05, + "loss": 0.9362, + "step": 71930 + }, + { + "epoch": 0.459604155220219, + "grad_norm": 1.1776293516159058, + "learning_rate": 8.753538921233776e-05, + "loss": 1.0149, + "step": 71940 + }, + { + "epoch": 0.45966804236995773, + "grad_norm": 0.7286693453788757, + "learning_rate": 8.753207416796608e-05, + "loss": 0.8923, + "step": 71950 + }, + { + "epoch": 0.4597319295196964, + "grad_norm": 0.6185081601142883, + "learning_rate": 8.752875874561395e-05, + "loss": 0.7427, + "step": 71960 + }, + { + "epoch": 0.4597958166694351, + "grad_norm": 1.2175235748291016, + "learning_rate": 8.752544294531474e-05, + "loss": 0.8131, + "step": 71970 + }, + { + "epoch": 0.4598597038191738, + "grad_norm": 1.1320040225982666, + "learning_rate": 8.752212676710188e-05, + "loss": 0.9201, + "step": 71980 + }, + { + "epoch": 0.4599235909689125, + "grad_norm": 0.9990871548652649, + "learning_rate": 8.751881021100874e-05, + "loss": 0.7032, + "step": 71990 + }, + { + "epoch": 0.4599874781186512, + "grad_norm": 0.7369240522384644, + "learning_rate": 8.751549327706872e-05, + "loss": 0.7436, + "step": 72000 + }, + { + "epoch": 0.4600513652683899, + 
"grad_norm": 1.1463274955749512, + "learning_rate": 8.75121759653152e-05, + "loss": 0.8619, + "step": 72010 + }, + { + "epoch": 0.4601152524181286, + "grad_norm": 1.0860117673873901, + "learning_rate": 8.750885827578165e-05, + "loss": 0.8812, + "step": 72020 + }, + { + "epoch": 0.4601791395678673, + "grad_norm": 1.319459319114685, + "learning_rate": 8.750554020850144e-05, + "loss": 0.8778, + "step": 72030 + }, + { + "epoch": 0.46024302671760603, + "grad_norm": 0.8144885301589966, + "learning_rate": 8.750222176350798e-05, + "loss": 0.9735, + "step": 72040 + }, + { + "epoch": 0.46030691386734474, + "grad_norm": 0.9385289549827576, + "learning_rate": 8.749890294083471e-05, + "loss": 0.9461, + "step": 72050 + }, + { + "epoch": 0.46037080101708344, + "grad_norm": 0.8200061321258545, + "learning_rate": 8.749558374051505e-05, + "loss": 1.1353, + "step": 72060 + }, + { + "epoch": 0.46043468816682215, + "grad_norm": 1.0239852666854858, + "learning_rate": 8.749226416258242e-05, + "loss": 0.8544, + "step": 72070 + }, + { + "epoch": 0.46049857531656085, + "grad_norm": 0.8354944586753845, + "learning_rate": 8.748894420707025e-05, + "loss": 0.9432, + "step": 72080 + }, + { + "epoch": 0.4605624624662995, + "grad_norm": 1.8811396360397339, + "learning_rate": 8.748562387401197e-05, + "loss": 1.0131, + "step": 72090 + }, + { + "epoch": 0.4606263496160382, + "grad_norm": 0.9125493764877319, + "learning_rate": 8.748230316344106e-05, + "loss": 0.8341, + "step": 72100 + }, + { + "epoch": 0.4606902367657769, + "grad_norm": 0.9623830914497375, + "learning_rate": 8.747898207539092e-05, + "loss": 0.8355, + "step": 72110 + }, + { + "epoch": 0.4607541239155156, + "grad_norm": 0.845895528793335, + "learning_rate": 8.747566060989498e-05, + "loss": 0.6371, + "step": 72120 + }, + { + "epoch": 0.46081801106525433, + "grad_norm": 1.4472181797027588, + "learning_rate": 8.747233876698674e-05, + "loss": 0.6948, + "step": 72130 + }, + { + "epoch": 0.46088189821499304, + "grad_norm": 1.0555568933486938, 
+ "learning_rate": 8.746901654669962e-05, + "loss": 0.984, + "step": 72140 + }, + { + "epoch": 0.46094578536473174, + "grad_norm": 0.6194450259208679, + "learning_rate": 8.746569394906709e-05, + "loss": 0.7546, + "step": 72150 + }, + { + "epoch": 0.46100967251447045, + "grad_norm": 0.5184745192527771, + "learning_rate": 8.746237097412262e-05, + "loss": 0.6829, + "step": 72160 + }, + { + "epoch": 0.46107355966420915, + "grad_norm": 0.696123480796814, + "learning_rate": 8.745904762189966e-05, + "loss": 0.9351, + "step": 72170 + }, + { + "epoch": 0.46113744681394786, + "grad_norm": 0.8792576789855957, + "learning_rate": 8.745572389243168e-05, + "loss": 0.8513, + "step": 72180 + }, + { + "epoch": 0.46120133396368657, + "grad_norm": 0.8823778629302979, + "learning_rate": 8.745239978575215e-05, + "loss": 1.1258, + "step": 72190 + }, + { + "epoch": 0.46126522111342527, + "grad_norm": 0.9086830019950867, + "learning_rate": 8.744907530189457e-05, + "loss": 1.0118, + "step": 72200 + }, + { + "epoch": 0.4613291082631639, + "grad_norm": 1.1880900859832764, + "learning_rate": 8.74457504408924e-05, + "loss": 0.8436, + "step": 72210 + }, + { + "epoch": 0.46139299541290263, + "grad_norm": 1.3979392051696777, + "learning_rate": 8.744242520277912e-05, + "loss": 0.8647, + "step": 72220 + }, + { + "epoch": 0.46145688256264134, + "grad_norm": 1.0479652881622314, + "learning_rate": 8.743909958758823e-05, + "loss": 0.8345, + "step": 72230 + }, + { + "epoch": 0.46152076971238004, + "grad_norm": 0.9071272015571594, + "learning_rate": 8.743577359535321e-05, + "loss": 0.9718, + "step": 72240 + }, + { + "epoch": 0.46158465686211875, + "grad_norm": 0.838445246219635, + "learning_rate": 8.743244722610757e-05, + "loss": 0.9786, + "step": 72250 + }, + { + "epoch": 0.46164854401185745, + "grad_norm": 0.6116316914558411, + "learning_rate": 8.742912047988481e-05, + "loss": 1.0287, + "step": 72260 + }, + { + "epoch": 0.46171243116159616, + "grad_norm": 0.7119680047035217, + "learning_rate": 
8.742579335671841e-05, + "loss": 0.8999, + "step": 72270 + }, + { + "epoch": 0.46177631831133487, + "grad_norm": 0.8352906107902527, + "learning_rate": 8.74224658566419e-05, + "loss": 0.7239, + "step": 72280 + }, + { + "epoch": 0.46184020546107357, + "grad_norm": 1.3281790018081665, + "learning_rate": 8.741913797968879e-05, + "loss": 0.8587, + "step": 72290 + }, + { + "epoch": 0.4619040926108123, + "grad_norm": 1.4477615356445312, + "learning_rate": 8.741580972589258e-05, + "loss": 1.1694, + "step": 72300 + }, + { + "epoch": 0.461967979760551, + "grad_norm": 0.925883948802948, + "learning_rate": 8.741248109528679e-05, + "loss": 0.8509, + "step": 72310 + }, + { + "epoch": 0.4620318669102897, + "grad_norm": 1.4060012102127075, + "learning_rate": 8.740915208790496e-05, + "loss": 0.968, + "step": 72320 + }, + { + "epoch": 0.46209575406002834, + "grad_norm": 0.7903311848640442, + "learning_rate": 8.740582270378061e-05, + "loss": 0.8068, + "step": 72330 + }, + { + "epoch": 0.46215964120976705, + "grad_norm": 0.8597942590713501, + "learning_rate": 8.740249294294727e-05, + "loss": 0.661, + "step": 72340 + }, + { + "epoch": 0.46222352835950575, + "grad_norm": 1.3927608728408813, + "learning_rate": 8.739916280543845e-05, + "loss": 0.9103, + "step": 72350 + }, + { + "epoch": 0.46228741550924446, + "grad_norm": 1.1652497053146362, + "learning_rate": 8.739583229128771e-05, + "loss": 0.726, + "step": 72360 + }, + { + "epoch": 0.46235130265898317, + "grad_norm": 1.1774413585662842, + "learning_rate": 8.739250140052859e-05, + "loss": 0.8478, + "step": 72370 + }, + { + "epoch": 0.46241518980872187, + "grad_norm": 1.1708256006240845, + "learning_rate": 8.738917013319463e-05, + "loss": 0.8796, + "step": 72380 + }, + { + "epoch": 0.4624790769584606, + "grad_norm": 0.6467223763465881, + "learning_rate": 8.738583848931938e-05, + "loss": 0.9981, + "step": 72390 + }, + { + "epoch": 0.4625429641081993, + "grad_norm": 0.9683516621589661, + "learning_rate": 8.73825064689364e-05, + "loss": 
0.9423, + "step": 72400 + }, + { + "epoch": 0.462606851257938, + "grad_norm": 1.3207199573516846, + "learning_rate": 8.737917407207922e-05, + "loss": 0.8087, + "step": 72410 + }, + { + "epoch": 0.4626707384076767, + "grad_norm": 0.6273083090782166, + "learning_rate": 8.737584129878145e-05, + "loss": 0.8718, + "step": 72420 + }, + { + "epoch": 0.4627346255574154, + "grad_norm": 0.6665430068969727, + "learning_rate": 8.73725081490766e-05, + "loss": 0.7988, + "step": 72430 + }, + { + "epoch": 0.4627985127071541, + "grad_norm": 0.5622584819793701, + "learning_rate": 8.736917462299827e-05, + "loss": 0.7487, + "step": 72440 + }, + { + "epoch": 0.46286239985689276, + "grad_norm": 2.063795566558838, + "learning_rate": 8.736584072058003e-05, + "loss": 0.828, + "step": 72450 + }, + { + "epoch": 0.46292628700663147, + "grad_norm": 0.960210919380188, + "learning_rate": 8.736250644185545e-05, + "loss": 0.9685, + "step": 72460 + }, + { + "epoch": 0.46299017415637017, + "grad_norm": 0.7123231291770935, + "learning_rate": 8.735917178685807e-05, + "loss": 0.8832, + "step": 72470 + }, + { + "epoch": 0.4630540613061089, + "grad_norm": 1.0152729749679565, + "learning_rate": 8.735583675562154e-05, + "loss": 0.8156, + "step": 72480 + }, + { + "epoch": 0.4631179484558476, + "grad_norm": 0.9183365106582642, + "learning_rate": 8.735250134817942e-05, + "loss": 0.8343, + "step": 72490 + }, + { + "epoch": 0.4631818356055863, + "grad_norm": 0.9543598890304565, + "learning_rate": 8.734916556456528e-05, + "loss": 0.7168, + "step": 72500 + }, + { + "epoch": 0.463245722755325, + "grad_norm": 0.6270715594291687, + "learning_rate": 8.734582940481275e-05, + "loss": 0.9325, + "step": 72510 + }, + { + "epoch": 0.4633096099050637, + "grad_norm": 1.1470279693603516, + "learning_rate": 8.73424928689554e-05, + "loss": 0.898, + "step": 72520 + }, + { + "epoch": 0.4633734970548024, + "grad_norm": 1.4141347408294678, + "learning_rate": 8.733915595702685e-05, + "loss": 1.1052, + "step": 72530 + }, + { + 
"epoch": 0.4634373842045411, + "grad_norm": 0.9898306727409363, + "learning_rate": 8.733581866906066e-05, + "loss": 1.1064, + "step": 72540 + }, + { + "epoch": 0.4635012713542798, + "grad_norm": 1.889242172241211, + "learning_rate": 8.733248100509052e-05, + "loss": 0.7938, + "step": 72550 + }, + { + "epoch": 0.4635651585040185, + "grad_norm": 1.8361594676971436, + "learning_rate": 8.732914296514998e-05, + "loss": 0.8884, + "step": 72560 + }, + { + "epoch": 0.4636290456537572, + "grad_norm": 1.3661187887191772, + "learning_rate": 8.732580454927267e-05, + "loss": 0.7056, + "step": 72570 + }, + { + "epoch": 0.4636929328034959, + "grad_norm": 0.8329597115516663, + "learning_rate": 8.732246575749223e-05, + "loss": 0.6071, + "step": 72580 + }, + { + "epoch": 0.4637568199532346, + "grad_norm": 0.803459107875824, + "learning_rate": 8.731912658984227e-05, + "loss": 0.8693, + "step": 72590 + }, + { + "epoch": 0.4638207071029733, + "grad_norm": 0.7911583185195923, + "learning_rate": 8.731578704635642e-05, + "loss": 0.7637, + "step": 72600 + }, + { + "epoch": 0.463884594252712, + "grad_norm": 0.9523131847381592, + "learning_rate": 8.73124471270683e-05, + "loss": 0.9817, + "step": 72610 + }, + { + "epoch": 0.4639484814024507, + "grad_norm": 0.9202901124954224, + "learning_rate": 8.730910683201157e-05, + "loss": 0.7682, + "step": 72620 + }, + { + "epoch": 0.4640123685521894, + "grad_norm": 0.8219014406204224, + "learning_rate": 8.730576616121984e-05, + "loss": 0.8304, + "step": 72630 + }, + { + "epoch": 0.4640762557019281, + "grad_norm": 0.653312087059021, + "learning_rate": 8.73024251147268e-05, + "loss": 0.6409, + "step": 72640 + }, + { + "epoch": 0.4641401428516668, + "grad_norm": 0.7120294570922852, + "learning_rate": 8.729908369256603e-05, + "loss": 0.7654, + "step": 72650 + }, + { + "epoch": 0.46420403000140553, + "grad_norm": 0.9074612259864807, + "learning_rate": 8.729574189477124e-05, + "loss": 0.9235, + "step": 72660 + }, + { + "epoch": 0.46426791715114424, + 
"grad_norm": 1.1278350353240967, + "learning_rate": 8.729239972137608e-05, + "loss": 0.8423, + "step": 72670 + }, + { + "epoch": 0.46433180430088294, + "grad_norm": 0.8770177960395813, + "learning_rate": 8.728905717241417e-05, + "loss": 0.7709, + "step": 72680 + }, + { + "epoch": 0.4643956914506216, + "grad_norm": 0.8250672817230225, + "learning_rate": 8.728571424791921e-05, + "loss": 0.8168, + "step": 72690 + }, + { + "epoch": 0.4644595786003603, + "grad_norm": 1.0635161399841309, + "learning_rate": 8.728237094792482e-05, + "loss": 0.9243, + "step": 72700 + }, + { + "epoch": 0.464523465750099, + "grad_norm": 1.3261529207229614, + "learning_rate": 8.727902727246473e-05, + "loss": 0.9011, + "step": 72710 + }, + { + "epoch": 0.4645873528998377, + "grad_norm": 0.8366795182228088, + "learning_rate": 8.727568322157259e-05, + "loss": 1.0456, + "step": 72720 + }, + { + "epoch": 0.4646512400495764, + "grad_norm": 1.3640772104263306, + "learning_rate": 8.727233879528204e-05, + "loss": 0.8938, + "step": 72730 + }, + { + "epoch": 0.4647151271993151, + "grad_norm": 0.7236993312835693, + "learning_rate": 8.72689939936268e-05, + "loss": 0.8754, + "step": 72740 + }, + { + "epoch": 0.46477901434905383, + "grad_norm": 0.9828342199325562, + "learning_rate": 8.726564881664056e-05, + "loss": 0.8948, + "step": 72750 + }, + { + "epoch": 0.46484290149879254, + "grad_norm": 0.7750747799873352, + "learning_rate": 8.7262303264357e-05, + "loss": 0.9002, + "step": 72760 + }, + { + "epoch": 0.46490678864853124, + "grad_norm": 1.2255038022994995, + "learning_rate": 8.725895733680983e-05, + "loss": 0.933, + "step": 72770 + }, + { + "epoch": 0.46497067579826995, + "grad_norm": 3.7937097549438477, + "learning_rate": 8.725561103403267e-05, + "loss": 0.8682, + "step": 72780 + }, + { + "epoch": 0.46503456294800866, + "grad_norm": 0.7408625483512878, + "learning_rate": 8.725226435605934e-05, + "loss": 0.644, + "step": 72790 + }, + { + "epoch": 0.46509845009774736, + "grad_norm": 0.8894087672233582, + 
"learning_rate": 8.724891730292344e-05, + "loss": 0.826, + "step": 72800 + }, + { + "epoch": 0.465162337247486, + "grad_norm": 1.360103964805603, + "learning_rate": 8.724556987465872e-05, + "loss": 0.9597, + "step": 72810 + }, + { + "epoch": 0.4652262243972247, + "grad_norm": 0.8977581858634949, + "learning_rate": 8.724222207129889e-05, + "loss": 0.7513, + "step": 72820 + }, + { + "epoch": 0.4652901115469634, + "grad_norm": 0.8301047086715698, + "learning_rate": 8.723887389287768e-05, + "loss": 0.7628, + "step": 72830 + }, + { + "epoch": 0.46535399869670213, + "grad_norm": 1.7645938396453857, + "learning_rate": 8.723552533942878e-05, + "loss": 0.8691, + "step": 72840 + }, + { + "epoch": 0.46541788584644084, + "grad_norm": 0.6487802267074585, + "learning_rate": 8.723217641098594e-05, + "loss": 0.7312, + "step": 72850 + }, + { + "epoch": 0.46548177299617954, + "grad_norm": 0.7930013537406921, + "learning_rate": 8.722882710758286e-05, + "loss": 0.9037, + "step": 72860 + }, + { + "epoch": 0.46554566014591825, + "grad_norm": 0.749622106552124, + "learning_rate": 8.722547742925328e-05, + "loss": 1.0156, + "step": 72870 + }, + { + "epoch": 0.46560954729565696, + "grad_norm": 0.8207896947860718, + "learning_rate": 8.722212737603095e-05, + "loss": 0.7249, + "step": 72880 + }, + { + "epoch": 0.46567343444539566, + "grad_norm": 2.454975128173828, + "learning_rate": 8.721877694794958e-05, + "loss": 0.823, + "step": 72890 + }, + { + "epoch": 0.46573732159513437, + "grad_norm": 0.46863147616386414, + "learning_rate": 8.721542614504294e-05, + "loss": 0.9426, + "step": 72900 + }, + { + "epoch": 0.4658012087448731, + "grad_norm": 1.0021847486495972, + "learning_rate": 8.721207496734476e-05, + "loss": 1.1263, + "step": 72910 + }, + { + "epoch": 0.4658650958946118, + "grad_norm": 0.6298357844352722, + "learning_rate": 8.720872341488879e-05, + "loss": 0.8613, + "step": 72920 + }, + { + "epoch": 0.4659289830443505, + "grad_norm": 1.0806231498718262, + "learning_rate": 
8.72053714877088e-05, + "loss": 0.7853, + "step": 72930 + }, + { + "epoch": 0.46599287019408914, + "grad_norm": 0.9028376936912537, + "learning_rate": 8.720201918583853e-05, + "loss": 0.8704, + "step": 72940 + }, + { + "epoch": 0.46605675734382784, + "grad_norm": 1.1052665710449219, + "learning_rate": 8.719866650931172e-05, + "loss": 0.7413, + "step": 72950 + }, + { + "epoch": 0.46612064449356655, + "grad_norm": 0.8171069025993347, + "learning_rate": 8.719531345816216e-05, + "loss": 0.9989, + "step": 72960 + }, + { + "epoch": 0.46618453164330526, + "grad_norm": 1.0400487184524536, + "learning_rate": 8.719196003242362e-05, + "loss": 0.8036, + "step": 72970 + }, + { + "epoch": 0.46624841879304396, + "grad_norm": 0.5930902361869812, + "learning_rate": 8.718860623212988e-05, + "loss": 0.9512, + "step": 72980 + }, + { + "epoch": 0.46631230594278267, + "grad_norm": 0.9061450958251953, + "learning_rate": 8.718525205731469e-05, + "loss": 0.909, + "step": 72990 + }, + { + "epoch": 0.4663761930925214, + "grad_norm": 0.9812560081481934, + "learning_rate": 8.718189750801184e-05, + "loss": 0.9485, + "step": 73000 + }, + { + "epoch": 0.4664400802422601, + "grad_norm": 1.0964970588684082, + "learning_rate": 8.717854258425512e-05, + "loss": 0.6829, + "step": 73010 + }, + { + "epoch": 0.4665039673919988, + "grad_norm": 0.513983964920044, + "learning_rate": 8.717518728607832e-05, + "loss": 0.7848, + "step": 73020 + }, + { + "epoch": 0.4665678545417375, + "grad_norm": 0.647631824016571, + "learning_rate": 8.71718316135152e-05, + "loss": 1.0222, + "step": 73030 + }, + { + "epoch": 0.4666317416914762, + "grad_norm": 0.7781062126159668, + "learning_rate": 8.716847556659961e-05, + "loss": 0.767, + "step": 73040 + }, + { + "epoch": 0.4666956288412149, + "grad_norm": 0.9073989987373352, + "learning_rate": 8.71651191453653e-05, + "loss": 0.7717, + "step": 73050 + }, + { + "epoch": 0.46675951599095356, + "grad_norm": 1.8708226680755615, + "learning_rate": 8.71617623498461e-05, + "loss": 
0.902, + "step": 73060 + }, + { + "epoch": 0.46682340314069226, + "grad_norm": 0.9808720350265503, + "learning_rate": 8.715840518007578e-05, + "loss": 0.9002, + "step": 73070 + }, + { + "epoch": 0.46688729029043097, + "grad_norm": 0.9100602865219116, + "learning_rate": 8.715504763608818e-05, + "loss": 1.0678, + "step": 73080 + }, + { + "epoch": 0.4669511774401697, + "grad_norm": 0.6987651586532593, + "learning_rate": 8.71516897179171e-05, + "loss": 1.1864, + "step": 73090 + }, + { + "epoch": 0.4670150645899084, + "grad_norm": 0.9975560903549194, + "learning_rate": 8.714833142559637e-05, + "loss": 1.4101, + "step": 73100 + }, + { + "epoch": 0.4670789517396471, + "grad_norm": 1.2323815822601318, + "learning_rate": 8.714497275915982e-05, + "loss": 0.81, + "step": 73110 + }, + { + "epoch": 0.4671428388893858, + "grad_norm": 1.1183509826660156, + "learning_rate": 8.714161371864124e-05, + "loss": 0.8636, + "step": 73120 + }, + { + "epoch": 0.4672067260391245, + "grad_norm": 0.9153540730476379, + "learning_rate": 8.71382543040745e-05, + "loss": 0.9229, + "step": 73130 + }, + { + "epoch": 0.4672706131888632, + "grad_norm": 0.6757118701934814, + "learning_rate": 8.71348945154934e-05, + "loss": 0.8663, + "step": 73140 + }, + { + "epoch": 0.4673345003386019, + "grad_norm": 0.9965721964836121, + "learning_rate": 8.713153435293178e-05, + "loss": 0.9432, + "step": 73150 + }, + { + "epoch": 0.4673983874883406, + "grad_norm": 3.0663352012634277, + "learning_rate": 8.712817381642348e-05, + "loss": 0.9173, + "step": 73160 + }, + { + "epoch": 0.4674622746380793, + "grad_norm": 0.8566670417785645, + "learning_rate": 8.712481290600235e-05, + "loss": 0.8049, + "step": 73170 + }, + { + "epoch": 0.467526161787818, + "grad_norm": 0.6621735095977783, + "learning_rate": 8.712145162170224e-05, + "loss": 1.0432, + "step": 73180 + }, + { + "epoch": 0.4675900489375567, + "grad_norm": 0.7513931393623352, + "learning_rate": 8.7118089963557e-05, + "loss": 0.999, + "step": 73190 + }, + { + "epoch": 
0.4676539360872954, + "grad_norm": 0.6250850558280945, + "learning_rate": 8.711472793160049e-05, + "loss": 1.0574, + "step": 73200 + }, + { + "epoch": 0.4677178232370341, + "grad_norm": 1.0595519542694092, + "learning_rate": 8.711136552586655e-05, + "loss": 0.8253, + "step": 73210 + }, + { + "epoch": 0.4677817103867728, + "grad_norm": 0.5927673578262329, + "learning_rate": 8.71083390411543e-05, + "loss": 0.9964, + "step": 73220 + }, + { + "epoch": 0.4678455975365115, + "grad_norm": 0.9299998879432678, + "learning_rate": 8.710497592533657e-05, + "loss": 0.8753, + "step": 73230 + }, + { + "epoch": 0.4679094846862502, + "grad_norm": 0.6862097978591919, + "learning_rate": 8.710161243583962e-05, + "loss": 0.7737, + "step": 73240 + }, + { + "epoch": 0.4679733718359889, + "grad_norm": 0.6349765062332153, + "learning_rate": 8.709824857269732e-05, + "loss": 0.7059, + "step": 73250 + }, + { + "epoch": 0.4680372589857276, + "grad_norm": 0.7105598449707031, + "learning_rate": 8.709488433594359e-05, + "loss": 0.9881, + "step": 73260 + }, + { + "epoch": 0.46810114613546633, + "grad_norm": 0.836338996887207, + "learning_rate": 8.709151972561228e-05, + "loss": 0.8385, + "step": 73270 + }, + { + "epoch": 0.46816503328520503, + "grad_norm": 0.8248547911643982, + "learning_rate": 8.708815474173728e-05, + "loss": 0.8845, + "step": 73280 + }, + { + "epoch": 0.46822892043494374, + "grad_norm": 3.468738079071045, + "learning_rate": 8.708478938435246e-05, + "loss": 1.0441, + "step": 73290 + }, + { + "epoch": 0.4682928075846824, + "grad_norm": 0.9611918330192566, + "learning_rate": 8.708142365349173e-05, + "loss": 1.1622, + "step": 73300 + }, + { + "epoch": 0.4683566947344211, + "grad_norm": 1.594110369682312, + "learning_rate": 8.7078057549189e-05, + "loss": 0.7014, + "step": 73310 + }, + { + "epoch": 0.4684205818841598, + "grad_norm": 0.8596274852752686, + "learning_rate": 8.707469107147815e-05, + "loss": 0.9094, + "step": 73320 + }, + { + "epoch": 0.4684844690338985, + "grad_norm": 
0.673202395439148, + "learning_rate": 8.707132422039305e-05, + "loss": 1.0132, + "step": 73330 + }, + { + "epoch": 0.4685483561836372, + "grad_norm": 0.6166740655899048, + "learning_rate": 8.706795699596769e-05, + "loss": 0.8095, + "step": 73340 + }, + { + "epoch": 0.4686122433333759, + "grad_norm": 0.7982991337776184, + "learning_rate": 8.706458939823592e-05, + "loss": 0.8268, + "step": 73350 + }, + { + "epoch": 0.46867613048311463, + "grad_norm": 2.1832122802734375, + "learning_rate": 8.706122142723167e-05, + "loss": 0.9482, + "step": 73360 + }, + { + "epoch": 0.46874001763285333, + "grad_norm": 0.5912348031997681, + "learning_rate": 8.705785308298886e-05, + "loss": 0.9366, + "step": 73370 + }, + { + "epoch": 0.46880390478259204, + "grad_norm": 0.9966716766357422, + "learning_rate": 8.705448436554139e-05, + "loss": 1.1344, + "step": 73380 + }, + { + "epoch": 0.46886779193233075, + "grad_norm": 0.6441813111305237, + "learning_rate": 8.705111527492322e-05, + "loss": 1.0889, + "step": 73390 + }, + { + "epoch": 0.46893167908206945, + "grad_norm": 0.8401795029640198, + "learning_rate": 8.704774581116827e-05, + "loss": 1.0559, + "step": 73400 + }, + { + "epoch": 0.46899556623180816, + "grad_norm": 0.7190397381782532, + "learning_rate": 8.704437597431047e-05, + "loss": 0.9705, + "step": 73410 + }, + { + "epoch": 0.4690594533815468, + "grad_norm": 0.8827881217002869, + "learning_rate": 8.704100576438374e-05, + "loss": 0.6985, + "step": 73420 + }, + { + "epoch": 0.4691233405312855, + "grad_norm": 1.544293999671936, + "learning_rate": 8.703763518142205e-05, + "loss": 0.9164, + "step": 73430 + }, + { + "epoch": 0.4691872276810242, + "grad_norm": 1.1441346406936646, + "learning_rate": 8.703426422545934e-05, + "loss": 1.31, + "step": 73440 + }, + { + "epoch": 0.4692511148307629, + "grad_norm": 0.8129305243492126, + "learning_rate": 8.703089289652954e-05, + "loss": 0.8406, + "step": 73450 + }, + { + "epoch": 0.46931500198050163, + "grad_norm": 0.8427706956863403, + 
"learning_rate": 8.70275211946666e-05, + "loss": 0.873, + "step": 73460 + }, + { + "epoch": 0.46937888913024034, + "grad_norm": 1.3682218790054321, + "learning_rate": 8.70241491199045e-05, + "loss": 0.7712, + "step": 73470 + }, + { + "epoch": 0.46944277627997905, + "grad_norm": 0.7666106820106506, + "learning_rate": 8.70207766722772e-05, + "loss": 0.8279, + "step": 73480 + }, + { + "epoch": 0.46950666342971775, + "grad_norm": 0.8763406276702881, + "learning_rate": 8.701740385181863e-05, + "loss": 0.7302, + "step": 73490 + }, + { + "epoch": 0.46957055057945646, + "grad_norm": 0.8393523693084717, + "learning_rate": 8.70140306585628e-05, + "loss": 0.6908, + "step": 73500 + }, + { + "epoch": 0.46963443772919516, + "grad_norm": 0.8529371619224548, + "learning_rate": 8.701065709254363e-05, + "loss": 0.7582, + "step": 73510 + }, + { + "epoch": 0.46969832487893387, + "grad_norm": 0.9004521369934082, + "learning_rate": 8.700728315379515e-05, + "loss": 0.8145, + "step": 73520 + }, + { + "epoch": 0.4697622120286726, + "grad_norm": 1.3330974578857422, + "learning_rate": 8.70039088423513e-05, + "loss": 0.7189, + "step": 73530 + }, + { + "epoch": 0.4698260991784112, + "grad_norm": 0.6903400421142578, + "learning_rate": 8.700053415824608e-05, + "loss": 1.0698, + "step": 73540 + }, + { + "epoch": 0.46988998632814993, + "grad_norm": 0.8569963574409485, + "learning_rate": 8.699715910151347e-05, + "loss": 0.8589, + "step": 73550 + }, + { + "epoch": 0.46995387347788864, + "grad_norm": 1.7143669128417969, + "learning_rate": 8.699378367218747e-05, + "loss": 0.9826, + "step": 73560 + }, + { + "epoch": 0.47001776062762735, + "grad_norm": 0.8588539361953735, + "learning_rate": 8.699040787030205e-05, + "loss": 0.7637, + "step": 73570 + }, + { + "epoch": 0.47008164777736605, + "grad_norm": 0.7576454877853394, + "learning_rate": 8.698703169589122e-05, + "loss": 0.8103, + "step": 73580 + }, + { + "epoch": 0.47014553492710476, + "grad_norm": 1.1570996046066284, + "learning_rate": 
8.698365514898899e-05, + "loss": 0.9776, + "step": 73590 + }, + { + "epoch": 0.47020942207684346, + "grad_norm": 0.5057058334350586, + "learning_rate": 8.698027822962937e-05, + "loss": 0.9414, + "step": 73600 + }, + { + "epoch": 0.47027330922658217, + "grad_norm": 0.6375735402107239, + "learning_rate": 8.697690093784634e-05, + "loss": 0.7779, + "step": 73610 + }, + { + "epoch": 0.4703371963763209, + "grad_norm": 0.9835091829299927, + "learning_rate": 8.697352327367391e-05, + "loss": 0.8034, + "step": 73620 + }, + { + "epoch": 0.4704010835260596, + "grad_norm": 0.5068366527557373, + "learning_rate": 8.697014523714615e-05, + "loss": 0.7921, + "step": 73630 + }, + { + "epoch": 0.4704649706757983, + "grad_norm": 0.646186351776123, + "learning_rate": 8.696676682829704e-05, + "loss": 0.7929, + "step": 73640 + }, + { + "epoch": 0.470528857825537, + "grad_norm": 0.6051701307296753, + "learning_rate": 8.696338804716058e-05, + "loss": 0.7974, + "step": 73650 + }, + { + "epoch": 0.4705927449752757, + "grad_norm": 0.7829045653343201, + "learning_rate": 8.696000889377085e-05, + "loss": 0.8989, + "step": 73660 + }, + { + "epoch": 0.47065663212501435, + "grad_norm": 0.8022125959396362, + "learning_rate": 8.695662936816185e-05, + "loss": 0.9278, + "step": 73670 + }, + { + "epoch": 0.47072051927475306, + "grad_norm": 1.3843055963516235, + "learning_rate": 8.69532494703676e-05, + "loss": 0.7795, + "step": 73680 + }, + { + "epoch": 0.47078440642449176, + "grad_norm": 0.8143162727355957, + "learning_rate": 8.694986920042218e-05, + "loss": 0.8787, + "step": 73690 + }, + { + "epoch": 0.47084829357423047, + "grad_norm": 0.9179696440696716, + "learning_rate": 8.694648855835961e-05, + "loss": 0.8331, + "step": 73700 + }, + { + "epoch": 0.4709121807239692, + "grad_norm": 0.7905839085578918, + "learning_rate": 8.694310754421393e-05, + "loss": 0.8672, + "step": 73710 + }, + { + "epoch": 0.4709760678737079, + "grad_norm": 0.642015278339386, + "learning_rate": 8.69397261580192e-05, + "loss": 
0.6751, + "step": 73720 + }, + { + "epoch": 0.4710399550234466, + "grad_norm": 0.7508492469787598, + "learning_rate": 8.693634439980946e-05, + "loss": 1.0497, + "step": 73730 + }, + { + "epoch": 0.4711038421731853, + "grad_norm": 0.7233025431632996, + "learning_rate": 8.693296226961879e-05, + "loss": 0.9594, + "step": 73740 + }, + { + "epoch": 0.471167729322924, + "grad_norm": 0.9499550461769104, + "learning_rate": 8.692957976748124e-05, + "loss": 1.0151, + "step": 73750 + }, + { + "epoch": 0.4712316164726627, + "grad_norm": 2.5865375995635986, + "learning_rate": 8.692619689343087e-05, + "loss": 0.8826, + "step": 73760 + }, + { + "epoch": 0.4712955036224014, + "grad_norm": 1.2711101770401, + "learning_rate": 8.692281364750174e-05, + "loss": 1.1665, + "step": 73770 + }, + { + "epoch": 0.4713593907721401, + "grad_norm": 2.8341193199157715, + "learning_rate": 8.691943002972794e-05, + "loss": 0.7414, + "step": 73780 + }, + { + "epoch": 0.47142327792187877, + "grad_norm": 1.014237880706787, + "learning_rate": 8.691604604014355e-05, + "loss": 1.0644, + "step": 73790 + }, + { + "epoch": 0.4714871650716175, + "grad_norm": 0.9451431632041931, + "learning_rate": 8.691266167878263e-05, + "loss": 0.9587, + "step": 73800 + }, + { + "epoch": 0.4715510522213562, + "grad_norm": 0.7285395264625549, + "learning_rate": 8.690927694567927e-05, + "loss": 0.8145, + "step": 73810 + }, + { + "epoch": 0.4716149393710949, + "grad_norm": 0.9039714932441711, + "learning_rate": 8.690589184086758e-05, + "loss": 0.9801, + "step": 73820 + }, + { + "epoch": 0.4716788265208336, + "grad_norm": 1.3245606422424316, + "learning_rate": 8.690250636438161e-05, + "loss": 0.8307, + "step": 73830 + }, + { + "epoch": 0.4717427136705723, + "grad_norm": 0.8605784773826599, + "learning_rate": 8.689912051625549e-05, + "loss": 0.802, + "step": 73840 + }, + { + "epoch": 0.471806600820311, + "grad_norm": 0.8803051710128784, + "learning_rate": 8.689573429652329e-05, + "loss": 0.9706, + "step": 73850 + }, + { + 
"epoch": 0.4718704879700497, + "grad_norm": 1.14476478099823, + "learning_rate": 8.689234770521913e-05, + "loss": 1.1188, + "step": 73860 + }, + { + "epoch": 0.4719343751197884, + "grad_norm": 0.6339378356933594, + "learning_rate": 8.688896074237712e-05, + "loss": 1.0567, + "step": 73870 + }, + { + "epoch": 0.4719982622695271, + "grad_norm": 1.278977394104004, + "learning_rate": 8.688557340803135e-05, + "loss": 1.0485, + "step": 73880 + }, + { + "epoch": 0.47206214941926583, + "grad_norm": 0.6915751695632935, + "learning_rate": 8.688218570221596e-05, + "loss": 1.0948, + "step": 73890 + }, + { + "epoch": 0.47212603656900454, + "grad_norm": 0.6988540887832642, + "learning_rate": 8.687879762496504e-05, + "loss": 1.1413, + "step": 73900 + }, + { + "epoch": 0.4721899237187432, + "grad_norm": 0.9477376341819763, + "learning_rate": 8.687540917631273e-05, + "loss": 1.0465, + "step": 73910 + }, + { + "epoch": 0.4722538108684819, + "grad_norm": 0.8210738301277161, + "learning_rate": 8.687202035629314e-05, + "loss": 0.9296, + "step": 73920 + }, + { + "epoch": 0.4723176980182206, + "grad_norm": 1.0816015005111694, + "learning_rate": 8.686863116494042e-05, + "loss": 1.1211, + "step": 73930 + }, + { + "epoch": 0.4723815851679593, + "grad_norm": 0.5747155547142029, + "learning_rate": 8.686524160228867e-05, + "loss": 0.9563, + "step": 73940 + }, + { + "epoch": 0.472445472317698, + "grad_norm": 0.9445788860321045, + "learning_rate": 8.686185166837206e-05, + "loss": 1.0368, + "step": 73950 + }, + { + "epoch": 0.4725093594674367, + "grad_norm": 0.9299923181533813, + "learning_rate": 8.685846136322471e-05, + "loss": 0.8771, + "step": 73960 + }, + { + "epoch": 0.4725732466171754, + "grad_norm": 0.8922392725944519, + "learning_rate": 8.685507068688075e-05, + "loss": 0.7732, + "step": 73970 + }, + { + "epoch": 0.47263713376691413, + "grad_norm": 1.0317169427871704, + "learning_rate": 8.685167963937437e-05, + "loss": 0.7212, + "step": 73980 + }, + { + "epoch": 0.47270102091665284, + 
"grad_norm": 0.6838691234588623, + "learning_rate": 8.684828822073967e-05, + "loss": 0.9596, + "step": 73990 + }, + { + "epoch": 0.47276490806639154, + "grad_norm": 0.93050616979599, + "learning_rate": 8.684489643101085e-05, + "loss": 0.9891, + "step": 74000 + }, + { + "epoch": 0.47282879521613025, + "grad_norm": 0.7852534651756287, + "learning_rate": 8.684150427022205e-05, + "loss": 0.8776, + "step": 74010 + }, + { + "epoch": 0.47289268236586895, + "grad_norm": 1.1245160102844238, + "learning_rate": 8.683811173840741e-05, + "loss": 1.0402, + "step": 74020 + }, + { + "epoch": 0.4729565695156076, + "grad_norm": 1.0451771020889282, + "learning_rate": 8.683471883560113e-05, + "loss": 0.8141, + "step": 74030 + }, + { + "epoch": 0.4730204566653463, + "grad_norm": 0.8366501331329346, + "learning_rate": 8.683132556183735e-05, + "loss": 0.9944, + "step": 74040 + }, + { + "epoch": 0.473084343815085, + "grad_norm": 1.0142920017242432, + "learning_rate": 8.682793191715027e-05, + "loss": 0.8935, + "step": 74050 + }, + { + "epoch": 0.4731482309648237, + "grad_norm": 0.7233960032463074, + "learning_rate": 8.682453790157405e-05, + "loss": 0.9414, + "step": 74060 + }, + { + "epoch": 0.47321211811456243, + "grad_norm": 0.8871427178382874, + "learning_rate": 8.682114351514287e-05, + "loss": 0.8502, + "step": 74070 + }, + { + "epoch": 0.47327600526430114, + "grad_norm": 1.854498028755188, + "learning_rate": 8.681774875789095e-05, + "loss": 1.076, + "step": 74080 + }, + { + "epoch": 0.47333989241403984, + "grad_norm": 0.7415865063667297, + "learning_rate": 8.681435362985242e-05, + "loss": 0.8037, + "step": 74090 + }, + { + "epoch": 0.47340377956377855, + "grad_norm": 0.7761117815971375, + "learning_rate": 8.68109581310615e-05, + "loss": 0.7338, + "step": 74100 + }, + { + "epoch": 0.47346766671351725, + "grad_norm": 1.1805341243743896, + "learning_rate": 8.68075622615524e-05, + "loss": 1.1164, + "step": 74110 + }, + { + "epoch": 0.47353155386325596, + "grad_norm": 0.7617985606193542, + 
"learning_rate": 8.680416602135929e-05, + "loss": 1.064, + "step": 74120 + }, + { + "epoch": 0.47359544101299467, + "grad_norm": 0.5913506746292114, + "learning_rate": 8.68007694105164e-05, + "loss": 1.0071, + "step": 74130 + }, + { + "epoch": 0.4736593281627334, + "grad_norm": 1.0599095821380615, + "learning_rate": 8.679737242905792e-05, + "loss": 0.8347, + "step": 74140 + }, + { + "epoch": 0.473723215312472, + "grad_norm": 1.3343327045440674, + "learning_rate": 8.679397507701806e-05, + "loss": 1.1742, + "step": 74150 + }, + { + "epoch": 0.47378710246221073, + "grad_norm": 0.5925554037094116, + "learning_rate": 8.679057735443104e-05, + "loss": 0.9706, + "step": 74160 + }, + { + "epoch": 0.47385098961194944, + "grad_norm": 0.764336109161377, + "learning_rate": 8.678717926133109e-05, + "loss": 1.2481, + "step": 74170 + }, + { + "epoch": 0.47391487676168814, + "grad_norm": 0.8975499868392944, + "learning_rate": 8.678378079775241e-05, + "loss": 0.9182, + "step": 74180 + }, + { + "epoch": 0.47397876391142685, + "grad_norm": 0.7242470383644104, + "learning_rate": 8.678038196372925e-05, + "loss": 0.7105, + "step": 74190 + }, + { + "epoch": 0.47404265106116555, + "grad_norm": 0.8966102004051208, + "learning_rate": 8.67769827592958e-05, + "loss": 0.9065, + "step": 74200 + }, + { + "epoch": 0.47410653821090426, + "grad_norm": 0.5318197011947632, + "learning_rate": 8.677358318448633e-05, + "loss": 0.8434, + "step": 74210 + }, + { + "epoch": 0.47417042536064297, + "grad_norm": 1.834756851196289, + "learning_rate": 8.677018323933505e-05, + "loss": 1.1204, + "step": 74220 + }, + { + "epoch": 0.4742343125103817, + "grad_norm": 1.0717896223068237, + "learning_rate": 8.676678292387623e-05, + "loss": 0.8128, + "step": 74230 + }, + { + "epoch": 0.4742981996601204, + "grad_norm": 0.7135387063026428, + "learning_rate": 8.67633822381441e-05, + "loss": 0.8956, + "step": 74240 + }, + { + "epoch": 0.4743620868098591, + "grad_norm": 1.2037732601165771, + "learning_rate": 
8.675998118217289e-05, + "loss": 0.833, + "step": 74250 + }, + { + "epoch": 0.4744259739595978, + "grad_norm": 1.4196306467056274, + "learning_rate": 8.675657975599688e-05, + "loss": 1.05, + "step": 74260 + }, + { + "epoch": 0.47448986110933644, + "grad_norm": 0.9658291339874268, + "learning_rate": 8.675317795965031e-05, + "loss": 0.6996, + "step": 74270 + }, + { + "epoch": 0.47455374825907515, + "grad_norm": 1.4640781879425049, + "learning_rate": 8.674977579316745e-05, + "loss": 0.704, + "step": 74280 + }, + { + "epoch": 0.47461763540881385, + "grad_norm": 0.7206962704658508, + "learning_rate": 8.674637325658254e-05, + "loss": 0.6542, + "step": 74290 + }, + { + "epoch": 0.47468152255855256, + "grad_norm": 0.9174501895904541, + "learning_rate": 8.674297034992986e-05, + "loss": 0.904, + "step": 74300 + }, + { + "epoch": 0.47474540970829127, + "grad_norm": 0.7845925092697144, + "learning_rate": 8.673956707324369e-05, + "loss": 0.9329, + "step": 74310 + }, + { + "epoch": 0.47480929685802997, + "grad_norm": 1.1023668050765991, + "learning_rate": 8.67361634265583e-05, + "loss": 0.9712, + "step": 74320 + }, + { + "epoch": 0.4748731840077687, + "grad_norm": 1.1444423198699951, + "learning_rate": 8.673275940990796e-05, + "loss": 0.8582, + "step": 74330 + }, + { + "epoch": 0.4749370711575074, + "grad_norm": 0.9823821783065796, + "learning_rate": 8.672935502332696e-05, + "loss": 0.9597, + "step": 74340 + }, + { + "epoch": 0.4750009583072461, + "grad_norm": 1.3823814392089844, + "learning_rate": 8.672595026684955e-05, + "loss": 0.8345, + "step": 74350 + }, + { + "epoch": 0.4750648454569848, + "grad_norm": 1.1680278778076172, + "learning_rate": 8.672254514051009e-05, + "loss": 1.0284, + "step": 74360 + }, + { + "epoch": 0.4751287326067235, + "grad_norm": 0.6968647241592407, + "learning_rate": 8.67191396443428e-05, + "loss": 0.7296, + "step": 74370 + }, + { + "epoch": 0.4751926197564622, + "grad_norm": 1.0131860971450806, + "learning_rate": 8.671573377838202e-05, + "loss": 
0.9332, + "step": 74380 + }, + { + "epoch": 0.47525650690620086, + "grad_norm": 1.0270569324493408, + "learning_rate": 8.671232754266203e-05, + "loss": 0.7498, + "step": 74390 + }, + { + "epoch": 0.47532039405593957, + "grad_norm": 0.8761411905288696, + "learning_rate": 8.670892093721715e-05, + "loss": 0.729, + "step": 74400 + }, + { + "epoch": 0.47538428120567827, + "grad_norm": 0.9730551838874817, + "learning_rate": 8.670551396208168e-05, + "loss": 0.8988, + "step": 74410 + }, + { + "epoch": 0.475448168355417, + "grad_norm": 0.7609050869941711, + "learning_rate": 8.670210661728992e-05, + "loss": 0.965, + "step": 74420 + }, + { + "epoch": 0.4755120555051557, + "grad_norm": 0.9374824166297913, + "learning_rate": 8.669869890287621e-05, + "loss": 0.8523, + "step": 74430 + }, + { + "epoch": 0.4755759426548944, + "grad_norm": 0.9228322505950928, + "learning_rate": 8.669529081887484e-05, + "loss": 1.0975, + "step": 74440 + }, + { + "epoch": 0.4756398298046331, + "grad_norm": 0.8603367209434509, + "learning_rate": 8.669188236532013e-05, + "loss": 0.8295, + "step": 74450 + }, + { + "epoch": 0.4757037169543718, + "grad_norm": 0.9186978936195374, + "learning_rate": 8.668847354224645e-05, + "loss": 0.9944, + "step": 74460 + }, + { + "epoch": 0.4757676041041105, + "grad_norm": 0.8278791904449463, + "learning_rate": 8.668506434968808e-05, + "loss": 1.077, + "step": 74470 + }, + { + "epoch": 0.4758314912538492, + "grad_norm": 1.0060932636260986, + "learning_rate": 8.66816547876794e-05, + "loss": 1.1607, + "step": 74480 + }, + { + "epoch": 0.4758953784035879, + "grad_norm": 1.0081162452697754, + "learning_rate": 8.667824485625471e-05, + "loss": 0.8542, + "step": 74490 + }, + { + "epoch": 0.4759592655533266, + "grad_norm": 1.362919569015503, + "learning_rate": 8.667483455544835e-05, + "loss": 1.1144, + "step": 74500 + }, + { + "epoch": 0.47602315270306533, + "grad_norm": 0.8849195241928101, + "learning_rate": 8.667142388529467e-05, + "loss": 1.0793, + "step": 74510 + }, + { + 
"epoch": 0.476087039852804, + "grad_norm": 1.2261072397232056, + "learning_rate": 8.666801284582806e-05, + "loss": 0.6733, + "step": 74520 + }, + { + "epoch": 0.4761509270025427, + "grad_norm": 0.8894041180610657, + "learning_rate": 8.666460143708283e-05, + "loss": 0.8456, + "step": 74530 + }, + { + "epoch": 0.4762148141522814, + "grad_norm": 0.7797572016716003, + "learning_rate": 8.666118965909334e-05, + "loss": 0.7103, + "step": 74540 + }, + { + "epoch": 0.4762787013020201, + "grad_norm": 0.8391841650009155, + "learning_rate": 8.665777751189395e-05, + "loss": 0.9627, + "step": 74550 + }, + { + "epoch": 0.4763425884517588, + "grad_norm": 0.9427254796028137, + "learning_rate": 8.665436499551903e-05, + "loss": 0.8994, + "step": 74560 + }, + { + "epoch": 0.4764064756014975, + "grad_norm": 0.9323469996452332, + "learning_rate": 8.665095211000293e-05, + "loss": 0.9536, + "step": 74570 + }, + { + "epoch": 0.4764703627512362, + "grad_norm": 0.731502890586853, + "learning_rate": 8.664753885538005e-05, + "loss": 0.9477, + "step": 74580 + }, + { + "epoch": 0.4765342499009749, + "grad_norm": 0.7330303192138672, + "learning_rate": 8.664412523168474e-05, + "loss": 0.7534, + "step": 74590 + }, + { + "epoch": 0.47659813705071363, + "grad_norm": 1.0151233673095703, + "learning_rate": 8.664071123895138e-05, + "loss": 1.1555, + "step": 74600 + }, + { + "epoch": 0.47666202420045234, + "grad_norm": 0.7544573545455933, + "learning_rate": 8.663729687721439e-05, + "loss": 0.8015, + "step": 74610 + }, + { + "epoch": 0.47672591135019105, + "grad_norm": 0.5822036862373352, + "learning_rate": 8.66338821465081e-05, + "loss": 0.7977, + "step": 74620 + }, + { + "epoch": 0.47678979849992975, + "grad_norm": 1.627901554107666, + "learning_rate": 8.663046704686692e-05, + "loss": 0.9961, + "step": 74630 + }, + { + "epoch": 0.4768536856496684, + "grad_norm": 0.9120510220527649, + "learning_rate": 8.662705157832527e-05, + "loss": 0.9101, + "step": 74640 + }, + { + "epoch": 0.4769175727994071, + 
"grad_norm": 1.2490442991256714, + "learning_rate": 8.662363574091752e-05, + "loss": 0.9137, + "step": 74650 + }, + { + "epoch": 0.4769814599491458, + "grad_norm": 0.5486982464790344, + "learning_rate": 8.662021953467806e-05, + "loss": 1.0668, + "step": 74660 + }, + { + "epoch": 0.4770453470988845, + "grad_norm": 1.0078871250152588, + "learning_rate": 8.661680295964131e-05, + "loss": 0.8362, + "step": 74670 + }, + { + "epoch": 0.4771092342486232, + "grad_norm": 1.1747907400131226, + "learning_rate": 8.661338601584168e-05, + "loss": 0.6677, + "step": 74680 + }, + { + "epoch": 0.47717312139836193, + "grad_norm": 1.0243124961853027, + "learning_rate": 8.660996870331357e-05, + "loss": 0.9675, + "step": 74690 + }, + { + "epoch": 0.47723700854810064, + "grad_norm": 0.8121140599250793, + "learning_rate": 8.66065510220914e-05, + "loss": 0.7801, + "step": 74700 + }, + { + "epoch": 0.47730089569783934, + "grad_norm": 1.2878518104553223, + "learning_rate": 8.660313297220962e-05, + "loss": 0.8109, + "step": 74710 + }, + { + "epoch": 0.47736478284757805, + "grad_norm": 0.9882553219795227, + "learning_rate": 8.65997145537026e-05, + "loss": 0.8357, + "step": 74720 + }, + { + "epoch": 0.47742866999731676, + "grad_norm": 0.9014390110969543, + "learning_rate": 8.659629576660479e-05, + "loss": 1.0185, + "step": 74730 + }, + { + "epoch": 0.47749255714705546, + "grad_norm": 0.8646599054336548, + "learning_rate": 8.659287661095063e-05, + "loss": 0.8571, + "step": 74740 + }, + { + "epoch": 0.47755644429679417, + "grad_norm": 0.6751865744590759, + "learning_rate": 8.658945708677455e-05, + "loss": 0.8639, + "step": 74750 + }, + { + "epoch": 0.4776203314465328, + "grad_norm": 0.6493138074874878, + "learning_rate": 8.658603719411098e-05, + "loss": 0.9801, + "step": 74760 + }, + { + "epoch": 0.4776842185962715, + "grad_norm": 0.7330247163772583, + "learning_rate": 8.658261693299436e-05, + "loss": 0.9075, + "step": 74770 + }, + { + "epoch": 0.47774810574601023, + "grad_norm": 
0.8450262546539307, + "learning_rate": 8.657919630345914e-05, + "loss": 0.9218, + "step": 74780 + }, + { + "epoch": 0.47781199289574894, + "grad_norm": 1.399348258972168, + "learning_rate": 8.657577530553977e-05, + "loss": 0.987, + "step": 74790 + }, + { + "epoch": 0.47787588004548764, + "grad_norm": 0.6834306716918945, + "learning_rate": 8.65723539392707e-05, + "loss": 1.0965, + "step": 74800 + }, + { + "epoch": 0.47793976719522635, + "grad_norm": 0.8273354768753052, + "learning_rate": 8.656893220468638e-05, + "loss": 0.9514, + "step": 74810 + }, + { + "epoch": 0.47800365434496506, + "grad_norm": 0.5543147325515747, + "learning_rate": 8.656551010182128e-05, + "loss": 0.8666, + "step": 74820 + }, + { + "epoch": 0.47806754149470376, + "grad_norm": 1.8543487787246704, + "learning_rate": 8.656208763070986e-05, + "loss": 0.8342, + "step": 74830 + }, + { + "epoch": 0.47813142864444247, + "grad_norm": 0.9043295383453369, + "learning_rate": 8.655866479138659e-05, + "loss": 1.0627, + "step": 74840 + }, + { + "epoch": 0.4781953157941812, + "grad_norm": 3.4877755641937256, + "learning_rate": 8.655524158388595e-05, + "loss": 0.9094, + "step": 74850 + }, + { + "epoch": 0.4782592029439199, + "grad_norm": 0.6816970109939575, + "learning_rate": 8.655181800824237e-05, + "loss": 0.6711, + "step": 74860 + }, + { + "epoch": 0.4783230900936586, + "grad_norm": 1.15105402469635, + "learning_rate": 8.654839406449037e-05, + "loss": 0.8039, + "step": 74870 + }, + { + "epoch": 0.47838697724339724, + "grad_norm": 0.43395039439201355, + "learning_rate": 8.654496975266445e-05, + "loss": 0.8421, + "step": 74880 + }, + { + "epoch": 0.47845086439313594, + "grad_norm": 0.7869691252708435, + "learning_rate": 8.654154507279904e-05, + "loss": 0.8841, + "step": 74890 + }, + { + "epoch": 0.47851475154287465, + "grad_norm": 1.013023853302002, + "learning_rate": 8.653812002492867e-05, + "loss": 1.1382, + "step": 74900 + }, + { + "epoch": 0.47857863869261336, + "grad_norm": 0.7388662099838257, + 
"learning_rate": 8.653469460908783e-05, + "loss": 0.8478, + "step": 74910 + }, + { + "epoch": 0.47864252584235206, + "grad_norm": 0.8700296878814697, + "learning_rate": 8.6531268825311e-05, + "loss": 0.8433, + "step": 74920 + }, + { + "epoch": 0.47870641299209077, + "grad_norm": 0.7805728912353516, + "learning_rate": 8.652784267363268e-05, + "loss": 0.8564, + "step": 74930 + }, + { + "epoch": 0.4787703001418295, + "grad_norm": 1.0190261602401733, + "learning_rate": 8.652441615408739e-05, + "loss": 0.7729, + "step": 74940 + }, + { + "epoch": 0.4788341872915682, + "grad_norm": 0.9089486002922058, + "learning_rate": 8.652098926670961e-05, + "loss": 0.8051, + "step": 74950 + }, + { + "epoch": 0.4788980744413069, + "grad_norm": 0.8379830121994019, + "learning_rate": 8.651756201153391e-05, + "loss": 0.7344, + "step": 74960 + }, + { + "epoch": 0.4789619615910456, + "grad_norm": 0.8890141844749451, + "learning_rate": 8.651413438859475e-05, + "loss": 1.0695, + "step": 74970 + }, + { + "epoch": 0.4790258487407843, + "grad_norm": 1.0251997709274292, + "learning_rate": 8.651070639792667e-05, + "loss": 0.9472, + "step": 74980 + }, + { + "epoch": 0.479089735890523, + "grad_norm": 0.7137789130210876, + "learning_rate": 8.650727803956418e-05, + "loss": 0.9155, + "step": 74990 + }, + { + "epoch": 0.47915362304026166, + "grad_norm": 0.6541804671287537, + "learning_rate": 8.650384931354183e-05, + "loss": 0.7172, + "step": 75000 + }, + { + "epoch": 0.47921751019000036, + "grad_norm": 1.1364400386810303, + "learning_rate": 8.650042021989415e-05, + "loss": 0.7023, + "step": 75010 + }, + { + "epoch": 0.47928139733973907, + "grad_norm": 1.3749972581863403, + "learning_rate": 8.649699075865564e-05, + "loss": 0.7755, + "step": 75020 + }, + { + "epoch": 0.4793452844894778, + "grad_norm": 1.0463199615478516, + "learning_rate": 8.649356092986086e-05, + "loss": 0.7507, + "step": 75030 + }, + { + "epoch": 0.4794091716392165, + "grad_norm": 2.42689847946167, + "learning_rate": 
8.649013073354434e-05, + "loss": 0.8019, + "step": 75040 + }, + { + "epoch": 0.4794730587889552, + "grad_norm": 0.8399762511253357, + "learning_rate": 8.648670016974067e-05, + "loss": 0.9304, + "step": 75050 + }, + { + "epoch": 0.4795369459386939, + "grad_norm": 0.804482638835907, + "learning_rate": 8.648326923848434e-05, + "loss": 0.963, + "step": 75060 + }, + { + "epoch": 0.4796008330884326, + "grad_norm": 1.2717317342758179, + "learning_rate": 8.647983793980993e-05, + "loss": 0.7493, + "step": 75070 + }, + { + "epoch": 0.4796647202381713, + "grad_norm": 1.063368797302246, + "learning_rate": 8.647640627375199e-05, + "loss": 0.9279, + "step": 75080 + }, + { + "epoch": 0.47972860738791, + "grad_norm": 0.8830692768096924, + "learning_rate": 8.647297424034509e-05, + "loss": 0.887, + "step": 75090 + }, + { + "epoch": 0.4797924945376487, + "grad_norm": 0.9686833620071411, + "learning_rate": 8.646954183962378e-05, + "loss": 0.8693, + "step": 75100 + }, + { + "epoch": 0.4798563816873874, + "grad_norm": 0.8640769124031067, + "learning_rate": 8.646610907162262e-05, + "loss": 0.7796, + "step": 75110 + }, + { + "epoch": 0.4799202688371261, + "grad_norm": 0.5140219926834106, + "learning_rate": 8.646267593637621e-05, + "loss": 1.4085, + "step": 75120 + }, + { + "epoch": 0.4799841559868648, + "grad_norm": 0.6936458945274353, + "learning_rate": 8.64592424339191e-05, + "loss": 0.9243, + "step": 75130 + }, + { + "epoch": 0.4800480431366035, + "grad_norm": 0.8202782869338989, + "learning_rate": 8.645580856428588e-05, + "loss": 1.1241, + "step": 75140 + }, + { + "epoch": 0.4801119302863422, + "grad_norm": 1.985823631286621, + "learning_rate": 8.645237432751113e-05, + "loss": 1.0022, + "step": 75150 + }, + { + "epoch": 0.4801758174360809, + "grad_norm": 1.036049723625183, + "learning_rate": 8.644893972362945e-05, + "loss": 1.0675, + "step": 75160 + }, + { + "epoch": 0.4802397045858196, + "grad_norm": 0.7775549292564392, + "learning_rate": 8.644550475267538e-05, + "loss": 0.8857, + 
"step": 75170 + }, + { + "epoch": 0.4803035917355583, + "grad_norm": 0.7424293160438538, + "learning_rate": 8.644206941468358e-05, + "loss": 1.2084, + "step": 75180 + }, + { + "epoch": 0.480367478885297, + "grad_norm": 2.01617169380188, + "learning_rate": 8.64386337096886e-05, + "loss": 1.284, + "step": 75190 + }, + { + "epoch": 0.4804313660350357, + "grad_norm": 0.6096950173377991, + "learning_rate": 8.643519763772506e-05, + "loss": 0.7568, + "step": 75200 + }, + { + "epoch": 0.48049525318477443, + "grad_norm": 0.817476212978363, + "learning_rate": 8.643176119882755e-05, + "loss": 0.8748, + "step": 75210 + }, + { + "epoch": 0.48055914033451314, + "grad_norm": 0.9491440057754517, + "learning_rate": 8.642832439303067e-05, + "loss": 0.8784, + "step": 75220 + }, + { + "epoch": 0.48062302748425184, + "grad_norm": 0.4414537847042084, + "learning_rate": 8.642488722036908e-05, + "loss": 1.0155, + "step": 75230 + }, + { + "epoch": 0.4806869146339905, + "grad_norm": 0.8975993394851685, + "learning_rate": 8.642144968087735e-05, + "loss": 0.931, + "step": 75240 + }, + { + "epoch": 0.4807508017837292, + "grad_norm": 1.0731254816055298, + "learning_rate": 8.641801177459012e-05, + "loss": 1.1996, + "step": 75250 + }, + { + "epoch": 0.4808146889334679, + "grad_norm": 0.9253545999526978, + "learning_rate": 8.641457350154201e-05, + "loss": 0.6433, + "step": 75260 + }, + { + "epoch": 0.4808785760832066, + "grad_norm": 0.6967938542366028, + "learning_rate": 8.641113486176764e-05, + "loss": 0.7571, + "step": 75270 + }, + { + "epoch": 0.4809424632329453, + "grad_norm": 0.694025993347168, + "learning_rate": 8.640769585530162e-05, + "loss": 1.0296, + "step": 75280 + }, + { + "epoch": 0.481006350382684, + "grad_norm": 0.6931796073913574, + "learning_rate": 8.640425648217863e-05, + "loss": 1.0492, + "step": 75290 + }, + { + "epoch": 0.48107023753242273, + "grad_norm": 0.8335185050964355, + "learning_rate": 8.640081674243326e-05, + "loss": 0.74, + "step": 75300 + }, + { + "epoch": 
0.48113412468216143, + "grad_norm": 0.9621481895446777, + "learning_rate": 8.639737663610019e-05, + "loss": 0.9957, + "step": 75310 + }, + { + "epoch": 0.48119801183190014, + "grad_norm": 0.6297350525856018, + "learning_rate": 8.639393616321404e-05, + "loss": 0.6059, + "step": 75320 + }, + { + "epoch": 0.48126189898163885, + "grad_norm": 0.8291562795639038, + "learning_rate": 8.639049532380948e-05, + "loss": 0.7669, + "step": 75330 + }, + { + "epoch": 0.48132578613137755, + "grad_norm": 0.7181857824325562, + "learning_rate": 8.638705411792115e-05, + "loss": 0.9866, + "step": 75340 + }, + { + "epoch": 0.48138967328111626, + "grad_norm": 0.6401185393333435, + "learning_rate": 8.63836125455837e-05, + "loss": 0.7838, + "step": 75350 + }, + { + "epoch": 0.48145356043085497, + "grad_norm": 0.6353443264961243, + "learning_rate": 8.638017060683179e-05, + "loss": 0.6636, + "step": 75360 + }, + { + "epoch": 0.4815174475805936, + "grad_norm": 0.9812245965003967, + "learning_rate": 8.637672830170009e-05, + "loss": 0.7057, + "step": 75370 + }, + { + "epoch": 0.4815813347303323, + "grad_norm": 0.8041467666625977, + "learning_rate": 8.637328563022327e-05, + "loss": 0.8152, + "step": 75380 + }, + { + "epoch": 0.48164522188007103, + "grad_norm": 0.738399863243103, + "learning_rate": 8.636984259243601e-05, + "loss": 0.8781, + "step": 75390 + }, + { + "epoch": 0.48170910902980973, + "grad_norm": 0.9629417061805725, + "learning_rate": 8.636639918837294e-05, + "loss": 0.9158, + "step": 75400 + }, + { + "epoch": 0.48177299617954844, + "grad_norm": 0.898951530456543, + "learning_rate": 8.636295541806881e-05, + "loss": 0.9504, + "step": 75410 + }, + { + "epoch": 0.48183688332928715, + "grad_norm": 0.8771629929542542, + "learning_rate": 8.635951128155822e-05, + "loss": 0.9677, + "step": 75420 + }, + { + "epoch": 0.48190077047902585, + "grad_norm": 0.7448533177375793, + "learning_rate": 8.635606677887591e-05, + "loss": 0.752, + "step": 75430 + }, + { + "epoch": 0.48196465762876456, + 
"grad_norm": 0.6516122221946716, + "learning_rate": 8.635262191005656e-05, + "loss": 0.6401, + "step": 75440 + }, + { + "epoch": 0.48202854477850327, + "grad_norm": 0.7587134838104248, + "learning_rate": 8.634917667513486e-05, + "loss": 0.8766, + "step": 75450 + }, + { + "epoch": 0.48209243192824197, + "grad_norm": 0.728209376335144, + "learning_rate": 8.63457310741455e-05, + "loss": 0.9743, + "step": 75460 + }, + { + "epoch": 0.4821563190779807, + "grad_norm": 0.7866697907447815, + "learning_rate": 8.634228510712318e-05, + "loss": 0.9598, + "step": 75470 + }, + { + "epoch": 0.4822202062277194, + "grad_norm": 0.8349552750587463, + "learning_rate": 8.633883877410261e-05, + "loss": 0.7729, + "step": 75480 + }, + { + "epoch": 0.48228409337745803, + "grad_norm": 0.7193264365196228, + "learning_rate": 8.63353920751185e-05, + "loss": 0.714, + "step": 75490 + }, + { + "epoch": 0.48234798052719674, + "grad_norm": 0.9247245192527771, + "learning_rate": 8.633194501020556e-05, + "loss": 0.9012, + "step": 75500 + }, + { + "epoch": 0.48241186767693545, + "grad_norm": 1.0399880409240723, + "learning_rate": 8.632849757939849e-05, + "loss": 0.9669, + "step": 75510 + }, + { + "epoch": 0.48247575482667415, + "grad_norm": 0.7889145016670227, + "learning_rate": 8.632504978273204e-05, + "loss": 1.1164, + "step": 75520 + }, + { + "epoch": 0.48253964197641286, + "grad_norm": 0.8151355981826782, + "learning_rate": 8.63216016202409e-05, + "loss": 0.9048, + "step": 75530 + }, + { + "epoch": 0.48260352912615156, + "grad_norm": 0.9007961750030518, + "learning_rate": 8.631815309195981e-05, + "loss": 0.7891, + "step": 75540 + }, + { + "epoch": 0.48266741627589027, + "grad_norm": 1.8607451915740967, + "learning_rate": 8.631470419792348e-05, + "loss": 1.0807, + "step": 75550 + }, + { + "epoch": 0.482731303425629, + "grad_norm": 0.6548914909362793, + "learning_rate": 8.63112549381667e-05, + "loss": 1.0285, + "step": 75560 + }, + { + "epoch": 0.4827951905753677, + "grad_norm": 0.7430241107940674, + 
"learning_rate": 8.630780531272414e-05, + "loss": 0.8952, + "step": 75570 + }, + { + "epoch": 0.4828590777251064, + "grad_norm": 0.6701022386550903, + "learning_rate": 8.630435532163059e-05, + "loss": 0.8305, + "step": 75580 + }, + { + "epoch": 0.4829229648748451, + "grad_norm": 0.8253774046897888, + "learning_rate": 8.630090496492076e-05, + "loss": 1.2012, + "step": 75590 + }, + { + "epoch": 0.4829868520245838, + "grad_norm": 0.7972230911254883, + "learning_rate": 8.629745424262942e-05, + "loss": 1.138, + "step": 75600 + }, + { + "epoch": 0.48305073917432245, + "grad_norm": 1.0207947492599487, + "learning_rate": 8.62940031547913e-05, + "loss": 0.9028, + "step": 75610 + }, + { + "epoch": 0.48311462632406116, + "grad_norm": 0.6902018785476685, + "learning_rate": 8.62905517014412e-05, + "loss": 0.8966, + "step": 75620 + }, + { + "epoch": 0.48317851347379986, + "grad_norm": 1.1125010251998901, + "learning_rate": 8.628709988261381e-05, + "loss": 1.008, + "step": 75630 + }, + { + "epoch": 0.48324240062353857, + "grad_norm": 0.6313163638114929, + "learning_rate": 8.628364769834395e-05, + "loss": 0.8845, + "step": 75640 + }, + { + "epoch": 0.4833062877732773, + "grad_norm": 0.6679086685180664, + "learning_rate": 8.628019514866637e-05, + "loss": 0.9086, + "step": 75650 + }, + { + "epoch": 0.483370174923016, + "grad_norm": 0.7422047853469849, + "learning_rate": 8.627674223361584e-05, + "loss": 0.8719, + "step": 75660 + }, + { + "epoch": 0.4834340620727547, + "grad_norm": 0.7488150596618652, + "learning_rate": 8.627328895322713e-05, + "loss": 0.8072, + "step": 75670 + }, + { + "epoch": 0.4834979492224934, + "grad_norm": 0.5652221441268921, + "learning_rate": 8.627018068854189e-05, + "loss": 1.0794, + "step": 75680 + }, + { + "epoch": 0.4835618363722321, + "grad_norm": 0.8535979986190796, + "learning_rate": 8.626672671410644e-05, + "loss": 0.8991, + "step": 75690 + }, + { + "epoch": 0.4836257235219708, + "grad_norm": 0.8179265260696411, + "learning_rate": 
8.62632723744337e-05, + "loss": 1.2132, + "step": 75700 + }, + { + "epoch": 0.4836896106717095, + "grad_norm": 0.7996183037757874, + "learning_rate": 8.625981766955842e-05, + "loss": 0.8212, + "step": 75710 + }, + { + "epoch": 0.4837534978214482, + "grad_norm": 0.671373724937439, + "learning_rate": 8.625636259951542e-05, + "loss": 0.9386, + "step": 75720 + }, + { + "epoch": 0.48381738497118687, + "grad_norm": 1.1768290996551514, + "learning_rate": 8.625290716433947e-05, + "loss": 0.8154, + "step": 75730 + }, + { + "epoch": 0.4838812721209256, + "grad_norm": 0.9020494818687439, + "learning_rate": 8.62494513640654e-05, + "loss": 1.119, + "step": 75740 + }, + { + "epoch": 0.4839451592706643, + "grad_norm": 1.0053081512451172, + "learning_rate": 8.624599519872798e-05, + "loss": 1.0964, + "step": 75750 + }, + { + "epoch": 0.484009046420403, + "grad_norm": 1.5894237756729126, + "learning_rate": 8.624253866836202e-05, + "loss": 0.9744, + "step": 75760 + }, + { + "epoch": 0.4840729335701417, + "grad_norm": 0.5304593443870544, + "learning_rate": 8.623908177300236e-05, + "loss": 0.7911, + "step": 75770 + }, + { + "epoch": 0.4841368207198804, + "grad_norm": 0.8319995403289795, + "learning_rate": 8.623562451268378e-05, + "loss": 1.0109, + "step": 75780 + }, + { + "epoch": 0.4842007078696191, + "grad_norm": 1.3417378664016724, + "learning_rate": 8.623216688744113e-05, + "loss": 0.8693, + "step": 75790 + }, + { + "epoch": 0.4842645950193578, + "grad_norm": 0.7891839742660522, + "learning_rate": 8.622870889730921e-05, + "loss": 0.9214, + "step": 75800 + }, + { + "epoch": 0.4843284821690965, + "grad_norm": 0.7130112648010254, + "learning_rate": 8.622525054232285e-05, + "loss": 0.8262, + "step": 75810 + }, + { + "epoch": 0.4843923693188352, + "grad_norm": 0.9320762157440186, + "learning_rate": 8.622179182251686e-05, + "loss": 0.8674, + "step": 75820 + }, + { + "epoch": 0.48445625646857393, + "grad_norm": 0.6487066745758057, + "learning_rate": 8.62183327379261e-05, + "loss": 1.126, 
+ "step": 75830 + }, + { + "epoch": 0.48452014361831264, + "grad_norm": 0.6271628737449646, + "learning_rate": 8.62148732885854e-05, + "loss": 0.7827, + "step": 75840 + }, + { + "epoch": 0.4845840307680513, + "grad_norm": 0.7439334988594055, + "learning_rate": 8.621141347452959e-05, + "loss": 1.2293, + "step": 75850 + }, + { + "epoch": 0.48464791791779, + "grad_norm": 0.8553930521011353, + "learning_rate": 8.620795329579354e-05, + "loss": 0.8525, + "step": 75860 + }, + { + "epoch": 0.4847118050675287, + "grad_norm": 0.9168295860290527, + "learning_rate": 8.620449275241205e-05, + "loss": 0.7103, + "step": 75870 + }, + { + "epoch": 0.4847756922172674, + "grad_norm": 0.61861652135849, + "learning_rate": 8.620103184442001e-05, + "loss": 0.796, + "step": 75880 + }, + { + "epoch": 0.4848395793670061, + "grad_norm": 1.4174355268478394, + "learning_rate": 8.619757057185226e-05, + "loss": 0.8479, + "step": 75890 + }, + { + "epoch": 0.4849034665167448, + "grad_norm": 0.9580785036087036, + "learning_rate": 8.619410893474365e-05, + "loss": 0.7067, + "step": 75900 + }, + { + "epoch": 0.4849673536664835, + "grad_norm": 0.7961419820785522, + "learning_rate": 8.619064693312906e-05, + "loss": 1.1983, + "step": 75910 + }, + { + "epoch": 0.48503124081622223, + "grad_norm": 1.8671194314956665, + "learning_rate": 8.618718456704335e-05, + "loss": 1.2858, + "step": 75920 + }, + { + "epoch": 0.48509512796596094, + "grad_norm": 1.4799001216888428, + "learning_rate": 8.618372183652137e-05, + "loss": 0.9962, + "step": 75930 + }, + { + "epoch": 0.48515901511569964, + "grad_norm": 0.6392105221748352, + "learning_rate": 8.6180258741598e-05, + "loss": 0.8089, + "step": 75940 + }, + { + "epoch": 0.48522290226543835, + "grad_norm": 0.8513908982276917, + "learning_rate": 8.617679528230816e-05, + "loss": 0.9247, + "step": 75950 + }, + { + "epoch": 0.48528678941517706, + "grad_norm": 0.6598104238510132, + "learning_rate": 8.617333145868667e-05, + "loss": 0.9169, + "step": 75960 + }, + { + "epoch": 
0.4853506765649157, + "grad_norm": 1.3016315698623657, + "learning_rate": 8.616986727076843e-05, + "loss": 0.8606, + "step": 75970 + }, + { + "epoch": 0.4854145637146544, + "grad_norm": 0.950963020324707, + "learning_rate": 8.616640271858835e-05, + "loss": 0.8453, + "step": 75980 + }, + { + "epoch": 0.4854784508643931, + "grad_norm": 0.9443991780281067, + "learning_rate": 8.616293780218131e-05, + "loss": 0.9117, + "step": 75990 + }, + { + "epoch": 0.4855423380141318, + "grad_norm": 0.8694010972976685, + "learning_rate": 8.615947252158219e-05, + "loss": 0.9585, + "step": 76000 + }, + { + "epoch": 0.48560622516387053, + "grad_norm": 1.7652310132980347, + "learning_rate": 8.615600687682591e-05, + "loss": 0.9593, + "step": 76010 + }, + { + "epoch": 0.48567011231360924, + "grad_norm": 0.4394935369491577, + "learning_rate": 8.615254086794735e-05, + "loss": 0.856, + "step": 76020 + }, + { + "epoch": 0.48573399946334794, + "grad_norm": 1.1516753435134888, + "learning_rate": 8.614907449498144e-05, + "loss": 1.2644, + "step": 76030 + }, + { + "epoch": 0.48579788661308665, + "grad_norm": 2.2719500064849854, + "learning_rate": 8.614560775796307e-05, + "loss": 0.7425, + "step": 76040 + }, + { + "epoch": 0.48586177376282536, + "grad_norm": 0.5767148733139038, + "learning_rate": 8.614214065692715e-05, + "loss": 0.8913, + "step": 76050 + }, + { + "epoch": 0.48592566091256406, + "grad_norm": 0.7121883034706116, + "learning_rate": 8.613867319190861e-05, + "loss": 1.0213, + "step": 76060 + }, + { + "epoch": 0.48598954806230277, + "grad_norm": 0.948017418384552, + "learning_rate": 8.613520536294238e-05, + "loss": 0.8787, + "step": 76070 + }, + { + "epoch": 0.4860534352120415, + "grad_norm": 0.6437438130378723, + "learning_rate": 8.613173717006335e-05, + "loss": 0.9009, + "step": 76080 + }, + { + "epoch": 0.4861173223617801, + "grad_norm": 0.7081766724586487, + "learning_rate": 8.612826861330648e-05, + "loss": 0.9181, + "step": 76090 + }, + { + "epoch": 0.48618120951151883, + 
"grad_norm": 0.7698941826820374, + "learning_rate": 8.61247996927067e-05, + "loss": 0.8093, + "step": 76100 + }, + { + "epoch": 0.48624509666125754, + "grad_norm": 0.8024051785469055, + "learning_rate": 8.612133040829892e-05, + "loss": 0.98, + "step": 76110 + }, + { + "epoch": 0.48630898381099624, + "grad_norm": 2.622551679611206, + "learning_rate": 8.611786076011809e-05, + "loss": 0.9282, + "step": 76120 + }, + { + "epoch": 0.48637287096073495, + "grad_norm": 1.0354362726211548, + "learning_rate": 8.611439074819917e-05, + "loss": 1.0491, + "step": 76130 + }, + { + "epoch": 0.48643675811047365, + "grad_norm": 0.6621295213699341, + "learning_rate": 8.611092037257709e-05, + "loss": 0.776, + "step": 76140 + }, + { + "epoch": 0.48650064526021236, + "grad_norm": 0.6664482355117798, + "learning_rate": 8.610744963328679e-05, + "loss": 1.0026, + "step": 76150 + }, + { + "epoch": 0.48656453240995107, + "grad_norm": 0.9559485912322998, + "learning_rate": 8.610397853036325e-05, + "loss": 1.0425, + "step": 76160 + }, + { + "epoch": 0.4866284195596898, + "grad_norm": 1.0177385807037354, + "learning_rate": 8.61005070638414e-05, + "loss": 0.9678, + "step": 76170 + }, + { + "epoch": 0.4866923067094285, + "grad_norm": 0.630623459815979, + "learning_rate": 8.60970352337562e-05, + "loss": 0.99, + "step": 76180 + }, + { + "epoch": 0.4867561938591672, + "grad_norm": 0.9502881169319153, + "learning_rate": 8.609356304014264e-05, + "loss": 0.7239, + "step": 76190 + }, + { + "epoch": 0.4868200810089059, + "grad_norm": 2.090254306793213, + "learning_rate": 8.60900904830357e-05, + "loss": 1.2069, + "step": 76200 + }, + { + "epoch": 0.4868839681586446, + "grad_norm": 0.5957566499710083, + "learning_rate": 8.60866175624703e-05, + "loss": 0.8932, + "step": 76210 + }, + { + "epoch": 0.48694785530838325, + "grad_norm": 1.5734295845031738, + "learning_rate": 8.608314427848144e-05, + "loss": 0.7924, + "step": 76220 + }, + { + "epoch": 0.48701174245812195, + "grad_norm": 0.6711301207542419, + 
"learning_rate": 8.60796706311041e-05, + "loss": 0.952, + "step": 76230 + }, + { + "epoch": 0.48707562960786066, + "grad_norm": 0.6539300084114075, + "learning_rate": 8.607619662037327e-05, + "loss": 0.949, + "step": 76240 + }, + { + "epoch": 0.48713951675759937, + "grad_norm": 1.1970055103302002, + "learning_rate": 8.607272224632393e-05, + "loss": 1.0121, + "step": 76250 + }, + { + "epoch": 0.4872034039073381, + "grad_norm": 0.9336310625076294, + "learning_rate": 8.606924750899106e-05, + "loss": 0.8952, + "step": 76260 + }, + { + "epoch": 0.4872672910570768, + "grad_norm": 0.9023282527923584, + "learning_rate": 8.606577240840968e-05, + "loss": 0.9134, + "step": 76270 + }, + { + "epoch": 0.4873311782068155, + "grad_norm": 0.4293481111526489, + "learning_rate": 8.606229694461476e-05, + "loss": 0.7425, + "step": 76280 + }, + { + "epoch": 0.4873950653565542, + "grad_norm": 0.736682116985321, + "learning_rate": 8.605882111764132e-05, + "loss": 0.8171, + "step": 76290 + }, + { + "epoch": 0.4874589525062929, + "grad_norm": 1.6317270994186401, + "learning_rate": 8.605534492752434e-05, + "loss": 0.824, + "step": 76300 + }, + { + "epoch": 0.4875228396560316, + "grad_norm": 3.0119450092315674, + "learning_rate": 8.605186837429887e-05, + "loss": 0.859, + "step": 76310 + }, + { + "epoch": 0.4875867268057703, + "grad_norm": 1.0656332969665527, + "learning_rate": 8.604839145799987e-05, + "loss": 0.7387, + "step": 76320 + }, + { + "epoch": 0.487650613955509, + "grad_norm": 0.7559338808059692, + "learning_rate": 8.604491417866238e-05, + "loss": 0.9439, + "step": 76330 + }, + { + "epoch": 0.48771450110524767, + "grad_norm": 0.8888264894485474, + "learning_rate": 8.604143653632144e-05, + "loss": 1.0296, + "step": 76340 + }, + { + "epoch": 0.4877783882549864, + "grad_norm": 0.9546695947647095, + "learning_rate": 8.603795853101204e-05, + "loss": 1.0504, + "step": 76350 + }, + { + "epoch": 0.4878422754047251, + "grad_norm": 2.9092493057250977, + "learning_rate": 8.603448016276924e-05, 
+ "loss": 1.1027, + "step": 76360 + }, + { + "epoch": 0.4879061625544638, + "grad_norm": 0.840092122554779, + "learning_rate": 8.603100143162803e-05, + "loss": 0.7812, + "step": 76370 + }, + { + "epoch": 0.4879700497042025, + "grad_norm": 0.8399893641471863, + "learning_rate": 8.602752233762348e-05, + "loss": 0.9633, + "step": 76380 + }, + { + "epoch": 0.4880339368539412, + "grad_norm": 0.9037623405456543, + "learning_rate": 8.60240428807906e-05, + "loss": 0.9545, + "step": 76390 + }, + { + "epoch": 0.4880978240036799, + "grad_norm": 0.843728244304657, + "learning_rate": 8.602056306116445e-05, + "loss": 0.7823, + "step": 76400 + }, + { + "epoch": 0.4881617111534186, + "grad_norm": 0.9266428351402283, + "learning_rate": 8.601708287878006e-05, + "loss": 0.7908, + "step": 76410 + }, + { + "epoch": 0.4882255983031573, + "grad_norm": 0.7917917966842651, + "learning_rate": 8.60136023336725e-05, + "loss": 1.0549, + "step": 76420 + }, + { + "epoch": 0.488289485452896, + "grad_norm": 0.7976272702217102, + "learning_rate": 8.601012142587678e-05, + "loss": 0.944, + "step": 76430 + }, + { + "epoch": 0.4883533726026347, + "grad_norm": 1.0543662309646606, + "learning_rate": 8.6006640155428e-05, + "loss": 0.8536, + "step": 76440 + }, + { + "epoch": 0.48841725975237343, + "grad_norm": 0.7209562063217163, + "learning_rate": 8.600315852236121e-05, + "loss": 0.7236, + "step": 76450 + }, + { + "epoch": 0.4884811469021121, + "grad_norm": 0.7003374695777893, + "learning_rate": 8.599967652671147e-05, + "loss": 1.1172, + "step": 76460 + }, + { + "epoch": 0.4885450340518508, + "grad_norm": 1.0924787521362305, + "learning_rate": 8.599619416851384e-05, + "loss": 0.7156, + "step": 76470 + }, + { + "epoch": 0.4886089212015895, + "grad_norm": 0.6103460192680359, + "learning_rate": 8.599271144780339e-05, + "loss": 0.9213, + "step": 76480 + }, + { + "epoch": 0.4886728083513282, + "grad_norm": 0.675788938999176, + "learning_rate": 8.59892283646152e-05, + "loss": 0.7218, + "step": 76490 + }, + { + 
"epoch": 0.4887366955010669, + "grad_norm": 0.5468382835388184, + "learning_rate": 8.598574491898435e-05, + "loss": 0.7851, + "step": 76500 + }, + { + "epoch": 0.4888005826508056, + "grad_norm": 0.9708940982818604, + "learning_rate": 8.59822611109459e-05, + "loss": 0.8894, + "step": 76510 + }, + { + "epoch": 0.4888644698005443, + "grad_norm": 2.2232227325439453, + "learning_rate": 8.597877694053496e-05, + "loss": 1.0381, + "step": 76520 + }, + { + "epoch": 0.488928356950283, + "grad_norm": 1.9338047504425049, + "learning_rate": 8.597529240778661e-05, + "loss": 0.8914, + "step": 76530 + }, + { + "epoch": 0.48899224410002173, + "grad_norm": 0.842464029788971, + "learning_rate": 8.597180751273595e-05, + "loss": 0.8219, + "step": 76540 + }, + { + "epoch": 0.48905613124976044, + "grad_norm": 0.6262010931968689, + "learning_rate": 8.596832225541806e-05, + "loss": 0.6786, + "step": 76550 + }, + { + "epoch": 0.48912001839949915, + "grad_norm": 0.7342615723609924, + "learning_rate": 8.596483663586804e-05, + "loss": 0.6999, + "step": 76560 + }, + { + "epoch": 0.48918390554923785, + "grad_norm": 0.6208049654960632, + "learning_rate": 8.596135065412101e-05, + "loss": 1.0309, + "step": 76570 + }, + { + "epoch": 0.4892477926989765, + "grad_norm": 0.8948808312416077, + "learning_rate": 8.595786431021207e-05, + "loss": 0.7239, + "step": 76580 + }, + { + "epoch": 0.4893116798487152, + "grad_norm": 0.7227377891540527, + "learning_rate": 8.595437760417633e-05, + "loss": 0.9171, + "step": 76590 + }, + { + "epoch": 0.4893755669984539, + "grad_norm": 0.8162720203399658, + "learning_rate": 8.59508905360489e-05, + "loss": 0.9206, + "step": 76600 + }, + { + "epoch": 0.4894394541481926, + "grad_norm": 1.031140923500061, + "learning_rate": 8.59474031058649e-05, + "loss": 0.8738, + "step": 76610 + }, + { + "epoch": 0.4895033412979313, + "grad_norm": 0.6611879467964172, + "learning_rate": 8.594391531365943e-05, + "loss": 0.8089, + "step": 76620 + }, + { + "epoch": 0.48956722844767003, + 
"grad_norm": 0.8293446898460388, + "learning_rate": 8.594042715946768e-05, + "loss": 1.0846, + "step": 76630 + }, + { + "epoch": 0.48963111559740874, + "grad_norm": 0.7987895607948303, + "learning_rate": 8.59369386433247e-05, + "loss": 1.0023, + "step": 76640 + }, + { + "epoch": 0.48969500274714745, + "grad_norm": 0.8030225038528442, + "learning_rate": 8.593344976526569e-05, + "loss": 0.6244, + "step": 76650 + }, + { + "epoch": 0.48975888989688615, + "grad_norm": 1.0051014423370361, + "learning_rate": 8.592996052532572e-05, + "loss": 0.8662, + "step": 76660 + }, + { + "epoch": 0.48982277704662486, + "grad_norm": 1.1758030652999878, + "learning_rate": 8.592647092353998e-05, + "loss": 0.8531, + "step": 76670 + }, + { + "epoch": 0.48988666419636356, + "grad_norm": 0.8429425358772278, + "learning_rate": 8.59229809599436e-05, + "loss": 0.9254, + "step": 76680 + }, + { + "epoch": 0.48995055134610227, + "grad_norm": 0.9333186149597168, + "learning_rate": 8.591949063457172e-05, + "loss": 0.9736, + "step": 76690 + }, + { + "epoch": 0.4900144384958409, + "grad_norm": 0.98914635181427, + "learning_rate": 8.59159999474595e-05, + "loss": 0.8071, + "step": 76700 + }, + { + "epoch": 0.4900783256455796, + "grad_norm": 0.6618992686271667, + "learning_rate": 8.591250889864209e-05, + "loss": 1.0296, + "step": 76710 + }, + { + "epoch": 0.49014221279531833, + "grad_norm": 0.5613696575164795, + "learning_rate": 8.590901748815464e-05, + "loss": 0.8095, + "step": 76720 + }, + { + "epoch": 0.49020609994505704, + "grad_norm": 0.6220462322235107, + "learning_rate": 8.590552571603232e-05, + "loss": 0.7297, + "step": 76730 + }, + { + "epoch": 0.49026998709479575, + "grad_norm": 0.5085312128067017, + "learning_rate": 8.590203358231028e-05, + "loss": 0.7892, + "step": 76740 + }, + { + "epoch": 0.49033387424453445, + "grad_norm": 0.7087169885635376, + "learning_rate": 8.589854108702371e-05, + "loss": 0.8027, + "step": 76750 + }, + { + "epoch": 0.49039776139427316, + "grad_norm": 
0.7277820110321045, + "learning_rate": 8.589504823020778e-05, + "loss": 0.8146, + "step": 76760 + }, + { + "epoch": 0.49046164854401186, + "grad_norm": 0.8798472881317139, + "learning_rate": 8.589155501189767e-05, + "loss": 0.8185, + "step": 76770 + }, + { + "epoch": 0.49052553569375057, + "grad_norm": 0.8742108345031738, + "learning_rate": 8.588806143212852e-05, + "loss": 0.9735, + "step": 76780 + }, + { + "epoch": 0.4905894228434893, + "grad_norm": 1.9560281038284302, + "learning_rate": 8.588456749093558e-05, + "loss": 0.9918, + "step": 76790 + }, + { + "epoch": 0.490653309993228, + "grad_norm": 0.953271746635437, + "learning_rate": 8.588107318835398e-05, + "loss": 1.0473, + "step": 76800 + }, + { + "epoch": 0.4907171971429667, + "grad_norm": 0.8690406084060669, + "learning_rate": 8.587757852441893e-05, + "loss": 0.9629, + "step": 76810 + }, + { + "epoch": 0.49078108429270534, + "grad_norm": 1.7574247121810913, + "learning_rate": 8.587408349916564e-05, + "loss": 0.662, + "step": 76820 + }, + { + "epoch": 0.49084497144244404, + "grad_norm": 0.9854816198348999, + "learning_rate": 8.587058811262929e-05, + "loss": 0.86, + "step": 76830 + }, + { + "epoch": 0.49090885859218275, + "grad_norm": 1.1772929430007935, + "learning_rate": 8.586709236484507e-05, + "loss": 0.7821, + "step": 76840 + }, + { + "epoch": 0.49097274574192146, + "grad_norm": 1.0073944330215454, + "learning_rate": 8.586359625584822e-05, + "loss": 0.8854, + "step": 76850 + }, + { + "epoch": 0.49103663289166016, + "grad_norm": 0.7128773927688599, + "learning_rate": 8.586009978567391e-05, + "loss": 0.7433, + "step": 76860 + }, + { + "epoch": 0.49110052004139887, + "grad_norm": 0.663662314414978, + "learning_rate": 8.58566029543574e-05, + "loss": 0.8485, + "step": 76870 + }, + { + "epoch": 0.4911644071911376, + "grad_norm": 0.6211137771606445, + "learning_rate": 8.585310576193384e-05, + "loss": 0.8421, + "step": 76880 + }, + { + "epoch": 0.4912282943408763, + "grad_norm": 1.202216625213623, + 
"learning_rate": 8.584960820843851e-05, + "loss": 0.7988, + "step": 76890 + }, + { + "epoch": 0.491292181490615, + "grad_norm": 0.7233720421791077, + "learning_rate": 8.584611029390661e-05, + "loss": 0.903, + "step": 76900 + }, + { + "epoch": 0.4913560686403537, + "grad_norm": 0.5349984765052795, + "learning_rate": 8.584261201837337e-05, + "loss": 1.0756, + "step": 76910 + }, + { + "epoch": 0.4914199557900924, + "grad_norm": 0.9443363547325134, + "learning_rate": 8.583911338187401e-05, + "loss": 0.8465, + "step": 76920 + }, + { + "epoch": 0.4914838429398311, + "grad_norm": 0.7299784421920776, + "learning_rate": 8.583561438444379e-05, + "loss": 1.1754, + "step": 76930 + }, + { + "epoch": 0.49154773008956976, + "grad_norm": 1.1991528272628784, + "learning_rate": 8.583211502611792e-05, + "loss": 0.9597, + "step": 76940 + }, + { + "epoch": 0.49161161723930846, + "grad_norm": 1.1621544361114502, + "learning_rate": 8.582861530693165e-05, + "loss": 0.7185, + "step": 76950 + }, + { + "epoch": 0.49167550438904717, + "grad_norm": 1.565203070640564, + "learning_rate": 8.582511522692022e-05, + "loss": 0.8875, + "step": 76960 + }, + { + "epoch": 0.4917393915387859, + "grad_norm": 0.9279037714004517, + "learning_rate": 8.58216147861189e-05, + "loss": 1.1073, + "step": 76970 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 1.2294920682907104, + "learning_rate": 8.581811398456292e-05, + "loss": 0.9467, + "step": 76980 + }, + { + "epoch": 0.4918671658382633, + "grad_norm": 0.6225689053535461, + "learning_rate": 8.581461282228756e-05, + "loss": 0.9234, + "step": 76990 + }, + { + "epoch": 0.491931052988002, + "grad_norm": 1.2206075191497803, + "learning_rate": 8.581111129932805e-05, + "loss": 1.0835, + "step": 77000 + }, + { + "epoch": 0.4919949401377407, + "grad_norm": 0.7417840957641602, + "learning_rate": 8.580760941571967e-05, + "loss": 0.9111, + "step": 77010 + }, + { + "epoch": 0.4920588272874794, + "grad_norm": 0.6536421775817871, + "learning_rate": 
8.580410717149769e-05, + "loss": 0.9932, + "step": 77020 + }, + { + "epoch": 0.4921227144372181, + "grad_norm": 0.8102644681930542, + "learning_rate": 8.580060456669738e-05, + "loss": 1.0341, + "step": 77030 + }, + { + "epoch": 0.4921866015869568, + "grad_norm": 0.9170993566513062, + "learning_rate": 8.579710160135399e-05, + "loss": 1.0178, + "step": 77040 + }, + { + "epoch": 0.4922504887366955, + "grad_norm": 0.4981268644332886, + "learning_rate": 8.579359827550284e-05, + "loss": 0.7805, + "step": 77050 + }, + { + "epoch": 0.49231437588643423, + "grad_norm": 1.8786097764968872, + "learning_rate": 8.579009458917917e-05, + "loss": 0.9201, + "step": 77060 + }, + { + "epoch": 0.4923782630361729, + "grad_norm": 1.0269956588745117, + "learning_rate": 8.57865905424183e-05, + "loss": 0.9258, + "step": 77070 + }, + { + "epoch": 0.4924421501859116, + "grad_norm": 0.959625244140625, + "learning_rate": 8.578308613525549e-05, + "loss": 0.7213, + "step": 77080 + }, + { + "epoch": 0.4925060373356503, + "grad_norm": 0.7318682670593262, + "learning_rate": 8.577958136772608e-05, + "loss": 0.9329, + "step": 77090 + }, + { + "epoch": 0.492569924485389, + "grad_norm": 0.9213690757751465, + "learning_rate": 8.57760762398653e-05, + "loss": 0.8856, + "step": 77100 + }, + { + "epoch": 0.4926338116351277, + "grad_norm": 0.7937483191490173, + "learning_rate": 8.577257075170849e-05, + "loss": 1.2098, + "step": 77110 + }, + { + "epoch": 0.4926976987848664, + "grad_norm": 1.0895425081253052, + "learning_rate": 8.576906490329094e-05, + "loss": 0.8427, + "step": 77120 + }, + { + "epoch": 0.4927615859346051, + "grad_norm": 0.9130338430404663, + "learning_rate": 8.576555869464798e-05, + "loss": 1.2261, + "step": 77130 + }, + { + "epoch": 0.4928254730843438, + "grad_norm": 0.7346659302711487, + "learning_rate": 8.576205212581488e-05, + "loss": 0.6587, + "step": 77140 + }, + { + "epoch": 0.49288936023408253, + "grad_norm": 3.969825267791748, + "learning_rate": 8.575854519682698e-05, + "loss": 
1.0008, + "step": 77150 + }, + { + "epoch": 0.49295324738382124, + "grad_norm": 1.1453746557235718, + "learning_rate": 8.575503790771959e-05, + "loss": 1.0563, + "step": 77160 + }, + { + "epoch": 0.49301713453355994, + "grad_norm": 1.0311975479125977, + "learning_rate": 8.575153025852804e-05, + "loss": 0.944, + "step": 77170 + }, + { + "epoch": 0.49308102168329865, + "grad_norm": 0.713505744934082, + "learning_rate": 8.574802224928766e-05, + "loss": 0.8591, + "step": 77180 + }, + { + "epoch": 0.4931449088330373, + "grad_norm": 0.9348772764205933, + "learning_rate": 8.574451388003378e-05, + "loss": 0.7919, + "step": 77190 + }, + { + "epoch": 0.493208795982776, + "grad_norm": 1.0843831300735474, + "learning_rate": 8.57410051508017e-05, + "loss": 0.8934, + "step": 77200 + }, + { + "epoch": 0.4932726831325147, + "grad_norm": 0.8350475430488586, + "learning_rate": 8.573749606162678e-05, + "loss": 0.7806, + "step": 77210 + }, + { + "epoch": 0.4933365702822534, + "grad_norm": 1.9250234365463257, + "learning_rate": 8.573398661254438e-05, + "loss": 0.7523, + "step": 77220 + }, + { + "epoch": 0.4934004574319921, + "grad_norm": 0.9152832627296448, + "learning_rate": 8.573047680358978e-05, + "loss": 0.8821, + "step": 77230 + }, + { + "epoch": 0.49346434458173083, + "grad_norm": 0.9231581687927246, + "learning_rate": 8.57269666347984e-05, + "loss": 0.7803, + "step": 77240 + }, + { + "epoch": 0.49352823173146954, + "grad_norm": 0.792326807975769, + "learning_rate": 8.572345610620553e-05, + "loss": 0.9124, + "step": 77250 + }, + { + "epoch": 0.49359211888120824, + "grad_norm": 0.986379861831665, + "learning_rate": 8.571994521784659e-05, + "loss": 0.9055, + "step": 77260 + }, + { + "epoch": 0.49365600603094695, + "grad_norm": 1.0129300355911255, + "learning_rate": 8.571643396975688e-05, + "loss": 0.8168, + "step": 77270 + }, + { + "epoch": 0.49371989318068565, + "grad_norm": 0.5476410984992981, + "learning_rate": 8.571292236197178e-05, + "loss": 0.6777, + "step": 77280 + }, + { + 
"epoch": 0.49378378033042436, + "grad_norm": 1.3759448528289795, + "learning_rate": 8.570941039452665e-05, + "loss": 0.9291, + "step": 77290 + }, + { + "epoch": 0.49384766748016307, + "grad_norm": 1.3601030111312866, + "learning_rate": 8.570589806745687e-05, + "loss": 0.9618, + "step": 77300 + }, + { + "epoch": 0.4939115546299017, + "grad_norm": 1.174814224243164, + "learning_rate": 8.57023853807978e-05, + "loss": 0.8576, + "step": 77310 + }, + { + "epoch": 0.4939754417796404, + "grad_norm": 1.2242119312286377, + "learning_rate": 8.569887233458482e-05, + "loss": 1.1461, + "step": 77320 + }, + { + "epoch": 0.49403932892937913, + "grad_norm": 1.0794601440429688, + "learning_rate": 8.569535892885333e-05, + "loss": 0.8275, + "step": 77330 + }, + { + "epoch": 0.49410321607911784, + "grad_norm": 0.802666962146759, + "learning_rate": 8.569184516363869e-05, + "loss": 1.0711, + "step": 77340 + }, + { + "epoch": 0.49416710322885654, + "grad_norm": 0.9685359001159668, + "learning_rate": 8.568833103897628e-05, + "loss": 0.8529, + "step": 77350 + }, + { + "epoch": 0.49423099037859525, + "grad_norm": 0.9577045440673828, + "learning_rate": 8.568481655490151e-05, + "loss": 0.801, + "step": 77360 + }, + { + "epoch": 0.49429487752833395, + "grad_norm": 0.6817383170127869, + "learning_rate": 8.568130171144975e-05, + "loss": 0.9074, + "step": 77370 + }, + { + "epoch": 0.49435876467807266, + "grad_norm": 1.0045511722564697, + "learning_rate": 8.567778650865643e-05, + "loss": 0.8414, + "step": 77380 + }, + { + "epoch": 0.49442265182781137, + "grad_norm": 1.0986788272857666, + "learning_rate": 8.567427094655693e-05, + "loss": 0.8889, + "step": 77390 + }, + { + "epoch": 0.49448653897755007, + "grad_norm": 0.8840765357017517, + "learning_rate": 8.567075502518667e-05, + "loss": 0.9405, + "step": 77400 + }, + { + "epoch": 0.4945504261272888, + "grad_norm": 0.7125093936920166, + "learning_rate": 8.566723874458102e-05, + "loss": 1.0639, + "step": 77410 + }, + { + "epoch": 0.4946143132770275, + 
"grad_norm": 0.9904595017433167, + "learning_rate": 8.566372210477544e-05, + "loss": 0.9249, + "step": 77420 + }, + { + "epoch": 0.49467820042676613, + "grad_norm": 0.9218760132789612, + "learning_rate": 8.566020510580532e-05, + "loss": 0.8189, + "step": 77430 + }, + { + "epoch": 0.49474208757650484, + "grad_norm": 0.7512104511260986, + "learning_rate": 8.56566877477061e-05, + "loss": 1.1084, + "step": 77440 + }, + { + "epoch": 0.49480597472624355, + "grad_norm": 1.2798714637756348, + "learning_rate": 8.565317003051316e-05, + "loss": 1.0737, + "step": 77450 + }, + { + "epoch": 0.49486986187598225, + "grad_norm": 0.9428762793540955, + "learning_rate": 8.564965195426197e-05, + "loss": 0.8921, + "step": 77460 + }, + { + "epoch": 0.49493374902572096, + "grad_norm": 0.6847555637359619, + "learning_rate": 8.564613351898794e-05, + "loss": 0.6569, + "step": 77470 + }, + { + "epoch": 0.49499763617545967, + "grad_norm": 0.9463028311729431, + "learning_rate": 8.56426147247265e-05, + "loss": 1.0144, + "step": 77480 + }, + { + "epoch": 0.49506152332519837, + "grad_norm": 0.9155146479606628, + "learning_rate": 8.56390955715131e-05, + "loss": 1.0397, + "step": 77490 + }, + { + "epoch": 0.4951254104749371, + "grad_norm": 1.0941190719604492, + "learning_rate": 8.563557605938317e-05, + "loss": 0.9424, + "step": 77500 + }, + { + "epoch": 0.4951892976246758, + "grad_norm": 0.6458025574684143, + "learning_rate": 8.563205618837217e-05, + "loss": 0.9965, + "step": 77510 + }, + { + "epoch": 0.4952531847744145, + "grad_norm": 0.49309995770454407, + "learning_rate": 8.562853595851554e-05, + "loss": 0.9261, + "step": 77520 + }, + { + "epoch": 0.4953170719241532, + "grad_norm": 0.741894543170929, + "learning_rate": 8.562501536984873e-05, + "loss": 0.9298, + "step": 77530 + }, + { + "epoch": 0.4953809590738919, + "grad_norm": 0.9847732782363892, + "learning_rate": 8.562149442240718e-05, + "loss": 0.911, + "step": 77540 + }, + { + "epoch": 0.49544484622363055, + "grad_norm": 0.9211950302124023, 
+ "learning_rate": 8.561797311622637e-05, + "loss": 0.8011, + "step": 77550 + }, + { + "epoch": 0.49550873337336926, + "grad_norm": 1.5273650884628296, + "learning_rate": 8.561445145134177e-05, + "loss": 0.9902, + "step": 77560 + }, + { + "epoch": 0.49557262052310797, + "grad_norm": 0.8541943430900574, + "learning_rate": 8.561092942778882e-05, + "loss": 0.7033, + "step": 77570 + }, + { + "epoch": 0.49563650767284667, + "grad_norm": 1.0141565799713135, + "learning_rate": 8.560740704560299e-05, + "loss": 0.7592, + "step": 77580 + }, + { + "epoch": 0.4957003948225854, + "grad_norm": 1.0680170059204102, + "learning_rate": 8.560388430481979e-05, + "loss": 1.0655, + "step": 77590 + }, + { + "epoch": 0.4957642819723241, + "grad_norm": 0.8860800862312317, + "learning_rate": 8.560036120547468e-05, + "loss": 0.9499, + "step": 77600 + }, + { + "epoch": 0.4958281691220628, + "grad_norm": 1.698761224746704, + "learning_rate": 8.559683774760311e-05, + "loss": 1.088, + "step": 77610 + }, + { + "epoch": 0.4958920562718015, + "grad_norm": 1.741514801979065, + "learning_rate": 8.559331393124059e-05, + "loss": 0.9058, + "step": 77620 + }, + { + "epoch": 0.4959559434215402, + "grad_norm": 2.4504334926605225, + "learning_rate": 8.558978975642262e-05, + "loss": 0.8441, + "step": 77630 + }, + { + "epoch": 0.4960198305712789, + "grad_norm": 1.7975729703903198, + "learning_rate": 8.558626522318467e-05, + "loss": 1.0169, + "step": 77640 + }, + { + "epoch": 0.4960837177210176, + "grad_norm": 0.8959939479827881, + "learning_rate": 8.558274033156224e-05, + "loss": 0.7613, + "step": 77650 + }, + { + "epoch": 0.4961476048707563, + "grad_norm": 1.107818841934204, + "learning_rate": 8.557921508159083e-05, + "loss": 1.1208, + "step": 77660 + }, + { + "epoch": 0.49621149202049497, + "grad_norm": 0.7719281911849976, + "learning_rate": 8.557568947330596e-05, + "loss": 0.932, + "step": 77670 + }, + { + "epoch": 0.4962753791702337, + "grad_norm": 0.7551287412643433, + "learning_rate": 
8.557216350674311e-05, + "loss": 0.9016, + "step": 77680 + }, + { + "epoch": 0.4963392663199724, + "grad_norm": 0.8318659067153931, + "learning_rate": 8.556863718193779e-05, + "loss": 0.9607, + "step": 77690 + }, + { + "epoch": 0.4964031534697111, + "grad_norm": 0.799311101436615, + "learning_rate": 8.556511049892553e-05, + "loss": 1.0467, + "step": 77700 + }, + { + "epoch": 0.4964670406194498, + "grad_norm": 1.174993634223938, + "learning_rate": 8.556158345774184e-05, + "loss": 0.9272, + "step": 77710 + }, + { + "epoch": 0.4965309277691885, + "grad_norm": 0.6065928936004639, + "learning_rate": 8.555805605842224e-05, + "loss": 0.9209, + "step": 77720 + }, + { + "epoch": 0.4965948149189272, + "grad_norm": 0.9526639580726624, + "learning_rate": 8.555452830100226e-05, + "loss": 0.8938, + "step": 77730 + }, + { + "epoch": 0.4966587020686659, + "grad_norm": 0.893748939037323, + "learning_rate": 8.555100018551741e-05, + "loss": 1.0344, + "step": 77740 + }, + { + "epoch": 0.4967225892184046, + "grad_norm": 0.9119165539741516, + "learning_rate": 8.554747171200324e-05, + "loss": 1.1131, + "step": 77750 + }, + { + "epoch": 0.4967864763681433, + "grad_norm": 0.649663507938385, + "learning_rate": 8.554394288049526e-05, + "loss": 0.8, + "step": 77760 + }, + { + "epoch": 0.49685036351788203, + "grad_norm": 0.7879058122634888, + "learning_rate": 8.554041369102904e-05, + "loss": 0.7511, + "step": 77770 + }, + { + "epoch": 0.49691425066762074, + "grad_norm": 1.0487127304077148, + "learning_rate": 8.55368841436401e-05, + "loss": 1.022, + "step": 77780 + }, + { + "epoch": 0.4969781378173594, + "grad_norm": 2.496959686279297, + "learning_rate": 8.553335423836399e-05, + "loss": 1.0092, + "step": 77790 + }, + { + "epoch": 0.4970420249670981, + "grad_norm": 0.8213831186294556, + "learning_rate": 8.552982397523628e-05, + "loss": 0.6308, + "step": 77800 + }, + { + "epoch": 0.4971059121168368, + "grad_norm": 1.1579580307006836, + "learning_rate": 8.55262933542925e-05, + "loss": 1.0219, + 
"step": 77810 + }, + { + "epoch": 0.4971697992665755, + "grad_norm": 0.927528440952301, + "learning_rate": 8.55227623755682e-05, + "loss": 0.7056, + "step": 77820 + }, + { + "epoch": 0.4972336864163142, + "grad_norm": 1.2623285055160522, + "learning_rate": 8.551923103909896e-05, + "loss": 0.9829, + "step": 77830 + }, + { + "epoch": 0.4972975735660529, + "grad_norm": 0.9464250802993774, + "learning_rate": 8.551569934492032e-05, + "loss": 0.8887, + "step": 77840 + }, + { + "epoch": 0.4973614607157916, + "grad_norm": 2.496879816055298, + "learning_rate": 8.551216729306788e-05, + "loss": 1.054, + "step": 77850 + }, + { + "epoch": 0.49742534786553033, + "grad_norm": 1.2735011577606201, + "learning_rate": 8.550863488357718e-05, + "loss": 0.7108, + "step": 77860 + }, + { + "epoch": 0.49748923501526904, + "grad_norm": 0.8742243647575378, + "learning_rate": 8.550510211648382e-05, + "loss": 1.1427, + "step": 77870 + }, + { + "epoch": 0.49755312216500774, + "grad_norm": 0.5537328720092773, + "learning_rate": 8.550156899182336e-05, + "loss": 0.5181, + "step": 77880 + }, + { + "epoch": 0.49761700931474645, + "grad_norm": 1.0898637771606445, + "learning_rate": 8.54980355096314e-05, + "loss": 1.0092, + "step": 77890 + }, + { + "epoch": 0.49768089646448516, + "grad_norm": 0.8493994474411011, + "learning_rate": 8.549450166994348e-05, + "loss": 0.9335, + "step": 77900 + }, + { + "epoch": 0.49774478361422386, + "grad_norm": 0.8540746569633484, + "learning_rate": 8.549096747279526e-05, + "loss": 0.9631, + "step": 77910 + }, + { + "epoch": 0.4978086707639625, + "grad_norm": 0.9067754745483398, + "learning_rate": 8.548743291822227e-05, + "loss": 0.7435, + "step": 77920 + }, + { + "epoch": 0.4978725579137012, + "grad_norm": 0.9325600862503052, + "learning_rate": 8.548389800626013e-05, + "loss": 0.9721, + "step": 77930 + }, + { + "epoch": 0.4979364450634399, + "grad_norm": 0.892930805683136, + "learning_rate": 8.548036273694445e-05, + "loss": 1.0944, + "step": 77940 + }, + { + "epoch": 
0.49800033221317863, + "grad_norm": 0.8587602376937866, + "learning_rate": 8.54768271103108e-05, + "loss": 1.1267, + "step": 77950 + }, + { + "epoch": 0.49806421936291734, + "grad_norm": 0.6374524831771851, + "learning_rate": 8.547329112639483e-05, + "loss": 0.8988, + "step": 77960 + }, + { + "epoch": 0.49812810651265604, + "grad_norm": 0.7276429533958435, + "learning_rate": 8.546975478523211e-05, + "loss": 0.9158, + "step": 77970 + }, + { + "epoch": 0.49819199366239475, + "grad_norm": 1.0100622177124023, + "learning_rate": 8.546621808685829e-05, + "loss": 0.9022, + "step": 77980 + }, + { + "epoch": 0.49825588081213346, + "grad_norm": 0.7123457193374634, + "learning_rate": 8.546268103130897e-05, + "loss": 1.0161, + "step": 77990 + }, + { + "epoch": 0.49831976796187216, + "grad_norm": 1.1501771211624146, + "learning_rate": 8.545914361861977e-05, + "loss": 0.848, + "step": 78000 + }, + { + "epoch": 0.49838365511161087, + "grad_norm": 1.3650606870651245, + "learning_rate": 8.545560584882632e-05, + "loss": 1.0104, + "step": 78010 + }, + { + "epoch": 0.4984475422613496, + "grad_norm": 0.5080598592758179, + "learning_rate": 8.545206772196425e-05, + "loss": 0.8855, + "step": 78020 + }, + { + "epoch": 0.4985114294110883, + "grad_norm": 0.9266533851623535, + "learning_rate": 8.544852923806918e-05, + "loss": 0.8948, + "step": 78030 + }, + { + "epoch": 0.49857531656082693, + "grad_norm": 0.782556414604187, + "learning_rate": 8.544499039717675e-05, + "loss": 1.0098, + "step": 78040 + }, + { + "epoch": 0.49863920371056564, + "grad_norm": 0.6983265280723572, + "learning_rate": 8.544145119932261e-05, + "loss": 0.8239, + "step": 78050 + }, + { + "epoch": 0.49870309086030434, + "grad_norm": 0.8616853952407837, + "learning_rate": 8.543791164454238e-05, + "loss": 0.8512, + "step": 78060 + }, + { + "epoch": 0.49876697801004305, + "grad_norm": 1.00681471824646, + "learning_rate": 8.543437173287175e-05, + "loss": 0.8171, + "step": 78070 + }, + { + "epoch": 0.49883086515978176, + 
"grad_norm": 0.7537940144538879, + "learning_rate": 8.543083146434632e-05, + "loss": 0.7415, + "step": 78080 + }, + { + "epoch": 0.49889475230952046, + "grad_norm": 0.8292582631111145, + "learning_rate": 8.542729083900176e-05, + "loss": 0.7361, + "step": 78090 + }, + { + "epoch": 0.49895863945925917, + "grad_norm": 0.6987549066543579, + "learning_rate": 8.542374985687376e-05, + "loss": 1.0473, + "step": 78100 + }, + { + "epoch": 0.4990225266089979, + "grad_norm": 0.7763581275939941, + "learning_rate": 8.542020851799792e-05, + "loss": 0.9915, + "step": 78110 + }, + { + "epoch": 0.4990864137587366, + "grad_norm": 0.857244610786438, + "learning_rate": 8.541666682240996e-05, + "loss": 0.8533, + "step": 78120 + }, + { + "epoch": 0.4991503009084753, + "grad_norm": 0.7238770127296448, + "learning_rate": 8.541312477014551e-05, + "loss": 1.0054, + "step": 78130 + }, + { + "epoch": 0.499214188058214, + "grad_norm": 1.2885125875473022, + "learning_rate": 8.540958236124028e-05, + "loss": 0.9024, + "step": 78140 + }, + { + "epoch": 0.4992780752079527, + "grad_norm": 1.3444433212280273, + "learning_rate": 8.540603959572991e-05, + "loss": 0.8516, + "step": 78150 + }, + { + "epoch": 0.49934196235769135, + "grad_norm": 1.0060087442398071, + "learning_rate": 8.540249647365008e-05, + "loss": 0.8785, + "step": 78160 + }, + { + "epoch": 0.49940584950743006, + "grad_norm": 1.0679055452346802, + "learning_rate": 8.539895299503648e-05, + "loss": 0.9874, + "step": 78170 + }, + { + "epoch": 0.49946973665716876, + "grad_norm": 0.7113552093505859, + "learning_rate": 8.539540915992482e-05, + "loss": 0.8014, + "step": 78180 + }, + { + "epoch": 0.49953362380690747, + "grad_norm": 1.0472384691238403, + "learning_rate": 8.539186496835077e-05, + "loss": 0.9478, + "step": 78190 + }, + { + "epoch": 0.4995975109566462, + "grad_norm": 0.7270193099975586, + "learning_rate": 8.538832042035e-05, + "loss": 0.9747, + "step": 78200 + }, + { + "epoch": 0.4996613981063849, + "grad_norm": 0.6182805895805359, + 
"learning_rate": 8.538477551595824e-05, + "loss": 0.9063, + "step": 78210 + }, + { + "epoch": 0.4997252852561236, + "grad_norm": 1.1360803842544556, + "learning_rate": 8.538123025521117e-05, + "loss": 0.942, + "step": 78220 + }, + { + "epoch": 0.4997891724058623, + "grad_norm": 0.7602689862251282, + "learning_rate": 8.537768463814451e-05, + "loss": 0.9089, + "step": 78230 + }, + { + "epoch": 0.499853059555601, + "grad_norm": 0.8490816354751587, + "learning_rate": 8.537413866479396e-05, + "loss": 1.0442, + "step": 78240 + }, + { + "epoch": 0.4999169467053397, + "grad_norm": 0.8993768692016602, + "learning_rate": 8.537059233519522e-05, + "loss": 0.7744, + "step": 78250 + }, + { + "epoch": 0.4999808338550784, + "grad_norm": 1.221891164779663, + "learning_rate": 8.536704564938402e-05, + "loss": 0.9663, + "step": 78260 + }, + { + "epoch": 0.5000447210048171, + "grad_norm": 0.7886923551559448, + "learning_rate": 8.536349860739608e-05, + "loss": 0.9475, + "step": 78270 + }, + { + "epoch": 0.5001086081545558, + "grad_norm": 0.8273355960845947, + "learning_rate": 8.535995120926712e-05, + "loss": 0.9991, + "step": 78280 + }, + { + "epoch": 0.5001724953042945, + "grad_norm": 0.7041333317756653, + "learning_rate": 8.535640345503285e-05, + "loss": 0.7416, + "step": 78290 + }, + { + "epoch": 0.5002363824540332, + "grad_norm": 0.994158148765564, + "learning_rate": 8.535285534472901e-05, + "loss": 0.802, + "step": 78300 + }, + { + "epoch": 0.5003002696037719, + "grad_norm": 1.7603987455368042, + "learning_rate": 8.534930687839134e-05, + "loss": 0.948, + "step": 78310 + }, + { + "epoch": 0.5003641567535106, + "grad_norm": 0.8774569034576416, + "learning_rate": 8.534575805605555e-05, + "loss": 0.9112, + "step": 78320 + }, + { + "epoch": 0.5004280439032494, + "grad_norm": 1.1153593063354492, + "learning_rate": 8.534220887775743e-05, + "loss": 0.8952, + "step": 78330 + }, + { + "epoch": 0.500491931052988, + "grad_norm": 0.6436009407043457, + "learning_rate": 8.533865934353267e-05, + 
"loss": 0.8265, + "step": 78340 + }, + { + "epoch": 0.5005558182027267, + "grad_norm": 0.8798633217811584, + "learning_rate": 8.533510945341704e-05, + "loss": 0.876, + "step": 78350 + }, + { + "epoch": 0.5006197053524654, + "grad_norm": 0.92572021484375, + "learning_rate": 8.533155920744629e-05, + "loss": 1.0156, + "step": 78360 + }, + { + "epoch": 0.5006835925022041, + "grad_norm": 1.193281650543213, + "learning_rate": 8.532800860565618e-05, + "loss": 0.7637, + "step": 78370 + }, + { + "epoch": 0.5007474796519428, + "grad_norm": 1.7722188234329224, + "learning_rate": 8.532445764808243e-05, + "loss": 0.8904, + "step": 78380 + }, + { + "epoch": 0.5008113668016815, + "grad_norm": 1.1688928604125977, + "learning_rate": 8.532090633476087e-05, + "loss": 0.9443, + "step": 78390 + }, + { + "epoch": 0.5008752539514202, + "grad_norm": 0.7432667016983032, + "learning_rate": 8.531735466572722e-05, + "loss": 0.8921, + "step": 78400 + }, + { + "epoch": 0.5009391411011589, + "grad_norm": 0.5280702710151672, + "learning_rate": 8.531380264101722e-05, + "loss": 0.6771, + "step": 78410 + }, + { + "epoch": 0.5010030282508976, + "grad_norm": 1.1904703378677368, + "learning_rate": 8.531025026066672e-05, + "loss": 0.9539, + "step": 78420 + }, + { + "epoch": 0.5010669154006363, + "grad_norm": 1.0105900764465332, + "learning_rate": 8.530669752471142e-05, + "loss": 0.7645, + "step": 78430 + }, + { + "epoch": 0.501130802550375, + "grad_norm": 1.2654132843017578, + "learning_rate": 8.530314443318714e-05, + "loss": 0.8891, + "step": 78440 + }, + { + "epoch": 0.5011946897001137, + "grad_norm": 1.8313031196594238, + "learning_rate": 8.529959098612966e-05, + "loss": 0.8518, + "step": 78450 + }, + { + "epoch": 0.5012585768498524, + "grad_norm": 0.9657493829727173, + "learning_rate": 8.529603718357476e-05, + "loss": 0.9087, + "step": 78460 + }, + { + "epoch": 0.5013224639995911, + "grad_norm": 0.9630830883979797, + "learning_rate": 8.529248302555824e-05, + "loss": 0.9349, + "step": 78470 + }, + { 
+ "epoch": 0.5013863511493298, + "grad_norm": 0.7544282674789429, + "learning_rate": 8.528892851211587e-05, + "loss": 0.811, + "step": 78480 + }, + { + "epoch": 0.5014502382990685, + "grad_norm": 1.854946255683899, + "learning_rate": 8.528537364328346e-05, + "loss": 1.142, + "step": 78490 + }, + { + "epoch": 0.5015141254488072, + "grad_norm": 0.9642276763916016, + "learning_rate": 8.528181841909681e-05, + "loss": 0.8864, + "step": 78500 + }, + { + "epoch": 0.501578012598546, + "grad_norm": 0.8221122026443481, + "learning_rate": 8.527826283959173e-05, + "loss": 0.9846, + "step": 78510 + }, + { + "epoch": 0.5016418997482847, + "grad_norm": 1.0337133407592773, + "learning_rate": 8.527470690480403e-05, + "loss": 0.7898, + "step": 78520 + }, + { + "epoch": 0.5017057868980234, + "grad_norm": 0.7045915126800537, + "learning_rate": 8.527115061476951e-05, + "loss": 0.9587, + "step": 78530 + }, + { + "epoch": 0.5017696740477621, + "grad_norm": 0.8976203799247742, + "learning_rate": 8.526759396952398e-05, + "loss": 0.815, + "step": 78540 + }, + { + "epoch": 0.5018335611975008, + "grad_norm": 1.2948665618896484, + "learning_rate": 8.526403696910326e-05, + "loss": 1.1365, + "step": 78550 + }, + { + "epoch": 0.5018974483472395, + "grad_norm": 0.5973215699195862, + "learning_rate": 8.52604796135432e-05, + "loss": 0.934, + "step": 78560 + }, + { + "epoch": 0.5019613354969782, + "grad_norm": 0.8393608927726746, + "learning_rate": 8.52569219028796e-05, + "loss": 0.9389, + "step": 78570 + }, + { + "epoch": 0.5020252226467168, + "grad_norm": 0.8553054332733154, + "learning_rate": 8.525336383714831e-05, + "loss": 0.9821, + "step": 78580 + }, + { + "epoch": 0.5020891097964555, + "grad_norm": 0.43800783157348633, + "learning_rate": 8.524980541638513e-05, + "loss": 0.7432, + "step": 78590 + }, + { + "epoch": 0.5021529969461942, + "grad_norm": 0.6436516046524048, + "learning_rate": 8.524624664062591e-05, + "loss": 0.8488, + "step": 78600 + }, + { + "epoch": 0.5022168840959329, + 
"grad_norm": 1.9890680313110352, + "learning_rate": 8.524268750990649e-05, + "loss": 0.9869, + "step": 78610 + }, + { + "epoch": 0.5022807712456716, + "grad_norm": 0.5732369422912598, + "learning_rate": 8.523912802426274e-05, + "loss": 0.8985, + "step": 78620 + }, + { + "epoch": 0.5023446583954103, + "grad_norm": 2.491802453994751, + "learning_rate": 8.523556818373047e-05, + "loss": 0.7785, + "step": 78630 + }, + { + "epoch": 0.502408545545149, + "grad_norm": 0.7027126550674438, + "learning_rate": 8.523200798834555e-05, + "loss": 0.9466, + "step": 78640 + }, + { + "epoch": 0.5024724326948877, + "grad_norm": 0.7287322282791138, + "learning_rate": 8.522844743814382e-05, + "loss": 0.7692, + "step": 78650 + }, + { + "epoch": 0.5025363198446264, + "grad_norm": 0.9365010857582092, + "learning_rate": 8.522488653316117e-05, + "loss": 0.7661, + "step": 78660 + }, + { + "epoch": 0.5026002069943651, + "grad_norm": 0.5464925765991211, + "learning_rate": 8.522132527343342e-05, + "loss": 0.6622, + "step": 78670 + }, + { + "epoch": 0.5026640941441038, + "grad_norm": 1.0046019554138184, + "learning_rate": 8.521776365899645e-05, + "loss": 0.8033, + "step": 78680 + }, + { + "epoch": 0.5027279812938426, + "grad_norm": 1.4342055320739746, + "learning_rate": 8.521420168988615e-05, + "loss": 0.6578, + "step": 78690 + }, + { + "epoch": 0.5027918684435813, + "grad_norm": 0.5104334950447083, + "learning_rate": 8.521063936613835e-05, + "loss": 0.8798, + "step": 78700 + }, + { + "epoch": 0.50285575559332, + "grad_norm": 0.5231984853744507, + "learning_rate": 8.520707668778897e-05, + "loss": 0.709, + "step": 78710 + }, + { + "epoch": 0.5029196427430587, + "grad_norm": 1.10532546043396, + "learning_rate": 8.520351365487387e-05, + "loss": 0.7458, + "step": 78720 + }, + { + "epoch": 0.5029835298927974, + "grad_norm": 1.1135833263397217, + "learning_rate": 8.519995026742892e-05, + "loss": 0.779, + "step": 78730 + }, + { + "epoch": 0.5030474170425361, + "grad_norm": 1.3996037244796753, + 
"learning_rate": 8.519638652549003e-05, + "loss": 0.8194, + "step": 78740 + }, + { + "epoch": 0.5031113041922748, + "grad_norm": 0.8986942172050476, + "learning_rate": 8.519282242909307e-05, + "loss": 0.876, + "step": 78750 + }, + { + "epoch": 0.5031751913420135, + "grad_norm": 1.100974202156067, + "learning_rate": 8.518925797827394e-05, + "loss": 0.9528, + "step": 78760 + }, + { + "epoch": 0.5032390784917522, + "grad_norm": 0.801201581954956, + "learning_rate": 8.518569317306855e-05, + "loss": 1.0414, + "step": 78770 + }, + { + "epoch": 0.5033029656414909, + "grad_norm": 0.7082595825195312, + "learning_rate": 8.518212801351278e-05, + "loss": 0.9245, + "step": 78780 + }, + { + "epoch": 0.5033668527912296, + "grad_norm": 0.708473265171051, + "learning_rate": 8.517856249964254e-05, + "loss": 1.1068, + "step": 78790 + }, + { + "epoch": 0.5034307399409683, + "grad_norm": 0.6335508823394775, + "learning_rate": 8.517499663149376e-05, + "loss": 0.6662, + "step": 78800 + }, + { + "epoch": 0.503494627090707, + "grad_norm": 0.6749662756919861, + "learning_rate": 8.517143040910231e-05, + "loss": 0.7987, + "step": 78810 + }, + { + "epoch": 0.5035585142404457, + "grad_norm": 0.8133144974708557, + "learning_rate": 8.516786383250415e-05, + "loss": 0.8016, + "step": 78820 + }, + { + "epoch": 0.5036224013901843, + "grad_norm": 2.017829418182373, + "learning_rate": 8.516429690173516e-05, + "loss": 0.9251, + "step": 78830 + }, + { + "epoch": 0.503686288539923, + "grad_norm": 0.5370079278945923, + "learning_rate": 8.516072961683128e-05, + "loss": 0.8484, + "step": 78840 + }, + { + "epoch": 0.5037501756896617, + "grad_norm": 0.9369492530822754, + "learning_rate": 8.515716197782845e-05, + "loss": 0.7588, + "step": 78850 + }, + { + "epoch": 0.5038140628394004, + "grad_norm": 0.9964790344238281, + "learning_rate": 8.515359398476257e-05, + "loss": 1.0391, + "step": 78860 + }, + { + "epoch": 0.5038779499891392, + "grad_norm": 0.9717357158660889, + "learning_rate": 8.51500256376696e-05, + 
"loss": 0.8133, + "step": 78870 + }, + { + "epoch": 0.5039418371388779, + "grad_norm": 1.1114850044250488, + "learning_rate": 8.514645693658545e-05, + "loss": 1.0353, + "step": 78880 + }, + { + "epoch": 0.5040057242886166, + "grad_norm": 0.9141243100166321, + "learning_rate": 8.514288788154607e-05, + "loss": 1.0811, + "step": 78890 + }, + { + "epoch": 0.5040696114383553, + "grad_norm": 0.6969479322433472, + "learning_rate": 8.513931847258741e-05, + "loss": 0.7896, + "step": 78900 + }, + { + "epoch": 0.504133498588094, + "grad_norm": 0.8812980651855469, + "learning_rate": 8.513574870974542e-05, + "loss": 0.9231, + "step": 78910 + }, + { + "epoch": 0.5041973857378327, + "grad_norm": 0.9980469942092896, + "learning_rate": 8.513217859305604e-05, + "loss": 0.8142, + "step": 78920 + }, + { + "epoch": 0.5042612728875714, + "grad_norm": 2.051957130432129, + "learning_rate": 8.512860812255523e-05, + "loss": 0.9061, + "step": 78930 + }, + { + "epoch": 0.5043251600373101, + "grad_norm": 1.683716058731079, + "learning_rate": 8.512503729827894e-05, + "loss": 0.8771, + "step": 78940 + }, + { + "epoch": 0.5043890471870488, + "grad_norm": 0.7868318557739258, + "learning_rate": 8.512146612026314e-05, + "loss": 0.8051, + "step": 78950 + }, + { + "epoch": 0.5044529343367875, + "grad_norm": 0.5927671790122986, + "learning_rate": 8.511789458854379e-05, + "loss": 0.8834, + "step": 78960 + }, + { + "epoch": 0.5045168214865262, + "grad_norm": 1.6531774997711182, + "learning_rate": 8.511432270315685e-05, + "loss": 0.7847, + "step": 78970 + }, + { + "epoch": 0.5045807086362649, + "grad_norm": 1.921579360961914, + "learning_rate": 8.511075046413832e-05, + "loss": 0.9616, + "step": 78980 + }, + { + "epoch": 0.5046445957860036, + "grad_norm": 0.9210075736045837, + "learning_rate": 8.510717787152416e-05, + "loss": 0.6717, + "step": 78990 + }, + { + "epoch": 0.5047084829357423, + "grad_norm": 0.7043361663818359, + "learning_rate": 8.510360492535033e-05, + "loss": 0.8636, + "step": 79000 + }, + { 
+ "epoch": 0.504772370085481, + "grad_norm": 0.8452950716018677, + "learning_rate": 8.510003162565283e-05, + "loss": 1.0588, + "step": 79010 + }, + { + "epoch": 0.5048362572352197, + "grad_norm": 0.9324773550033569, + "learning_rate": 8.509645797246766e-05, + "loss": 0.8968, + "step": 79020 + }, + { + "epoch": 0.5049001443849584, + "grad_norm": 0.7496733069419861, + "learning_rate": 8.50928839658308e-05, + "loss": 0.9523, + "step": 79030 + }, + { + "epoch": 0.5049640315346972, + "grad_norm": 1.1798431873321533, + "learning_rate": 8.508930960577821e-05, + "loss": 0.9494, + "step": 79040 + }, + { + "epoch": 0.5050279186844359, + "grad_norm": 0.9668488502502441, + "learning_rate": 8.508573489234594e-05, + "loss": 0.8869, + "step": 79050 + }, + { + "epoch": 0.5050918058341746, + "grad_norm": 0.746605634689331, + "learning_rate": 8.508215982556996e-05, + "loss": 0.8892, + "step": 79060 + }, + { + "epoch": 0.5051556929839132, + "grad_norm": 0.7922160029411316, + "learning_rate": 8.507858440548628e-05, + "loss": 0.772, + "step": 79070 + }, + { + "epoch": 0.5052195801336519, + "grad_norm": 0.7023123502731323, + "learning_rate": 8.50750086321309e-05, + "loss": 1.062, + "step": 79080 + }, + { + "epoch": 0.5052834672833906, + "grad_norm": 0.7416033148765564, + "learning_rate": 8.507143250553985e-05, + "loss": 0.8482, + "step": 79090 + }, + { + "epoch": 0.5053473544331293, + "grad_norm": 0.6974393725395203, + "learning_rate": 8.506785602574914e-05, + "loss": 1.0322, + "step": 79100 + }, + { + "epoch": 0.505411241582868, + "grad_norm": 1.0407123565673828, + "learning_rate": 8.506427919279478e-05, + "loss": 0.7803, + "step": 79110 + }, + { + "epoch": 0.5054751287326067, + "grad_norm": 0.6520995497703552, + "learning_rate": 8.506070200671277e-05, + "loss": 1.2658, + "step": 79120 + }, + { + "epoch": 0.5055390158823454, + "grad_norm": 1.1513316631317139, + "learning_rate": 8.505712446753918e-05, + "loss": 0.8079, + "step": 79130 + }, + { + "epoch": 0.5056029030320841, + 
"grad_norm": 0.7983292937278748, + "learning_rate": 8.505354657531001e-05, + "loss": 1.3388, + "step": 79140 + }, + { + "epoch": 0.5056667901818228, + "grad_norm": 0.6515194177627563, + "learning_rate": 8.50499683300613e-05, + "loss": 0.7375, + "step": 79150 + }, + { + "epoch": 0.5057306773315615, + "grad_norm": 0.7249539494514465, + "learning_rate": 8.504638973182908e-05, + "loss": 0.8181, + "step": 79160 + }, + { + "epoch": 0.5057945644813002, + "grad_norm": 1.1405197381973267, + "learning_rate": 8.504281078064942e-05, + "loss": 0.8314, + "step": 79170 + }, + { + "epoch": 0.5058584516310389, + "grad_norm": 0.5889720320701599, + "learning_rate": 8.503923147655832e-05, + "loss": 0.9283, + "step": 79180 + }, + { + "epoch": 0.5059223387807776, + "grad_norm": 1.013061761856079, + "learning_rate": 8.503565181959185e-05, + "loss": 0.7729, + "step": 79190 + }, + { + "epoch": 0.5059862259305163, + "grad_norm": 1.1202266216278076, + "learning_rate": 8.503207180978604e-05, + "loss": 0.888, + "step": 79200 + }, + { + "epoch": 0.506050113080255, + "grad_norm": 1.2008094787597656, + "learning_rate": 8.502849144717698e-05, + "loss": 0.8544, + "step": 79210 + }, + { + "epoch": 0.5061140002299938, + "grad_norm": 0.7154238224029541, + "learning_rate": 8.50249107318007e-05, + "loss": 1.0849, + "step": 79220 + }, + { + "epoch": 0.5061778873797325, + "grad_norm": 0.9151634573936462, + "learning_rate": 8.502132966369327e-05, + "loss": 0.8898, + "step": 79230 + }, + { + "epoch": 0.5062417745294712, + "grad_norm": 0.9326740503311157, + "learning_rate": 8.501774824289076e-05, + "loss": 0.7975, + "step": 79240 + }, + { + "epoch": 0.5063056616792099, + "grad_norm": 0.9655689001083374, + "learning_rate": 8.501416646942922e-05, + "loss": 0.8229, + "step": 79250 + }, + { + "epoch": 0.5063695488289486, + "grad_norm": null, + "learning_rate": 8.50109425718202e-05, + "loss": 0.929, + "step": 79260 + }, + { + "epoch": 0.5064334359786873, + "grad_norm": 1.2509207725524902, + "learning_rate": 
8.50073601284059e-05, + "loss": 0.8922, + "step": 79270 + }, + { + "epoch": 0.506497323128426, + "grad_norm": 0.7241592407226562, + "learning_rate": 8.50037773324372e-05, + "loss": 1.0239, + "step": 79280 + }, + { + "epoch": 0.5065612102781647, + "grad_norm": 0.7398717999458313, + "learning_rate": 8.500019418395019e-05, + "loss": 1.0812, + "step": 79290 + }, + { + "epoch": 0.5066250974279034, + "grad_norm": 1.2325761318206787, + "learning_rate": 8.499661068298093e-05, + "loss": 0.9234, + "step": 79300 + }, + { + "epoch": 0.506688984577642, + "grad_norm": 0.7128446102142334, + "learning_rate": 8.499302682956554e-05, + "loss": 0.7636, + "step": 79310 + }, + { + "epoch": 0.5067528717273807, + "grad_norm": 1.0909960269927979, + "learning_rate": 8.498944262374009e-05, + "loss": 0.9345, + "step": 79320 + }, + { + "epoch": 0.5068167588771194, + "grad_norm": 0.8504812121391296, + "learning_rate": 8.498585806554069e-05, + "loss": 1.0587, + "step": 79330 + }, + { + "epoch": 0.5068806460268581, + "grad_norm": 1.0179625749588013, + "learning_rate": 8.498227315500343e-05, + "loss": 0.8948, + "step": 79340 + }, + { + "epoch": 0.5069445331765968, + "grad_norm": 0.7913358807563782, + "learning_rate": 8.497868789216439e-05, + "loss": 0.9132, + "step": 79350 + }, + { + "epoch": 0.5070084203263355, + "grad_norm": 2.097581148147583, + "learning_rate": 8.497510227705972e-05, + "loss": 1.0746, + "step": 79360 + }, + { + "epoch": 0.5070723074760742, + "grad_norm": 0.8437251448631287, + "learning_rate": 8.497151630972552e-05, + "loss": 0.8626, + "step": 79370 + }, + { + "epoch": 0.5071361946258129, + "grad_norm": 1.6225666999816895, + "learning_rate": 8.496792999019789e-05, + "loss": 0.9101, + "step": 79380 + }, + { + "epoch": 0.5072000817755516, + "grad_norm": 1.98760986328125, + "learning_rate": 8.496434331851295e-05, + "loss": 0.8182, + "step": 79390 + }, + { + "epoch": 0.5072639689252904, + "grad_norm": 0.8181973099708557, + "learning_rate": 8.496075629470683e-05, + "loss": 0.7777, + 
"step": 79400 + }, + { + "epoch": 0.5073278560750291, + "grad_norm": 0.9031455516815186, + "learning_rate": 8.495716891881564e-05, + "loss": 1.0561, + "step": 79410 + }, + { + "epoch": 0.5073917432247678, + "grad_norm": 1.2834783792495728, + "learning_rate": 8.495358119087553e-05, + "loss": 0.8807, + "step": 79420 + }, + { + "epoch": 0.5074556303745065, + "grad_norm": 0.5575640201568604, + "learning_rate": 8.494999311092262e-05, + "loss": 0.8329, + "step": 79430 + }, + { + "epoch": 0.5075195175242452, + "grad_norm": 1.2049697637557983, + "learning_rate": 8.494640467899303e-05, + "loss": 1.0383, + "step": 79440 + }, + { + "epoch": 0.5075834046739839, + "grad_norm": 1.0265311002731323, + "learning_rate": 8.494281589512292e-05, + "loss": 0.8573, + "step": 79450 + }, + { + "epoch": 0.5076472918237226, + "grad_norm": 1.0250693559646606, + "learning_rate": 8.493922675934842e-05, + "loss": 1.0297, + "step": 79460 + }, + { + "epoch": 0.5077111789734613, + "grad_norm": 0.6825410723686218, + "learning_rate": 8.493563727170569e-05, + "loss": 0.7719, + "step": 79470 + }, + { + "epoch": 0.5077750661232, + "grad_norm": 0.7861701250076294, + "learning_rate": 8.493204743223084e-05, + "loss": 0.915, + "step": 79480 + }, + { + "epoch": 0.5078389532729387, + "grad_norm": 0.7970221638679504, + "learning_rate": 8.492845724096008e-05, + "loss": 0.7341, + "step": 79490 + }, + { + "epoch": 0.5079028404226774, + "grad_norm": 1.6295416355133057, + "learning_rate": 8.492486669792955e-05, + "loss": 0.7568, + "step": 79500 + }, + { + "epoch": 0.5079667275724161, + "grad_norm": 1.207197666168213, + "learning_rate": 8.492127580317536e-05, + "loss": 0.7609, + "step": 79510 + }, + { + "epoch": 0.5080306147221548, + "grad_norm": 0.6028062105178833, + "learning_rate": 8.491768455673373e-05, + "loss": 0.8507, + "step": 79520 + }, + { + "epoch": 0.5080945018718935, + "grad_norm": 0.6845399737358093, + "learning_rate": 8.49140929586408e-05, + "loss": 0.8778, + "step": 79530 + }, + { + "epoch": 
0.5081583890216322, + "grad_norm": 0.9547748565673828, + "learning_rate": 8.491050100893276e-05, + "loss": 0.7729, + "step": 79540 + }, + { + "epoch": 0.508222276171371, + "grad_norm": 1.033980369567871, + "learning_rate": 8.490690870764577e-05, + "loss": 0.928, + "step": 79550 + }, + { + "epoch": 0.5082861633211095, + "grad_norm": 0.9669222831726074, + "learning_rate": 8.490331605481602e-05, + "loss": 0.9523, + "step": 79560 + }, + { + "epoch": 0.5083500504708482, + "grad_norm": 0.8368834853172302, + "learning_rate": 8.489972305047968e-05, + "loss": 1.0998, + "step": 79570 + }, + { + "epoch": 0.508413937620587, + "grad_norm": 0.8119040727615356, + "learning_rate": 8.489612969467292e-05, + "loss": 1.0353, + "step": 79580 + }, + { + "epoch": 0.5084778247703257, + "grad_norm": 0.9374289512634277, + "learning_rate": 8.489253598743195e-05, + "loss": 1.2263, + "step": 79590 + }, + { + "epoch": 0.5085417119200644, + "grad_norm": 0.6595514416694641, + "learning_rate": 8.488894192879297e-05, + "loss": 0.9092, + "step": 79600 + }, + { + "epoch": 0.5086055990698031, + "grad_norm": 0.5380666851997375, + "learning_rate": 8.488534751879213e-05, + "loss": 1.0118, + "step": 79610 + }, + { + "epoch": 0.5086694862195418, + "grad_norm": 0.6525367498397827, + "learning_rate": 8.488175275746568e-05, + "loss": 0.9371, + "step": 79620 + }, + { + "epoch": 0.5087333733692805, + "grad_norm": 0.5488191246986389, + "learning_rate": 8.487815764484981e-05, + "loss": 0.8618, + "step": 79630 + }, + { + "epoch": 0.5087972605190192, + "grad_norm": 0.7757022380828857, + "learning_rate": 8.487456218098071e-05, + "loss": 0.9421, + "step": 79640 + }, + { + "epoch": 0.5088611476687579, + "grad_norm": 0.7398278117179871, + "learning_rate": 8.48709663658946e-05, + "loss": 1.1401, + "step": 79650 + }, + { + "epoch": 0.5089250348184966, + "grad_norm": 1.6941704750061035, + "learning_rate": 8.486737019962769e-05, + "loss": 0.8624, + "step": 79660 + }, + { + "epoch": 0.5089889219682353, + "grad_norm": 
0.8483586311340332, + "learning_rate": 8.486377368221621e-05, + "loss": 0.8825, + "step": 79670 + }, + { + "epoch": 0.509052809117974, + "grad_norm": 0.8225073218345642, + "learning_rate": 8.486017681369636e-05, + "loss": 0.7361, + "step": 79680 + }, + { + "epoch": 0.5091166962677127, + "grad_norm": 0.8197336196899414, + "learning_rate": 8.485657959410436e-05, + "loss": 0.9902, + "step": 79690 + }, + { + "epoch": 0.5091805834174514, + "grad_norm": 0.6956250667572021, + "learning_rate": 8.485298202347646e-05, + "loss": 0.9947, + "step": 79700 + }, + { + "epoch": 0.5092444705671901, + "grad_norm": 1.1216806173324585, + "learning_rate": 8.484938410184888e-05, + "loss": 0.7103, + "step": 79710 + }, + { + "epoch": 0.5093083577169288, + "grad_norm": 1.101396083831787, + "learning_rate": 8.484578582925784e-05, + "loss": 0.7783, + "step": 79720 + }, + { + "epoch": 0.5093722448666675, + "grad_norm": 1.2090519666671753, + "learning_rate": 8.48421872057396e-05, + "loss": 0.7809, + "step": 79730 + }, + { + "epoch": 0.5094361320164063, + "grad_norm": 0.9379667043685913, + "learning_rate": 8.48385882313304e-05, + "loss": 1.0095, + "step": 79740 + }, + { + "epoch": 0.509500019166145, + "grad_norm": 0.6880574822425842, + "learning_rate": 8.483498890606647e-05, + "loss": 0.7678, + "step": 79750 + }, + { + "epoch": 0.5095639063158837, + "grad_norm": 0.9663302898406982, + "learning_rate": 8.483138922998406e-05, + "loss": 0.9895, + "step": 79760 + }, + { + "epoch": 0.5096277934656224, + "grad_norm": 2.0903241634368896, + "learning_rate": 8.482778920311942e-05, + "loss": 0.9586, + "step": 79770 + }, + { + "epoch": 0.5096916806153611, + "grad_norm": 0.723540723323822, + "learning_rate": 8.482418882550882e-05, + "loss": 0.7927, + "step": 79780 + }, + { + "epoch": 0.5097555677650998, + "grad_norm": 0.6735635995864868, + "learning_rate": 8.482058809718852e-05, + "loss": 0.733, + "step": 79790 + }, + { + "epoch": 0.5098194549148384, + "grad_norm": 0.7494048476219177, + "learning_rate": 
8.481698701819476e-05, + "loss": 0.7265, + "step": 79800 + }, + { + "epoch": 0.5098833420645771, + "grad_norm": 1.2474843263626099, + "learning_rate": 8.481338558856383e-05, + "loss": 0.9442, + "step": 79810 + }, + { + "epoch": 0.5099472292143158, + "grad_norm": 1.0671770572662354, + "learning_rate": 8.4809783808332e-05, + "loss": 0.6449, + "step": 79820 + }, + { + "epoch": 0.5100111163640545, + "grad_norm": 1.693997859954834, + "learning_rate": 8.480618167753551e-05, + "loss": 0.9382, + "step": 79830 + }, + { + "epoch": 0.5100750035137932, + "grad_norm": 0.8211742639541626, + "learning_rate": 8.480257919621067e-05, + "loss": 0.8989, + "step": 79840 + }, + { + "epoch": 0.5101388906635319, + "grad_norm": 0.6184179186820984, + "learning_rate": 8.479897636439375e-05, + "loss": 0.9222, + "step": 79850 + }, + { + "epoch": 0.5102027778132706, + "grad_norm": 0.9833461046218872, + "learning_rate": 8.479537318212103e-05, + "loss": 0.8038, + "step": 79860 + }, + { + "epoch": 0.5102666649630093, + "grad_norm": 1.01847243309021, + "learning_rate": 8.479176964942879e-05, + "loss": 1.0515, + "step": 79870 + }, + { + "epoch": 0.510330552112748, + "grad_norm": 1.1789108514785767, + "learning_rate": 8.478816576635334e-05, + "loss": 0.7673, + "step": 79880 + }, + { + "epoch": 0.5103944392624867, + "grad_norm": 1.949750542640686, + "learning_rate": 8.478456153293096e-05, + "loss": 0.8108, + "step": 79890 + }, + { + "epoch": 0.5104583264122254, + "grad_norm": 1.040195345878601, + "learning_rate": 8.478095694919797e-05, + "loss": 0.9984, + "step": 79900 + }, + { + "epoch": 0.5105222135619641, + "grad_norm": 0.8911735415458679, + "learning_rate": 8.477735201519063e-05, + "loss": 0.9404, + "step": 79910 + }, + { + "epoch": 0.5105861007117029, + "grad_norm": 0.8057443499565125, + "learning_rate": 8.477374673094526e-05, + "loss": 0.638, + "step": 79920 + }, + { + "epoch": 0.5106499878614416, + "grad_norm": 0.5853357911109924, + "learning_rate": 8.477014109649822e-05, + "loss": 0.8098, + 
"step": 79930 + }, + { + "epoch": 0.5107138750111803, + "grad_norm": 0.8207983374595642, + "learning_rate": 8.476653511188575e-05, + "loss": 0.9, + "step": 79940 + }, + { + "epoch": 0.510777762160919, + "grad_norm": 0.6729571223258972, + "learning_rate": 8.47629287771442e-05, + "loss": 0.8749, + "step": 79950 + }, + { + "epoch": 0.5108416493106577, + "grad_norm": 0.5826616287231445, + "learning_rate": 8.475932209230987e-05, + "loss": 0.8363, + "step": 79960 + }, + { + "epoch": 0.5109055364603964, + "grad_norm": 0.5043898224830627, + "learning_rate": 8.475571505741912e-05, + "loss": 0.7508, + "step": 79970 + }, + { + "epoch": 0.5109694236101351, + "grad_norm": 0.9225212931632996, + "learning_rate": 8.475210767250823e-05, + "loss": 0.9501, + "step": 79980 + }, + { + "epoch": 0.5110333107598738, + "grad_norm": 1.0718021392822266, + "learning_rate": 8.474849993761357e-05, + "loss": 0.8453, + "step": 79990 + }, + { + "epoch": 0.5110971979096125, + "grad_norm": 0.7842211127281189, + "learning_rate": 8.474489185277143e-05, + "loss": 1.3727, + "step": 80000 + }, + { + "epoch": 0.5111610850593512, + "grad_norm": 0.8479704260826111, + "learning_rate": 8.474128341801819e-05, + "loss": 1.1579, + "step": 80010 + }, + { + "epoch": 0.5112249722090899, + "grad_norm": 0.736724317073822, + "learning_rate": 8.473767463339018e-05, + "loss": 0.8251, + "step": 80020 + }, + { + "epoch": 0.5112888593588286, + "grad_norm": 0.6635915040969849, + "learning_rate": 8.47340654989237e-05, + "loss": 1.2262, + "step": 80030 + }, + { + "epoch": 0.5113527465085672, + "grad_norm": 0.818091869354248, + "learning_rate": 8.473045601465515e-05, + "loss": 0.9825, + "step": 80040 + }, + { + "epoch": 0.5114166336583059, + "grad_norm": 0.8209525346755981, + "learning_rate": 8.472684618062085e-05, + "loss": 1.1992, + "step": 80050 + }, + { + "epoch": 0.5114805208080446, + "grad_norm": 0.6535345911979675, + "learning_rate": 8.472323599685718e-05, + "loss": 0.6442, + "step": 80060 + }, + { + "epoch": 
0.5115444079577833, + "grad_norm": 1.321568489074707, + "learning_rate": 8.471962546340049e-05, + "loss": 1.0123, + "step": 80070 + }, + { + "epoch": 0.511608295107522, + "grad_norm": 1.0992311239242554, + "learning_rate": 8.471601458028713e-05, + "loss": 0.8926, + "step": 80080 + }, + { + "epoch": 0.5116721822572607, + "grad_norm": 0.987280547618866, + "learning_rate": 8.471240334755346e-05, + "loss": 0.943, + "step": 80090 + }, + { + "epoch": 0.5117360694069994, + "grad_norm": 0.7247947454452515, + "learning_rate": 8.470879176523586e-05, + "loss": 0.8531, + "step": 80100 + }, + { + "epoch": 0.5117999565567382, + "grad_norm": 0.8526644706726074, + "learning_rate": 8.470517983337071e-05, + "loss": 0.7333, + "step": 80110 + }, + { + "epoch": 0.5118638437064769, + "grad_norm": 1.081724762916565, + "learning_rate": 8.470156755199436e-05, + "loss": 0.9023, + "step": 80120 + }, + { + "epoch": 0.5119277308562156, + "grad_norm": 0.9575611352920532, + "learning_rate": 8.469795492114321e-05, + "loss": 0.7949, + "step": 80130 + }, + { + "epoch": 0.5119916180059543, + "grad_norm": 0.6004752516746521, + "learning_rate": 8.469434194085364e-05, + "loss": 1.0179, + "step": 80140 + }, + { + "epoch": 0.512055505155693, + "grad_norm": 0.7859931588172913, + "learning_rate": 8.469072861116202e-05, + "loss": 0.9604, + "step": 80150 + }, + { + "epoch": 0.5121193923054317, + "grad_norm": 0.9513803124427795, + "learning_rate": 8.468711493210476e-05, + "loss": 0.8357, + "step": 80160 + }, + { + "epoch": 0.5121832794551704, + "grad_norm": 0.8474782109260559, + "learning_rate": 8.468350090371825e-05, + "loss": 0.9121, + "step": 80170 + }, + { + "epoch": 0.5122471666049091, + "grad_norm": 0.49391424655914307, + "learning_rate": 8.467988652603887e-05, + "loss": 1.0967, + "step": 80180 + }, + { + "epoch": 0.5123110537546478, + "grad_norm": 0.6341314911842346, + "learning_rate": 8.467627179910304e-05, + "loss": 1.1882, + "step": 80190 + }, + { + "epoch": 0.5123749409043865, + "grad_norm": 
0.5973122119903564, + "learning_rate": 8.467265672294715e-05, + "loss": 1.372, + "step": 80200 + }, + { + "epoch": 0.5124388280541252, + "grad_norm": 1.0197665691375732, + "learning_rate": 8.46690412976076e-05, + "loss": 0.6034, + "step": 80210 + }, + { + "epoch": 0.5125027152038639, + "grad_norm": 1.1325267553329468, + "learning_rate": 8.466542552312083e-05, + "loss": 0.9992, + "step": 80220 + }, + { + "epoch": 0.5125666023536026, + "grad_norm": 1.2969529628753662, + "learning_rate": 8.466180939952322e-05, + "loss": 0.9412, + "step": 80230 + }, + { + "epoch": 0.5126304895033413, + "grad_norm": 0.804654598236084, + "learning_rate": 8.465819292685121e-05, + "loss": 0.9241, + "step": 80240 + }, + { + "epoch": 0.51269437665308, + "grad_norm": 0.5683889985084534, + "learning_rate": 8.465457610514122e-05, + "loss": 0.9131, + "step": 80250 + }, + { + "epoch": 0.5127582638028187, + "grad_norm": 1.4431538581848145, + "learning_rate": 8.465095893442965e-05, + "loss": 0.8802, + "step": 80260 + }, + { + "epoch": 0.5128221509525575, + "grad_norm": 0.7495303750038147, + "learning_rate": 8.464734141475296e-05, + "loss": 0.7763, + "step": 80270 + }, + { + "epoch": 0.512886038102296, + "grad_norm": 1.0469660758972168, + "learning_rate": 8.464372354614755e-05, + "loss": 0.7827, + "step": 80280 + }, + { + "epoch": 0.5129499252520348, + "grad_norm": 0.8818047046661377, + "learning_rate": 8.46401053286499e-05, + "loss": 0.7446, + "step": 80290 + }, + { + "epoch": 0.5130138124017735, + "grad_norm": 0.610306441783905, + "learning_rate": 8.463648676229641e-05, + "loss": 0.9616, + "step": 80300 + }, + { + "epoch": 0.5130776995515122, + "grad_norm": 1.0561434030532837, + "learning_rate": 8.463286784712352e-05, + "loss": 0.9341, + "step": 80310 + }, + { + "epoch": 0.5131415867012509, + "grad_norm": 1.1245967149734497, + "learning_rate": 8.46292485831677e-05, + "loss": 1.0593, + "step": 80320 + }, + { + "epoch": 0.5132054738509896, + "grad_norm": 0.8336319327354431, + "learning_rate": 
8.462562897046539e-05, + "loss": 0.9832, + "step": 80330 + }, + { + "epoch": 0.5132693610007283, + "grad_norm": 1.2860108613967896, + "learning_rate": 8.462200900905304e-05, + "loss": 1.2113, + "step": 80340 + }, + { + "epoch": 0.513333248150467, + "grad_norm": 0.6594120860099792, + "learning_rate": 8.46183886989671e-05, + "loss": 1.0788, + "step": 80350 + }, + { + "epoch": 0.5133971353002057, + "grad_norm": 1.1738802194595337, + "learning_rate": 8.461476804024405e-05, + "loss": 1.1394, + "step": 80360 + }, + { + "epoch": 0.5134610224499444, + "grad_norm": 0.8349171280860901, + "learning_rate": 8.461114703292032e-05, + "loss": 0.9976, + "step": 80370 + }, + { + "epoch": 0.5135249095996831, + "grad_norm": 0.9331271648406982, + "learning_rate": 8.460752567703242e-05, + "loss": 0.8871, + "step": 80380 + }, + { + "epoch": 0.5135887967494218, + "grad_norm": 1.0842266082763672, + "learning_rate": 8.460390397261679e-05, + "loss": 0.8379, + "step": 80390 + }, + { + "epoch": 0.5136526838991605, + "grad_norm": 0.8020588159561157, + "learning_rate": 8.46002819197099e-05, + "loss": 0.7943, + "step": 80400 + }, + { + "epoch": 0.5137165710488992, + "grad_norm": 1.0201034545898438, + "learning_rate": 8.459665951834825e-05, + "loss": 0.8956, + "step": 80410 + }, + { + "epoch": 0.5137804581986379, + "grad_norm": 0.710241436958313, + "learning_rate": 8.459303676856829e-05, + "loss": 1.1422, + "step": 80420 + }, + { + "epoch": 0.5138443453483766, + "grad_norm": 1.129925012588501, + "learning_rate": 8.458941367040654e-05, + "loss": 1.0028, + "step": 80430 + }, + { + "epoch": 0.5139082324981153, + "grad_norm": 0.9500714540481567, + "learning_rate": 8.458579022389946e-05, + "loss": 0.9935, + "step": 80440 + }, + { + "epoch": 0.513972119647854, + "grad_norm": 1.4013770818710327, + "learning_rate": 8.458216642908357e-05, + "loss": 1.1331, + "step": 80450 + }, + { + "epoch": 0.5140360067975928, + "grad_norm": 1.6361690759658813, + "learning_rate": 8.457854228599533e-05, + "loss": 0.8196, + 
"step": 80460 + }, + { + "epoch": 0.5140998939473315, + "grad_norm": 1.773687481880188, + "learning_rate": 8.457491779467124e-05, + "loss": 0.6577, + "step": 80470 + }, + { + "epoch": 0.5141637810970702, + "grad_norm": 1.1989527940750122, + "learning_rate": 8.457129295514785e-05, + "loss": 0.9754, + "step": 80480 + }, + { + "epoch": 0.5142276682468089, + "grad_norm": 1.0061672925949097, + "learning_rate": 8.456766776746161e-05, + "loss": 0.7289, + "step": 80490 + }, + { + "epoch": 0.5142915553965476, + "grad_norm": 0.5245055556297302, + "learning_rate": 8.456404223164906e-05, + "loss": 0.8355, + "step": 80500 + }, + { + "epoch": 0.5143554425462863, + "grad_norm": 0.9344064593315125, + "learning_rate": 8.45604163477467e-05, + "loss": 0.9097, + "step": 80510 + }, + { + "epoch": 0.514419329696025, + "grad_norm": 0.8581297993659973, + "learning_rate": 8.455679011579104e-05, + "loss": 0.6422, + "step": 80520 + }, + { + "epoch": 0.5144832168457636, + "grad_norm": 1.0841580629348755, + "learning_rate": 8.455316353581861e-05, + "loss": 1.1547, + "step": 80530 + }, + { + "epoch": 0.5145471039955023, + "grad_norm": 0.8380923867225647, + "learning_rate": 8.454953660786594e-05, + "loss": 1.1443, + "step": 80540 + }, + { + "epoch": 0.514610991145241, + "grad_norm": 1.0253181457519531, + "learning_rate": 8.454590933196953e-05, + "loss": 1.0363, + "step": 80550 + }, + { + "epoch": 0.5146748782949797, + "grad_norm": 0.8039796948432922, + "learning_rate": 8.454228170816594e-05, + "loss": 0.9947, + "step": 80560 + }, + { + "epoch": 0.5147387654447184, + "grad_norm": 0.6157310605049133, + "learning_rate": 8.453865373649168e-05, + "loss": 0.8205, + "step": 80570 + }, + { + "epoch": 0.5148026525944571, + "grad_norm": 1.2950266599655151, + "learning_rate": 8.45350254169833e-05, + "loss": 0.8778, + "step": 80580 + }, + { + "epoch": 0.5148665397441958, + "grad_norm": 0.5074208974838257, + "learning_rate": 8.453139674967735e-05, + "loss": 0.9512, + "step": 80590 + }, + { + "epoch": 
0.5149304268939345, + "grad_norm": 0.974296510219574, + "learning_rate": 8.452776773461035e-05, + "loss": 0.7415, + "step": 80600 + }, + { + "epoch": 0.5149943140436732, + "grad_norm": 0.8522329926490784, + "learning_rate": 8.452413837181886e-05, + "loss": 0.7619, + "step": 80610 + }, + { + "epoch": 0.515058201193412, + "grad_norm": 0.7677290439605713, + "learning_rate": 8.452050866133943e-05, + "loss": 0.7501, + "step": 80620 + }, + { + "epoch": 0.5151220883431507, + "grad_norm": 0.7231885194778442, + "learning_rate": 8.451687860320862e-05, + "loss": 0.8417, + "step": 80630 + }, + { + "epoch": 0.5151859754928894, + "grad_norm": 1.0473037958145142, + "learning_rate": 8.451324819746297e-05, + "loss": 0.7961, + "step": 80640 + }, + { + "epoch": 0.5152498626426281, + "grad_norm": 1.339667558670044, + "learning_rate": 8.450961744413906e-05, + "loss": 0.6476, + "step": 80650 + }, + { + "epoch": 0.5153137497923668, + "grad_norm": 1.0341308116912842, + "learning_rate": 8.450598634327342e-05, + "loss": 0.7599, + "step": 80660 + }, + { + "epoch": 0.5153776369421055, + "grad_norm": 0.5190713405609131, + "learning_rate": 8.450235489490268e-05, + "loss": 1.0512, + "step": 80670 + }, + { + "epoch": 0.5154415240918442, + "grad_norm": 0.6918653845787048, + "learning_rate": 8.449872309906338e-05, + "loss": 0.8157, + "step": 80680 + }, + { + "epoch": 0.5155054112415829, + "grad_norm": 0.6024577617645264, + "learning_rate": 8.449509095579206e-05, + "loss": 0.9064, + "step": 80690 + }, + { + "epoch": 0.5155692983913216, + "grad_norm": 0.7624403238296509, + "learning_rate": 8.449145846512536e-05, + "loss": 0.7265, + "step": 80700 + }, + { + "epoch": 0.5156331855410603, + "grad_norm": 1.1898252964019775, + "learning_rate": 8.448782562709983e-05, + "loss": 0.9, + "step": 80710 + }, + { + "epoch": 0.515697072690799, + "grad_norm": 1.169190526008606, + "learning_rate": 8.448419244175205e-05, + "loss": 0.9871, + "step": 80720 + }, + { + "epoch": 0.5157609598405377, + "grad_norm": 
0.9798734188079834, + "learning_rate": 8.448055890911863e-05, + "loss": 0.9437, + "step": 80730 + }, + { + "epoch": 0.5158248469902764, + "grad_norm": 0.6357259154319763, + "learning_rate": 8.447692502923615e-05, + "loss": 0.8561, + "step": 80740 + }, + { + "epoch": 0.5158887341400151, + "grad_norm": 0.7946950793266296, + "learning_rate": 8.447329080214119e-05, + "loss": 0.9226, + "step": 80750 + }, + { + "epoch": 0.5159526212897538, + "grad_norm": 1.21712064743042, + "learning_rate": 8.446965622787038e-05, + "loss": 0.7975, + "step": 80760 + }, + { + "epoch": 0.5160165084394924, + "grad_norm": 0.6995162963867188, + "learning_rate": 8.446602130646031e-05, + "loss": 0.7762, + "step": 80770 + }, + { + "epoch": 0.5160803955892311, + "grad_norm": 0.9936223030090332, + "learning_rate": 8.44623860379476e-05, + "loss": 0.8434, + "step": 80780 + }, + { + "epoch": 0.5161442827389698, + "grad_norm": 0.8816124796867371, + "learning_rate": 8.445875042236884e-05, + "loss": 0.9124, + "step": 80790 + }, + { + "epoch": 0.5162081698887085, + "grad_norm": 3.8590049743652344, + "learning_rate": 8.445511445976064e-05, + "loss": 0.9158, + "step": 80800 + }, + { + "epoch": 0.5162720570384473, + "grad_norm": 0.8542178273200989, + "learning_rate": 8.445147815015964e-05, + "loss": 0.7654, + "step": 80810 + }, + { + "epoch": 0.516335944188186, + "grad_norm": 1.375125527381897, + "learning_rate": 8.444784149360245e-05, + "loss": 0.8894, + "step": 80820 + }, + { + "epoch": 0.5163998313379247, + "grad_norm": 0.7835062742233276, + "learning_rate": 8.444420449012569e-05, + "loss": 0.6714, + "step": 80830 + }, + { + "epoch": 0.5164637184876634, + "grad_norm": 0.8339881896972656, + "learning_rate": 8.4440567139766e-05, + "loss": 0.9002, + "step": 80840 + }, + { + "epoch": 0.5165276056374021, + "grad_norm": 0.8910601139068604, + "learning_rate": 8.443692944256001e-05, + "loss": 0.9653, + "step": 80850 + }, + { + "epoch": 0.5165914927871408, + "grad_norm": 0.9393212795257568, + "learning_rate": 
8.443329139854433e-05, + "loss": 0.9248, + "step": 80860 + }, + { + "epoch": 0.5166553799368795, + "grad_norm": 0.8954446315765381, + "learning_rate": 8.442965300775563e-05, + "loss": 0.8944, + "step": 80870 + }, + { + "epoch": 0.5167192670866182, + "grad_norm": 1.6217565536499023, + "learning_rate": 8.442601427023054e-05, + "loss": 0.9823, + "step": 80880 + }, + { + "epoch": 0.5167831542363569, + "grad_norm": 0.6830261945724487, + "learning_rate": 8.442237518600569e-05, + "loss": 0.8189, + "step": 80890 + }, + { + "epoch": 0.5168470413860956, + "grad_norm": 0.788052499294281, + "learning_rate": 8.441873575511775e-05, + "loss": 0.897, + "step": 80900 + }, + { + "epoch": 0.5169109285358343, + "grad_norm": 0.8000684380531311, + "learning_rate": 8.441509597760336e-05, + "loss": 0.902, + "step": 80910 + }, + { + "epoch": 0.516974815685573, + "grad_norm": 2.1170742511749268, + "learning_rate": 8.441145585349918e-05, + "loss": 0.7763, + "step": 80920 + }, + { + "epoch": 0.5170387028353117, + "grad_norm": 0.7653173208236694, + "learning_rate": 8.440781538284189e-05, + "loss": 0.8674, + "step": 80930 + }, + { + "epoch": 0.5171025899850504, + "grad_norm": 0.8054555654525757, + "learning_rate": 8.44041745656681e-05, + "loss": 1.0094, + "step": 80940 + }, + { + "epoch": 0.5171664771347891, + "grad_norm": 0.9257411956787109, + "learning_rate": 8.440053340201454e-05, + "loss": 0.698, + "step": 80950 + }, + { + "epoch": 0.5172303642845278, + "grad_norm": 0.7227391600608826, + "learning_rate": 8.439689189191783e-05, + "loss": 1.0385, + "step": 80960 + }, + { + "epoch": 0.5172942514342665, + "grad_norm": 0.873188853263855, + "learning_rate": 8.439325003541466e-05, + "loss": 0.8549, + "step": 80970 + }, + { + "epoch": 0.5173581385840053, + "grad_norm": 2.3998446464538574, + "learning_rate": 8.438960783254171e-05, + "loss": 1.0805, + "step": 80980 + }, + { + "epoch": 0.517422025733744, + "grad_norm": 0.8006882071495056, + "learning_rate": 8.438596528333567e-05, + "loss": 1.0806, + 
"step": 80990 + }, + { + "epoch": 0.5174859128834827, + "grad_norm": 0.723087728023529, + "learning_rate": 8.438232238783319e-05, + "loss": 0.9784, + "step": 81000 + }, + { + "epoch": 0.5175498000332213, + "grad_norm": 0.805282473564148, + "learning_rate": 8.437867914607099e-05, + "loss": 0.9561, + "step": 81010 + }, + { + "epoch": 0.51761368718296, + "grad_norm": 0.7623560428619385, + "learning_rate": 8.437503555808575e-05, + "loss": 0.9467, + "step": 81020 + }, + { + "epoch": 0.5176775743326987, + "grad_norm": 0.8528174161911011, + "learning_rate": 8.437139162391416e-05, + "loss": 0.8532, + "step": 81030 + }, + { + "epoch": 0.5177414614824374, + "grad_norm": 0.9997398257255554, + "learning_rate": 8.436774734359292e-05, + "loss": 0.9379, + "step": 81040 + }, + { + "epoch": 0.5178053486321761, + "grad_norm": 0.682331383228302, + "learning_rate": 8.436410271715873e-05, + "loss": 0.9537, + "step": 81050 + }, + { + "epoch": 0.5178692357819148, + "grad_norm": 0.829108476638794, + "learning_rate": 8.436045774464831e-05, + "loss": 0.6795, + "step": 81060 + }, + { + "epoch": 0.5179331229316535, + "grad_norm": 0.5814771056175232, + "learning_rate": 8.435681242609834e-05, + "loss": 1.0538, + "step": 81070 + }, + { + "epoch": 0.5179970100813922, + "grad_norm": 0.9756491184234619, + "learning_rate": 8.435316676154557e-05, + "loss": 0.7018, + "step": 81080 + }, + { + "epoch": 0.5180608972311309, + "grad_norm": 0.49684908986091614, + "learning_rate": 8.434952075102665e-05, + "loss": 0.7389, + "step": 81090 + }, + { + "epoch": 0.5181247843808696, + "grad_norm": 0.7988196015357971, + "learning_rate": 8.434587439457837e-05, + "loss": 0.9728, + "step": 81100 + }, + { + "epoch": 0.5181886715306083, + "grad_norm": 0.7685717344284058, + "learning_rate": 8.43422276922374e-05, + "loss": 1.1467, + "step": 81110 + }, + { + "epoch": 0.518252558680347, + "grad_norm": 1.4645850658416748, + "learning_rate": 8.433858064404052e-05, + "loss": 0.9914, + "step": 81120 + }, + { + "epoch": 
0.5183164458300857, + "grad_norm": 1.1191790103912354, + "learning_rate": 8.433493325002439e-05, + "loss": 1.0248, + "step": 81130 + }, + { + "epoch": 0.5183803329798244, + "grad_norm": 1.087517499923706, + "learning_rate": 8.43312855102258e-05, + "loss": 0.8725, + "step": 81140 + }, + { + "epoch": 0.5184442201295631, + "grad_norm": 0.9224085807800293, + "learning_rate": 8.432763742468146e-05, + "loss": 1.0876, + "step": 81150 + }, + { + "epoch": 0.5185081072793019, + "grad_norm": 1.3059546947479248, + "learning_rate": 8.432398899342811e-05, + "loss": 1.003, + "step": 81160 + }, + { + "epoch": 0.5185719944290406, + "grad_norm": 1.533253788948059, + "learning_rate": 8.43203402165025e-05, + "loss": 0.8566, + "step": 81170 + }, + { + "epoch": 0.5186358815787793, + "grad_norm": 0.714002251625061, + "learning_rate": 8.431669109394138e-05, + "loss": 0.7685, + "step": 81180 + }, + { + "epoch": 0.518699768728518, + "grad_norm": 0.8544260859489441, + "learning_rate": 8.431304162578148e-05, + "loss": 0.8297, + "step": 81190 + }, + { + "epoch": 0.5187636558782567, + "grad_norm": 0.5248509049415588, + "learning_rate": 8.430939181205957e-05, + "loss": 1.0113, + "step": 81200 + }, + { + "epoch": 0.5188275430279954, + "grad_norm": 0.8331204056739807, + "learning_rate": 8.430574165281239e-05, + "loss": 0.8771, + "step": 81210 + }, + { + "epoch": 0.5188914301777341, + "grad_norm": 0.9952676296234131, + "learning_rate": 8.430209114807675e-05, + "loss": 0.9538, + "step": 81220 + }, + { + "epoch": 0.5189553173274728, + "grad_norm": 5.568673133850098, + "learning_rate": 8.429844029788933e-05, + "loss": 1.1575, + "step": 81230 + }, + { + "epoch": 0.5190192044772115, + "grad_norm": 0.9033558964729309, + "learning_rate": 8.429478910228697e-05, + "loss": 1.1018, + "step": 81240 + }, + { + "epoch": 0.5190830916269502, + "grad_norm": 0.795211911201477, + "learning_rate": 8.42911375613064e-05, + "loss": 0.8717, + "step": 81250 + }, + { + "epoch": 0.5191469787766888, + "grad_norm": 
0.7301700115203857, + "learning_rate": 8.428748567498443e-05, + "loss": 0.8484, + "step": 81260 + }, + { + "epoch": 0.5192108659264275, + "grad_norm": 0.9231024384498596, + "learning_rate": 8.428383344335779e-05, + "loss": 0.8684, + "step": 81270 + }, + { + "epoch": 0.5192747530761662, + "grad_norm": 0.681303083896637, + "learning_rate": 8.428018086646333e-05, + "loss": 0.8345, + "step": 81280 + }, + { + "epoch": 0.5193386402259049, + "grad_norm": 1.12642502784729, + "learning_rate": 8.427652794433776e-05, + "loss": 0.8428, + "step": 81290 + }, + { + "epoch": 0.5194025273756436, + "grad_norm": 0.9355636239051819, + "learning_rate": 8.42728746770179e-05, + "loss": 0.7608, + "step": 81300 + }, + { + "epoch": 0.5194664145253823, + "grad_norm": 1.2537355422973633, + "learning_rate": 8.426922106454054e-05, + "loss": 0.8054, + "step": 81310 + }, + { + "epoch": 0.519530301675121, + "grad_norm": 0.8078314065933228, + "learning_rate": 8.42655671069425e-05, + "loss": 0.9239, + "step": 81320 + }, + { + "epoch": 0.5195941888248597, + "grad_norm": 1.214921236038208, + "learning_rate": 8.426191280426052e-05, + "loss": 0.8623, + "step": 81330 + }, + { + "epoch": 0.5196580759745985, + "grad_norm": 1.2319025993347168, + "learning_rate": 8.425825815653145e-05, + "loss": 0.8355, + "step": 81340 + }, + { + "epoch": 0.5197219631243372, + "grad_norm": 0.8140376806259155, + "learning_rate": 8.42546031637921e-05, + "loss": 0.9517, + "step": 81350 + }, + { + "epoch": 0.5197858502740759, + "grad_norm": 1.0668420791625977, + "learning_rate": 8.425094782607925e-05, + "loss": 0.8228, + "step": 81360 + }, + { + "epoch": 0.5198497374238146, + "grad_norm": 0.855273962020874, + "learning_rate": 8.424729214342972e-05, + "loss": 0.8945, + "step": 81370 + }, + { + "epoch": 0.5199136245735533, + "grad_norm": 1.8131141662597656, + "learning_rate": 8.424363611588033e-05, + "loss": 0.9209, + "step": 81380 + }, + { + "epoch": 0.519977511723292, + "grad_norm": 0.7118239402770996, + "learning_rate": 
8.42399797434679e-05, + "loss": 1.0615, + "step": 81390 + }, + { + "epoch": 0.5200413988730307, + "grad_norm": 0.7992277145385742, + "learning_rate": 8.423632302622926e-05, + "loss": 0.8819, + "step": 81400 + }, + { + "epoch": 0.5201052860227694, + "grad_norm": 0.9642464518547058, + "learning_rate": 8.423266596420123e-05, + "loss": 0.8717, + "step": 81410 + }, + { + "epoch": 0.5201691731725081, + "grad_norm": 0.8006801605224609, + "learning_rate": 8.422900855742062e-05, + "loss": 0.8487, + "step": 81420 + }, + { + "epoch": 0.5202330603222468, + "grad_norm": 0.7330396771430969, + "learning_rate": 8.422535080592431e-05, + "loss": 0.9406, + "step": 81430 + }, + { + "epoch": 0.5202969474719855, + "grad_norm": 0.8267525434494019, + "learning_rate": 8.422169270974909e-05, + "loss": 0.8308, + "step": 81440 + }, + { + "epoch": 0.5203608346217242, + "grad_norm": 1.518169641494751, + "learning_rate": 8.421803426893182e-05, + "loss": 0.9029, + "step": 81450 + }, + { + "epoch": 0.5204247217714629, + "grad_norm": 1.128998041152954, + "learning_rate": 8.421437548350935e-05, + "loss": 0.8468, + "step": 81460 + }, + { + "epoch": 0.5204886089212016, + "grad_norm": 1.051698088645935, + "learning_rate": 8.42107163535185e-05, + "loss": 1.12, + "step": 81470 + }, + { + "epoch": 0.5205524960709403, + "grad_norm": 0.7762027382850647, + "learning_rate": 8.420705687899616e-05, + "loss": 0.6149, + "step": 81480 + }, + { + "epoch": 0.520616383220679, + "grad_norm": 1.0097482204437256, + "learning_rate": 8.420339705997915e-05, + "loss": 0.7171, + "step": 81490 + }, + { + "epoch": 0.5206802703704176, + "grad_norm": 1.062819242477417, + "learning_rate": 8.419973689650436e-05, + "loss": 0.9634, + "step": 81500 + }, + { + "epoch": 0.5207441575201563, + "grad_norm": 0.9354637861251831, + "learning_rate": 8.41960763886086e-05, + "loss": 0.8755, + "step": 81510 + }, + { + "epoch": 0.520808044669895, + "grad_norm": 0.5439125299453735, + "learning_rate": 8.41924155363288e-05, + "loss": 0.9437, + 
"step": 81520 + }, + { + "epoch": 0.5208719318196338, + "grad_norm": 0.7927828431129456, + "learning_rate": 8.418875433970177e-05, + "loss": 0.9138, + "step": 81530 + }, + { + "epoch": 0.5209358189693725, + "grad_norm": 1.90019953250885, + "learning_rate": 8.418509279876444e-05, + "loss": 1.1347, + "step": 81540 + }, + { + "epoch": 0.5209997061191112, + "grad_norm": 1.1833665370941162, + "learning_rate": 8.418179711756595e-05, + "loss": 1.0368, + "step": 81550 + }, + { + "epoch": 0.5210635932688499, + "grad_norm": 0.6297504305839539, + "learning_rate": 8.417813492254057e-05, + "loss": 0.7755, + "step": 81560 + }, + { + "epoch": 0.5211274804185886, + "grad_norm": 1.1400600671768188, + "learning_rate": 8.417447238331177e-05, + "loss": 1.1425, + "step": 81570 + }, + { + "epoch": 0.5211913675683273, + "grad_norm": 0.9770885705947876, + "learning_rate": 8.41708094999165e-05, + "loss": 0.8823, + "step": 81580 + }, + { + "epoch": 0.521255254718066, + "grad_norm": 1.117124080657959, + "learning_rate": 8.41671462723916e-05, + "loss": 1.0206, + "step": 81590 + }, + { + "epoch": 0.5213191418678047, + "grad_norm": 0.7140239477157593, + "learning_rate": 8.416348270077399e-05, + "loss": 0.9016, + "step": 81600 + }, + { + "epoch": 0.5213830290175434, + "grad_norm": 0.8254780769348145, + "learning_rate": 8.415981878510054e-05, + "loss": 0.9917, + "step": 81610 + }, + { + "epoch": 0.5214469161672821, + "grad_norm": 0.7922462821006775, + "learning_rate": 8.415615452540817e-05, + "loss": 0.7269, + "step": 81620 + }, + { + "epoch": 0.5215108033170208, + "grad_norm": 0.5598194003105164, + "learning_rate": 8.415248992173377e-05, + "loss": 0.7201, + "step": 81630 + }, + { + "epoch": 0.5215746904667595, + "grad_norm": 1.1841058731079102, + "learning_rate": 8.414882497411424e-05, + "loss": 0.9925, + "step": 81640 + }, + { + "epoch": 0.5216385776164982, + "grad_norm": 1.0316505432128906, + "learning_rate": 8.414515968258653e-05, + "loss": 0.7948, + "step": 81650 + }, + { + "epoch": 
0.5217024647662369, + "grad_norm": 1.1518917083740234, + "learning_rate": 8.41414940471875e-05, + "loss": 1.0671, + "step": 81660 + }, + { + "epoch": 0.5217663519159756, + "grad_norm": 1.1492241621017456, + "learning_rate": 8.413782806795409e-05, + "loss": 0.9031, + "step": 81670 + }, + { + "epoch": 0.5218302390657144, + "grad_norm": 1.136619210243225, + "learning_rate": 8.413416174492323e-05, + "loss": 0.8399, + "step": 81680 + }, + { + "epoch": 0.5218941262154531, + "grad_norm": 0.7834781408309937, + "learning_rate": 8.413049507813182e-05, + "loss": 1.0536, + "step": 81690 + }, + { + "epoch": 0.5219580133651918, + "grad_norm": 0.7433455586433411, + "learning_rate": 8.412682806761681e-05, + "loss": 0.9202, + "step": 81700 + }, + { + "epoch": 0.5220219005149305, + "grad_norm": 0.7488266229629517, + "learning_rate": 8.41231607134151e-05, + "loss": 1.1904, + "step": 81710 + }, + { + "epoch": 0.5220857876646692, + "grad_norm": 1.112772822380066, + "learning_rate": 8.411949301556365e-05, + "loss": 0.8893, + "step": 81720 + }, + { + "epoch": 0.5221496748144079, + "grad_norm": 1.1430962085723877, + "learning_rate": 8.411582497409937e-05, + "loss": 0.7654, + "step": 81730 + }, + { + "epoch": 0.5222135619641465, + "grad_norm": 0.9723464250564575, + "learning_rate": 8.411215658905925e-05, + "loss": 0.8554, + "step": 81740 + }, + { + "epoch": 0.5222774491138852, + "grad_norm": 0.7580609321594238, + "learning_rate": 8.410848786048018e-05, + "loss": 0.8596, + "step": 81750 + }, + { + "epoch": 0.5223413362636239, + "grad_norm": 1.1168793439865112, + "learning_rate": 8.410481878839914e-05, + "loss": 0.798, + "step": 81760 + }, + { + "epoch": 0.5224052234133626, + "grad_norm": 0.7281593084335327, + "learning_rate": 8.410114937285308e-05, + "loss": 0.9573, + "step": 81770 + }, + { + "epoch": 0.5224691105631013, + "grad_norm": 0.8710981011390686, + "learning_rate": 8.409747961387892e-05, + "loss": 0.8257, + "step": 81780 + }, + { + "epoch": 0.52253299771284, + "grad_norm": 
1.258819818496704, + "learning_rate": 8.409380951151364e-05, + "loss": 0.7601, + "step": 81790 + }, + { + "epoch": 0.5225968848625787, + "grad_norm": 0.5845881700515747, + "learning_rate": 8.409013906579422e-05, + "loss": 0.894, + "step": 81800 + }, + { + "epoch": 0.5226607720123174, + "grad_norm": 0.7453858852386475, + "learning_rate": 8.40864682767576e-05, + "loss": 0.8617, + "step": 81810 + }, + { + "epoch": 0.5227246591620561, + "grad_norm": 0.8870908617973328, + "learning_rate": 8.408279714444076e-05, + "loss": 0.9018, + "step": 81820 + }, + { + "epoch": 0.5227885463117948, + "grad_norm": 0.7710734605789185, + "learning_rate": 8.407912566888068e-05, + "loss": 0.7743, + "step": 81830 + }, + { + "epoch": 0.5228524334615335, + "grad_norm": 1.001556396484375, + "learning_rate": 8.40754538501143e-05, + "loss": 0.9576, + "step": 81840 + }, + { + "epoch": 0.5229163206112722, + "grad_norm": 1.0256420373916626, + "learning_rate": 8.407178168817862e-05, + "loss": 0.9606, + "step": 81850 + }, + { + "epoch": 0.522980207761011, + "grad_norm": 1.1523104906082153, + "learning_rate": 8.406810918311063e-05, + "loss": 0.8513, + "step": 81860 + }, + { + "epoch": 0.5230440949107497, + "grad_norm": 0.8839610815048218, + "learning_rate": 8.40644363349473e-05, + "loss": 1.0848, + "step": 81870 + }, + { + "epoch": 0.5231079820604884, + "grad_norm": 0.863746166229248, + "learning_rate": 8.406076314372564e-05, + "loss": 0.8318, + "step": 81880 + }, + { + "epoch": 0.5231718692102271, + "grad_norm": 1.282516360282898, + "learning_rate": 8.405708960948262e-05, + "loss": 0.9989, + "step": 81890 + }, + { + "epoch": 0.5232357563599658, + "grad_norm": 0.9695531129837036, + "learning_rate": 8.405341573225524e-05, + "loss": 1.042, + "step": 81900 + }, + { + "epoch": 0.5232996435097045, + "grad_norm": 1.1501736640930176, + "learning_rate": 8.40497415120805e-05, + "loss": 0.9179, + "step": 81910 + }, + { + "epoch": 0.5233635306594432, + "grad_norm": 0.8112602829933167, + "learning_rate": 
8.404606694899542e-05, + "loss": 0.7767, + "step": 81920 + }, + { + "epoch": 0.5234274178091819, + "grad_norm": 0.7559998631477356, + "learning_rate": 8.404239204303698e-05, + "loss": 0.8375, + "step": 81930 + }, + { + "epoch": 0.5234913049589206, + "grad_norm": 0.6688728332519531, + "learning_rate": 8.403871679424222e-05, + "loss": 0.8205, + "step": 81940 + }, + { + "epoch": 0.5235551921086593, + "grad_norm": 1.4303487539291382, + "learning_rate": 8.403504120264811e-05, + "loss": 1.0413, + "step": 81950 + }, + { + "epoch": 0.523619079258398, + "grad_norm": 1.0139001607894897, + "learning_rate": 8.403136526829171e-05, + "loss": 0.7122, + "step": 81960 + }, + { + "epoch": 0.5236829664081367, + "grad_norm": 1.1375806331634521, + "learning_rate": 8.402768899121e-05, + "loss": 0.9585, + "step": 81970 + }, + { + "epoch": 0.5237468535578753, + "grad_norm": 0.801270604133606, + "learning_rate": 8.402401237144005e-05, + "loss": 1.0773, + "step": 81980 + }, + { + "epoch": 0.523810740707614, + "grad_norm": 1.0210850238800049, + "learning_rate": 8.402033540901884e-05, + "loss": 0.9391, + "step": 81990 + }, + { + "epoch": 0.5238746278573527, + "grad_norm": 0.44863033294677734, + "learning_rate": 8.401665810398342e-05, + "loss": 0.7008, + "step": 82000 + }, + { + "epoch": 0.5239385150070914, + "grad_norm": 0.6878476738929749, + "learning_rate": 8.401298045637083e-05, + "loss": 1.0242, + "step": 82010 + }, + { + "epoch": 0.5240024021568301, + "grad_norm": 1.221062421798706, + "learning_rate": 8.40093024662181e-05, + "loss": 0.6651, + "step": 82020 + }, + { + "epoch": 0.5240662893065688, + "grad_norm": 0.8801291584968567, + "learning_rate": 8.400562413356228e-05, + "loss": 1.07, + "step": 82030 + }, + { + "epoch": 0.5241301764563076, + "grad_norm": 1.5638190507888794, + "learning_rate": 8.40019454584404e-05, + "loss": 1.0253, + "step": 82040 + }, + { + "epoch": 0.5241940636060463, + "grad_norm": 0.7046545147895813, + "learning_rate": 8.399826644088951e-05, + "loss": 1.0211, + 
"step": 82050 + }, + { + "epoch": 0.524257950755785, + "grad_norm": 0.7261834740638733, + "learning_rate": 8.399458708094668e-05, + "loss": 1.0918, + "step": 82060 + }, + { + "epoch": 0.5243218379055237, + "grad_norm": 0.7312687635421753, + "learning_rate": 8.399090737864893e-05, + "loss": 1.0264, + "step": 82070 + }, + { + "epoch": 0.5243857250552624, + "grad_norm": 0.7334839105606079, + "learning_rate": 8.398722733403335e-05, + "loss": 0.6618, + "step": 82080 + }, + { + "epoch": 0.5244496122050011, + "grad_norm": 1.9923456907272339, + "learning_rate": 8.398354694713697e-05, + "loss": 0.7325, + "step": 82090 + }, + { + "epoch": 0.5245134993547398, + "grad_norm": 1.5566961765289307, + "learning_rate": 8.397986621799688e-05, + "loss": 1.0462, + "step": 82100 + }, + { + "epoch": 0.5245773865044785, + "grad_norm": 0.5462529063224792, + "learning_rate": 8.397618514665015e-05, + "loss": 0.7883, + "step": 82110 + }, + { + "epoch": 0.5246412736542172, + "grad_norm": 0.7034682631492615, + "learning_rate": 8.397250373313383e-05, + "loss": 0.98, + "step": 82120 + }, + { + "epoch": 0.5247051608039559, + "grad_norm": 2.1927855014801025, + "learning_rate": 8.396882197748501e-05, + "loss": 1.0054, + "step": 82130 + }, + { + "epoch": 0.5247690479536946, + "grad_norm": 0.724446713924408, + "learning_rate": 8.396513987974078e-05, + "loss": 0.8474, + "step": 82140 + }, + { + "epoch": 0.5248329351034333, + "grad_norm": 1.2433834075927734, + "learning_rate": 8.396145743993819e-05, + "loss": 1.0282, + "step": 82150 + }, + { + "epoch": 0.524896822253172, + "grad_norm": 0.5404759645462036, + "learning_rate": 8.395777465811434e-05, + "loss": 0.9458, + "step": 82160 + }, + { + "epoch": 0.5249607094029107, + "grad_norm": 0.5961291790008545, + "learning_rate": 8.395409153430633e-05, + "loss": 0.8767, + "step": 82170 + }, + { + "epoch": 0.5250245965526494, + "grad_norm": 1.5657272338867188, + "learning_rate": 8.395040806855125e-05, + "loss": 0.8355, + "step": 82180 + }, + { + "epoch": 
0.5250884837023881, + "grad_norm": 0.761735737323761, + "learning_rate": 8.394672426088618e-05, + "loss": 1.1087, + "step": 82190 + }, + { + "epoch": 0.5251523708521268, + "grad_norm": 0.904757022857666, + "learning_rate": 8.394304011134822e-05, + "loss": 1.1173, + "step": 82200 + }, + { + "epoch": 0.5252162580018656, + "grad_norm": 1.890184998512268, + "learning_rate": 8.39393556199745e-05, + "loss": 0.7589, + "step": 82210 + }, + { + "epoch": 0.5252801451516043, + "grad_norm": 0.8762137293815613, + "learning_rate": 8.39356707868021e-05, + "loss": 0.9525, + "step": 82220 + }, + { + "epoch": 0.5253440323013429, + "grad_norm": 1.153976321220398, + "learning_rate": 8.393198561186814e-05, + "loss": 1.0239, + "step": 82230 + }, + { + "epoch": 0.5254079194510816, + "grad_norm": 1.3719924688339233, + "learning_rate": 8.392830009520972e-05, + "loss": 0.8364, + "step": 82240 + }, + { + "epoch": 0.5254718066008203, + "grad_norm": 0.9596297144889832, + "learning_rate": 8.392461423686397e-05, + "loss": 0.8316, + "step": 82250 + }, + { + "epoch": 0.525535693750559, + "grad_norm": 0.8172164559364319, + "learning_rate": 8.392092803686801e-05, + "loss": 0.8146, + "step": 82260 + }, + { + "epoch": 0.5255995809002977, + "grad_norm": 0.7319055795669556, + "learning_rate": 8.391724149525895e-05, + "loss": 0.8911, + "step": 82270 + }, + { + "epoch": 0.5256634680500364, + "grad_norm": 0.949073314666748, + "learning_rate": 8.391355461207393e-05, + "loss": 0.8981, + "step": 82280 + }, + { + "epoch": 0.5257273551997751, + "grad_norm": 2.1071205139160156, + "learning_rate": 8.390986738735007e-05, + "loss": 0.882, + "step": 82290 + }, + { + "epoch": 0.5257912423495138, + "grad_norm": 0.9172298908233643, + "learning_rate": 8.390617982112452e-05, + "loss": 1.0809, + "step": 82300 + }, + { + "epoch": 0.5258551294992525, + "grad_norm": 1.3219941854476929, + "learning_rate": 8.390249191343442e-05, + "loss": 0.8637, + "step": 82310 + }, + { + "epoch": 0.5259190166489912, + "grad_norm": 
0.8916542530059814, + "learning_rate": 8.389880366431687e-05, + "loss": 1.1391, + "step": 82320 + }, + { + "epoch": 0.5259829037987299, + "grad_norm": 0.6826764941215515, + "learning_rate": 8.389511507380905e-05, + "loss": 0.814, + "step": 82330 + }, + { + "epoch": 0.5260467909484686, + "grad_norm": 1.7593846321105957, + "learning_rate": 8.389142614194809e-05, + "loss": 1.1427, + "step": 82340 + }, + { + "epoch": 0.5261106780982073, + "grad_norm": 0.4655475318431854, + "learning_rate": 8.388773686877117e-05, + "loss": 0.6992, + "step": 82350 + }, + { + "epoch": 0.526174565247946, + "grad_norm": 1.185663104057312, + "learning_rate": 8.38840472543154e-05, + "loss": 0.8237, + "step": 82360 + }, + { + "epoch": 0.5262384523976847, + "grad_norm": 0.8329865336418152, + "learning_rate": 8.388035729861797e-05, + "loss": 0.9379, + "step": 82370 + }, + { + "epoch": 0.5263023395474234, + "grad_norm": 0.5856214761734009, + "learning_rate": 8.387666700171603e-05, + "loss": 0.7349, + "step": 82380 + }, + { + "epoch": 0.5263662266971622, + "grad_norm": 0.8730786442756653, + "learning_rate": 8.387297636364675e-05, + "loss": 0.8201, + "step": 82390 + }, + { + "epoch": 0.5264301138469009, + "grad_norm": 0.7338623404502869, + "learning_rate": 8.38692853844473e-05, + "loss": 0.9384, + "step": 82400 + }, + { + "epoch": 0.5264940009966396, + "grad_norm": 0.6729745268821716, + "learning_rate": 8.386559406415481e-05, + "loss": 0.8973, + "step": 82410 + }, + { + "epoch": 0.5265578881463783, + "grad_norm": 1.3873519897460938, + "learning_rate": 8.386190240280652e-05, + "loss": 0.836, + "step": 82420 + }, + { + "epoch": 0.526621775296117, + "grad_norm": 0.808180570602417, + "learning_rate": 8.385821040043958e-05, + "loss": 0.8696, + "step": 82430 + }, + { + "epoch": 0.5266856624458557, + "grad_norm": 1.6018669605255127, + "learning_rate": 8.385451805709116e-05, + "loss": 0.7054, + "step": 82440 + }, + { + "epoch": 0.5267495495955944, + "grad_norm": 0.9540040493011475, + "learning_rate": 
8.385082537279846e-05, + "loss": 0.9149, + "step": 82450 + }, + { + "epoch": 0.5268134367453331, + "grad_norm": 1.0204883813858032, + "learning_rate": 8.384713234759866e-05, + "loss": 0.8372, + "step": 82460 + }, + { + "epoch": 0.5268773238950717, + "grad_norm": 0.6929607391357422, + "learning_rate": 8.384343898152896e-05, + "loss": 0.7188, + "step": 82470 + }, + { + "epoch": 0.5269412110448104, + "grad_norm": 0.7308977246284485, + "learning_rate": 8.383974527462655e-05, + "loss": 0.8171, + "step": 82480 + }, + { + "epoch": 0.5270050981945491, + "grad_norm": 0.9977597594261169, + "learning_rate": 8.383605122692861e-05, + "loss": 0.7189, + "step": 82490 + }, + { + "epoch": 0.5270689853442878, + "grad_norm": 1.0899478197097778, + "learning_rate": 8.383235683847238e-05, + "loss": 1.0767, + "step": 82500 + }, + { + "epoch": 0.5271328724940265, + "grad_norm": 1.763649582862854, + "learning_rate": 8.382866210929506e-05, + "loss": 0.9618, + "step": 82510 + }, + { + "epoch": 0.5271967596437652, + "grad_norm": 0.8483105897903442, + "learning_rate": 8.382496703943382e-05, + "loss": 0.9302, + "step": 82520 + }, + { + "epoch": 0.5272606467935039, + "grad_norm": 1.4405713081359863, + "learning_rate": 8.38212716289259e-05, + "loss": 1.0092, + "step": 82530 + }, + { + "epoch": 0.5273245339432426, + "grad_norm": 0.8310216069221497, + "learning_rate": 8.381757587780853e-05, + "loss": 0.9264, + "step": 82540 + }, + { + "epoch": 0.5273884210929813, + "grad_norm": 1.1414271593093872, + "learning_rate": 8.381387978611892e-05, + "loss": 0.8157, + "step": 82550 + }, + { + "epoch": 0.52745230824272, + "grad_norm": 1.1843761205673218, + "learning_rate": 8.381018335389428e-05, + "loss": 1.0638, + "step": 82560 + }, + { + "epoch": 0.5275161953924588, + "grad_norm": 0.6531765460968018, + "learning_rate": 8.380648658117186e-05, + "loss": 0.8616, + "step": 82570 + }, + { + "epoch": 0.5275800825421975, + "grad_norm": 0.6731355786323547, + "learning_rate": 8.380278946798883e-05, + "loss": 0.7789, 
+ "step": 82580 + }, + { + "epoch": 0.5276439696919362, + "grad_norm": 0.7285798788070679, + "learning_rate": 8.37990920143825e-05, + "loss": 0.8369, + "step": 82590 + }, + { + "epoch": 0.5277078568416749, + "grad_norm": 1.3306784629821777, + "learning_rate": 8.379539422039006e-05, + "loss": 0.9274, + "step": 82600 + }, + { + "epoch": 0.5277717439914136, + "grad_norm": 0.7392432689666748, + "learning_rate": 8.379169608604877e-05, + "loss": 0.7246, + "step": 82610 + }, + { + "epoch": 0.5278356311411523, + "grad_norm": 1.119094967842102, + "learning_rate": 8.378799761139587e-05, + "loss": 0.8628, + "step": 82620 + }, + { + "epoch": 0.527899518290891, + "grad_norm": 0.5163264274597168, + "learning_rate": 8.378429879646859e-05, + "loss": 0.6895, + "step": 82630 + }, + { + "epoch": 0.5279634054406297, + "grad_norm": 0.7841934561729431, + "learning_rate": 8.378059964130421e-05, + "loss": 0.9658, + "step": 82640 + }, + { + "epoch": 0.5280272925903684, + "grad_norm": 0.6107231378555298, + "learning_rate": 8.377690014593996e-05, + "loss": 0.8572, + "step": 82650 + }, + { + "epoch": 0.5280911797401071, + "grad_norm": 0.839893102645874, + "learning_rate": 8.377320031041309e-05, + "loss": 1.003, + "step": 82660 + }, + { + "epoch": 0.5281550668898458, + "grad_norm": 0.8523367643356323, + "learning_rate": 8.37695001347609e-05, + "loss": 0.8451, + "step": 82670 + }, + { + "epoch": 0.5282189540395845, + "grad_norm": 0.8957772850990295, + "learning_rate": 8.37657996190206e-05, + "loss": 1.1725, + "step": 82680 + }, + { + "epoch": 0.5282828411893232, + "grad_norm": 1.1203519105911255, + "learning_rate": 8.376209876322952e-05, + "loss": 0.7363, + "step": 82690 + }, + { + "epoch": 0.5283467283390619, + "grad_norm": 0.440065860748291, + "learning_rate": 8.375839756742487e-05, + "loss": 0.9615, + "step": 82700 + }, + { + "epoch": 0.5284106154888005, + "grad_norm": 0.8723582029342651, + "learning_rate": 8.375469603164397e-05, + "loss": 1.0116, + "step": 82710 + }, + { + "epoch": 
0.5284745026385392, + "grad_norm": 0.7620988488197327, + "learning_rate": 8.375099415592406e-05, + "loss": 0.74, + "step": 82720 + }, + { + "epoch": 0.5285383897882779, + "grad_norm": 0.6935875415802002, + "learning_rate": 8.374729194030245e-05, + "loss": 0.6662, + "step": 82730 + }, + { + "epoch": 0.5286022769380166, + "grad_norm": 0.8621264696121216, + "learning_rate": 8.374358938481641e-05, + "loss": 0.8284, + "step": 82740 + }, + { + "epoch": 0.5286661640877554, + "grad_norm": 1.0655511617660522, + "learning_rate": 8.373988648950324e-05, + "loss": 0.8265, + "step": 82750 + }, + { + "epoch": 0.5287300512374941, + "grad_norm": 0.6569475531578064, + "learning_rate": 8.373618325440022e-05, + "loss": 0.8138, + "step": 82760 + }, + { + "epoch": 0.5287939383872328, + "grad_norm": 0.7676059007644653, + "learning_rate": 8.373247967954465e-05, + "loss": 0.798, + "step": 82770 + }, + { + "epoch": 0.5288578255369715, + "grad_norm": 0.5665331482887268, + "learning_rate": 8.372877576497383e-05, + "loss": 1.0496, + "step": 82780 + }, + { + "epoch": 0.5289217126867102, + "grad_norm": 0.7789902687072754, + "learning_rate": 8.372507151072506e-05, + "loss": 0.9221, + "step": 82790 + }, + { + "epoch": 0.5289855998364489, + "grad_norm": 0.6825060248374939, + "learning_rate": 8.372136691683563e-05, + "loss": 0.7601, + "step": 82800 + }, + { + "epoch": 0.5290494869861876, + "grad_norm": 0.8659316897392273, + "learning_rate": 8.371766198334288e-05, + "loss": 0.763, + "step": 82810 + }, + { + "epoch": 0.5291133741359263, + "grad_norm": 0.7778592109680176, + "learning_rate": 8.371395671028409e-05, + "loss": 0.8137, + "step": 82820 + }, + { + "epoch": 0.529177261285665, + "grad_norm": 0.7770895957946777, + "learning_rate": 8.37102510976966e-05, + "loss": 1.0609, + "step": 82830 + }, + { + "epoch": 0.5292411484354037, + "grad_norm": 0.8733034729957581, + "learning_rate": 8.370654514561771e-05, + "loss": 0.598, + "step": 82840 + }, + { + "epoch": 0.5293050355851424, + "grad_norm": 
0.7436967492103577, + "learning_rate": 8.370283885408474e-05, + "loss": 0.8707, + "step": 82850 + }, + { + "epoch": 0.5293689227348811, + "grad_norm": 0.6810287237167358, + "learning_rate": 8.369913222313504e-05, + "loss": 0.831, + "step": 82860 + }, + { + "epoch": 0.5294328098846198, + "grad_norm": 0.6513703465461731, + "learning_rate": 8.369542525280593e-05, + "loss": 1.0104, + "step": 82870 + }, + { + "epoch": 0.5294966970343585, + "grad_norm": 0.7047162055969238, + "learning_rate": 8.369171794313473e-05, + "loss": 0.93, + "step": 82880 + }, + { + "epoch": 0.5295605841840972, + "grad_norm": 1.145445704460144, + "learning_rate": 8.368801029415878e-05, + "loss": 0.777, + "step": 82890 + }, + { + "epoch": 0.5296244713338359, + "grad_norm": 0.8920283317565918, + "learning_rate": 8.368430230591542e-05, + "loss": 0.7874, + "step": 82900 + }, + { + "epoch": 0.5296883584835747, + "grad_norm": 1.2382694482803345, + "learning_rate": 8.3680593978442e-05, + "loss": 0.9681, + "step": 82910 + }, + { + "epoch": 0.5297522456333134, + "grad_norm": 1.1094623804092407, + "learning_rate": 8.367688531177586e-05, + "loss": 1.0037, + "step": 82920 + }, + { + "epoch": 0.5298161327830521, + "grad_norm": 0.6824793815612793, + "learning_rate": 8.367317630595434e-05, + "loss": 0.8448, + "step": 82930 + }, + { + "epoch": 0.5298800199327908, + "grad_norm": 0.7402679324150085, + "learning_rate": 8.366946696101483e-05, + "loss": 0.8678, + "step": 82940 + }, + { + "epoch": 0.5299439070825295, + "grad_norm": 0.8517834544181824, + "learning_rate": 8.366575727699464e-05, + "loss": 0.9731, + "step": 82950 + }, + { + "epoch": 0.5300077942322681, + "grad_norm": 0.645380437374115, + "learning_rate": 8.366204725393114e-05, + "loss": 0.7756, + "step": 82960 + }, + { + "epoch": 0.5300716813820068, + "grad_norm": 1.7040249109268188, + "learning_rate": 8.365833689186172e-05, + "loss": 0.8345, + "step": 82970 + }, + { + "epoch": 0.5301355685317455, + "grad_norm": 1.2551957368850708, + "learning_rate": 
8.365462619082372e-05, + "loss": 1.1466, + "step": 82980 + }, + { + "epoch": 0.5301994556814842, + "grad_norm": 0.596899688243866, + "learning_rate": 8.365091515085452e-05, + "loss": 0.9004, + "step": 82990 + }, + { + "epoch": 0.5302633428312229, + "grad_norm": 0.7382088899612427, + "learning_rate": 8.36472037719915e-05, + "loss": 0.9212, + "step": 83000 + }, + { + "epoch": 0.5303272299809616, + "grad_norm": 0.587727963924408, + "learning_rate": 8.364349205427203e-05, + "loss": 1.1075, + "step": 83010 + }, + { + "epoch": 0.5303911171307003, + "grad_norm": 0.5298671126365662, + "learning_rate": 8.363977999773347e-05, + "loss": 0.8375, + "step": 83020 + }, + { + "epoch": 0.530455004280439, + "grad_norm": 0.7597865462303162, + "learning_rate": 8.363606760241323e-05, + "loss": 1.0665, + "step": 83030 + }, + { + "epoch": 0.5305188914301777, + "grad_norm": 1.4384864568710327, + "learning_rate": 8.363235486834871e-05, + "loss": 0.8882, + "step": 83040 + }, + { + "epoch": 0.5305827785799164, + "grad_norm": 1.0824155807495117, + "learning_rate": 8.362864179557726e-05, + "loss": 0.9262, + "step": 83050 + }, + { + "epoch": 0.5306466657296551, + "grad_norm": 0.9911159873008728, + "learning_rate": 8.36249283841363e-05, + "loss": 1.0327, + "step": 83060 + }, + { + "epoch": 0.5307105528793938, + "grad_norm": 1.1133580207824707, + "learning_rate": 8.362121463406323e-05, + "loss": 0.9494, + "step": 83070 + }, + { + "epoch": 0.5307744400291325, + "grad_norm": 1.960688591003418, + "learning_rate": 8.361750054539544e-05, + "loss": 0.9712, + "step": 83080 + }, + { + "epoch": 0.5308383271788712, + "grad_norm": 0.710477888584137, + "learning_rate": 8.361378611817033e-05, + "loss": 0.913, + "step": 83090 + }, + { + "epoch": 0.53090221432861, + "grad_norm": 0.9821794629096985, + "learning_rate": 8.36100713524253e-05, + "loss": 0.9127, + "step": 83100 + }, + { + "epoch": 0.5309661014783487, + "grad_norm": 0.5997344851493835, + "learning_rate": 8.360635624819778e-05, + "loss": 0.8929, + 
"step": 83110 + }, + { + "epoch": 0.5310299886280874, + "grad_norm": 1.1449615955352783, + "learning_rate": 8.36026408055252e-05, + "loss": 0.5898, + "step": 83120 + }, + { + "epoch": 0.5310938757778261, + "grad_norm": 1.3157292604446411, + "learning_rate": 8.359892502444494e-05, + "loss": 0.8881, + "step": 83130 + }, + { + "epoch": 0.5311577629275648, + "grad_norm": 0.6515958905220032, + "learning_rate": 8.359520890499443e-05, + "loss": 0.8024, + "step": 83140 + }, + { + "epoch": 0.5312216500773035, + "grad_norm": 0.8860677480697632, + "learning_rate": 8.359149244721112e-05, + "loss": 0.8348, + "step": 83150 + }, + { + "epoch": 0.5312855372270422, + "grad_norm": 0.8517594933509827, + "learning_rate": 8.358777565113242e-05, + "loss": 0.8221, + "step": 83160 + }, + { + "epoch": 0.5313494243767809, + "grad_norm": 0.9374033808708191, + "learning_rate": 8.358405851679574e-05, + "loss": 0.7633, + "step": 83170 + }, + { + "epoch": 0.5314133115265196, + "grad_norm": 0.885487973690033, + "learning_rate": 8.358034104423857e-05, + "loss": 0.926, + "step": 83180 + }, + { + "epoch": 0.5314771986762583, + "grad_norm": 0.6758369207382202, + "learning_rate": 8.357662323349828e-05, + "loss": 0.9676, + "step": 83190 + }, + { + "epoch": 0.5315410858259969, + "grad_norm": 1.0564520359039307, + "learning_rate": 8.357290508461238e-05, + "loss": 1.1299, + "step": 83200 + }, + { + "epoch": 0.5316049729757356, + "grad_norm": 0.7590727210044861, + "learning_rate": 8.356918659761826e-05, + "loss": 0.7102, + "step": 83210 + }, + { + "epoch": 0.5316688601254743, + "grad_norm": 1.157811164855957, + "learning_rate": 8.356546777255339e-05, + "loss": 1.3055, + "step": 83220 + }, + { + "epoch": 0.531732747275213, + "grad_norm": 0.7595437169075012, + "learning_rate": 8.356174860945521e-05, + "loss": 0.6526, + "step": 83230 + }, + { + "epoch": 0.5317966344249517, + "grad_norm": 1.0199005603790283, + "learning_rate": 8.355802910836122e-05, + "loss": 0.8271, + "step": 83240 + }, + { + "epoch": 
0.5318605215746904, + "grad_norm": 0.9487647414207458, + "learning_rate": 8.355430926930882e-05, + "loss": 1.0003, + "step": 83250 + }, + { + "epoch": 0.5319244087244291, + "grad_norm": 0.8482071161270142, + "learning_rate": 8.35505890923355e-05, + "loss": 1.0448, + "step": 83260 + }, + { + "epoch": 0.5319882958741678, + "grad_norm": 0.7448784708976746, + "learning_rate": 8.354686857747872e-05, + "loss": 1.0787, + "step": 83270 + }, + { + "epoch": 0.5320521830239066, + "grad_norm": 0.6928815245628357, + "learning_rate": 8.354314772477596e-05, + "loss": 0.7474, + "step": 83280 + }, + { + "epoch": 0.5321160701736453, + "grad_norm": 0.7215262651443481, + "learning_rate": 8.353942653426468e-05, + "loss": 0.8376, + "step": 83290 + }, + { + "epoch": 0.532179957323384, + "grad_norm": 1.3700790405273438, + "learning_rate": 8.353570500598235e-05, + "loss": 0.9953, + "step": 83300 + }, + { + "epoch": 0.5322438444731227, + "grad_norm": 1.3022416830062866, + "learning_rate": 8.353198313996649e-05, + "loss": 0.874, + "step": 83310 + }, + { + "epoch": 0.5323077316228614, + "grad_norm": 0.7596305012702942, + "learning_rate": 8.352826093625453e-05, + "loss": 0.6174, + "step": 83320 + }, + { + "epoch": 0.5323716187726001, + "grad_norm": 0.7313193082809448, + "learning_rate": 8.352453839488397e-05, + "loss": 1.105, + "step": 83330 + }, + { + "epoch": 0.5324355059223388, + "grad_norm": 0.6660478711128235, + "learning_rate": 8.35208155158923e-05, + "loss": 1.0361, + "step": 83340 + }, + { + "epoch": 0.5324993930720775, + "grad_norm": 1.1651302576065063, + "learning_rate": 8.351709229931704e-05, + "loss": 0.8399, + "step": 83350 + }, + { + "epoch": 0.5325632802218162, + "grad_norm": 1.0588157176971436, + "learning_rate": 8.351336874519564e-05, + "loss": 1.0247, + "step": 83360 + }, + { + "epoch": 0.5326271673715549, + "grad_norm": 1.3601503372192383, + "learning_rate": 8.350964485356562e-05, + "loss": 0.9979, + "step": 83370 + }, + { + "epoch": 0.5326910545212936, + "grad_norm": 
1.1261094808578491, + "learning_rate": 8.350592062446451e-05, + "loss": 0.793, + "step": 83380 + }, + { + "epoch": 0.5327549416710323, + "grad_norm": 0.7241072654724121, + "learning_rate": 8.35021960579298e-05, + "loss": 0.965, + "step": 83390 + }, + { + "epoch": 0.532818828820771, + "grad_norm": 0.7780799865722656, + "learning_rate": 8.349847115399896e-05, + "loss": 1.0567, + "step": 83400 + }, + { + "epoch": 0.5328827159705097, + "grad_norm": 0.7408662438392639, + "learning_rate": 8.349474591270957e-05, + "loss": 0.782, + "step": 83410 + }, + { + "epoch": 0.5329466031202484, + "grad_norm": 1.189795732498169, + "learning_rate": 8.349102033409907e-05, + "loss": 0.8716, + "step": 83420 + }, + { + "epoch": 0.5330104902699871, + "grad_norm": 0.7125329375267029, + "learning_rate": 8.348729441820505e-05, + "loss": 0.9386, + "step": 83430 + }, + { + "epoch": 0.5330743774197257, + "grad_norm": 0.9453898668289185, + "learning_rate": 8.3483568165065e-05, + "loss": 0.9899, + "step": 83440 + }, + { + "epoch": 0.5331382645694644, + "grad_norm": 0.7429458498954773, + "learning_rate": 8.347984157471645e-05, + "loss": 0.893, + "step": 83450 + }, + { + "epoch": 0.5332021517192032, + "grad_norm": 0.758669376373291, + "learning_rate": 8.347611464719694e-05, + "loss": 0.9558, + "step": 83460 + }, + { + "epoch": 0.5332660388689419, + "grad_norm": 0.7475212216377258, + "learning_rate": 8.347238738254399e-05, + "loss": 0.7202, + "step": 83470 + }, + { + "epoch": 0.5333299260186806, + "grad_norm": 1.387135624885559, + "learning_rate": 8.346865978079512e-05, + "loss": 0.6861, + "step": 83480 + }, + { + "epoch": 0.5333938131684193, + "grad_norm": 0.9755001068115234, + "learning_rate": 8.346493184198792e-05, + "loss": 0.8146, + "step": 83490 + }, + { + "epoch": 0.533457700318158, + "grad_norm": 0.9414482712745667, + "learning_rate": 8.346120356615989e-05, + "loss": 0.9161, + "step": 83500 + }, + { + "epoch": 0.5335215874678967, + "grad_norm": 0.7464240193367004, + "learning_rate": 
8.34574749533486e-05, + "loss": 0.9186, + "step": 83510 + }, + { + "epoch": 0.5335854746176354, + "grad_norm": 1.2203441858291626, + "learning_rate": 8.34537460035916e-05, + "loss": 0.8786, + "step": 83520 + }, + { + "epoch": 0.5336493617673741, + "grad_norm": 1.0309191942214966, + "learning_rate": 8.345001671692641e-05, + "loss": 0.7864, + "step": 83530 + }, + { + "epoch": 0.5337132489171128, + "grad_norm": 1.0020480155944824, + "learning_rate": 8.344628709339063e-05, + "loss": 0.858, + "step": 83540 + }, + { + "epoch": 0.5337771360668515, + "grad_norm": 0.7191622257232666, + "learning_rate": 8.344255713302181e-05, + "loss": 0.699, + "step": 83550 + }, + { + "epoch": 0.5338410232165902, + "grad_norm": 0.4417421817779541, + "learning_rate": 8.343882683585748e-05, + "loss": 0.8309, + "step": 83560 + }, + { + "epoch": 0.5339049103663289, + "grad_norm": 0.5455567240715027, + "learning_rate": 8.343509620193526e-05, + "loss": 0.7101, + "step": 83570 + }, + { + "epoch": 0.5339687975160676, + "grad_norm": 0.7480769753456116, + "learning_rate": 8.343136523129269e-05, + "loss": 1.0079, + "step": 83580 + }, + { + "epoch": 0.5340326846658063, + "grad_norm": 0.569848895072937, + "learning_rate": 8.342763392396735e-05, + "loss": 0.9228, + "step": 83590 + }, + { + "epoch": 0.534096571815545, + "grad_norm": 0.8300278782844543, + "learning_rate": 8.342390227999683e-05, + "loss": 0.8459, + "step": 83600 + }, + { + "epoch": 0.5341604589652837, + "grad_norm": 0.7378689050674438, + "learning_rate": 8.342017029941868e-05, + "loss": 0.9019, + "step": 83610 + }, + { + "epoch": 0.5342243461150225, + "grad_norm": 1.1345140933990479, + "learning_rate": 8.34164379822705e-05, + "loss": 0.7472, + "step": 83620 + }, + { + "epoch": 0.5342882332647612, + "grad_norm": 0.5428297519683838, + "learning_rate": 8.341270532858989e-05, + "loss": 0.7232, + "step": 83630 + }, + { + "epoch": 0.5343521204144999, + "grad_norm": 0.8249925971031189, + "learning_rate": 8.340897233841443e-05, + "loss": 0.785, + 
"step": 83640 + }, + { + "epoch": 0.5344160075642386, + "grad_norm": 0.9514716863632202, + "learning_rate": 8.340523901178173e-05, + "loss": 0.7077, + "step": 83650 + }, + { + "epoch": 0.5344798947139773, + "grad_norm": 1.2342941761016846, + "learning_rate": 8.340150534872934e-05, + "loss": 0.7654, + "step": 83660 + }, + { + "epoch": 0.534543781863716, + "grad_norm": 0.7578923106193542, + "learning_rate": 8.339777134929492e-05, + "loss": 0.8597, + "step": 83670 + }, + { + "epoch": 0.5346076690134546, + "grad_norm": 0.7773808836936951, + "learning_rate": 8.339403701351604e-05, + "loss": 0.9918, + "step": 83680 + }, + { + "epoch": 0.5346715561631933, + "grad_norm": 1.0507415533065796, + "learning_rate": 8.339030234143032e-05, + "loss": 0.7686, + "step": 83690 + }, + { + "epoch": 0.534735443312932, + "grad_norm": 0.6321387887001038, + "learning_rate": 8.338656733307537e-05, + "loss": 0.9376, + "step": 83700 + }, + { + "epoch": 0.5347993304626707, + "grad_norm": 0.705500066280365, + "learning_rate": 8.33828319884888e-05, + "loss": 0.9138, + "step": 83710 + }, + { + "epoch": 0.5348632176124094, + "grad_norm": 0.745877206325531, + "learning_rate": 8.337909630770824e-05, + "loss": 1.2483, + "step": 83720 + }, + { + "epoch": 0.5349271047621481, + "grad_norm": 0.9086830615997314, + "learning_rate": 8.337536029077129e-05, + "loss": 0.6154, + "step": 83730 + }, + { + "epoch": 0.5349909919118868, + "grad_norm": 1.130573034286499, + "learning_rate": 8.337162393771559e-05, + "loss": 0.8188, + "step": 83740 + }, + { + "epoch": 0.5350548790616255, + "grad_norm": 0.8201401829719543, + "learning_rate": 8.336788724857878e-05, + "loss": 1.1955, + "step": 83750 + }, + { + "epoch": 0.5351187662113642, + "grad_norm": 1.0572373867034912, + "learning_rate": 8.336415022339847e-05, + "loss": 1.0602, + "step": 83760 + }, + { + "epoch": 0.5351826533611029, + "grad_norm": 0.4251170754432678, + "learning_rate": 8.33604128622123e-05, + "loss": 0.7514, + "step": 83770 + }, + { + "epoch": 
0.5352465405108416, + "grad_norm": 1.0447115898132324, + "learning_rate": 8.335667516505791e-05, + "loss": 0.9743, + "step": 83780 + }, + { + "epoch": 0.5353104276605803, + "grad_norm": 0.8478367924690247, + "learning_rate": 8.335293713197296e-05, + "loss": 1.123, + "step": 83790 + }, + { + "epoch": 0.535374314810319, + "grad_norm": 0.8603829145431519, + "learning_rate": 8.334919876299507e-05, + "loss": 0.9385, + "step": 83800 + }, + { + "epoch": 0.5354382019600578, + "grad_norm": 0.719473659992218, + "learning_rate": 8.334546005816188e-05, + "loss": 0.8173, + "step": 83810 + }, + { + "epoch": 0.5355020891097965, + "grad_norm": 1.2602207660675049, + "learning_rate": 8.334172101751108e-05, + "loss": 1.4649, + "step": 83820 + }, + { + "epoch": 0.5355659762595352, + "grad_norm": 1.0611252784729004, + "learning_rate": 8.33379816410803e-05, + "loss": 0.922, + "step": 83830 + }, + { + "epoch": 0.5356298634092739, + "grad_norm": 0.7457683682441711, + "learning_rate": 8.33342419289072e-05, + "loss": 0.9272, + "step": 83840 + }, + { + "epoch": 0.5356937505590126, + "grad_norm": 1.1285589933395386, + "learning_rate": 8.333050188102944e-05, + "loss": 0.9133, + "step": 83850 + }, + { + "epoch": 0.5357576377087513, + "grad_norm": 1.1243196725845337, + "learning_rate": 8.33267614974847e-05, + "loss": 0.6372, + "step": 83860 + }, + { + "epoch": 0.53582152485849, + "grad_norm": 1.4428707361221313, + "learning_rate": 8.332302077831065e-05, + "loss": 0.9235, + "step": 83870 + }, + { + "epoch": 0.5358854120082287, + "grad_norm": 0.7449108362197876, + "learning_rate": 8.331927972354492e-05, + "loss": 0.9763, + "step": 83880 + }, + { + "epoch": 0.5359492991579674, + "grad_norm": 0.6374861001968384, + "learning_rate": 8.331553833322526e-05, + "loss": 0.7703, + "step": 83890 + }, + { + "epoch": 0.5360131863077061, + "grad_norm": 1.0096155405044556, + "learning_rate": 8.331179660738927e-05, + "loss": 0.8562, + "step": 83900 + }, + { + "epoch": 0.5360770734574448, + "grad_norm": 
0.7320453524589539, + "learning_rate": 8.330805454607468e-05, + "loss": 0.666, + "step": 83910 + }, + { + "epoch": 0.5361409606071835, + "grad_norm": 0.8086037635803223, + "learning_rate": 8.330431214931917e-05, + "loss": 0.8849, + "step": 83920 + }, + { + "epoch": 0.5362048477569221, + "grad_norm": 2.057863235473633, + "learning_rate": 8.330056941716043e-05, + "loss": 0.9219, + "step": 83930 + }, + { + "epoch": 0.5362687349066608, + "grad_norm": 0.6205108761787415, + "learning_rate": 8.329682634963614e-05, + "loss": 1.0129, + "step": 83940 + }, + { + "epoch": 0.5363326220563995, + "grad_norm": 1.2019091844558716, + "learning_rate": 8.3293082946784e-05, + "loss": 1.1293, + "step": 83950 + }, + { + "epoch": 0.5363965092061382, + "grad_norm": 0.6992289423942566, + "learning_rate": 8.328933920864172e-05, + "loss": 0.6816, + "step": 83960 + }, + { + "epoch": 0.536460396355877, + "grad_norm": 0.8456112742424011, + "learning_rate": 8.328559513524699e-05, + "loss": 0.9054, + "step": 83970 + }, + { + "epoch": 0.5365242835056157, + "grad_norm": 0.6546765565872192, + "learning_rate": 8.328185072663752e-05, + "loss": 0.8627, + "step": 83980 + }, + { + "epoch": 0.5365881706553544, + "grad_norm": 0.9863765835762024, + "learning_rate": 8.327810598285102e-05, + "loss": 1.2343, + "step": 83990 + }, + { + "epoch": 0.5366520578050931, + "grad_norm": 0.8402466773986816, + "learning_rate": 8.32743609039252e-05, + "loss": 0.8839, + "step": 84000 + }, + { + "epoch": 0.5367159449548318, + "grad_norm": 0.5946282744407654, + "learning_rate": 8.327061548989778e-05, + "loss": 0.8401, + "step": 84010 + }, + { + "epoch": 0.5367798321045705, + "grad_norm": 0.8258355855941772, + "learning_rate": 8.32668697408065e-05, + "loss": 1.094, + "step": 84020 + }, + { + "epoch": 0.5368437192543092, + "grad_norm": 0.779899537563324, + "learning_rate": 8.326312365668905e-05, + "loss": 1.0426, + "step": 84030 + }, + { + "epoch": 0.5369076064040479, + "grad_norm": 0.9077179431915283, + "learning_rate": 
8.325937723758314e-05, + "loss": 0.9158, + "step": 84040 + }, + { + "epoch": 0.5369714935537866, + "grad_norm": 1.139228105545044, + "learning_rate": 8.325563048352655e-05, + "loss": 0.8906, + "step": 84050 + }, + { + "epoch": 0.5370353807035253, + "grad_norm": 0.8066197037696838, + "learning_rate": 8.3251883394557e-05, + "loss": 0.8026, + "step": 84060 + }, + { + "epoch": 0.537099267853264, + "grad_norm": 0.8473499417304993, + "learning_rate": 8.32481359707122e-05, + "loss": 0.5615, + "step": 84070 + }, + { + "epoch": 0.5371631550030027, + "grad_norm": 1.1238465309143066, + "learning_rate": 8.324438821202992e-05, + "loss": 0.814, + "step": 84080 + }, + { + "epoch": 0.5372270421527414, + "grad_norm": 0.8760488629341125, + "learning_rate": 8.324064011854789e-05, + "loss": 0.8522, + "step": 84090 + }, + { + "epoch": 0.5372909293024801, + "grad_norm": 1.5137993097305298, + "learning_rate": 8.323689169030384e-05, + "loss": 0.7777, + "step": 84100 + }, + { + "epoch": 0.5373548164522188, + "grad_norm": 1.2992900609970093, + "learning_rate": 8.323314292733556e-05, + "loss": 0.8892, + "step": 84110 + }, + { + "epoch": 0.5374187036019575, + "grad_norm": 0.8411065936088562, + "learning_rate": 8.322939382968077e-05, + "loss": 0.8524, + "step": 84120 + }, + { + "epoch": 0.5374825907516962, + "grad_norm": 0.8992130160331726, + "learning_rate": 8.322564439737723e-05, + "loss": 0.8281, + "step": 84130 + }, + { + "epoch": 0.537546477901435, + "grad_norm": 0.5751587152481079, + "learning_rate": 8.322189463046271e-05, + "loss": 0.749, + "step": 84140 + }, + { + "epoch": 0.5376103650511737, + "grad_norm": 0.6489611268043518, + "learning_rate": 8.321814452897498e-05, + "loss": 0.9997, + "step": 84150 + }, + { + "epoch": 0.5376742522009124, + "grad_norm": 0.6058949828147888, + "learning_rate": 8.321439409295179e-05, + "loss": 1.159, + "step": 84160 + }, + { + "epoch": 0.537738139350651, + "grad_norm": 0.779172420501709, + "learning_rate": 8.321064332243091e-05, + "loss": 0.8733, + 
"step": 84170 + }, + { + "epoch": 0.5378020265003897, + "grad_norm": 0.8683562278747559, + "learning_rate": 8.320689221745012e-05, + "loss": 0.7102, + "step": 84180 + }, + { + "epoch": 0.5378659136501284, + "grad_norm": 0.6446613073348999, + "learning_rate": 8.32031407780472e-05, + "loss": 0.6436, + "step": 84190 + }, + { + "epoch": 0.5379298007998671, + "grad_norm": 0.7266974449157715, + "learning_rate": 8.319938900425994e-05, + "loss": 0.8872, + "step": 84200 + }, + { + "epoch": 0.5379936879496058, + "grad_norm": 0.8739939332008362, + "learning_rate": 8.319563689612611e-05, + "loss": 0.8188, + "step": 84210 + }, + { + "epoch": 0.5380575750993445, + "grad_norm": 1.239883542060852, + "learning_rate": 8.319188445368349e-05, + "loss": 0.8107, + "step": 84220 + }, + { + "epoch": 0.5381214622490832, + "grad_norm": 1.1432856321334839, + "learning_rate": 8.318813167696987e-05, + "loss": 0.7252, + "step": 84230 + }, + { + "epoch": 0.5381853493988219, + "grad_norm": 1.283229947090149, + "learning_rate": 8.318437856602306e-05, + "loss": 0.8599, + "step": 84240 + }, + { + "epoch": 0.5382492365485606, + "grad_norm": 1.238756775856018, + "learning_rate": 8.318062512088087e-05, + "loss": 0.939, + "step": 84250 + }, + { + "epoch": 0.5383131236982993, + "grad_norm": 0.9360271096229553, + "learning_rate": 8.317687134158106e-05, + "loss": 0.9372, + "step": 84260 + }, + { + "epoch": 0.538377010848038, + "grad_norm": 0.6929467916488647, + "learning_rate": 8.317311722816145e-05, + "loss": 0.8145, + "step": 84270 + }, + { + "epoch": 0.5384408979977767, + "grad_norm": 1.410101294517517, + "learning_rate": 8.316936278065986e-05, + "loss": 0.6732, + "step": 84280 + }, + { + "epoch": 0.5385047851475154, + "grad_norm": 1.029524803161621, + "learning_rate": 8.316560799911408e-05, + "loss": 1.0576, + "step": 84290 + }, + { + "epoch": 0.5385686722972541, + "grad_norm": 1.0988661050796509, + "learning_rate": 8.316185288356194e-05, + "loss": 0.7547, + "step": 84300 + }, + { + "epoch": 
0.5386325594469928, + "grad_norm": 0.8414357304573059, + "learning_rate": 8.315809743404126e-05, + "loss": 0.9667, + "step": 84310 + }, + { + "epoch": 0.5386964465967315, + "grad_norm": 0.6246783137321472, + "learning_rate": 8.315434165058983e-05, + "loss": 0.7647, + "step": 84320 + }, + { + "epoch": 0.5387603337464703, + "grad_norm": 0.7971277236938477, + "learning_rate": 8.315058553324551e-05, + "loss": 1.2018, + "step": 84330 + }, + { + "epoch": 0.538824220896209, + "grad_norm": 0.7713975310325623, + "learning_rate": 8.314682908204612e-05, + "loss": 0.9313, + "step": 84340 + }, + { + "epoch": 0.5388881080459477, + "grad_norm": 0.6083114147186279, + "learning_rate": 8.314307229702949e-05, + "loss": 0.8577, + "step": 84350 + }, + { + "epoch": 0.5389519951956864, + "grad_norm": 1.0030479431152344, + "learning_rate": 8.313931517823344e-05, + "loss": 0.88, + "step": 84360 + }, + { + "epoch": 0.5390158823454251, + "grad_norm": 0.9634591341018677, + "learning_rate": 8.313555772569581e-05, + "loss": 1.137, + "step": 84370 + }, + { + "epoch": 0.5390797694951638, + "grad_norm": 0.6758565902709961, + "learning_rate": 8.313179993945445e-05, + "loss": 0.8548, + "step": 84380 + }, + { + "epoch": 0.5391436566449025, + "grad_norm": 1.2440674304962158, + "learning_rate": 8.312804181954721e-05, + "loss": 0.9583, + "step": 84390 + }, + { + "epoch": 0.5392075437946412, + "grad_norm": 0.7709629535675049, + "learning_rate": 8.312428336601193e-05, + "loss": 0.9656, + "step": 84400 + }, + { + "epoch": 0.5392714309443798, + "grad_norm": 0.8080304265022278, + "learning_rate": 8.312052457888646e-05, + "loss": 0.8182, + "step": 84410 + }, + { + "epoch": 0.5393353180941185, + "grad_norm": 0.7901466488838196, + "learning_rate": 8.311676545820865e-05, + "loss": 0.8039, + "step": 84420 + }, + { + "epoch": 0.5393992052438572, + "grad_norm": 0.6051963567733765, + "learning_rate": 8.311338196444268e-05, + "loss": 0.9002, + "step": 84430 + }, + { + "epoch": 0.5394630923935959, + "grad_norm": 
0.8808472156524658, + "learning_rate": 8.310962221011971e-05, + "loss": 0.8492, + "step": 84440 + }, + { + "epoch": 0.5395269795433346, + "grad_norm": 0.7070138454437256, + "learning_rate": 8.310586212235423e-05, + "loss": 1.134, + "step": 84450 + }, + { + "epoch": 0.5395908666930733, + "grad_norm": 0.7789306640625, + "learning_rate": 8.310210170118406e-05, + "loss": 0.7914, + "step": 84460 + }, + { + "epoch": 0.539654753842812, + "grad_norm": 0.9255892634391785, + "learning_rate": 8.30983409466471e-05, + "loss": 0.8324, + "step": 84470 + }, + { + "epoch": 0.5397186409925507, + "grad_norm": 1.0117281675338745, + "learning_rate": 8.309457985878122e-05, + "loss": 0.9676, + "step": 84480 + }, + { + "epoch": 0.5397825281422894, + "grad_norm": 0.7408267855644226, + "learning_rate": 8.309081843762428e-05, + "loss": 0.829, + "step": 84490 + }, + { + "epoch": 0.5398464152920281, + "grad_norm": 0.6966201663017273, + "learning_rate": 8.308705668321417e-05, + "loss": 0.9113, + "step": 84500 + }, + { + "epoch": 0.5399103024417669, + "grad_norm": 0.7605626583099365, + "learning_rate": 8.308329459558877e-05, + "loss": 0.9392, + "step": 84510 + }, + { + "epoch": 0.5399741895915056, + "grad_norm": 0.7314460277557373, + "learning_rate": 8.307953217478599e-05, + "loss": 0.7721, + "step": 84520 + }, + { + "epoch": 0.5400380767412443, + "grad_norm": 0.8111374974250793, + "learning_rate": 8.30757694208437e-05, + "loss": 0.6695, + "step": 84530 + }, + { + "epoch": 0.540101963890983, + "grad_norm": 0.7169995903968811, + "learning_rate": 8.307200633379978e-05, + "loss": 0.8237, + "step": 84540 + }, + { + "epoch": 0.5401658510407217, + "grad_norm": 0.8992086052894592, + "learning_rate": 8.306824291369216e-05, + "loss": 0.7942, + "step": 84550 + }, + { + "epoch": 0.5402297381904604, + "grad_norm": 0.5550522804260254, + "learning_rate": 8.306447916055871e-05, + "loss": 0.8189, + "step": 84560 + }, + { + "epoch": 0.5402936253401991, + "grad_norm": 1.1253445148468018, + "learning_rate": 
8.306071507443737e-05, + "loss": 0.8835, + "step": 84570 + }, + { + "epoch": 0.5403575124899378, + "grad_norm": 1.120518684387207, + "learning_rate": 8.305695065536602e-05, + "loss": 1.099, + "step": 84580 + }, + { + "epoch": 0.5404213996396765, + "grad_norm": 1.3610060214996338, + "learning_rate": 8.305318590338258e-05, + "loss": 0.9345, + "step": 84590 + }, + { + "epoch": 0.5404852867894152, + "grad_norm": 0.8917859792709351, + "learning_rate": 8.304942081852496e-05, + "loss": 1.2491, + "step": 84600 + }, + { + "epoch": 0.5405491739391539, + "grad_norm": 0.734668493270874, + "learning_rate": 8.304565540083107e-05, + "loss": 1.0179, + "step": 84610 + }, + { + "epoch": 0.5406130610888926, + "grad_norm": 0.808816134929657, + "learning_rate": 8.304188965033885e-05, + "loss": 0.9507, + "step": 84620 + }, + { + "epoch": 0.5406769482386313, + "grad_norm": 0.8101891875267029, + "learning_rate": 8.303812356708622e-05, + "loss": 0.7707, + "step": 84630 + }, + { + "epoch": 0.54074083538837, + "grad_norm": 0.5614955425262451, + "learning_rate": 8.303435715111111e-05, + "loss": 0.9146, + "step": 84640 + }, + { + "epoch": 0.5408047225381087, + "grad_norm": 1.0607208013534546, + "learning_rate": 8.303059040245144e-05, + "loss": 0.8684, + "step": 84650 + }, + { + "epoch": 0.5408686096878473, + "grad_norm": 0.9058458805084229, + "learning_rate": 8.302682332114515e-05, + "loss": 0.7029, + "step": 84660 + }, + { + "epoch": 0.540932496837586, + "grad_norm": 0.9082807898521423, + "learning_rate": 8.302305590723016e-05, + "loss": 0.8539, + "step": 84670 + }, + { + "epoch": 0.5409963839873247, + "grad_norm": 0.8213421702384949, + "learning_rate": 8.301928816074445e-05, + "loss": 0.9783, + "step": 84680 + }, + { + "epoch": 0.5410602711370635, + "grad_norm": 0.7759522795677185, + "learning_rate": 8.301552008172593e-05, + "loss": 0.6989, + "step": 84690 + }, + { + "epoch": 0.5411241582868022, + "grad_norm": 0.574531078338623, + "learning_rate": 8.301175167021256e-05, + "loss": 0.9258, + 
"step": 84700 + }, + { + "epoch": 0.5411880454365409, + "grad_norm": 0.8771001100540161, + "learning_rate": 8.300798292624228e-05, + "loss": 1.0307, + "step": 84710 + }, + { + "epoch": 0.5412519325862796, + "grad_norm": 1.086178183555603, + "learning_rate": 8.300421384985309e-05, + "loss": 1.1992, + "step": 84720 + }, + { + "epoch": 0.5413158197360183, + "grad_norm": 1.1887942552566528, + "learning_rate": 8.300044444108288e-05, + "loss": 0.7615, + "step": 84730 + }, + { + "epoch": 0.541379706885757, + "grad_norm": 0.909010648727417, + "learning_rate": 8.299667469996966e-05, + "loss": 0.9202, + "step": 84740 + }, + { + "epoch": 0.5414435940354957, + "grad_norm": 0.6186991930007935, + "learning_rate": 8.299290462655138e-05, + "loss": 0.7071, + "step": 84750 + }, + { + "epoch": 0.5415074811852344, + "grad_norm": 0.7226212620735168, + "learning_rate": 8.2989134220866e-05, + "loss": 1.0302, + "step": 84760 + }, + { + "epoch": 0.5415713683349731, + "grad_norm": 1.6351087093353271, + "learning_rate": 8.298536348295152e-05, + "loss": 0.7616, + "step": 84770 + }, + { + "epoch": 0.5416352554847118, + "grad_norm": 2.3202216625213623, + "learning_rate": 8.298159241284587e-05, + "loss": 0.8703, + "step": 84780 + }, + { + "epoch": 0.5416991426344505, + "grad_norm": 0.5504477620124817, + "learning_rate": 8.297782101058706e-05, + "loss": 1.0846, + "step": 84790 + }, + { + "epoch": 0.5417630297841892, + "grad_norm": 0.846871554851532, + "learning_rate": 8.297404927621306e-05, + "loss": 0.9876, + "step": 84800 + }, + { + "epoch": 0.5418269169339279, + "grad_norm": 0.9501508474349976, + "learning_rate": 8.297027720976185e-05, + "loss": 0.779, + "step": 84810 + }, + { + "epoch": 0.5418908040836666, + "grad_norm": 0.6770570278167725, + "learning_rate": 8.296650481127144e-05, + "loss": 0.741, + "step": 84820 + }, + { + "epoch": 0.5419546912334053, + "grad_norm": 1.0204015970230103, + "learning_rate": 8.296273208077981e-05, + "loss": 0.8651, + "step": 84830 + }, + { + "epoch": 
0.542018578383144, + "grad_norm": 1.2423951625823975, + "learning_rate": 8.295895901832493e-05, + "loss": 0.9325, + "step": 84840 + }, + { + "epoch": 0.5420824655328828, + "grad_norm": 1.19427490234375, + "learning_rate": 8.295518562394484e-05, + "loss": 0.9283, + "step": 84850 + }, + { + "epoch": 0.5421463526826215, + "grad_norm": 0.9197470545768738, + "learning_rate": 8.29514118976775e-05, + "loss": 0.7334, + "step": 84860 + }, + { + "epoch": 0.5422102398323602, + "grad_norm": 1.0136433839797974, + "learning_rate": 8.294763783956096e-05, + "loss": 0.6607, + "step": 84870 + }, + { + "epoch": 0.5422741269820989, + "grad_norm": 1.1331266164779663, + "learning_rate": 8.294386344963319e-05, + "loss": 0.6727, + "step": 84880 + }, + { + "epoch": 0.5423380141318376, + "grad_norm": 1.1634505987167358, + "learning_rate": 8.294008872793222e-05, + "loss": 1.0549, + "step": 84890 + }, + { + "epoch": 0.5424019012815762, + "grad_norm": 0.885592520236969, + "learning_rate": 8.293631367449605e-05, + "loss": 0.9175, + "step": 84900 + }, + { + "epoch": 0.5424657884313149, + "grad_norm": 0.7307121753692627, + "learning_rate": 8.293253828936271e-05, + "loss": 0.8359, + "step": 84910 + }, + { + "epoch": 0.5425296755810536, + "grad_norm": 1.0684562921524048, + "learning_rate": 8.292876257257022e-05, + "loss": 0.9552, + "step": 84920 + }, + { + "epoch": 0.5425935627307923, + "grad_norm": 0.9303468465805054, + "learning_rate": 8.29249865241566e-05, + "loss": 0.8814, + "step": 84930 + }, + { + "epoch": 0.542657449880531, + "grad_norm": 1.3686809539794922, + "learning_rate": 8.292121014415987e-05, + "loss": 1.0071, + "step": 84940 + }, + { + "epoch": 0.5427213370302697, + "grad_norm": 0.9795172214508057, + "learning_rate": 8.29174334326181e-05, + "loss": 0.8684, + "step": 84950 + }, + { + "epoch": 0.5427852241800084, + "grad_norm": 0.7319976687431335, + "learning_rate": 8.29136563895693e-05, + "loss": 0.8488, + "step": 84960 + }, + { + "epoch": 0.5428491113297471, + "grad_norm": 
0.7034667730331421, + "learning_rate": 8.290987901505148e-05, + "loss": 0.8596, + "step": 84970 + }, + { + "epoch": 0.5429129984794858, + "grad_norm": 1.3945845365524292, + "learning_rate": 8.290610130910272e-05, + "loss": 0.759, + "step": 84980 + }, + { + "epoch": 0.5429768856292245, + "grad_norm": 0.878729522228241, + "learning_rate": 8.290232327176104e-05, + "loss": 1.0464, + "step": 84990 + }, + { + "epoch": 0.5430407727789632, + "grad_norm": 1.176857590675354, + "learning_rate": 8.289854490306453e-05, + "loss": 0.9618, + "step": 85000 + }, + { + "epoch": 0.5431046599287019, + "grad_norm": 1.061789870262146, + "learning_rate": 8.289476620305118e-05, + "loss": 0.7039, + "step": 85010 + }, + { + "epoch": 0.5431685470784406, + "grad_norm": 1.1933741569519043, + "learning_rate": 8.289098717175909e-05, + "loss": 0.9763, + "step": 85020 + }, + { + "epoch": 0.5432324342281794, + "grad_norm": 1.391781210899353, + "learning_rate": 8.28872078092263e-05, + "loss": 1.044, + "step": 85030 + }, + { + "epoch": 0.5432963213779181, + "grad_norm": 0.5679248571395874, + "learning_rate": 8.288342811549088e-05, + "loss": 1.1061, + "step": 85040 + }, + { + "epoch": 0.5433602085276568, + "grad_norm": 0.892066240310669, + "learning_rate": 8.28796480905909e-05, + "loss": 0.8354, + "step": 85050 + }, + { + "epoch": 0.5434240956773955, + "grad_norm": 0.6071507930755615, + "learning_rate": 8.28758677345644e-05, + "loss": 1.047, + "step": 85060 + }, + { + "epoch": 0.5434879828271342, + "grad_norm": 0.8333146572113037, + "learning_rate": 8.287208704744946e-05, + "loss": 0.7995, + "step": 85070 + }, + { + "epoch": 0.5435518699768729, + "grad_norm": 1.3047791719436646, + "learning_rate": 8.28683060292842e-05, + "loss": 1.1147, + "step": 85080 + }, + { + "epoch": 0.5436157571266116, + "grad_norm": 0.8263481259346008, + "learning_rate": 8.286452468010664e-05, + "loss": 0.9913, + "step": 85090 + }, + { + "epoch": 0.5436796442763503, + "grad_norm": 0.7588023543357849, + "learning_rate": 
8.28607429999549e-05, + "loss": 0.8798, + "step": 85100 + }, + { + "epoch": 0.543743531426089, + "grad_norm": 0.6401307582855225, + "learning_rate": 8.285696098886704e-05, + "loss": 1.1625, + "step": 85110 + }, + { + "epoch": 0.5438074185758277, + "grad_norm": 1.0735725164413452, + "learning_rate": 8.285317864688116e-05, + "loss": 0.836, + "step": 85120 + }, + { + "epoch": 0.5438713057255664, + "grad_norm": 0.8113425970077515, + "learning_rate": 8.284939597403533e-05, + "loss": 0.7685, + "step": 85130 + }, + { + "epoch": 0.543935192875305, + "grad_norm": 0.911358654499054, + "learning_rate": 8.28456129703677e-05, + "loss": 0.9308, + "step": 85140 + }, + { + "epoch": 0.5439990800250437, + "grad_norm": 1.186699390411377, + "learning_rate": 8.284182963591631e-05, + "loss": 0.8727, + "step": 85150 + }, + { + "epoch": 0.5440629671747824, + "grad_norm": 1.335977554321289, + "learning_rate": 8.283804597071928e-05, + "loss": 1.0234, + "step": 85160 + }, + { + "epoch": 0.5441268543245211, + "grad_norm": 1.1093186140060425, + "learning_rate": 8.283426197481473e-05, + "loss": 1.1973, + "step": 85170 + }, + { + "epoch": 0.5441907414742598, + "grad_norm": 0.4914005994796753, + "learning_rate": 8.283047764824075e-05, + "loss": 0.9727, + "step": 85180 + }, + { + "epoch": 0.5442546286239985, + "grad_norm": 0.5826841592788696, + "learning_rate": 8.282669299103544e-05, + "loss": 0.9037, + "step": 85190 + }, + { + "epoch": 0.5443185157737372, + "grad_norm": 1.0560849905014038, + "learning_rate": 8.282290800323697e-05, + "loss": 1.0382, + "step": 85200 + }, + { + "epoch": 0.544382402923476, + "grad_norm": 0.6486173272132874, + "learning_rate": 8.28191226848834e-05, + "loss": 0.913, + "step": 85210 + }, + { + "epoch": 0.5444462900732147, + "grad_norm": 1.390495777130127, + "learning_rate": 8.281533703601288e-05, + "loss": 0.9475, + "step": 85220 + }, + { + "epoch": 0.5445101772229534, + "grad_norm": 0.9394730925559998, + "learning_rate": 8.28115510566635e-05, + "loss": 0.8956, + 
"step": 85230 + }, + { + "epoch": 0.5445740643726921, + "grad_norm": 1.8664871454238892, + "learning_rate": 8.280776474687343e-05, + "loss": 0.8971, + "step": 85240 + }, + { + "epoch": 0.5446379515224308, + "grad_norm": 1.337372899055481, + "learning_rate": 8.28039781066808e-05, + "loss": 0.9751, + "step": 85250 + }, + { + "epoch": 0.5447018386721695, + "grad_norm": 0.7601255178451538, + "learning_rate": 8.280019113612371e-05, + "loss": 0.8855, + "step": 85260 + }, + { + "epoch": 0.5447657258219082, + "grad_norm": 0.9285007119178772, + "learning_rate": 8.279640383524034e-05, + "loss": 0.6376, + "step": 85270 + }, + { + "epoch": 0.5448296129716469, + "grad_norm": 0.9922348260879517, + "learning_rate": 8.279261620406881e-05, + "loss": 1.2103, + "step": 85280 + }, + { + "epoch": 0.5448935001213856, + "grad_norm": 1.2273433208465576, + "learning_rate": 8.278882824264726e-05, + "loss": 0.7563, + "step": 85290 + }, + { + "epoch": 0.5449573872711243, + "grad_norm": 1.0333365201950073, + "learning_rate": 8.278503995101383e-05, + "loss": 1.0593, + "step": 85300 + }, + { + "epoch": 0.545021274420863, + "grad_norm": 0.8431949615478516, + "learning_rate": 8.278125132920669e-05, + "loss": 1.0156, + "step": 85310 + }, + { + "epoch": 0.5450851615706017, + "grad_norm": 0.8159672617912292, + "learning_rate": 8.277746237726401e-05, + "loss": 0.8822, + "step": 85320 + }, + { + "epoch": 0.5451490487203404, + "grad_norm": 0.8089435696601868, + "learning_rate": 8.27736730952239e-05, + "loss": 0.9965, + "step": 85330 + }, + { + "epoch": 0.5452129358700791, + "grad_norm": 1.1019364595413208, + "learning_rate": 8.276988348312456e-05, + "loss": 0.6972, + "step": 85340 + }, + { + "epoch": 0.5452768230198178, + "grad_norm": 0.755990207195282, + "learning_rate": 8.276609354100414e-05, + "loss": 0.941, + "step": 85350 + }, + { + "epoch": 0.5453407101695565, + "grad_norm": 1.2020833492279053, + "learning_rate": 8.276230326890081e-05, + "loss": 0.8955, + "step": 85360 + }, + { + "epoch": 
0.5454045973192952, + "grad_norm": 0.9152243733406067, + "learning_rate": 8.275851266685276e-05, + "loss": 0.9033, + "step": 85370 + }, + { + "epoch": 0.545468484469034, + "grad_norm": 0.9686945080757141, + "learning_rate": 8.275472173489814e-05, + "loss": 0.967, + "step": 85380 + }, + { + "epoch": 0.5455323716187725, + "grad_norm": 0.7973108887672424, + "learning_rate": 8.275093047307511e-05, + "loss": 0.9112, + "step": 85390 + }, + { + "epoch": 0.5455962587685113, + "grad_norm": 0.8229728937149048, + "learning_rate": 8.27471388814219e-05, + "loss": 0.9667, + "step": 85400 + }, + { + "epoch": 0.54566014591825, + "grad_norm": 2.0562517642974854, + "learning_rate": 8.274334695997668e-05, + "loss": 0.8901, + "step": 85410 + }, + { + "epoch": 0.5457240330679887, + "grad_norm": 0.8347399830818176, + "learning_rate": 8.273955470877762e-05, + "loss": 0.6852, + "step": 85420 + }, + { + "epoch": 0.5457879202177274, + "grad_norm": 0.8529654741287231, + "learning_rate": 8.273576212786292e-05, + "loss": 0.7934, + "step": 85430 + }, + { + "epoch": 0.5458518073674661, + "grad_norm": 1.5173248052597046, + "learning_rate": 8.273196921727075e-05, + "loss": 1.0872, + "step": 85440 + }, + { + "epoch": 0.5459156945172048, + "grad_norm": 0.9968129396438599, + "learning_rate": 8.272817597703936e-05, + "loss": 0.831, + "step": 85450 + }, + { + "epoch": 0.5459795816669435, + "grad_norm": 0.980364978313446, + "learning_rate": 8.272438240720692e-05, + "loss": 0.993, + "step": 85460 + }, + { + "epoch": 0.5460434688166822, + "grad_norm": 0.8081022500991821, + "learning_rate": 8.272058850781164e-05, + "loss": 0.8859, + "step": 85470 + }, + { + "epoch": 0.5461073559664209, + "grad_norm": 0.5837751030921936, + "learning_rate": 8.271679427889172e-05, + "loss": 0.8914, + "step": 85480 + }, + { + "epoch": 0.5461712431161596, + "grad_norm": 1.0083229541778564, + "learning_rate": 8.271299972048538e-05, + "loss": 1.0343, + "step": 85490 + }, + { + "epoch": 0.5462351302658983, + "grad_norm": 
1.3048707246780396, + "learning_rate": 8.270920483263082e-05, + "loss": 1.014, + "step": 85500 + }, + { + "epoch": 0.546299017415637, + "grad_norm": 0.6506006121635437, + "learning_rate": 8.27054096153663e-05, + "loss": 0.941, + "step": 85510 + }, + { + "epoch": 0.5463629045653757, + "grad_norm": 0.6416545510292053, + "learning_rate": 8.270161406872998e-05, + "loss": 1.053, + "step": 85520 + }, + { + "epoch": 0.5464267917151144, + "grad_norm": 0.9509826302528381, + "learning_rate": 8.269781819276015e-05, + "loss": 1.2414, + "step": 85530 + }, + { + "epoch": 0.5464906788648531, + "grad_norm": 0.8895853757858276, + "learning_rate": 8.269402198749496e-05, + "loss": 0.7903, + "step": 85540 + }, + { + "epoch": 0.5465545660145918, + "grad_norm": 0.7762950658798218, + "learning_rate": 8.269022545297272e-05, + "loss": 1.1589, + "step": 85550 + }, + { + "epoch": 0.5466184531643306, + "grad_norm": 1.1687923669815063, + "learning_rate": 8.268642858923161e-05, + "loss": 0.7478, + "step": 85560 + }, + { + "epoch": 0.5466823403140693, + "grad_norm": 0.8369365930557251, + "learning_rate": 8.268263139630989e-05, + "loss": 0.7321, + "step": 85570 + }, + { + "epoch": 0.546746227463808, + "grad_norm": 1.429382085800171, + "learning_rate": 8.26788338742458e-05, + "loss": 0.9934, + "step": 85580 + }, + { + "epoch": 0.5468101146135467, + "grad_norm": 0.798319935798645, + "learning_rate": 8.267503602307758e-05, + "loss": 1.0856, + "step": 85590 + }, + { + "epoch": 0.5468740017632854, + "grad_norm": 0.8369146585464478, + "learning_rate": 8.267123784284348e-05, + "loss": 1.1573, + "step": 85600 + }, + { + "epoch": 0.5469378889130241, + "grad_norm": 0.8850108981132507, + "learning_rate": 8.266743933358176e-05, + "loss": 0.9553, + "step": 85610 + }, + { + "epoch": 0.5470017760627628, + "grad_norm": 1.494284987449646, + "learning_rate": 8.266364049533065e-05, + "loss": 0.8582, + "step": 85620 + }, + { + "epoch": 0.5470656632125014, + "grad_norm": 1.0392504930496216, + "learning_rate": 
8.265984132812843e-05, + "loss": 0.9366, + "step": 85630 + }, + { + "epoch": 0.5471295503622401, + "grad_norm": 1.1216447353363037, + "learning_rate": 8.265604183201335e-05, + "loss": 0.7129, + "step": 85640 + }, + { + "epoch": 0.5471934375119788, + "grad_norm": 0.9301803708076477, + "learning_rate": 8.265224200702368e-05, + "loss": 0.6782, + "step": 85650 + }, + { + "epoch": 0.5472573246617175, + "grad_norm": 0.8470297455787659, + "learning_rate": 8.264844185319767e-05, + "loss": 0.9429, + "step": 85660 + }, + { + "epoch": 0.5473212118114562, + "grad_norm": 0.5491040945053101, + "learning_rate": 8.264464137057361e-05, + "loss": 0.896, + "step": 85670 + }, + { + "epoch": 0.5473850989611949, + "grad_norm": 2.296058416366577, + "learning_rate": 8.264084055918979e-05, + "loss": 0.9733, + "step": 85680 + }, + { + "epoch": 0.5474489861109336, + "grad_norm": 0.6744887828826904, + "learning_rate": 8.263703941908445e-05, + "loss": 0.8849, + "step": 85690 + }, + { + "epoch": 0.5475128732606723, + "grad_norm": 0.819434404373169, + "learning_rate": 8.26332379502959e-05, + "loss": 0.9268, + "step": 85700 + }, + { + "epoch": 0.547576760410411, + "grad_norm": 0.725577175617218, + "learning_rate": 8.26294361528624e-05, + "loss": 0.7676, + "step": 85710 + }, + { + "epoch": 0.5476406475601497, + "grad_norm": 0.8266330361366272, + "learning_rate": 8.262563402682226e-05, + "loss": 0.7853, + "step": 85720 + }, + { + "epoch": 0.5477045347098884, + "grad_norm": 0.8885651230812073, + "learning_rate": 8.262183157221375e-05, + "loss": 0.9026, + "step": 85730 + }, + { + "epoch": 0.5477684218596272, + "grad_norm": 0.5924326777458191, + "learning_rate": 8.261802878907518e-05, + "loss": 0.8313, + "step": 85740 + }, + { + "epoch": 0.5478323090093659, + "grad_norm": 0.5639459490776062, + "learning_rate": 8.261422567744484e-05, + "loss": 0.8063, + "step": 85750 + }, + { + "epoch": 0.5478961961591046, + "grad_norm": 1.0976332426071167, + "learning_rate": 8.261042223736101e-05, + "loss": 0.7802, + 
"step": 85760 + }, + { + "epoch": 0.5479600833088433, + "grad_norm": 0.8617295622825623, + "learning_rate": 8.260661846886205e-05, + "loss": 0.8656, + "step": 85770 + }, + { + "epoch": 0.548023970458582, + "grad_norm": 0.8581666946411133, + "learning_rate": 8.260281437198622e-05, + "loss": 0.84, + "step": 85780 + }, + { + "epoch": 0.5480878576083207, + "grad_norm": 1.351036548614502, + "learning_rate": 8.259900994677185e-05, + "loss": 0.7633, + "step": 85790 + }, + { + "epoch": 0.5481517447580594, + "grad_norm": 0.803022563457489, + "learning_rate": 8.259520519325725e-05, + "loss": 0.8321, + "step": 85800 + }, + { + "epoch": 0.5482156319077981, + "grad_norm": 1.7103533744812012, + "learning_rate": 8.259140011148073e-05, + "loss": 1.2177, + "step": 85810 + }, + { + "epoch": 0.5482795190575368, + "grad_norm": 0.7438388466835022, + "learning_rate": 8.258759470148061e-05, + "loss": 0.7835, + "step": 85820 + }, + { + "epoch": 0.5483434062072755, + "grad_norm": 0.5970612168312073, + "learning_rate": 8.258378896329521e-05, + "loss": 0.8907, + "step": 85830 + }, + { + "epoch": 0.5484072933570142, + "grad_norm": 1.2347373962402344, + "learning_rate": 8.257998289696289e-05, + "loss": 0.7738, + "step": 85840 + }, + { + "epoch": 0.5484711805067529, + "grad_norm": 0.8404068350791931, + "learning_rate": 8.257617650252194e-05, + "loss": 0.9716, + "step": 85850 + }, + { + "epoch": 0.5485350676564916, + "grad_norm": 0.6082412600517273, + "learning_rate": 8.257236978001071e-05, + "loss": 0.8301, + "step": 85860 + }, + { + "epoch": 0.5485989548062302, + "grad_norm": 1.7358567714691162, + "learning_rate": 8.256856272946756e-05, + "loss": 0.8836, + "step": 85870 + }, + { + "epoch": 0.5486628419559689, + "grad_norm": 1.0207961797714233, + "learning_rate": 8.256475535093077e-05, + "loss": 0.8748, + "step": 85880 + }, + { + "epoch": 0.5487267291057076, + "grad_norm": 0.6568586826324463, + "learning_rate": 8.256094764443876e-05, + "loss": 0.8192, + "step": 85890 + }, + { + "epoch": 
0.5487906162554463, + "grad_norm": 1.5999755859375, + "learning_rate": 8.255713961002981e-05, + "loss": 0.8331, + "step": 85900 + }, + { + "epoch": 0.548854503405185, + "grad_norm": 0.7200731635093689, + "learning_rate": 8.255333124774231e-05, + "loss": 0.9032, + "step": 85910 + }, + { + "epoch": 0.5489183905549238, + "grad_norm": 0.9791352152824402, + "learning_rate": 8.254952255761458e-05, + "loss": 1.2814, + "step": 85920 + }, + { + "epoch": 0.5489822777046625, + "grad_norm": 0.7664794921875, + "learning_rate": 8.254571353968504e-05, + "loss": 0.9206, + "step": 85930 + }, + { + "epoch": 0.5490461648544012, + "grad_norm": 0.8306074738502502, + "learning_rate": 8.254190419399197e-05, + "loss": 1.034, + "step": 85940 + }, + { + "epoch": 0.5491100520041399, + "grad_norm": 0.9220788478851318, + "learning_rate": 8.25380945205738e-05, + "loss": 1.0287, + "step": 85950 + }, + { + "epoch": 0.5491739391538786, + "grad_norm": 0.8465439081192017, + "learning_rate": 8.253428451946885e-05, + "loss": 0.6528, + "step": 85960 + }, + { + "epoch": 0.5492378263036173, + "grad_norm": 1.0008618831634521, + "learning_rate": 8.253047419071551e-05, + "loss": 1.0279, + "step": 85970 + }, + { + "epoch": 0.549301713453356, + "grad_norm": 1.0628058910369873, + "learning_rate": 8.252666353435217e-05, + "loss": 1.0874, + "step": 85980 + }, + { + "epoch": 0.5493656006030947, + "grad_norm": 0.6217834949493408, + "learning_rate": 8.252285255041717e-05, + "loss": 0.9456, + "step": 85990 + }, + { + "epoch": 0.5494294877528334, + "grad_norm": 0.46333763003349304, + "learning_rate": 8.251904123894892e-05, + "loss": 0.8225, + "step": 86000 + }, + { + "epoch": 0.5494933749025721, + "grad_norm": 0.8038020730018616, + "learning_rate": 8.251522959998577e-05, + "loss": 0.8062, + "step": 86010 + }, + { + "epoch": 0.5495572620523108, + "grad_norm": 1.191167950630188, + "learning_rate": 8.251141763356614e-05, + "loss": 0.8599, + "step": 86020 + }, + { + "epoch": 0.5496211492020495, + "grad_norm": 
0.8158173561096191, + "learning_rate": 8.25076053397284e-05, + "loss": 0.9257, + "step": 86030 + }, + { + "epoch": 0.5496850363517882, + "grad_norm": 0.7656988501548767, + "learning_rate": 8.250379271851098e-05, + "loss": 0.7153, + "step": 86040 + }, + { + "epoch": 0.5497489235015269, + "grad_norm": 0.7358280420303345, + "learning_rate": 8.249997976995223e-05, + "loss": 1.0252, + "step": 86050 + }, + { + "epoch": 0.5498128106512656, + "grad_norm": 1.4587607383728027, + "learning_rate": 8.249616649409057e-05, + "loss": 0.8815, + "step": 86060 + }, + { + "epoch": 0.5498766978010043, + "grad_norm": 0.657911479473114, + "learning_rate": 8.24923528909644e-05, + "loss": 1.1022, + "step": 86070 + }, + { + "epoch": 0.549940584950743, + "grad_norm": 0.8566390872001648, + "learning_rate": 8.248853896061213e-05, + "loss": 0.7702, + "step": 86080 + }, + { + "epoch": 0.5500044721004818, + "grad_norm": 0.9400178790092468, + "learning_rate": 8.248472470307216e-05, + "loss": 0.811, + "step": 86090 + }, + { + "epoch": 0.5500683592502205, + "grad_norm": 1.0126426219940186, + "learning_rate": 8.24809101183829e-05, + "loss": 0.8001, + "step": 86100 + }, + { + "epoch": 0.5501322463999591, + "grad_norm": 1.513357162475586, + "learning_rate": 8.24770952065828e-05, + "loss": 0.9136, + "step": 86110 + }, + { + "epoch": 0.5501961335496978, + "grad_norm": 1.1465846300125122, + "learning_rate": 8.247327996771024e-05, + "loss": 0.8718, + "step": 86120 + }, + { + "epoch": 0.5502600206994365, + "grad_norm": 0.9992212057113647, + "learning_rate": 8.246946440180365e-05, + "loss": 0.7195, + "step": 86130 + }, + { + "epoch": 0.5503239078491752, + "grad_norm": 0.9585177302360535, + "learning_rate": 8.246564850890148e-05, + "loss": 0.8262, + "step": 86140 + }, + { + "epoch": 0.5503877949989139, + "grad_norm": 0.9354924559593201, + "learning_rate": 8.246183228904212e-05, + "loss": 1.3068, + "step": 86150 + }, + { + "epoch": 0.5504516821486526, + "grad_norm": 0.7876535058021545, + "learning_rate": 
8.245801574226403e-05, + "loss": 0.9991, + "step": 86160 + }, + { + "epoch": 0.5505155692983913, + "grad_norm": 1.089166522026062, + "learning_rate": 8.245419886860566e-05, + "loss": 0.8906, + "step": 86170 + }, + { + "epoch": 0.55057945644813, + "grad_norm": 2.4468488693237305, + "learning_rate": 8.245038166810543e-05, + "loss": 0.9857, + "step": 86180 + }, + { + "epoch": 0.5506433435978687, + "grad_norm": 0.6112414598464966, + "learning_rate": 8.244656414080176e-05, + "loss": 0.8398, + "step": 86190 + }, + { + "epoch": 0.5507072307476074, + "grad_norm": 0.5781684517860413, + "learning_rate": 8.244274628673314e-05, + "loss": 0.8127, + "step": 86200 + }, + { + "epoch": 0.5507711178973461, + "grad_norm": 1.1223702430725098, + "learning_rate": 8.243892810593798e-05, + "loss": 0.8847, + "step": 86210 + }, + { + "epoch": 0.5508350050470848, + "grad_norm": 0.6717120409011841, + "learning_rate": 8.243510959845478e-05, + "loss": 0.8302, + "step": 86220 + }, + { + "epoch": 0.5508988921968235, + "grad_norm": 0.8732984662055969, + "learning_rate": 8.243129076432193e-05, + "loss": 0.6463, + "step": 86230 + }, + { + "epoch": 0.5509627793465622, + "grad_norm": 0.8502248525619507, + "learning_rate": 8.242747160357796e-05, + "loss": 0.8256, + "step": 86240 + }, + { + "epoch": 0.5510266664963009, + "grad_norm": 0.9782050251960754, + "learning_rate": 8.242365211626127e-05, + "loss": 0.9004, + "step": 86250 + }, + { + "epoch": 0.5510905536460396, + "grad_norm": 0.48341086506843567, + "learning_rate": 8.241983230241037e-05, + "loss": 0.7873, + "step": 86260 + }, + { + "epoch": 0.5511544407957784, + "grad_norm": 1.0804799795150757, + "learning_rate": 8.241601216206369e-05, + "loss": 1.1629, + "step": 86270 + }, + { + "epoch": 0.5512183279455171, + "grad_norm": 0.8438146114349365, + "learning_rate": 8.241219169525973e-05, + "loss": 0.7865, + "step": 86280 + }, + { + "epoch": 0.5512822150952558, + "grad_norm": 0.9812383055686951, + "learning_rate": 8.240837090203696e-05, + "loss": 
0.8253, + "step": 86290 + }, + { + "epoch": 0.5513461022449945, + "grad_norm": 1.204162836074829, + "learning_rate": 8.240454978243387e-05, + "loss": 1.0205, + "step": 86300 + }, + { + "epoch": 0.5514099893947332, + "grad_norm": 1.9019670486450195, + "learning_rate": 8.240072833648894e-05, + "loss": 0.9302, + "step": 86310 + }, + { + "epoch": 0.5514738765444719, + "grad_norm": 0.8653010129928589, + "learning_rate": 8.239690656424062e-05, + "loss": 0.7381, + "step": 86320 + }, + { + "epoch": 0.5515377636942106, + "grad_norm": 0.9398866891860962, + "learning_rate": 8.239308446572742e-05, + "loss": 0.9952, + "step": 86330 + }, + { + "epoch": 0.5516016508439493, + "grad_norm": 1.2343336343765259, + "learning_rate": 8.238926204098787e-05, + "loss": 0.9225, + "step": 86340 + }, + { + "epoch": 0.551665537993688, + "grad_norm": 0.6204321384429932, + "learning_rate": 8.23854392900604e-05, + "loss": 0.8943, + "step": 86350 + }, + { + "epoch": 0.5517294251434266, + "grad_norm": 1.226536512374878, + "learning_rate": 8.238161621298355e-05, + "loss": 0.7769, + "step": 86360 + }, + { + "epoch": 0.5517933122931653, + "grad_norm": 1.1433632373809814, + "learning_rate": 8.23777928097958e-05, + "loss": 0.9487, + "step": 86370 + }, + { + "epoch": 0.551857199442904, + "grad_norm": 1.344122052192688, + "learning_rate": 8.237396908053567e-05, + "loss": 0.7983, + "step": 86380 + }, + { + "epoch": 0.5519210865926427, + "grad_norm": 0.6882800459861755, + "learning_rate": 8.237014502524168e-05, + "loss": 0.9736, + "step": 86390 + }, + { + "epoch": 0.5519849737423814, + "grad_norm": 0.9313778877258301, + "learning_rate": 8.236632064395231e-05, + "loss": 0.7895, + "step": 86400 + }, + { + "epoch": 0.5520488608921201, + "grad_norm": 1.4271520376205444, + "learning_rate": 8.236249593670609e-05, + "loss": 0.9795, + "step": 86410 + }, + { + "epoch": 0.5521127480418588, + "grad_norm": 2.3434298038482666, + "learning_rate": 8.235867090354153e-05, + "loss": 0.9954, + "step": 86420 + }, + { + "epoch": 
0.5521766351915975, + "grad_norm": 3.081796169281006, + "learning_rate": 8.235484554449718e-05, + "loss": 0.8829, + "step": 86430 + }, + { + "epoch": 0.5522405223413362, + "grad_norm": 1.0064252614974976, + "learning_rate": 8.235101985961154e-05, + "loss": 0.8981, + "step": 86440 + }, + { + "epoch": 0.552304409491075, + "grad_norm": 1.3928056955337524, + "learning_rate": 8.234719384892314e-05, + "loss": 1.0775, + "step": 86450 + }, + { + "epoch": 0.5523682966408137, + "grad_norm": 1.0833989381790161, + "learning_rate": 8.23433675124705e-05, + "loss": 0.782, + "step": 86460 + }, + { + "epoch": 0.5524321837905524, + "grad_norm": 0.7659380435943604, + "learning_rate": 8.233954085029219e-05, + "loss": 0.9672, + "step": 86470 + }, + { + "epoch": 0.5524960709402911, + "grad_norm": 0.788266658782959, + "learning_rate": 8.23357138624267e-05, + "loss": 0.7397, + "step": 86480 + }, + { + "epoch": 0.5525599580900298, + "grad_norm": 0.8975238800048828, + "learning_rate": 8.233188654891262e-05, + "loss": 0.9158, + "step": 86490 + }, + { + "epoch": 0.5526238452397685, + "grad_norm": 1.3429840803146362, + "learning_rate": 8.232805890978845e-05, + "loss": 1.0506, + "step": 86500 + }, + { + "epoch": 0.5526877323895072, + "grad_norm": 1.5317329168319702, + "learning_rate": 8.232423094509278e-05, + "loss": 0.9425, + "step": 86510 + }, + { + "epoch": 0.5527516195392459, + "grad_norm": 0.8036388754844666, + "learning_rate": 8.232040265486413e-05, + "loss": 0.9453, + "step": 86520 + }, + { + "epoch": 0.5528155066889846, + "grad_norm": 0.8902758359909058, + "learning_rate": 8.231657403914107e-05, + "loss": 0.7362, + "step": 86530 + }, + { + "epoch": 0.5528793938387233, + "grad_norm": 2.19069242477417, + "learning_rate": 8.231274509796215e-05, + "loss": 0.9963, + "step": 86540 + }, + { + "epoch": 0.552943280988462, + "grad_norm": 0.8316643238067627, + "learning_rate": 8.230891583136593e-05, + "loss": 0.9057, + "step": 86550 + }, + { + "epoch": 0.5530071681382007, + "grad_norm": 
1.3300632238388062, + "learning_rate": 8.230508623939097e-05, + "loss": 0.9182, + "step": 86560 + }, + { + "epoch": 0.5530710552879394, + "grad_norm": 0.6165077686309814, + "learning_rate": 8.230125632207585e-05, + "loss": 0.7956, + "step": 86570 + }, + { + "epoch": 0.5531349424376781, + "grad_norm": 0.7122068405151367, + "learning_rate": 8.229742607945915e-05, + "loss": 0.8135, + "step": 86580 + }, + { + "epoch": 0.5531988295874168, + "grad_norm": 1.0047301054000854, + "learning_rate": 8.229359551157941e-05, + "loss": 0.7511, + "step": 86590 + }, + { + "epoch": 0.5532627167371554, + "grad_norm": 0.6250356435775757, + "learning_rate": 8.228976461847522e-05, + "loss": 0.9118, + "step": 86600 + }, + { + "epoch": 0.5533266038868941, + "grad_norm": 0.7042723894119263, + "learning_rate": 8.228593340018518e-05, + "loss": 1.1653, + "step": 86610 + }, + { + "epoch": 0.5533904910366328, + "grad_norm": 1.037340760231018, + "learning_rate": 8.228210185674784e-05, + "loss": 0.9482, + "step": 86620 + }, + { + "epoch": 0.5534543781863716, + "grad_norm": 0.9349649548530579, + "learning_rate": 8.227826998820183e-05, + "loss": 0.7801, + "step": 86630 + }, + { + "epoch": 0.5535182653361103, + "grad_norm": 0.9645569920539856, + "learning_rate": 8.227443779458572e-05, + "loss": 0.8324, + "step": 86640 + }, + { + "epoch": 0.553582152485849, + "grad_norm": 0.991255521774292, + "learning_rate": 8.227060527593808e-05, + "loss": 0.8347, + "step": 86650 + }, + { + "epoch": 0.5536460396355877, + "grad_norm": 0.6694484353065491, + "learning_rate": 8.226677243229753e-05, + "loss": 0.8864, + "step": 86660 + }, + { + "epoch": 0.5537099267853264, + "grad_norm": 0.9695293307304382, + "learning_rate": 8.226293926370268e-05, + "loss": 0.8119, + "step": 86670 + }, + { + "epoch": 0.5537738139350651, + "grad_norm": 0.9662237763404846, + "learning_rate": 8.22591057701921e-05, + "loss": 1.2784, + "step": 86680 + }, + { + "epoch": 0.5538377010848038, + "grad_norm": 0.8255831003189087, + "learning_rate": 
8.225527195180442e-05, + "loss": 0.7241, + "step": 86690 + }, + { + "epoch": 0.5539015882345425, + "grad_norm": 1.059599757194519, + "learning_rate": 8.225143780857827e-05, + "loss": 0.9431, + "step": 86700 + }, + { + "epoch": 0.5539654753842812, + "grad_norm": 0.7622451186180115, + "learning_rate": 8.224760334055222e-05, + "loss": 0.8522, + "step": 86710 + }, + { + "epoch": 0.5540293625340199, + "grad_norm": 0.5998595952987671, + "learning_rate": 8.22437685477649e-05, + "loss": 0.7598, + "step": 86720 + }, + { + "epoch": 0.5540932496837586, + "grad_norm": 0.9817114472389221, + "learning_rate": 8.223993343025496e-05, + "loss": 0.8681, + "step": 86730 + }, + { + "epoch": 0.5541571368334973, + "grad_norm": 0.8455765843391418, + "learning_rate": 8.223609798806097e-05, + "loss": 1.0724, + "step": 86740 + }, + { + "epoch": 0.554221023983236, + "grad_norm": 0.9038847088813782, + "learning_rate": 8.22322622212216e-05, + "loss": 0.8695, + "step": 86750 + }, + { + "epoch": 0.5542849111329747, + "grad_norm": 1.3185269832611084, + "learning_rate": 8.222842612977545e-05, + "loss": 0.8196, + "step": 86760 + }, + { + "epoch": 0.5543487982827134, + "grad_norm": 1.6181766986846924, + "learning_rate": 8.22245897137612e-05, + "loss": 0.9941, + "step": 86770 + }, + { + "epoch": 0.5544126854324521, + "grad_norm": 0.7883678674697876, + "learning_rate": 8.222075297321742e-05, + "loss": 0.9108, + "step": 86780 + }, + { + "epoch": 0.5544765725821909, + "grad_norm": 1.2159056663513184, + "learning_rate": 8.221691590818281e-05, + "loss": 0.8442, + "step": 86790 + }, + { + "epoch": 0.5545404597319296, + "grad_norm": 0.6376563310623169, + "learning_rate": 8.221307851869597e-05, + "loss": 0.8837, + "step": 86800 + }, + { + "epoch": 0.5546043468816683, + "grad_norm": 0.8084425926208496, + "learning_rate": 8.220924080479558e-05, + "loss": 0.9176, + "step": 86810 + }, + { + "epoch": 0.554668234031407, + "grad_norm": 0.9043588042259216, + "learning_rate": 8.220540276652024e-05, + "loss": 1.3496, + 
"step": 86820 + }, + { + "epoch": 0.5547321211811457, + "grad_norm": 0.6929380893707275, + "learning_rate": 8.220156440390865e-05, + "loss": 0.8788, + "step": 86830 + }, + { + "epoch": 0.5547960083308843, + "grad_norm": 0.8745282888412476, + "learning_rate": 8.219772571699945e-05, + "loss": 0.6351, + "step": 86840 + }, + { + "epoch": 0.554859895480623, + "grad_norm": 0.9505758285522461, + "learning_rate": 8.21938867058313e-05, + "loss": 0.7184, + "step": 86850 + }, + { + "epoch": 0.5549237826303617, + "grad_norm": 1.9422472715377808, + "learning_rate": 8.219004737044285e-05, + "loss": 0.915, + "step": 86860 + }, + { + "epoch": 0.5549876697801004, + "grad_norm": 1.590299367904663, + "learning_rate": 8.218620771087277e-05, + "loss": 0.8802, + "step": 86870 + }, + { + "epoch": 0.5550515569298391, + "grad_norm": 0.7018561959266663, + "learning_rate": 8.218236772715972e-05, + "loss": 0.8559, + "step": 86880 + }, + { + "epoch": 0.5551154440795778, + "grad_norm": 1.1416157484054565, + "learning_rate": 8.217852741934242e-05, + "loss": 0.792, + "step": 86890 + }, + { + "epoch": 0.5551793312293165, + "grad_norm": 0.9537761807441711, + "learning_rate": 8.21746867874595e-05, + "loss": 1.0657, + "step": 86900 + }, + { + "epoch": 0.5552432183790552, + "grad_norm": 2.785801887512207, + "learning_rate": 8.217084583154964e-05, + "loss": 1.0421, + "step": 86910 + }, + { + "epoch": 0.5553071055287939, + "grad_norm": 1.0733259916305542, + "learning_rate": 8.216700455165152e-05, + "loss": 1.1064, + "step": 86920 + }, + { + "epoch": 0.5553709926785326, + "grad_norm": 0.9249892830848694, + "learning_rate": 8.216316294780386e-05, + "loss": 0.6912, + "step": 86930 + }, + { + "epoch": 0.5554348798282713, + "grad_norm": 1.2269346714019775, + "learning_rate": 8.215932102004531e-05, + "loss": 0.8911, + "step": 86940 + }, + { + "epoch": 0.55549876697801, + "grad_norm": 1.0599247217178345, + "learning_rate": 8.215547876841459e-05, + "loss": 0.9636, + "step": 86950 + }, + { + "epoch": 
0.5555626541277487, + "grad_norm": 1.5449506044387817, + "learning_rate": 8.215163619295036e-05, + "loss": 0.9123, + "step": 86960 + }, + { + "epoch": 0.5556265412774875, + "grad_norm": 0.9864984154701233, + "learning_rate": 8.214779329369134e-05, + "loss": 0.7596, + "step": 86970 + }, + { + "epoch": 0.5556904284272262, + "grad_norm": 0.9634939432144165, + "learning_rate": 8.214395007067624e-05, + "loss": 0.9034, + "step": 86980 + }, + { + "epoch": 0.5557543155769649, + "grad_norm": 1.1555999517440796, + "learning_rate": 8.214010652394376e-05, + "loss": 0.7741, + "step": 86990 + }, + { + "epoch": 0.5558182027267036, + "grad_norm": 1.121670126914978, + "learning_rate": 8.213626265353259e-05, + "loss": 0.926, + "step": 87000 + }, + { + "epoch": 0.5558820898764423, + "grad_norm": 2.0978503227233887, + "learning_rate": 8.213241845948145e-05, + "loss": 0.7041, + "step": 87010 + }, + { + "epoch": 0.555945977026181, + "grad_norm": 0.9516173601150513, + "learning_rate": 8.212857394182906e-05, + "loss": 1.0637, + "step": 87020 + }, + { + "epoch": 0.5560098641759197, + "grad_norm": 1.092417597770691, + "learning_rate": 8.212472910061415e-05, + "loss": 0.8465, + "step": 87030 + }, + { + "epoch": 0.5560737513256584, + "grad_norm": 0.8060597777366638, + "learning_rate": 8.212088393587543e-05, + "loss": 1.2128, + "step": 87040 + }, + { + "epoch": 0.5561376384753971, + "grad_norm": 0.8627819418907166, + "learning_rate": 8.21170384476516e-05, + "loss": 0.9167, + "step": 87050 + }, + { + "epoch": 0.5562015256251358, + "grad_norm": 0.9988081455230713, + "learning_rate": 8.211319263598142e-05, + "loss": 1.0736, + "step": 87060 + }, + { + "epoch": 0.5562654127748745, + "grad_norm": 0.8938575387001038, + "learning_rate": 8.210934650090361e-05, + "loss": 0.9065, + "step": 87070 + }, + { + "epoch": 0.5563292999246132, + "grad_norm": 0.968144953250885, + "learning_rate": 8.210550004245688e-05, + "loss": 1.1044, + "step": 87080 + }, + { + "epoch": 0.5563931870743518, + "grad_norm": 
0.6786410212516785, + "learning_rate": 8.210165326068001e-05, + "loss": 0.849, + "step": 87090 + }, + { + "epoch": 0.5564570742240905, + "grad_norm": 0.6535912156105042, + "learning_rate": 8.209780615561172e-05, + "loss": 0.942, + "step": 87100 + }, + { + "epoch": 0.5565209613738292, + "grad_norm": 0.8034776449203491, + "learning_rate": 8.209395872729074e-05, + "loss": 0.8953, + "step": 87110 + }, + { + "epoch": 0.5565848485235679, + "grad_norm": 1.0414639711380005, + "learning_rate": 8.209011097575584e-05, + "loss": 1.1798, + "step": 87120 + }, + { + "epoch": 0.5566487356733066, + "grad_norm": 0.8047749996185303, + "learning_rate": 8.208626290104577e-05, + "loss": 0.9456, + "step": 87130 + }, + { + "epoch": 0.5567126228230453, + "grad_norm": 0.904681384563446, + "learning_rate": 8.208241450319925e-05, + "loss": 0.8932, + "step": 87140 + }, + { + "epoch": 0.556776509972784, + "grad_norm": 0.9343833327293396, + "learning_rate": 8.207856578225508e-05, + "loss": 0.9492, + "step": 87150 + }, + { + "epoch": 0.5568403971225228, + "grad_norm": 0.9821017384529114, + "learning_rate": 8.207471673825199e-05, + "loss": 0.8225, + "step": 87160 + }, + { + "epoch": 0.5569042842722615, + "grad_norm": 0.9593321681022644, + "learning_rate": 8.207086737122876e-05, + "loss": 1.0382, + "step": 87170 + }, + { + "epoch": 0.5569681714220002, + "grad_norm": 1.1232051849365234, + "learning_rate": 8.206701768122415e-05, + "loss": 0.8715, + "step": 87180 + }, + { + "epoch": 0.5570320585717389, + "grad_norm": 0.7729601860046387, + "learning_rate": 8.206316766827692e-05, + "loss": 0.9942, + "step": 87190 + }, + { + "epoch": 0.5570959457214776, + "grad_norm": 1.0262385606765747, + "learning_rate": 8.205931733242586e-05, + "loss": 1.0279, + "step": 87200 + }, + { + "epoch": 0.5571598328712163, + "grad_norm": 0.9253116846084595, + "learning_rate": 8.205546667370975e-05, + "loss": 1.1048, + "step": 87210 + }, + { + "epoch": 0.557223720020955, + "grad_norm": 1.2374264001846313, + "learning_rate": 
8.205161569216735e-05, + "loss": 0.7685, + "step": 87220 + }, + { + "epoch": 0.5572876071706937, + "grad_norm": 0.7475976347923279, + "learning_rate": 8.204776438783745e-05, + "loss": 0.9137, + "step": 87230 + }, + { + "epoch": 0.5573514943204324, + "grad_norm": 0.5616995096206665, + "learning_rate": 8.204391276075882e-05, + "loss": 1.0628, + "step": 87240 + }, + { + "epoch": 0.5574153814701711, + "grad_norm": 0.816423773765564, + "learning_rate": 8.20400608109703e-05, + "loss": 0.9533, + "step": 87250 + }, + { + "epoch": 0.5574792686199098, + "grad_norm": 1.1088494062423706, + "learning_rate": 8.203620853851062e-05, + "loss": 1.115, + "step": 87260 + }, + { + "epoch": 0.5575431557696485, + "grad_norm": 0.9146358370780945, + "learning_rate": 8.203235594341862e-05, + "loss": 0.8036, + "step": 87270 + }, + { + "epoch": 0.5576070429193872, + "grad_norm": 0.7417098879814148, + "learning_rate": 8.202850302573308e-05, + "loss": 0.8677, + "step": 87280 + }, + { + "epoch": 0.5576709300691259, + "grad_norm": 1.2907207012176514, + "learning_rate": 8.202464978549281e-05, + "loss": 0.8702, + "step": 87290 + }, + { + "epoch": 0.5577348172188646, + "grad_norm": 0.6193691492080688, + "learning_rate": 8.202079622273662e-05, + "loss": 0.6505, + "step": 87300 + }, + { + "epoch": 0.5577987043686033, + "grad_norm": 0.8418506979942322, + "learning_rate": 8.20169423375033e-05, + "loss": 1.0309, + "step": 87310 + }, + { + "epoch": 0.557862591518342, + "grad_norm": 0.8040059208869934, + "learning_rate": 8.201308812983165e-05, + "loss": 0.8816, + "step": 87320 + }, + { + "epoch": 0.5579264786680806, + "grad_norm": 0.9239259958267212, + "learning_rate": 8.200923359976055e-05, + "loss": 1.2638, + "step": 87330 + }, + { + "epoch": 0.5579903658178194, + "grad_norm": 1.0745997428894043, + "learning_rate": 8.200537874732876e-05, + "loss": 0.839, + "step": 87340 + }, + { + "epoch": 0.5580542529675581, + "grad_norm": 0.9547412395477295, + "learning_rate": 8.200152357257511e-05, + "loss": 1.0117, + 
"step": 87350 + }, + { + "epoch": 0.5581181401172968, + "grad_norm": 0.6735034584999084, + "learning_rate": 8.199766807553843e-05, + "loss": 1.0254, + "step": 87360 + }, + { + "epoch": 0.5581820272670355, + "grad_norm": 0.9394139051437378, + "learning_rate": 8.199381225625755e-05, + "loss": 0.9061, + "step": 87370 + }, + { + "epoch": 0.5582459144167742, + "grad_norm": 0.6589183211326599, + "learning_rate": 8.198995611477132e-05, + "loss": 0.9975, + "step": 87380 + }, + { + "epoch": 0.5583098015665129, + "grad_norm": 0.8695818185806274, + "learning_rate": 8.198609965111854e-05, + "loss": 0.8602, + "step": 87390 + }, + { + "epoch": 0.5583736887162516, + "grad_norm": 0.8587558269500732, + "learning_rate": 8.198224286533807e-05, + "loss": 1.041, + "step": 87400 + }, + { + "epoch": 0.5584375758659903, + "grad_norm": 0.8694300651550293, + "learning_rate": 8.197838575746874e-05, + "loss": 1.0423, + "step": 87410 + }, + { + "epoch": 0.558501463015729, + "grad_norm": 1.0709831714630127, + "learning_rate": 8.197452832754939e-05, + "loss": 1.1133, + "step": 87420 + }, + { + "epoch": 0.5585653501654677, + "grad_norm": 0.8248537182807922, + "learning_rate": 8.19706705756189e-05, + "loss": 0.8726, + "step": 87430 + }, + { + "epoch": 0.5586292373152064, + "grad_norm": 2.853898525238037, + "learning_rate": 8.196681250171609e-05, + "loss": 0.9757, + "step": 87440 + }, + { + "epoch": 0.5586931244649451, + "grad_norm": 2.0060508251190186, + "learning_rate": 8.196295410587982e-05, + "loss": 1.1241, + "step": 87450 + }, + { + "epoch": 0.5587570116146838, + "grad_norm": 1.0533684492111206, + "learning_rate": 8.195909538814895e-05, + "loss": 0.746, + "step": 87460 + }, + { + "epoch": 0.5588208987644225, + "grad_norm": 0.6444224715232849, + "learning_rate": 8.195523634856234e-05, + "loss": 0.8998, + "step": 87470 + }, + { + "epoch": 0.5588847859141612, + "grad_norm": 0.8431418538093567, + "learning_rate": 8.195137698715887e-05, + "loss": 0.9075, + "step": 87480 + }, + { + "epoch": 
0.5589486730639, + "grad_norm": 5.22327995300293, + "learning_rate": 8.194751730397738e-05, + "loss": 0.8413, + "step": 87490 + }, + { + "epoch": 0.5590125602136387, + "grad_norm": 0.5308577418327332, + "learning_rate": 8.194365729905675e-05, + "loss": 0.7192, + "step": 87500 + }, + { + "epoch": 0.5590764473633774, + "grad_norm": 1.6924850940704346, + "learning_rate": 8.193979697243586e-05, + "loss": 0.9143, + "step": 87510 + }, + { + "epoch": 0.5591403345131161, + "grad_norm": 0.574246883392334, + "learning_rate": 8.193593632415358e-05, + "loss": 0.9482, + "step": 87520 + }, + { + "epoch": 0.5592042216628548, + "grad_norm": 0.572245180606842, + "learning_rate": 8.19320753542488e-05, + "loss": 0.6802, + "step": 87530 + }, + { + "epoch": 0.5592681088125935, + "grad_norm": 1.4606465101242065, + "learning_rate": 8.192821406276039e-05, + "loss": 1.3266, + "step": 87540 + }, + { + "epoch": 0.5593319959623322, + "grad_norm": 0.8305387496948242, + "learning_rate": 8.192435244972725e-05, + "loss": 0.8321, + "step": 87550 + }, + { + "epoch": 0.5593958831120709, + "grad_norm": 0.7311245799064636, + "learning_rate": 8.192049051518826e-05, + "loss": 0.8271, + "step": 87560 + }, + { + "epoch": 0.5594597702618095, + "grad_norm": 0.702063798904419, + "learning_rate": 8.19166282591823e-05, + "loss": 0.9991, + "step": 87570 + }, + { + "epoch": 0.5595236574115482, + "grad_norm": 0.699548602104187, + "learning_rate": 8.19127656817483e-05, + "loss": 0.9318, + "step": 87580 + }, + { + "epoch": 0.5595875445612869, + "grad_norm": 1.65463387966156, + "learning_rate": 8.190890278292513e-05, + "loss": 1.1159, + "step": 87590 + }, + { + "epoch": 0.5596514317110256, + "grad_norm": 0.7709631323814392, + "learning_rate": 8.190503956275171e-05, + "loss": 0.9198, + "step": 87600 + }, + { + "epoch": 0.5597153188607643, + "grad_norm": 0.7824264168739319, + "learning_rate": 8.190117602126694e-05, + "loss": 0.7632, + "step": 87610 + }, + { + "epoch": 0.559779206010503, + "grad_norm": 
0.6739460825920105, + "learning_rate": 8.189731215850973e-05, + "loss": 1.0088, + "step": 87620 + }, + { + "epoch": 0.5598430931602417, + "grad_norm": 0.6374611258506775, + "learning_rate": 8.189344797451898e-05, + "loss": 0.8637, + "step": 87630 + }, + { + "epoch": 0.5599069803099804, + "grad_norm": 0.7015722990036011, + "learning_rate": 8.188958346933361e-05, + "loss": 0.7372, + "step": 87640 + }, + { + "epoch": 0.5599708674597191, + "grad_norm": 0.9866139888763428, + "learning_rate": 8.188571864299257e-05, + "loss": 0.8335, + "step": 87650 + }, + { + "epoch": 0.5600347546094578, + "grad_norm": 1.009989619255066, + "learning_rate": 8.188185349553474e-05, + "loss": 0.8123, + "step": 87660 + }, + { + "epoch": 0.5600986417591965, + "grad_norm": 0.5342085957527161, + "learning_rate": 8.187798802699909e-05, + "loss": 0.772, + "step": 87670 + }, + { + "epoch": 0.5601625289089353, + "grad_norm": 0.8871537446975708, + "learning_rate": 8.18741222374245e-05, + "loss": 0.8139, + "step": 87680 + }, + { + "epoch": 0.560226416058674, + "grad_norm": 0.8497464656829834, + "learning_rate": 8.187025612684993e-05, + "loss": 0.8549, + "step": 87690 + }, + { + "epoch": 0.5602903032084127, + "grad_norm": 1.7199604511260986, + "learning_rate": 8.18663896953143e-05, + "loss": 1.0503, + "step": 87700 + }, + { + "epoch": 0.5603541903581514, + "grad_norm": 0.686752200126648, + "learning_rate": 8.186252294285656e-05, + "loss": 0.7221, + "step": 87710 + }, + { + "epoch": 0.5604180775078901, + "grad_norm": 0.6970003247261047, + "learning_rate": 8.185865586951567e-05, + "loss": 0.6405, + "step": 87720 + }, + { + "epoch": 0.5604819646576288, + "grad_norm": 0.8833878040313721, + "learning_rate": 8.185478847533052e-05, + "loss": 0.742, + "step": 87730 + }, + { + "epoch": 0.5605458518073675, + "grad_norm": 2.5486907958984375, + "learning_rate": 8.185092076034012e-05, + "loss": 0.7818, + "step": 87740 + }, + { + "epoch": 0.5606097389571062, + "grad_norm": 0.5542274713516235, + "learning_rate": 
8.184705272458338e-05, + "loss": 0.8269, + "step": 87750 + }, + { + "epoch": 0.5606736261068449, + "grad_norm": 0.9123031497001648, + "learning_rate": 8.184318436809927e-05, + "loss": 0.9681, + "step": 87760 + }, + { + "epoch": 0.5607375132565836, + "grad_norm": 1.2762703895568848, + "learning_rate": 8.183931569092676e-05, + "loss": 0.992, + "step": 87770 + }, + { + "epoch": 0.5608014004063223, + "grad_norm": 0.723517656326294, + "learning_rate": 8.183544669310477e-05, + "loss": 0.7665, + "step": 87780 + }, + { + "epoch": 0.560865287556061, + "grad_norm": 0.5567760467529297, + "learning_rate": 8.183157737467229e-05, + "loss": 0.8054, + "step": 87790 + }, + { + "epoch": 0.5609291747057997, + "grad_norm": 1.262997031211853, + "learning_rate": 8.182770773566833e-05, + "loss": 0.9354, + "step": 87800 + }, + { + "epoch": 0.5609930618555383, + "grad_norm": 0.7385443449020386, + "learning_rate": 8.182383777613177e-05, + "loss": 1.0054, + "step": 87810 + }, + { + "epoch": 0.561056949005277, + "grad_norm": 0.8645491600036621, + "learning_rate": 8.181996749610166e-05, + "loss": 1.0764, + "step": 87820 + }, + { + "epoch": 0.5611208361550157, + "grad_norm": 1.3922362327575684, + "learning_rate": 8.181609689561693e-05, + "loss": 1.2028, + "step": 87830 + }, + { + "epoch": 0.5611847233047544, + "grad_norm": 1.1848599910736084, + "learning_rate": 8.181222597471658e-05, + "loss": 0.7765, + "step": 87840 + }, + { + "epoch": 0.5612486104544931, + "grad_norm": 1.078633189201355, + "learning_rate": 8.18083547334396e-05, + "loss": 0.793, + "step": 87850 + }, + { + "epoch": 0.5613124976042319, + "grad_norm": 0.8541363477706909, + "learning_rate": 8.180448317182498e-05, + "loss": 0.6918, + "step": 87860 + }, + { + "epoch": 0.5613763847539706, + "grad_norm": 1.5451418161392212, + "learning_rate": 8.180061128991168e-05, + "loss": 0.9104, + "step": 87870 + }, + { + "epoch": 0.5614402719037093, + "grad_norm": 0.9246516823768616, + "learning_rate": 8.179673908773872e-05, + "loss": 0.8337, + 
"step": 87880 + }, + { + "epoch": 0.561504159053448, + "grad_norm": 1.042832851409912, + "learning_rate": 8.179286656534511e-05, + "loss": 1.1324, + "step": 87890 + }, + { + "epoch": 0.5615680462031867, + "grad_norm": 0.7726474404335022, + "learning_rate": 8.17889937227698e-05, + "loss": 1.1108, + "step": 87900 + }, + { + "epoch": 0.5616319333529254, + "grad_norm": 0.8662251234054565, + "learning_rate": 8.178512056005184e-05, + "loss": 0.6078, + "step": 87910 + }, + { + "epoch": 0.5616958205026641, + "grad_norm": 0.7089243531227112, + "learning_rate": 8.178124707723021e-05, + "loss": 0.8552, + "step": 87920 + }, + { + "epoch": 0.5617597076524028, + "grad_norm": 1.1423563957214355, + "learning_rate": 8.177737327434393e-05, + "loss": 0.8504, + "step": 87930 + }, + { + "epoch": 0.5618235948021415, + "grad_norm": 0.934367299079895, + "learning_rate": 8.1773499151432e-05, + "loss": 1.1178, + "step": 87940 + }, + { + "epoch": 0.5618874819518802, + "grad_norm": 1.198276162147522, + "learning_rate": 8.176962470853346e-05, + "loss": 0.9645, + "step": 87950 + }, + { + "epoch": 0.5619513691016189, + "grad_norm": 0.8779740929603577, + "learning_rate": 8.176574994568731e-05, + "loss": 0.9631, + "step": 87960 + }, + { + "epoch": 0.5620152562513576, + "grad_norm": 1.0242273807525635, + "learning_rate": 8.176187486293258e-05, + "loss": 0.8799, + "step": 87970 + }, + { + "epoch": 0.5620791434010963, + "grad_norm": 0.8855654001235962, + "learning_rate": 8.17579994603083e-05, + "loss": 0.8782, + "step": 87980 + }, + { + "epoch": 0.562143030550835, + "grad_norm": 0.9518892168998718, + "learning_rate": 8.175412373785346e-05, + "loss": 0.9524, + "step": 87990 + }, + { + "epoch": 0.5622069177005737, + "grad_norm": 1.5354324579238892, + "learning_rate": 8.175024769560714e-05, + "loss": 0.8593, + "step": 88000 + }, + { + "epoch": 0.5622708048503124, + "grad_norm": 0.7242043018341064, + "learning_rate": 8.174637133360837e-05, + "loss": 0.7903, + "step": 88010 + }, + { + "epoch": 
0.5623346920000512, + "grad_norm": 1.2966783046722412, + "learning_rate": 8.174249465189615e-05, + "loss": 0.9194, + "step": 88020 + }, + { + "epoch": 0.5623985791497899, + "grad_norm": 0.7880471348762512, + "learning_rate": 8.173861765050956e-05, + "loss": 0.8203, + "step": 88030 + }, + { + "epoch": 0.5624624662995286, + "grad_norm": 0.8181769251823425, + "learning_rate": 8.173474032948764e-05, + "loss": 0.8065, + "step": 88040 + }, + { + "epoch": 0.5625263534492673, + "grad_norm": 1.0602998733520508, + "learning_rate": 8.173086268886943e-05, + "loss": 1.0891, + "step": 88050 + }, + { + "epoch": 0.5625902405990059, + "grad_norm": 0.9757769107818604, + "learning_rate": 8.172698472869398e-05, + "loss": 0.7662, + "step": 88060 + }, + { + "epoch": 0.5626541277487446, + "grad_norm": 0.7748228907585144, + "learning_rate": 8.172310644900035e-05, + "loss": 1.0889, + "step": 88070 + }, + { + "epoch": 0.5627180148984833, + "grad_norm": 0.6546492576599121, + "learning_rate": 8.171922784982757e-05, + "loss": 0.7745, + "step": 88080 + }, + { + "epoch": 0.562781902048222, + "grad_norm": 0.9595057368278503, + "learning_rate": 8.171534893121476e-05, + "loss": 0.7195, + "step": 88090 + }, + { + "epoch": 0.5628457891979607, + "grad_norm": 1.4880831241607666, + "learning_rate": 8.171146969320091e-05, + "loss": 0.7606, + "step": 88100 + }, + { + "epoch": 0.5629096763476994, + "grad_norm": 0.6960776448249817, + "learning_rate": 8.17079781059329e-05, + "loss": 0.8305, + "step": 88110 + }, + { + "epoch": 0.5629735634974381, + "grad_norm": 0.6546306610107422, + "learning_rate": 8.17040982611648e-05, + "loss": 0.9146, + "step": 88120 + }, + { + "epoch": 0.5630374506471768, + "grad_norm": 0.9104867577552795, + "learning_rate": 8.170021809710901e-05, + "loss": 0.8428, + "step": 88130 + }, + { + "epoch": 0.5631013377969155, + "grad_norm": 0.8855934143066406, + "learning_rate": 8.169633761380459e-05, + "loss": 1.1035, + "step": 88140 + }, + { + "epoch": 0.5631652249466542, + "grad_norm": 
0.8834415674209595, + "learning_rate": 8.169245681129063e-05, + "loss": 0.7963, + "step": 88150 + }, + { + "epoch": 0.5632291120963929, + "grad_norm": 1.0610204935073853, + "learning_rate": 8.168857568960621e-05, + "loss": 0.8703, + "step": 88160 + }, + { + "epoch": 0.5632929992461316, + "grad_norm": 0.7726658582687378, + "learning_rate": 8.168469424879041e-05, + "loss": 0.7914, + "step": 88170 + }, + { + "epoch": 0.5633568863958703, + "grad_norm": 0.835917592048645, + "learning_rate": 8.168081248888236e-05, + "loss": 1.1727, + "step": 88180 + }, + { + "epoch": 0.563420773545609, + "grad_norm": 0.6268913149833679, + "learning_rate": 8.16769304099211e-05, + "loss": 0.8944, + "step": 88190 + }, + { + "epoch": 0.5634846606953477, + "grad_norm": 0.7659708857536316, + "learning_rate": 8.167304801194574e-05, + "loss": 0.6793, + "step": 88200 + }, + { + "epoch": 0.5635485478450865, + "grad_norm": 0.8323150873184204, + "learning_rate": 8.166916529499539e-05, + "loss": 1.0036, + "step": 88210 + }, + { + "epoch": 0.5636124349948252, + "grad_norm": 0.8650678396224976, + "learning_rate": 8.166528225910915e-05, + "loss": 1.0035, + "step": 88220 + }, + { + "epoch": 0.5636763221445639, + "grad_norm": 1.1237086057662964, + "learning_rate": 8.166139890432612e-05, + "loss": 0.863, + "step": 88230 + }, + { + "epoch": 0.5637402092943026, + "grad_norm": 1.4714837074279785, + "learning_rate": 8.165751523068541e-05, + "loss": 0.7476, + "step": 88240 + }, + { + "epoch": 0.5638040964440413, + "grad_norm": 1.561540126800537, + "learning_rate": 8.165363123822613e-05, + "loss": 0.8476, + "step": 88250 + }, + { + "epoch": 0.56386798359378, + "grad_norm": 1.166987657546997, + "learning_rate": 8.164974692698741e-05, + "loss": 0.7393, + "step": 88260 + }, + { + "epoch": 0.5639318707435187, + "grad_norm": 0.8641453385353088, + "learning_rate": 8.164586229700837e-05, + "loss": 0.9397, + "step": 88270 + }, + { + "epoch": 0.5639957578932574, + "grad_norm": 0.8461394906044006, + "learning_rate": 
8.164197734832811e-05, + "loss": 0.9712, + "step": 88280 + }, + { + "epoch": 0.5640596450429961, + "grad_norm": 1.124748945236206, + "learning_rate": 8.163809208098573e-05, + "loss": 0.9336, + "step": 88290 + }, + { + "epoch": 0.5641235321927347, + "grad_norm": 0.7399802803993225, + "learning_rate": 8.163420649502044e-05, + "loss": 0.8218, + "step": 88300 + }, + { + "epoch": 0.5641874193424734, + "grad_norm": 0.5592508316040039, + "learning_rate": 8.163032059047129e-05, + "loss": 1.0124, + "step": 88310 + }, + { + "epoch": 0.5642513064922121, + "grad_norm": 0.7099899649620056, + "learning_rate": 8.162643436737747e-05, + "loss": 1.0729, + "step": 88320 + }, + { + "epoch": 0.5643151936419508, + "grad_norm": 0.8268618583679199, + "learning_rate": 8.162254782577807e-05, + "loss": 0.9777, + "step": 88330 + }, + { + "epoch": 0.5643790807916895, + "grad_norm": 1.0088896751403809, + "learning_rate": 8.161866096571229e-05, + "loss": 0.7944, + "step": 88340 + }, + { + "epoch": 0.5644429679414282, + "grad_norm": 1.0741251707077026, + "learning_rate": 8.161477378721922e-05, + "loss": 0.6418, + "step": 88350 + }, + { + "epoch": 0.5645068550911669, + "grad_norm": 0.7108768224716187, + "learning_rate": 8.161088629033802e-05, + "loss": 0.792, + "step": 88360 + }, + { + "epoch": 0.5645707422409056, + "grad_norm": 0.7690078020095825, + "learning_rate": 8.160699847510787e-05, + "loss": 1.0006, + "step": 88370 + }, + { + "epoch": 0.5646346293906443, + "grad_norm": 0.9260159134864807, + "learning_rate": 8.160311034156788e-05, + "loss": 0.836, + "step": 88380 + }, + { + "epoch": 0.564698516540383, + "grad_norm": 0.8775709867477417, + "learning_rate": 8.159922188975724e-05, + "loss": 1.0681, + "step": 88390 + }, + { + "epoch": 0.5647624036901218, + "grad_norm": 5.725255489349365, + "learning_rate": 8.159533311971509e-05, + "loss": 0.8525, + "step": 88400 + }, + { + "epoch": 0.5648262908398605, + "grad_norm": 0.7472132444381714, + "learning_rate": 8.15914440314806e-05, + "loss": 0.8077, + 
"step": 88410 + }, + { + "epoch": 0.5648901779895992, + "grad_norm": 0.733696460723877, + "learning_rate": 8.158755462509294e-05, + "loss": 0.9897, + "step": 88420 + }, + { + "epoch": 0.5649540651393379, + "grad_norm": 0.7739579081535339, + "learning_rate": 8.15836649005913e-05, + "loss": 0.9822, + "step": 88430 + }, + { + "epoch": 0.5650179522890766, + "grad_norm": 0.7440978288650513, + "learning_rate": 8.157977485801481e-05, + "loss": 0.9445, + "step": 88440 + }, + { + "epoch": 0.5650818394388153, + "grad_norm": 1.1585737466812134, + "learning_rate": 8.157588449740268e-05, + "loss": 0.9039, + "step": 88450 + }, + { + "epoch": 0.565145726588554, + "grad_norm": 0.6025387048721313, + "learning_rate": 8.157199381879406e-05, + "loss": 1.0429, + "step": 88460 + }, + { + "epoch": 0.5652096137382927, + "grad_norm": 1.27161705493927, + "learning_rate": 8.156810282222815e-05, + "loss": 0.9338, + "step": 88470 + }, + { + "epoch": 0.5652735008880314, + "grad_norm": 1.4997930526733398, + "learning_rate": 8.156421150774413e-05, + "loss": 0.9103, + "step": 88480 + }, + { + "epoch": 0.5653373880377701, + "grad_norm": 1.060680866241455, + "learning_rate": 8.156031987538121e-05, + "loss": 0.8186, + "step": 88490 + }, + { + "epoch": 0.5654012751875088, + "grad_norm": 0.8524025082588196, + "learning_rate": 8.155642792517854e-05, + "loss": 0.6961, + "step": 88500 + }, + { + "epoch": 0.5654651623372475, + "grad_norm": 0.7399099469184875, + "learning_rate": 8.155253565717538e-05, + "loss": 0.9526, + "step": 88510 + }, + { + "epoch": 0.5655290494869862, + "grad_norm": 0.5844240784645081, + "learning_rate": 8.154864307141086e-05, + "loss": 0.7829, + "step": 88520 + }, + { + "epoch": 0.5655929366367249, + "grad_norm": 1.2243051528930664, + "learning_rate": 8.154475016792422e-05, + "loss": 1.1276, + "step": 88530 + }, + { + "epoch": 0.5656568237864635, + "grad_norm": 0.8951583504676819, + "learning_rate": 8.154085694675465e-05, + "loss": 0.5625, + "step": 88540 + }, + { + "epoch": 
0.5657207109362022, + "grad_norm": 0.9766507148742676, + "learning_rate": 8.153696340794137e-05, + "loss": 0.8022, + "step": 88550 + }, + { + "epoch": 0.565784598085941, + "grad_norm": 1.5120043754577637, + "learning_rate": 8.153306955152358e-05, + "loss": 1.0359, + "step": 88560 + }, + { + "epoch": 0.5658484852356797, + "grad_norm": 0.8103669881820679, + "learning_rate": 8.15291753775405e-05, + "loss": 0.9112, + "step": 88570 + }, + { + "epoch": 0.5659123723854184, + "grad_norm": 0.9393633604049683, + "learning_rate": 8.152528088603136e-05, + "loss": 1.0055, + "step": 88580 + }, + { + "epoch": 0.5659762595351571, + "grad_norm": 0.7244747281074524, + "learning_rate": 8.152138607703534e-05, + "loss": 0.9209, + "step": 88590 + }, + { + "epoch": 0.5660401466848958, + "grad_norm": 0.9815077185630798, + "learning_rate": 8.151749095059172e-05, + "loss": 1.0403, + "step": 88600 + }, + { + "epoch": 0.5661040338346345, + "grad_norm": 0.7395929098129272, + "learning_rate": 8.151359550673968e-05, + "loss": 0.8846, + "step": 88610 + }, + { + "epoch": 0.5661679209843732, + "grad_norm": 1.4598946571350098, + "learning_rate": 8.150969974551848e-05, + "loss": 0.8897, + "step": 88620 + }, + { + "epoch": 0.5662318081341119, + "grad_norm": 1.4247801303863525, + "learning_rate": 8.150580366696734e-05, + "loss": 0.8266, + "step": 88630 + }, + { + "epoch": 0.5662956952838506, + "grad_norm": 0.9204801321029663, + "learning_rate": 8.150190727112551e-05, + "loss": 0.9737, + "step": 88640 + }, + { + "epoch": 0.5663595824335893, + "grad_norm": 0.9166013598442078, + "learning_rate": 8.149801055803222e-05, + "loss": 0.7175, + "step": 88650 + }, + { + "epoch": 0.566423469583328, + "grad_norm": 0.6634111404418945, + "learning_rate": 8.149411352772672e-05, + "loss": 0.8279, + "step": 88660 + }, + { + "epoch": 0.5664873567330667, + "grad_norm": 0.7929425835609436, + "learning_rate": 8.149021618024823e-05, + "loss": 0.938, + "step": 88670 + }, + { + "epoch": 0.5665512438828054, + "grad_norm": 
0.834792971611023, + "learning_rate": 8.148631851563602e-05, + "loss": 0.9483, + "step": 88680 + }, + { + "epoch": 0.5666151310325441, + "grad_norm": 0.9581144452095032, + "learning_rate": 8.148242053392937e-05, + "loss": 0.8688, + "step": 88690 + }, + { + "epoch": 0.5666790181822828, + "grad_norm": 1.2036710977554321, + "learning_rate": 8.147852223516747e-05, + "loss": 0.76, + "step": 88700 + }, + { + "epoch": 0.5667429053320215, + "grad_norm": 0.9013402462005615, + "learning_rate": 8.147462361938965e-05, + "loss": 0.7514, + "step": 88710 + }, + { + "epoch": 0.5668067924817602, + "grad_norm": 1.2912942171096802, + "learning_rate": 8.147072468663514e-05, + "loss": 0.934, + "step": 88720 + }, + { + "epoch": 0.566870679631499, + "grad_norm": 0.8134447336196899, + "learning_rate": 8.146682543694318e-05, + "loss": 1.0641, + "step": 88730 + }, + { + "epoch": 0.5669345667812377, + "grad_norm": 1.032091736793518, + "learning_rate": 8.14629258703531e-05, + "loss": 0.9206, + "step": 88740 + }, + { + "epoch": 0.5669984539309764, + "grad_norm": 0.8850687146186829, + "learning_rate": 8.145902598690411e-05, + "loss": 1.1147, + "step": 88750 + }, + { + "epoch": 0.5670623410807151, + "grad_norm": 1.0859354734420776, + "learning_rate": 8.145512578663553e-05, + "loss": 1.1992, + "step": 88760 + }, + { + "epoch": 0.5671262282304538, + "grad_norm": 1.4345630407333374, + "learning_rate": 8.14512252695866e-05, + "loss": 0.7953, + "step": 88770 + }, + { + "epoch": 0.5671901153801925, + "grad_norm": 0.8256250023841858, + "learning_rate": 8.144732443579664e-05, + "loss": 0.8639, + "step": 88780 + }, + { + "epoch": 0.5672540025299311, + "grad_norm": 1.0019294023513794, + "learning_rate": 8.14434232853049e-05, + "loss": 0.9049, + "step": 88790 + }, + { + "epoch": 0.5673178896796698, + "grad_norm": 0.7967015504837036, + "learning_rate": 8.14395218181507e-05, + "loss": 1.0006, + "step": 88800 + }, + { + "epoch": 0.5673817768294085, + "grad_norm": 0.729104220867157, + "learning_rate": 
8.143562003437331e-05, + "loss": 0.724, + "step": 88810 + }, + { + "epoch": 0.5674456639791472, + "grad_norm": 1.0281226634979248, + "learning_rate": 8.143171793401204e-05, + "loss": 0.7041, + "step": 88820 + }, + { + "epoch": 0.5675095511288859, + "grad_norm": 0.5173068046569824, + "learning_rate": 8.142781551710617e-05, + "loss": 0.7518, + "step": 88830 + }, + { + "epoch": 0.5675734382786246, + "grad_norm": 1.003184199333191, + "learning_rate": 8.1423912783695e-05, + "loss": 0.9193, + "step": 88840 + }, + { + "epoch": 0.5676373254283633, + "grad_norm": 0.8439940214157104, + "learning_rate": 8.142000973381787e-05, + "loss": 0.874, + "step": 88850 + }, + { + "epoch": 0.567701212578102, + "grad_norm": 1.0235188007354736, + "learning_rate": 8.141610636751405e-05, + "loss": 1.3299, + "step": 88860 + }, + { + "epoch": 0.5677650997278407, + "grad_norm": 1.1160435676574707, + "learning_rate": 8.141220268482284e-05, + "loss": 0.9362, + "step": 88870 + }, + { + "epoch": 0.5678289868775794, + "grad_norm": 0.9153828620910645, + "learning_rate": 8.140829868578359e-05, + "loss": 1.0991, + "step": 88880 + }, + { + "epoch": 0.5678928740273181, + "grad_norm": 0.7839625477790833, + "learning_rate": 8.140439437043558e-05, + "loss": 0.8622, + "step": 88890 + }, + { + "epoch": 0.5679567611770568, + "grad_norm": 0.9344704151153564, + "learning_rate": 8.140048973881817e-05, + "loss": 0.9697, + "step": 88900 + }, + { + "epoch": 0.5680206483267956, + "grad_norm": 1.2923128604888916, + "learning_rate": 8.139697529998467e-05, + "loss": 0.9158, + "step": 88910 + }, + { + "epoch": 0.5680845354765343, + "grad_norm": 0.7165302038192749, + "learning_rate": 8.139307006756369e-05, + "loss": 0.9479, + "step": 88920 + }, + { + "epoch": 0.568148422626273, + "grad_norm": 0.9380423426628113, + "learning_rate": 8.138916451898734e-05, + "loss": 1.0259, + "step": 88930 + }, + { + "epoch": 0.5682123097760117, + "grad_norm": 0.9337356686592102, + "learning_rate": 8.138525865429494e-05, + "loss": 0.7998, + 
"step": 88940 + }, + { + "epoch": 0.5682761969257504, + "grad_norm": 1.1728187799453735, + "learning_rate": 8.138135247352586e-05, + "loss": 0.7984, + "step": 88950 + }, + { + "epoch": 0.5683400840754891, + "grad_norm": 0.7271674871444702, + "learning_rate": 8.137744597671938e-05, + "loss": 0.8608, + "step": 88960 + }, + { + "epoch": 0.5684039712252278, + "grad_norm": 0.8207966089248657, + "learning_rate": 8.137353916391488e-05, + "loss": 0.8993, + "step": 88970 + }, + { + "epoch": 0.5684678583749665, + "grad_norm": 0.8019614815711975, + "learning_rate": 8.136963203515173e-05, + "loss": 0.9974, + "step": 88980 + }, + { + "epoch": 0.5685317455247052, + "grad_norm": 0.8686451315879822, + "learning_rate": 8.136572459046921e-05, + "loss": 0.7662, + "step": 88990 + }, + { + "epoch": 0.5685956326744439, + "grad_norm": 0.7033054232597351, + "learning_rate": 8.136181682990673e-05, + "loss": 0.9693, + "step": 89000 + }, + { + "epoch": 0.5686595198241826, + "grad_norm": 0.5672471523284912, + "learning_rate": 8.135790875350361e-05, + "loss": 0.8768, + "step": 89010 + }, + { + "epoch": 0.5687234069739213, + "grad_norm": 0.8341061472892761, + "learning_rate": 8.135400036129923e-05, + "loss": 0.8212, + "step": 89020 + }, + { + "epoch": 0.5687872941236599, + "grad_norm": 0.6905550956726074, + "learning_rate": 8.135009165333294e-05, + "loss": 0.8526, + "step": 89030 + }, + { + "epoch": 0.5688511812733986, + "grad_norm": 1.169189453125, + "learning_rate": 8.134618262964409e-05, + "loss": 0.8095, + "step": 89040 + }, + { + "epoch": 0.5689150684231373, + "grad_norm": 0.8723841309547424, + "learning_rate": 8.134227329027208e-05, + "loss": 0.934, + "step": 89050 + }, + { + "epoch": 0.568978955572876, + "grad_norm": 0.7888420224189758, + "learning_rate": 8.133836363525626e-05, + "loss": 0.8145, + "step": 89060 + }, + { + "epoch": 0.5690428427226147, + "grad_norm": 0.774649441242218, + "learning_rate": 8.133445366463601e-05, + "loss": 0.8815, + "step": 89070 + }, + { + "epoch": 
0.5691067298723534, + "grad_norm": 0.6284635066986084, + "learning_rate": 8.13305433784507e-05, + "loss": 0.8557, + "step": 89080 + }, + { + "epoch": 0.5691706170220922, + "grad_norm": 0.747380793094635, + "learning_rate": 8.132663277673971e-05, + "loss": 1.0254, + "step": 89090 + }, + { + "epoch": 0.5692345041718309, + "grad_norm": 0.6164722442626953, + "learning_rate": 8.132272185954243e-05, + "loss": 0.8931, + "step": 89100 + }, + { + "epoch": 0.5692983913215696, + "grad_norm": 1.5792529582977295, + "learning_rate": 8.131881062689823e-05, + "loss": 1.1431, + "step": 89110 + }, + { + "epoch": 0.5693622784713083, + "grad_norm": 0.8760352730751038, + "learning_rate": 8.131489907884653e-05, + "loss": 1.0246, + "step": 89120 + }, + { + "epoch": 0.569426165621047, + "grad_norm": 1.4259626865386963, + "learning_rate": 8.13109872154267e-05, + "loss": 0.7266, + "step": 89130 + }, + { + "epoch": 0.5694900527707857, + "grad_norm": 1.4645694494247437, + "learning_rate": 8.130707503667814e-05, + "loss": 0.645, + "step": 89140 + }, + { + "epoch": 0.5695539399205244, + "grad_norm": 1.1326792240142822, + "learning_rate": 8.130316254264024e-05, + "loss": 1.1472, + "step": 89150 + }, + { + "epoch": 0.5696178270702631, + "grad_norm": 0.9139853715896606, + "learning_rate": 8.129924973335243e-05, + "loss": 1.025, + "step": 89160 + }, + { + "epoch": 0.5696817142200018, + "grad_norm": 0.5680515766143799, + "learning_rate": 8.129533660885407e-05, + "loss": 0.9172, + "step": 89170 + }, + { + "epoch": 0.5697456013697405, + "grad_norm": 0.7290217876434326, + "learning_rate": 8.129142316918463e-05, + "loss": 0.9759, + "step": 89180 + }, + { + "epoch": 0.5698094885194792, + "grad_norm": 0.5047475099563599, + "learning_rate": 8.128750941438346e-05, + "loss": 0.9244, + "step": 89190 + }, + { + "epoch": 0.5698733756692179, + "grad_norm": 1.2016278505325317, + "learning_rate": 8.128359534449002e-05, + "loss": 0.6997, + "step": 89200 + }, + { + "epoch": 0.5699372628189566, + "grad_norm": 
1.542084813117981, + "learning_rate": 8.127968095954371e-05, + "loss": 0.8523, + "step": 89210 + }, + { + "epoch": 0.5700011499686953, + "grad_norm": 0.9595821499824524, + "learning_rate": 8.127576625958394e-05, + "loss": 0.9706, + "step": 89220 + }, + { + "epoch": 0.570065037118434, + "grad_norm": 0.6322153210639954, + "learning_rate": 8.127185124465016e-05, + "loss": 0.7701, + "step": 89230 + }, + { + "epoch": 0.5701289242681727, + "grad_norm": 1.8529796600341797, + "learning_rate": 8.126793591478177e-05, + "loss": 0.877, + "step": 89240 + }, + { + "epoch": 0.5701928114179114, + "grad_norm": 0.6426035761833191, + "learning_rate": 8.126402027001822e-05, + "loss": 0.9249, + "step": 89250 + }, + { + "epoch": 0.5702566985676502, + "grad_norm": 0.822106659412384, + "learning_rate": 8.126010431039895e-05, + "loss": 0.7386, + "step": 89260 + }, + { + "epoch": 0.5703205857173888, + "grad_norm": 1.577788233757019, + "learning_rate": 8.125618803596338e-05, + "loss": 0.9725, + "step": 89270 + }, + { + "epoch": 0.5703844728671275, + "grad_norm": 0.9563323259353638, + "learning_rate": 8.125227144675096e-05, + "loss": 0.6633, + "step": 89280 + }, + { + "epoch": 0.5704483600168662, + "grad_norm": 0.5990996956825256, + "learning_rate": 8.12483545428011e-05, + "loss": 0.969, + "step": 89290 + }, + { + "epoch": 0.5705122471666049, + "grad_norm": 0.8185253739356995, + "learning_rate": 8.124443732415331e-05, + "loss": 0.8085, + "step": 89300 + }, + { + "epoch": 0.5705761343163436, + "grad_norm": 1.3149315118789673, + "learning_rate": 8.124051979084699e-05, + "loss": 1.0496, + "step": 89310 + }, + { + "epoch": 0.5706400214660823, + "grad_norm": 0.9711151123046875, + "learning_rate": 8.123660194292162e-05, + "loss": 0.8151, + "step": 89320 + }, + { + "epoch": 0.570703908615821, + "grad_norm": 0.8611701726913452, + "learning_rate": 8.123268378041664e-05, + "loss": 1.007, + "step": 89330 + }, + { + "epoch": 0.5707677957655597, + "grad_norm": 0.8310745358467102, + "learning_rate": 
8.122876530337151e-05, + "loss": 1.0438, + "step": 89340 + }, + { + "epoch": 0.5708316829152984, + "grad_norm": 1.0816930532455444, + "learning_rate": 8.12248465118257e-05, + "loss": 0.7444, + "step": 89350 + }, + { + "epoch": 0.5708955700650371, + "grad_norm": 0.9173716902732849, + "learning_rate": 8.122092740581867e-05, + "loss": 1.183, + "step": 89360 + }, + { + "epoch": 0.5709594572147758, + "grad_norm": 1.3235098123550415, + "learning_rate": 8.121700798538989e-05, + "loss": 0.9047, + "step": 89370 + }, + { + "epoch": 0.5710233443645145, + "grad_norm": 0.7508029341697693, + "learning_rate": 8.121308825057882e-05, + "loss": 1.086, + "step": 89380 + }, + { + "epoch": 0.5710872315142532, + "grad_norm": 0.840752124786377, + "learning_rate": 8.120916820142498e-05, + "loss": 1.1446, + "step": 89390 + }, + { + "epoch": 0.5711511186639919, + "grad_norm": 1.024584412574768, + "learning_rate": 8.12052478379678e-05, + "loss": 1.057, + "step": 89400 + }, + { + "epoch": 0.5712150058137306, + "grad_norm": 0.9996415972709656, + "learning_rate": 8.120132716024678e-05, + "loss": 0.9998, + "step": 89410 + }, + { + "epoch": 0.5712788929634693, + "grad_norm": 0.9148262143135071, + "learning_rate": 8.11974061683014e-05, + "loss": 1.3289, + "step": 89420 + }, + { + "epoch": 0.571342780113208, + "grad_norm": 1.0694867372512817, + "learning_rate": 8.119348486217116e-05, + "loss": 0.9789, + "step": 89430 + }, + { + "epoch": 0.5714066672629468, + "grad_norm": 0.6613714694976807, + "learning_rate": 8.118956324189553e-05, + "loss": 1.0028, + "step": 89440 + }, + { + "epoch": 0.5714705544126855, + "grad_norm": 0.981604814529419, + "learning_rate": 8.1185641307514e-05, + "loss": 0.9502, + "step": 89450 + }, + { + "epoch": 0.5715344415624242, + "grad_norm": 1.0735020637512207, + "learning_rate": 8.118171905906611e-05, + "loss": 0.7949, + "step": 89460 + }, + { + "epoch": 0.5715983287121629, + "grad_norm": 0.7923375368118286, + "learning_rate": 8.117779649659132e-05, + "loss": 0.7809, + 
"step": 89470 + }, + { + "epoch": 0.5716622158619016, + "grad_norm": 1.0053727626800537, + "learning_rate": 8.117387362012915e-05, + "loss": 0.772, + "step": 89480 + }, + { + "epoch": 0.5717261030116403, + "grad_norm": 0.8714725375175476, + "learning_rate": 8.116995042971909e-05, + "loss": 1.0257, + "step": 89490 + }, + { + "epoch": 0.571789990161379, + "grad_norm": 0.8163152933120728, + "learning_rate": 8.116602692540069e-05, + "loss": 0.7977, + "step": 89500 + }, + { + "epoch": 0.5718538773111176, + "grad_norm": 0.49878737330436707, + "learning_rate": 8.116210310721342e-05, + "loss": 0.8128, + "step": 89510 + }, + { + "epoch": 0.5719177644608563, + "grad_norm": 0.6907072067260742, + "learning_rate": 8.115817897519682e-05, + "loss": 0.7639, + "step": 89520 + }, + { + "epoch": 0.571981651610595, + "grad_norm": 0.6646702885627747, + "learning_rate": 8.115425452939039e-05, + "loss": 0.8343, + "step": 89530 + }, + { + "epoch": 0.5720455387603337, + "grad_norm": 0.7955873608589172, + "learning_rate": 8.115032976983368e-05, + "loss": 0.8254, + "step": 89540 + }, + { + "epoch": 0.5721094259100724, + "grad_norm": 1.2668566703796387, + "learning_rate": 8.114640469656619e-05, + "loss": 0.879, + "step": 89550 + }, + { + "epoch": 0.5721733130598111, + "grad_norm": 0.8631924390792847, + "learning_rate": 8.114247930962746e-05, + "loss": 0.9851, + "step": 89560 + }, + { + "epoch": 0.5722372002095498, + "grad_norm": 1.4024226665496826, + "learning_rate": 8.113855360905702e-05, + "loss": 0.9293, + "step": 89570 + }, + { + "epoch": 0.5723010873592885, + "grad_norm": 0.826225996017456, + "learning_rate": 8.113462759489441e-05, + "loss": 0.8176, + "step": 89580 + }, + { + "epoch": 0.5723649745090272, + "grad_norm": 0.7401711344718933, + "learning_rate": 8.113070126717916e-05, + "loss": 0.7405, + "step": 89590 + }, + { + "epoch": 0.5724288616587659, + "grad_norm": 0.8626922369003296, + "learning_rate": 8.112677462595084e-05, + "loss": 0.9589, + "step": 89600 + }, + { + "epoch": 
0.5724927488085046, + "grad_norm": 0.7971317172050476, + "learning_rate": 8.112284767124894e-05, + "loss": 0.9191, + "step": 89610 + }, + { + "epoch": 0.5725566359582434, + "grad_norm": 0.7105472683906555, + "learning_rate": 8.111892040311305e-05, + "loss": 1.0642, + "step": 89620 + }, + { + "epoch": 0.5726205231079821, + "grad_norm": 0.8470255732536316, + "learning_rate": 8.111499282158271e-05, + "loss": 0.9904, + "step": 89630 + }, + { + "epoch": 0.5726844102577208, + "grad_norm": 1.0502973794937134, + "learning_rate": 8.111106492669747e-05, + "loss": 0.7748, + "step": 89640 + }, + { + "epoch": 0.5727482974074595, + "grad_norm": 0.9017430543899536, + "learning_rate": 8.11071367184969e-05, + "loss": 1.0352, + "step": 89650 + }, + { + "epoch": 0.5728121845571982, + "grad_norm": 1.4443550109863281, + "learning_rate": 8.110320819702055e-05, + "loss": 0.7986, + "step": 89660 + }, + { + "epoch": 0.5728760717069369, + "grad_norm": 0.8041828274726868, + "learning_rate": 8.109927936230798e-05, + "loss": 0.8368, + "step": 89670 + }, + { + "epoch": 0.5729399588566756, + "grad_norm": 0.8347397446632385, + "learning_rate": 8.109535021439876e-05, + "loss": 1.0824, + "step": 89680 + }, + { + "epoch": 0.5730038460064143, + "grad_norm": 1.0206586122512817, + "learning_rate": 8.109142075333247e-05, + "loss": 0.9412, + "step": 89690 + }, + { + "epoch": 0.573067733156153, + "grad_norm": 1.7678121328353882, + "learning_rate": 8.108749097914867e-05, + "loss": 0.698, + "step": 89700 + }, + { + "epoch": 0.5731316203058917, + "grad_norm": 0.7423132658004761, + "learning_rate": 8.108356089188694e-05, + "loss": 1.1062, + "step": 89710 + }, + { + "epoch": 0.5731955074556304, + "grad_norm": 0.5420845746994019, + "learning_rate": 8.107963049158686e-05, + "loss": 0.9447, + "step": 89720 + }, + { + "epoch": 0.5732593946053691, + "grad_norm": 1.7676161527633667, + "learning_rate": 8.107569977828803e-05, + "loss": 0.9414, + "step": 89730 + }, + { + "epoch": 0.5733232817551078, + "grad_norm": 
0.9517451524734497, + "learning_rate": 8.107176875203e-05, + "loss": 0.8824, + "step": 89740 + }, + { + "epoch": 0.5733871689048465, + "grad_norm": 3.0669705867767334, + "learning_rate": 8.106783741285237e-05, + "loss": 0.9844, + "step": 89750 + }, + { + "epoch": 0.5734510560545851, + "grad_norm": 1.1198365688323975, + "learning_rate": 8.106390576079477e-05, + "loss": 0.9505, + "step": 89760 + }, + { + "epoch": 0.5735149432043238, + "grad_norm": 0.8078299164772034, + "learning_rate": 8.105997379589675e-05, + "loss": 0.9776, + "step": 89770 + }, + { + "epoch": 0.5735788303540625, + "grad_norm": 2.656707286834717, + "learning_rate": 8.105604151819793e-05, + "loss": 0.6794, + "step": 89780 + }, + { + "epoch": 0.5736427175038012, + "grad_norm": 1.2671973705291748, + "learning_rate": 8.105210892773789e-05, + "loss": 0.7823, + "step": 89790 + }, + { + "epoch": 0.57370660465354, + "grad_norm": 0.7666161060333252, + "learning_rate": 8.104817602455626e-05, + "loss": 0.7727, + "step": 89800 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 1.4913362264633179, + "learning_rate": 8.104424280869263e-05, + "loss": 0.8733, + "step": 89810 + }, + { + "epoch": 0.5738343789530174, + "grad_norm": 1.1301679611206055, + "learning_rate": 8.104030928018662e-05, + "loss": 0.7014, + "step": 89820 + }, + { + "epoch": 0.5738982661027561, + "grad_norm": 1.294581413269043, + "learning_rate": 8.103637543907784e-05, + "loss": 1.1176, + "step": 89830 + }, + { + "epoch": 0.5739621532524948, + "grad_norm": 0.9880439639091492, + "learning_rate": 8.103244128540591e-05, + "loss": 1.1367, + "step": 89840 + }, + { + "epoch": 0.5740260404022335, + "grad_norm": 1.8410000801086426, + "learning_rate": 8.102850681921046e-05, + "loss": 0.9493, + "step": 89850 + }, + { + "epoch": 0.5740899275519722, + "grad_norm": 0.7882397770881653, + "learning_rate": 8.102457204053109e-05, + "loss": 0.9901, + "step": 89860 + }, + { + "epoch": 0.5741538147017109, + "grad_norm": 0.833531379699707, + "learning_rate": 
8.102063694940745e-05, + "loss": 0.8304, + "step": 89870 + }, + { + "epoch": 0.5742177018514496, + "grad_norm": 0.7302635908126831, + "learning_rate": 8.101670154587915e-05, + "loss": 0.9241, + "step": 89880 + }, + { + "epoch": 0.5742815890011883, + "grad_norm": 0.7871063947677612, + "learning_rate": 8.101276582998583e-05, + "loss": 0.7141, + "step": 89890 + }, + { + "epoch": 0.574345476150927, + "grad_norm": 1.1040356159210205, + "learning_rate": 8.100882980176712e-05, + "loss": 0.8373, + "step": 89900 + }, + { + "epoch": 0.5744093633006657, + "grad_norm": 0.5926182866096497, + "learning_rate": 8.100489346126268e-05, + "loss": 0.6924, + "step": 89910 + }, + { + "epoch": 0.5744732504504044, + "grad_norm": 1.047753930091858, + "learning_rate": 8.100095680851214e-05, + "loss": 0.8614, + "step": 89920 + }, + { + "epoch": 0.5745371376001431, + "grad_norm": 0.9589722752571106, + "learning_rate": 8.099701984355514e-05, + "loss": 0.796, + "step": 89930 + }, + { + "epoch": 0.5746010247498818, + "grad_norm": 1.2964690923690796, + "learning_rate": 8.099308256643134e-05, + "loss": 0.7672, + "step": 89940 + }, + { + "epoch": 0.5746649118996205, + "grad_norm": 1.9228540658950806, + "learning_rate": 8.09891449771804e-05, + "loss": 0.8787, + "step": 89950 + }, + { + "epoch": 0.5747287990493593, + "grad_norm": 0.5643669962882996, + "learning_rate": 8.098520707584195e-05, + "loss": 0.855, + "step": 89960 + }, + { + "epoch": 0.574792686199098, + "grad_norm": 0.9459285736083984, + "learning_rate": 8.098126886245564e-05, + "loss": 0.8887, + "step": 89970 + }, + { + "epoch": 0.5748565733488367, + "grad_norm": 0.8743549585342407, + "learning_rate": 8.097733033706117e-05, + "loss": 1.0167, + "step": 89980 + }, + { + "epoch": 0.5749204604985754, + "grad_norm": 1.0637538433074951, + "learning_rate": 8.097339149969818e-05, + "loss": 0.76, + "step": 89990 + }, + { + "epoch": 0.574984347648314, + "grad_norm": 0.6457778215408325, + "learning_rate": 8.096945235040634e-05, + "loss": 0.8873, + 
"step": 90000 + }, + { + "epoch": 0.5750482347980527, + "grad_norm": 0.7007945775985718, + "learning_rate": 8.096551288922532e-05, + "loss": 0.6989, + "step": 90010 + }, + { + "epoch": 0.5751121219477914, + "grad_norm": 1.048057198524475, + "learning_rate": 8.096157311619479e-05, + "loss": 0.7702, + "step": 90020 + }, + { + "epoch": 0.5751760090975301, + "grad_norm": 0.8541986346244812, + "learning_rate": 8.095763303135444e-05, + "loss": 0.996, + "step": 90030 + }, + { + "epoch": 0.5752398962472688, + "grad_norm": 1.4196783304214478, + "learning_rate": 8.095369263474396e-05, + "loss": 0.5537, + "step": 90040 + }, + { + "epoch": 0.5753037833970075, + "grad_norm": 0.8951913118362427, + "learning_rate": 8.094975192640299e-05, + "loss": 0.6825, + "step": 90050 + }, + { + "epoch": 0.5753676705467462, + "grad_norm": 0.6169331669807434, + "learning_rate": 8.094581090637127e-05, + "loss": 0.8536, + "step": 90060 + }, + { + "epoch": 0.5754315576964849, + "grad_norm": 0.7136598825454712, + "learning_rate": 8.094186957468843e-05, + "loss": 0.9564, + "step": 90070 + }, + { + "epoch": 0.5754954448462236, + "grad_norm": 1.0115174055099487, + "learning_rate": 8.093792793139421e-05, + "loss": 0.7456, + "step": 90080 + }, + { + "epoch": 0.5755593319959623, + "grad_norm": 0.7271766662597656, + "learning_rate": 8.09339859765283e-05, + "loss": 0.8566, + "step": 90090 + }, + { + "epoch": 0.575623219145701, + "grad_norm": 0.870293140411377, + "learning_rate": 8.093004371013038e-05, + "loss": 1.0552, + "step": 90100 + }, + { + "epoch": 0.5756871062954397, + "grad_norm": 0.8186811208724976, + "learning_rate": 8.092610113224017e-05, + "loss": 0.7465, + "step": 90110 + }, + { + "epoch": 0.5757509934451784, + "grad_norm": 0.918304979801178, + "learning_rate": 8.092215824289735e-05, + "loss": 0.8869, + "step": 90120 + }, + { + "epoch": 0.5758148805949171, + "grad_norm": 0.9499895572662354, + "learning_rate": 8.091821504214166e-05, + "loss": 0.8331, + "step": 90130 + }, + { + "epoch": 
0.5758787677446559, + "grad_norm": 1.6564801931381226, + "learning_rate": 8.091427153001278e-05, + "loss": 0.9647, + "step": 90140 + }, + { + "epoch": 0.5759426548943946, + "grad_norm": 1.2745451927185059, + "learning_rate": 8.091032770655048e-05, + "loss": 0.7745, + "step": 90150 + }, + { + "epoch": 0.5760065420441333, + "grad_norm": 1.276982069015503, + "learning_rate": 8.09063835717944e-05, + "loss": 1.0932, + "step": 90160 + }, + { + "epoch": 0.576070429193872, + "grad_norm": 1.1688791513442993, + "learning_rate": 8.09024391257843e-05, + "loss": 0.8497, + "step": 90170 + }, + { + "epoch": 0.5761343163436107, + "grad_norm": 0.6883856058120728, + "learning_rate": 8.089849436855992e-05, + "loss": 0.8468, + "step": 90180 + }, + { + "epoch": 0.5761982034933494, + "grad_norm": 1.2166844606399536, + "learning_rate": 8.089454930016095e-05, + "loss": 0.9501, + "step": 90190 + }, + { + "epoch": 0.5762620906430881, + "grad_norm": 0.9265638589859009, + "learning_rate": 8.089060392062718e-05, + "loss": 0.828, + "step": 90200 + }, + { + "epoch": 0.5763259777928268, + "grad_norm": 1.2245463132858276, + "learning_rate": 8.088665822999827e-05, + "loss": 1.0158, + "step": 90210 + }, + { + "epoch": 0.5763898649425655, + "grad_norm": 1.1771318912506104, + "learning_rate": 8.088271222831401e-05, + "loss": 0.8838, + "step": 90220 + }, + { + "epoch": 0.5764537520923042, + "grad_norm": 0.6416171193122864, + "learning_rate": 8.08787659156141e-05, + "loss": 0.8364, + "step": 90230 + }, + { + "epoch": 0.5765176392420428, + "grad_norm": 0.7262217998504639, + "learning_rate": 8.087481929193831e-05, + "loss": 0.8772, + "step": 90240 + }, + { + "epoch": 0.5765815263917815, + "grad_norm": 1.0803191661834717, + "learning_rate": 8.08708723573264e-05, + "loss": 0.9563, + "step": 90250 + }, + { + "epoch": 0.5766454135415202, + "grad_norm": 0.8832546472549438, + "learning_rate": 8.086692511181806e-05, + "loss": 0.9067, + "step": 90260 + }, + { + "epoch": 0.5767093006912589, + "grad_norm": 
0.8966745138168335, + "learning_rate": 8.086297755545312e-05, + "loss": 1.0014, + "step": 90270 + }, + { + "epoch": 0.5767731878409976, + "grad_norm": 1.051358938217163, + "learning_rate": 8.085902968827128e-05, + "loss": 0.8469, + "step": 90280 + }, + { + "epoch": 0.5768370749907363, + "grad_norm": 0.5282111763954163, + "learning_rate": 8.085508151031232e-05, + "loss": 0.7655, + "step": 90290 + }, + { + "epoch": 0.576900962140475, + "grad_norm": 0.9313019514083862, + "learning_rate": 8.085113302161598e-05, + "loss": 0.8174, + "step": 90300 + }, + { + "epoch": 0.5769648492902137, + "grad_norm": 1.0769340991973877, + "learning_rate": 8.084718422222205e-05, + "loss": 0.6471, + "step": 90310 + }, + { + "epoch": 0.5770287364399524, + "grad_norm": 0.8049689531326294, + "learning_rate": 8.084323511217029e-05, + "loss": 0.9863, + "step": 90320 + }, + { + "epoch": 0.5770926235896912, + "grad_norm": 1.1444233655929565, + "learning_rate": 8.083928569150045e-05, + "loss": 0.7216, + "step": 90330 + }, + { + "epoch": 0.5771565107394299, + "grad_norm": 0.6875047087669373, + "learning_rate": 8.083533596025234e-05, + "loss": 0.8915, + "step": 90340 + }, + { + "epoch": 0.5772203978891686, + "grad_norm": 0.9097625613212585, + "learning_rate": 8.083138591846574e-05, + "loss": 0.7636, + "step": 90350 + }, + { + "epoch": 0.5772842850389073, + "grad_norm": 1.2456170320510864, + "learning_rate": 8.082743556618038e-05, + "loss": 0.8581, + "step": 90360 + }, + { + "epoch": 0.577348172188646, + "grad_norm": 0.8649427890777588, + "learning_rate": 8.082348490343608e-05, + "loss": 1.0361, + "step": 90370 + }, + { + "epoch": 0.5774120593383847, + "grad_norm": 2.0383450984954834, + "learning_rate": 8.081953393027263e-05, + "loss": 0.7167, + "step": 90380 + }, + { + "epoch": 0.5774759464881234, + "grad_norm": 1.4382243156433105, + "learning_rate": 8.081558264672982e-05, + "loss": 0.881, + "step": 90390 + }, + { + "epoch": 0.5775398336378621, + "grad_norm": 0.6351116895675659, + "learning_rate": 
8.081163105284741e-05, + "loss": 0.7479, + "step": 90400 + }, + { + "epoch": 0.5776037207876008, + "grad_norm": 0.8547778129577637, + "learning_rate": 8.080767914866523e-05, + "loss": 0.9155, + "step": 90410 + }, + { + "epoch": 0.5776676079373395, + "grad_norm": 0.6154083609580994, + "learning_rate": 8.080372693422307e-05, + "loss": 0.7414, + "step": 90420 + }, + { + "epoch": 0.5777314950870782, + "grad_norm": 0.8668635487556458, + "learning_rate": 8.079977440956073e-05, + "loss": 0.6981, + "step": 90430 + }, + { + "epoch": 0.5777953822368169, + "grad_norm": 0.6722155809402466, + "learning_rate": 8.079582157471801e-05, + "loss": 0.9792, + "step": 90440 + }, + { + "epoch": 0.5778592693865556, + "grad_norm": 0.5810309648513794, + "learning_rate": 8.079186842973473e-05, + "loss": 0.5862, + "step": 90450 + }, + { + "epoch": 0.5779231565362943, + "grad_norm": 1.0260207653045654, + "learning_rate": 8.07879149746507e-05, + "loss": 0.803, + "step": 90460 + }, + { + "epoch": 0.577987043686033, + "grad_norm": 0.8295899033546448, + "learning_rate": 8.078396120950572e-05, + "loss": 0.955, + "step": 90470 + }, + { + "epoch": 0.5780509308357717, + "grad_norm": 0.7317106127738953, + "learning_rate": 8.078000713433962e-05, + "loss": 1.0108, + "step": 90480 + }, + { + "epoch": 0.5781148179855103, + "grad_norm": 0.925954282283783, + "learning_rate": 8.077605274919224e-05, + "loss": 0.675, + "step": 90490 + }, + { + "epoch": 0.578178705135249, + "grad_norm": 1.5816576480865479, + "learning_rate": 8.077209805410336e-05, + "loss": 0.802, + "step": 90500 + }, + { + "epoch": 0.5782425922849878, + "grad_norm": 1.3754863739013672, + "learning_rate": 8.076814304911285e-05, + "loss": 0.7063, + "step": 90510 + }, + { + "epoch": 0.5783064794347265, + "grad_norm": 1.1097428798675537, + "learning_rate": 8.076418773426051e-05, + "loss": 0.8896, + "step": 90520 + }, + { + "epoch": 0.5783703665844652, + "grad_norm": 0.9202744364738464, + "learning_rate": 8.076023210958618e-05, + "loss": 1.1369, + 
"step": 90530 + }, + { + "epoch": 0.5784342537342039, + "grad_norm": 0.8386440873146057, + "learning_rate": 8.07562761751297e-05, + "loss": 1.1063, + "step": 90540 + }, + { + "epoch": 0.5784981408839426, + "grad_norm": 1.387734293937683, + "learning_rate": 8.075231993093093e-05, + "loss": 0.8543, + "step": 90550 + }, + { + "epoch": 0.5785620280336813, + "grad_norm": 1.1072419881820679, + "learning_rate": 8.074836337702969e-05, + "loss": 1.1178, + "step": 90560 + }, + { + "epoch": 0.57862591518342, + "grad_norm": 0.7916972041130066, + "learning_rate": 8.074440651346582e-05, + "loss": 1.2825, + "step": 90570 + }, + { + "epoch": 0.5786898023331587, + "grad_norm": 1.407332181930542, + "learning_rate": 8.074044934027918e-05, + "loss": 0.9507, + "step": 90580 + }, + { + "epoch": 0.5787536894828974, + "grad_norm": 0.8653108477592468, + "learning_rate": 8.073649185750962e-05, + "loss": 0.7288, + "step": 90590 + }, + { + "epoch": 0.5788175766326361, + "grad_norm": 1.2476141452789307, + "learning_rate": 8.073253406519699e-05, + "loss": 0.8561, + "step": 90600 + }, + { + "epoch": 0.5788814637823748, + "grad_norm": 0.8226998448371887, + "learning_rate": 8.072857596338116e-05, + "loss": 0.8944, + "step": 90610 + }, + { + "epoch": 0.5789453509321135, + "grad_norm": 0.7982886433601379, + "learning_rate": 8.0724617552102e-05, + "loss": 0.8097, + "step": 90620 + }, + { + "epoch": 0.5790092380818522, + "grad_norm": 0.7011058926582336, + "learning_rate": 8.072065883139935e-05, + "loss": 1.0291, + "step": 90630 + }, + { + "epoch": 0.5790731252315909, + "grad_norm": 1.2704604864120483, + "learning_rate": 8.071669980131307e-05, + "loss": 0.893, + "step": 90640 + }, + { + "epoch": 0.5791370123813296, + "grad_norm": 0.7645861506462097, + "learning_rate": 8.071274046188306e-05, + "loss": 0.6751, + "step": 90650 + }, + { + "epoch": 0.5792008995310683, + "grad_norm": 3.0247256755828857, + "learning_rate": 8.07087808131492e-05, + "loss": 0.7969, + "step": 90660 + }, + { + "epoch": 
0.579264786680807, + "grad_norm": 1.2994235754013062, + "learning_rate": 8.070482085515134e-05, + "loss": 1.3255, + "step": 90670 + }, + { + "epoch": 0.5793286738305458, + "grad_norm": 1.6243011951446533, + "learning_rate": 8.070086058792937e-05, + "loss": 0.8921, + "step": 90680 + }, + { + "epoch": 0.5793925609802845, + "grad_norm": 0.907557487487793, + "learning_rate": 8.069690001152317e-05, + "loss": 0.8408, + "step": 90690 + }, + { + "epoch": 0.5794564481300232, + "grad_norm": 0.7467745542526245, + "learning_rate": 8.069293912597263e-05, + "loss": 0.881, + "step": 90700 + }, + { + "epoch": 0.5795203352797619, + "grad_norm": 0.7291324734687805, + "learning_rate": 8.068897793131764e-05, + "loss": 1.0837, + "step": 90710 + }, + { + "epoch": 0.5795842224295006, + "grad_norm": 0.7500112056732178, + "learning_rate": 8.068501642759811e-05, + "loss": 0.6602, + "step": 90720 + }, + { + "epoch": 0.5796481095792392, + "grad_norm": 0.3867965042591095, + "learning_rate": 8.068105461485391e-05, + "loss": 0.7279, + "step": 90730 + }, + { + "epoch": 0.5797119967289779, + "grad_norm": 0.6861584186553955, + "learning_rate": 8.067709249312494e-05, + "loss": 0.924, + "step": 90740 + }, + { + "epoch": 0.5797758838787166, + "grad_norm": 0.9725950956344604, + "learning_rate": 8.06731300624511e-05, + "loss": 0.686, + "step": 90750 + }, + { + "epoch": 0.5798397710284553, + "grad_norm": 0.7137267589569092, + "learning_rate": 8.066916732287232e-05, + "loss": 0.7585, + "step": 90760 + }, + { + "epoch": 0.579903658178194, + "grad_norm": 0.8141860961914062, + "learning_rate": 8.06652042744285e-05, + "loss": 1.2448, + "step": 90770 + }, + { + "epoch": 0.5799675453279327, + "grad_norm": 0.9452531337738037, + "learning_rate": 8.066124091715952e-05, + "loss": 0.8939, + "step": 90780 + }, + { + "epoch": 0.5800314324776714, + "grad_norm": 0.8053810596466064, + "learning_rate": 8.065727725110533e-05, + "loss": 0.8234, + "step": 90790 + }, + { + "epoch": 0.5800953196274101, + "grad_norm": 
0.8168609142303467, + "learning_rate": 8.065331327630585e-05, + "loss": 1.1362, + "step": 90800 + }, + { + "epoch": 0.5801592067771488, + "grad_norm": 0.7657856941223145, + "learning_rate": 8.064934899280096e-05, + "loss": 0.9269, + "step": 90810 + }, + { + "epoch": 0.5802230939268875, + "grad_norm": 1.230660319328308, + "learning_rate": 8.064538440063063e-05, + "loss": 0.7815, + "step": 90820 + }, + { + "epoch": 0.5802869810766262, + "grad_norm": 0.7919756770133972, + "learning_rate": 8.064141949983476e-05, + "loss": 0.7874, + "step": 90830 + }, + { + "epoch": 0.580350868226365, + "grad_norm": 0.6535720229148865, + "learning_rate": 8.063745429045329e-05, + "loss": 0.7333, + "step": 90840 + }, + { + "epoch": 0.5804147553761037, + "grad_norm": 0.8120725750923157, + "learning_rate": 8.063348877252614e-05, + "loss": 0.7505, + "step": 90850 + }, + { + "epoch": 0.5804786425258424, + "grad_norm": 0.7102304697036743, + "learning_rate": 8.062952294609327e-05, + "loss": 0.6825, + "step": 90860 + }, + { + "epoch": 0.5805425296755811, + "grad_norm": 0.9454075694084167, + "learning_rate": 8.062555681119459e-05, + "loss": 0.6988, + "step": 90870 + }, + { + "epoch": 0.5806064168253198, + "grad_norm": 1.1664451360702515, + "learning_rate": 8.062159036787007e-05, + "loss": 0.8987, + "step": 90880 + }, + { + "epoch": 0.5806703039750585, + "grad_norm": 1.2764151096343994, + "learning_rate": 8.061762361615964e-05, + "loss": 0.9188, + "step": 90890 + }, + { + "epoch": 0.5807341911247972, + "grad_norm": 1.244565725326538, + "learning_rate": 8.061365655610325e-05, + "loss": 1.1752, + "step": 90900 + }, + { + "epoch": 0.5807980782745359, + "grad_norm": 0.8151182532310486, + "learning_rate": 8.060968918774085e-05, + "loss": 0.8404, + "step": 90910 + }, + { + "epoch": 0.5808619654242746, + "grad_norm": 0.8376042246818542, + "learning_rate": 8.06057215111124e-05, + "loss": 0.8567, + "step": 90920 + }, + { + "epoch": 0.5809258525740133, + "grad_norm": 1.4422600269317627, + "learning_rate": 
8.060175352625787e-05, + "loss": 0.9805, + "step": 90930 + }, + { + "epoch": 0.580989739723752, + "grad_norm": 0.6964714527130127, + "learning_rate": 8.05977852332172e-05, + "loss": 0.6621, + "step": 90940 + }, + { + "epoch": 0.5810536268734907, + "grad_norm": 0.9028936624526978, + "learning_rate": 8.059381663203036e-05, + "loss": 0.9439, + "step": 90950 + }, + { + "epoch": 0.5811175140232294, + "grad_norm": 1.128549575805664, + "learning_rate": 8.058984772273733e-05, + "loss": 0.7059, + "step": 90960 + }, + { + "epoch": 0.581181401172968, + "grad_norm": 1.0536413192749023, + "learning_rate": 8.058587850537804e-05, + "loss": 0.8569, + "step": 90970 + }, + { + "epoch": 0.5812452883227067, + "grad_norm": 0.8410016894340515, + "learning_rate": 8.058190897999252e-05, + "loss": 0.9881, + "step": 90980 + }, + { + "epoch": 0.5813091754724454, + "grad_norm": 0.5887959003448486, + "learning_rate": 8.057793914662071e-05, + "loss": 1.2143, + "step": 90990 + }, + { + "epoch": 0.5813730626221841, + "grad_norm": 0.9902825951576233, + "learning_rate": 8.057396900530261e-05, + "loss": 1.0165, + "step": 91000 + }, + { + "epoch": 0.5814369497719228, + "grad_norm": 0.7618111371994019, + "learning_rate": 8.056999855607819e-05, + "loss": 0.9192, + "step": 91010 + }, + { + "epoch": 0.5815008369216615, + "grad_norm": 1.465938925743103, + "learning_rate": 8.056602779898742e-05, + "loss": 1.0972, + "step": 91020 + }, + { + "epoch": 0.5815647240714003, + "grad_norm": 0.6929851770401001, + "learning_rate": 8.056205673407031e-05, + "loss": 0.8941, + "step": 91030 + }, + { + "epoch": 0.581628611221139, + "grad_norm": 0.7662091851234436, + "learning_rate": 8.055808536136687e-05, + "loss": 1.1148, + "step": 91040 + }, + { + "epoch": 0.5816924983708777, + "grad_norm": 1.1561191082000732, + "learning_rate": 8.055411368091706e-05, + "loss": 0.9246, + "step": 91050 + }, + { + "epoch": 0.5817563855206164, + "grad_norm": 1.0664466619491577, + "learning_rate": 8.05501416927609e-05, + "loss": 1.0493, + 
"step": 91060 + }, + { + "epoch": 0.5818202726703551, + "grad_norm": 1.0375691652297974, + "learning_rate": 8.054616939693837e-05, + "loss": 0.8237, + "step": 91070 + }, + { + "epoch": 0.5818841598200938, + "grad_norm": 1.251013159751892, + "learning_rate": 8.054219679348949e-05, + "loss": 0.9984, + "step": 91080 + }, + { + "epoch": 0.5819480469698325, + "grad_norm": 0.7131451964378357, + "learning_rate": 8.053822388245426e-05, + "loss": 0.8158, + "step": 91090 + }, + { + "epoch": 0.5820119341195712, + "grad_norm": 0.6554450988769531, + "learning_rate": 8.053425066387271e-05, + "loss": 0.988, + "step": 91100 + }, + { + "epoch": 0.5820758212693099, + "grad_norm": 0.5957306623458862, + "learning_rate": 8.053027713778484e-05, + "loss": 0.8844, + "step": 91110 + }, + { + "epoch": 0.5821397084190486, + "grad_norm": 2.745039463043213, + "learning_rate": 8.052630330423066e-05, + "loss": 0.8555, + "step": 91120 + }, + { + "epoch": 0.5822035955687873, + "grad_norm": 1.19644033908844, + "learning_rate": 8.05223291632502e-05, + "loss": 1.1978, + "step": 91130 + }, + { + "epoch": 0.582267482718526, + "grad_norm": 0.7143746614456177, + "learning_rate": 8.051835471488347e-05, + "loss": 1.0662, + "step": 91140 + }, + { + "epoch": 0.5823313698682647, + "grad_norm": 0.6921029686927795, + "learning_rate": 8.051437995917051e-05, + "loss": 0.8375, + "step": 91150 + }, + { + "epoch": 0.5823952570180034, + "grad_norm": 1.4767210483551025, + "learning_rate": 8.051040489615136e-05, + "loss": 0.9358, + "step": 91160 + }, + { + "epoch": 0.5824591441677421, + "grad_norm": 1.0952929258346558, + "learning_rate": 8.050642952586602e-05, + "loss": 0.8761, + "step": 91170 + }, + { + "epoch": 0.5825230313174808, + "grad_norm": 0.7604880332946777, + "learning_rate": 8.050245384835455e-05, + "loss": 0.9666, + "step": 91180 + }, + { + "epoch": 0.5825869184672195, + "grad_norm": 0.6385078430175781, + "learning_rate": 8.049847786365698e-05, + "loss": 1.0319, + "step": 91190 + }, + { + "epoch": 
0.5826508056169583, + "grad_norm": 0.8328523635864258, + "learning_rate": 8.049450157181336e-05, + "loss": 0.6776, + "step": 91200 + }, + { + "epoch": 0.5827146927666969, + "grad_norm": 0.9883635640144348, + "learning_rate": 8.049052497286372e-05, + "loss": 0.8049, + "step": 91210 + }, + { + "epoch": 0.5827785799164356, + "grad_norm": 0.9479039907455444, + "learning_rate": 8.048654806684812e-05, + "loss": 0.8743, + "step": 91220 + }, + { + "epoch": 0.5828424670661743, + "grad_norm": 1.1088539361953735, + "learning_rate": 8.048257085380659e-05, + "loss": 0.9272, + "step": 91230 + }, + { + "epoch": 0.582906354215913, + "grad_norm": 0.6845352053642273, + "learning_rate": 8.047859333377923e-05, + "loss": 0.9951, + "step": 91240 + }, + { + "epoch": 0.5829702413656517, + "grad_norm": 1.5272711515426636, + "learning_rate": 8.047461550680606e-05, + "loss": 0.8264, + "step": 91250 + }, + { + "epoch": 0.5830341285153904, + "grad_norm": 0.7585494518280029, + "learning_rate": 8.047063737292712e-05, + "loss": 0.7845, + "step": 91260 + }, + { + "epoch": 0.5830980156651291, + "grad_norm": 0.8442081809043884, + "learning_rate": 8.046665893218253e-05, + "loss": 0.8805, + "step": 91270 + }, + { + "epoch": 0.5831619028148678, + "grad_norm": 0.7809866666793823, + "learning_rate": 8.046268018461232e-05, + "loss": 0.8469, + "step": 91280 + }, + { + "epoch": 0.5832257899646065, + "grad_norm": 0.5992255806922913, + "learning_rate": 8.045870113025655e-05, + "loss": 0.9114, + "step": 91290 + }, + { + "epoch": 0.5832896771143452, + "grad_norm": 0.9385843873023987, + "learning_rate": 8.045472176915533e-05, + "loss": 0.9167, + "step": 91300 + }, + { + "epoch": 0.5833535642640839, + "grad_norm": 0.6832097172737122, + "learning_rate": 8.04507421013487e-05, + "loss": 1.0817, + "step": 91310 + }, + { + "epoch": 0.5834174514138226, + "grad_norm": 0.5917803049087524, + "learning_rate": 8.044676212687677e-05, + "loss": 0.8989, + "step": 91320 + }, + { + "epoch": 0.5834813385635613, + "grad_norm": 
0.6751442551612854, + "learning_rate": 8.04427818457796e-05, + "loss": 0.8093, + "step": 91330 + }, + { + "epoch": 0.5835452257133, + "grad_norm": 0.5237501859664917, + "learning_rate": 8.043880125809727e-05, + "loss": 0.8325, + "step": 91340 + }, + { + "epoch": 0.5836091128630387, + "grad_norm": 1.9701563119888306, + "learning_rate": 8.043482036386989e-05, + "loss": 0.8434, + "step": 91350 + }, + { + "epoch": 0.5836730000127774, + "grad_norm": 1.0748164653778076, + "learning_rate": 8.043083916313752e-05, + "loss": 0.8942, + "step": 91360 + }, + { + "epoch": 0.5837368871625161, + "grad_norm": 0.7747710347175598, + "learning_rate": 8.042685765594029e-05, + "loss": 0.7678, + "step": 91370 + }, + { + "epoch": 0.5838007743122549, + "grad_norm": 1.0838667154312134, + "learning_rate": 8.042287584231828e-05, + "loss": 0.9147, + "step": 91380 + }, + { + "epoch": 0.5838646614619936, + "grad_norm": 1.229852557182312, + "learning_rate": 8.041889372231159e-05, + "loss": 1.0037, + "step": 91390 + }, + { + "epoch": 0.5839285486117323, + "grad_norm": 1.2635694742202759, + "learning_rate": 8.041491129596032e-05, + "loss": 0.8378, + "step": 91400 + }, + { + "epoch": 0.583992435761471, + "grad_norm": 1.1819652318954468, + "learning_rate": 8.041092856330457e-05, + "loss": 0.7489, + "step": 91410 + }, + { + "epoch": 0.5840563229112097, + "grad_norm": 1.0836447477340698, + "learning_rate": 8.040694552438448e-05, + "loss": 0.9781, + "step": 91420 + }, + { + "epoch": 0.5841202100609484, + "grad_norm": 2.449270248413086, + "learning_rate": 8.040296217924014e-05, + "loss": 0.9975, + "step": 91430 + }, + { + "epoch": 0.5841840972106871, + "grad_norm": 0.9335359334945679, + "learning_rate": 8.039897852791167e-05, + "loss": 0.8676, + "step": 91440 + }, + { + "epoch": 0.5842479843604258, + "grad_norm": 1.0198067426681519, + "learning_rate": 8.039499457043918e-05, + "loss": 0.9543, + "step": 91450 + }, + { + "epoch": 0.5843118715101644, + "grad_norm": 0.7770729660987854, + "learning_rate": 
8.03910103068628e-05, + "loss": 0.8293, + "step": 91460 + }, + { + "epoch": 0.5843757586599031, + "grad_norm": 0.9211755990982056, + "learning_rate": 8.038702573722266e-05, + "loss": 0.9459, + "step": 91470 + }, + { + "epoch": 0.5844396458096418, + "grad_norm": 0.6153119802474976, + "learning_rate": 8.038304086155887e-05, + "loss": 0.9555, + "step": 91480 + }, + { + "epoch": 0.5845035329593805, + "grad_norm": 0.5979563593864441, + "learning_rate": 8.037905567991158e-05, + "loss": 0.906, + "step": 91490 + }, + { + "epoch": 0.5845674201091192, + "grad_norm": 0.7104209065437317, + "learning_rate": 8.037507019232091e-05, + "loss": 0.7133, + "step": 91500 + }, + { + "epoch": 0.5846313072588579, + "grad_norm": 0.8748192191123962, + "learning_rate": 8.037108439882702e-05, + "loss": 0.7568, + "step": 91510 + }, + { + "epoch": 0.5846951944085966, + "grad_norm": 0.9937753081321716, + "learning_rate": 8.036709829947003e-05, + "loss": 0.7123, + "step": 91520 + }, + { + "epoch": 0.5847590815583353, + "grad_norm": 1.2634817361831665, + "learning_rate": 8.036311189429009e-05, + "loss": 0.9345, + "step": 91530 + }, + { + "epoch": 0.584822968708074, + "grad_norm": 0.8244264721870422, + "learning_rate": 8.035912518332733e-05, + "loss": 1.0218, + "step": 91540 + }, + { + "epoch": 0.5848868558578127, + "grad_norm": 1.2409876585006714, + "learning_rate": 8.035513816662194e-05, + "loss": 0.9595, + "step": 91550 + }, + { + "epoch": 0.5849507430075515, + "grad_norm": 0.9279502034187317, + "learning_rate": 8.035115084421404e-05, + "loss": 0.7282, + "step": 91560 + }, + { + "epoch": 0.5850146301572902, + "grad_norm": 1.0995663404464722, + "learning_rate": 8.034716321614377e-05, + "loss": 1.0674, + "step": 91570 + }, + { + "epoch": 0.5850785173070289, + "grad_norm": 1.0315444469451904, + "learning_rate": 8.034317528245134e-05, + "loss": 0.8631, + "step": 91580 + }, + { + "epoch": 0.5851424044567676, + "grad_norm": 0.8768134117126465, + "learning_rate": 8.033918704317686e-05, + "loss": 
0.7553, + "step": 91590 + }, + { + "epoch": 0.5852062916065063, + "grad_norm": 1.049591064453125, + "learning_rate": 8.033519849836055e-05, + "loss": 0.979, + "step": 91600 + }, + { + "epoch": 0.585270178756245, + "grad_norm": 0.7832081317901611, + "learning_rate": 8.033120964804252e-05, + "loss": 0.8019, + "step": 91610 + }, + { + "epoch": 0.5853340659059837, + "grad_norm": 1.217409372329712, + "learning_rate": 8.0327220492263e-05, + "loss": 0.96, + "step": 91620 + }, + { + "epoch": 0.5853979530557224, + "grad_norm": 1.7218462228775024, + "learning_rate": 8.03232310310621e-05, + "loss": 1.3111, + "step": 91630 + }, + { + "epoch": 0.5854618402054611, + "grad_norm": 0.9196959733963013, + "learning_rate": 8.031924126448005e-05, + "loss": 0.9832, + "step": 91640 + }, + { + "epoch": 0.5855257273551998, + "grad_norm": 0.9768834114074707, + "learning_rate": 8.031525119255701e-05, + "loss": 0.9437, + "step": 91650 + }, + { + "epoch": 0.5855896145049385, + "grad_norm": 0.707797646522522, + "learning_rate": 8.031126081533315e-05, + "loss": 0.7751, + "step": 91660 + }, + { + "epoch": 0.5856535016546772, + "grad_norm": 3.0832972526550293, + "learning_rate": 8.030727013284868e-05, + "loss": 0.8823, + "step": 91670 + }, + { + "epoch": 0.5857173888044159, + "grad_norm": 0.8001325130462646, + "learning_rate": 8.030327914514377e-05, + "loss": 0.9358, + "step": 91680 + }, + { + "epoch": 0.5857812759541546, + "grad_norm": 2.1798999309539795, + "learning_rate": 8.029928785225864e-05, + "loss": 0.8331, + "step": 91690 + }, + { + "epoch": 0.5858451631038932, + "grad_norm": 0.8021385669708252, + "learning_rate": 8.029529625423347e-05, + "loss": 0.9211, + "step": 91700 + }, + { + "epoch": 0.5859090502536319, + "grad_norm": 0.7000755071640015, + "learning_rate": 8.029130435110844e-05, + "loss": 0.8239, + "step": 91710 + }, + { + "epoch": 0.5859729374033706, + "grad_norm": 0.9345866441726685, + "learning_rate": 8.028731214292377e-05, + "loss": 0.8656, + "step": 91720 + }, + { + "epoch": 
0.5860368245531093, + "grad_norm": 0.9513382911682129, + "learning_rate": 8.02833196297197e-05, + "loss": 0.8464, + "step": 91730 + }, + { + "epoch": 0.586100711702848, + "grad_norm": 1.2676148414611816, + "learning_rate": 8.027932681153636e-05, + "loss": 0.9357, + "step": 91740 + }, + { + "epoch": 0.5861645988525868, + "grad_norm": 0.7879144549369812, + "learning_rate": 8.027533368841402e-05, + "loss": 1.1884, + "step": 91750 + }, + { + "epoch": 0.5862284860023255, + "grad_norm": 0.6441530585289001, + "learning_rate": 8.027134026039288e-05, + "loss": 1.0553, + "step": 91760 + }, + { + "epoch": 0.5862923731520642, + "grad_norm": 2.0362207889556885, + "learning_rate": 8.026734652751316e-05, + "loss": 0.751, + "step": 91770 + }, + { + "epoch": 0.5863562603018029, + "grad_norm": 0.9429267644882202, + "learning_rate": 8.026335248981506e-05, + "loss": 0.7756, + "step": 91780 + }, + { + "epoch": 0.5864201474515416, + "grad_norm": 1.2869027853012085, + "learning_rate": 8.025935814733883e-05, + "loss": 0.7654, + "step": 91790 + }, + { + "epoch": 0.5864840346012803, + "grad_norm": 1.0939258337020874, + "learning_rate": 8.025536350012468e-05, + "loss": 0.9738, + "step": 91800 + }, + { + "epoch": 0.586547921751019, + "grad_norm": 0.5842766761779785, + "learning_rate": 8.025136854821285e-05, + "loss": 0.7524, + "step": 91810 + }, + { + "epoch": 0.5866118089007577, + "grad_norm": 0.8317599892616272, + "learning_rate": 8.024737329164356e-05, + "loss": 1.1187, + "step": 91820 + }, + { + "epoch": 0.5866756960504964, + "grad_norm": 0.8482229113578796, + "learning_rate": 8.024337773045704e-05, + "loss": 0.6012, + "step": 91830 + }, + { + "epoch": 0.5867395832002351, + "grad_norm": 0.7396560311317444, + "learning_rate": 8.023938186469357e-05, + "loss": 0.9791, + "step": 91840 + }, + { + "epoch": 0.5868034703499738, + "grad_norm": 0.7632973790168762, + "learning_rate": 8.023538569439335e-05, + "loss": 0.9775, + "step": 91850 + }, + { + "epoch": 0.5868673574997125, + "grad_norm": 
1.459350347518921, + "learning_rate": 8.023138921959665e-05, + "loss": 0.7903, + "step": 91860 + }, + { + "epoch": 0.5869312446494512, + "grad_norm": 0.5813467502593994, + "learning_rate": 8.022739244034369e-05, + "loss": 0.8206, + "step": 91870 + }, + { + "epoch": 0.5869951317991899, + "grad_norm": 0.9439472556114197, + "learning_rate": 8.022339535667476e-05, + "loss": 0.6998, + "step": 91880 + }, + { + "epoch": 0.5870590189489286, + "grad_norm": 1.054968237876892, + "learning_rate": 8.021939796863007e-05, + "loss": 0.8882, + "step": 91890 + }, + { + "epoch": 0.5871229060986674, + "grad_norm": 0.7589655518531799, + "learning_rate": 8.021540027624991e-05, + "loss": 1.0338, + "step": 91900 + }, + { + "epoch": 0.5871867932484061, + "grad_norm": 5.119521617889404, + "learning_rate": 8.021140227957451e-05, + "loss": 1.1161, + "step": 91910 + }, + { + "epoch": 0.5872506803981448, + "grad_norm": 0.9572505354881287, + "learning_rate": 8.020740397864418e-05, + "loss": 0.9184, + "step": 91920 + }, + { + "epoch": 0.5873145675478835, + "grad_norm": 1.0495151281356812, + "learning_rate": 8.020340537349915e-05, + "loss": 0.8544, + "step": 91930 + }, + { + "epoch": 0.5873784546976221, + "grad_norm": 0.8135344386100769, + "learning_rate": 8.019940646417969e-05, + "loss": 0.8227, + "step": 91940 + }, + { + "epoch": 0.5874423418473608, + "grad_norm": 1.2222908735275269, + "learning_rate": 8.019540725072609e-05, + "loss": 1.034, + "step": 91950 + }, + { + "epoch": 0.5875062289970995, + "grad_norm": 0.953247606754303, + "learning_rate": 8.019140773317862e-05, + "loss": 0.9178, + "step": 91960 + }, + { + "epoch": 0.5875701161468382, + "grad_norm": 0.7658291459083557, + "learning_rate": 8.018740791157755e-05, + "loss": 0.7629, + "step": 91970 + }, + { + "epoch": 0.5876340032965769, + "grad_norm": 1.0047904253005981, + "learning_rate": 8.018340778596316e-05, + "loss": 0.9189, + "step": 91980 + }, + { + "epoch": 0.5876978904463156, + "grad_norm": 0.9360259771347046, + "learning_rate": 
8.017940735637574e-05, + "loss": 1.0436, + "step": 91990 + }, + { + "epoch": 0.5877617775960543, + "grad_norm": 0.74342942237854, + "learning_rate": 8.017540662285558e-05, + "loss": 0.6901, + "step": 92000 + }, + { + "epoch": 0.587825664745793, + "grad_norm": 0.7133846879005432, + "learning_rate": 8.017140558544299e-05, + "loss": 0.9163, + "step": 92010 + }, + { + "epoch": 0.5878895518955317, + "grad_norm": 1.2013561725616455, + "learning_rate": 8.016740424417822e-05, + "loss": 0.8493, + "step": 92020 + }, + { + "epoch": 0.5879534390452704, + "grad_norm": 1.1158215999603271, + "learning_rate": 8.01634025991016e-05, + "loss": 0.9638, + "step": 92030 + }, + { + "epoch": 0.5880173261950091, + "grad_norm": 0.8271144032478333, + "learning_rate": 8.015940065025343e-05, + "loss": 0.7683, + "step": 92040 + }, + { + "epoch": 0.5880812133447478, + "grad_norm": 1.0541661977767944, + "learning_rate": 8.015539839767399e-05, + "loss": 0.8228, + "step": 92050 + }, + { + "epoch": 0.5881451004944865, + "grad_norm": 1.1830748319625854, + "learning_rate": 8.01513958414036e-05, + "loss": 0.8117, + "step": 92060 + }, + { + "epoch": 0.5882089876442252, + "grad_norm": 0.5275201201438904, + "learning_rate": 8.014739298148258e-05, + "loss": 0.7335, + "step": 92070 + }, + { + "epoch": 0.588272874793964, + "grad_norm": 0.9330576062202454, + "learning_rate": 8.014338981795122e-05, + "loss": 0.8961, + "step": 92080 + }, + { + "epoch": 0.5883367619437027, + "grad_norm": 1.2388197183609009, + "learning_rate": 8.013938635084983e-05, + "loss": 0.697, + "step": 92090 + }, + { + "epoch": 0.5884006490934414, + "grad_norm": 1.1590933799743652, + "learning_rate": 8.013538258021877e-05, + "loss": 1.1775, + "step": 92100 + }, + { + "epoch": 0.5884645362431801, + "grad_norm": 1.2639012336730957, + "learning_rate": 8.013137850609833e-05, + "loss": 0.7401, + "step": 92110 + }, + { + "epoch": 0.5885284233929188, + "grad_norm": 0.6682813763618469, + "learning_rate": 8.012737412852886e-05, + "loss": 0.8262, + 
"step": 92120 + }, + { + "epoch": 0.5885923105426575, + "grad_norm": 0.7417098879814148, + "learning_rate": 8.012336944755064e-05, + "loss": 1.0828, + "step": 92130 + }, + { + "epoch": 0.5886561976923962, + "grad_norm": 0.7538353800773621, + "learning_rate": 8.011936446320405e-05, + "loss": 0.647, + "step": 92140 + }, + { + "epoch": 0.5887200848421349, + "grad_norm": 0.7363274097442627, + "learning_rate": 8.01153591755294e-05, + "loss": 1.1043, + "step": 92150 + }, + { + "epoch": 0.5887839719918736, + "grad_norm": 0.6675977110862732, + "learning_rate": 8.011135358456701e-05, + "loss": 0.9313, + "step": 92160 + }, + { + "epoch": 0.5888478591416123, + "grad_norm": 0.7670975923538208, + "learning_rate": 8.010734769035726e-05, + "loss": 0.8746, + "step": 92170 + }, + { + "epoch": 0.588911746291351, + "grad_norm": 1.1923779249191284, + "learning_rate": 8.010334149294045e-05, + "loss": 1.0088, + "step": 92180 + }, + { + "epoch": 0.5889756334410896, + "grad_norm": 0.8132577538490295, + "learning_rate": 8.009933499235698e-05, + "loss": 1.0256, + "step": 92190 + }, + { + "epoch": 0.5890395205908283, + "grad_norm": 0.7252603769302368, + "learning_rate": 8.009532818864714e-05, + "loss": 0.7461, + "step": 92200 + }, + { + "epoch": 0.589103407740567, + "grad_norm": 1.4953992366790771, + "learning_rate": 8.009132108185132e-05, + "loss": 1.3034, + "step": 92210 + }, + { + "epoch": 0.5891672948903057, + "grad_norm": 0.7073407769203186, + "learning_rate": 8.008731367200988e-05, + "loss": 0.9938, + "step": 92220 + }, + { + "epoch": 0.5892311820400444, + "grad_norm": 1.0197674036026, + "learning_rate": 8.008330595916314e-05, + "loss": 0.9457, + "step": 92230 + }, + { + "epoch": 0.5892950691897831, + "grad_norm": 0.8998727798461914, + "learning_rate": 8.00792979433515e-05, + "loss": 0.7637, + "step": 92240 + }, + { + "epoch": 0.5893589563395218, + "grad_norm": 0.5019026398658752, + "learning_rate": 8.007528962461527e-05, + "loss": 0.9488, + "step": 92250 + }, + { + "epoch": 
0.5894228434892606, + "grad_norm": 1.0908100605010986, + "learning_rate": 8.007128100299491e-05, + "loss": 1.1868, + "step": 92260 + }, + { + "epoch": 0.5894867306389993, + "grad_norm": 1.2412331104278564, + "learning_rate": 8.006727207853069e-05, + "loss": 0.7634, + "step": 92270 + }, + { + "epoch": 0.589550617788738, + "grad_norm": 0.8074179291725159, + "learning_rate": 8.006326285126305e-05, + "loss": 0.8855, + "step": 92280 + }, + { + "epoch": 0.5896145049384767, + "grad_norm": 1.471113920211792, + "learning_rate": 8.005925332123235e-05, + "loss": 0.7663, + "step": 92290 + }, + { + "epoch": 0.5896783920882154, + "grad_norm": 0.7869247794151306, + "learning_rate": 8.005524348847894e-05, + "loss": 0.9511, + "step": 92300 + }, + { + "epoch": 0.5897422792379541, + "grad_norm": 1.1059610843658447, + "learning_rate": 8.005123335304322e-05, + "loss": 0.9348, + "step": 92310 + }, + { + "epoch": 0.5898061663876928, + "grad_norm": 1.1069176197052002, + "learning_rate": 8.004722291496562e-05, + "loss": 0.8676, + "step": 92320 + }, + { + "epoch": 0.5898700535374315, + "grad_norm": 0.5974422693252563, + "learning_rate": 8.004321217428647e-05, + "loss": 0.7969, + "step": 92330 + }, + { + "epoch": 0.5899339406871702, + "grad_norm": 1.1670259237289429, + "learning_rate": 8.003920113104618e-05, + "loss": 0.8566, + "step": 92340 + }, + { + "epoch": 0.5899978278369089, + "grad_norm": 0.9760884642601013, + "learning_rate": 8.003518978528515e-05, + "loss": 0.8049, + "step": 92350 + }, + { + "epoch": 0.5900617149866476, + "grad_norm": 0.7791697978973389, + "learning_rate": 8.003117813704378e-05, + "loss": 0.6515, + "step": 92360 + }, + { + "epoch": 0.5901256021363863, + "grad_norm": 0.8998212218284607, + "learning_rate": 8.002716618636245e-05, + "loss": 0.9429, + "step": 92370 + }, + { + "epoch": 0.590189489286125, + "grad_norm": 0.9141538739204407, + "learning_rate": 8.00231539332816e-05, + "loss": 0.8178, + "step": 92380 + }, + { + "epoch": 0.5902533764358637, + "grad_norm": 
0.765386164188385, + "learning_rate": 8.001914137784161e-05, + "loss": 0.973, + "step": 92390 + }, + { + "epoch": 0.5903172635856024, + "grad_norm": 0.7694385647773743, + "learning_rate": 8.00151285200829e-05, + "loss": 0.7036, + "step": 92400 + }, + { + "epoch": 0.5903811507353411, + "grad_norm": 1.3476502895355225, + "learning_rate": 8.001111536004586e-05, + "loss": 0.7961, + "step": 92410 + }, + { + "epoch": 0.5904450378850798, + "grad_norm": 1.1224573850631714, + "learning_rate": 8.000710189777094e-05, + "loss": 0.8736, + "step": 92420 + }, + { + "epoch": 0.5905089250348184, + "grad_norm": 0.8447276949882507, + "learning_rate": 8.000308813329855e-05, + "loss": 1.0167, + "step": 92430 + }, + { + "epoch": 0.5905728121845571, + "grad_norm": 2.5802526473999023, + "learning_rate": 7.999907406666909e-05, + "loss": 0.8453, + "step": 92440 + }, + { + "epoch": 0.5906366993342959, + "grad_norm": 0.7821781635284424, + "learning_rate": 7.999505969792302e-05, + "loss": 0.8587, + "step": 92450 + }, + { + "epoch": 0.5907005864840346, + "grad_norm": 0.9936961531639099, + "learning_rate": 7.999104502710074e-05, + "loss": 1.1057, + "step": 92460 + }, + { + "epoch": 0.5907644736337733, + "grad_norm": 0.6793760061264038, + "learning_rate": 7.998703005424268e-05, + "loss": 1.0557, + "step": 92470 + }, + { + "epoch": 0.590828360783512, + "grad_norm": 1.253307819366455, + "learning_rate": 7.99830147793893e-05, + "loss": 0.7205, + "step": 92480 + }, + { + "epoch": 0.5908922479332507, + "grad_norm": 0.8064923882484436, + "learning_rate": 7.997899920258101e-05, + "loss": 0.7813, + "step": 92490 + }, + { + "epoch": 0.5909561350829894, + "grad_norm": 1.2392529249191284, + "learning_rate": 7.997498332385827e-05, + "loss": 0.9077, + "step": 92500 + }, + { + "epoch": 0.5910200222327281, + "grad_norm": 1.3533644676208496, + "learning_rate": 7.997096714326151e-05, + "loss": 0.8915, + "step": 92510 + }, + { + "epoch": 0.5910839093824668, + "grad_norm": 0.8305091261863708, + "learning_rate": 
7.99669506608312e-05, + "loss": 1.0454, + "step": 92520 + }, + { + "epoch": 0.5911477965322055, + "grad_norm": 0.884864866733551, + "learning_rate": 7.996293387660776e-05, + "loss": 0.9556, + "step": 92530 + }, + { + "epoch": 0.5912116836819442, + "grad_norm": 0.6807804703712463, + "learning_rate": 7.995891679063165e-05, + "loss": 1.0857, + "step": 92540 + }, + { + "epoch": 0.5912755708316829, + "grad_norm": 1.4398140907287598, + "learning_rate": 7.995489940294333e-05, + "loss": 0.8875, + "step": 92550 + }, + { + "epoch": 0.5913394579814216, + "grad_norm": 0.9925829768180847, + "learning_rate": 7.995088171358325e-05, + "loss": 1.038, + "step": 92560 + }, + { + "epoch": 0.5914033451311603, + "grad_norm": 0.7312915325164795, + "learning_rate": 7.99468637225919e-05, + "loss": 1.0752, + "step": 92570 + }, + { + "epoch": 0.591467232280899, + "grad_norm": 1.0309982299804688, + "learning_rate": 7.994284543000972e-05, + "loss": 0.8225, + "step": 92580 + }, + { + "epoch": 0.5915311194306377, + "grad_norm": 0.844560980796814, + "learning_rate": 7.993882683587717e-05, + "loss": 0.8288, + "step": 92590 + }, + { + "epoch": 0.5915950065803764, + "grad_norm": 1.377153754234314, + "learning_rate": 7.993480794023473e-05, + "loss": 0.8974, + "step": 92600 + }, + { + "epoch": 0.5916588937301152, + "grad_norm": 1.0687837600708008, + "learning_rate": 7.99307887431229e-05, + "loss": 0.6578, + "step": 92610 + }, + { + "epoch": 0.5917227808798539, + "grad_norm": 1.0184049606323242, + "learning_rate": 7.992676924458212e-05, + "loss": 0.7034, + "step": 92620 + }, + { + "epoch": 0.5917866680295926, + "grad_norm": 1.7974580526351929, + "learning_rate": 7.992274944465287e-05, + "loss": 0.8216, + "step": 92630 + }, + { + "epoch": 0.5918505551793313, + "grad_norm": 0.9643707871437073, + "learning_rate": 7.991872934337568e-05, + "loss": 0.9556, + "step": 92640 + }, + { + "epoch": 0.59191444232907, + "grad_norm": 2.1205992698669434, + "learning_rate": 7.991470894079098e-05, + "loss": 1.1122, + 
"step": 92650 + }, + { + "epoch": 0.5919783294788087, + "grad_norm": 0.7959349751472473, + "learning_rate": 7.991068823693928e-05, + "loss": 0.892, + "step": 92660 + }, + { + "epoch": 0.5920422166285473, + "grad_norm": 0.8903279900550842, + "learning_rate": 7.990666723186107e-05, + "loss": 0.909, + "step": 92670 + }, + { + "epoch": 0.592106103778286, + "grad_norm": 0.8091008067131042, + "learning_rate": 7.990264592559686e-05, + "loss": 1.0945, + "step": 92680 + }, + { + "epoch": 0.5921699909280247, + "grad_norm": 0.8795812726020813, + "learning_rate": 7.989862431818713e-05, + "loss": 1.0445, + "step": 92690 + }, + { + "epoch": 0.5922338780777634, + "grad_norm": 0.7716434001922607, + "learning_rate": 7.989460240967239e-05, + "loss": 1.0517, + "step": 92700 + }, + { + "epoch": 0.5922977652275021, + "grad_norm": 0.9718101024627686, + "learning_rate": 7.989058020009315e-05, + "loss": 0.9155, + "step": 92710 + }, + { + "epoch": 0.5923616523772408, + "grad_norm": 1.8687045574188232, + "learning_rate": 7.98865576894899e-05, + "loss": 0.8678, + "step": 92720 + }, + { + "epoch": 0.5924255395269795, + "grad_norm": 0.5522985458374023, + "learning_rate": 7.988253487790315e-05, + "loss": 0.9144, + "step": 92730 + }, + { + "epoch": 0.5924894266767182, + "grad_norm": 0.9412902593612671, + "learning_rate": 7.987851176537342e-05, + "loss": 0.7785, + "step": 92740 + }, + { + "epoch": 0.5925533138264569, + "grad_norm": 0.5858872532844543, + "learning_rate": 7.987448835194124e-05, + "loss": 0.7684, + "step": 92750 + }, + { + "epoch": 0.5926172009761956, + "grad_norm": 0.7545718550682068, + "learning_rate": 7.987046463764712e-05, + "loss": 0.8157, + "step": 92760 + }, + { + "epoch": 0.5926810881259343, + "grad_norm": 1.0280770063400269, + "learning_rate": 7.986644062253157e-05, + "loss": 0.8308, + "step": 92770 + }, + { + "epoch": 0.592744975275673, + "grad_norm": 0.6888710260391235, + "learning_rate": 7.986241630663512e-05, + "loss": 0.827, + "step": 92780 + }, + { + "epoch": 
0.5928088624254118, + "grad_norm": 0.7648938298225403, + "learning_rate": 7.985839168999831e-05, + "loss": 0.7851, + "step": 92790 + }, + { + "epoch": 0.5928727495751505, + "grad_norm": 1.144452452659607, + "learning_rate": 7.985436677266166e-05, + "loss": 0.8219, + "step": 92800 + }, + { + "epoch": 0.5929366367248892, + "grad_norm": 0.9473939538002014, + "learning_rate": 7.985034155466572e-05, + "loss": 0.9113, + "step": 92810 + }, + { + "epoch": 0.5930005238746279, + "grad_norm": 1.33318030834198, + "learning_rate": 7.984631603605102e-05, + "loss": 0.7248, + "step": 92820 + }, + { + "epoch": 0.5930644110243666, + "grad_norm": 1.1216987371444702, + "learning_rate": 7.984229021685807e-05, + "loss": 0.7527, + "step": 92830 + }, + { + "epoch": 0.5931282981741053, + "grad_norm": 1.0254566669464111, + "learning_rate": 7.983826409712747e-05, + "loss": 1.0293, + "step": 92840 + }, + { + "epoch": 0.593192185323844, + "grad_norm": 0.7557952404022217, + "learning_rate": 7.983423767689972e-05, + "loss": 0.8593, + "step": 92850 + }, + { + "epoch": 0.5932560724735827, + "grad_norm": 0.8302872180938721, + "learning_rate": 7.983021095621539e-05, + "loss": 0.8756, + "step": 92860 + }, + { + "epoch": 0.5933199596233214, + "grad_norm": 0.7966361045837402, + "learning_rate": 7.982618393511503e-05, + "loss": 0.8578, + "step": 92870 + }, + { + "epoch": 0.5933838467730601, + "grad_norm": 1.1069227457046509, + "learning_rate": 7.982215661363918e-05, + "loss": 0.7703, + "step": 92880 + }, + { + "epoch": 0.5934477339227988, + "grad_norm": 0.5603930354118347, + "learning_rate": 7.981812899182844e-05, + "loss": 0.8665, + "step": 92890 + }, + { + "epoch": 0.5935116210725375, + "grad_norm": 1.1370865106582642, + "learning_rate": 7.981410106972333e-05, + "loss": 0.8621, + "step": 92900 + }, + { + "epoch": 0.5935755082222761, + "grad_norm": 1.1741241216659546, + "learning_rate": 7.981007284736442e-05, + "loss": 1.0893, + "step": 92910 + }, + { + "epoch": 0.5936393953720148, + "grad_norm": 
1.065045714378357, + "learning_rate": 7.98060443247923e-05, + "loss": 1.0186, + "step": 92920 + }, + { + "epoch": 0.5937032825217535, + "grad_norm": 0.7145927548408508, + "learning_rate": 7.980201550204753e-05, + "loss": 0.9665, + "step": 92930 + }, + { + "epoch": 0.5937671696714922, + "grad_norm": 0.7131385803222656, + "learning_rate": 7.979798637917068e-05, + "loss": 0.9271, + "step": 92940 + }, + { + "epoch": 0.5938310568212309, + "grad_norm": 2.897143840789795, + "learning_rate": 7.979395695620234e-05, + "loss": 0.9113, + "step": 92950 + }, + { + "epoch": 0.5938949439709696, + "grad_norm": 0.9042292237281799, + "learning_rate": 7.978992723318305e-05, + "loss": 0.869, + "step": 92960 + }, + { + "epoch": 0.5939588311207084, + "grad_norm": 0.5825813412666321, + "learning_rate": 7.978589721015343e-05, + "loss": 1.1739, + "step": 92970 + }, + { + "epoch": 0.5940227182704471, + "grad_norm": 1.031434416770935, + "learning_rate": 7.978186688715406e-05, + "loss": 0.817, + "step": 92980 + }, + { + "epoch": 0.5940866054201858, + "grad_norm": 0.81146639585495, + "learning_rate": 7.977783626422553e-05, + "loss": 1.0523, + "step": 92990 + }, + { + "epoch": 0.5941504925699245, + "grad_norm": 1.5747389793395996, + "learning_rate": 7.977380534140843e-05, + "loss": 0.7111, + "step": 93000 + }, + { + "epoch": 0.5942143797196632, + "grad_norm": 0.8901327848434448, + "learning_rate": 7.976977411874334e-05, + "loss": 0.8639, + "step": 93010 + }, + { + "epoch": 0.5942782668694019, + "grad_norm": 0.6581230163574219, + "learning_rate": 7.976574259627087e-05, + "loss": 0.7677, + "step": 93020 + }, + { + "epoch": 0.5943421540191406, + "grad_norm": 0.660140335559845, + "learning_rate": 7.976171077403163e-05, + "loss": 1.058, + "step": 93030 + }, + { + "epoch": 0.5944060411688793, + "grad_norm": 1.2601295709609985, + "learning_rate": 7.97576786520662e-05, + "loss": 1.0485, + "step": 93040 + }, + { + "epoch": 0.594469928318618, + "grad_norm": 0.7179937362670898, + "learning_rate": 
7.975364623041523e-05, + "loss": 0.7853, + "step": 93050 + }, + { + "epoch": 0.5945338154683567, + "grad_norm": 1.2381994724273682, + "learning_rate": 7.974961350911926e-05, + "loss": 0.8814, + "step": 93060 + }, + { + "epoch": 0.5945977026180954, + "grad_norm": 0.8336678743362427, + "learning_rate": 7.974558048821898e-05, + "loss": 0.8024, + "step": 93070 + }, + { + "epoch": 0.5946615897678341, + "grad_norm": 0.6095098257064819, + "learning_rate": 7.974154716775497e-05, + "loss": 1.0271, + "step": 93080 + }, + { + "epoch": 0.5947254769175728, + "grad_norm": 0.7402802109718323, + "learning_rate": 7.973791692324393e-05, + "loss": 1.0233, + "step": 93090 + }, + { + "epoch": 0.5947893640673115, + "grad_norm": 1.3775721788406372, + "learning_rate": 7.973388303372073e-05, + "loss": 0.8029, + "step": 93100 + }, + { + "epoch": 0.5948532512170502, + "grad_norm": 1.3130244016647339, + "learning_rate": 7.972984884475162e-05, + "loss": 0.7344, + "step": 93110 + }, + { + "epoch": 0.594917138366789, + "grad_norm": 1.00970458984375, + "learning_rate": 7.97258143563772e-05, + "loss": 0.871, + "step": 93120 + }, + { + "epoch": 0.5949810255165277, + "grad_norm": 0.5588665008544922, + "learning_rate": 7.972177956863811e-05, + "loss": 0.8927, + "step": 93130 + }, + { + "epoch": 0.5950449126662664, + "grad_norm": 0.5917448997497559, + "learning_rate": 7.971774448157499e-05, + "loss": 0.922, + "step": 93140 + }, + { + "epoch": 0.5951087998160051, + "grad_norm": 0.8795225620269775, + "learning_rate": 7.971370909522847e-05, + "loss": 1.1024, + "step": 93150 + }, + { + "epoch": 0.5951726869657437, + "grad_norm": 0.7276126146316528, + "learning_rate": 7.97096734096392e-05, + "loss": 0.9061, + "step": 93160 + }, + { + "epoch": 0.5952365741154824, + "grad_norm": 1.1168190240859985, + "learning_rate": 7.970563742484782e-05, + "loss": 0.8808, + "step": 93170 + }, + { + "epoch": 0.5953004612652211, + "grad_norm": 0.7256569266319275, + "learning_rate": 7.970160114089496e-05, + "loss": 0.8881, + 
"step": 93180 + }, + { + "epoch": 0.5953643484149598, + "grad_norm": 1.2579829692840576, + "learning_rate": 7.969756455782129e-05, + "loss": 0.986, + "step": 93190 + }, + { + "epoch": 0.5954282355646985, + "grad_norm": 1.1053801774978638, + "learning_rate": 7.969352767566744e-05, + "loss": 0.9049, + "step": 93200 + }, + { + "epoch": 0.5954921227144372, + "grad_norm": 1.0768349170684814, + "learning_rate": 7.96894904944741e-05, + "loss": 0.9947, + "step": 93210 + }, + { + "epoch": 0.5955560098641759, + "grad_norm": 0.8521894812583923, + "learning_rate": 7.968545301428188e-05, + "loss": 0.7064, + "step": 93220 + }, + { + "epoch": 0.5956198970139146, + "grad_norm": 0.9622153639793396, + "learning_rate": 7.968141523513149e-05, + "loss": 0.8189, + "step": 93230 + }, + { + "epoch": 0.5956837841636533, + "grad_norm": 1.014563798904419, + "learning_rate": 7.967737715706354e-05, + "loss": 1.0966, + "step": 93240 + }, + { + "epoch": 0.595747671313392, + "grad_norm": 1.3811198472976685, + "learning_rate": 7.967333878011875e-05, + "loss": 1.043, + "step": 93250 + }, + { + "epoch": 0.5958115584631307, + "grad_norm": 0.7607221007347107, + "learning_rate": 7.966930010433777e-05, + "loss": 0.9357, + "step": 93260 + }, + { + "epoch": 0.5958754456128694, + "grad_norm": 0.9977759122848511, + "learning_rate": 7.966526112976126e-05, + "loss": 1.0732, + "step": 93270 + }, + { + "epoch": 0.5959393327626081, + "grad_norm": 1.0473393201828003, + "learning_rate": 7.966122185642992e-05, + "loss": 0.9741, + "step": 93280 + }, + { + "epoch": 0.5960032199123468, + "grad_norm": 1.254073977470398, + "learning_rate": 7.965718228438442e-05, + "loss": 0.7127, + "step": 93290 + }, + { + "epoch": 0.5960671070620855, + "grad_norm": 0.9612134099006653, + "learning_rate": 7.965314241366542e-05, + "loss": 0.8608, + "step": 93300 + }, + { + "epoch": 0.5961309942118242, + "grad_norm": 1.8050369024276733, + "learning_rate": 7.964910224431361e-05, + "loss": 0.6996, + "step": 93310 + }, + { + "epoch": 
0.596194881361563, + "grad_norm": 1.1563001871109009, + "learning_rate": 7.96450617763697e-05, + "loss": 0.7266, + "step": 93320 + }, + { + "epoch": 0.5962587685113017, + "grad_norm": 1.396944284439087, + "learning_rate": 7.964102100987439e-05, + "loss": 1.2127, + "step": 93330 + }, + { + "epoch": 0.5963226556610404, + "grad_norm": 1.3693512678146362, + "learning_rate": 7.963697994486834e-05, + "loss": 0.8019, + "step": 93340 + }, + { + "epoch": 0.5963865428107791, + "grad_norm": 0.8561686277389526, + "learning_rate": 7.963293858139227e-05, + "loss": 1.2762, + "step": 93350 + }, + { + "epoch": 0.5964504299605178, + "grad_norm": 1.4842015504837036, + "learning_rate": 7.962889691948687e-05, + "loss": 1.0344, + "step": 93360 + }, + { + "epoch": 0.5965143171102565, + "grad_norm": 0.6768175363540649, + "learning_rate": 7.962485495919285e-05, + "loss": 0.7479, + "step": 93370 + }, + { + "epoch": 0.5965782042599952, + "grad_norm": 0.8603546023368835, + "learning_rate": 7.96208127005509e-05, + "loss": 0.7013, + "step": 93380 + }, + { + "epoch": 0.5966420914097339, + "grad_norm": 0.8126310706138611, + "learning_rate": 7.961677014360174e-05, + "loss": 0.9827, + "step": 93390 + }, + { + "epoch": 0.5967059785594725, + "grad_norm": 0.7931829690933228, + "learning_rate": 7.961272728838609e-05, + "loss": 0.8245, + "step": 93400 + }, + { + "epoch": 0.5967698657092112, + "grad_norm": 0.7296050786972046, + "learning_rate": 7.960868413494465e-05, + "loss": 0.7299, + "step": 93410 + }, + { + "epoch": 0.5968337528589499, + "grad_norm": 1.148210883140564, + "learning_rate": 7.960464068331814e-05, + "loss": 0.9199, + "step": 93420 + }, + { + "epoch": 0.5968976400086886, + "grad_norm": 1.0481257438659668, + "learning_rate": 7.960059693354731e-05, + "loss": 0.8088, + "step": 93430 + }, + { + "epoch": 0.5969615271584273, + "grad_norm": 0.6699005365371704, + "learning_rate": 7.959655288567285e-05, + "loss": 1.055, + "step": 93440 + }, + { + "epoch": 0.597025414308166, + "grad_norm": 
1.0113779306411743, + "learning_rate": 7.959250853973549e-05, + "loss": 1.0175, + "step": 93450 + }, + { + "epoch": 0.5970893014579047, + "grad_norm": 1.1636087894439697, + "learning_rate": 7.958846389577597e-05, + "loss": 0.8509, + "step": 93460 + }, + { + "epoch": 0.5971531886076434, + "grad_norm": 0.8707061409950256, + "learning_rate": 7.958441895383503e-05, + "loss": 1.0673, + "step": 93470 + }, + { + "epoch": 0.5972170757573821, + "grad_norm": 0.4944153130054474, + "learning_rate": 7.95803737139534e-05, + "loss": 0.8289, + "step": 93480 + }, + { + "epoch": 0.5972809629071208, + "grad_norm": 0.9866294860839844, + "learning_rate": 7.95763281761718e-05, + "loss": 0.9177, + "step": 93490 + }, + { + "epoch": 0.5973448500568596, + "grad_norm": 0.7539530992507935, + "learning_rate": 7.957228234053099e-05, + "loss": 0.6764, + "step": 93500 + }, + { + "epoch": 0.5974087372065983, + "grad_norm": 0.793980062007904, + "learning_rate": 7.956823620707172e-05, + "loss": 0.9874, + "step": 93510 + }, + { + "epoch": 0.597472624356337, + "grad_norm": 0.760221004486084, + "learning_rate": 7.956418977583474e-05, + "loss": 0.8189, + "step": 93520 + }, + { + "epoch": 0.5975365115060757, + "grad_norm": 0.4851871728897095, + "learning_rate": 7.956014304686076e-05, + "loss": 0.9784, + "step": 93530 + }, + { + "epoch": 0.5976003986558144, + "grad_norm": 0.6846779584884644, + "learning_rate": 7.95560960201906e-05, + "loss": 0.796, + "step": 93540 + }, + { + "epoch": 0.5976642858055531, + "grad_norm": 0.6320601105690002, + "learning_rate": 7.955204869586497e-05, + "loss": 0.9733, + "step": 93550 + }, + { + "epoch": 0.5977281729552918, + "grad_norm": 1.2107212543487549, + "learning_rate": 7.954800107392463e-05, + "loss": 0.928, + "step": 93560 + }, + { + "epoch": 0.5977920601050305, + "grad_norm": 1.3490923643112183, + "learning_rate": 7.954395315441039e-05, + "loss": 1.0885, + "step": 93570 + }, + { + "epoch": 0.5978559472547692, + "grad_norm": 0.724120020866394, + "learning_rate": 
7.953990493736296e-05, + "loss": 0.7475, + "step": 93580 + }, + { + "epoch": 0.5979198344045079, + "grad_norm": 1.5954548120498657, + "learning_rate": 7.953585642282314e-05, + "loss": 1.1069, + "step": 93590 + }, + { + "epoch": 0.5979837215542466, + "grad_norm": 1.4548444747924805, + "learning_rate": 7.953180761083169e-05, + "loss": 1.0161, + "step": 93600 + }, + { + "epoch": 0.5980476087039853, + "grad_norm": 0.6197888851165771, + "learning_rate": 7.952775850142939e-05, + "loss": 0.7571, + "step": 93610 + }, + { + "epoch": 0.598111495853724, + "grad_norm": 0.7104484438896179, + "learning_rate": 7.952370909465702e-05, + "loss": 0.8744, + "step": 93620 + }, + { + "epoch": 0.5981753830034627, + "grad_norm": 0.6192456483840942, + "learning_rate": 7.951965939055535e-05, + "loss": 1.1207, + "step": 93630 + }, + { + "epoch": 0.5982392701532013, + "grad_norm": 1.01494300365448, + "learning_rate": 7.951560938916517e-05, + "loss": 0.7774, + "step": 93640 + }, + { + "epoch": 0.59830315730294, + "grad_norm": 1.5885846614837646, + "learning_rate": 7.951155909052727e-05, + "loss": 0.7792, + "step": 93650 + }, + { + "epoch": 0.5983670444526787, + "grad_norm": 0.9878436326980591, + "learning_rate": 7.950750849468245e-05, + "loss": 0.8927, + "step": 93660 + }, + { + "epoch": 0.5984309316024174, + "grad_norm": 0.7260859608650208, + "learning_rate": 7.950345760167148e-05, + "loss": 0.9279, + "step": 93670 + }, + { + "epoch": 0.5984948187521562, + "grad_norm": 0.9209891557693481, + "learning_rate": 7.949940641153517e-05, + "loss": 0.7823, + "step": 93680 + }, + { + "epoch": 0.5985587059018949, + "grad_norm": 0.6021539568901062, + "learning_rate": 7.949535492431433e-05, + "loss": 0.8444, + "step": 93690 + }, + { + "epoch": 0.5986225930516336, + "grad_norm": 1.3056902885437012, + "learning_rate": 7.949130314004974e-05, + "loss": 0.812, + "step": 93700 + }, + { + "epoch": 0.5986864802013723, + "grad_norm": 0.8002413511276245, + "learning_rate": 7.948725105878221e-05, + "loss": 0.9184, + 
"step": 93710 + }, + { + "epoch": 0.598750367351111, + "grad_norm": 1.4868652820587158, + "learning_rate": 7.948319868055254e-05, + "loss": 0.8872, + "step": 93720 + }, + { + "epoch": 0.5988142545008497, + "grad_norm": 0.8618303537368774, + "learning_rate": 7.947914600540158e-05, + "loss": 0.7252, + "step": 93730 + }, + { + "epoch": 0.5988781416505884, + "grad_norm": 0.5396475791931152, + "learning_rate": 7.947509303337009e-05, + "loss": 0.7807, + "step": 93740 + }, + { + "epoch": 0.5989420288003271, + "grad_norm": 0.7746068239212036, + "learning_rate": 7.947103976449892e-05, + "loss": 0.8018, + "step": 93750 + }, + { + "epoch": 0.5990059159500658, + "grad_norm": 1.0696126222610474, + "learning_rate": 7.94669861988289e-05, + "loss": 0.7254, + "step": 93760 + }, + { + "epoch": 0.5990698030998045, + "grad_norm": 0.8595736622810364, + "learning_rate": 7.946293233640082e-05, + "loss": 0.765, + "step": 93770 + }, + { + "epoch": 0.5991336902495432, + "grad_norm": 1.1637159585952759, + "learning_rate": 7.945887817725552e-05, + "loss": 0.8776, + "step": 93780 + }, + { + "epoch": 0.5991975773992819, + "grad_norm": 0.9262639284133911, + "learning_rate": 7.945482372143385e-05, + "loss": 1.2434, + "step": 93790 + }, + { + "epoch": 0.5992614645490206, + "grad_norm": 0.6639724969863892, + "learning_rate": 7.945076896897661e-05, + "loss": 0.9795, + "step": 93800 + }, + { + "epoch": 0.5993253516987593, + "grad_norm": 0.7169008255004883, + "learning_rate": 7.944671391992465e-05, + "loss": 1.1887, + "step": 93810 + }, + { + "epoch": 0.599389238848498, + "grad_norm": 0.9885384440422058, + "learning_rate": 7.944265857431881e-05, + "loss": 0.9445, + "step": 93820 + }, + { + "epoch": 0.5994531259982367, + "grad_norm": 0.639473021030426, + "learning_rate": 7.943860293219993e-05, + "loss": 0.7167, + "step": 93830 + }, + { + "epoch": 0.5995170131479755, + "grad_norm": 0.9396152496337891, + "learning_rate": 7.943454699360884e-05, + "loss": 0.7037, + "step": 93840 + }, + { + "epoch": 
0.5995809002977142, + "grad_norm": 1.092712163925171, + "learning_rate": 7.94304907585864e-05, + "loss": 1.2453, + "step": 93850 + }, + { + "epoch": 0.5996447874474529, + "grad_norm": 1.040013313293457, + "learning_rate": 7.942643422717346e-05, + "loss": 0.7103, + "step": 93860 + }, + { + "epoch": 0.5997086745971916, + "grad_norm": 1.5030107498168945, + "learning_rate": 7.942237739941086e-05, + "loss": 1.5542, + "step": 93870 + }, + { + "epoch": 0.5997725617469303, + "grad_norm": 1.024461030960083, + "learning_rate": 7.941832027533948e-05, + "loss": 0.947, + "step": 93880 + }, + { + "epoch": 0.5998364488966689, + "grad_norm": 1.201267957687378, + "learning_rate": 7.941426285500016e-05, + "loss": 0.7646, + "step": 93890 + }, + { + "epoch": 0.5999003360464076, + "grad_norm": 1.2446759939193726, + "learning_rate": 7.941020513843376e-05, + "loss": 0.8437, + "step": 93900 + }, + { + "epoch": 0.5999642231961463, + "grad_norm": 0.7842534780502319, + "learning_rate": 7.940614712568115e-05, + "loss": 0.6985, + "step": 93910 + }, + { + "epoch": 0.600028110345885, + "grad_norm": 1.0466797351837158, + "learning_rate": 7.940208881678322e-05, + "loss": 0.9707, + "step": 93920 + }, + { + "epoch": 0.6000919974956237, + "grad_norm": 0.7793298363685608, + "learning_rate": 7.939803021178078e-05, + "loss": 0.9014, + "step": 93930 + }, + { + "epoch": 0.6001558846453624, + "grad_norm": 0.6594678163528442, + "learning_rate": 7.939397131071478e-05, + "loss": 0.8811, + "step": 93940 + }, + { + "epoch": 0.6002197717951011, + "grad_norm": 0.7732535004615784, + "learning_rate": 7.938991211362602e-05, + "loss": 0.9198, + "step": 93950 + }, + { + "epoch": 0.6002836589448398, + "grad_norm": 0.6953932642936707, + "learning_rate": 7.938585262055546e-05, + "loss": 0.8225, + "step": 93960 + }, + { + "epoch": 0.6003475460945785, + "grad_norm": 0.4991307556629181, + "learning_rate": 7.938179283154392e-05, + "loss": 0.7595, + "step": 93970 + }, + { + "epoch": 0.6004114332443172, + "grad_norm": 
1.582554817199707, + "learning_rate": 7.937773274663231e-05, + "loss": 0.9514, + "step": 93980 + }, + { + "epoch": 0.6004753203940559, + "grad_norm": 0.8112611770629883, + "learning_rate": 7.937367236586153e-05, + "loss": 0.9182, + "step": 93990 + }, + { + "epoch": 0.6005392075437946, + "grad_norm": 0.8769091367721558, + "learning_rate": 7.936961168927244e-05, + "loss": 1.1743, + "step": 94000 + }, + { + "epoch": 0.6006030946935333, + "grad_norm": 0.733625054359436, + "learning_rate": 7.936555071690597e-05, + "loss": 0.9444, + "step": 94010 + }, + { + "epoch": 0.600666981843272, + "grad_norm": 0.9377986192703247, + "learning_rate": 7.936148944880297e-05, + "loss": 0.7817, + "step": 94020 + }, + { + "epoch": 0.6007308689930108, + "grad_norm": 0.7029353380203247, + "learning_rate": 7.935742788500438e-05, + "loss": 0.8874, + "step": 94030 + }, + { + "epoch": 0.6007947561427495, + "grad_norm": 1.3966723680496216, + "learning_rate": 7.93533660255511e-05, + "loss": 1.0347, + "step": 94040 + }, + { + "epoch": 0.6008586432924882, + "grad_norm": 0.5431897044181824, + "learning_rate": 7.934930387048405e-05, + "loss": 0.8037, + "step": 94050 + }, + { + "epoch": 0.6009225304422269, + "grad_norm": 0.6450621485710144, + "learning_rate": 7.93452414198441e-05, + "loss": 0.8367, + "step": 94060 + }, + { + "epoch": 0.6009864175919656, + "grad_norm": 0.6869795322418213, + "learning_rate": 7.93411786736722e-05, + "loss": 1.1556, + "step": 94070 + }, + { + "epoch": 0.6010503047417043, + "grad_norm": 1.2084323167800903, + "learning_rate": 7.93371156320092e-05, + "loss": 0.7216, + "step": 94080 + }, + { + "epoch": 0.601114191891443, + "grad_norm": 0.8116541504859924, + "learning_rate": 7.93330522948961e-05, + "loss": 0.9562, + "step": 94090 + }, + { + "epoch": 0.6011780790411817, + "grad_norm": 1.043238878250122, + "learning_rate": 7.932898866237378e-05, + "loss": 0.7446, + "step": 94100 + }, + { + "epoch": 0.6012419661909204, + "grad_norm": 1.0671076774597168, + "learning_rate": 
7.932492473448318e-05, + "loss": 0.952, + "step": 94110 + }, + { + "epoch": 0.6013058533406591, + "grad_norm": 1.1957184076309204, + "learning_rate": 7.932086051126521e-05, + "loss": 0.8455, + "step": 94120 + }, + { + "epoch": 0.6013697404903977, + "grad_norm": 0.6923540830612183, + "learning_rate": 7.931679599276081e-05, + "loss": 0.8397, + "step": 94130 + }, + { + "epoch": 0.6014336276401364, + "grad_norm": 0.699149489402771, + "learning_rate": 7.931273117901091e-05, + "loss": 0.8874, + "step": 94140 + }, + { + "epoch": 0.6014975147898751, + "grad_norm": 2.1960628032684326, + "learning_rate": 7.930866607005643e-05, + "loss": 0.811, + "step": 94150 + }, + { + "epoch": 0.6015614019396138, + "grad_norm": 0.6790569424629211, + "learning_rate": 7.930460066593836e-05, + "loss": 1.0083, + "step": 94160 + }, + { + "epoch": 0.6016252890893525, + "grad_norm": 0.6594065427780151, + "learning_rate": 7.930053496669758e-05, + "loss": 0.9659, + "step": 94170 + }, + { + "epoch": 0.6016891762390912, + "grad_norm": 0.7281797528266907, + "learning_rate": 7.929646897237509e-05, + "loss": 0.8501, + "step": 94180 + }, + { + "epoch": 0.60175306338883, + "grad_norm": 1.0084353685379028, + "learning_rate": 7.929240268301179e-05, + "loss": 1.0269, + "step": 94190 + }, + { + "epoch": 0.6018169505385687, + "grad_norm": 0.6016590595245361, + "learning_rate": 7.928833609864867e-05, + "loss": 1.0357, + "step": 94200 + }, + { + "epoch": 0.6018808376883074, + "grad_norm": 0.7156760692596436, + "learning_rate": 7.928426921932665e-05, + "loss": 0.7571, + "step": 94210 + }, + { + "epoch": 0.6019447248380461, + "grad_norm": 0.6921888589859009, + "learning_rate": 7.928020204508673e-05, + "loss": 0.7635, + "step": 94220 + }, + { + "epoch": 0.6020086119877848, + "grad_norm": 0.9318345189094543, + "learning_rate": 7.927613457596983e-05, + "loss": 0.8359, + "step": 94230 + }, + { + "epoch": 0.6020724991375235, + "grad_norm": 1.3739084005355835, + "learning_rate": 7.927206681201693e-05, + "loss": 1.111, + 
"step": 94240 + }, + { + "epoch": 0.6021363862872622, + "grad_norm": 1.0711864233016968, + "learning_rate": 7.926799875326898e-05, + "loss": 0.941, + "step": 94250 + }, + { + "epoch": 0.6022002734370009, + "grad_norm": 0.7577251195907593, + "learning_rate": 7.926393039976698e-05, + "loss": 0.767, + "step": 94260 + }, + { + "epoch": 0.6022641605867396, + "grad_norm": 1.2173386812210083, + "learning_rate": 7.925986175155188e-05, + "loss": 0.9573, + "step": 94270 + }, + { + "epoch": 0.6023280477364783, + "grad_norm": 0.8785862922668457, + "learning_rate": 7.925579280866465e-05, + "loss": 0.9249, + "step": 94280 + }, + { + "epoch": 0.602391934886217, + "grad_norm": 0.6809684038162231, + "learning_rate": 7.925172357114628e-05, + "loss": 0.7592, + "step": 94290 + }, + { + "epoch": 0.6024558220359557, + "grad_norm": 0.7136757373809814, + "learning_rate": 7.924765403903775e-05, + "loss": 0.8912, + "step": 94300 + }, + { + "epoch": 0.6025197091856944, + "grad_norm": 1.4909855127334595, + "learning_rate": 7.924358421238005e-05, + "loss": 0.7308, + "step": 94310 + }, + { + "epoch": 0.6025835963354331, + "grad_norm": 0.7292258739471436, + "learning_rate": 7.923951409121416e-05, + "loss": 0.8375, + "step": 94320 + }, + { + "epoch": 0.6026474834851718, + "grad_norm": 0.7449434995651245, + "learning_rate": 7.923544367558104e-05, + "loss": 0.7606, + "step": 94330 + }, + { + "epoch": 0.6027113706349105, + "grad_norm": 1.1462843418121338, + "learning_rate": 7.923137296552174e-05, + "loss": 0.9557, + "step": 94340 + }, + { + "epoch": 0.6027752577846492, + "grad_norm": 1.5585012435913086, + "learning_rate": 7.92273019610772e-05, + "loss": 0.9248, + "step": 94350 + }, + { + "epoch": 0.602839144934388, + "grad_norm": 0.7382988929748535, + "learning_rate": 7.922323066228845e-05, + "loss": 0.7405, + "step": 94360 + }, + { + "epoch": 0.6029030320841265, + "grad_norm": 0.7861965894699097, + "learning_rate": 7.92191590691965e-05, + "loss": 0.8206, + "step": 94370 + }, + { + "epoch": 
0.6029669192338653, + "grad_norm": 0.8503504395484924, + "learning_rate": 7.921508718184233e-05, + "loss": 0.7665, + "step": 94380 + }, + { + "epoch": 0.603030806383604, + "grad_norm": 1.1574392318725586, + "learning_rate": 7.921101500026695e-05, + "loss": 0.8627, + "step": 94390 + }, + { + "epoch": 0.6030946935333427, + "grad_norm": 1.1855578422546387, + "learning_rate": 7.92069425245114e-05, + "loss": 0.8954, + "step": 94400 + }, + { + "epoch": 0.6031585806830814, + "grad_norm": 0.5956348776817322, + "learning_rate": 7.920286975461665e-05, + "loss": 1.0047, + "step": 94410 + }, + { + "epoch": 0.6032224678328201, + "grad_norm": 1.5525617599487305, + "learning_rate": 7.919879669062376e-05, + "loss": 0.9846, + "step": 94420 + }, + { + "epoch": 0.6032863549825588, + "grad_norm": 0.8425145745277405, + "learning_rate": 7.919472333257369e-05, + "loss": 0.7559, + "step": 94430 + }, + { + "epoch": 0.6033502421322975, + "grad_norm": 3.6817235946655273, + "learning_rate": 7.919064968050753e-05, + "loss": 0.8537, + "step": 94440 + }, + { + "epoch": 0.6034141292820362, + "grad_norm": 1.1978166103363037, + "learning_rate": 7.918657573446626e-05, + "loss": 0.7715, + "step": 94450 + }, + { + "epoch": 0.6034780164317749, + "grad_norm": 1.2537955045700073, + "learning_rate": 7.918250149449093e-05, + "loss": 0.9165, + "step": 94460 + }, + { + "epoch": 0.6035419035815136, + "grad_norm": 0.7937589883804321, + "learning_rate": 7.917842696062257e-05, + "loss": 1.1595, + "step": 94470 + }, + { + "epoch": 0.6036057907312523, + "grad_norm": 0.7704179883003235, + "learning_rate": 7.917435213290218e-05, + "loss": 0.7351, + "step": 94480 + }, + { + "epoch": 0.603669677880991, + "grad_norm": 0.5766093730926514, + "learning_rate": 7.917027701137085e-05, + "loss": 0.763, + "step": 94490 + }, + { + "epoch": 0.6037335650307297, + "grad_norm": 1.0193499326705933, + "learning_rate": 7.916620159606958e-05, + "loss": 0.85, + "step": 94500 + }, + { + "epoch": 0.6037974521804684, + "grad_norm": 
0.7235758304595947, + "learning_rate": 7.916212588703944e-05, + "loss": 0.9735, + "step": 94510 + }, + { + "epoch": 0.6038613393302071, + "grad_norm": 1.022512435913086, + "learning_rate": 7.915804988432146e-05, + "loss": 0.9828, + "step": 94520 + }, + { + "epoch": 0.6039252264799458, + "grad_norm": 1.0742284059524536, + "learning_rate": 7.915397358795669e-05, + "loss": 0.8061, + "step": 94530 + }, + { + "epoch": 0.6039891136296845, + "grad_norm": 0.8321564793586731, + "learning_rate": 7.914989699798618e-05, + "loss": 0.7689, + "step": 94540 + }, + { + "epoch": 0.6040530007794233, + "grad_norm": 1.0344573259353638, + "learning_rate": 7.9145820114451e-05, + "loss": 0.6514, + "step": 94550 + }, + { + "epoch": 0.604116887929162, + "grad_norm": 0.8881844878196716, + "learning_rate": 7.914174293739221e-05, + "loss": 0.8515, + "step": 94560 + }, + { + "epoch": 0.6041807750789007, + "grad_norm": 0.4412252604961395, + "learning_rate": 7.913766546685083e-05, + "loss": 0.824, + "step": 94570 + }, + { + "epoch": 0.6042446622286394, + "grad_norm": 0.8809495568275452, + "learning_rate": 7.913358770286796e-05, + "loss": 0.7449, + "step": 94580 + }, + { + "epoch": 0.6043085493783781, + "grad_norm": 0.5300993919372559, + "learning_rate": 7.912950964548466e-05, + "loss": 0.7912, + "step": 94590 + }, + { + "epoch": 0.6043724365281168, + "grad_norm": 1.1538459062576294, + "learning_rate": 7.9125431294742e-05, + "loss": 0.9233, + "step": 94600 + }, + { + "epoch": 0.6044363236778555, + "grad_norm": 1.1883444786071777, + "learning_rate": 7.912135265068104e-05, + "loss": 0.9529, + "step": 94610 + }, + { + "epoch": 0.6045002108275941, + "grad_norm": 0.4078890085220337, + "learning_rate": 7.911727371334285e-05, + "loss": 0.8081, + "step": 94620 + }, + { + "epoch": 0.6045640979773328, + "grad_norm": 0.7821246981620789, + "learning_rate": 7.911319448276855e-05, + "loss": 0.71, + "step": 94630 + }, + { + "epoch": 0.6046279851270715, + "grad_norm": 1.401476502418518, + "learning_rate": 
7.910911495899919e-05, + "loss": 0.7582, + "step": 94640 + }, + { + "epoch": 0.6046918722768102, + "grad_norm": 0.75636887550354, + "learning_rate": 7.910503514207585e-05, + "loss": 0.8437, + "step": 94650 + }, + { + "epoch": 0.6047557594265489, + "grad_norm": 0.6574771404266357, + "learning_rate": 7.910095503203964e-05, + "loss": 0.701, + "step": 94660 + }, + { + "epoch": 0.6048196465762876, + "grad_norm": 0.7583115100860596, + "learning_rate": 7.909687462893163e-05, + "loss": 0.6434, + "step": 94670 + }, + { + "epoch": 0.6048835337260263, + "grad_norm": 0.9831967353820801, + "learning_rate": 7.909279393279292e-05, + "loss": 0.8715, + "step": 94680 + }, + { + "epoch": 0.604947420875765, + "grad_norm": 0.7744137048721313, + "learning_rate": 7.908871294366461e-05, + "loss": 0.9322, + "step": 94690 + }, + { + "epoch": 0.6050113080255037, + "grad_norm": 0.7049340605735779, + "learning_rate": 7.90846316615878e-05, + "loss": 0.7142, + "step": 94700 + }, + { + "epoch": 0.6050751951752424, + "grad_norm": 1.377447247505188, + "learning_rate": 7.908055008660358e-05, + "loss": 0.7331, + "step": 94710 + }, + { + "epoch": 0.6051390823249811, + "grad_norm": 0.7816984057426453, + "learning_rate": 7.907646821875305e-05, + "loss": 0.8407, + "step": 94720 + }, + { + "epoch": 0.6052029694747199, + "grad_norm": 0.7210595011711121, + "learning_rate": 7.907238605807734e-05, + "loss": 0.8329, + "step": 94730 + }, + { + "epoch": 0.6052668566244586, + "grad_norm": 0.8540478944778442, + "learning_rate": 7.906830360461757e-05, + "loss": 1.0039, + "step": 94740 + }, + { + "epoch": 0.6053307437741973, + "grad_norm": 1.395521879196167, + "learning_rate": 7.906422085841481e-05, + "loss": 0.8576, + "step": 94750 + }, + { + "epoch": 0.605394630923936, + "grad_norm": 1.551674246788025, + "learning_rate": 7.906013781951022e-05, + "loss": 0.8292, + "step": 94760 + }, + { + "epoch": 0.6054585180736747, + "grad_norm": 0.5742878317832947, + "learning_rate": 7.905605448794489e-05, + "loss": 0.7943, + 
"step": 94770 + }, + { + "epoch": 0.6055224052234134, + "grad_norm": 1.2160093784332275, + "learning_rate": 7.905197086375995e-05, + "loss": 0.7231, + "step": 94780 + }, + { + "epoch": 0.6055862923731521, + "grad_norm": 0.824417769908905, + "learning_rate": 7.904788694699654e-05, + "loss": 0.7877, + "step": 94790 + }, + { + "epoch": 0.6056501795228908, + "grad_norm": 0.9510350227355957, + "learning_rate": 7.904380273769578e-05, + "loss": 0.9848, + "step": 94800 + }, + { + "epoch": 0.6057140666726295, + "grad_norm": 1.160369634628296, + "learning_rate": 7.90397182358988e-05, + "loss": 0.7315, + "step": 94810 + }, + { + "epoch": 0.6057779538223682, + "grad_norm": 0.949501633644104, + "learning_rate": 7.903563344164673e-05, + "loss": 0.9625, + "step": 94820 + }, + { + "epoch": 0.6058418409721069, + "grad_norm": 1.4712127447128296, + "learning_rate": 7.90315483549807e-05, + "loss": 0.8038, + "step": 94830 + }, + { + "epoch": 0.6059057281218456, + "grad_norm": 1.125845193862915, + "learning_rate": 7.902746297594187e-05, + "loss": 0.9969, + "step": 94840 + }, + { + "epoch": 0.6059696152715843, + "grad_norm": 1.1382551193237305, + "learning_rate": 7.90233773045714e-05, + "loss": 1.1508, + "step": 94850 + }, + { + "epoch": 0.6060335024213229, + "grad_norm": 0.8378157615661621, + "learning_rate": 7.901929134091038e-05, + "loss": 0.8542, + "step": 94860 + }, + { + "epoch": 0.6060973895710616, + "grad_norm": 1.001591682434082, + "learning_rate": 7.9015205085e-05, + "loss": 0.81, + "step": 94870 + }, + { + "epoch": 0.6061612767208003, + "grad_norm": 0.8801831603050232, + "learning_rate": 7.901111853688141e-05, + "loss": 1.3241, + "step": 94880 + }, + { + "epoch": 0.606225163870539, + "grad_norm": 0.739589273929596, + "learning_rate": 7.900703169659574e-05, + "loss": 0.7344, + "step": 94890 + }, + { + "epoch": 0.6062890510202777, + "grad_norm": 0.6854998469352722, + "learning_rate": 7.900294456418418e-05, + "loss": 0.7173, + "step": 94900 + }, + { + "epoch": 0.6063529381700165, 
+ "grad_norm": 0.9052863121032715, + "learning_rate": 7.899885713968789e-05, + "loss": 0.8141, + "step": 94910 + }, + { + "epoch": 0.6064168253197552, + "grad_norm": 0.6532935500144958, + "learning_rate": 7.8994769423148e-05, + "loss": 0.8135, + "step": 94920 + }, + { + "epoch": 0.6064807124694939, + "grad_norm": 1.091203212738037, + "learning_rate": 7.89906814146057e-05, + "loss": 0.9978, + "step": 94930 + }, + { + "epoch": 0.6065445996192326, + "grad_norm": 0.9277145266532898, + "learning_rate": 7.898659311410218e-05, + "loss": 0.8818, + "step": 94940 + }, + { + "epoch": 0.6066084867689713, + "grad_norm": 1.1531500816345215, + "learning_rate": 7.898250452167856e-05, + "loss": 0.7621, + "step": 94950 + }, + { + "epoch": 0.60667237391871, + "grad_norm": 0.7892251014709473, + "learning_rate": 7.897841563737605e-05, + "loss": 0.9858, + "step": 94960 + }, + { + "epoch": 0.6067362610684487, + "grad_norm": 0.8957139849662781, + "learning_rate": 7.897432646123583e-05, + "loss": 0.8984, + "step": 94970 + }, + { + "epoch": 0.6068001482181874, + "grad_norm": 1.0226024389266968, + "learning_rate": 7.89702369932991e-05, + "loss": 0.902, + "step": 94980 + }, + { + "epoch": 0.6068640353679261, + "grad_norm": 1.1574631929397583, + "learning_rate": 7.8966147233607e-05, + "loss": 0.9923, + "step": 94990 + }, + { + "epoch": 0.6069279225176648, + "grad_norm": 0.5747475624084473, + "learning_rate": 7.896205718220073e-05, + "loss": 0.8992, + "step": 95000 + }, + { + "epoch": 0.6069918096674035, + "grad_norm": 0.7874606847763062, + "learning_rate": 7.895796683912148e-05, + "loss": 0.9888, + "step": 95010 + }, + { + "epoch": 0.6070556968171422, + "grad_norm": 2.17077898979187, + "learning_rate": 7.895387620441049e-05, + "loss": 0.8266, + "step": 95020 + }, + { + "epoch": 0.6071195839668809, + "grad_norm": 1.0135728120803833, + "learning_rate": 7.894978527810889e-05, + "loss": 0.7683, + "step": 95030 + }, + { + "epoch": 0.6071834711166196, + "grad_norm": 1.4031306505203247, + 
"learning_rate": 7.894569406025791e-05, + "loss": 0.8458, + "step": 95040 + }, + { + "epoch": 0.6072473582663583, + "grad_norm": 2.591813325881958, + "learning_rate": 7.894160255089876e-05, + "loss": 0.8058, + "step": 95050 + }, + { + "epoch": 0.607311245416097, + "grad_norm": 0.827282726764679, + "learning_rate": 7.893751075007263e-05, + "loss": 1.1121, + "step": 95060 + }, + { + "epoch": 0.6073751325658358, + "grad_norm": 0.9045417904853821, + "learning_rate": 7.893341865782073e-05, + "loss": 0.9837, + "step": 95070 + }, + { + "epoch": 0.6074390197155745, + "grad_norm": 1.0362218618392944, + "learning_rate": 7.892932627418428e-05, + "loss": 0.9518, + "step": 95080 + }, + { + "epoch": 0.6075029068653132, + "grad_norm": 0.9262713193893433, + "learning_rate": 7.892523359920447e-05, + "loss": 0.7016, + "step": 95090 + }, + { + "epoch": 0.6075667940150518, + "grad_norm": 0.7739233374595642, + "learning_rate": 7.892114063292256e-05, + "loss": 0.726, + "step": 95100 + }, + { + "epoch": 0.6076306811647905, + "grad_norm": 0.9387316703796387, + "learning_rate": 7.891704737537972e-05, + "loss": 0.9781, + "step": 95110 + }, + { + "epoch": 0.6076945683145292, + "grad_norm": 0.8391212821006775, + "learning_rate": 7.89129538266172e-05, + "loss": 0.7901, + "step": 95120 + }, + { + "epoch": 0.6077584554642679, + "grad_norm": 1.2274523973464966, + "learning_rate": 7.890885998667623e-05, + "loss": 0.7709, + "step": 95130 + }, + { + "epoch": 0.6078223426140066, + "grad_norm": 2.197125196456909, + "learning_rate": 7.890476585559802e-05, + "loss": 0.8797, + "step": 95140 + }, + { + "epoch": 0.6078862297637453, + "grad_norm": 0.8918281197547913, + "learning_rate": 7.890067143342381e-05, + "loss": 0.9798, + "step": 95150 + }, + { + "epoch": 0.607950116913484, + "grad_norm": 0.9237262010574341, + "learning_rate": 7.889657672019483e-05, + "loss": 0.9019, + "step": 95160 + }, + { + "epoch": 0.6080140040632227, + "grad_norm": 2.1953394412994385, + "learning_rate": 7.889248171595235e-05, + 
"loss": 1.0639, + "step": 95170 + }, + { + "epoch": 0.6080778912129614, + "grad_norm": 1.0482672452926636, + "learning_rate": 7.888838642073757e-05, + "loss": 1.0944, + "step": 95180 + }, + { + "epoch": 0.6081417783627001, + "grad_norm": 0.8295831084251404, + "learning_rate": 7.888429083459175e-05, + "loss": 0.7337, + "step": 95190 + }, + { + "epoch": 0.6082056655124388, + "grad_norm": 0.771742045879364, + "learning_rate": 7.888019495755612e-05, + "loss": 0.8807, + "step": 95200 + }, + { + "epoch": 0.6082695526621775, + "grad_norm": 0.9289408922195435, + "learning_rate": 7.887609878967195e-05, + "loss": 0.8625, + "step": 95210 + }, + { + "epoch": 0.6083334398119162, + "grad_norm": 0.9988054633140564, + "learning_rate": 7.887200233098049e-05, + "loss": 0.992, + "step": 95220 + }, + { + "epoch": 0.6083973269616549, + "grad_norm": 1.2625335454940796, + "learning_rate": 7.8867905581523e-05, + "loss": 0.798, + "step": 95230 + }, + { + "epoch": 0.6084612141113936, + "grad_norm": 0.8305104374885559, + "learning_rate": 7.886421825844037e-05, + "loss": 0.9033, + "step": 95240 + }, + { + "epoch": 0.6085251012611324, + "grad_norm": 0.540634274482727, + "learning_rate": 7.886012095664107e-05, + "loss": 1.109, + "step": 95250 + }, + { + "epoch": 0.6085889884108711, + "grad_norm": 0.7756277918815613, + "learning_rate": 7.885602336419534e-05, + "loss": 1.0592, + "step": 95260 + }, + { + "epoch": 0.6086528755606098, + "grad_norm": 0.8292693495750427, + "learning_rate": 7.885192548114453e-05, + "loss": 0.9055, + "step": 95270 + }, + { + "epoch": 0.6087167627103485, + "grad_norm": 1.1361613273620605, + "learning_rate": 7.884782730752984e-05, + "loss": 0.9744, + "step": 95280 + }, + { + "epoch": 0.6087806498600872, + "grad_norm": 0.6474214792251587, + "learning_rate": 7.88437288433926e-05, + "loss": 0.8828, + "step": 95290 + }, + { + "epoch": 0.6088445370098259, + "grad_norm": 0.8289141058921814, + "learning_rate": 7.883963008877404e-05, + "loss": 0.8699, + "step": 95300 + }, + { + 
"epoch": 0.6089084241595646, + "grad_norm": 1.024628758430481, + "learning_rate": 7.883553104371547e-05, + "loss": 0.8444, + "step": 95310 + }, + { + "epoch": 0.6089723113093033, + "grad_norm": 0.7327681183815002, + "learning_rate": 7.883143170825815e-05, + "loss": 0.883, + "step": 95320 + }, + { + "epoch": 0.609036198459042, + "grad_norm": 1.0134698152542114, + "learning_rate": 7.882733208244337e-05, + "loss": 0.8742, + "step": 95330 + }, + { + "epoch": 0.6091000856087806, + "grad_norm": 1.6711806058883667, + "learning_rate": 7.882323216631241e-05, + "loss": 1.0554, + "step": 95340 + }, + { + "epoch": 0.6091639727585193, + "grad_norm": 0.8501431345939636, + "learning_rate": 7.881913195990658e-05, + "loss": 0.9567, + "step": 95350 + }, + { + "epoch": 0.609227859908258, + "grad_norm": 1.3267399072647095, + "learning_rate": 7.881503146326714e-05, + "loss": 0.763, + "step": 95360 + }, + { + "epoch": 0.6092917470579967, + "grad_norm": 1.0088346004486084, + "learning_rate": 7.881093067643541e-05, + "loss": 1.0992, + "step": 95370 + }, + { + "epoch": 0.6093556342077354, + "grad_norm": 0.7467685341835022, + "learning_rate": 7.88068295994527e-05, + "loss": 0.9101, + "step": 95380 + }, + { + "epoch": 0.6094195213574741, + "grad_norm": 1.0090692043304443, + "learning_rate": 7.880272823236027e-05, + "loss": 0.9231, + "step": 95390 + }, + { + "epoch": 0.6094834085072128, + "grad_norm": 0.705679178237915, + "learning_rate": 7.879862657519948e-05, + "loss": 0.7734, + "step": 95400 + }, + { + "epoch": 0.6095472956569515, + "grad_norm": 1.0415382385253906, + "learning_rate": 7.879452462801158e-05, + "loss": 0.767, + "step": 95410 + }, + { + "epoch": 0.6096111828066902, + "grad_norm": 1.0252981185913086, + "learning_rate": 7.879042239083792e-05, + "loss": 0.775, + "step": 95420 + }, + { + "epoch": 0.609675069956429, + "grad_norm": 1.6463427543640137, + "learning_rate": 7.878631986371978e-05, + "loss": 0.856, + "step": 95430 + }, + { + "epoch": 0.6097389571061677, + "grad_norm": 
0.9976633191108704, + "learning_rate": 7.878221704669852e-05, + "loss": 0.8753, + "step": 95440 + }, + { + "epoch": 0.6098028442559064, + "grad_norm": 0.9319173693656921, + "learning_rate": 7.877811393981542e-05, + "loss": 0.7858, + "step": 95450 + }, + { + "epoch": 0.6098667314056451, + "grad_norm": 0.7984362840652466, + "learning_rate": 7.877401054311182e-05, + "loss": 0.7756, + "step": 95460 + }, + { + "epoch": 0.6099306185553838, + "grad_norm": 1.3641668558120728, + "learning_rate": 7.876990685662903e-05, + "loss": 0.8173, + "step": 95470 + }, + { + "epoch": 0.6099945057051225, + "grad_norm": 1.6693613529205322, + "learning_rate": 7.87658028804084e-05, + "loss": 0.7144, + "step": 95480 + }, + { + "epoch": 0.6100583928548612, + "grad_norm": 1.5207191705703735, + "learning_rate": 7.876169861449125e-05, + "loss": 0.8905, + "step": 95490 + }, + { + "epoch": 0.6101222800045999, + "grad_norm": 0.7093350887298584, + "learning_rate": 7.875759405891891e-05, + "loss": 1.0028, + "step": 95500 + }, + { + "epoch": 0.6101861671543386, + "grad_norm": 0.6331580281257629, + "learning_rate": 7.875348921373271e-05, + "loss": 0.699, + "step": 95510 + }, + { + "epoch": 0.6102500543040773, + "grad_norm": 0.6561687588691711, + "learning_rate": 7.8749384078974e-05, + "loss": 0.826, + "step": 95520 + }, + { + "epoch": 0.610313941453816, + "grad_norm": 1.0018503665924072, + "learning_rate": 7.874527865468414e-05, + "loss": 0.8969, + "step": 95530 + }, + { + "epoch": 0.6103778286035547, + "grad_norm": 0.9253289103507996, + "learning_rate": 7.874117294090445e-05, + "loss": 0.8585, + "step": 95540 + }, + { + "epoch": 0.6104417157532934, + "grad_norm": 0.924781858921051, + "learning_rate": 7.873706693767626e-05, + "loss": 1.0908, + "step": 95550 + }, + { + "epoch": 0.6105056029030321, + "grad_norm": 0.8251575231552124, + "learning_rate": 7.873296064504096e-05, + "loss": 0.7825, + "step": 95560 + }, + { + "epoch": 0.6105694900527708, + "grad_norm": 1.3431663513183594, + "learning_rate": 
7.87288540630399e-05, + "loss": 0.9041, + "step": 95570 + }, + { + "epoch": 0.6106333772025095, + "grad_norm": 0.9507653117179871, + "learning_rate": 7.872474719171441e-05, + "loss": 0.8849, + "step": 95580 + }, + { + "epoch": 0.6106972643522481, + "grad_norm": 0.655210018157959, + "learning_rate": 7.872064003110589e-05, + "loss": 0.7671, + "step": 95590 + }, + { + "epoch": 0.6107611515019868, + "grad_norm": 0.8236376047134399, + "learning_rate": 7.871653258125564e-05, + "loss": 0.7553, + "step": 95600 + }, + { + "epoch": 0.6108250386517255, + "grad_norm": 1.1168328523635864, + "learning_rate": 7.871242484220509e-05, + "loss": 0.9552, + "step": 95610 + }, + { + "epoch": 0.6108889258014643, + "grad_norm": 0.9979621767997742, + "learning_rate": 7.870831681399558e-05, + "loss": 0.8804, + "step": 95620 + }, + { + "epoch": 0.610952812951203, + "grad_norm": 0.9995028972625732, + "learning_rate": 7.870420849666847e-05, + "loss": 0.9085, + "step": 95630 + }, + { + "epoch": 0.6110167001009417, + "grad_norm": 0.6226816773414612, + "learning_rate": 7.870009989026516e-05, + "loss": 0.8221, + "step": 95640 + }, + { + "epoch": 0.6110805872506804, + "grad_norm": 0.6372109055519104, + "learning_rate": 7.8695990994827e-05, + "loss": 0.7805, + "step": 95650 + }, + { + "epoch": 0.6111444744004191, + "grad_norm": 0.5445452928543091, + "learning_rate": 7.86918818103954e-05, + "loss": 0.8984, + "step": 95660 + }, + { + "epoch": 0.6112083615501578, + "grad_norm": 1.3520268201828003, + "learning_rate": 7.868777233701174e-05, + "loss": 0.6649, + "step": 95670 + }, + { + "epoch": 0.6112722486998965, + "grad_norm": 3.7762889862060547, + "learning_rate": 7.868366257471737e-05, + "loss": 1.2053, + "step": 95680 + }, + { + "epoch": 0.6113361358496352, + "grad_norm": 0.8047236204147339, + "learning_rate": 7.867955252355371e-05, + "loss": 0.8838, + "step": 95690 + }, + { + "epoch": 0.6114000229993739, + "grad_norm": 0.9412931799888611, + "learning_rate": 7.867544218356215e-05, + "loss": 1.0598, + 
"step": 95700 + }, + { + "epoch": 0.6114639101491126, + "grad_norm": 0.8420425057411194, + "learning_rate": 7.867133155478408e-05, + "loss": 1.0705, + "step": 95710 + }, + { + "epoch": 0.6115277972988513, + "grad_norm": 3.140885353088379, + "learning_rate": 7.866722063726089e-05, + "loss": 1.1464, + "step": 95720 + }, + { + "epoch": 0.61159168444859, + "grad_norm": 1.0284521579742432, + "learning_rate": 7.866310943103399e-05, + "loss": 1.22, + "step": 95730 + }, + { + "epoch": 0.6116555715983287, + "grad_norm": 0.5975868105888367, + "learning_rate": 7.865899793614478e-05, + "loss": 0.67, + "step": 95740 + }, + { + "epoch": 0.6117194587480674, + "grad_norm": 0.9867643117904663, + "learning_rate": 7.865488615263467e-05, + "loss": 0.8758, + "step": 95750 + }, + { + "epoch": 0.6117833458978061, + "grad_norm": 1.000255823135376, + "learning_rate": 7.865077408054507e-05, + "loss": 0.8155, + "step": 95760 + }, + { + "epoch": 0.6118472330475448, + "grad_norm": 0.6636435389518738, + "learning_rate": 7.864666171991736e-05, + "loss": 0.8283, + "step": 95770 + }, + { + "epoch": 0.6119111201972836, + "grad_norm": 0.43315425515174866, + "learning_rate": 7.864254907079302e-05, + "loss": 0.7155, + "step": 95780 + }, + { + "epoch": 0.6119750073470223, + "grad_norm": 0.7825480103492737, + "learning_rate": 7.863843613321342e-05, + "loss": 0.7891, + "step": 95790 + }, + { + "epoch": 0.612038894496761, + "grad_norm": 1.0271575450897217, + "learning_rate": 7.863432290722e-05, + "loss": 1.0863, + "step": 95800 + }, + { + "epoch": 0.6121027816464997, + "grad_norm": 0.8053982853889465, + "learning_rate": 7.863020939285415e-05, + "loss": 1.0562, + "step": 95810 + }, + { + "epoch": 0.6121666687962384, + "grad_norm": 0.6690106987953186, + "learning_rate": 7.862609559015735e-05, + "loss": 0.9627, + "step": 95820 + }, + { + "epoch": 0.612230555945977, + "grad_norm": 0.6722576022148132, + "learning_rate": 7.862198149917099e-05, + "loss": 0.9642, + "step": 95830 + }, + { + "epoch": 
0.6122944430957157, + "grad_norm": 1.5476411581039429, + "learning_rate": 7.86178671199365e-05, + "loss": 0.9018, + "step": 95840 + }, + { + "epoch": 0.6123583302454544, + "grad_norm": 0.9946299195289612, + "learning_rate": 7.861375245249536e-05, + "loss": 0.8733, + "step": 95850 + }, + { + "epoch": 0.6124222173951931, + "grad_norm": 0.9345956444740295, + "learning_rate": 7.860963749688897e-05, + "loss": 0.6437, + "step": 95860 + }, + { + "epoch": 0.6124861045449318, + "grad_norm": 0.6753872036933899, + "learning_rate": 7.860552225315877e-05, + "loss": 0.8364, + "step": 95870 + }, + { + "epoch": 0.6125499916946705, + "grad_norm": 0.6145283579826355, + "learning_rate": 7.860140672134622e-05, + "loss": 0.7866, + "step": 95880 + }, + { + "epoch": 0.6126138788444092, + "grad_norm": 1.010877251625061, + "learning_rate": 7.859729090149275e-05, + "loss": 0.9073, + "step": 95890 + }, + { + "epoch": 0.6126777659941479, + "grad_norm": 0.9995620250701904, + "learning_rate": 7.859317479363983e-05, + "loss": 0.8959, + "step": 95900 + }, + { + "epoch": 0.6127416531438866, + "grad_norm": 0.7354846596717834, + "learning_rate": 7.85890583978289e-05, + "loss": 1.1287, + "step": 95910 + }, + { + "epoch": 0.6128055402936253, + "grad_norm": 1.1408978700637817, + "learning_rate": 7.858494171410144e-05, + "loss": 0.9378, + "step": 95920 + }, + { + "epoch": 0.612869427443364, + "grad_norm": 0.7992193698883057, + "learning_rate": 7.858082474249886e-05, + "loss": 0.7438, + "step": 95930 + }, + { + "epoch": 0.6129333145931027, + "grad_norm": 0.8835758566856384, + "learning_rate": 7.857670748306267e-05, + "loss": 0.8875, + "step": 95940 + }, + { + "epoch": 0.6129972017428414, + "grad_norm": 1.195634365081787, + "learning_rate": 7.857258993583429e-05, + "loss": 0.9826, + "step": 95950 + }, + { + "epoch": 0.6130610888925802, + "grad_norm": 0.5840584635734558, + "learning_rate": 7.856847210085523e-05, + "loss": 0.7557, + "step": 95960 + }, + { + "epoch": 0.6131249760423189, + "grad_norm": 
0.8779674768447876, + "learning_rate": 7.856435397816693e-05, + "loss": 1.0267, + "step": 95970 + }, + { + "epoch": 0.6131888631920576, + "grad_norm": 0.7890565991401672, + "learning_rate": 7.856023556781087e-05, + "loss": 0.9257, + "step": 95980 + }, + { + "epoch": 0.6132527503417963, + "grad_norm": 0.8011789321899414, + "learning_rate": 7.855611686982854e-05, + "loss": 0.7494, + "step": 95990 + }, + { + "epoch": 0.613316637491535, + "grad_norm": 0.802932620048523, + "learning_rate": 7.85519978842614e-05, + "loss": 0.6817, + "step": 96000 + }, + { + "epoch": 0.6133805246412737, + "grad_norm": 0.604083240032196, + "learning_rate": 7.854787861115093e-05, + "loss": 0.7141, + "step": 96010 + }, + { + "epoch": 0.6134444117910124, + "grad_norm": 0.9903905391693115, + "learning_rate": 7.854375905053866e-05, + "loss": 0.8143, + "step": 96020 + }, + { + "epoch": 0.6135082989407511, + "grad_norm": 1.5344460010528564, + "learning_rate": 7.853963920246601e-05, + "loss": 0.6426, + "step": 96030 + }, + { + "epoch": 0.6135721860904898, + "grad_norm": 0.6864035129547119, + "learning_rate": 7.853551906697452e-05, + "loss": 0.7646, + "step": 96040 + }, + { + "epoch": 0.6136360732402285, + "grad_norm": 0.9132956862449646, + "learning_rate": 7.853139864410565e-05, + "loss": 1.0619, + "step": 96050 + }, + { + "epoch": 0.6136999603899672, + "grad_norm": 1.0728709697723389, + "learning_rate": 7.852727793390094e-05, + "loss": 0.962, + "step": 96060 + }, + { + "epoch": 0.6137638475397058, + "grad_norm": 0.7748542428016663, + "learning_rate": 7.852315693640184e-05, + "loss": 0.7858, + "step": 96070 + }, + { + "epoch": 0.6138277346894445, + "grad_norm": 0.7569340467453003, + "learning_rate": 7.851903565164987e-05, + "loss": 1.1526, + "step": 96080 + }, + { + "epoch": 0.6138916218391832, + "grad_norm": 1.0770772695541382, + "learning_rate": 7.851491407968655e-05, + "loss": 0.8305, + "step": 96090 + }, + { + "epoch": 0.6139555089889219, + "grad_norm": 0.9833508729934692, + "learning_rate": 
7.851079222055338e-05, + "loss": 0.8214, + "step": 96100 + }, + { + "epoch": 0.6140193961386606, + "grad_norm": 0.6459149718284607, + "learning_rate": 7.850667007429187e-05, + "loss": 0.9574, + "step": 96110 + }, + { + "epoch": 0.6140832832883993, + "grad_norm": 0.6652225255966187, + "learning_rate": 7.850254764094351e-05, + "loss": 0.7369, + "step": 96120 + }, + { + "epoch": 0.614147170438138, + "grad_norm": 1.0020723342895508, + "learning_rate": 7.849842492054986e-05, + "loss": 0.8702, + "step": 96130 + }, + { + "epoch": 0.6142110575878768, + "grad_norm": 0.847522497177124, + "learning_rate": 7.84943019131524e-05, + "loss": 0.7167, + "step": 96140 + }, + { + "epoch": 0.6142749447376155, + "grad_norm": 0.7430154085159302, + "learning_rate": 7.849017861879266e-05, + "loss": 0.7177, + "step": 96150 + }, + { + "epoch": 0.6143388318873542, + "grad_norm": 0.7399063110351562, + "learning_rate": 7.84860550375122e-05, + "loss": 1.1364, + "step": 96160 + }, + { + "epoch": 0.6144027190370929, + "grad_norm": 1.0138983726501465, + "learning_rate": 7.84819311693525e-05, + "loss": 0.9364, + "step": 96170 + }, + { + "epoch": 0.6144666061868316, + "grad_norm": 0.8306713104248047, + "learning_rate": 7.847780701435514e-05, + "loss": 0.9233, + "step": 96180 + }, + { + "epoch": 0.6145304933365703, + "grad_norm": 0.8946832418441772, + "learning_rate": 7.84736825725616e-05, + "loss": 1.2262, + "step": 96190 + }, + { + "epoch": 0.614594380486309, + "grad_norm": 0.6287519931793213, + "learning_rate": 7.846955784401345e-05, + "loss": 0.7374, + "step": 96200 + }, + { + "epoch": 0.6146582676360477, + "grad_norm": 0.8874316215515137, + "learning_rate": 7.846543282875222e-05, + "loss": 0.8743, + "step": 96210 + }, + { + "epoch": 0.6147221547857864, + "grad_norm": 0.7302953004837036, + "learning_rate": 7.846130752681946e-05, + "loss": 0.7668, + "step": 96220 + }, + { + "epoch": 0.6147860419355251, + "grad_norm": 0.9165903925895691, + "learning_rate": 7.845718193825671e-05, + "loss": 0.8457, + 
"step": 96230 + }, + { + "epoch": 0.6148499290852638, + "grad_norm": 0.7827330827713013, + "learning_rate": 7.845305606310552e-05, + "loss": 0.9301, + "step": 96240 + }, + { + "epoch": 0.6149138162350025, + "grad_norm": 1.272709846496582, + "learning_rate": 7.844892990140744e-05, + "loss": 0.6134, + "step": 96250 + }, + { + "epoch": 0.6149777033847412, + "grad_norm": 0.8357341885566711, + "learning_rate": 7.844480345320402e-05, + "loss": 1.3462, + "step": 96260 + }, + { + "epoch": 0.6150415905344799, + "grad_norm": 1.107619047164917, + "learning_rate": 7.844067671853683e-05, + "loss": 1.1149, + "step": 96270 + }, + { + "epoch": 0.6151054776842186, + "grad_norm": 0.9972173571586609, + "learning_rate": 7.843654969744741e-05, + "loss": 1.0077, + "step": 96280 + }, + { + "epoch": 0.6151693648339573, + "grad_norm": 0.5411679148674011, + "learning_rate": 7.843242238997735e-05, + "loss": 0.9392, + "step": 96290 + }, + { + "epoch": 0.615233251983696, + "grad_norm": 0.6733404397964478, + "learning_rate": 7.842829479616818e-05, + "loss": 1.0999, + "step": 96300 + }, + { + "epoch": 0.6152971391334348, + "grad_norm": 1.565991997718811, + "learning_rate": 7.842416691606149e-05, + "loss": 1.13, + "step": 96310 + }, + { + "epoch": 0.6153610262831734, + "grad_norm": 0.7156278491020203, + "learning_rate": 7.842003874969886e-05, + "loss": 1.3407, + "step": 96320 + }, + { + "epoch": 0.6154249134329121, + "grad_norm": 1.1994881629943848, + "learning_rate": 7.841591029712185e-05, + "loss": 0.9944, + "step": 96330 + }, + { + "epoch": 0.6154888005826508, + "grad_norm": 0.923414945602417, + "learning_rate": 7.841178155837204e-05, + "loss": 0.8468, + "step": 96340 + }, + { + "epoch": 0.6155526877323895, + "grad_norm": 1.0727185010910034, + "learning_rate": 7.8407652533491e-05, + "loss": 0.8268, + "step": 96350 + }, + { + "epoch": 0.6156165748821282, + "grad_norm": 1.3032314777374268, + "learning_rate": 7.840352322252032e-05, + "loss": 0.8223, + "step": 96360 + }, + { + "epoch": 
0.6156804620318669, + "grad_norm": 1.055098056793213, + "learning_rate": 7.839939362550161e-05, + "loss": 1.0348, + "step": 96370 + }, + { + "epoch": 0.6157443491816056, + "grad_norm": 1.0928640365600586, + "learning_rate": 7.839526374247642e-05, + "loss": 0.7047, + "step": 96380 + }, + { + "epoch": 0.6158082363313443, + "grad_norm": 1.1519923210144043, + "learning_rate": 7.839113357348637e-05, + "loss": 1.3878, + "step": 96390 + }, + { + "epoch": 0.615872123481083, + "grad_norm": 1.0324209928512573, + "learning_rate": 7.838700311857303e-05, + "loss": 1.0287, + "step": 96400 + }, + { + "epoch": 0.6159360106308217, + "grad_norm": 1.1393169164657593, + "learning_rate": 7.838287237777802e-05, + "loss": 0.7189, + "step": 96410 + }, + { + "epoch": 0.6159998977805604, + "grad_norm": 0.7285189628601074, + "learning_rate": 7.837874135114294e-05, + "loss": 0.8353, + "step": 96420 + }, + { + "epoch": 0.6160637849302991, + "grad_norm": 0.8310270309448242, + "learning_rate": 7.837461003870936e-05, + "loss": 0.9424, + "step": 96430 + }, + { + "epoch": 0.6161276720800378, + "grad_norm": 0.8968883752822876, + "learning_rate": 7.837047844051893e-05, + "loss": 1.1358, + "step": 96440 + }, + { + "epoch": 0.6161915592297765, + "grad_norm": 0.7500566244125366, + "learning_rate": 7.836634655661323e-05, + "loss": 0.9524, + "step": 96450 + }, + { + "epoch": 0.6162554463795152, + "grad_norm": 0.8886957764625549, + "learning_rate": 7.836221438703388e-05, + "loss": 0.6771, + "step": 96460 + }, + { + "epoch": 0.6163193335292539, + "grad_norm": 0.7337315082550049, + "learning_rate": 7.835808193182248e-05, + "loss": 0.7978, + "step": 96470 + }, + { + "epoch": 0.6163832206789926, + "grad_norm": 0.7529950141906738, + "learning_rate": 7.835394919102068e-05, + "loss": 0.8103, + "step": 96480 + }, + { + "epoch": 0.6164471078287314, + "grad_norm": 1.0321061611175537, + "learning_rate": 7.834981616467007e-05, + "loss": 1.1549, + "step": 96490 + }, + { + "epoch": 0.6165109949784701, + "grad_norm": 
1.114965558052063, + "learning_rate": 7.83456828528123e-05, + "loss": 0.8008, + "step": 96500 + }, + { + "epoch": 0.6165748821282088, + "grad_norm": 0.8190954923629761, + "learning_rate": 7.834154925548898e-05, + "loss": 1.0127, + "step": 96510 + }, + { + "epoch": 0.6166387692779475, + "grad_norm": 1.0019639730453491, + "learning_rate": 7.833741537274173e-05, + "loss": 0.735, + "step": 96520 + }, + { + "epoch": 0.6167026564276862, + "grad_norm": 0.6965848207473755, + "learning_rate": 7.833328120461219e-05, + "loss": 1.2864, + "step": 96530 + }, + { + "epoch": 0.6167665435774249, + "grad_norm": 1.2692817449569702, + "learning_rate": 7.8329146751142e-05, + "loss": 0.8433, + "step": 96540 + }, + { + "epoch": 0.6168304307271636, + "grad_norm": 1.0348045825958252, + "learning_rate": 7.832501201237279e-05, + "loss": 0.7535, + "step": 96550 + }, + { + "epoch": 0.6168943178769022, + "grad_norm": 0.7855243682861328, + "learning_rate": 7.832087698834621e-05, + "loss": 0.8068, + "step": 96560 + }, + { + "epoch": 0.6169582050266409, + "grad_norm": 0.7087273001670837, + "learning_rate": 7.83167416791039e-05, + "loss": 1.015, + "step": 96570 + }, + { + "epoch": 0.6170220921763796, + "grad_norm": 0.6482358574867249, + "learning_rate": 7.83126060846875e-05, + "loss": 0.8353, + "step": 96580 + }, + { + "epoch": 0.6170859793261183, + "grad_norm": 1.1553382873535156, + "learning_rate": 7.830847020513867e-05, + "loss": 0.8833, + "step": 96590 + }, + { + "epoch": 0.617149866475857, + "grad_norm": 1.0533820390701294, + "learning_rate": 7.830433404049904e-05, + "loss": 0.9203, + "step": 96600 + }, + { + "epoch": 0.6172137536255957, + "grad_norm": 0.9476677775382996, + "learning_rate": 7.830019759081028e-05, + "loss": 0.9134, + "step": 96610 + }, + { + "epoch": 0.6172776407753344, + "grad_norm": 1.1443191766738892, + "learning_rate": 7.829606085611408e-05, + "loss": 1.0249, + "step": 96620 + }, + { + "epoch": 0.6173415279250731, + "grad_norm": 1.9042986631393433, + "learning_rate": 
7.829192383645203e-05, + "loss": 0.8699, + "step": 96630 + }, + { + "epoch": 0.6174054150748118, + "grad_norm": 0.6363811492919922, + "learning_rate": 7.828778653186586e-05, + "loss": 0.7259, + "step": 96640 + }, + { + "epoch": 0.6174693022245505, + "grad_norm": 0.6034536957740784, + "learning_rate": 7.82836489423972e-05, + "loss": 1.0086, + "step": 96650 + }, + { + "epoch": 0.6175331893742892, + "grad_norm": 0.7996253371238708, + "learning_rate": 7.827951106808771e-05, + "loss": 0.8618, + "step": 96660 + }, + { + "epoch": 0.617597076524028, + "grad_norm": 0.9885534644126892, + "learning_rate": 7.827537290897908e-05, + "loss": 0.8571, + "step": 96670 + }, + { + "epoch": 0.6176609636737667, + "grad_norm": 0.7702460885047913, + "learning_rate": 7.827123446511298e-05, + "loss": 0.8013, + "step": 96680 + }, + { + "epoch": 0.6177248508235054, + "grad_norm": 1.5067464113235474, + "learning_rate": 7.82670957365311e-05, + "loss": 0.8273, + "step": 96690 + }, + { + "epoch": 0.6177887379732441, + "grad_norm": 0.8331496119499207, + "learning_rate": 7.826295672327512e-05, + "loss": 1.0143, + "step": 96700 + }, + { + "epoch": 0.6178526251229828, + "grad_norm": 1.1344146728515625, + "learning_rate": 7.82588174253867e-05, + "loss": 1.0309, + "step": 96710 + }, + { + "epoch": 0.6179165122727215, + "grad_norm": 0.6412261128425598, + "learning_rate": 7.825467784290755e-05, + "loss": 0.8596, + "step": 96720 + }, + { + "epoch": 0.6179803994224602, + "grad_norm": 0.5586232542991638, + "learning_rate": 7.825053797587936e-05, + "loss": 1.0329, + "step": 96730 + }, + { + "epoch": 0.6180442865721989, + "grad_norm": 0.8391451835632324, + "learning_rate": 7.824639782434379e-05, + "loss": 1.0402, + "step": 96740 + }, + { + "epoch": 0.6181081737219376, + "grad_norm": 0.9592933058738708, + "learning_rate": 7.824225738834256e-05, + "loss": 0.9333, + "step": 96750 + }, + { + "epoch": 0.6181720608716763, + "grad_norm": 0.5291448831558228, + "learning_rate": 7.823811666791738e-05, + "loss": 0.6241, 
+ "step": 96760 + }, + { + "epoch": 0.618235948021415, + "grad_norm": 1.1414803266525269, + "learning_rate": 7.823397566310992e-05, + "loss": 0.8571, + "step": 96770 + }, + { + "epoch": 0.6182998351711537, + "grad_norm": 0.7651611566543579, + "learning_rate": 7.822983437396192e-05, + "loss": 0.7009, + "step": 96780 + }, + { + "epoch": 0.6183637223208924, + "grad_norm": 1.016514539718628, + "learning_rate": 7.822569280051505e-05, + "loss": 0.9783, + "step": 96790 + }, + { + "epoch": 0.618427609470631, + "grad_norm": 0.9900182485580444, + "learning_rate": 7.822155094281104e-05, + "loss": 0.919, + "step": 96800 + }, + { + "epoch": 0.6184914966203697, + "grad_norm": 0.969688892364502, + "learning_rate": 7.821740880089159e-05, + "loss": 1.054, + "step": 96810 + }, + { + "epoch": 0.6185553837701084, + "grad_norm": 0.9642791748046875, + "learning_rate": 7.821326637479842e-05, + "loss": 0.7227, + "step": 96820 + }, + { + "epoch": 0.6186192709198471, + "grad_norm": 0.9115810394287109, + "learning_rate": 7.820912366457327e-05, + "loss": 0.9451, + "step": 96830 + }, + { + "epoch": 0.6186831580695858, + "grad_norm": 1.4009279012680054, + "learning_rate": 7.820498067025782e-05, + "loss": 1.1126, + "step": 96840 + }, + { + "epoch": 0.6187470452193246, + "grad_norm": 1.6597306728363037, + "learning_rate": 7.820083739189381e-05, + "loss": 0.8138, + "step": 96850 + }, + { + "epoch": 0.6188109323690633, + "grad_norm": 2.7487285137176514, + "learning_rate": 7.819669382952299e-05, + "loss": 0.929, + "step": 96860 + }, + { + "epoch": 0.618874819518802, + "grad_norm": 0.7340418100357056, + "learning_rate": 7.819254998318706e-05, + "loss": 0.9411, + "step": 96870 + }, + { + "epoch": 0.6189387066685407, + "grad_norm": 0.8978639245033264, + "learning_rate": 7.818840585292775e-05, + "loss": 0.755, + "step": 96880 + }, + { + "epoch": 0.6190025938182794, + "grad_norm": 0.8307545781135559, + "learning_rate": 7.818426143878683e-05, + "loss": 0.8567, + "step": 96890 + }, + { + "epoch": 
0.6190664809680181, + "grad_norm": 0.6987618803977966, + "learning_rate": 7.818011674080601e-05, + "loss": 0.7964, + "step": 96900 + }, + { + "epoch": 0.6191303681177568, + "grad_norm": 1.2218877077102661, + "learning_rate": 7.817597175902702e-05, + "loss": 1.2578, + "step": 96910 + }, + { + "epoch": 0.6191942552674955, + "grad_norm": 1.1471195220947266, + "learning_rate": 7.817182649349164e-05, + "loss": 0.8193, + "step": 96920 + }, + { + "epoch": 0.6192581424172342, + "grad_norm": 0.7587412595748901, + "learning_rate": 7.816768094424157e-05, + "loss": 0.8189, + "step": 96930 + }, + { + "epoch": 0.6193220295669729, + "grad_norm": 0.5350973010063171, + "learning_rate": 7.81635351113186e-05, + "loss": 0.6833, + "step": 96940 + }, + { + "epoch": 0.6193859167167116, + "grad_norm": 0.5886098146438599, + "learning_rate": 7.815938899476447e-05, + "loss": 0.7651, + "step": 96950 + }, + { + "epoch": 0.6194498038664503, + "grad_norm": 0.8069875240325928, + "learning_rate": 7.815524259462093e-05, + "loss": 0.8622, + "step": 96960 + }, + { + "epoch": 0.619513691016189, + "grad_norm": 0.9382511973381042, + "learning_rate": 7.815109591092973e-05, + "loss": 0.7166, + "step": 96970 + }, + { + "epoch": 0.6195775781659277, + "grad_norm": 1.3142880201339722, + "learning_rate": 7.814694894373263e-05, + "loss": 1.2337, + "step": 96980 + }, + { + "epoch": 0.6196414653156664, + "grad_norm": 0.8636249303817749, + "learning_rate": 7.814280169307142e-05, + "loss": 0.7238, + "step": 96990 + }, + { + "epoch": 0.6197053524654051, + "grad_norm": 0.7896556258201599, + "learning_rate": 7.813865415898785e-05, + "loss": 0.7152, + "step": 97000 + }, + { + "epoch": 0.6197692396151439, + "grad_norm": 0.7915673851966858, + "learning_rate": 7.813450634152369e-05, + "loss": 0.9909, + "step": 97010 + }, + { + "epoch": 0.6198331267648826, + "grad_norm": 0.6319288611412048, + "learning_rate": 7.81303582407207e-05, + "loss": 0.8935, + "step": 97020 + }, + { + "epoch": 0.6198970139146213, + "grad_norm": 
0.72498619556427, + "learning_rate": 7.812620985662066e-05, + "loss": 0.8319, + "step": 97030 + }, + { + "epoch": 0.6199609010643599, + "grad_norm": 0.5626809000968933, + "learning_rate": 7.812206118926539e-05, + "loss": 0.6742, + "step": 97040 + }, + { + "epoch": 0.6200247882140986, + "grad_norm": 1.0397377014160156, + "learning_rate": 7.81179122386966e-05, + "loss": 0.863, + "step": 97050 + }, + { + "epoch": 0.6200886753638373, + "grad_norm": 0.5984945297241211, + "learning_rate": 7.811376300495612e-05, + "loss": 1.0026, + "step": 97060 + }, + { + "epoch": 0.620152562513576, + "grad_norm": 1.6048803329467773, + "learning_rate": 7.810961348808572e-05, + "loss": 0.9494, + "step": 97070 + }, + { + "epoch": 0.6202164496633147, + "grad_norm": 0.6135510206222534, + "learning_rate": 7.810546368812721e-05, + "loss": 0.7765, + "step": 97080 + }, + { + "epoch": 0.6202803368130534, + "grad_norm": 1.2818505764007568, + "learning_rate": 7.810131360512236e-05, + "loss": 0.7591, + "step": 97090 + }, + { + "epoch": 0.6203442239627921, + "grad_norm": 0.8664326071739197, + "learning_rate": 7.809716323911296e-05, + "loss": 0.8345, + "step": 97100 + }, + { + "epoch": 0.6204081111125308, + "grad_norm": 0.9286889433860779, + "learning_rate": 7.809301259014083e-05, + "loss": 1.1506, + "step": 97110 + }, + { + "epoch": 0.6204719982622695, + "grad_norm": 0.7341832518577576, + "learning_rate": 7.808886165824775e-05, + "loss": 0.8342, + "step": 97120 + }, + { + "epoch": 0.6205358854120082, + "grad_norm": 2.2999391555786133, + "learning_rate": 7.808471044347555e-05, + "loss": 0.8378, + "step": 97130 + }, + { + "epoch": 0.6205997725617469, + "grad_norm": 0.6908975839614868, + "learning_rate": 7.808055894586602e-05, + "loss": 1.0413, + "step": 97140 + }, + { + "epoch": 0.6206636597114856, + "grad_norm": 0.7688397765159607, + "learning_rate": 7.807640716546094e-05, + "loss": 0.7239, + "step": 97150 + }, + { + "epoch": 0.6207275468612243, + "grad_norm": 0.830764889717102, + "learning_rate": 
7.807225510230216e-05, + "loss": 0.8647, + "step": 97160 + }, + { + "epoch": 0.620791434010963, + "grad_norm": 1.2622300386428833, + "learning_rate": 7.80681027564315e-05, + "loss": 0.8693, + "step": 97170 + }, + { + "epoch": 0.6208553211607017, + "grad_norm": 0.9150146842002869, + "learning_rate": 7.806395012789074e-05, + "loss": 0.854, + "step": 97180 + }, + { + "epoch": 0.6209192083104405, + "grad_norm": 0.8529565334320068, + "learning_rate": 7.805979721672175e-05, + "loss": 0.8418, + "step": 97190 + }, + { + "epoch": 0.6209830954601792, + "grad_norm": 1.1613361835479736, + "learning_rate": 7.80556440229663e-05, + "loss": 0.93, + "step": 97200 + }, + { + "epoch": 0.6210469826099179, + "grad_norm": 1.5015759468078613, + "learning_rate": 7.805149054666626e-05, + "loss": 1.0721, + "step": 97210 + }, + { + "epoch": 0.6211108697596566, + "grad_norm": 0.8608677387237549, + "learning_rate": 7.804733678786345e-05, + "loss": 0.9352, + "step": 97220 + }, + { + "epoch": 0.6211747569093953, + "grad_norm": 0.7824024558067322, + "learning_rate": 7.804318274659967e-05, + "loss": 0.7138, + "step": 97230 + }, + { + "epoch": 0.621238644059134, + "grad_norm": 0.6938091516494751, + "learning_rate": 7.803902842291679e-05, + "loss": 0.8414, + "step": 97240 + }, + { + "epoch": 0.6213025312088727, + "grad_norm": 1.1884207725524902, + "learning_rate": 7.803487381685665e-05, + "loss": 0.846, + "step": 97250 + }, + { + "epoch": 0.6213664183586114, + "grad_norm": 0.9938066005706787, + "learning_rate": 7.803071892846106e-05, + "loss": 1.0066, + "step": 97260 + }, + { + "epoch": 0.6214303055083501, + "grad_norm": 1.1937052011489868, + "learning_rate": 7.802656375777188e-05, + "loss": 1.042, + "step": 97270 + }, + { + "epoch": 0.6214941926580888, + "grad_norm": 0.7454966306686401, + "learning_rate": 7.802240830483096e-05, + "loss": 0.9139, + "step": 97280 + }, + { + "epoch": 0.6215580798078274, + "grad_norm": 0.8491148948669434, + "learning_rate": 7.801825256968015e-05, + "loss": 0.8569, + 
"step": 97290 + }, + { + "epoch": 0.6216219669575661, + "grad_norm": 1.283415675163269, + "learning_rate": 7.80140965523613e-05, + "loss": 1.134, + "step": 97300 + }, + { + "epoch": 0.6216858541073048, + "grad_norm": 0.9425275921821594, + "learning_rate": 7.800994025291626e-05, + "loss": 0.8734, + "step": 97310 + }, + { + "epoch": 0.6217497412570435, + "grad_norm": 0.8842566609382629, + "learning_rate": 7.800578367138688e-05, + "loss": 0.9209, + "step": 97320 + }, + { + "epoch": 0.6218136284067822, + "grad_norm": 0.9904354214668274, + "learning_rate": 7.800162680781504e-05, + "loss": 0.7265, + "step": 97330 + }, + { + "epoch": 0.6218775155565209, + "grad_norm": 0.956762969493866, + "learning_rate": 7.79974696622426e-05, + "loss": 0.911, + "step": 97340 + }, + { + "epoch": 0.6219414027062596, + "grad_norm": 0.7186155319213867, + "learning_rate": 7.79933122347114e-05, + "loss": 0.9249, + "step": 97350 + }, + { + "epoch": 0.6220052898559983, + "grad_norm": 0.733720064163208, + "learning_rate": 7.798915452526334e-05, + "loss": 0.9297, + "step": 97360 + }, + { + "epoch": 0.622069177005737, + "grad_norm": 0.8453028202056885, + "learning_rate": 7.798499653394028e-05, + "loss": 0.9505, + "step": 97370 + }, + { + "epoch": 0.6221330641554758, + "grad_norm": 1.2403620481491089, + "learning_rate": 7.798083826078408e-05, + "loss": 1.3309, + "step": 97380 + }, + { + "epoch": 0.6221969513052145, + "grad_norm": 1.1222939491271973, + "learning_rate": 7.797667970583666e-05, + "loss": 1.1289, + "step": 97390 + }, + { + "epoch": 0.6222608384549532, + "grad_norm": 0.633385956287384, + "learning_rate": 7.797252086913984e-05, + "loss": 0.9007, + "step": 97400 + }, + { + "epoch": 0.6223247256046919, + "grad_norm": 0.7996073365211487, + "learning_rate": 7.796877767525162e-05, + "loss": 0.9044, + "step": 97410 + }, + { + "epoch": 0.6223886127544306, + "grad_norm": 0.7875693440437317, + "learning_rate": 7.796461830334642e-05, + "loss": 1.0713, + "step": 97420 + }, + { + "epoch": 
0.6224524999041693, + "grad_norm": 1.1441236734390259, + "learning_rate": 7.79604586498133e-05, + "loss": 0.9106, + "step": 97430 + }, + { + "epoch": 0.622516387053908, + "grad_norm": 1.1980715990066528, + "learning_rate": 7.795629871469419e-05, + "loss": 0.9184, + "step": 97440 + }, + { + "epoch": 0.6225802742036467, + "grad_norm": 0.8532522320747375, + "learning_rate": 7.795213849803094e-05, + "loss": 0.827, + "step": 97450 + }, + { + "epoch": 0.6226441613533854, + "grad_norm": 0.9568140506744385, + "learning_rate": 7.794797799986549e-05, + "loss": 0.8368, + "step": 97460 + }, + { + "epoch": 0.6227080485031241, + "grad_norm": 0.8139510750770569, + "learning_rate": 7.794381722023973e-05, + "loss": 1.1102, + "step": 97470 + }, + { + "epoch": 0.6227719356528628, + "grad_norm": 1.3737013339996338, + "learning_rate": 7.793965615919555e-05, + "loss": 1.0294, + "step": 97480 + }, + { + "epoch": 0.6228358228026015, + "grad_norm": 1.1957775354385376, + "learning_rate": 7.793549481677485e-05, + "loss": 0.75, + "step": 97490 + }, + { + "epoch": 0.6228997099523402, + "grad_norm": 0.7739052772521973, + "learning_rate": 7.793133319301956e-05, + "loss": 0.8989, + "step": 97500 + }, + { + "epoch": 0.6229635971020789, + "grad_norm": 1.287320852279663, + "learning_rate": 7.792717128797157e-05, + "loss": 1.1412, + "step": 97510 + }, + { + "epoch": 0.6230274842518176, + "grad_norm": 1.1825543642044067, + "learning_rate": 7.792300910167284e-05, + "loss": 0.7868, + "step": 97520 + }, + { + "epoch": 0.6230913714015562, + "grad_norm": 0.9416884183883667, + "learning_rate": 7.791884663416522e-05, + "loss": 1.0537, + "step": 97530 + }, + { + "epoch": 0.6231552585512949, + "grad_norm": 0.7893606424331665, + "learning_rate": 7.791468388549066e-05, + "loss": 0.9721, + "step": 97540 + }, + { + "epoch": 0.6232191457010336, + "grad_norm": 0.6625798344612122, + "learning_rate": 7.79105208556911e-05, + "loss": 1.0084, + "step": 97550 + }, + { + "epoch": 0.6232830328507724, + "grad_norm": 
0.9183120131492615, + "learning_rate": 7.790635754480844e-05, + "loss": 1.2298, + "step": 97560 + }, + { + "epoch": 0.6233469200005111, + "grad_norm": 0.7689588069915771, + "learning_rate": 7.790219395288461e-05, + "loss": 0.8109, + "step": 97570 + }, + { + "epoch": 0.6234108071502498, + "grad_norm": 0.8191707134246826, + "learning_rate": 7.789803007996156e-05, + "loss": 0.9747, + "step": 97580 + }, + { + "epoch": 0.6234746942999885, + "grad_norm": 1.4356540441513062, + "learning_rate": 7.789386592608121e-05, + "loss": 1.053, + "step": 97590 + }, + { + "epoch": 0.6235385814497272, + "grad_norm": 0.5826048851013184, + "learning_rate": 7.78897014912855e-05, + "loss": 0.9845, + "step": 97600 + }, + { + "epoch": 0.6236024685994659, + "grad_norm": 1.0215983390808105, + "learning_rate": 7.788553677561635e-05, + "loss": 1.0139, + "step": 97610 + }, + { + "epoch": 0.6236663557492046, + "grad_norm": 1.153480887413025, + "learning_rate": 7.788137177911573e-05, + "loss": 0.8468, + "step": 97620 + }, + { + "epoch": 0.6237302428989433, + "grad_norm": 1.5136088132858276, + "learning_rate": 7.78772065018256e-05, + "loss": 0.7882, + "step": 97630 + }, + { + "epoch": 0.623794130048682, + "grad_norm": 1.1974624395370483, + "learning_rate": 7.787304094378785e-05, + "loss": 0.9654, + "step": 97640 + }, + { + "epoch": 0.6238580171984207, + "grad_norm": 0.695049524307251, + "learning_rate": 7.786887510504447e-05, + "loss": 0.8364, + "step": 97650 + }, + { + "epoch": 0.6239219043481594, + "grad_norm": 0.7446387410163879, + "learning_rate": 7.786470898563741e-05, + "loss": 0.5882, + "step": 97660 + }, + { + "epoch": 0.6239857914978981, + "grad_norm": 1.169751763343811, + "learning_rate": 7.786054258560863e-05, + "loss": 0.795, + "step": 97670 + }, + { + "epoch": 0.6240496786476368, + "grad_norm": 1.1560198068618774, + "learning_rate": 7.785637590500007e-05, + "loss": 0.7352, + "step": 97680 + }, + { + "epoch": 0.6241135657973755, + "grad_norm": 0.8361658453941345, + "learning_rate": 
7.785220894385373e-05, + "loss": 0.7835, + "step": 97690 + }, + { + "epoch": 0.6241774529471142, + "grad_norm": 1.0349642038345337, + "learning_rate": 7.784804170221154e-05, + "loss": 0.7047, + "step": 97700 + }, + { + "epoch": 0.624241340096853, + "grad_norm": 0.7345200181007385, + "learning_rate": 7.784387418011547e-05, + "loss": 0.9272, + "step": 97710 + }, + { + "epoch": 0.6243052272465917, + "grad_norm": 1.210518717765808, + "learning_rate": 7.783970637760751e-05, + "loss": 0.9561, + "step": 97720 + }, + { + "epoch": 0.6243691143963304, + "grad_norm": 1.1094375848770142, + "learning_rate": 7.783553829472962e-05, + "loss": 1.1463, + "step": 97730 + }, + { + "epoch": 0.6244330015460691, + "grad_norm": 0.9743418097496033, + "learning_rate": 7.783136993152376e-05, + "loss": 0.8341, + "step": 97740 + }, + { + "epoch": 0.6244968886958078, + "grad_norm": 0.6543291211128235, + "learning_rate": 7.782720128803195e-05, + "loss": 0.9606, + "step": 97750 + }, + { + "epoch": 0.6245607758455465, + "grad_norm": 1.24593186378479, + "learning_rate": 7.782303236429614e-05, + "loss": 1.2391, + "step": 97760 + }, + { + "epoch": 0.6246246629952851, + "grad_norm": 1.0866676568984985, + "learning_rate": 7.781886316035834e-05, + "loss": 0.96, + "step": 97770 + }, + { + "epoch": 0.6246885501450238, + "grad_norm": 0.8642030358314514, + "learning_rate": 7.78146936762605e-05, + "loss": 1.1729, + "step": 97780 + }, + { + "epoch": 0.6247524372947625, + "grad_norm": 0.8341190218925476, + "learning_rate": 7.781052391204464e-05, + "loss": 0.8916, + "step": 97790 + }, + { + "epoch": 0.6248163244445012, + "grad_norm": 0.8593606948852539, + "learning_rate": 7.780635386775273e-05, + "loss": 0.7954, + "step": 97800 + }, + { + "epoch": 0.6248802115942399, + "grad_norm": 0.7424865365028381, + "learning_rate": 7.780218354342679e-05, + "loss": 0.8716, + "step": 97810 + }, + { + "epoch": 0.6249440987439786, + "grad_norm": 1.128391146659851, + "learning_rate": 7.779801293910883e-05, + "loss": 0.8336, + 
"step": 97820 + }, + { + "epoch": 0.6250079858937173, + "grad_norm": 0.8907873630523682, + "learning_rate": 7.779384205484079e-05, + "loss": 1.0559, + "step": 97830 + }, + { + "epoch": 0.625071873043456, + "grad_norm": 0.7362083792686462, + "learning_rate": 7.778967089066474e-05, + "loss": 1.0033, + "step": 97840 + }, + { + "epoch": 0.6251357601931947, + "grad_norm": 0.8434352278709412, + "learning_rate": 7.778549944662266e-05, + "loss": 0.9773, + "step": 97850 + }, + { + "epoch": 0.6251996473429334, + "grad_norm": 2.6534831523895264, + "learning_rate": 7.778132772275657e-05, + "loss": 0.921, + "step": 97860 + }, + { + "epoch": 0.6252635344926721, + "grad_norm": 1.1809990406036377, + "learning_rate": 7.777715571910846e-05, + "loss": 0.9785, + "step": 97870 + }, + { + "epoch": 0.6253274216424108, + "grad_norm": 0.7903746962547302, + "learning_rate": 7.777298343572038e-05, + "loss": 0.8915, + "step": 97880 + }, + { + "epoch": 0.6253913087921495, + "grad_norm": 0.6154451370239258, + "learning_rate": 7.776881087263433e-05, + "loss": 1.0139, + "step": 97890 + }, + { + "epoch": 0.6254551959418883, + "grad_norm": 0.7355427145957947, + "learning_rate": 7.776463802989232e-05, + "loss": 0.8841, + "step": 97900 + }, + { + "epoch": 0.625519083091627, + "grad_norm": 1.5603142976760864, + "learning_rate": 7.776046490753638e-05, + "loss": 0.9459, + "step": 97910 + }, + { + "epoch": 0.6255829702413657, + "grad_norm": 1.0645157098770142, + "learning_rate": 7.775629150560854e-05, + "loss": 0.8107, + "step": 97920 + }, + { + "epoch": 0.6256468573911044, + "grad_norm": 0.7436626553535461, + "learning_rate": 7.775211782415084e-05, + "loss": 0.6884, + "step": 97930 + }, + { + "epoch": 0.6257107445408431, + "grad_norm": 1.6905604600906372, + "learning_rate": 7.774794386320531e-05, + "loss": 0.9719, + "step": 97940 + }, + { + "epoch": 0.6257746316905818, + "grad_norm": 1.0564686059951782, + "learning_rate": 7.774376962281398e-05, + "loss": 0.9414, + "step": 97950 + }, + { + "epoch": 
0.6258385188403205, + "grad_norm": 0.7647698521614075, + "learning_rate": 7.773959510301887e-05, + "loss": 1.0905, + "step": 97960 + }, + { + "epoch": 0.6259024059900592, + "grad_norm": 0.8428241610527039, + "learning_rate": 7.773542030386205e-05, + "loss": 1.0266, + "step": 97970 + }, + { + "epoch": 0.6259662931397979, + "grad_norm": 0.5705221891403198, + "learning_rate": 7.773124522538556e-05, + "loss": 0.8996, + "step": 97980 + }, + { + "epoch": 0.6260301802895366, + "grad_norm": 0.9240884780883789, + "learning_rate": 7.772706986763142e-05, + "loss": 0.6718, + "step": 97990 + }, + { + "epoch": 0.6260940674392753, + "grad_norm": 1.4182459115982056, + "learning_rate": 7.772289423064174e-05, + "loss": 0.9454, + "step": 98000 + }, + { + "epoch": 0.626157954589014, + "grad_norm": 0.46557140350341797, + "learning_rate": 7.77187183144585e-05, + "loss": 0.8053, + "step": 98010 + }, + { + "epoch": 0.6262218417387526, + "grad_norm": 1.070710301399231, + "learning_rate": 7.771454211912378e-05, + "loss": 0.8369, + "step": 98020 + }, + { + "epoch": 0.6262857288884913, + "grad_norm": 1.3407284021377563, + "learning_rate": 7.771036564467967e-05, + "loss": 0.6425, + "step": 98030 + }, + { + "epoch": 0.62634961603823, + "grad_norm": 1.1556596755981445, + "learning_rate": 7.770618889116819e-05, + "loss": 0.7995, + "step": 98040 + }, + { + "epoch": 0.6264135031879687, + "grad_norm": 0.8401532769203186, + "learning_rate": 7.770201185863142e-05, + "loss": 1.0753, + "step": 98050 + }, + { + "epoch": 0.6264773903377074, + "grad_norm": 0.927470862865448, + "learning_rate": 7.769783454711143e-05, + "loss": 0.8812, + "step": 98060 + }, + { + "epoch": 0.6265412774874461, + "grad_norm": 0.7423887252807617, + "learning_rate": 7.769365695665027e-05, + "loss": 0.9529, + "step": 98070 + }, + { + "epoch": 0.6266051646371849, + "grad_norm": 0.5495186448097229, + "learning_rate": 7.768947908729003e-05, + "loss": 0.7396, + "step": 98080 + }, + { + "epoch": 0.6266690517869236, + "grad_norm": 
0.8177791833877563, + "learning_rate": 7.768530093907279e-05, + "loss": 0.9336, + "step": 98090 + }, + { + "epoch": 0.6267329389366623, + "grad_norm": 2.721142530441284, + "learning_rate": 7.768112251204061e-05, + "loss": 1.0718, + "step": 98100 + }, + { + "epoch": 0.626796826086401, + "grad_norm": 0.5694549679756165, + "learning_rate": 7.767694380623558e-05, + "loss": 0.6205, + "step": 98110 + }, + { + "epoch": 0.6268607132361397, + "grad_norm": 0.9336040616035461, + "learning_rate": 7.767276482169979e-05, + "loss": 0.8428, + "step": 98120 + }, + { + "epoch": 0.6269246003858784, + "grad_norm": 1.029270052909851, + "learning_rate": 7.766858555847531e-05, + "loss": 0.8425, + "step": 98130 + }, + { + "epoch": 0.6269884875356171, + "grad_norm": 1.2212886810302734, + "learning_rate": 7.766440601660424e-05, + "loss": 1.0028, + "step": 98140 + }, + { + "epoch": 0.6270523746853558, + "grad_norm": 2.6575090885162354, + "learning_rate": 7.766022619612867e-05, + "loss": 0.8225, + "step": 98150 + }, + { + "epoch": 0.6271162618350945, + "grad_norm": 0.7824742197990417, + "learning_rate": 7.765604609709069e-05, + "loss": 0.903, + "step": 98160 + }, + { + "epoch": 0.6271801489848332, + "grad_norm": 0.9830259084701538, + "learning_rate": 7.76518657195324e-05, + "loss": 1.0083, + "step": 98170 + }, + { + "epoch": 0.6272440361345719, + "grad_norm": 0.7284572720527649, + "learning_rate": 7.764768506349589e-05, + "loss": 0.7337, + "step": 98180 + }, + { + "epoch": 0.6273079232843106, + "grad_norm": 1.008009910583496, + "learning_rate": 7.764350412902328e-05, + "loss": 1.0301, + "step": 98190 + }, + { + "epoch": 0.6273718104340493, + "grad_norm": 0.7041063904762268, + "learning_rate": 7.763932291615667e-05, + "loss": 1.0467, + "step": 98200 + }, + { + "epoch": 0.627435697583788, + "grad_norm": 1.120405673980713, + "learning_rate": 7.763514142493818e-05, + "loss": 1.0133, + "step": 98210 + }, + { + "epoch": 0.6274995847335267, + "grad_norm": 0.855456531047821, + "learning_rate": 
7.76309596554099e-05, + "loss": 1.1737, + "step": 98220 + }, + { + "epoch": 0.6275634718832654, + "grad_norm": 0.8081047534942627, + "learning_rate": 7.762677760761394e-05, + "loss": 0.9021, + "step": 98230 + }, + { + "epoch": 0.6276273590330042, + "grad_norm": 0.7557641267776489, + "learning_rate": 7.762259528159243e-05, + "loss": 0.9765, + "step": 98240 + }, + { + "epoch": 0.6276912461827429, + "grad_norm": 3.023898124694824, + "learning_rate": 7.76184126773875e-05, + "loss": 0.8915, + "step": 98250 + }, + { + "epoch": 0.6277551333324815, + "grad_norm": 1.2447547912597656, + "learning_rate": 7.761422979504128e-05, + "loss": 0.9107, + "step": 98260 + }, + { + "epoch": 0.6278190204822202, + "grad_norm": 1.0318201780319214, + "learning_rate": 7.761004663459584e-05, + "loss": 0.8787, + "step": 98270 + }, + { + "epoch": 0.6278829076319589, + "grad_norm": 0.69561767578125, + "learning_rate": 7.760586319609335e-05, + "loss": 0.8026, + "step": 98280 + }, + { + "epoch": 0.6279467947816976, + "grad_norm": 0.5632861256599426, + "learning_rate": 7.760167947957595e-05, + "loss": 0.7686, + "step": 98290 + }, + { + "epoch": 0.6280106819314363, + "grad_norm": 0.8044828772544861, + "learning_rate": 7.759749548508575e-05, + "loss": 0.8049, + "step": 98300 + }, + { + "epoch": 0.628074569081175, + "grad_norm": 1.2165446281433105, + "learning_rate": 7.759331121266489e-05, + "loss": 0.7037, + "step": 98310 + }, + { + "epoch": 0.6281384562309137, + "grad_norm": 0.5233101844787598, + "learning_rate": 7.758912666235552e-05, + "loss": 0.8435, + "step": 98320 + }, + { + "epoch": 0.6282023433806524, + "grad_norm": 0.7019632458686829, + "learning_rate": 7.758494183419978e-05, + "loss": 0.7907, + "step": 98330 + }, + { + "epoch": 0.6282662305303911, + "grad_norm": 0.7987385988235474, + "learning_rate": 7.758075672823982e-05, + "loss": 1.0906, + "step": 98340 + }, + { + "epoch": 0.6283301176801298, + "grad_norm": 0.8817057013511658, + "learning_rate": 7.757657134451776e-05, + "loss": 0.9428, + 
"step": 98350 + }, + { + "epoch": 0.6283940048298685, + "grad_norm": 0.7818195819854736, + "learning_rate": 7.757238568307576e-05, + "loss": 0.6948, + "step": 98360 + }, + { + "epoch": 0.6284578919796072, + "grad_norm": 1.5914932489395142, + "learning_rate": 7.756819974395602e-05, + "loss": 0.9719, + "step": 98370 + }, + { + "epoch": 0.6285217791293459, + "grad_norm": 0.9636878967285156, + "learning_rate": 7.756401352720063e-05, + "loss": 0.9189, + "step": 98380 + }, + { + "epoch": 0.6285856662790846, + "grad_norm": 1.070579171180725, + "learning_rate": 7.755982703285178e-05, + "loss": 0.9929, + "step": 98390 + }, + { + "epoch": 0.6286495534288233, + "grad_norm": 0.9646096229553223, + "learning_rate": 7.755564026095164e-05, + "loss": 0.7181, + "step": 98400 + }, + { + "epoch": 0.628713440578562, + "grad_norm": 1.1919089555740356, + "learning_rate": 7.755145321154235e-05, + "loss": 0.956, + "step": 98410 + }, + { + "epoch": 0.6287773277283008, + "grad_norm": 1.0525037050247192, + "learning_rate": 7.754726588466611e-05, + "loss": 0.8289, + "step": 98420 + }, + { + "epoch": 0.6288412148780395, + "grad_norm": 0.8866745233535767, + "learning_rate": 7.754307828036507e-05, + "loss": 1.1507, + "step": 98430 + }, + { + "epoch": 0.6289051020277782, + "grad_norm": 1.4663811922073364, + "learning_rate": 7.753889039868138e-05, + "loss": 0.8841, + "step": 98440 + }, + { + "epoch": 0.6289689891775169, + "grad_norm": 0.8468247056007385, + "learning_rate": 7.753470223965726e-05, + "loss": 1.0977, + "step": 98450 + }, + { + "epoch": 0.6290328763272556, + "grad_norm": 0.6286731958389282, + "learning_rate": 7.753051380333485e-05, + "loss": 0.6818, + "step": 98460 + }, + { + "epoch": 0.6290967634769943, + "grad_norm": 1.2429255247116089, + "learning_rate": 7.752632508975636e-05, + "loss": 0.8151, + "step": 98470 + }, + { + "epoch": 0.629160650626733, + "grad_norm": 1.2103321552276611, + "learning_rate": 7.752213609896396e-05, + "loss": 0.7415, + "step": 98480 + }, + { + "epoch": 
0.6292245377764717, + "grad_norm": 1.9655529260635376, + "learning_rate": 7.751794683099986e-05, + "loss": 1.1028, + "step": 98490 + }, + { + "epoch": 0.6292884249262103, + "grad_norm": 0.7226641774177551, + "learning_rate": 7.75137572859062e-05, + "loss": 0.9055, + "step": 98500 + }, + { + "epoch": 0.629352312075949, + "grad_norm": 1.1247568130493164, + "learning_rate": 7.750956746372521e-05, + "loss": 1.0251, + "step": 98510 + }, + { + "epoch": 0.6294161992256877, + "grad_norm": 1.818439245223999, + "learning_rate": 7.750537736449908e-05, + "loss": 0.6323, + "step": 98520 + }, + { + "epoch": 0.6294800863754264, + "grad_norm": 0.6944345235824585, + "learning_rate": 7.750118698827e-05, + "loss": 0.9022, + "step": 98530 + }, + { + "epoch": 0.6295439735251651, + "grad_norm": 1.0383299589157104, + "learning_rate": 7.749699633508019e-05, + "loss": 0.9505, + "step": 98540 + }, + { + "epoch": 0.6296078606749038, + "grad_norm": 0.5083116888999939, + "learning_rate": 7.749280540497181e-05, + "loss": 0.7154, + "step": 98550 + }, + { + "epoch": 0.6296717478246425, + "grad_norm": 1.316440224647522, + "learning_rate": 7.748861419798712e-05, + "loss": 0.7183, + "step": 98560 + }, + { + "epoch": 0.6297356349743812, + "grad_norm": 0.9615148901939392, + "learning_rate": 7.74844227141683e-05, + "loss": 1.0438, + "step": 98570 + }, + { + "epoch": 0.6297995221241199, + "grad_norm": 1.223386526107788, + "learning_rate": 7.748023095355756e-05, + "loss": 0.8429, + "step": 98580 + }, + { + "epoch": 0.6298634092738586, + "grad_norm": 0.7648318409919739, + "learning_rate": 7.747603891619712e-05, + "loss": 0.8862, + "step": 98590 + }, + { + "epoch": 0.6299272964235973, + "grad_norm": 0.8979175686836243, + "learning_rate": 7.747184660212918e-05, + "loss": 0.9744, + "step": 98600 + }, + { + "epoch": 0.629991183573336, + "grad_norm": 0.8479979038238525, + "learning_rate": 7.7467654011396e-05, + "loss": 0.9066, + "step": 98610 + }, + { + "epoch": 0.6300550707230748, + "grad_norm": 
1.0452567338943481, + "learning_rate": 7.746346114403978e-05, + "loss": 0.717, + "step": 98620 + }, + { + "epoch": 0.6301189578728135, + "grad_norm": 0.5586809515953064, + "learning_rate": 7.745926800010275e-05, + "loss": 0.7231, + "step": 98630 + }, + { + "epoch": 0.6301828450225522, + "grad_norm": 0.9758456945419312, + "learning_rate": 7.745507457962712e-05, + "loss": 0.7899, + "step": 98640 + }, + { + "epoch": 0.6302467321722909, + "grad_norm": 0.8799155354499817, + "learning_rate": 7.745088088265516e-05, + "loss": 0.8026, + "step": 98650 + }, + { + "epoch": 0.6303106193220296, + "grad_norm": 0.7209200263023376, + "learning_rate": 7.744668690922907e-05, + "loss": 0.9363, + "step": 98660 + }, + { + "epoch": 0.6303745064717683, + "grad_norm": 1.0429208278656006, + "learning_rate": 7.74424926593911e-05, + "loss": 1.1868, + "step": 98670 + }, + { + "epoch": 0.630438393621507, + "grad_norm": 0.829575777053833, + "learning_rate": 7.743829813318349e-05, + "loss": 0.795, + "step": 98680 + }, + { + "epoch": 0.6305022807712457, + "grad_norm": 0.7974848747253418, + "learning_rate": 7.743410333064847e-05, + "loss": 1.0371, + "step": 98690 + }, + { + "epoch": 0.6305661679209844, + "grad_norm": 1.1023069620132446, + "learning_rate": 7.74299082518283e-05, + "loss": 0.8404, + "step": 98700 + }, + { + "epoch": 0.6306300550707231, + "grad_norm": 1.1051509380340576, + "learning_rate": 7.742571289676522e-05, + "loss": 0.9185, + "step": 98710 + }, + { + "epoch": 0.6306939422204618, + "grad_norm": 1.0306414365768433, + "learning_rate": 7.742151726550149e-05, + "loss": 0.784, + "step": 98720 + }, + { + "epoch": 0.6307578293702005, + "grad_norm": 0.6763244271278381, + "learning_rate": 7.741732135807937e-05, + "loss": 0.983, + "step": 98730 + }, + { + "epoch": 0.6308217165199391, + "grad_norm": 0.5767059326171875, + "learning_rate": 7.741312517454109e-05, + "loss": 0.9532, + "step": 98740 + }, + { + "epoch": 0.6308856036696778, + "grad_norm": 0.8445504307746887, + "learning_rate": 
7.740892871492894e-05, + "loss": 1.0529, + "step": 98750 + }, + { + "epoch": 0.6309494908194165, + "grad_norm": 0.5848102569580078, + "learning_rate": 7.740473197928513e-05, + "loss": 0.6972, + "step": 98760 + }, + { + "epoch": 0.6310133779691552, + "grad_norm": 1.0560247898101807, + "learning_rate": 7.740053496765199e-05, + "loss": 0.8724, + "step": 98770 + }, + { + "epoch": 0.631077265118894, + "grad_norm": 1.2998313903808594, + "learning_rate": 7.739633768007175e-05, + "loss": 1.0596, + "step": 98780 + }, + { + "epoch": 0.6311411522686327, + "grad_norm": 0.6636534333229065, + "learning_rate": 7.739214011658669e-05, + "loss": 0.9184, + "step": 98790 + }, + { + "epoch": 0.6312050394183714, + "grad_norm": 0.8721036911010742, + "learning_rate": 7.738794227723907e-05, + "loss": 0.9777, + "step": 98800 + }, + { + "epoch": 0.6312689265681101, + "grad_norm": 0.9047155380249023, + "learning_rate": 7.73837441620712e-05, + "loss": 0.8865, + "step": 98810 + }, + { + "epoch": 0.6313328137178488, + "grad_norm": 0.7591509819030762, + "learning_rate": 7.737954577112532e-05, + "loss": 0.8674, + "step": 98820 + }, + { + "epoch": 0.6313967008675875, + "grad_norm": 0.7103126049041748, + "learning_rate": 7.737534710444372e-05, + "loss": 1.0165, + "step": 98830 + }, + { + "epoch": 0.6314605880173262, + "grad_norm": 0.9940080642700195, + "learning_rate": 7.73711481620687e-05, + "loss": 0.8711, + "step": 98840 + }, + { + "epoch": 0.6315244751670649, + "grad_norm": 0.8602542281150818, + "learning_rate": 7.736694894404254e-05, + "loss": 0.8148, + "step": 98850 + }, + { + "epoch": 0.6315883623168036, + "grad_norm": 0.687978208065033, + "learning_rate": 7.736274945040753e-05, + "loss": 1.0732, + "step": 98860 + }, + { + "epoch": 0.6316522494665423, + "grad_norm": 0.7140915989875793, + "learning_rate": 7.735854968120596e-05, + "loss": 1.043, + "step": 98870 + }, + { + "epoch": 0.631716136616281, + "grad_norm": 0.7227775454521179, + "learning_rate": 7.735434963648013e-05, + "loss": 0.8709, + 
"step": 98880 + }, + { + "epoch": 0.6317800237660197, + "grad_norm": 0.5700997710227966, + "learning_rate": 7.735014931627234e-05, + "loss": 0.7587, + "step": 98890 + }, + { + "epoch": 0.6318439109157584, + "grad_norm": 1.2444158792495728, + "learning_rate": 7.734594872062486e-05, + "loss": 0.7822, + "step": 98900 + }, + { + "epoch": 0.6319077980654971, + "grad_norm": 1.6679047346115112, + "learning_rate": 7.734174784958004e-05, + "loss": 1.1175, + "step": 98910 + }, + { + "epoch": 0.6319716852152358, + "grad_norm": 0.6713977456092834, + "learning_rate": 7.733754670318016e-05, + "loss": 0.8503, + "step": 98920 + }, + { + "epoch": 0.6320355723649745, + "grad_norm": 1.2853600978851318, + "learning_rate": 7.733334528146753e-05, + "loss": 0.8824, + "step": 98930 + }, + { + "epoch": 0.6320994595147132, + "grad_norm": 1.0082619190216064, + "learning_rate": 7.732914358448448e-05, + "loss": 0.9314, + "step": 98940 + }, + { + "epoch": 0.632163346664452, + "grad_norm": 0.7332447171211243, + "learning_rate": 7.73249416122733e-05, + "loss": 0.7704, + "step": 98950 + }, + { + "epoch": 0.6322272338141907, + "grad_norm": 1.46458101272583, + "learning_rate": 7.732073936487631e-05, + "loss": 0.9249, + "step": 98960 + }, + { + "epoch": 0.6322911209639294, + "grad_norm": 0.8572206497192383, + "learning_rate": 7.731653684233585e-05, + "loss": 1.1214, + "step": 98970 + }, + { + "epoch": 0.6323550081136681, + "grad_norm": 0.780282199382782, + "learning_rate": 7.731233404469424e-05, + "loss": 0.7917, + "step": 98980 + }, + { + "epoch": 0.6324188952634067, + "grad_norm": 0.9528806209564209, + "learning_rate": 7.730813097199379e-05, + "loss": 0.9149, + "step": 98990 + }, + { + "epoch": 0.6324827824131454, + "grad_norm": 0.9242857098579407, + "learning_rate": 7.730392762427683e-05, + "loss": 0.7985, + "step": 99000 + }, + { + "epoch": 0.6325466695628841, + "grad_norm": 0.7129524350166321, + "learning_rate": 7.72997240015857e-05, + "loss": 0.978, + "step": 99010 + }, + { + "epoch": 
0.6326105567126228, + "grad_norm": 0.7845136523246765, + "learning_rate": 7.729552010396274e-05, + "loss": 0.8295, + "step": 99020 + }, + { + "epoch": 0.6326744438623615, + "grad_norm": 0.6033239960670471, + "learning_rate": 7.729131593145027e-05, + "loss": 1.1444, + "step": 99030 + }, + { + "epoch": 0.6327383310121002, + "grad_norm": 0.8433771133422852, + "learning_rate": 7.728711148409063e-05, + "loss": 1.1936, + "step": 99040 + }, + { + "epoch": 0.6328022181618389, + "grad_norm": 1.812710165977478, + "learning_rate": 7.728290676192619e-05, + "loss": 0.7792, + "step": 99050 + }, + { + "epoch": 0.6328661053115776, + "grad_norm": 0.8290817737579346, + "learning_rate": 7.727870176499928e-05, + "loss": 0.756, + "step": 99060 + }, + { + "epoch": 0.6329299924613163, + "grad_norm": 1.5934466123580933, + "learning_rate": 7.727449649335222e-05, + "loss": 1.0873, + "step": 99070 + }, + { + "epoch": 0.632993879611055, + "grad_norm": 1.0674597024917603, + "learning_rate": 7.727029094702739e-05, + "loss": 0.7499, + "step": 99080 + }, + { + "epoch": 0.6330577667607937, + "grad_norm": 0.7989637851715088, + "learning_rate": 7.726608512606714e-05, + "loss": 0.926, + "step": 99090 + }, + { + "epoch": 0.6331216539105324, + "grad_norm": 0.6635336875915527, + "learning_rate": 7.726187903051383e-05, + "loss": 0.8383, + "step": 99100 + }, + { + "epoch": 0.6331855410602711, + "grad_norm": 0.4389435648918152, + "learning_rate": 7.725767266040982e-05, + "loss": 0.8101, + "step": 99110 + }, + { + "epoch": 0.6332494282100098, + "grad_norm": 0.8760795593261719, + "learning_rate": 7.725346601579744e-05, + "loss": 1.3585, + "step": 99120 + }, + { + "epoch": 0.6333133153597486, + "grad_norm": 1.2530437707901, + "learning_rate": 7.72492590967191e-05, + "loss": 1.0076, + "step": 99130 + }, + { + "epoch": 0.6333772025094873, + "grad_norm": 1.7212085723876953, + "learning_rate": 7.724505190321714e-05, + "loss": 1.144, + "step": 99140 + }, + { + "epoch": 0.633441089659226, + "grad_norm": 
1.0681732892990112, + "learning_rate": 7.724084443533395e-05, + "loss": 0.8348, + "step": 99150 + }, + { + "epoch": 0.6335049768089647, + "grad_norm": 0.8044700026512146, + "learning_rate": 7.723663669311188e-05, + "loss": 0.9591, + "step": 99160 + }, + { + "epoch": 0.6335688639587034, + "grad_norm": 1.0960590839385986, + "learning_rate": 7.723242867659331e-05, + "loss": 0.9373, + "step": 99170 + }, + { + "epoch": 0.6336327511084421, + "grad_norm": 1.0857396125793457, + "learning_rate": 7.722822038582062e-05, + "loss": 1.23, + "step": 99180 + }, + { + "epoch": 0.6336966382581808, + "grad_norm": 0.9322388768196106, + "learning_rate": 7.722401182083621e-05, + "loss": 0.8729, + "step": 99190 + }, + { + "epoch": 0.6337605254079195, + "grad_norm": 0.8809077143669128, + "learning_rate": 7.721980298168243e-05, + "loss": 1.1811, + "step": 99200 + }, + { + "epoch": 0.6338244125576582, + "grad_norm": 0.662137508392334, + "learning_rate": 7.721559386840172e-05, + "loss": 0.8042, + "step": 99210 + }, + { + "epoch": 0.6338882997073969, + "grad_norm": 0.7486308813095093, + "learning_rate": 7.72113844810364e-05, + "loss": 0.8313, + "step": 99220 + }, + { + "epoch": 0.6339521868571355, + "grad_norm": 1.1322681903839111, + "learning_rate": 7.720717481962891e-05, + "loss": 0.9974, + "step": 99230 + }, + { + "epoch": 0.6340160740068742, + "grad_norm": 0.9434690475463867, + "learning_rate": 7.720296488422163e-05, + "loss": 0.7033, + "step": 99240 + }, + { + "epoch": 0.6340799611566129, + "grad_norm": 1.0906821489334106, + "learning_rate": 7.719875467485696e-05, + "loss": 0.7467, + "step": 99250 + }, + { + "epoch": 0.6341438483063516, + "grad_norm": 0.9866161942481995, + "learning_rate": 7.71945441915773e-05, + "loss": 0.9562, + "step": 99260 + }, + { + "epoch": 0.6342077354560903, + "grad_norm": 1.0579166412353516, + "learning_rate": 7.719033343442506e-05, + "loss": 0.7671, + "step": 99270 + }, + { + "epoch": 0.634271622605829, + "grad_norm": 0.8389692902565002, + "learning_rate": 
7.718612240344264e-05, + "loss": 0.9807, + "step": 99280 + }, + { + "epoch": 0.6343355097555677, + "grad_norm": 0.9779929518699646, + "learning_rate": 7.718191109867244e-05, + "loss": 0.9206, + "step": 99290 + }, + { + "epoch": 0.6343993969053064, + "grad_norm": 1.6635096073150635, + "learning_rate": 7.717769952015687e-05, + "loss": 0.8582, + "step": 99300 + }, + { + "epoch": 0.6344632840550452, + "grad_norm": 1.1410149335861206, + "learning_rate": 7.717348766793837e-05, + "loss": 0.9895, + "step": 99310 + }, + { + "epoch": 0.6345271712047839, + "grad_norm": 0.8290955424308777, + "learning_rate": 7.716927554205935e-05, + "loss": 0.6947, + "step": 99320 + }, + { + "epoch": 0.6345910583545226, + "grad_norm": 0.919790506362915, + "learning_rate": 7.71650631425622e-05, + "loss": 0.8935, + "step": 99330 + }, + { + "epoch": 0.6346549455042613, + "grad_norm": 0.9774859547615051, + "learning_rate": 7.716085046948937e-05, + "loss": 0.9696, + "step": 99340 + }, + { + "epoch": 0.634718832654, + "grad_norm": 0.6959844827651978, + "learning_rate": 7.715663752288328e-05, + "loss": 0.8616, + "step": 99350 + }, + { + "epoch": 0.6347827198037387, + "grad_norm": 0.7934970855712891, + "learning_rate": 7.715242430278636e-05, + "loss": 1.0319, + "step": 99360 + }, + { + "epoch": 0.6348466069534774, + "grad_norm": 0.8985568284988403, + "learning_rate": 7.714821080924102e-05, + "loss": 1.032, + "step": 99370 + }, + { + "epoch": 0.6349104941032161, + "grad_norm": 0.8157596588134766, + "learning_rate": 7.714399704228972e-05, + "loss": 0.8366, + "step": 99380 + }, + { + "epoch": 0.6349743812529548, + "grad_norm": 0.6816175580024719, + "learning_rate": 7.713978300197488e-05, + "loss": 0.8663, + "step": 99390 + }, + { + "epoch": 0.6350382684026935, + "grad_norm": 1.0564788579940796, + "learning_rate": 7.713556868833896e-05, + "loss": 0.9809, + "step": 99400 + }, + { + "epoch": 0.6351021555524322, + "grad_norm": 1.043823480606079, + "learning_rate": 7.713135410142437e-05, + "loss": 0.7598, + 
"step": 99410 + }, + { + "epoch": 0.6351660427021709, + "grad_norm": 0.858410120010376, + "learning_rate": 7.71271392412736e-05, + "loss": 0.9618, + "step": 99420 + }, + { + "epoch": 0.6352299298519096, + "grad_norm": 1.3805270195007324, + "learning_rate": 7.712292410792905e-05, + "loss": 0.9138, + "step": 99430 + }, + { + "epoch": 0.6352938170016483, + "grad_norm": 1.437741756439209, + "learning_rate": 7.711870870143321e-05, + "loss": 1.0721, + "step": 99440 + }, + { + "epoch": 0.635357704151387, + "grad_norm": 0.5732793807983398, + "learning_rate": 7.711449302182849e-05, + "loss": 0.8065, + "step": 99450 + }, + { + "epoch": 0.6354215913011257, + "grad_norm": 0.852961003780365, + "learning_rate": 7.711027706915738e-05, + "loss": 0.8523, + "step": 99460 + }, + { + "epoch": 0.6354854784508643, + "grad_norm": 0.7680826783180237, + "learning_rate": 7.710606084346232e-05, + "loss": 1.0963, + "step": 99470 + }, + { + "epoch": 0.635549365600603, + "grad_norm": 0.7236658334732056, + "learning_rate": 7.710184434478577e-05, + "loss": 1.0257, + "step": 99480 + }, + { + "epoch": 0.6356132527503418, + "grad_norm": 0.8568646907806396, + "learning_rate": 7.709762757317021e-05, + "loss": 0.7875, + "step": 99490 + }, + { + "epoch": 0.6356771399000805, + "grad_norm": 1.2610046863555908, + "learning_rate": 7.709341052865811e-05, + "loss": 1.0912, + "step": 99500 + }, + { + "epoch": 0.6357410270498192, + "grad_norm": 0.8206515312194824, + "learning_rate": 7.708919321129192e-05, + "loss": 1.2463, + "step": 99510 + }, + { + "epoch": 0.6358049141995579, + "grad_norm": 1.1320310831069946, + "learning_rate": 7.70849756211141e-05, + "loss": 0.7328, + "step": 99520 + }, + { + "epoch": 0.6358688013492966, + "grad_norm": 0.832253098487854, + "learning_rate": 7.708075775816715e-05, + "loss": 1.0437, + "step": 99530 + }, + { + "epoch": 0.6359326884990353, + "grad_norm": 0.8778328895568848, + "learning_rate": 7.707653962249355e-05, + "loss": 0.748, + "step": 99540 + }, + { + "epoch": 
0.635996575648774, + "grad_norm": 0.6165944933891296, + "learning_rate": 7.707232121413577e-05, + "loss": 0.9211, + "step": 99550 + }, + { + "epoch": 0.6360604627985127, + "grad_norm": 0.7879787683486938, + "learning_rate": 7.70681025331363e-05, + "loss": 0.7676, + "step": 99560 + }, + { + "epoch": 0.6361243499482514, + "grad_norm": 1.0421432256698608, + "learning_rate": 7.70638835795376e-05, + "loss": 0.951, + "step": 99570 + }, + { + "epoch": 0.6361882370979901, + "grad_norm": 0.9447425603866577, + "learning_rate": 7.705966435338218e-05, + "loss": 0.9486, + "step": 99580 + }, + { + "epoch": 0.6362521242477288, + "grad_norm": Infinity, + "learning_rate": 7.705586681684145e-05, + "loss": 1.0416, + "step": 99590 + }, + { + "epoch": 0.6363160113974675, + "grad_norm": 0.745619535446167, + "learning_rate": 7.705164707294533e-05, + "loss": 0.7663, + "step": 99600 + }, + { + "epoch": 0.6363798985472062, + "grad_norm": 1.6954360008239746, + "learning_rate": 7.704742705661573e-05, + "loss": 1.0033, + "step": 99610 + }, + { + "epoch": 0.6364437856969449, + "grad_norm": 0.6701345443725586, + "learning_rate": 7.704320676789514e-05, + "loss": 0.7165, + "step": 99620 + }, + { + "epoch": 0.6365076728466836, + "grad_norm": 0.7628158926963806, + "learning_rate": 7.703898620682606e-05, + "loss": 0.8939, + "step": 99630 + }, + { + "epoch": 0.6365715599964223, + "grad_norm": 1.4524946212768555, + "learning_rate": 7.7034765373451e-05, + "loss": 0.9185, + "step": 99640 + }, + { + "epoch": 0.636635447146161, + "grad_norm": 1.0932461023330688, + "learning_rate": 7.703054426781246e-05, + "loss": 0.8613, + "step": 99650 + }, + { + "epoch": 0.6366993342958998, + "grad_norm": 0.8748453855514526, + "learning_rate": 7.702632288995297e-05, + "loss": 0.9724, + "step": 99660 + }, + { + "epoch": 0.6367632214456385, + "grad_norm": 0.6235826015472412, + "learning_rate": 7.7022101239915e-05, + "loss": 0.8267, + "step": 99670 + }, + { + "epoch": 0.6368271085953772, + "grad_norm": 0.9133662581443787, + 
"learning_rate": 7.701787931774111e-05, + "loss": 0.9088, + "step": 99680 + }, + { + "epoch": 0.6368909957451159, + "grad_norm": 0.8982312679290771, + "learning_rate": 7.701365712347379e-05, + "loss": 0.9541, + "step": 99690 + }, + { + "epoch": 0.6369548828948546, + "grad_norm": 0.6857670545578003, + "learning_rate": 7.700943465715557e-05, + "loss": 1.0629, + "step": 99700 + }, + { + "epoch": 0.6370187700445933, + "grad_norm": 0.8805925250053406, + "learning_rate": 7.7005211918829e-05, + "loss": 0.9387, + "step": 99710 + }, + { + "epoch": 0.6370826571943319, + "grad_norm": 0.7308109402656555, + "learning_rate": 7.700098890853658e-05, + "loss": 0.6636, + "step": 99720 + }, + { + "epoch": 0.6371465443440706, + "grad_norm": 0.8776551485061646, + "learning_rate": 7.699676562632084e-05, + "loss": 0.9326, + "step": 99730 + }, + { + "epoch": 0.6372104314938093, + "grad_norm": 0.6855533719062805, + "learning_rate": 7.699254207222429e-05, + "loss": 1.1151, + "step": 99740 + }, + { + "epoch": 0.637274318643548, + "grad_norm": 1.2820175886154175, + "learning_rate": 7.698831824628951e-05, + "loss": 1.0056, + "step": 99750 + }, + { + "epoch": 0.6373382057932867, + "grad_norm": 0.8445931673049927, + "learning_rate": 7.698409414855902e-05, + "loss": 0.9203, + "step": 99760 + }, + { + "epoch": 0.6374020929430254, + "grad_norm": 1.1383895874023438, + "learning_rate": 7.697986977907534e-05, + "loss": 0.9536, + "step": 99770 + }, + { + "epoch": 0.6374659800927641, + "grad_norm": 1.0668944120407104, + "learning_rate": 7.697564513788105e-05, + "loss": 1.3005, + "step": 99780 + }, + { + "epoch": 0.6375298672425028, + "grad_norm": 1.0279992818832397, + "learning_rate": 7.697142022501866e-05, + "loss": 0.9918, + "step": 99790 + }, + { + "epoch": 0.6375937543922415, + "grad_norm": 0.8683011531829834, + "learning_rate": 7.696719504053075e-05, + "loss": 1.1289, + "step": 99800 + }, + { + "epoch": 0.6376576415419802, + "grad_norm": 0.6475001573562622, + "learning_rate": 7.696296958445985e-05, 
+ "loss": 0.9956, + "step": 99810 + }, + { + "epoch": 0.6377215286917189, + "grad_norm": 0.9741643667221069, + "learning_rate": 7.695874385684852e-05, + "loss": 0.8118, + "step": 99820 + }, + { + "epoch": 0.6377854158414576, + "grad_norm": 0.6049178838729858, + "learning_rate": 7.695451785773931e-05, + "loss": 0.9677, + "step": 99830 + }, + { + "epoch": 0.6378493029911964, + "grad_norm": 0.7751360535621643, + "learning_rate": 7.695029158717479e-05, + "loss": 0.7172, + "step": 99840 + }, + { + "epoch": 0.6379131901409351, + "grad_norm": 0.6430035829544067, + "learning_rate": 7.694606504519752e-05, + "loss": 0.9808, + "step": 99850 + }, + { + "epoch": 0.6379770772906738, + "grad_norm": 1.0987446308135986, + "learning_rate": 7.694183823185005e-05, + "loss": 0.8621, + "step": 99860 + }, + { + "epoch": 0.6380409644404125, + "grad_norm": 0.7962204217910767, + "learning_rate": 7.6937611147175e-05, + "loss": 0.9515, + "step": 99870 + }, + { + "epoch": 0.6381048515901512, + "grad_norm": 0.8740015625953674, + "learning_rate": 7.693338379121486e-05, + "loss": 0.865, + "step": 99880 + }, + { + "epoch": 0.6381687387398899, + "grad_norm": 0.8522897362709045, + "learning_rate": 7.692915616401226e-05, + "loss": 0.7795, + "step": 99890 + }, + { + "epoch": 0.6382326258896286, + "grad_norm": 0.8299471139907837, + "learning_rate": 7.692492826560978e-05, + "loss": 1.0699, + "step": 99900 + }, + { + "epoch": 0.6382965130393673, + "grad_norm": 1.0273027420043945, + "learning_rate": 7.692070009604994e-05, + "loss": 0.8754, + "step": 99910 + }, + { + "epoch": 0.638360400189106, + "grad_norm": 0.8855130672454834, + "learning_rate": 7.69164716553754e-05, + "loss": 0.9245, + "step": 99920 + }, + { + "epoch": 0.6384242873388447, + "grad_norm": 0.9745055437088013, + "learning_rate": 7.691224294362866e-05, + "loss": 0.6921, + "step": 99930 + }, + { + "epoch": 0.6384881744885834, + "grad_norm": 0.7872833609580994, + "learning_rate": 7.690801396085239e-05, + "loss": 0.7978, + "step": 99940 + }, + 
{ + "epoch": 0.6385520616383221, + "grad_norm": 0.8484395742416382, + "learning_rate": 7.690378470708912e-05, + "loss": 0.6428, + "step": 99950 + }, + { + "epoch": 0.6386159487880607, + "grad_norm": 0.9526743292808533, + "learning_rate": 7.689955518238148e-05, + "loss": 0.9055, + "step": 99960 + }, + { + "epoch": 0.6386798359377994, + "grad_norm": 0.7249189615249634, + "learning_rate": 7.689532538677203e-05, + "loss": 0.9418, + "step": 99970 + }, + { + "epoch": 0.6387437230875381, + "grad_norm": 0.695597767829895, + "learning_rate": 7.689109532030339e-05, + "loss": 0.9258, + "step": 99980 + }, + { + "epoch": 0.6388076102372768, + "grad_norm": 1.010576605796814, + "learning_rate": 7.688686498301816e-05, + "loss": 0.8175, + "step": 99990 + }, + { + "epoch": 0.6388714973870155, + "grad_norm": 0.8327822685241699, + "learning_rate": 7.688263437495892e-05, + "loss": 0.8687, + "step": 100000 + }, + { + "epoch": 0.6389353845367542, + "grad_norm": 0.7016774415969849, + "learning_rate": 7.687840349616833e-05, + "loss": 1.0383, + "step": 100010 + }, + { + "epoch": 0.638999271686493, + "grad_norm": 0.9525433778762817, + "learning_rate": 7.687417234668895e-05, + "loss": 0.8909, + "step": 100020 + }, + { + "epoch": 0.6390631588362317, + "grad_norm": 0.8068029284477234, + "learning_rate": 7.686994092656339e-05, + "loss": 0.9069, + "step": 100030 + }, + { + "epoch": 0.6391270459859704, + "grad_norm": 1.108211636543274, + "learning_rate": 7.686570923583429e-05, + "loss": 0.931, + "step": 100040 + }, + { + "epoch": 0.6391909331357091, + "grad_norm": 1.0641669034957886, + "learning_rate": 7.686147727454426e-05, + "loss": 0.8077, + "step": 100050 + }, + { + "epoch": 0.6392548202854478, + "grad_norm": 0.8529702425003052, + "learning_rate": 7.68572450427359e-05, + "loss": 1.153, + "step": 100060 + }, + { + "epoch": 0.6393187074351865, + "grad_norm": 0.6657126545906067, + "learning_rate": 7.685301254045188e-05, + "loss": 0.7854, + "step": 100070 + }, + { + "epoch": 0.6393825945849252, + 
"grad_norm": 1.6039618253707886, + "learning_rate": 7.684877976773476e-05, + "loss": 0.9493, + "step": 100080 + }, + { + "epoch": 0.6394464817346639, + "grad_norm": 0.7978668212890625, + "learning_rate": 7.684454672462723e-05, + "loss": 1.034, + "step": 100090 + }, + { + "epoch": 0.6395103688844026, + "grad_norm": 0.9934602379798889, + "learning_rate": 7.684031341117186e-05, + "loss": 1.1376, + "step": 100100 + }, + { + "epoch": 0.6395742560341413, + "grad_norm": 1.048313856124878, + "learning_rate": 7.683607982741132e-05, + "loss": 0.9767, + "step": 100110 + }, + { + "epoch": 0.63963814318388, + "grad_norm": 0.9221808910369873, + "learning_rate": 7.683184597338826e-05, + "loss": 0.836, + "step": 100120 + }, + { + "epoch": 0.6397020303336187, + "grad_norm": 0.9617723822593689, + "learning_rate": 7.682761184914528e-05, + "loss": 0.6673, + "step": 100130 + }, + { + "epoch": 0.6397659174833574, + "grad_norm": 1.2165039777755737, + "learning_rate": 7.682337745472505e-05, + "loss": 1.1207, + "step": 100140 + }, + { + "epoch": 0.6398298046330961, + "grad_norm": 1.1467498540878296, + "learning_rate": 7.681914279017019e-05, + "loss": 0.8988, + "step": 100150 + }, + { + "epoch": 0.6398936917828348, + "grad_norm": 1.0085322856903076, + "learning_rate": 7.681490785552337e-05, + "loss": 0.9632, + "step": 100160 + }, + { + "epoch": 0.6399575789325735, + "grad_norm": 1.1508851051330566, + "learning_rate": 7.681067265082721e-05, + "loss": 0.885, + "step": 100170 + }, + { + "epoch": 0.6400214660823123, + "grad_norm": 1.3940848112106323, + "learning_rate": 7.680643717612441e-05, + "loss": 0.9147, + "step": 100180 + }, + { + "epoch": 0.640085353232051, + "grad_norm": 1.0096964836120605, + "learning_rate": 7.680220143145757e-05, + "loss": 0.9108, + "step": 100190 + }, + { + "epoch": 0.6401492403817896, + "grad_norm": 0.6056742668151855, + "learning_rate": 7.679796541686942e-05, + "loss": 0.9895, + "step": 100200 + }, + { + "epoch": 0.6402131275315283, + "grad_norm": 
0.8732916116714478, + "learning_rate": 7.679372913240252e-05, + "loss": 0.8695, + "step": 100210 + }, + { + "epoch": 0.640277014681267, + "grad_norm": 1.1940739154815674, + "learning_rate": 7.678949257809962e-05, + "loss": 0.851, + "step": 100220 + }, + { + "epoch": 0.6403409018310057, + "grad_norm": 0.9139200448989868, + "learning_rate": 7.678525575400335e-05, + "loss": 0.8596, + "step": 100230 + }, + { + "epoch": 0.6404047889807444, + "grad_norm": 0.871724009513855, + "learning_rate": 7.678101866015638e-05, + "loss": 1.1388, + "step": 100240 + }, + { + "epoch": 0.6404686761304831, + "grad_norm": 1.4798542261123657, + "learning_rate": 7.677678129660137e-05, + "loss": 0.8845, + "step": 100250 + }, + { + "epoch": 0.6405325632802218, + "grad_norm": 0.8862691521644592, + "learning_rate": 7.677254366338103e-05, + "loss": 0.8407, + "step": 100260 + }, + { + "epoch": 0.6405964504299605, + "grad_norm": 1.0017880201339722, + "learning_rate": 7.676830576053799e-05, + "loss": 0.9927, + "step": 100270 + }, + { + "epoch": 0.6406603375796992, + "grad_norm": 1.1630281209945679, + "learning_rate": 7.676406758811497e-05, + "loss": 1.0337, + "step": 100280 + }, + { + "epoch": 0.6407242247294379, + "grad_norm": 0.8417305946350098, + "learning_rate": 7.675982914615464e-05, + "loss": 0.799, + "step": 100290 + }, + { + "epoch": 0.6407881118791766, + "grad_norm": 0.5187862515449524, + "learning_rate": 7.675559043469966e-05, + "loss": 0.9353, + "step": 100300 + }, + { + "epoch": 0.6408519990289153, + "grad_norm": 0.6978999376296997, + "learning_rate": 7.675135145379276e-05, + "loss": 0.9321, + "step": 100310 + }, + { + "epoch": 0.640915886178654, + "grad_norm": 0.9272652268409729, + "learning_rate": 7.674711220347659e-05, + "loss": 0.9838, + "step": 100320 + }, + { + "epoch": 0.6409797733283927, + "grad_norm": 0.8608036637306213, + "learning_rate": 7.674287268379386e-05, + "loss": 0.9587, + "step": 100330 + }, + { + "epoch": 0.6410436604781314, + "grad_norm": 1.0041062831878662, + 
"learning_rate": 7.673863289478727e-05, + "loss": 1.0735, + "step": 100340 + }, + { + "epoch": 0.6411075476278701, + "grad_norm": 0.7018103003501892, + "learning_rate": 7.673439283649952e-05, + "loss": 0.9516, + "step": 100350 + }, + { + "epoch": 0.6411714347776089, + "grad_norm": 1.4162198305130005, + "learning_rate": 7.673015250897331e-05, + "loss": 1.0228, + "step": 100360 + }, + { + "epoch": 0.6412353219273476, + "grad_norm": 0.7780821919441223, + "learning_rate": 7.672591191225134e-05, + "loss": 0.9125, + "step": 100370 + }, + { + "epoch": 0.6412992090770863, + "grad_norm": 0.9130464792251587, + "learning_rate": 7.67216710463763e-05, + "loss": 0.779, + "step": 100380 + }, + { + "epoch": 0.641363096226825, + "grad_norm": 1.32298743724823, + "learning_rate": 7.671742991139093e-05, + "loss": 0.7318, + "step": 100390 + }, + { + "epoch": 0.6414269833765637, + "grad_norm": 0.8966230750083923, + "learning_rate": 7.671318850733791e-05, + "loss": 0.8519, + "step": 100400 + }, + { + "epoch": 0.6414908705263024, + "grad_norm": 0.7686439752578735, + "learning_rate": 7.670894683425997e-05, + "loss": 0.796, + "step": 100410 + }, + { + "epoch": 0.6415547576760411, + "grad_norm": 0.48994553089141846, + "learning_rate": 7.670470489219986e-05, + "loss": 0.8914, + "step": 100420 + }, + { + "epoch": 0.6416186448257798, + "grad_norm": 0.9652213454246521, + "learning_rate": 7.670046268120023e-05, + "loss": 0.8382, + "step": 100430 + }, + { + "epoch": 0.6416825319755184, + "grad_norm": 0.8491564989089966, + "learning_rate": 7.669622020130387e-05, + "loss": 0.6973, + "step": 100440 + }, + { + "epoch": 0.6417464191252571, + "grad_norm": 0.8289761543273926, + "learning_rate": 7.669197745255348e-05, + "loss": 1.0381, + "step": 100450 + }, + { + "epoch": 0.6418103062749958, + "grad_norm": 1.043124794960022, + "learning_rate": 7.668773443499176e-05, + "loss": 0.9624, + "step": 100460 + }, + { + "epoch": 0.6418741934247345, + "grad_norm": 0.8697907328605652, + "learning_rate": 
7.668349114866149e-05, + "loss": 0.8019, + "step": 100470 + }, + { + "epoch": 0.6419380805744732, + "grad_norm": 2.3505332469940186, + "learning_rate": 7.667924759360537e-05, + "loss": 0.7879, + "step": 100480 + }, + { + "epoch": 0.6420019677242119, + "grad_norm": 0.638027548789978, + "learning_rate": 7.667500376986614e-05, + "loss": 0.8642, + "step": 100490 + }, + { + "epoch": 0.6420658548739506, + "grad_norm": 0.7815401554107666, + "learning_rate": 7.667075967748655e-05, + "loss": 0.9564, + "step": 100500 + }, + { + "epoch": 0.6421297420236893, + "grad_norm": 0.5453735589981079, + "learning_rate": 7.666651531650934e-05, + "loss": 0.9481, + "step": 100510 + }, + { + "epoch": 0.642193629173428, + "grad_norm": 0.6826755404472351, + "learning_rate": 7.666227068697722e-05, + "loss": 0.9086, + "step": 100520 + }, + { + "epoch": 0.6422575163231667, + "grad_norm": 1.5171852111816406, + "learning_rate": 7.665802578893301e-05, + "loss": 0.993, + "step": 100530 + }, + { + "epoch": 0.6423214034729055, + "grad_norm": 1.0722650289535522, + "learning_rate": 7.665378062241939e-05, + "loss": 1.005, + "step": 100540 + }, + { + "epoch": 0.6423852906226442, + "grad_norm": 0.9586762189865112, + "learning_rate": 7.664953518747916e-05, + "loss": 0.8817, + "step": 100550 + }, + { + "epoch": 0.6424491777723829, + "grad_norm": 1.246511459350586, + "learning_rate": 7.664528948415505e-05, + "loss": 0.7576, + "step": 100560 + }, + { + "epoch": 0.6425130649221216, + "grad_norm": 0.9459572434425354, + "learning_rate": 7.664104351248982e-05, + "loss": 1.0113, + "step": 100570 + }, + { + "epoch": 0.6425769520718603, + "grad_norm": 0.9673700332641602, + "learning_rate": 7.663679727252624e-05, + "loss": 1.2202, + "step": 100580 + }, + { + "epoch": 0.642640839221599, + "grad_norm": 1.0018703937530518, + "learning_rate": 7.663255076430707e-05, + "loss": 0.8106, + "step": 100590 + }, + { + "epoch": 0.6427047263713377, + "grad_norm": 2.040289878845215, + "learning_rate": 7.662830398787506e-05, + 
"loss": 0.8261, + "step": 100600 + }, + { + "epoch": 0.6427686135210764, + "grad_norm": 1.0974010229110718, + "learning_rate": 7.662405694327302e-05, + "loss": 0.9523, + "step": 100610 + }, + { + "epoch": 0.6428325006708151, + "grad_norm": 0.9285750389099121, + "learning_rate": 7.661980963054366e-05, + "loss": 0.7965, + "step": 100620 + }, + { + "epoch": 0.6428963878205538, + "grad_norm": 0.529984176158905, + "learning_rate": 7.66155620497298e-05, + "loss": 0.6934, + "step": 100630 + }, + { + "epoch": 0.6429602749702925, + "grad_norm": 0.9809777736663818, + "learning_rate": 7.661131420087421e-05, + "loss": 1.0064, + "step": 100640 + }, + { + "epoch": 0.6430241621200312, + "grad_norm": 0.7377033829689026, + "learning_rate": 7.660706608401965e-05, + "loss": 0.9637, + "step": 100650 + }, + { + "epoch": 0.6430880492697699, + "grad_norm": 0.7346864342689514, + "learning_rate": 7.660281769920893e-05, + "loss": 0.8089, + "step": 100660 + }, + { + "epoch": 0.6431519364195086, + "grad_norm": 0.6735924482345581, + "learning_rate": 7.659856904648482e-05, + "loss": 0.7855, + "step": 100670 + }, + { + "epoch": 0.6432158235692473, + "grad_norm": 1.3427221775054932, + "learning_rate": 7.659432012589009e-05, + "loss": 0.7815, + "step": 100680 + }, + { + "epoch": 0.6432797107189859, + "grad_norm": 0.7119907736778259, + "learning_rate": 7.659007093746757e-05, + "loss": 0.8669, + "step": 100690 + }, + { + "epoch": 0.6433435978687246, + "grad_norm": 1.4726430177688599, + "learning_rate": 7.658582148126001e-05, + "loss": 0.9779, + "step": 100700 + }, + { + "epoch": 0.6434074850184633, + "grad_norm": 0.721088171005249, + "learning_rate": 7.658157175731024e-05, + "loss": 0.7114, + "step": 100710 + }, + { + "epoch": 0.643471372168202, + "grad_norm": 0.8141944408416748, + "learning_rate": 7.657732176566105e-05, + "loss": 1.0834, + "step": 100720 + }, + { + "epoch": 0.6435352593179408, + "grad_norm": 1.5644798278808594, + "learning_rate": 7.657307150635524e-05, + "loss": 1.3679, + "step": 
100730 + }, + { + "epoch": 0.6435991464676795, + "grad_norm": 1.1040544509887695, + "learning_rate": 7.65688209794356e-05, + "loss": 0.913, + "step": 100740 + }, + { + "epoch": 0.6436630336174182, + "grad_norm": 0.8945556282997131, + "learning_rate": 7.656457018494496e-05, + "loss": 0.9663, + "step": 100750 + }, + { + "epoch": 0.6437269207671569, + "grad_norm": 0.8544933199882507, + "learning_rate": 7.656031912292612e-05, + "loss": 0.7184, + "step": 100760 + }, + { + "epoch": 0.6437908079168956, + "grad_norm": 0.5265579223632812, + "learning_rate": 7.655606779342188e-05, + "loss": 0.9399, + "step": 100770 + }, + { + "epoch": 0.6438546950666343, + "grad_norm": 0.9724834561347961, + "learning_rate": 7.655181619647505e-05, + "loss": 0.919, + "step": 100780 + }, + { + "epoch": 0.643918582216373, + "grad_norm": 0.756826639175415, + "learning_rate": 7.654756433212848e-05, + "loss": 0.7773, + "step": 100790 + }, + { + "epoch": 0.6439824693661117, + "grad_norm": 0.599709689617157, + "learning_rate": 7.654331220042497e-05, + "loss": 0.8449, + "step": 100800 + }, + { + "epoch": 0.6440463565158504, + "grad_norm": 0.8727756142616272, + "learning_rate": 7.653905980140734e-05, + "loss": 0.8233, + "step": 100810 + }, + { + "epoch": 0.6441102436655891, + "grad_norm": 1.616363286972046, + "learning_rate": 7.653480713511841e-05, + "loss": 0.8221, + "step": 100820 + }, + { + "epoch": 0.6441741308153278, + "grad_norm": 0.8817083239555359, + "learning_rate": 7.653055420160102e-05, + "loss": 0.8405, + "step": 100830 + }, + { + "epoch": 0.6442380179650665, + "grad_norm": 1.3261228799819946, + "learning_rate": 7.6526301000898e-05, + "loss": 0.9731, + "step": 100840 + }, + { + "epoch": 0.6443019051148052, + "grad_norm": 0.7460963726043701, + "learning_rate": 7.652204753305217e-05, + "loss": 0.8105, + "step": 100850 + }, + { + "epoch": 0.6443657922645439, + "grad_norm": 1.7109055519104004, + "learning_rate": 7.651779379810639e-05, + "loss": 1.1521, + "step": 100860 + }, + { + "epoch": 
0.6444296794142826, + "grad_norm": 1.031250238418579, + "learning_rate": 7.651353979610348e-05, + "loss": 0.9203, + "step": 100870 + }, + { + "epoch": 0.6444935665640213, + "grad_norm": 1.0401231050491333, + "learning_rate": 7.650928552708628e-05, + "loss": 0.7879, + "step": 100880 + }, + { + "epoch": 0.64455745371376, + "grad_norm": 0.9327844381332397, + "learning_rate": 7.650503099109765e-05, + "loss": 0.9528, + "step": 100890 + }, + { + "epoch": 0.6446213408634988, + "grad_norm": 0.803861677646637, + "learning_rate": 7.650077618818044e-05, + "loss": 0.7373, + "step": 100900 + }, + { + "epoch": 0.6446852280132375, + "grad_norm": 1.1715584993362427, + "learning_rate": 7.649652111837746e-05, + "loss": 0.963, + "step": 100910 + }, + { + "epoch": 0.6447491151629762, + "grad_norm": 0.7066060900688171, + "learning_rate": 7.649226578173161e-05, + "loss": 0.8937, + "step": 100920 + }, + { + "epoch": 0.6448130023127148, + "grad_norm": 0.9723853468894958, + "learning_rate": 7.648801017828571e-05, + "loss": 0.9408, + "step": 100930 + }, + { + "epoch": 0.6448768894624535, + "grad_norm": 0.7268878817558289, + "learning_rate": 7.648375430808264e-05, + "loss": 0.9303, + "step": 100940 + }, + { + "epoch": 0.6449407766121922, + "grad_norm": 0.8822718262672424, + "learning_rate": 7.647949817116525e-05, + "loss": 1.1401, + "step": 100950 + }, + { + "epoch": 0.6450046637619309, + "grad_norm": 0.9792453646659851, + "learning_rate": 7.64752417675764e-05, + "loss": 0.9359, + "step": 100960 + }, + { + "epoch": 0.6450685509116696, + "grad_norm": 0.5390404462814331, + "learning_rate": 7.647098509735897e-05, + "loss": 0.8654, + "step": 100970 + }, + { + "epoch": 0.6451324380614083, + "grad_norm": 0.7712870836257935, + "learning_rate": 7.646672816055583e-05, + "loss": 0.8716, + "step": 100980 + }, + { + "epoch": 0.645196325211147, + "grad_norm": 0.711517333984375, + "learning_rate": 7.646247095720982e-05, + "loss": 0.978, + "step": 100990 + }, + { + "epoch": 0.6452602123608857, + 
"grad_norm": 0.6369432210922241, + "learning_rate": 7.645821348736383e-05, + "loss": 0.8599, + "step": 101000 + }, + { + "epoch": 0.6453240995106244, + "grad_norm": 1.0040960311889648, + "learning_rate": 7.645395575106075e-05, + "loss": 0.8098, + "step": 101010 + }, + { + "epoch": 0.6453879866603631, + "grad_norm": 0.9577940106391907, + "learning_rate": 7.644969774834348e-05, + "loss": 0.6533, + "step": 101020 + }, + { + "epoch": 0.6454518738101018, + "grad_norm": 0.8677441477775574, + "learning_rate": 7.644543947925483e-05, + "loss": 0.8457, + "step": 101030 + }, + { + "epoch": 0.6455157609598405, + "grad_norm": 0.7165183424949646, + "learning_rate": 7.644118094383774e-05, + "loss": 0.8876, + "step": 101040 + }, + { + "epoch": 0.6455796481095792, + "grad_norm": 1.3950011730194092, + "learning_rate": 7.643692214213507e-05, + "loss": 1.1486, + "step": 101050 + }, + { + "epoch": 0.645643535259318, + "grad_norm": 0.824797511100769, + "learning_rate": 7.643266307418974e-05, + "loss": 0.8211, + "step": 101060 + }, + { + "epoch": 0.6457074224090567, + "grad_norm": 0.7592344284057617, + "learning_rate": 7.642840374004463e-05, + "loss": 1.0379, + "step": 101070 + }, + { + "epoch": 0.6457713095587954, + "grad_norm": 0.9701903462409973, + "learning_rate": 7.642414413974262e-05, + "loss": 0.6966, + "step": 101080 + }, + { + "epoch": 0.6458351967085341, + "grad_norm": 0.8895474672317505, + "learning_rate": 7.641988427332663e-05, + "loss": 1.1851, + "step": 101090 + }, + { + "epoch": 0.6458990838582728, + "grad_norm": 0.7744872570037842, + "learning_rate": 7.641562414083952e-05, + "loss": 0.8782, + "step": 101100 + }, + { + "epoch": 0.6459629710080115, + "grad_norm": 1.7571711540222168, + "learning_rate": 7.641136374232425e-05, + "loss": 0.9443, + "step": 101110 + }, + { + "epoch": 0.6460268581577502, + "grad_norm": 0.6616213917732239, + "learning_rate": 7.640710307782368e-05, + "loss": 0.7928, + "step": 101120 + }, + { + "epoch": 0.6460907453074889, + "grad_norm": 
0.7461645603179932, + "learning_rate": 7.640284214738075e-05, + "loss": 1.0164, + "step": 101130 + }, + { + "epoch": 0.6461546324572276, + "grad_norm": 0.9420298933982849, + "learning_rate": 7.639858095103836e-05, + "loss": 0.7869, + "step": 101140 + }, + { + "epoch": 0.6462185196069663, + "grad_norm": 1.6276788711547852, + "learning_rate": 7.639431948883941e-05, + "loss": 0.6882, + "step": 101150 + }, + { + "epoch": 0.646282406756705, + "grad_norm": 1.1304795742034912, + "learning_rate": 7.639005776082683e-05, + "loss": 0.892, + "step": 101160 + }, + { + "epoch": 0.6463462939064436, + "grad_norm": 0.9252867102622986, + "learning_rate": 7.638579576704355e-05, + "loss": 0.8588, + "step": 101170 + }, + { + "epoch": 0.6464101810561823, + "grad_norm": 1.1409999132156372, + "learning_rate": 7.638153350753246e-05, + "loss": 0.927, + "step": 101180 + }, + { + "epoch": 0.646474068205921, + "grad_norm": 1.0091885328292847, + "learning_rate": 7.637727098233651e-05, + "loss": 0.9428, + "step": 101190 + }, + { + "epoch": 0.6465379553556597, + "grad_norm": 0.8046776652336121, + "learning_rate": 7.637300819149862e-05, + "loss": 1.0972, + "step": 101200 + }, + { + "epoch": 0.6466018425053984, + "grad_norm": 1.0795817375183105, + "learning_rate": 7.636874513506174e-05, + "loss": 1.1528, + "step": 101210 + }, + { + "epoch": 0.6466657296551371, + "grad_norm": 1.0064719915390015, + "learning_rate": 7.636448181306876e-05, + "loss": 0.876, + "step": 101220 + }, + { + "epoch": 0.6467296168048758, + "grad_norm": 0.786211371421814, + "learning_rate": 7.636021822556266e-05, + "loss": 0.9598, + "step": 101230 + }, + { + "epoch": 0.6467935039546145, + "grad_norm": 1.0055882930755615, + "learning_rate": 7.635595437258634e-05, + "loss": 0.8143, + "step": 101240 + }, + { + "epoch": 0.6468573911043533, + "grad_norm": 0.8458549380302429, + "learning_rate": 7.635169025418278e-05, + "loss": 0.89, + "step": 101250 + }, + { + "epoch": 0.646921278254092, + "grad_norm": 0.9229540228843689, + 
"learning_rate": 7.634742587039489e-05, + "loss": 0.6728, + "step": 101260 + }, + { + "epoch": 0.6469851654038307, + "grad_norm": 0.8958204388618469, + "learning_rate": 7.634316122126562e-05, + "loss": 0.7874, + "step": 101270 + }, + { + "epoch": 0.6470490525535694, + "grad_norm": 0.6617315411567688, + "learning_rate": 7.633889630683794e-05, + "loss": 0.8585, + "step": 101280 + }, + { + "epoch": 0.6471129397033081, + "grad_norm": 0.5204321146011353, + "learning_rate": 7.633463112715477e-05, + "loss": 0.8739, + "step": 101290 + }, + { + "epoch": 0.6471768268530468, + "grad_norm": 0.7551961541175842, + "learning_rate": 7.633036568225911e-05, + "loss": 0.7589, + "step": 101300 + }, + { + "epoch": 0.6472407140027855, + "grad_norm": 1.2205754518508911, + "learning_rate": 7.632609997219388e-05, + "loss": 1.1764, + "step": 101310 + }, + { + "epoch": 0.6473046011525242, + "grad_norm": 0.5850151181221008, + "learning_rate": 7.632183399700204e-05, + "loss": 0.7911, + "step": 101320 + }, + { + "epoch": 0.6473684883022629, + "grad_norm": 0.8024903535842896, + "learning_rate": 7.631756775672656e-05, + "loss": 0.9156, + "step": 101330 + }, + { + "epoch": 0.6474323754520016, + "grad_norm": 1.3309029340744019, + "learning_rate": 7.63133012514104e-05, + "loss": 0.8624, + "step": 101340 + }, + { + "epoch": 0.6474962626017403, + "grad_norm": 1.0072933435440063, + "learning_rate": 7.630903448109654e-05, + "loss": 0.7941, + "step": 101350 + }, + { + "epoch": 0.647560149751479, + "grad_norm": 0.6163814663887024, + "learning_rate": 7.630476744582794e-05, + "loss": 1.1017, + "step": 101360 + }, + { + "epoch": 0.6476240369012177, + "grad_norm": 1.1491094827651978, + "learning_rate": 7.630050014564755e-05, + "loss": 0.7671, + "step": 101370 + }, + { + "epoch": 0.6476879240509564, + "grad_norm": 0.9501873254776001, + "learning_rate": 7.62962325805984e-05, + "loss": 0.8288, + "step": 101380 + }, + { + "epoch": 0.6477518112006951, + "grad_norm": 1.3051087856292725, + "learning_rate": 
7.629196475072345e-05, + "loss": 0.7035, + "step": 101390 + }, + { + "epoch": 0.6478156983504338, + "grad_norm": 0.8004158139228821, + "learning_rate": 7.628769665606564e-05, + "loss": 0.8295, + "step": 101400 + }, + { + "epoch": 0.6478795855001726, + "grad_norm": 0.8266284465789795, + "learning_rate": 7.628342829666799e-05, + "loss": 0.9296, + "step": 101410 + }, + { + "epoch": 0.6479434726499111, + "grad_norm": 0.7903985977172852, + "learning_rate": 7.627915967257348e-05, + "loss": 0.8982, + "step": 101420 + }, + { + "epoch": 0.6480073597996499, + "grad_norm": 0.9672759771347046, + "learning_rate": 7.62748907838251e-05, + "loss": 0.9396, + "step": 101430 + }, + { + "epoch": 0.6480712469493886, + "grad_norm": 0.5555049777030945, + "learning_rate": 7.627062163046585e-05, + "loss": 0.787, + "step": 101440 + }, + { + "epoch": 0.6481351340991273, + "grad_norm": 1.0060865879058838, + "learning_rate": 7.626635221253871e-05, + "loss": 1.3664, + "step": 101450 + }, + { + "epoch": 0.648199021248866, + "grad_norm": 0.8967821002006531, + "learning_rate": 7.626208253008667e-05, + "loss": 0.6339, + "step": 101460 + }, + { + "epoch": 0.6482629083986047, + "grad_norm": 0.9876995086669922, + "learning_rate": 7.625781258315273e-05, + "loss": 0.8212, + "step": 101470 + }, + { + "epoch": 0.6483267955483434, + "grad_norm": 0.8965706825256348, + "learning_rate": 7.625354237177991e-05, + "loss": 0.9081, + "step": 101480 + }, + { + "epoch": 0.6483906826980821, + "grad_norm": 0.7027506232261658, + "learning_rate": 7.624927189601121e-05, + "loss": 0.7717, + "step": 101490 + }, + { + "epoch": 0.6484545698478208, + "grad_norm": 1.207170844078064, + "learning_rate": 7.624500115588963e-05, + "loss": 1.0589, + "step": 101500 + }, + { + "epoch": 0.6485184569975595, + "grad_norm": 0.6559182405471802, + "learning_rate": 7.624073015145819e-05, + "loss": 0.9365, + "step": 101510 + }, + { + "epoch": 0.6485823441472982, + "grad_norm": 0.8271816372871399, + "learning_rate": 7.623645888275988e-05, + 
"loss": 1.0675, + "step": 101520 + }, + { + "epoch": 0.6486462312970369, + "grad_norm": 0.9976779818534851, + "learning_rate": 7.623218734983775e-05, + "loss": 0.9971, + "step": 101530 + }, + { + "epoch": 0.6487101184467756, + "grad_norm": 1.128108263015747, + "learning_rate": 7.622791555273478e-05, + "loss": 0.7268, + "step": 101540 + }, + { + "epoch": 0.6487740055965143, + "grad_norm": 1.8108998537063599, + "learning_rate": 7.622364349149402e-05, + "loss": 0.9972, + "step": 101550 + }, + { + "epoch": 0.648837892746253, + "grad_norm": 1.99355149269104, + "learning_rate": 7.621937116615849e-05, + "loss": 0.844, + "step": 101560 + }, + { + "epoch": 0.6489017798959917, + "grad_norm": 0.9173657894134521, + "learning_rate": 7.62150985767712e-05, + "loss": 0.7644, + "step": 101570 + }, + { + "epoch": 0.6489656670457304, + "grad_norm": 0.8146788477897644, + "learning_rate": 7.62108257233752e-05, + "loss": 0.8551, + "step": 101580 + }, + { + "epoch": 0.6490295541954691, + "grad_norm": 1.06039297580719, + "learning_rate": 7.62065526060135e-05, + "loss": 1.0242, + "step": 101590 + }, + { + "epoch": 0.6490934413452079, + "grad_norm": 0.5522758364677429, + "learning_rate": 7.620227922472914e-05, + "loss": 1.0448, + "step": 101600 + }, + { + "epoch": 0.6491573284949466, + "grad_norm": 1.0754237174987793, + "learning_rate": 7.619800557956516e-05, + "loss": 0.8727, + "step": 101610 + }, + { + "epoch": 0.6492212156446853, + "grad_norm": 0.5872507691383362, + "learning_rate": 7.619373167056461e-05, + "loss": 0.7673, + "step": 101620 + }, + { + "epoch": 0.649285102794424, + "grad_norm": 1.68289053440094, + "learning_rate": 7.618945749777051e-05, + "loss": 1.2075, + "step": 101630 + }, + { + "epoch": 0.6493489899441627, + "grad_norm": 0.673235297203064, + "learning_rate": 7.618518306122593e-05, + "loss": 0.8398, + "step": 101640 + }, + { + "epoch": 0.6494128770939014, + "grad_norm": 0.8392667174339294, + "learning_rate": 7.618090836097389e-05, + "loss": 1.0282, + "step": 101650 + }, 
+ { + "epoch": 0.64947676424364, + "grad_norm": 0.834991991519928, + "learning_rate": 7.617706090531277e-05, + "loss": 0.8716, + "step": 101660 + }, + { + "epoch": 0.6495406513933787, + "grad_norm": 1.0076117515563965, + "learning_rate": 7.617278570413519e-05, + "loss": 1.0367, + "step": 101670 + }, + { + "epoch": 0.6496045385431174, + "grad_norm": 1.0270442962646484, + "learning_rate": 7.616851023937501e-05, + "loss": 0.9725, + "step": 101680 + }, + { + "epoch": 0.6496684256928561, + "grad_norm": 0.7313565611839294, + "learning_rate": 7.61642345110753e-05, + "loss": 0.858, + "step": 101690 + }, + { + "epoch": 0.6497323128425948, + "grad_norm": 0.8768913149833679, + "learning_rate": 7.615995851927911e-05, + "loss": 0.9933, + "step": 101700 + }, + { + "epoch": 0.6497961999923335, + "grad_norm": 1.0162863731384277, + "learning_rate": 7.615568226402951e-05, + "loss": 0.8052, + "step": 101710 + }, + { + "epoch": 0.6498600871420722, + "grad_norm": 0.6026840209960938, + "learning_rate": 7.615140574536956e-05, + "loss": 0.8807, + "step": 101720 + }, + { + "epoch": 0.6499239742918109, + "grad_norm": 1.1195095777511597, + "learning_rate": 7.614712896334233e-05, + "loss": 0.9153, + "step": 101730 + }, + { + "epoch": 0.6499878614415496, + "grad_norm": 1.266113042831421, + "learning_rate": 7.614285191799088e-05, + "loss": 1.144, + "step": 101740 + }, + { + "epoch": 0.6500517485912883, + "grad_norm": 0.7634327411651611, + "learning_rate": 7.613857460935831e-05, + "loss": 0.7877, + "step": 101750 + }, + { + "epoch": 0.650115635741027, + "grad_norm": 0.8808565139770508, + "learning_rate": 7.613429703748768e-05, + "loss": 0.8561, + "step": 101760 + }, + { + "epoch": 0.6501795228907657, + "grad_norm": 1.3923038244247437, + "learning_rate": 7.613001920242206e-05, + "loss": 0.9596, + "step": 101770 + }, + { + "epoch": 0.6502434100405045, + "grad_norm": 1.1155999898910522, + "learning_rate": 7.612574110420454e-05, + "loss": 0.7707, + "step": 101780 + }, + { + "epoch": 
0.6503072971902432, + "grad_norm": 0.7540896534919739, + "learning_rate": 7.612146274287821e-05, + "loss": 1.2077, + "step": 101790 + }, + { + "epoch": 0.6503711843399819, + "grad_norm": 0.7972086071968079, + "learning_rate": 7.611718411848617e-05, + "loss": 0.9242, + "step": 101800 + }, + { + "epoch": 0.6504350714897206, + "grad_norm": 0.8194320797920227, + "learning_rate": 7.611290523107146e-05, + "loss": 1.0412, + "step": 101810 + }, + { + "epoch": 0.6504989586394593, + "grad_norm": 0.8786047101020813, + "learning_rate": 7.610862608067721e-05, + "loss": 0.901, + "step": 101820 + }, + { + "epoch": 0.650562845789198, + "grad_norm": 0.7013679146766663, + "learning_rate": 7.610434666734651e-05, + "loss": 0.9909, + "step": 101830 + }, + { + "epoch": 0.6506267329389367, + "grad_norm": 1.3710912466049194, + "learning_rate": 7.610006699112248e-05, + "loss": 1.1784, + "step": 101840 + }, + { + "epoch": 0.6506906200886754, + "grad_norm": 0.9212914705276489, + "learning_rate": 7.609578705204816e-05, + "loss": 0.8956, + "step": 101850 + }, + { + "epoch": 0.6507545072384141, + "grad_norm": 1.100433588027954, + "learning_rate": 7.609150685016671e-05, + "loss": 0.8263, + "step": 101860 + }, + { + "epoch": 0.6508183943881528, + "grad_norm": 1.0776516199111938, + "learning_rate": 7.60872263855212e-05, + "loss": 0.9641, + "step": 101870 + }, + { + "epoch": 0.6508822815378915, + "grad_norm": 0.9307558536529541, + "learning_rate": 7.608294565815476e-05, + "loss": 0.8491, + "step": 101880 + }, + { + "epoch": 0.6509461686876302, + "grad_norm": 0.9816484451293945, + "learning_rate": 7.60786646681105e-05, + "loss": 0.5917, + "step": 101890 + }, + { + "epoch": 0.6510100558373688, + "grad_norm": 0.8662287592887878, + "learning_rate": 7.607438341543152e-05, + "loss": 0.8599, + "step": 101900 + }, + { + "epoch": 0.6510739429871075, + "grad_norm": 1.7769293785095215, + "learning_rate": 7.607010190016093e-05, + "loss": 0.8963, + "step": 101910 + }, + { + "epoch": 0.6511378301368462, + 
"grad_norm": 0.9365469813346863, + "learning_rate": 7.606582012234188e-05, + "loss": 0.8052, + "step": 101920 + }, + { + "epoch": 0.6512017172865849, + "grad_norm": 0.8617343902587891, + "learning_rate": 7.606153808201746e-05, + "loss": 0.9196, + "step": 101930 + }, + { + "epoch": 0.6512656044363236, + "grad_norm": 0.689240038394928, + "learning_rate": 7.605725577923081e-05, + "loss": 0.9611, + "step": 101940 + }, + { + "epoch": 0.6513294915860623, + "grad_norm": 0.7218610644340515, + "learning_rate": 7.605297321402504e-05, + "loss": 0.8778, + "step": 101950 + }, + { + "epoch": 0.651393378735801, + "grad_norm": 1.1720585823059082, + "learning_rate": 7.60486903864433e-05, + "loss": 0.8404, + "step": 101960 + }, + { + "epoch": 0.6514572658855398, + "grad_norm": 1.2702221870422363, + "learning_rate": 7.60444072965287e-05, + "loss": 1.1349, + "step": 101970 + }, + { + "epoch": 0.6515211530352785, + "grad_norm": 0.8623278141021729, + "learning_rate": 7.60401239443244e-05, + "loss": 0.6295, + "step": 101980 + }, + { + "epoch": 0.6515850401850172, + "grad_norm": 1.1232866048812866, + "learning_rate": 7.603584032987353e-05, + "loss": 0.9372, + "step": 101990 + }, + { + "epoch": 0.6516489273347559, + "grad_norm": 1.010554552078247, + "learning_rate": 7.603155645321921e-05, + "loss": 0.9095, + "step": 102000 + }, + { + "epoch": 0.6517128144844946, + "grad_norm": 0.9294137358665466, + "learning_rate": 7.60272723144046e-05, + "loss": 0.9728, + "step": 102010 + }, + { + "epoch": 0.6517767016342333, + "grad_norm": 0.6776549816131592, + "learning_rate": 7.602298791347284e-05, + "loss": 0.9536, + "step": 102020 + }, + { + "epoch": 0.651840588783972, + "grad_norm": 0.8278113007545471, + "learning_rate": 7.601870325046707e-05, + "loss": 0.691, + "step": 102030 + }, + { + "epoch": 0.6519044759337107, + "grad_norm": 0.7509378790855408, + "learning_rate": 7.601441832543046e-05, + "loss": 0.7495, + "step": 102040 + }, + { + "epoch": 0.6519683630834494, + "grad_norm": 0.6673555374145508, 
+ "learning_rate": 7.601013313840616e-05, + "loss": 1.0675, + "step": 102050 + }, + { + "epoch": 0.6520322502331881, + "grad_norm": 1.0808007717132568, + "learning_rate": 7.600584768943731e-05, + "loss": 0.8746, + "step": 102060 + }, + { + "epoch": 0.6520961373829268, + "grad_norm": 0.9976995587348938, + "learning_rate": 7.600156197856707e-05, + "loss": 0.7481, + "step": 102070 + }, + { + "epoch": 0.6521600245326655, + "grad_norm": 0.7592312693595886, + "learning_rate": 7.599727600583861e-05, + "loss": 0.7296, + "step": 102080 + }, + { + "epoch": 0.6522239116824042, + "grad_norm": 0.7450394034385681, + "learning_rate": 7.599298977129509e-05, + "loss": 0.8024, + "step": 102090 + }, + { + "epoch": 0.6522877988321429, + "grad_norm": 1.1607773303985596, + "learning_rate": 7.598870327497967e-05, + "loss": 1.0051, + "step": 102100 + }, + { + "epoch": 0.6523516859818816, + "grad_norm": 1.8432011604309082, + "learning_rate": 7.598441651693554e-05, + "loss": 0.8561, + "step": 102110 + }, + { + "epoch": 0.6524155731316204, + "grad_norm": 0.6616463661193848, + "learning_rate": 7.598012949720585e-05, + "loss": 0.981, + "step": 102120 + }, + { + "epoch": 0.6524794602813591, + "grad_norm": 0.715186357498169, + "learning_rate": 7.597584221583377e-05, + "loss": 0.8713, + "step": 102130 + }, + { + "epoch": 0.6525433474310978, + "grad_norm": 1.7039902210235596, + "learning_rate": 7.597155467286249e-05, + "loss": 0.8644, + "step": 102140 + }, + { + "epoch": 0.6526072345808364, + "grad_norm": 0.7133430242538452, + "learning_rate": 7.59672668683352e-05, + "loss": 0.8738, + "step": 102150 + }, + { + "epoch": 0.6526711217305751, + "grad_norm": 0.79267418384552, + "learning_rate": 7.596297880229504e-05, + "loss": 0.7017, + "step": 102160 + }, + { + "epoch": 0.6527350088803138, + "grad_norm": 3.0087802410125732, + "learning_rate": 7.595869047478524e-05, + "loss": 0.8555, + "step": 102170 + }, + { + "epoch": 0.6527988960300525, + "grad_norm": 1.167046070098877, + "learning_rate": 
7.595440188584897e-05, + "loss": 0.9269, + "step": 102180 + }, + { + "epoch": 0.6528627831797912, + "grad_norm": 0.7473157644271851, + "learning_rate": 7.595011303552941e-05, + "loss": 1.1813, + "step": 102190 + }, + { + "epoch": 0.6529266703295299, + "grad_norm": 2.5482473373413086, + "learning_rate": 7.594582392386977e-05, + "loss": 0.9303, + "step": 102200 + }, + { + "epoch": 0.6529905574792686, + "grad_norm": 0.5995567440986633, + "learning_rate": 7.594153455091324e-05, + "loss": 0.8351, + "step": 102210 + }, + { + "epoch": 0.6530544446290073, + "grad_norm": 0.8906635046005249, + "learning_rate": 7.593724491670302e-05, + "loss": 1.0707, + "step": 102220 + }, + { + "epoch": 0.653118331778746, + "grad_norm": 1.3267639875411987, + "learning_rate": 7.593295502128229e-05, + "loss": 0.9804, + "step": 102230 + }, + { + "epoch": 0.6531822189284847, + "grad_norm": 0.9197192788124084, + "learning_rate": 7.592866486469427e-05, + "loss": 0.831, + "step": 102240 + }, + { + "epoch": 0.6532461060782234, + "grad_norm": 0.9400060772895813, + "learning_rate": 7.592437444698217e-05, + "loss": 1.1257, + "step": 102250 + }, + { + "epoch": 0.6533099932279621, + "grad_norm": 1.9750057458877563, + "learning_rate": 7.592008376818918e-05, + "loss": 1.1414, + "step": 102260 + }, + { + "epoch": 0.6533738803777008, + "grad_norm": 0.8337990045547485, + "learning_rate": 7.591579282835854e-05, + "loss": 0.9497, + "step": 102270 + }, + { + "epoch": 0.6534377675274395, + "grad_norm": 0.6679349541664124, + "learning_rate": 7.591150162753343e-05, + "loss": 0.859, + "step": 102280 + }, + { + "epoch": 0.6535016546771782, + "grad_norm": 1.1458660364151, + "learning_rate": 7.590721016575709e-05, + "loss": 0.8031, + "step": 102290 + }, + { + "epoch": 0.653565541826917, + "grad_norm": 1.3683353662490845, + "learning_rate": 7.590291844307274e-05, + "loss": 0.7009, + "step": 102300 + }, + { + "epoch": 0.6536294289766557, + "grad_norm": 1.0884777307510376, + "learning_rate": 7.589862645952358e-05, + 
"loss": 1.1495, + "step": 102310 + }, + { + "epoch": 0.6536933161263944, + "grad_norm": 2.0209991931915283, + "learning_rate": 7.589433421515284e-05, + "loss": 0.7371, + "step": 102320 + }, + { + "epoch": 0.6537572032761331, + "grad_norm": 0.8924853801727295, + "learning_rate": 7.589004171000376e-05, + "loss": 0.8453, + "step": 102330 + }, + { + "epoch": 0.6538210904258718, + "grad_norm": 0.9563447833061218, + "learning_rate": 7.588574894411957e-05, + "loss": 0.9245, + "step": 102340 + }, + { + "epoch": 0.6538849775756105, + "grad_norm": 0.6629459857940674, + "learning_rate": 7.588145591754348e-05, + "loss": 0.8992, + "step": 102350 + }, + { + "epoch": 0.6539488647253492, + "grad_norm": 0.7915505170822144, + "learning_rate": 7.587716263031875e-05, + "loss": 0.6657, + "step": 102360 + }, + { + "epoch": 0.6540127518750879, + "grad_norm": 0.9542189240455627, + "learning_rate": 7.587286908248859e-05, + "loss": 0.8998, + "step": 102370 + }, + { + "epoch": 0.6540766390248266, + "grad_norm": 0.6477545499801636, + "learning_rate": 7.586857527409625e-05, + "loss": 0.828, + "step": 102380 + }, + { + "epoch": 0.6541405261745652, + "grad_norm": 3.1310455799102783, + "learning_rate": 7.586428120518498e-05, + "loss": 0.8608, + "step": 102390 + }, + { + "epoch": 0.6542044133243039, + "grad_norm": 0.7806214094161987, + "learning_rate": 7.585998687579805e-05, + "loss": 0.7727, + "step": 102400 + }, + { + "epoch": 0.6542683004740426, + "grad_norm": 0.8768726587295532, + "learning_rate": 7.585569228597866e-05, + "loss": 0.7605, + "step": 102410 + }, + { + "epoch": 0.6543321876237813, + "grad_norm": 1.5629066228866577, + "learning_rate": 7.585139743577007e-05, + "loss": 0.7757, + "step": 102420 + }, + { + "epoch": 0.65439607477352, + "grad_norm": 0.6218248009681702, + "learning_rate": 7.584710232521558e-05, + "loss": 0.805, + "step": 102430 + }, + { + "epoch": 0.6544599619232587, + "grad_norm": 0.555633008480072, + "learning_rate": 7.584280695435839e-05, + "loss": 0.9534, + "step": 
102440 + }, + { + "epoch": 0.6545238490729974, + "grad_norm": 0.9295303821563721, + "learning_rate": 7.583851132324176e-05, + "loss": 0.9972, + "step": 102450 + }, + { + "epoch": 0.6545877362227361, + "grad_norm": 0.4502405822277069, + "learning_rate": 7.583421543190899e-05, + "loss": 0.8702, + "step": 102460 + }, + { + "epoch": 0.6546516233724748, + "grad_norm": 1.0970765352249146, + "learning_rate": 7.58299192804033e-05, + "loss": 0.6886, + "step": 102470 + }, + { + "epoch": 0.6547155105222136, + "grad_norm": 0.967715859413147, + "learning_rate": 7.5825622868768e-05, + "loss": 0.769, + "step": 102480 + }, + { + "epoch": 0.6547793976719523, + "grad_norm": 1.3791766166687012, + "learning_rate": 7.582132619704632e-05, + "loss": 1.0848, + "step": 102490 + }, + { + "epoch": 0.654843284821691, + "grad_norm": 1.011976718902588, + "learning_rate": 7.581702926528156e-05, + "loss": 0.8708, + "step": 102500 + }, + { + "epoch": 0.6549071719714297, + "grad_norm": 1.187624454498291, + "learning_rate": 7.581273207351696e-05, + "loss": 0.8925, + "step": 102510 + }, + { + "epoch": 0.6549710591211684, + "grad_norm": 1.0658549070358276, + "learning_rate": 7.580843462179583e-05, + "loss": 0.6593, + "step": 102520 + }, + { + "epoch": 0.6550349462709071, + "grad_norm": 0.7370697855949402, + "learning_rate": 7.580413691016144e-05, + "loss": 0.9271, + "step": 102530 + }, + { + "epoch": 0.6550988334206458, + "grad_norm": 1.2200112342834473, + "learning_rate": 7.579983893865704e-05, + "loss": 0.9089, + "step": 102540 + }, + { + "epoch": 0.6551627205703845, + "grad_norm": 0.8738793730735779, + "learning_rate": 7.579554070732597e-05, + "loss": 0.9293, + "step": 102550 + }, + { + "epoch": 0.6552266077201232, + "grad_norm": 0.6876864433288574, + "learning_rate": 7.579124221621148e-05, + "loss": 0.7449, + "step": 102560 + }, + { + "epoch": 0.6552904948698619, + "grad_norm": 0.8214115500450134, + "learning_rate": 7.578694346535686e-05, + "loss": 0.7657, + "step": 102570 + }, + { + "epoch": 
0.6553543820196006, + "grad_norm": 2.8567984104156494, + "learning_rate": 7.578264445480543e-05, + "loss": 0.8339, + "step": 102580 + }, + { + "epoch": 0.6554182691693393, + "grad_norm": 1.8069883584976196, + "learning_rate": 7.577834518460046e-05, + "loss": 0.772, + "step": 102590 + }, + { + "epoch": 0.655482156319078, + "grad_norm": 0.7831799387931824, + "learning_rate": 7.577404565478525e-05, + "loss": 1.1675, + "step": 102600 + }, + { + "epoch": 0.6555460434688167, + "grad_norm": 1.1496902704238892, + "learning_rate": 7.576974586540309e-05, + "loss": 0.727, + "step": 102610 + }, + { + "epoch": 0.6556099306185554, + "grad_norm": 0.9092468619346619, + "learning_rate": 7.57654458164973e-05, + "loss": 0.7713, + "step": 102620 + }, + { + "epoch": 0.655673817768294, + "grad_norm": 1.3994219303131104, + "learning_rate": 7.57611455081112e-05, + "loss": 0.9057, + "step": 102630 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.7715085744857788, + "learning_rate": 7.575684494028805e-05, + "loss": 0.9913, + "step": 102640 + }, + { + "epoch": 0.6558015920677714, + "grad_norm": 1.0739322900772095, + "learning_rate": 7.57525441130712e-05, + "loss": 1.0197, + "step": 102650 + }, + { + "epoch": 0.6558654792175102, + "grad_norm": 1.3112781047821045, + "learning_rate": 7.574824302650396e-05, + "loss": 0.819, + "step": 102660 + }, + { + "epoch": 0.6559293663672489, + "grad_norm": 0.8328858017921448, + "learning_rate": 7.574394168062964e-05, + "loss": 1.1271, + "step": 102670 + }, + { + "epoch": 0.6559932535169876, + "grad_norm": 1.3175311088562012, + "learning_rate": 7.573964007549155e-05, + "loss": 1.0228, + "step": 102680 + }, + { + "epoch": 0.6560571406667263, + "grad_norm": 1.8402165174484253, + "learning_rate": 7.573533821113302e-05, + "loss": 0.6843, + "step": 102690 + }, + { + "epoch": 0.656121027816465, + "grad_norm": 0.6495104432106018, + "learning_rate": 7.573103608759736e-05, + "loss": 0.8759, + "step": 102700 + }, + { + "epoch": 0.6561849149662037, + 
"grad_norm": 1.0838744640350342, + "learning_rate": 7.572673370492788e-05, + "loss": 0.9824, + "step": 102710 + }, + { + "epoch": 0.6562488021159424, + "grad_norm": 1.348099708557129, + "learning_rate": 7.572243106316798e-05, + "loss": 0.995, + "step": 102720 + }, + { + "epoch": 0.6563126892656811, + "grad_norm": 1.7211371660232544, + "learning_rate": 7.571812816236093e-05, + "loss": 1.0677, + "step": 102730 + }, + { + "epoch": 0.6563765764154198, + "grad_norm": 1.924634337425232, + "learning_rate": 7.571382500255009e-05, + "loss": 0.6923, + "step": 102740 + }, + { + "epoch": 0.6564404635651585, + "grad_norm": 0.7509860992431641, + "learning_rate": 7.570952158377877e-05, + "loss": 1.3095, + "step": 102750 + }, + { + "epoch": 0.6565043507148972, + "grad_norm": 2.123326301574707, + "learning_rate": 7.570521790609033e-05, + "loss": 1.0223, + "step": 102760 + }, + { + "epoch": 0.6565682378646359, + "grad_norm": 0.9180009961128235, + "learning_rate": 7.570091396952811e-05, + "loss": 0.7883, + "step": 102770 + }, + { + "epoch": 0.6566321250143746, + "grad_norm": 0.9530285596847534, + "learning_rate": 7.569660977413546e-05, + "loss": 0.6924, + "step": 102780 + }, + { + "epoch": 0.6566960121641133, + "grad_norm": 0.9887816905975342, + "learning_rate": 7.56923053199557e-05, + "loss": 0.9007, + "step": 102790 + }, + { + "epoch": 0.656759899313852, + "grad_norm": 0.853535532951355, + "learning_rate": 7.568800060703222e-05, + "loss": 0.9581, + "step": 102800 + }, + { + "epoch": 0.6568237864635907, + "grad_norm": 0.716618001461029, + "learning_rate": 7.568369563540834e-05, + "loss": 1.1438, + "step": 102810 + }, + { + "epoch": 0.6568876736133294, + "grad_norm": 0.9339463114738464, + "learning_rate": 7.567939040512742e-05, + "loss": 1.0451, + "step": 102820 + }, + { + "epoch": 0.6569515607630682, + "grad_norm": 1.0368573665618896, + "learning_rate": 7.567508491623283e-05, + "loss": 0.6927, + "step": 102830 + }, + { + "epoch": 0.6570154479128069, + "grad_norm": 
0.9054602384567261, + "learning_rate": 7.567077916876793e-05, + "loss": 0.9451, + "step": 102840 + }, + { + "epoch": 0.6570793350625456, + "grad_norm": 0.7542839646339417, + "learning_rate": 7.566647316277607e-05, + "loss": 0.6406, + "step": 102850 + }, + { + "epoch": 0.6571432222122843, + "grad_norm": 0.7233191132545471, + "learning_rate": 7.566216689830061e-05, + "loss": 0.9122, + "step": 102860 + }, + { + "epoch": 0.6572071093620229, + "grad_norm": 0.8938806056976318, + "learning_rate": 7.565786037538492e-05, + "loss": 0.9786, + "step": 102870 + }, + { + "epoch": 0.6572709965117616, + "grad_norm": 0.8927178978919983, + "learning_rate": 7.56535535940724e-05, + "loss": 0.8482, + "step": 102880 + }, + { + "epoch": 0.6573348836615003, + "grad_norm": 0.9564715623855591, + "learning_rate": 7.564924655440639e-05, + "loss": 0.824, + "step": 102890 + }, + { + "epoch": 0.657398770811239, + "grad_norm": 1.4152295589447021, + "learning_rate": 7.564493925643028e-05, + "loss": 0.8507, + "step": 102900 + }, + { + "epoch": 0.6574626579609777, + "grad_norm": 2.762669324874878, + "learning_rate": 7.564063170018745e-05, + "loss": 1.0035, + "step": 102910 + }, + { + "epoch": 0.6575265451107164, + "grad_norm": 0.6634849905967712, + "learning_rate": 7.563632388572128e-05, + "loss": 1.004, + "step": 102920 + }, + { + "epoch": 0.6575904322604551, + "grad_norm": 0.7901267409324646, + "learning_rate": 7.563201581307516e-05, + "loss": 1.004, + "step": 102930 + }, + { + "epoch": 0.6576543194101938, + "grad_norm": 1.174972653388977, + "learning_rate": 7.562770748229245e-05, + "loss": 1.0687, + "step": 102940 + }, + { + "epoch": 0.6577182065599325, + "grad_norm": 0.6787839531898499, + "learning_rate": 7.562339889341655e-05, + "loss": 0.6921, + "step": 102950 + }, + { + "epoch": 0.6577820937096712, + "grad_norm": 1.1864277124404907, + "learning_rate": 7.561909004649086e-05, + "loss": 0.8603, + "step": 102960 + }, + { + "epoch": 0.6578459808594099, + "grad_norm": 0.715882420539856, + 
"learning_rate": 7.561478094155877e-05, + "loss": 0.9814, + "step": 102970 + }, + { + "epoch": 0.6579098680091486, + "grad_norm": 0.7654950618743896, + "learning_rate": 7.561047157866368e-05, + "loss": 0.7236, + "step": 102980 + }, + { + "epoch": 0.6579737551588873, + "grad_norm": 0.8814225196838379, + "learning_rate": 7.560616195784898e-05, + "loss": 1.0984, + "step": 102990 + }, + { + "epoch": 0.658037642308626, + "grad_norm": 0.6822389364242554, + "learning_rate": 7.560185207915808e-05, + "loss": 0.7056, + "step": 103000 + }, + { + "epoch": 0.6581015294583648, + "grad_norm": 0.9107899069786072, + "learning_rate": 7.559754194263438e-05, + "loss": 1.0334, + "step": 103010 + }, + { + "epoch": 0.6581654166081035, + "grad_norm": 1.1239162683486938, + "learning_rate": 7.559323154832128e-05, + "loss": 1.0588, + "step": 103020 + }, + { + "epoch": 0.6582293037578422, + "grad_norm": 0.7621965408325195, + "learning_rate": 7.558892089626222e-05, + "loss": 1.3516, + "step": 103030 + }, + { + "epoch": 0.6582931909075809, + "grad_norm": 0.9127793312072754, + "learning_rate": 7.558460998650056e-05, + "loss": 0.8635, + "step": 103040 + }, + { + "epoch": 0.6583570780573196, + "grad_norm": 1.0723611116409302, + "learning_rate": 7.558029881907977e-05, + "loss": 0.8784, + "step": 103050 + }, + { + "epoch": 0.6584209652070583, + "grad_norm": 1.1511666774749756, + "learning_rate": 7.557598739404322e-05, + "loss": 0.9645, + "step": 103060 + }, + { + "epoch": 0.658484852356797, + "grad_norm": 0.8360834717750549, + "learning_rate": 7.557167571143435e-05, + "loss": 1.2718, + "step": 103070 + }, + { + "epoch": 0.6585487395065357, + "grad_norm": 0.8610426187515259, + "learning_rate": 7.556736377129659e-05, + "loss": 0.6349, + "step": 103080 + }, + { + "epoch": 0.6586126266562744, + "grad_norm": 2.5912959575653076, + "learning_rate": 7.556305157367336e-05, + "loss": 0.9122, + "step": 103090 + }, + { + "epoch": 0.6586765138060131, + "grad_norm": 0.6567677855491638, + "learning_rate": 
7.555873911860808e-05, + "loss": 0.7129, + "step": 103100 + }, + { + "epoch": 0.6587404009557518, + "grad_norm": 2.0863733291625977, + "learning_rate": 7.55544264061442e-05, + "loss": 0.9628, + "step": 103110 + }, + { + "epoch": 0.6588042881054904, + "grad_norm": 0.9286092519760132, + "learning_rate": 7.555011343632512e-05, + "loss": 0.8073, + "step": 103120 + }, + { + "epoch": 0.6588681752552291, + "grad_norm": 0.8470326662063599, + "learning_rate": 7.55458002091943e-05, + "loss": 0.7094, + "step": 103130 + }, + { + "epoch": 0.6589320624049678, + "grad_norm": 0.9523374438285828, + "learning_rate": 7.554148672479518e-05, + "loss": 0.898, + "step": 103140 + }, + { + "epoch": 0.6589959495547065, + "grad_norm": 0.7024726271629333, + "learning_rate": 7.553717298317118e-05, + "loss": 0.8187, + "step": 103150 + }, + { + "epoch": 0.6590598367044452, + "grad_norm": 0.8820728063583374, + "learning_rate": 7.553285898436577e-05, + "loss": 0.9663, + "step": 103160 + }, + { + "epoch": 0.6591237238541839, + "grad_norm": 0.6763830184936523, + "learning_rate": 7.552854472842238e-05, + "loss": 0.7318, + "step": 103170 + }, + { + "epoch": 0.6591876110039226, + "grad_norm": 0.881584644317627, + "learning_rate": 7.552423021538445e-05, + "loss": 1.0267, + "step": 103180 + }, + { + "epoch": 0.6592514981536614, + "grad_norm": 1.0598586797714233, + "learning_rate": 7.551991544529544e-05, + "loss": 0.9321, + "step": 103190 + }, + { + "epoch": 0.6593153853034001, + "grad_norm": 0.879578709602356, + "learning_rate": 7.55156004181988e-05, + "loss": 0.8715, + "step": 103200 + }, + { + "epoch": 0.6593792724531388, + "grad_norm": 0.9016001224517822, + "learning_rate": 7.5511285134138e-05, + "loss": 0.8316, + "step": 103210 + }, + { + "epoch": 0.6594431596028775, + "grad_norm": 0.7828614115715027, + "learning_rate": 7.550696959315647e-05, + "loss": 0.7837, + "step": 103220 + }, + { + "epoch": 0.6595070467526162, + "grad_norm": 0.6147605180740356, + "learning_rate": 7.550265379529771e-05, + 
"loss": 0.6583, + "step": 103230 + }, + { + "epoch": 0.6595709339023549, + "grad_norm": 0.5749229192733765, + "learning_rate": 7.549833774060515e-05, + "loss": 0.7125, + "step": 103240 + }, + { + "epoch": 0.6596348210520936, + "grad_norm": 0.9546531438827515, + "learning_rate": 7.549402142912228e-05, + "loss": 0.9588, + "step": 103250 + }, + { + "epoch": 0.6596987082018323, + "grad_norm": 0.8251008987426758, + "learning_rate": 7.548970486089255e-05, + "loss": 1.0117, + "step": 103260 + }, + { + "epoch": 0.659762595351571, + "grad_norm": 1.1722609996795654, + "learning_rate": 7.548538803595944e-05, + "loss": 0.6943, + "step": 103270 + }, + { + "epoch": 0.6598264825013097, + "grad_norm": 2.220587730407715, + "learning_rate": 7.548107095436644e-05, + "loss": 0.6568, + "step": 103280 + }, + { + "epoch": 0.6598903696510484, + "grad_norm": 1.1250571012496948, + "learning_rate": 7.547675361615701e-05, + "loss": 0.6475, + "step": 103290 + }, + { + "epoch": 0.6599542568007871, + "grad_norm": 0.6930386424064636, + "learning_rate": 7.547243602137462e-05, + "loss": 1.03, + "step": 103300 + }, + { + "epoch": 0.6600181439505258, + "grad_norm": 0.7208458185195923, + "learning_rate": 7.546811817006275e-05, + "loss": 0.7767, + "step": 103310 + }, + { + "epoch": 0.6600820311002645, + "grad_norm": 0.9552310705184937, + "learning_rate": 7.546380006226493e-05, + "loss": 1.2225, + "step": 103320 + }, + { + "epoch": 0.6601459182500032, + "grad_norm": 0.8683717250823975, + "learning_rate": 7.545948169802458e-05, + "loss": 1.0714, + "step": 103330 + }, + { + "epoch": 0.660209805399742, + "grad_norm": 1.0436851978302002, + "learning_rate": 7.545516307738524e-05, + "loss": 1.001, + "step": 103340 + }, + { + "epoch": 0.6602736925494807, + "grad_norm": 1.0748889446258545, + "learning_rate": 7.545084420039038e-05, + "loss": 0.9228, + "step": 103350 + }, + { + "epoch": 0.6603375796992192, + "grad_norm": 0.716839075088501, + "learning_rate": 7.54465250670835e-05, + "loss": 0.9845, + "step": 
103360 + }, + { + "epoch": 0.660401466848958, + "grad_norm": 1.4549845457077026, + "learning_rate": 7.54422056775081e-05, + "loss": 0.8982, + "step": 103370 + }, + { + "epoch": 0.6604653539986967, + "grad_norm": 1.124532699584961, + "learning_rate": 7.54378860317077e-05, + "loss": 0.9131, + "step": 103380 + }, + { + "epoch": 0.6605292411484354, + "grad_norm": 0.7380385994911194, + "learning_rate": 7.543356612972575e-05, + "loss": 0.9161, + "step": 103390 + }, + { + "epoch": 0.6605931282981741, + "grad_norm": 1.4843467473983765, + "learning_rate": 7.54292459716058e-05, + "loss": 0.9268, + "step": 103400 + }, + { + "epoch": 0.6606570154479128, + "grad_norm": 0.9203116297721863, + "learning_rate": 7.542492555739135e-05, + "loss": 0.861, + "step": 103410 + }, + { + "epoch": 0.6607209025976515, + "grad_norm": 0.9751471877098083, + "learning_rate": 7.54206048871259e-05, + "loss": 0.7468, + "step": 103420 + }, + { + "epoch": 0.6607847897473902, + "grad_norm": 1.2186683416366577, + "learning_rate": 7.541628396085296e-05, + "loss": 0.967, + "step": 103430 + }, + { + "epoch": 0.6608486768971289, + "grad_norm": 1.1114938259124756, + "learning_rate": 7.541196277861604e-05, + "loss": 1.0421, + "step": 103440 + }, + { + "epoch": 0.6609125640468676, + "grad_norm": 1.0457226037979126, + "learning_rate": 7.540764134045869e-05, + "loss": 0.8752, + "step": 103450 + }, + { + "epoch": 0.6609764511966063, + "grad_norm": 1.4589784145355225, + "learning_rate": 7.540331964642441e-05, + "loss": 0.8713, + "step": 103460 + }, + { + "epoch": 0.661040338346345, + "grad_norm": 1.1460403203964233, + "learning_rate": 7.539899769655672e-05, + "loss": 0.832, + "step": 103470 + }, + { + "epoch": 0.6611042254960837, + "grad_norm": 0.9500607252120972, + "learning_rate": 7.539467549089914e-05, + "loss": 1.0559, + "step": 103480 + }, + { + "epoch": 0.6611681126458224, + "grad_norm": 1.8726199865341187, + "learning_rate": 7.539035302949523e-05, + "loss": 0.7371, + "step": 103490 + }, + { + "epoch": 
0.6612319997955611, + "grad_norm": 1.0229368209838867, + "learning_rate": 7.538603031238849e-05, + "loss": 1.2995, + "step": 103500 + }, + { + "epoch": 0.6612958869452998, + "grad_norm": 0.8887920379638672, + "learning_rate": 7.538170733962245e-05, + "loss": 1.0423, + "step": 103510 + }, + { + "epoch": 0.6613597740950385, + "grad_norm": 1.2013378143310547, + "learning_rate": 7.537738411124066e-05, + "loss": 0.8154, + "step": 103520 + }, + { + "epoch": 0.6614236612447773, + "grad_norm": 0.6866891980171204, + "learning_rate": 7.537306062728669e-05, + "loss": 0.9957, + "step": 103530 + }, + { + "epoch": 0.661487548394516, + "grad_norm": 1.4273715019226074, + "learning_rate": 7.536873688780402e-05, + "loss": 0.9109, + "step": 103540 + }, + { + "epoch": 0.6615514355442547, + "grad_norm": 0.7437546253204346, + "learning_rate": 7.536441289283622e-05, + "loss": 0.9392, + "step": 103550 + }, + { + "epoch": 0.6616153226939934, + "grad_norm": 0.8337574005126953, + "learning_rate": 7.536008864242685e-05, + "loss": 1.0836, + "step": 103560 + }, + { + "epoch": 0.6616792098437321, + "grad_norm": 0.6678511500358582, + "learning_rate": 7.535576413661944e-05, + "loss": 0.7588, + "step": 103570 + }, + { + "epoch": 0.6617430969934708, + "grad_norm": 0.8168431520462036, + "learning_rate": 7.535143937545757e-05, + "loss": 0.7712, + "step": 103580 + }, + { + "epoch": 0.6618069841432095, + "grad_norm": 0.7926838994026184, + "learning_rate": 7.534711435898473e-05, + "loss": 0.8549, + "step": 103590 + }, + { + "epoch": 0.6618708712929481, + "grad_norm": 0.8065713047981262, + "learning_rate": 7.534278908724455e-05, + "loss": 0.9805, + "step": 103600 + }, + { + "epoch": 0.6619347584426868, + "grad_norm": 1.288443684577942, + "learning_rate": 7.533846356028056e-05, + "loss": 0.9803, + "step": 103610 + }, + { + "epoch": 0.6619986455924255, + "grad_norm": 0.8270924687385559, + "learning_rate": 7.533413777813632e-05, + "loss": 1.0178, + "step": 103620 + }, + { + "epoch": 0.6620625327421642, + 
"grad_norm": 0.969517707824707, + "learning_rate": 7.532981174085538e-05, + "loss": 0.7647, + "step": 103630 + }, + { + "epoch": 0.6621264198919029, + "grad_norm": 0.8974095582962036, + "learning_rate": 7.532548544848134e-05, + "loss": 0.92, + "step": 103640 + }, + { + "epoch": 0.6621903070416416, + "grad_norm": 0.6502230763435364, + "learning_rate": 7.532115890105776e-05, + "loss": 0.8038, + "step": 103650 + }, + { + "epoch": 0.6622541941913803, + "grad_norm": 2.234053134918213, + "learning_rate": 7.531683209862818e-05, + "loss": 0.6465, + "step": 103660 + }, + { + "epoch": 0.662318081341119, + "grad_norm": 0.8067479729652405, + "learning_rate": 7.531250504123622e-05, + "loss": 0.8337, + "step": 103670 + }, + { + "epoch": 0.6623819684908577, + "grad_norm": 1.0391908884048462, + "learning_rate": 7.530817772892543e-05, + "loss": 0.839, + "step": 103680 + }, + { + "epoch": 0.6624458556405964, + "grad_norm": 0.6917629241943359, + "learning_rate": 7.53038501617394e-05, + "loss": 1.0411, + "step": 103690 + }, + { + "epoch": 0.6625097427903351, + "grad_norm": 0.8440914750099182, + "learning_rate": 7.529952233972169e-05, + "loss": 0.886, + "step": 103700 + }, + { + "epoch": 0.6625736299400738, + "grad_norm": 0.7776359915733337, + "learning_rate": 7.529519426291591e-05, + "loss": 0.8062, + "step": 103710 + }, + { + "epoch": 0.6626375170898126, + "grad_norm": 0.7299126386642456, + "learning_rate": 7.529086593136564e-05, + "loss": 0.9034, + "step": 103720 + }, + { + "epoch": 0.6627014042395513, + "grad_norm": 1.0849508047103882, + "learning_rate": 7.528653734511447e-05, + "loss": 0.8913, + "step": 103730 + }, + { + "epoch": 0.66276529138929, + "grad_norm": 1.4763078689575195, + "learning_rate": 7.5282208504206e-05, + "loss": 1.0506, + "step": 103740 + }, + { + "epoch": 0.6628291785390287, + "grad_norm": 1.22445547580719, + "learning_rate": 7.52778794086838e-05, + "loss": 0.7745, + "step": 103750 + }, + { + "epoch": 0.6628930656887674, + "grad_norm": 0.757455587387085, + 
"learning_rate": 7.52735500585915e-05, + "loss": 0.7991, + "step": 103760 + }, + { + "epoch": 0.6629569528385061, + "grad_norm": 0.643725574016571, + "learning_rate": 7.526922045397269e-05, + "loss": 0.8059, + "step": 103770 + }, + { + "epoch": 0.6630208399882448, + "grad_norm": 0.8213297724723816, + "learning_rate": 7.526489059487097e-05, + "loss": 0.7859, + "step": 103780 + }, + { + "epoch": 0.6630847271379835, + "grad_norm": 0.90571528673172, + "learning_rate": 7.526056048132993e-05, + "loss": 0.8258, + "step": 103790 + }, + { + "epoch": 0.6631486142877222, + "grad_norm": 1.3528343439102173, + "learning_rate": 7.52562301133932e-05, + "loss": 0.8203, + "step": 103800 + }, + { + "epoch": 0.6632125014374609, + "grad_norm": 0.9805328845977783, + "learning_rate": 7.525189949110438e-05, + "loss": 0.9493, + "step": 103810 + }, + { + "epoch": 0.6632763885871996, + "grad_norm": 1.103614091873169, + "learning_rate": 7.52475686145071e-05, + "loss": 0.6765, + "step": 103820 + }, + { + "epoch": 0.6633402757369383, + "grad_norm": 0.864163875579834, + "learning_rate": 7.524323748364494e-05, + "loss": 0.9175, + "step": 103830 + }, + { + "epoch": 0.663404162886677, + "grad_norm": 0.5981049537658691, + "learning_rate": 7.523890609856157e-05, + "loss": 0.808, + "step": 103840 + }, + { + "epoch": 0.6634680500364156, + "grad_norm": 0.5768615007400513, + "learning_rate": 7.523457445930055e-05, + "loss": 0.9691, + "step": 103850 + }, + { + "epoch": 0.6635319371861543, + "grad_norm": 0.5739576816558838, + "learning_rate": 7.523024256590556e-05, + "loss": 0.852, + "step": 103860 + }, + { + "epoch": 0.663595824335893, + "grad_norm": 0.8774191737174988, + "learning_rate": 7.522591041842018e-05, + "loss": 0.9165, + "step": 103870 + }, + { + "epoch": 0.6636597114856317, + "grad_norm": 1.1826159954071045, + "learning_rate": 7.522157801688807e-05, + "loss": 0.648, + "step": 103880 + }, + { + "epoch": 0.6637235986353704, + "grad_norm": 1.5209389925003052, + "learning_rate": 
7.521724536135287e-05, + "loss": 0.8307, + "step": 103890 + }, + { + "epoch": 0.6637874857851092, + "grad_norm": 0.7982348799705505, + "learning_rate": 7.521291245185815e-05, + "loss": 0.8145, + "step": 103900 + }, + { + "epoch": 0.6638513729348479, + "grad_norm": 0.9979506731033325, + "learning_rate": 7.52085792884476e-05, + "loss": 0.9305, + "step": 103910 + }, + { + "epoch": 0.6639152600845866, + "grad_norm": 0.9951682686805725, + "learning_rate": 7.520424587116485e-05, + "loss": 0.8077, + "step": 103920 + }, + { + "epoch": 0.6639791472343253, + "grad_norm": 0.7749238610267639, + "learning_rate": 7.519991220005355e-05, + "loss": 0.7822, + "step": 103930 + }, + { + "epoch": 0.664043034384064, + "grad_norm": 1.0329078435897827, + "learning_rate": 7.519557827515733e-05, + "loss": 0.8315, + "step": 103940 + }, + { + "epoch": 0.6641069215338027, + "grad_norm": 1.0199581384658813, + "learning_rate": 7.519124409651984e-05, + "loss": 0.9086, + "step": 103950 + }, + { + "epoch": 0.6641708086835414, + "grad_norm": 1.0277516841888428, + "learning_rate": 7.518690966418474e-05, + "loss": 1.0459, + "step": 103960 + }, + { + "epoch": 0.6642346958332801, + "grad_norm": 1.1918634176254272, + "learning_rate": 7.518257497819566e-05, + "loss": 1.1006, + "step": 103970 + }, + { + "epoch": 0.6642985829830188, + "grad_norm": 1.1144057512283325, + "learning_rate": 7.517824003859624e-05, + "loss": 0.8974, + "step": 103980 + }, + { + "epoch": 0.6643624701327575, + "grad_norm": 0.5517343282699585, + "learning_rate": 7.517390484543018e-05, + "loss": 0.9436, + "step": 103990 + }, + { + "epoch": 0.6644263572824962, + "grad_norm": 0.7781495451927185, + "learning_rate": 7.516956939874113e-05, + "loss": 0.9474, + "step": 104000 + }, + { + "epoch": 0.6644902444322349, + "grad_norm": 0.9537546634674072, + "learning_rate": 7.516523369857273e-05, + "loss": 0.7145, + "step": 104010 + }, + { + "epoch": 0.6645541315819736, + "grad_norm": 0.9538782238960266, + "learning_rate": 7.516089774496866e-05, + 
"loss": 0.7486, + "step": 104020 + }, + { + "epoch": 0.6646180187317123, + "grad_norm": 0.8699349164962769, + "learning_rate": 7.515656153797257e-05, + "loss": 0.8378, + "step": 104030 + }, + { + "epoch": 0.664681905881451, + "grad_norm": 0.7079137563705444, + "learning_rate": 7.515222507762815e-05, + "loss": 0.7564, + "step": 104040 + }, + { + "epoch": 0.6647457930311897, + "grad_norm": 0.596549391746521, + "learning_rate": 7.514788836397908e-05, + "loss": 0.9477, + "step": 104050 + }, + { + "epoch": 0.6648096801809285, + "grad_norm": 0.8176950812339783, + "learning_rate": 7.5143551397069e-05, + "loss": 0.883, + "step": 104060 + }, + { + "epoch": 0.6648735673306672, + "grad_norm": 0.5366206765174866, + "learning_rate": 7.51392141769416e-05, + "loss": 0.7585, + "step": 104070 + }, + { + "epoch": 0.6649374544804059, + "grad_norm": 2.3098976612091064, + "learning_rate": 7.51348767036406e-05, + "loss": 0.9656, + "step": 104080 + }, + { + "epoch": 0.6650013416301445, + "grad_norm": 1.27628755569458, + "learning_rate": 7.51305389772096e-05, + "loss": 0.8482, + "step": 104090 + }, + { + "epoch": 0.6650652287798832, + "grad_norm": 0.6717961430549622, + "learning_rate": 7.512620099769235e-05, + "loss": 0.94, + "step": 104100 + }, + { + "epoch": 0.6651291159296219, + "grad_norm": 2.8479366302490234, + "learning_rate": 7.512186276513252e-05, + "loss": 0.9146, + "step": 104110 + }, + { + "epoch": 0.6651930030793606, + "grad_norm": 1.3686326742172241, + "learning_rate": 7.51175242795738e-05, + "loss": 0.9382, + "step": 104120 + }, + { + "epoch": 0.6652568902290993, + "grad_norm": 1.0768921375274658, + "learning_rate": 7.511318554105988e-05, + "loss": 0.8419, + "step": 104130 + }, + { + "epoch": 0.665320777378838, + "grad_norm": 0.581240713596344, + "learning_rate": 7.510884654963446e-05, + "loss": 1.4625, + "step": 104140 + }, + { + "epoch": 0.6653846645285767, + "grad_norm": 0.8124034404754639, + "learning_rate": 7.510450730534123e-05, + "loss": 0.8727, + "step": 104150 + }, 
+ { + "epoch": 0.6654485516783154, + "grad_norm": 0.9794655442237854, + "learning_rate": 7.510016780822388e-05, + "loss": 0.9003, + "step": 104160 + }, + { + "epoch": 0.6655124388280541, + "grad_norm": 1.2169163227081299, + "learning_rate": 7.509582805832614e-05, + "loss": 0.9785, + "step": 104170 + }, + { + "epoch": 0.6655763259777928, + "grad_norm": 1.4729397296905518, + "learning_rate": 7.50914880556917e-05, + "loss": 1.0235, + "step": 104180 + }, + { + "epoch": 0.6656402131275315, + "grad_norm": 0.7866071462631226, + "learning_rate": 7.508714780036428e-05, + "loss": 0.7818, + "step": 104190 + }, + { + "epoch": 0.6657041002772702, + "grad_norm": 1.0959784984588623, + "learning_rate": 7.508280729238754e-05, + "loss": 1.0379, + "step": 104200 + }, + { + "epoch": 0.6657679874270089, + "grad_norm": 0.6036289930343628, + "learning_rate": 7.507846653180527e-05, + "loss": 0.7128, + "step": 104210 + }, + { + "epoch": 0.6658318745767476, + "grad_norm": 1.0480402708053589, + "learning_rate": 7.507412551866113e-05, + "loss": 0.8218, + "step": 104220 + }, + { + "epoch": 0.6658957617264863, + "grad_norm": 0.8238396048545837, + "learning_rate": 7.506978425299886e-05, + "loss": 1.0993, + "step": 104230 + }, + { + "epoch": 0.665959648876225, + "grad_norm": 0.6929308176040649, + "learning_rate": 7.506544273486216e-05, + "loss": 0.7918, + "step": 104240 + }, + { + "epoch": 0.6660235360259638, + "grad_norm": 0.9507032036781311, + "learning_rate": 7.506110096429478e-05, + "loss": 1.1289, + "step": 104250 + }, + { + "epoch": 0.6660874231757025, + "grad_norm": 0.6241841316223145, + "learning_rate": 7.505675894134042e-05, + "loss": 0.6933, + "step": 104260 + }, + { + "epoch": 0.6661513103254412, + "grad_norm": 1.4490808248519897, + "learning_rate": 7.505241666604284e-05, + "loss": 0.9477, + "step": 104270 + }, + { + "epoch": 0.6662151974751799, + "grad_norm": 1.379927635192871, + "learning_rate": 7.504807413844573e-05, + "loss": 0.7406, + "step": 104280 + }, + { + "epoch": 
0.6662790846249186, + "grad_norm": 0.7105908393859863, + "learning_rate": 7.504373135859283e-05, + "loss": 0.981, + "step": 104290 + }, + { + "epoch": 0.6663429717746573, + "grad_norm": 2.1415674686431885, + "learning_rate": 7.503938832652793e-05, + "loss": 0.804, + "step": 104300 + }, + { + "epoch": 0.666406858924396, + "grad_norm": 1.058493733406067, + "learning_rate": 7.50350450422947e-05, + "loss": 0.8421, + "step": 104310 + }, + { + "epoch": 0.6664707460741347, + "grad_norm": 0.8077415823936462, + "learning_rate": 7.503070150593692e-05, + "loss": 1.0033, + "step": 104320 + }, + { + "epoch": 0.6665346332238733, + "grad_norm": 1.5287679433822632, + "learning_rate": 7.502635771749832e-05, + "loss": 0.9708, + "step": 104330 + }, + { + "epoch": 0.666598520373612, + "grad_norm": 1.087836742401123, + "learning_rate": 7.502201367702264e-05, + "loss": 1.0815, + "step": 104340 + }, + { + "epoch": 0.6666624075233507, + "grad_norm": 1.3868272304534912, + "learning_rate": 7.501766938455365e-05, + "loss": 0.8257, + "step": 104350 + }, + { + "epoch": 0.6667262946730894, + "grad_norm": 0.9557937979698181, + "learning_rate": 7.501332484013508e-05, + "loss": 0.9096, + "step": 104360 + }, + { + "epoch": 0.6667901818228281, + "grad_norm": 0.964483916759491, + "learning_rate": 7.50089800438107e-05, + "loss": 0.8804, + "step": 104370 + }, + { + "epoch": 0.6668540689725668, + "grad_norm": 0.955265462398529, + "learning_rate": 7.500463499562423e-05, + "loss": 1.2936, + "step": 104380 + }, + { + "epoch": 0.6669179561223055, + "grad_norm": 0.7339872717857361, + "learning_rate": 7.500028969561947e-05, + "loss": 0.6539, + "step": 104390 + }, + { + "epoch": 0.6669818432720442, + "grad_norm": 0.724774181842804, + "learning_rate": 7.499594414384015e-05, + "loss": 1.0477, + "step": 104400 + }, + { + "epoch": 0.667045730421783, + "grad_norm": 0.9447941184043884, + "learning_rate": 7.499159834033006e-05, + "loss": 0.9432, + "step": 104410 + }, + { + "epoch": 0.6671096175715217, + "grad_norm": 
1.0058971643447876, + "learning_rate": 7.498725228513295e-05, + "loss": 0.9858, + "step": 104420 + }, + { + "epoch": 0.6671735047212604, + "grad_norm": 0.9895200133323669, + "learning_rate": 7.49829059782926e-05, + "loss": 0.9344, + "step": 104430 + }, + { + "epoch": 0.6672373918709991, + "grad_norm": 0.8452537655830383, + "learning_rate": 7.497855941985274e-05, + "loss": 0.9845, + "step": 104440 + }, + { + "epoch": 0.6673012790207378, + "grad_norm": 1.1709915399551392, + "learning_rate": 7.497421260985721e-05, + "loss": 0.6073, + "step": 104450 + }, + { + "epoch": 0.6673651661704765, + "grad_norm": 0.6118887662887573, + "learning_rate": 7.496986554834974e-05, + "loss": 0.8184, + "step": 104460 + }, + { + "epoch": 0.6674290533202152, + "grad_norm": 1.082446575164795, + "learning_rate": 7.496551823537414e-05, + "loss": 0.7765, + "step": 104470 + }, + { + "epoch": 0.6674929404699539, + "grad_norm": 1.0023239850997925, + "learning_rate": 7.496117067097416e-05, + "loss": 0.9153, + "step": 104480 + }, + { + "epoch": 0.6675568276196926, + "grad_norm": 0.8243518471717834, + "learning_rate": 7.49568228551936e-05, + "loss": 0.7647, + "step": 104490 + }, + { + "epoch": 0.6676207147694313, + "grad_norm": 1.0458136796951294, + "learning_rate": 7.495247478807624e-05, + "loss": 0.7552, + "step": 104500 + }, + { + "epoch": 0.66768460191917, + "grad_norm": 1.0639128684997559, + "learning_rate": 7.494856131281384e-05, + "loss": 1.1131, + "step": 104510 + }, + { + "epoch": 0.6677484890689087, + "grad_norm": 0.9773111343383789, + "learning_rate": 7.494421276827722e-05, + "loss": 0.9946, + "step": 104520 + }, + { + "epoch": 0.6678123762186474, + "grad_norm": 0.8806718587875366, + "learning_rate": 7.493986397253079e-05, + "loss": 0.79, + "step": 104530 + }, + { + "epoch": 0.6678762633683861, + "grad_norm": 0.7560052275657654, + "learning_rate": 7.493551492561835e-05, + "loss": 0.8424, + "step": 104540 + }, + { + "epoch": 0.6679401505181248, + "grad_norm": 0.7948547601699829, + 
"learning_rate": 7.49311656275837e-05, + "loss": 0.992, + "step": 104550 + }, + { + "epoch": 0.6680040376678635, + "grad_norm": 0.8209701776504517, + "learning_rate": 7.492681607847064e-05, + "loss": 0.7234, + "step": 104560 + }, + { + "epoch": 0.6680679248176021, + "grad_norm": 0.6525776386260986, + "learning_rate": 7.492246627832297e-05, + "loss": 0.9871, + "step": 104570 + }, + { + "epoch": 0.6681318119673408, + "grad_norm": 0.7713031768798828, + "learning_rate": 7.491811622718454e-05, + "loss": 1.1684, + "step": 104580 + }, + { + "epoch": 0.6681956991170795, + "grad_norm": 0.7066755890846252, + "learning_rate": 7.49137659250991e-05, + "loss": 1.0549, + "step": 104590 + }, + { + "epoch": 0.6682595862668183, + "grad_norm": 0.9427279829978943, + "learning_rate": 7.490941537211047e-05, + "loss": 1.1706, + "step": 104600 + }, + { + "epoch": 0.668323473416557, + "grad_norm": 1.0990161895751953, + "learning_rate": 7.49050645682625e-05, + "loss": 0.7285, + "step": 104610 + }, + { + "epoch": 0.6683873605662957, + "grad_norm": 0.9260150790214539, + "learning_rate": 7.490071351359896e-05, + "loss": 1.0507, + "step": 104620 + }, + { + "epoch": 0.6684512477160344, + "grad_norm": 0.7509433627128601, + "learning_rate": 7.48963622081637e-05, + "loss": 1.1747, + "step": 104630 + }, + { + "epoch": 0.6685151348657731, + "grad_norm": 0.5178989768028259, + "learning_rate": 7.489201065200055e-05, + "loss": 0.7815, + "step": 104640 + }, + { + "epoch": 0.6685790220155118, + "grad_norm": 0.6780941486358643, + "learning_rate": 7.488765884515331e-05, + "loss": 0.8624, + "step": 104650 + }, + { + "epoch": 0.6686429091652505, + "grad_norm": 0.9320861101150513, + "learning_rate": 7.488330678766581e-05, + "loss": 0.7658, + "step": 104660 + }, + { + "epoch": 0.6687067963149892, + "grad_norm": 1.1950159072875977, + "learning_rate": 7.487895447958189e-05, + "loss": 0.7474, + "step": 104670 + }, + { + "epoch": 0.6687706834647279, + "grad_norm": 1.2028008699417114, + "learning_rate": 
7.487460192094538e-05, + "loss": 1.0459, + "step": 104680 + }, + { + "epoch": 0.6688345706144666, + "grad_norm": 0.6161251068115234, + "learning_rate": 7.48702491118001e-05, + "loss": 0.8281, + "step": 104690 + }, + { + "epoch": 0.6688984577642053, + "grad_norm": 0.6632505059242249, + "learning_rate": 7.48658960521899e-05, + "loss": 0.927, + "step": 104700 + }, + { + "epoch": 0.668962344913944, + "grad_norm": 1.3930466175079346, + "learning_rate": 7.48615427421586e-05, + "loss": 0.8673, + "step": 104710 + }, + { + "epoch": 0.6690262320636827, + "grad_norm": 0.6448533535003662, + "learning_rate": 7.485718918175006e-05, + "loss": 1.0228, + "step": 104720 + }, + { + "epoch": 0.6690901192134214, + "grad_norm": 0.997040331363678, + "learning_rate": 7.485283537100813e-05, + "loss": 0.952, + "step": 104730 + }, + { + "epoch": 0.6691540063631601, + "grad_norm": 0.7598833441734314, + "learning_rate": 7.484848130997664e-05, + "loss": 0.7925, + "step": 104740 + }, + { + "epoch": 0.6692178935128988, + "grad_norm": 1.102980375289917, + "learning_rate": 7.484412699869946e-05, + "loss": 0.8564, + "step": 104750 + }, + { + "epoch": 0.6692817806626375, + "grad_norm": 0.7010207772254944, + "learning_rate": 7.483977243722042e-05, + "loss": 0.821, + "step": 104760 + }, + { + "epoch": 0.6693456678123763, + "grad_norm": 0.6999570727348328, + "learning_rate": 7.483541762558338e-05, + "loss": 0.9697, + "step": 104770 + }, + { + "epoch": 0.669409554962115, + "grad_norm": 1.1638367176055908, + "learning_rate": 7.48310625638322e-05, + "loss": 0.7476, + "step": 104780 + }, + { + "epoch": 0.6694734421118537, + "grad_norm": 1.272621512413025, + "learning_rate": 7.482670725201075e-05, + "loss": 0.9064, + "step": 104790 + }, + { + "epoch": 0.6695373292615924, + "grad_norm": 1.063870906829834, + "learning_rate": 7.482235169016286e-05, + "loss": 1.0145, + "step": 104800 + }, + { + "epoch": 0.6696012164113311, + "grad_norm": 0.9829151630401611, + "learning_rate": 7.481799587833241e-05, + "loss": 
0.7884, + "step": 104810 + }, + { + "epoch": 0.6696651035610697, + "grad_norm": 0.8304445147514343, + "learning_rate": 7.481363981656329e-05, + "loss": 0.7588, + "step": 104820 + }, + { + "epoch": 0.6697289907108084, + "grad_norm": 0.8410045504570007, + "learning_rate": 7.480928350489935e-05, + "loss": 0.8533, + "step": 104830 + }, + { + "epoch": 0.6697928778605471, + "grad_norm": 0.8726821541786194, + "learning_rate": 7.480492694338445e-05, + "loss": 0.9978, + "step": 104840 + }, + { + "epoch": 0.6698567650102858, + "grad_norm": 0.6928712129592896, + "learning_rate": 7.480057013206248e-05, + "loss": 0.5947, + "step": 104850 + }, + { + "epoch": 0.6699206521600245, + "grad_norm": 0.9680970907211304, + "learning_rate": 7.479621307097732e-05, + "loss": 1.0667, + "step": 104860 + }, + { + "epoch": 0.6699845393097632, + "grad_norm": 1.0033652782440186, + "learning_rate": 7.479185576017283e-05, + "loss": 0.8343, + "step": 104870 + }, + { + "epoch": 0.6700484264595019, + "grad_norm": 0.9829990863800049, + "learning_rate": 7.478749819969291e-05, + "loss": 0.8276, + "step": 104880 + }, + { + "epoch": 0.6701123136092406, + "grad_norm": 0.591493546962738, + "learning_rate": 7.478314038958144e-05, + "loss": 0.8754, + "step": 104890 + }, + { + "epoch": 0.6701762007589793, + "grad_norm": 0.9688683748245239, + "learning_rate": 7.477878232988231e-05, + "loss": 0.7653, + "step": 104900 + }, + { + "epoch": 0.670240087908718, + "grad_norm": 1.1329195499420166, + "learning_rate": 7.47744240206394e-05, + "loss": 1.1918, + "step": 104910 + }, + { + "epoch": 0.6703039750584567, + "grad_norm": 0.9358948469161987, + "learning_rate": 7.47700654618966e-05, + "loss": 0.9408, + "step": 104920 + }, + { + "epoch": 0.6703678622081954, + "grad_norm": 0.8225820064544678, + "learning_rate": 7.476570665369782e-05, + "loss": 0.7708, + "step": 104930 + }, + { + "epoch": 0.6704317493579341, + "grad_norm": 0.5274181962013245, + "learning_rate": 7.476134759608695e-05, + "loss": 1.0094, + "step": 104940 + 
}, + { + "epoch": 0.6704956365076729, + "grad_norm": 0.9145537614822388, + "learning_rate": 7.475698828910789e-05, + "loss": 1.132, + "step": 104950 + }, + { + "epoch": 0.6705595236574116, + "grad_norm": 0.9951286315917969, + "learning_rate": 7.475262873280453e-05, + "loss": 0.7787, + "step": 104960 + }, + { + "epoch": 0.6706234108071503, + "grad_norm": 0.9020349979400635, + "learning_rate": 7.47482689272208e-05, + "loss": 0.8919, + "step": 104970 + }, + { + "epoch": 0.670687297956889, + "grad_norm": 1.0705788135528564, + "learning_rate": 7.474390887240058e-05, + "loss": 1.0581, + "step": 104980 + }, + { + "epoch": 0.6707511851066277, + "grad_norm": 0.665863573551178, + "learning_rate": 7.47395485683878e-05, + "loss": 1.1165, + "step": 104990 + }, + { + "epoch": 0.6708150722563664, + "grad_norm": 0.692719042301178, + "learning_rate": 7.473518801522636e-05, + "loss": 0.927, + "step": 105000 + }, + { + "epoch": 0.6708789594061051, + "grad_norm": 1.2955323457717896, + "learning_rate": 7.473082721296017e-05, + "loss": 0.7546, + "step": 105010 + }, + { + "epoch": 0.6709428465558438, + "grad_norm": 3.011267900466919, + "learning_rate": 7.472646616163317e-05, + "loss": 0.8958, + "step": 105020 + }, + { + "epoch": 0.6710067337055825, + "grad_norm": 1.2101144790649414, + "learning_rate": 7.472210486128926e-05, + "loss": 0.8662, + "step": 105030 + }, + { + "epoch": 0.6710706208553212, + "grad_norm": 1.1374763250350952, + "learning_rate": 7.471774331197235e-05, + "loss": 0.7575, + "step": 105040 + }, + { + "epoch": 0.6711345080050599, + "grad_norm": 0.7340751886367798, + "learning_rate": 7.47133815137264e-05, + "loss": 0.9587, + "step": 105050 + }, + { + "epoch": 0.6711983951547985, + "grad_norm": 0.9507089853286743, + "learning_rate": 7.470901946659529e-05, + "loss": 0.7413, + "step": 105060 + }, + { + "epoch": 0.6712622823045372, + "grad_norm": 0.9281381368637085, + "learning_rate": 7.470465717062301e-05, + "loss": 0.8137, + "step": 105070 + }, + { + "epoch": 
0.6713261694542759, + "grad_norm": 1.0482163429260254, + "learning_rate": 7.470029462585344e-05, + "loss": 0.9596, + "step": 105080 + }, + { + "epoch": 0.6713900566040146, + "grad_norm": 0.8228697180747986, + "learning_rate": 7.469593183233055e-05, + "loss": 0.9596, + "step": 105090 + }, + { + "epoch": 0.6714539437537533, + "grad_norm": 0.9611666798591614, + "learning_rate": 7.469156879009824e-05, + "loss": 0.9952, + "step": 105100 + }, + { + "epoch": 0.671517830903492, + "grad_norm": 0.9121928811073303, + "learning_rate": 7.468720549920049e-05, + "loss": 0.9997, + "step": 105110 + }, + { + "epoch": 0.6715817180532307, + "grad_norm": 0.6626456379890442, + "learning_rate": 7.468284195968122e-05, + "loss": 1.3452, + "step": 105120 + }, + { + "epoch": 0.6716456052029695, + "grad_norm": 1.1006265878677368, + "learning_rate": 7.467847817158438e-05, + "loss": 0.8195, + "step": 105130 + }, + { + "epoch": 0.6717094923527082, + "grad_norm": 0.881610631942749, + "learning_rate": 7.46741141349539e-05, + "loss": 0.7741, + "step": 105140 + }, + { + "epoch": 0.6717733795024469, + "grad_norm": 0.8922167420387268, + "learning_rate": 7.466974984983374e-05, + "loss": 0.6272, + "step": 105150 + }, + { + "epoch": 0.6718372666521856, + "grad_norm": 0.9514210820198059, + "learning_rate": 7.466538531626788e-05, + "loss": 0.8509, + "step": 105160 + }, + { + "epoch": 0.6719011538019243, + "grad_norm": 1.4604068994522095, + "learning_rate": 7.466102053430023e-05, + "loss": 1.0156, + "step": 105170 + }, + { + "epoch": 0.671965040951663, + "grad_norm": 0.619574785232544, + "learning_rate": 7.46566555039748e-05, + "loss": 0.8557, + "step": 105180 + }, + { + "epoch": 0.6720289281014017, + "grad_norm": 0.6358922123908997, + "learning_rate": 7.46522902253355e-05, + "loss": 0.7148, + "step": 105190 + }, + { + "epoch": 0.6720928152511404, + "grad_norm": 1.179234266281128, + "learning_rate": 7.46479246984263e-05, + "loss": 1.074, + "step": 105200 + }, + { + "epoch": 0.6721567024008791, + 
"grad_norm": 1.14448881149292, + "learning_rate": 7.464355892329119e-05, + "loss": 1.2771, + "step": 105210 + }, + { + "epoch": 0.6722205895506178, + "grad_norm": 1.1141912937164307, + "learning_rate": 7.463919289997413e-05, + "loss": 1.0229, + "step": 105220 + }, + { + "epoch": 0.6722844767003565, + "grad_norm": 1.5927108526229858, + "learning_rate": 7.463482662851904e-05, + "loss": 0.7145, + "step": 105230 + }, + { + "epoch": 0.6723483638500952, + "grad_norm": 0.9684871435165405, + "learning_rate": 7.463046010896996e-05, + "loss": 0.862, + "step": 105240 + }, + { + "epoch": 0.6724122509998339, + "grad_norm": 2.3723561763763428, + "learning_rate": 7.462609334137085e-05, + "loss": 0.8466, + "step": 105250 + }, + { + "epoch": 0.6724761381495726, + "grad_norm": 1.4940024614334106, + "learning_rate": 7.462172632576566e-05, + "loss": 1.0536, + "step": 105260 + }, + { + "epoch": 0.6725400252993113, + "grad_norm": 2.5333075523376465, + "learning_rate": 7.46173590621984e-05, + "loss": 0.8354, + "step": 105270 + }, + { + "epoch": 0.67260391244905, + "grad_norm": 0.7402299046516418, + "learning_rate": 7.461299155071302e-05, + "loss": 0.8569, + "step": 105280 + }, + { + "epoch": 0.6726677995987888, + "grad_norm": 1.2563143968582153, + "learning_rate": 7.460862379135353e-05, + "loss": 0.9467, + "step": 105290 + }, + { + "epoch": 0.6727316867485273, + "grad_norm": 0.6169116497039795, + "learning_rate": 7.460425578416392e-05, + "loss": 0.8101, + "step": 105300 + }, + { + "epoch": 0.672795573898266, + "grad_norm": 0.9723251461982727, + "learning_rate": 7.459988752918815e-05, + "loss": 0.836, + "step": 105310 + }, + { + "epoch": 0.6728594610480048, + "grad_norm": 0.9852758049964905, + "learning_rate": 7.459551902647023e-05, + "loss": 0.7975, + "step": 105320 + }, + { + "epoch": 0.6729233481977435, + "grad_norm": 0.8491624593734741, + "learning_rate": 7.459115027605416e-05, + "loss": 1.0277, + "step": 105330 + }, + { + "epoch": 0.6729872353474822, + "grad_norm": 
0.9614421725273132, + "learning_rate": 7.458678127798394e-05, + "loss": 0.6645, + "step": 105340 + }, + { + "epoch": 0.6730511224972209, + "grad_norm": 0.9115906357765198, + "learning_rate": 7.458241203230355e-05, + "loss": 0.8675, + "step": 105350 + }, + { + "epoch": 0.6731150096469596, + "grad_norm": 0.9010282158851624, + "learning_rate": 7.457804253905701e-05, + "loss": 0.8478, + "step": 105360 + }, + { + "epoch": 0.6731788967966983, + "grad_norm": 1.2737298011779785, + "learning_rate": 7.457367279828833e-05, + "loss": 0.7011, + "step": 105370 + }, + { + "epoch": 0.673242783946437, + "grad_norm": 1.0771639347076416, + "learning_rate": 7.456930281004148e-05, + "loss": 1.0038, + "step": 105380 + }, + { + "epoch": 0.6733066710961757, + "grad_norm": 0.9873582124710083, + "learning_rate": 7.456493257436052e-05, + "loss": 0.858, + "step": 105390 + }, + { + "epoch": 0.6733705582459144, + "grad_norm": 0.7805922031402588, + "learning_rate": 7.456056209128942e-05, + "loss": 0.9136, + "step": 105400 + }, + { + "epoch": 0.6734344453956531, + "grad_norm": 0.9018038511276245, + "learning_rate": 7.455619136087221e-05, + "loss": 1.0227, + "step": 105410 + }, + { + "epoch": 0.6734983325453918, + "grad_norm": 0.6552641987800598, + "learning_rate": 7.455182038315294e-05, + "loss": 1.0684, + "step": 105420 + }, + { + "epoch": 0.6735622196951305, + "grad_norm": 1.088218331336975, + "learning_rate": 7.454744915817557e-05, + "loss": 0.833, + "step": 105430 + }, + { + "epoch": 0.6736261068448692, + "grad_norm": 1.2808659076690674, + "learning_rate": 7.454307768598416e-05, + "loss": 0.6516, + "step": 105440 + }, + { + "epoch": 0.6736899939946079, + "grad_norm": 0.9723607301712036, + "learning_rate": 7.453870596662271e-05, + "loss": 1.0049, + "step": 105450 + }, + { + "epoch": 0.6737538811443466, + "grad_norm": 0.6379223465919495, + "learning_rate": 7.453433400013528e-05, + "loss": 0.9626, + "step": 105460 + }, + { + "epoch": 0.6738177682940854, + "grad_norm": 0.5692765712738037, + 
"learning_rate": 7.452996178656587e-05, + "loss": 0.7118, + "step": 105470 + }, + { + "epoch": 0.6738816554438241, + "grad_norm": 0.4991033971309662, + "learning_rate": 7.452558932595853e-05, + "loss": 0.8539, + "step": 105480 + }, + { + "epoch": 0.6739455425935628, + "grad_norm": 0.6770216226577759, + "learning_rate": 7.45212166183573e-05, + "loss": 0.797, + "step": 105490 + }, + { + "epoch": 0.6740094297433015, + "grad_norm": 1.0302858352661133, + "learning_rate": 7.451728097037279e-05, + "loss": 1.1371, + "step": 105500 + }, + { + "epoch": 0.6740733168930402, + "grad_norm": 0.986290693283081, + "learning_rate": 7.451290779360444e-05, + "loss": 0.8325, + "step": 105510 + }, + { + "epoch": 0.6741372040427789, + "grad_norm": 0.6854764223098755, + "learning_rate": 7.450853436996992e-05, + "loss": 1.201, + "step": 105520 + }, + { + "epoch": 0.6742010911925176, + "grad_norm": 1.172593116760254, + "learning_rate": 7.450416069951324e-05, + "loss": 0.7934, + "step": 105530 + }, + { + "epoch": 0.6742649783422563, + "grad_norm": 0.6270721554756165, + "learning_rate": 7.44997867822785e-05, + "loss": 0.944, + "step": 105540 + }, + { + "epoch": 0.6743288654919949, + "grad_norm": 1.119352102279663, + "learning_rate": 7.449541261830968e-05, + "loss": 0.9087, + "step": 105550 + }, + { + "epoch": 0.6743927526417336, + "grad_norm": 1.074959397315979, + "learning_rate": 7.449103820765086e-05, + "loss": 0.7586, + "step": 105560 + }, + { + "epoch": 0.6744566397914723, + "grad_norm": 0.7056079506874084, + "learning_rate": 7.44866635503461e-05, + "loss": 0.9738, + "step": 105570 + }, + { + "epoch": 0.674520526941211, + "grad_norm": 0.5963863730430603, + "learning_rate": 7.448228864643947e-05, + "loss": 0.7275, + "step": 105580 + }, + { + "epoch": 0.6745844140909497, + "grad_norm": 0.836320698261261, + "learning_rate": 7.447791349597502e-05, + "loss": 0.9359, + "step": 105590 + }, + { + "epoch": 0.6746483012406884, + "grad_norm": 0.8702114224433899, + "learning_rate": 
7.447353809899677e-05, + "loss": 0.8465, + "step": 105600 + }, + { + "epoch": 0.6747121883904271, + "grad_norm": 0.9476937651634216, + "learning_rate": 7.446916245554885e-05, + "loss": 1.0381, + "step": 105610 + }, + { + "epoch": 0.6747760755401658, + "grad_norm": 1.5769599676132202, + "learning_rate": 7.446478656567529e-05, + "loss": 1.1323, + "step": 105620 + }, + { + "epoch": 0.6748399626899045, + "grad_norm": 0.9263478517532349, + "learning_rate": 7.446041042942016e-05, + "loss": 1.2046, + "step": 105630 + }, + { + "epoch": 0.6749038498396432, + "grad_norm": 0.5203749537467957, + "learning_rate": 7.445603404682754e-05, + "loss": 0.9345, + "step": 105640 + }, + { + "epoch": 0.674967736989382, + "grad_norm": 1.511046290397644, + "learning_rate": 7.445165741794149e-05, + "loss": 0.7772, + "step": 105650 + }, + { + "epoch": 0.6750316241391207, + "grad_norm": 0.868693470954895, + "learning_rate": 7.44472805428061e-05, + "loss": 0.9579, + "step": 105660 + }, + { + "epoch": 0.6750955112888594, + "grad_norm": 0.7717391848564148, + "learning_rate": 7.444290342146545e-05, + "loss": 0.911, + "step": 105670 + }, + { + "epoch": 0.6751593984385981, + "grad_norm": 1.1811197996139526, + "learning_rate": 7.443852605396361e-05, + "loss": 0.8844, + "step": 105680 + }, + { + "epoch": 0.6752232855883368, + "grad_norm": 0.9373357892036438, + "learning_rate": 7.443414844034468e-05, + "loss": 1.0509, + "step": 105690 + }, + { + "epoch": 0.6752871727380755, + "grad_norm": 1.0546302795410156, + "learning_rate": 7.442977058065273e-05, + "loss": 0.8802, + "step": 105700 + }, + { + "epoch": 0.6753510598878142, + "grad_norm": 0.8621144890785217, + "learning_rate": 7.442539247493185e-05, + "loss": 0.6729, + "step": 105710 + }, + { + "epoch": 0.6754149470375529, + "grad_norm": 0.6948429942131042, + "learning_rate": 7.442101412322613e-05, + "loss": 0.9194, + "step": 105720 + }, + { + "epoch": 0.6754788341872916, + "grad_norm": 1.381230354309082, + "learning_rate": 7.441663552557969e-05, + 
"loss": 0.7042, + "step": 105730 + }, + { + "epoch": 0.6755427213370303, + "grad_norm": 1.3487558364868164, + "learning_rate": 7.441225668203658e-05, + "loss": 0.8875, + "step": 105740 + }, + { + "epoch": 0.675606608486769, + "grad_norm": 1.0803183317184448, + "learning_rate": 7.440787759264095e-05, + "loss": 1.0096, + "step": 105750 + }, + { + "epoch": 0.6756704956365077, + "grad_norm": 1.2716935873031616, + "learning_rate": 7.440349825743687e-05, + "loss": 0.7651, + "step": 105760 + }, + { + "epoch": 0.6757343827862464, + "grad_norm": 1.0950727462768555, + "learning_rate": 7.439911867646845e-05, + "loss": 0.8156, + "step": 105770 + }, + { + "epoch": 0.6757982699359851, + "grad_norm": 0.8086333870887756, + "learning_rate": 7.43947388497798e-05, + "loss": 0.8297, + "step": 105780 + }, + { + "epoch": 0.6758621570857237, + "grad_norm": 0.7670168876647949, + "learning_rate": 7.439035877741503e-05, + "loss": 0.855, + "step": 105790 + }, + { + "epoch": 0.6759260442354624, + "grad_norm": 0.7938393950462341, + "learning_rate": 7.438597845941824e-05, + "loss": 0.8926, + "step": 105800 + }, + { + "epoch": 0.6759899313852011, + "grad_norm": 0.7349621057510376, + "learning_rate": 7.438159789583354e-05, + "loss": 0.9497, + "step": 105810 + }, + { + "epoch": 0.6760538185349398, + "grad_norm": 0.7684302926063538, + "learning_rate": 7.437721708670508e-05, + "loss": 0.9919, + "step": 105820 + }, + { + "epoch": 0.6761177056846785, + "grad_norm": 1.0219396352767944, + "learning_rate": 7.437283603207693e-05, + "loss": 0.8476, + "step": 105830 + }, + { + "epoch": 0.6761815928344173, + "grad_norm": 0.8535874485969543, + "learning_rate": 7.436845473199325e-05, + "loss": 0.8841, + "step": 105840 + }, + { + "epoch": 0.676245479984156, + "grad_norm": 0.8949576020240784, + "learning_rate": 7.436407318649814e-05, + "loss": 0.8545, + "step": 105850 + }, + { + "epoch": 0.6763093671338947, + "grad_norm": 1.5550041198730469, + "learning_rate": 7.435969139563574e-05, + "loss": 1.1006, + "step": 
105860 + }, + { + "epoch": 0.6763732542836334, + "grad_norm": 0.8512755036354065, + "learning_rate": 7.435530935945018e-05, + "loss": 0.9107, + "step": 105870 + }, + { + "epoch": 0.6764371414333721, + "grad_norm": 0.6899836659431458, + "learning_rate": 7.435092707798559e-05, + "loss": 1.1033, + "step": 105880 + }, + { + "epoch": 0.6765010285831108, + "grad_norm": 0.6218075156211853, + "learning_rate": 7.434654455128607e-05, + "loss": 0.8597, + "step": 105890 + }, + { + "epoch": 0.6765649157328495, + "grad_norm": 0.6466425657272339, + "learning_rate": 7.43421617793958e-05, + "loss": 0.798, + "step": 105900 + }, + { + "epoch": 0.6766288028825882, + "grad_norm": 0.6886029243469238, + "learning_rate": 7.43377787623589e-05, + "loss": 0.8296, + "step": 105910 + }, + { + "epoch": 0.6766926900323269, + "grad_norm": 0.6372695565223694, + "learning_rate": 7.433339550021951e-05, + "loss": 0.8947, + "step": 105920 + }, + { + "epoch": 0.6767565771820656, + "grad_norm": 0.8008190393447876, + "learning_rate": 7.43290119930218e-05, + "loss": 0.9288, + "step": 105930 + }, + { + "epoch": 0.6768204643318043, + "grad_norm": 0.8058283925056458, + "learning_rate": 7.432462824080985e-05, + "loss": 0.8823, + "step": 105940 + }, + { + "epoch": 0.676884351481543, + "grad_norm": 0.7353378534317017, + "learning_rate": 7.432024424362789e-05, + "loss": 0.9896, + "step": 105950 + }, + { + "epoch": 0.6769482386312817, + "grad_norm": 1.0405430793762207, + "learning_rate": 7.431586000152001e-05, + "loss": 1.3537, + "step": 105960 + }, + { + "epoch": 0.6770121257810204, + "grad_norm": 0.5413171052932739, + "learning_rate": 7.431147551453038e-05, + "loss": 0.8819, + "step": 105970 + }, + { + "epoch": 0.6770760129307591, + "grad_norm": 1.0479340553283691, + "learning_rate": 7.430709078270316e-05, + "loss": 0.9891, + "step": 105980 + }, + { + "epoch": 0.6771399000804978, + "grad_norm": 1.1004263162612915, + "learning_rate": 7.430270580608252e-05, + "loss": 0.8119, + "step": 105990 + }, + { + "epoch": 
0.6772037872302366, + "grad_norm": 0.6329840421676636, + "learning_rate": 7.42983205847126e-05, + "loss": 0.8123, + "step": 106000 + }, + { + "epoch": 0.6772676743799753, + "grad_norm": 0.8763070702552795, + "learning_rate": 7.429393511863757e-05, + "loss": 0.8839, + "step": 106010 + }, + { + "epoch": 0.677331561529714, + "grad_norm": 1.7734843492507935, + "learning_rate": 7.42895494079016e-05, + "loss": 0.9038, + "step": 106020 + }, + { + "epoch": 0.6773954486794526, + "grad_norm": 1.5639463663101196, + "learning_rate": 7.428516345254886e-05, + "loss": 0.7489, + "step": 106030 + }, + { + "epoch": 0.6774593358291913, + "grad_norm": 0.8904886245727539, + "learning_rate": 7.42807772526235e-05, + "loss": 0.7974, + "step": 106040 + }, + { + "epoch": 0.67752322297893, + "grad_norm": 0.9649606347084045, + "learning_rate": 7.42763908081697e-05, + "loss": 1.0977, + "step": 106050 + }, + { + "epoch": 0.6775871101286687, + "grad_norm": 0.7616420984268188, + "learning_rate": 7.427200411923166e-05, + "loss": 0.9091, + "step": 106060 + }, + { + "epoch": 0.6776509972784074, + "grad_norm": 1.3879841566085815, + "learning_rate": 7.426761718585353e-05, + "loss": 0.9823, + "step": 106070 + }, + { + "epoch": 0.6777148844281461, + "grad_norm": 1.2231416702270508, + "learning_rate": 7.426323000807951e-05, + "loss": 0.8862, + "step": 106080 + }, + { + "epoch": 0.6777787715778848, + "grad_norm": 1.1057007312774658, + "learning_rate": 7.425884258595377e-05, + "loss": 0.9518, + "step": 106090 + }, + { + "epoch": 0.6778426587276235, + "grad_norm": 0.7669041156768799, + "learning_rate": 7.42544549195205e-05, + "loss": 0.8201, + "step": 106100 + }, + { + "epoch": 0.6779065458773622, + "grad_norm": 0.9496064186096191, + "learning_rate": 7.425006700882388e-05, + "loss": 0.747, + "step": 106110 + }, + { + "epoch": 0.6779704330271009, + "grad_norm": 0.8966147899627686, + "learning_rate": 7.424567885390811e-05, + "loss": 0.9232, + "step": 106120 + }, + { + "epoch": 0.6780343201768396, + 
"grad_norm": 0.8240459561347961, + "learning_rate": 7.424129045481738e-05, + "loss": 0.9572, + "step": 106130 + }, + { + "epoch": 0.6780982073265783, + "grad_norm": 0.9006532430648804, + "learning_rate": 7.423690181159588e-05, + "loss": 1.0682, + "step": 106140 + }, + { + "epoch": 0.678162094476317, + "grad_norm": 0.6999794840812683, + "learning_rate": 7.423251292428782e-05, + "loss": 0.7835, + "step": 106150 + }, + { + "epoch": 0.6782259816260557, + "grad_norm": 0.639180600643158, + "learning_rate": 7.422812379293738e-05, + "loss": 0.9808, + "step": 106160 + }, + { + "epoch": 0.6782898687757944, + "grad_norm": 0.9818177819252014, + "learning_rate": 7.422373441758877e-05, + "loss": 0.9845, + "step": 106170 + }, + { + "epoch": 0.6783537559255332, + "grad_norm": 0.85085529088974, + "learning_rate": 7.421934479828621e-05, + "loss": 1.0079, + "step": 106180 + }, + { + "epoch": 0.6784176430752719, + "grad_norm": 1.0107144117355347, + "learning_rate": 7.421495493507388e-05, + "loss": 0.71, + "step": 106190 + }, + { + "epoch": 0.6784815302250106, + "grad_norm": 0.8467554450035095, + "learning_rate": 7.421056482799602e-05, + "loss": 0.8878, + "step": 106200 + }, + { + "epoch": 0.6785454173747493, + "grad_norm": 1.0272150039672852, + "learning_rate": 7.42061744770968e-05, + "loss": 1.0742, + "step": 106210 + }, + { + "epoch": 0.678609304524488, + "grad_norm": 0.5289245247840881, + "learning_rate": 7.42017838824205e-05, + "loss": 0.6677, + "step": 106220 + }, + { + "epoch": 0.6786731916742267, + "grad_norm": 0.78628009557724, + "learning_rate": 7.419739304401127e-05, + "loss": 0.9517, + "step": 106230 + }, + { + "epoch": 0.6787370788239654, + "grad_norm": 1.0156890153884888, + "learning_rate": 7.419300196191338e-05, + "loss": 1.0061, + "step": 106240 + }, + { + "epoch": 0.6788009659737041, + "grad_norm": 1.2271900177001953, + "learning_rate": 7.418861063617102e-05, + "loss": 0.8683, + "step": 106250 + }, + { + "epoch": 0.6788648531234428, + "grad_norm": 2.280670404434204, + 
"learning_rate": 7.41842190668284e-05, + "loss": 0.7783, + "step": 106260 + }, + { + "epoch": 0.6789287402731814, + "grad_norm": 0.7349517345428467, + "learning_rate": 7.41798272539298e-05, + "loss": 0.8688, + "step": 106270 + }, + { + "epoch": 0.6789926274229201, + "grad_norm": 0.9518811702728271, + "learning_rate": 7.417543519751943e-05, + "loss": 0.7962, + "step": 106280 + }, + { + "epoch": 0.6790565145726588, + "grad_norm": 1.090990662574768, + "learning_rate": 7.41710428976415e-05, + "loss": 0.9509, + "step": 106290 + }, + { + "epoch": 0.6791204017223975, + "grad_norm": 0.817570149898529, + "learning_rate": 7.416665035434025e-05, + "loss": 0.8855, + "step": 106300 + }, + { + "epoch": 0.6791842888721362, + "grad_norm": 0.6482291221618652, + "learning_rate": 7.416225756765993e-05, + "loss": 0.7992, + "step": 106310 + }, + { + "epoch": 0.6792481760218749, + "grad_norm": 1.4157582521438599, + "learning_rate": 7.415786453764478e-05, + "loss": 1.0889, + "step": 106320 + }, + { + "epoch": 0.6793120631716136, + "grad_norm": 1.1152769327163696, + "learning_rate": 7.415347126433903e-05, + "loss": 1.1192, + "step": 106330 + }, + { + "epoch": 0.6793759503213523, + "grad_norm": 0.8868082761764526, + "learning_rate": 7.414907774778693e-05, + "loss": 0.7684, + "step": 106340 + }, + { + "epoch": 0.679439837471091, + "grad_norm": 0.8544641137123108, + "learning_rate": 7.414468398803272e-05, + "loss": 0.8531, + "step": 106350 + }, + { + "epoch": 0.6795037246208298, + "grad_norm": 2.2637743949890137, + "learning_rate": 7.414028998512065e-05, + "loss": 0.8975, + "step": 106360 + }, + { + "epoch": 0.6795676117705685, + "grad_norm": 0.912253737449646, + "learning_rate": 7.413589573909498e-05, + "loss": 0.6954, + "step": 106370 + }, + { + "epoch": 0.6796314989203072, + "grad_norm": 0.7223014831542969, + "learning_rate": 7.413150124999997e-05, + "loss": 0.9605, + "step": 106380 + }, + { + "epoch": 0.6796953860700459, + "grad_norm": 0.7518347501754761, + "learning_rate": 
7.412710651787986e-05, + "loss": 0.7462, + "step": 106390 + }, + { + "epoch": 0.6797592732197846, + "grad_norm": 0.6483036279678345, + "learning_rate": 7.412271154277891e-05, + "loss": 0.8934, + "step": 106400 + }, + { + "epoch": 0.6798231603695233, + "grad_norm": 1.010314702987671, + "learning_rate": 7.411831632474138e-05, + "loss": 0.724, + "step": 106410 + }, + { + "epoch": 0.679887047519262, + "grad_norm": 0.7592995762825012, + "learning_rate": 7.411392086381154e-05, + "loss": 0.8157, + "step": 106420 + }, + { + "epoch": 0.6799509346690007, + "grad_norm": 1.2588444948196411, + "learning_rate": 7.410952516003367e-05, + "loss": 0.9328, + "step": 106430 + }, + { + "epoch": 0.6800148218187394, + "grad_norm": 0.7056863903999329, + "learning_rate": 7.410512921345201e-05, + "loss": 1.0183, + "step": 106440 + }, + { + "epoch": 0.6800787089684781, + "grad_norm": 0.7405192255973816, + "learning_rate": 7.410073302411085e-05, + "loss": 0.7652, + "step": 106450 + }, + { + "epoch": 0.6801425961182168, + "grad_norm": 0.6608672738075256, + "learning_rate": 7.409633659205446e-05, + "loss": 0.9101, + "step": 106460 + }, + { + "epoch": 0.6802064832679555, + "grad_norm": 1.030137300491333, + "learning_rate": 7.409193991732711e-05, + "loss": 0.849, + "step": 106470 + }, + { + "epoch": 0.6802703704176942, + "grad_norm": 0.6124225854873657, + "learning_rate": 7.40875429999731e-05, + "loss": 0.758, + "step": 106480 + }, + { + "epoch": 0.6803342575674329, + "grad_norm": 0.8795433640480042, + "learning_rate": 7.408314584003666e-05, + "loss": 1.0669, + "step": 106490 + }, + { + "epoch": 0.6803981447171716, + "grad_norm": 0.7640893459320068, + "learning_rate": 7.407874843756213e-05, + "loss": 0.8179, + "step": 106500 + }, + { + "epoch": 0.6804620318669103, + "grad_norm": 0.6787682771682739, + "learning_rate": 7.407435079259377e-05, + "loss": 0.9895, + "step": 106510 + }, + { + "epoch": 0.6805259190166489, + "grad_norm": 0.8706437349319458, + "learning_rate": 7.406995290517587e-05, + 
"loss": 0.7269, + "step": 106520 + }, + { + "epoch": 0.6805898061663876, + "grad_norm": 0.6258346438407898, + "learning_rate": 7.406555477535271e-05, + "loss": 1.0131, + "step": 106530 + }, + { + "epoch": 0.6806536933161264, + "grad_norm": 1.0943886041641235, + "learning_rate": 7.406115640316861e-05, + "loss": 0.7417, + "step": 106540 + }, + { + "epoch": 0.6807175804658651, + "grad_norm": 0.7393679618835449, + "learning_rate": 7.405675778866785e-05, + "loss": 0.8613, + "step": 106550 + }, + { + "epoch": 0.6807814676156038, + "grad_norm": 0.8770964741706848, + "learning_rate": 7.40523589318947e-05, + "loss": 1.2317, + "step": 106560 + }, + { + "epoch": 0.6808453547653425, + "grad_norm": 0.980842649936676, + "learning_rate": 7.404795983289351e-05, + "loss": 0.8648, + "step": 106570 + }, + { + "epoch": 0.6809092419150812, + "grad_norm": 0.7715876698493958, + "learning_rate": 7.404356049170856e-05, + "loss": 0.9493, + "step": 106580 + }, + { + "epoch": 0.6809731290648199, + "grad_norm": 0.8744866847991943, + "learning_rate": 7.403916090838414e-05, + "loss": 1.3351, + "step": 106590 + }, + { + "epoch": 0.6810370162145586, + "grad_norm": 2.178861618041992, + "learning_rate": 7.403476108296458e-05, + "loss": 1.0401, + "step": 106600 + }, + { + "epoch": 0.6811009033642973, + "grad_norm": 0.7490164637565613, + "learning_rate": 7.40303610154942e-05, + "loss": 0.8825, + "step": 106610 + }, + { + "epoch": 0.681164790514036, + "grad_norm": 1.3160593509674072, + "learning_rate": 7.402596070601729e-05, + "loss": 0.9475, + "step": 106620 + }, + { + "epoch": 0.6812286776637747, + "grad_norm": 0.7300577163696289, + "learning_rate": 7.402156015457815e-05, + "loss": 0.7993, + "step": 106630 + }, + { + "epoch": 0.6812925648135134, + "grad_norm": 1.1624113321304321, + "learning_rate": 7.401715936122114e-05, + "loss": 1.0644, + "step": 106640 + }, + { + "epoch": 0.6813564519632521, + "grad_norm": 0.6754822134971619, + "learning_rate": 7.401275832599054e-05, + "loss": 0.9375, + "step": 
106650 + }, + { + "epoch": 0.6814203391129908, + "grad_norm": 0.8442546129226685, + "learning_rate": 7.40083570489307e-05, + "loss": 1.0102, + "step": 106660 + }, + { + "epoch": 0.6814842262627295, + "grad_norm": 0.8470264673233032, + "learning_rate": 7.400395553008593e-05, + "loss": 0.8809, + "step": 106670 + }, + { + "epoch": 0.6815481134124682, + "grad_norm": 1.252909541130066, + "learning_rate": 7.399955376950056e-05, + "loss": 0.9274, + "step": 106680 + }, + { + "epoch": 0.6816120005622069, + "grad_norm": 1.0591319799423218, + "learning_rate": 7.399515176721894e-05, + "loss": 0.7077, + "step": 106690 + }, + { + "epoch": 0.6816758877119456, + "grad_norm": 0.9662178754806519, + "learning_rate": 7.399074952328536e-05, + "loss": 0.9326, + "step": 106700 + }, + { + "epoch": 0.6817397748616844, + "grad_norm": 0.6794439554214478, + "learning_rate": 7.398634703774417e-05, + "loss": 0.9654, + "step": 106710 + }, + { + "epoch": 0.6818036620114231, + "grad_norm": 1.1868617534637451, + "learning_rate": 7.398194431063974e-05, + "loss": 0.6711, + "step": 106720 + }, + { + "epoch": 0.6818675491611618, + "grad_norm": 0.6283101439476013, + "learning_rate": 7.397754134201637e-05, + "loss": 0.7644, + "step": 106730 + }, + { + "epoch": 0.6819314363109005, + "grad_norm": 0.9207131862640381, + "learning_rate": 7.397313813191842e-05, + "loss": 1.1804, + "step": 106740 + }, + { + "epoch": 0.6819953234606392, + "grad_norm": 0.7542859315872192, + "learning_rate": 7.396873468039022e-05, + "loss": 1.1054, + "step": 106750 + }, + { + "epoch": 0.6820592106103778, + "grad_norm": 1.1628599166870117, + "learning_rate": 7.396433098747613e-05, + "loss": 0.7886, + "step": 106760 + }, + { + "epoch": 0.6821230977601165, + "grad_norm": 0.9535654187202454, + "learning_rate": 7.39599270532205e-05, + "loss": 0.8177, + "step": 106770 + }, + { + "epoch": 0.6821869849098552, + "grad_norm": 1.606237769126892, + "learning_rate": 7.395552287766766e-05, + "loss": 0.9816, + "step": 106780 + }, + { + "epoch": 
0.6822508720595939, + "grad_norm": 0.8882198333740234, + "learning_rate": 7.395111846086201e-05, + "loss": 0.6792, + "step": 106790 + }, + { + "epoch": 0.6823147592093326, + "grad_norm": 0.7362374067306519, + "learning_rate": 7.394671380284784e-05, + "loss": 0.8806, + "step": 106800 + }, + { + "epoch": 0.6823786463590713, + "grad_norm": 0.7599479556083679, + "learning_rate": 7.394230890366956e-05, + "loss": 0.8613, + "step": 106810 + }, + { + "epoch": 0.68244253350881, + "grad_norm": 0.7655912041664124, + "learning_rate": 7.393790376337153e-05, + "loss": 0.8717, + "step": 106820 + }, + { + "epoch": 0.6825064206585487, + "grad_norm": 1.046034812927246, + "learning_rate": 7.393349838199809e-05, + "loss": 1.1742, + "step": 106830 + }, + { + "epoch": 0.6825703078082874, + "grad_norm": 0.7715229392051697, + "learning_rate": 7.392909275959362e-05, + "loss": 1.078, + "step": 106840 + }, + { + "epoch": 0.6826341949580261, + "grad_norm": 0.7597649097442627, + "learning_rate": 7.39246868962025e-05, + "loss": 0.971, + "step": 106850 + }, + { + "epoch": 0.6826980821077648, + "grad_norm": 0.5466295480728149, + "learning_rate": 7.392028079186906e-05, + "loss": 0.7825, + "step": 106860 + }, + { + "epoch": 0.6827619692575035, + "grad_norm": 4.8443284034729, + "learning_rate": 7.39158744466377e-05, + "loss": 1.1009, + "step": 106870 + }, + { + "epoch": 0.6828258564072422, + "grad_norm": 0.6265544891357422, + "learning_rate": 7.39114678605528e-05, + "loss": 0.8694, + "step": 106880 + }, + { + "epoch": 0.682889743556981, + "grad_norm": 0.917610764503479, + "learning_rate": 7.390706103365873e-05, + "loss": 0.9782, + "step": 106890 + }, + { + "epoch": 0.6829536307067197, + "grad_norm": 0.9550445079803467, + "learning_rate": 7.390265396599987e-05, + "loss": 0.906, + "step": 106900 + }, + { + "epoch": 0.6830175178564584, + "grad_norm": 2.5587947368621826, + "learning_rate": 7.389824665762061e-05, + "loss": 0.8528, + "step": 106910 + }, + { + "epoch": 0.6830814050061971, + "grad_norm": 
1.318000078201294, + "learning_rate": 7.389383910856534e-05, + "loss": 0.9362, + "step": 106920 + }, + { + "epoch": 0.6831452921559358, + "grad_norm": 1.0165103673934937, + "learning_rate": 7.388943131887842e-05, + "loss": 0.7795, + "step": 106930 + }, + { + "epoch": 0.6832091793056745, + "grad_norm": 0.9445672631263733, + "learning_rate": 7.388502328860427e-05, + "loss": 0.9833, + "step": 106940 + }, + { + "epoch": 0.6832730664554132, + "grad_norm": 1.0553864240646362, + "learning_rate": 7.388061501778727e-05, + "loss": 0.8713, + "step": 106950 + }, + { + "epoch": 0.6833369536051519, + "grad_norm": 0.913757860660553, + "learning_rate": 7.387620650647182e-05, + "loss": 0.9192, + "step": 106960 + }, + { + "epoch": 0.6834008407548906, + "grad_norm": 0.7922553420066833, + "learning_rate": 7.387179775470232e-05, + "loss": 0.8956, + "step": 106970 + }, + { + "epoch": 0.6834647279046293, + "grad_norm": 0.7192181348800659, + "learning_rate": 7.386738876252315e-05, + "loss": 0.8198, + "step": 106980 + }, + { + "epoch": 0.683528615054368, + "grad_norm": 1.0555779933929443, + "learning_rate": 7.386297952997874e-05, + "loss": 0.7005, + "step": 106990 + }, + { + "epoch": 0.6835925022041066, + "grad_norm": 1.0021594762802124, + "learning_rate": 7.385857005711348e-05, + "loss": 0.8504, + "step": 107000 + }, + { + "epoch": 0.6836563893538453, + "grad_norm": 0.5227010250091553, + "learning_rate": 7.385416034397177e-05, + "loss": 0.8899, + "step": 107010 + }, + { + "epoch": 0.683720276503584, + "grad_norm": 0.47646623849868774, + "learning_rate": 7.384975039059802e-05, + "loss": 1.0871, + "step": 107020 + }, + { + "epoch": 0.6837841636533227, + "grad_norm": 1.0652568340301514, + "learning_rate": 7.384534019703667e-05, + "loss": 0.9768, + "step": 107030 + }, + { + "epoch": 0.6838480508030614, + "grad_norm": 0.7635281682014465, + "learning_rate": 7.384092976333212e-05, + "loss": 0.957, + "step": 107040 + }, + { + "epoch": 0.6839119379528001, + "grad_norm": 0.6990230083465576, + 
"learning_rate": 7.383651908952877e-05, + "loss": 0.8431, + "step": 107050 + }, + { + "epoch": 0.6839758251025388, + "grad_norm": 1.1831239461898804, + "learning_rate": 7.383210817567104e-05, + "loss": 0.9295, + "step": 107060 + }, + { + "epoch": 0.6840397122522776, + "grad_norm": 0.9544264078140259, + "learning_rate": 7.382769702180339e-05, + "loss": 1.0323, + "step": 107070 + }, + { + "epoch": 0.6841035994020163, + "grad_norm": 0.7274150848388672, + "learning_rate": 7.38232856279702e-05, + "loss": 0.9134, + "step": 107080 + }, + { + "epoch": 0.684167486551755, + "grad_norm": 1.0423110723495483, + "learning_rate": 7.381887399421592e-05, + "loss": 0.9402, + "step": 107090 + }, + { + "epoch": 0.6842313737014937, + "grad_norm": 0.8618479371070862, + "learning_rate": 7.381446212058497e-05, + "loss": 0.7547, + "step": 107100 + }, + { + "epoch": 0.6842952608512324, + "grad_norm": 0.8330484628677368, + "learning_rate": 7.381005000712177e-05, + "loss": 0.8832, + "step": 107110 + }, + { + "epoch": 0.6843591480009711, + "grad_norm": 1.7487927675247192, + "learning_rate": 7.380563765387079e-05, + "loss": 0.8351, + "step": 107120 + }, + { + "epoch": 0.6844230351507098, + "grad_norm": 1.0328443050384521, + "learning_rate": 7.380122506087644e-05, + "loss": 0.7783, + "step": 107130 + }, + { + "epoch": 0.6844869223004485, + "grad_norm": 1.1022374629974365, + "learning_rate": 7.379681222818314e-05, + "loss": 0.8898, + "step": 107140 + }, + { + "epoch": 0.6845508094501872, + "grad_norm": 1.1118669509887695, + "learning_rate": 7.379239915583538e-05, + "loss": 0.8272, + "step": 107150 + }, + { + "epoch": 0.6846146965999259, + "grad_norm": 1.1815778017044067, + "learning_rate": 7.378798584387756e-05, + "loss": 1.045, + "step": 107160 + }, + { + "epoch": 0.6846785837496646, + "grad_norm": 0.781929612159729, + "learning_rate": 7.378357229235415e-05, + "loss": 1.0828, + "step": 107170 + }, + { + "epoch": 0.6847424708994033, + "grad_norm": 0.8094179630279541, + "learning_rate": 
7.37791585013096e-05, + "loss": 0.6823, + "step": 107180 + }, + { + "epoch": 0.684806358049142, + "grad_norm": 0.9121211767196655, + "learning_rate": 7.377474447078835e-05, + "loss": 0.8463, + "step": 107190 + }, + { + "epoch": 0.6848702451988807, + "grad_norm": 0.9199677109718323, + "learning_rate": 7.377033020083485e-05, + "loss": 0.9192, + "step": 107200 + }, + { + "epoch": 0.6849341323486194, + "grad_norm": 1.0086863040924072, + "learning_rate": 7.376591569149356e-05, + "loss": 0.8648, + "step": 107210 + }, + { + "epoch": 0.6849980194983581, + "grad_norm": 0.6935834288597107, + "learning_rate": 7.376150094280894e-05, + "loss": 0.8088, + "step": 107220 + }, + { + "epoch": 0.6850619066480969, + "grad_norm": 1.3548187017440796, + "learning_rate": 7.375708595482544e-05, + "loss": 0.7954, + "step": 107230 + }, + { + "epoch": 0.6851257937978356, + "grad_norm": 2.9168577194213867, + "learning_rate": 7.375267072758753e-05, + "loss": 1.0147, + "step": 107240 + }, + { + "epoch": 0.6851896809475742, + "grad_norm": 0.9866139888763428, + "learning_rate": 7.37482552611397e-05, + "loss": 0.77, + "step": 107250 + }, + { + "epoch": 0.6852535680973129, + "grad_norm": 1.5297490358352661, + "learning_rate": 7.374383955552638e-05, + "loss": 1.2862, + "step": 107260 + }, + { + "epoch": 0.6853174552470516, + "grad_norm": 0.7798259854316711, + "learning_rate": 7.373942361079204e-05, + "loss": 0.7411, + "step": 107270 + }, + { + "epoch": 0.6853813423967903, + "grad_norm": 0.7515537738800049, + "learning_rate": 7.37350074269812e-05, + "loss": 0.6391, + "step": 107280 + }, + { + "epoch": 0.685445229546529, + "grad_norm": 1.7930855751037598, + "learning_rate": 7.373059100413829e-05, + "loss": 0.8938, + "step": 107290 + }, + { + "epoch": 0.6855091166962677, + "grad_norm": 1.0468648672103882, + "learning_rate": 7.372617434230778e-05, + "loss": 0.8846, + "step": 107300 + }, + { + "epoch": 0.6855730038460064, + "grad_norm": 0.9677194952964783, + "learning_rate": 7.372175744153417e-05, + 
"loss": 0.879, + "step": 107310 + }, + { + "epoch": 0.6856368909957451, + "grad_norm": 0.9054749608039856, + "learning_rate": 7.371734030186195e-05, + "loss": 0.9007, + "step": 107320 + }, + { + "epoch": 0.6857007781454838, + "grad_norm": 1.1012799739837646, + "learning_rate": 7.371292292333559e-05, + "loss": 0.7437, + "step": 107330 + }, + { + "epoch": 0.6857646652952225, + "grad_norm": 0.8656480312347412, + "learning_rate": 7.370850530599959e-05, + "loss": 0.7237, + "step": 107340 + }, + { + "epoch": 0.6858285524449612, + "grad_norm": 0.986134946346283, + "learning_rate": 7.370408744989844e-05, + "loss": 0.9098, + "step": 107350 + }, + { + "epoch": 0.6858924395946999, + "grad_norm": 1.038024663925171, + "learning_rate": 7.36996693550766e-05, + "loss": 0.7683, + "step": 107360 + }, + { + "epoch": 0.6859563267444386, + "grad_norm": 0.9421197175979614, + "learning_rate": 7.369525102157861e-05, + "loss": 0.7816, + "step": 107370 + }, + { + "epoch": 0.6860202138941773, + "grad_norm": 0.8556358218193054, + "learning_rate": 7.369083244944893e-05, + "loss": 0.9645, + "step": 107380 + }, + { + "epoch": 0.686084101043916, + "grad_norm": 0.7408592700958252, + "learning_rate": 7.368641363873207e-05, + "loss": 0.8846, + "step": 107390 + }, + { + "epoch": 0.6861479881936547, + "grad_norm": 0.5881041288375854, + "learning_rate": 7.368199458947254e-05, + "loss": 0.7665, + "step": 107400 + }, + { + "epoch": 0.6862118753433935, + "grad_norm": 0.9732454419136047, + "learning_rate": 7.367757530171482e-05, + "loss": 1.018, + "step": 107410 + }, + { + "epoch": 0.6862757624931322, + "grad_norm": 0.4878905415534973, + "learning_rate": 7.367315577550344e-05, + "loss": 1.0164, + "step": 107420 + }, + { + "epoch": 0.6863396496428709, + "grad_norm": 0.9142529368400574, + "learning_rate": 7.366873601088291e-05, + "loss": 0.8166, + "step": 107430 + }, + { + "epoch": 0.6864035367926096, + "grad_norm": 0.7303772568702698, + "learning_rate": 7.366431600789772e-05, + "loss": 0.6688, + "step": 
107440 + }, + { + "epoch": 0.6864674239423483, + "grad_norm": 0.7583977580070496, + "learning_rate": 7.36598957665924e-05, + "loss": 0.859, + "step": 107450 + }, + { + "epoch": 0.686531311092087, + "grad_norm": 0.8306979537010193, + "learning_rate": 7.365547528701146e-05, + "loss": 0.9408, + "step": 107460 + }, + { + "epoch": 0.6865951982418257, + "grad_norm": 0.9841431379318237, + "learning_rate": 7.365105456919942e-05, + "loss": 0.9479, + "step": 107470 + }, + { + "epoch": 0.6866590853915644, + "grad_norm": 0.8412874341011047, + "learning_rate": 7.364663361320081e-05, + "loss": 1.2542, + "step": 107480 + }, + { + "epoch": 0.686722972541303, + "grad_norm": 0.9620808362960815, + "learning_rate": 7.364221241906014e-05, + "loss": 1.0792, + "step": 107490 + }, + { + "epoch": 0.6867868596910417, + "grad_norm": 0.8014304637908936, + "learning_rate": 7.363779098682193e-05, + "loss": 1.1819, + "step": 107500 + }, + { + "epoch": 0.6868507468407804, + "grad_norm": 1.1913782358169556, + "learning_rate": 7.36333693165307e-05, + "loss": 1.0777, + "step": 107510 + }, + { + "epoch": 0.6869146339905191, + "grad_norm": 0.6413132548332214, + "learning_rate": 7.362894740823102e-05, + "loss": 0.9969, + "step": 107520 + }, + { + "epoch": 0.6869785211402578, + "grad_norm": 2.0043857097625732, + "learning_rate": 7.362452526196738e-05, + "loss": 0.7761, + "step": 107530 + }, + { + "epoch": 0.6870424082899965, + "grad_norm": 2.9130804538726807, + "learning_rate": 7.362010287778435e-05, + "loss": 0.9517, + "step": 107540 + }, + { + "epoch": 0.6871062954397352, + "grad_norm": 0.6536256670951843, + "learning_rate": 7.361568025572644e-05, + "loss": 0.7987, + "step": 107550 + }, + { + "epoch": 0.6871701825894739, + "grad_norm": 0.8029404878616333, + "learning_rate": 7.36112573958382e-05, + "loss": 0.8677, + "step": 107560 + }, + { + "epoch": 0.6872340697392126, + "grad_norm": 1.2548484802246094, + "learning_rate": 7.360683429816418e-05, + "loss": 0.9721, + "step": 107570 + }, + { + "epoch": 
0.6872979568889513, + "grad_norm": 0.6949800848960876, + "learning_rate": 7.360241096274892e-05, + "loss": 0.7863, + "step": 107580 + }, + { + "epoch": 0.68736184403869, + "grad_norm": 0.7144826054573059, + "learning_rate": 7.359798738963694e-05, + "loss": 0.6767, + "step": 107590 + }, + { + "epoch": 0.6874257311884288, + "grad_norm": 0.7971734404563904, + "learning_rate": 7.359356357887282e-05, + "loss": 0.7645, + "step": 107600 + }, + { + "epoch": 0.6874896183381675, + "grad_norm": 0.6574593186378479, + "learning_rate": 7.35891395305011e-05, + "loss": 1.1075, + "step": 107610 + }, + { + "epoch": 0.6875535054879062, + "grad_norm": 0.8098707795143127, + "learning_rate": 7.358471524456635e-05, + "loss": 0.9526, + "step": 107620 + }, + { + "epoch": 0.6876173926376449, + "grad_norm": 0.7118765711784363, + "learning_rate": 7.35802907211131e-05, + "loss": 0.7041, + "step": 107630 + }, + { + "epoch": 0.6876812797873836, + "grad_norm": 0.8008665442466736, + "learning_rate": 7.357586596018594e-05, + "loss": 0.8071, + "step": 107640 + }, + { + "epoch": 0.6877451669371223, + "grad_norm": 0.9328833222389221, + "learning_rate": 7.357144096182938e-05, + "loss": 1.3249, + "step": 107650 + }, + { + "epoch": 0.687809054086861, + "grad_norm": 0.6230046153068542, + "learning_rate": 7.356701572608806e-05, + "loss": 0.7683, + "step": 107660 + }, + { + "epoch": 0.6878729412365997, + "grad_norm": 0.6966734528541565, + "learning_rate": 7.356259025300646e-05, + "loss": 0.9071, + "step": 107670 + }, + { + "epoch": 0.6879368283863384, + "grad_norm": 0.8863798975944519, + "learning_rate": 7.355816454262923e-05, + "loss": 0.8069, + "step": 107680 + }, + { + "epoch": 0.6880007155360771, + "grad_norm": 0.928939700126648, + "learning_rate": 7.35537385950009e-05, + "loss": 0.8535, + "step": 107690 + }, + { + "epoch": 0.6880646026858158, + "grad_norm": 0.8435116410255432, + "learning_rate": 7.354931241016601e-05, + "loss": 0.798, + "step": 107700 + }, + { + "epoch": 0.6881284898355545, + 
"grad_norm": 1.1882624626159668, + "learning_rate": 7.35448859881692e-05, + "loss": 1.0083, + "step": 107710 + }, + { + "epoch": 0.6881923769852932, + "grad_norm": 0.8240717053413391, + "learning_rate": 7.3540459329055e-05, + "loss": 0.8051, + "step": 107720 + }, + { + "epoch": 0.6882562641350318, + "grad_norm": 0.9132935404777527, + "learning_rate": 7.353603243286805e-05, + "loss": 0.9164, + "step": 107730 + }, + { + "epoch": 0.6883201512847705, + "grad_norm": 0.9722372889518738, + "learning_rate": 7.353160529965285e-05, + "loss": 0.9007, + "step": 107740 + }, + { + "epoch": 0.6883840384345092, + "grad_norm": 0.7652561068534851, + "learning_rate": 7.352717792945404e-05, + "loss": 0.9988, + "step": 107750 + }, + { + "epoch": 0.6884479255842479, + "grad_norm": 1.0295495986938477, + "learning_rate": 7.352275032231619e-05, + "loss": 1.0438, + "step": 107760 + }, + { + "epoch": 0.6885118127339867, + "grad_norm": 1.0043138265609741, + "learning_rate": 7.351832247828391e-05, + "loss": 0.8635, + "step": 107770 + }, + { + "epoch": 0.6885756998837254, + "grad_norm": 0.9536296129226685, + "learning_rate": 7.351389439740176e-05, + "loss": 0.7845, + "step": 107780 + }, + { + "epoch": 0.6886395870334641, + "grad_norm": 1.182599663734436, + "learning_rate": 7.350946607971436e-05, + "loss": 0.9473, + "step": 107790 + }, + { + "epoch": 0.6887034741832028, + "grad_norm": 0.9443296194076538, + "learning_rate": 7.35050375252663e-05, + "loss": 1.12, + "step": 107800 + }, + { + "epoch": 0.6887673613329415, + "grad_norm": 1.2377766370773315, + "learning_rate": 7.350060873410216e-05, + "loss": 0.6376, + "step": 107810 + }, + { + "epoch": 0.6888312484826802, + "grad_norm": 1.1331062316894531, + "learning_rate": 7.349617970626658e-05, + "loss": 0.8585, + "step": 107820 + }, + { + "epoch": 0.6888951356324189, + "grad_norm": 0.9837049245834351, + "learning_rate": 7.349175044180414e-05, + "loss": 0.7217, + "step": 107830 + }, + { + "epoch": 0.6889590227821576, + "grad_norm": 
0.4539640545845032, + "learning_rate": 7.348732094075942e-05, + "loss": 0.6076, + "step": 107840 + }, + { + "epoch": 0.6890229099318963, + "grad_norm": 0.9993829131126404, + "learning_rate": 7.348289120317709e-05, + "loss": 0.7641, + "step": 107850 + }, + { + "epoch": 0.689086797081635, + "grad_norm": 0.9905250072479248, + "learning_rate": 7.347846122910174e-05, + "loss": 0.6454, + "step": 107860 + }, + { + "epoch": 0.6891506842313737, + "grad_norm": 0.8237646818161011, + "learning_rate": 7.347403101857795e-05, + "loss": 0.8458, + "step": 107870 + }, + { + "epoch": 0.6892145713811124, + "grad_norm": 1.0233882665634155, + "learning_rate": 7.346960057165036e-05, + "loss": 0.8326, + "step": 107880 + }, + { + "epoch": 0.6892784585308511, + "grad_norm": 1.116274356842041, + "learning_rate": 7.34651698883636e-05, + "loss": 0.8269, + "step": 107890 + }, + { + "epoch": 0.6893423456805898, + "grad_norm": 0.6511923670768738, + "learning_rate": 7.346073896876227e-05, + "loss": 1.2329, + "step": 107900 + }, + { + "epoch": 0.6894062328303285, + "grad_norm": 1.0535688400268555, + "learning_rate": 7.345630781289102e-05, + "loss": 0.8788, + "step": 107910 + }, + { + "epoch": 0.6894701199800672, + "grad_norm": 1.045600175857544, + "learning_rate": 7.345187642079443e-05, + "loss": 0.8773, + "step": 107920 + }, + { + "epoch": 0.689534007129806, + "grad_norm": 1.1185442209243774, + "learning_rate": 7.344744479251717e-05, + "loss": 0.7012, + "step": 107930 + }, + { + "epoch": 0.6895978942795447, + "grad_norm": 0.8849347233772278, + "learning_rate": 7.344301292810385e-05, + "loss": 0.8627, + "step": 107940 + }, + { + "epoch": 0.6896617814292834, + "grad_norm": 0.8291599154472351, + "learning_rate": 7.343858082759912e-05, + "loss": 0.8959, + "step": 107950 + }, + { + "epoch": 0.6897256685790221, + "grad_norm": 0.6584329009056091, + "learning_rate": 7.34341484910476e-05, + "loss": 0.7189, + "step": 107960 + }, + { + "epoch": 0.6897895557287607, + "grad_norm": 1.2374427318572998, + 
"learning_rate": 7.342971591849393e-05, + "loss": 0.9428, + "step": 107970 + }, + { + "epoch": 0.6898534428784994, + "grad_norm": 0.8575314879417419, + "learning_rate": 7.342528310998275e-05, + "loss": 0.759, + "step": 107980 + }, + { + "epoch": 0.6899173300282381, + "grad_norm": 0.7263084650039673, + "learning_rate": 7.34208500655587e-05, + "loss": 0.7271, + "step": 107990 + }, + { + "epoch": 0.6899812171779768, + "grad_norm": 1.145310401916504, + "learning_rate": 7.341641678526643e-05, + "loss": 1.1193, + "step": 108000 + }, + { + "epoch": 0.6900451043277155, + "grad_norm": 1.2653499841690063, + "learning_rate": 7.341198326915057e-05, + "loss": 0.8146, + "step": 108010 + }, + { + "epoch": 0.6901089914774542, + "grad_norm": 0.6225971579551697, + "learning_rate": 7.340754951725582e-05, + "loss": 0.7708, + "step": 108020 + }, + { + "epoch": 0.6901728786271929, + "grad_norm": 2.235273838043213, + "learning_rate": 7.340311552962676e-05, + "loss": 0.8989, + "step": 108030 + }, + { + "epoch": 0.6902367657769316, + "grad_norm": 0.8102111220359802, + "learning_rate": 7.33986813063081e-05, + "loss": 1.1533, + "step": 108040 + }, + { + "epoch": 0.6903006529266703, + "grad_norm": 0.7722830772399902, + "learning_rate": 7.339424684734447e-05, + "loss": 1.0018, + "step": 108050 + }, + { + "epoch": 0.690364540076409, + "grad_norm": 0.7864007949829102, + "learning_rate": 7.338981215278055e-05, + "loss": 1.0525, + "step": 108060 + }, + { + "epoch": 0.6904284272261477, + "grad_norm": 0.6729293465614319, + "learning_rate": 7.338537722266097e-05, + "loss": 0.8472, + "step": 108070 + }, + { + "epoch": 0.6904923143758864, + "grad_norm": 0.7282936573028564, + "learning_rate": 7.338094205703043e-05, + "loss": 0.9557, + "step": 108080 + }, + { + "epoch": 0.6905562015256251, + "grad_norm": 1.0277268886566162, + "learning_rate": 7.337650665593355e-05, + "loss": 0.93, + "step": 108090 + }, + { + "epoch": 0.6906200886753638, + "grad_norm": 2.55513334274292, + "learning_rate": 
7.337207101941503e-05, + "loss": 0.796, + "step": 108100 + }, + { + "epoch": 0.6906839758251025, + "grad_norm": 0.7738178968429565, + "learning_rate": 7.336763514751954e-05, + "loss": 0.8795, + "step": 108110 + }, + { + "epoch": 0.6907478629748413, + "grad_norm": 0.9889559149742126, + "learning_rate": 7.336319904029176e-05, + "loss": 0.848, + "step": 108120 + }, + { + "epoch": 0.69081175012458, + "grad_norm": 1.2246037721633911, + "learning_rate": 7.335876269777634e-05, + "loss": 0.8715, + "step": 108130 + }, + { + "epoch": 0.6908756372743187, + "grad_norm": 0.899691641330719, + "learning_rate": 7.335432612001798e-05, + "loss": 1.013, + "step": 108140 + }, + { + "epoch": 0.6909395244240574, + "grad_norm": 0.9258847236633301, + "learning_rate": 7.334988930706133e-05, + "loss": 0.8774, + "step": 108150 + }, + { + "epoch": 0.6910034115737961, + "grad_norm": 0.825404167175293, + "learning_rate": 7.334545225895111e-05, + "loss": 0.6752, + "step": 108160 + }, + { + "epoch": 0.6910672987235348, + "grad_norm": 0.6678471565246582, + "learning_rate": 7.334101497573199e-05, + "loss": 0.7239, + "step": 108170 + }, + { + "epoch": 0.6911311858732735, + "grad_norm": 0.8919599056243896, + "learning_rate": 7.333657745744866e-05, + "loss": 0.8604, + "step": 108180 + }, + { + "epoch": 0.6911950730230122, + "grad_norm": 0.4956168234348297, + "learning_rate": 7.333213970414579e-05, + "loss": 0.8364, + "step": 108190 + }, + { + "epoch": 0.6912589601727509, + "grad_norm": 2.8205111026763916, + "learning_rate": 7.332770171586811e-05, + "loss": 0.7711, + "step": 108200 + }, + { + "epoch": 0.6913228473224896, + "grad_norm": 0.9555968046188354, + "learning_rate": 7.332326349266028e-05, + "loss": 0.8765, + "step": 108210 + }, + { + "epoch": 0.6913867344722282, + "grad_norm": 0.928036093711853, + "learning_rate": 7.331882503456701e-05, + "loss": 1.0052, + "step": 108220 + }, + { + "epoch": 0.6914506216219669, + "grad_norm": 0.8674328923225403, + "learning_rate": 7.331438634163298e-05, + 
"loss": 0.7707, + "step": 108230 + }, + { + "epoch": 0.6915145087717056, + "grad_norm": 0.8306328058242798, + "learning_rate": 7.330994741390293e-05, + "loss": 0.8573, + "step": 108240 + }, + { + "epoch": 0.6915783959214443, + "grad_norm": 1.346864938735962, + "learning_rate": 7.330550825142156e-05, + "loss": 0.7394, + "step": 108250 + }, + { + "epoch": 0.691642283071183, + "grad_norm": 1.4455012083053589, + "learning_rate": 7.330106885423353e-05, + "loss": 0.8614, + "step": 108260 + }, + { + "epoch": 0.6917061702209217, + "grad_norm": 0.7791756391525269, + "learning_rate": 7.32966292223836e-05, + "loss": 0.7627, + "step": 108270 + }, + { + "epoch": 0.6917700573706604, + "grad_norm": 0.8995997905731201, + "learning_rate": 7.329218935591645e-05, + "loss": 0.8276, + "step": 108280 + }, + { + "epoch": 0.6918339445203991, + "grad_norm": 0.9824413657188416, + "learning_rate": 7.328774925487679e-05, + "loss": 0.9905, + "step": 108290 + }, + { + "epoch": 0.6918978316701379, + "grad_norm": 0.9453624486923218, + "learning_rate": 7.328330891930937e-05, + "loss": 0.9079, + "step": 108300 + }, + { + "epoch": 0.6919617188198766, + "grad_norm": 0.9004096388816833, + "learning_rate": 7.327886834925888e-05, + "loss": 0.9236, + "step": 108310 + }, + { + "epoch": 0.6920256059696153, + "grad_norm": 0.7478508353233337, + "learning_rate": 7.327442754477003e-05, + "loss": 0.8575, + "step": 108320 + }, + { + "epoch": 0.692089493119354, + "grad_norm": 2.181452751159668, + "learning_rate": 7.326998650588758e-05, + "loss": 0.7738, + "step": 108330 + }, + { + "epoch": 0.6921533802690927, + "grad_norm": 1.4748575687408447, + "learning_rate": 7.326554523265624e-05, + "loss": 1.3507, + "step": 108340 + }, + { + "epoch": 0.6922172674188314, + "grad_norm": 1.0010013580322266, + "learning_rate": 7.326110372512071e-05, + "loss": 0.8854, + "step": 108350 + }, + { + "epoch": 0.6922811545685701, + "grad_norm": 0.87949138879776, + "learning_rate": 7.325666198332575e-05, + "loss": 0.746, + "step": 
108360 + }, + { + "epoch": 0.6923450417183088, + "grad_norm": 0.8844693303108215, + "learning_rate": 7.325222000731609e-05, + "loss": 0.9919, + "step": 108370 + }, + { + "epoch": 0.6924089288680475, + "grad_norm": 1.2705687284469604, + "learning_rate": 7.324777779713644e-05, + "loss": 0.9765, + "step": 108380 + }, + { + "epoch": 0.6924728160177862, + "grad_norm": 0.8071838021278381, + "learning_rate": 7.324333535283157e-05, + "loss": 0.837, + "step": 108390 + }, + { + "epoch": 0.6925367031675249, + "grad_norm": 0.9001646637916565, + "learning_rate": 7.323889267444621e-05, + "loss": 0.846, + "step": 108400 + }, + { + "epoch": 0.6926005903172636, + "grad_norm": 0.9376798272132874, + "learning_rate": 7.323444976202508e-05, + "loss": 0.7456, + "step": 108410 + }, + { + "epoch": 0.6926644774670023, + "grad_norm": 0.8280836939811707, + "learning_rate": 7.323000661561295e-05, + "loss": 0.9753, + "step": 108420 + }, + { + "epoch": 0.692728364616741, + "grad_norm": 1.4879751205444336, + "learning_rate": 7.322556323525456e-05, + "loss": 0.9096, + "step": 108430 + }, + { + "epoch": 0.6927922517664797, + "grad_norm": 1.0255200862884521, + "learning_rate": 7.322111962099465e-05, + "loss": 0.9377, + "step": 108440 + }, + { + "epoch": 0.6928561389162184, + "grad_norm": 0.9533114433288574, + "learning_rate": 7.321667577287799e-05, + "loss": 0.7927, + "step": 108450 + }, + { + "epoch": 0.692920026065957, + "grad_norm": 0.7866392731666565, + "learning_rate": 7.32122316909493e-05, + "loss": 0.9866, + "step": 108460 + }, + { + "epoch": 0.6929839132156957, + "grad_norm": 1.0992743968963623, + "learning_rate": 7.320778737525335e-05, + "loss": 0.7761, + "step": 108470 + }, + { + "epoch": 0.6930478003654345, + "grad_norm": 0.9191528558731079, + "learning_rate": 7.320334282583492e-05, + "loss": 0.8788, + "step": 108480 + }, + { + "epoch": 0.6931116875151732, + "grad_norm": 1.2555981874465942, + "learning_rate": 7.319889804273876e-05, + "loss": 0.9633, + "step": 108490 + }, + { + "epoch": 
0.6931755746649119, + "grad_norm": 0.8771397471427917, + "learning_rate": 7.319445302600961e-05, + "loss": 0.615, + "step": 108500 + }, + { + "epoch": 0.6932394618146506, + "grad_norm": 0.714777946472168, + "learning_rate": 7.319000777569226e-05, + "loss": 0.7238, + "step": 108510 + }, + { + "epoch": 0.6933033489643893, + "grad_norm": 1.2296061515808105, + "learning_rate": 7.318556229183146e-05, + "loss": 0.7767, + "step": 108520 + }, + { + "epoch": 0.693367236114128, + "grad_norm": 0.7856013178825378, + "learning_rate": 7.3181116574472e-05, + "loss": 0.692, + "step": 108530 + }, + { + "epoch": 0.6934311232638667, + "grad_norm": 0.9102780818939209, + "learning_rate": 7.317667062365863e-05, + "loss": 0.9865, + "step": 108540 + }, + { + "epoch": 0.6934950104136054, + "grad_norm": 1.0297400951385498, + "learning_rate": 7.317222443943616e-05, + "loss": 0.9191, + "step": 108550 + }, + { + "epoch": 0.6935588975633441, + "grad_norm": 1.809927225112915, + "learning_rate": 7.316777802184934e-05, + "loss": 1.084, + "step": 108560 + }, + { + "epoch": 0.6936227847130828, + "grad_norm": 2.1884663105010986, + "learning_rate": 7.316333137094294e-05, + "loss": 0.8257, + "step": 108570 + }, + { + "epoch": 0.6936866718628215, + "grad_norm": 0.8382952213287354, + "learning_rate": 7.315888448676175e-05, + "loss": 0.8348, + "step": 108580 + }, + { + "epoch": 0.6937505590125602, + "grad_norm": 0.7834774851799011, + "learning_rate": 7.315443736935056e-05, + "loss": 0.8987, + "step": 108590 + }, + { + "epoch": 0.6938144461622989, + "grad_norm": 0.710081934928894, + "learning_rate": 7.314999001875415e-05, + "loss": 0.6713, + "step": 108600 + }, + { + "epoch": 0.6938783333120376, + "grad_norm": 0.9444938898086548, + "learning_rate": 7.314554243501732e-05, + "loss": 0.9177, + "step": 108610 + }, + { + "epoch": 0.6939422204617763, + "grad_norm": 0.6890098452568054, + "learning_rate": 7.314109461818485e-05, + "loss": 0.9145, + "step": 108620 + }, + { + "epoch": 0.694006107611515, + 
"grad_norm": 0.9023224115371704, + "learning_rate": 7.313664656830154e-05, + "loss": 1.0199, + "step": 108630 + }, + { + "epoch": 0.6940699947612538, + "grad_norm": 0.6425119638442993, + "learning_rate": 7.31321982854122e-05, + "loss": 0.862, + "step": 108640 + }, + { + "epoch": 0.6941338819109925, + "grad_norm": 1.188393473625183, + "learning_rate": 7.312774976956159e-05, + "loss": 0.801, + "step": 108650 + }, + { + "epoch": 0.6941977690607312, + "grad_norm": 0.7165592908859253, + "learning_rate": 7.312330102079454e-05, + "loss": 1.3727, + "step": 108660 + }, + { + "epoch": 0.6942616562104699, + "grad_norm": 0.6589129567146301, + "learning_rate": 7.311885203915585e-05, + "loss": 0.8308, + "step": 108670 + }, + { + "epoch": 0.6943255433602086, + "grad_norm": 1.0794988870620728, + "learning_rate": 7.31144028246903e-05, + "loss": 0.9766, + "step": 108680 + }, + { + "epoch": 0.6943894305099473, + "grad_norm": 1.6722362041473389, + "learning_rate": 7.310995337744271e-05, + "loss": 0.8217, + "step": 108690 + }, + { + "epoch": 0.6944533176596859, + "grad_norm": 2.617365598678589, + "learning_rate": 7.310550369745793e-05, + "loss": 0.8649, + "step": 108700 + }, + { + "epoch": 0.6945172048094246, + "grad_norm": 1.0052344799041748, + "learning_rate": 7.310105378478071e-05, + "loss": 0.8908, + "step": 108710 + }, + { + "epoch": 0.6945810919591633, + "grad_norm": 0.6017476320266724, + "learning_rate": 7.309660363945592e-05, + "loss": 0.8932, + "step": 108720 + }, + { + "epoch": 0.694644979108902, + "grad_norm": 1.1323217153549194, + "learning_rate": 7.309215326152833e-05, + "loss": 0.9389, + "step": 108730 + }, + { + "epoch": 0.6947088662586407, + "grad_norm": 1.0148589611053467, + "learning_rate": 7.308770265104279e-05, + "loss": 0.8976, + "step": 108740 + }, + { + "epoch": 0.6947727534083794, + "grad_norm": 1.195841670036316, + "learning_rate": 7.30832518080441e-05, + "loss": 1.0468, + "step": 108750 + }, + { + "epoch": 0.6948366405581181, + "grad_norm": 2.77616810798645, + 
"learning_rate": 7.307880073257711e-05, + "loss": 0.8265, + "step": 108760 + }, + { + "epoch": 0.6949005277078568, + "grad_norm": 0.820035457611084, + "learning_rate": 7.30743494246866e-05, + "loss": 1.0138, + "step": 108770 + }, + { + "epoch": 0.6949644148575955, + "grad_norm": 0.768181324005127, + "learning_rate": 7.306989788441747e-05, + "loss": 0.896, + "step": 108780 + }, + { + "epoch": 0.6950283020073342, + "grad_norm": 0.9276620745658875, + "learning_rate": 7.306544611181449e-05, + "loss": 0.9899, + "step": 108790 + }, + { + "epoch": 0.6950921891570729, + "grad_norm": 1.3727481365203857, + "learning_rate": 7.306099410692251e-05, + "loss": 0.9883, + "step": 108800 + }, + { + "epoch": 0.6951560763068116, + "grad_norm": 0.6537569165229797, + "learning_rate": 7.305654186978636e-05, + "loss": 0.7696, + "step": 108810 + }, + { + "epoch": 0.6952199634565503, + "grad_norm": 0.8590995669364929, + "learning_rate": 7.30520894004509e-05, + "loss": 0.9809, + "step": 108820 + }, + { + "epoch": 0.6952838506062891, + "grad_norm": 0.9551057815551758, + "learning_rate": 7.304763669896096e-05, + "loss": 0.9619, + "step": 108830 + }, + { + "epoch": 0.6953477377560278, + "grad_norm": 0.8596848845481873, + "learning_rate": 7.304318376536138e-05, + "loss": 0.8957, + "step": 108840 + }, + { + "epoch": 0.6954116249057665, + "grad_norm": 1.1509318351745605, + "learning_rate": 7.3038730599697e-05, + "loss": 0.6727, + "step": 108850 + }, + { + "epoch": 0.6954755120555052, + "grad_norm": 0.5256636142730713, + "learning_rate": 7.303427720201265e-05, + "loss": 0.7634, + "step": 108860 + }, + { + "epoch": 0.6955393992052439, + "grad_norm": 0.8332456350326538, + "learning_rate": 7.302982357235323e-05, + "loss": 1.3683, + "step": 108870 + }, + { + "epoch": 0.6956032863549826, + "grad_norm": 0.7100444436073303, + "learning_rate": 7.302536971076355e-05, + "loss": 0.8936, + "step": 108880 + }, + { + "epoch": 0.6956671735047213, + "grad_norm": 1.0301616191864014, + "learning_rate": 
7.302091561728848e-05, + "loss": 0.784, + "step": 108890 + }, + { + "epoch": 0.69573106065446, + "grad_norm": 0.8167005777359009, + "learning_rate": 7.301646129197289e-05, + "loss": 1.0153, + "step": 108900 + }, + { + "epoch": 0.6957949478041987, + "grad_norm": 0.6708621382713318, + "learning_rate": 7.30120067348616e-05, + "loss": 0.9037, + "step": 108910 + }, + { + "epoch": 0.6958588349539374, + "grad_norm": 1.8930144309997559, + "learning_rate": 7.30075519459995e-05, + "loss": 0.9704, + "step": 108920 + }, + { + "epoch": 0.6959227221036761, + "grad_norm": 0.9844603538513184, + "learning_rate": 7.300309692543145e-05, + "loss": 1.0861, + "step": 108930 + }, + { + "epoch": 0.6959866092534148, + "grad_norm": 0.9566649198532104, + "learning_rate": 7.299864167320232e-05, + "loss": 1.0209, + "step": 108940 + }, + { + "epoch": 0.6960504964031534, + "grad_norm": 0.9092232584953308, + "learning_rate": 7.299418618935695e-05, + "loss": 1.0676, + "step": 108950 + }, + { + "epoch": 0.6961143835528921, + "grad_norm": 0.7573904395103455, + "learning_rate": 7.298973047394025e-05, + "loss": 0.7415, + "step": 108960 + }, + { + "epoch": 0.6961782707026308, + "grad_norm": 1.1252961158752441, + "learning_rate": 7.298527452699708e-05, + "loss": 1.0561, + "step": 108970 + }, + { + "epoch": 0.6962421578523695, + "grad_norm": 0.7041053175926208, + "learning_rate": 7.298081834857229e-05, + "loss": 0.9674, + "step": 108980 + }, + { + "epoch": 0.6963060450021082, + "grad_norm": 0.9071682095527649, + "learning_rate": 7.29763619387108e-05, + "loss": 0.9063, + "step": 108990 + }, + { + "epoch": 0.696369932151847, + "grad_norm": 0.698070228099823, + "learning_rate": 7.297190529745746e-05, + "loss": 0.8875, + "step": 109000 + }, + { + "epoch": 0.6964338193015857, + "grad_norm": 0.9515412449836731, + "learning_rate": 7.296744842485715e-05, + "loss": 0.8703, + "step": 109010 + }, + { + "epoch": 0.6964977064513244, + "grad_norm": 1.2427845001220703, + "learning_rate": 7.296299132095478e-05, + 
"loss": 0.9569, + "step": 109020 + }, + { + "epoch": 0.6965615936010631, + "grad_norm": 0.5841128826141357, + "learning_rate": 7.295853398579521e-05, + "loss": 0.9137, + "step": 109030 + }, + { + "epoch": 0.6966254807508018, + "grad_norm": 0.5396087765693665, + "learning_rate": 7.295407641942334e-05, + "loss": 0.7979, + "step": 109040 + }, + { + "epoch": 0.6966893679005405, + "grad_norm": 0.7131836414337158, + "learning_rate": 7.294961862188407e-05, + "loss": 1.0448, + "step": 109050 + }, + { + "epoch": 0.6967532550502792, + "grad_norm": 1.0554966926574707, + "learning_rate": 7.29451605932223e-05, + "loss": 0.92, + "step": 109060 + }, + { + "epoch": 0.6968171422000179, + "grad_norm": 0.7954362630844116, + "learning_rate": 7.294070233348289e-05, + "loss": 0.841, + "step": 109070 + }, + { + "epoch": 0.6968810293497566, + "grad_norm": 0.8883830308914185, + "learning_rate": 7.293624384271076e-05, + "loss": 0.7748, + "step": 109080 + }, + { + "epoch": 0.6969449164994953, + "grad_norm": 1.4885032176971436, + "learning_rate": 7.293178512095082e-05, + "loss": 0.8115, + "step": 109090 + }, + { + "epoch": 0.697008803649234, + "grad_norm": 0.9093277454376221, + "learning_rate": 7.292732616824797e-05, + "loss": 0.8182, + "step": 109100 + }, + { + "epoch": 0.6970726907989727, + "grad_norm": 0.9241993427276611, + "learning_rate": 7.29228669846471e-05, + "loss": 0.7127, + "step": 109110 + }, + { + "epoch": 0.6971365779487114, + "grad_norm": 0.6447529792785645, + "learning_rate": 7.291840757019314e-05, + "loss": 0.8501, + "step": 109120 + }, + { + "epoch": 0.6972004650984501, + "grad_norm": 0.7052245736122131, + "learning_rate": 7.291394792493098e-05, + "loss": 0.947, + "step": 109130 + }, + { + "epoch": 0.6972643522481888, + "grad_norm": 1.3450639247894287, + "learning_rate": 7.290948804890555e-05, + "loss": 0.7365, + "step": 109140 + }, + { + "epoch": 0.6973282393979275, + "grad_norm": 0.5776755213737488, + "learning_rate": 7.290502794216173e-05, + "loss": 0.7275, + "step": 
109150 + }, + { + "epoch": 0.6973921265476662, + "grad_norm": 0.8304409980773926, + "learning_rate": 7.290056760474448e-05, + "loss": 0.9387, + "step": 109160 + }, + { + "epoch": 0.697456013697405, + "grad_norm": 0.8991537690162659, + "learning_rate": 7.289610703669872e-05, + "loss": 0.7778, + "step": 109170 + }, + { + "epoch": 0.6975199008471437, + "grad_norm": 0.8365470170974731, + "learning_rate": 7.289164623806933e-05, + "loss": 0.8706, + "step": 109180 + }, + { + "epoch": 0.6975837879968823, + "grad_norm": 0.855769157409668, + "learning_rate": 7.288718520890127e-05, + "loss": 0.7282, + "step": 109190 + }, + { + "epoch": 0.697647675146621, + "grad_norm": 0.7348789572715759, + "learning_rate": 7.288272394923945e-05, + "loss": 1.0745, + "step": 109200 + }, + { + "epoch": 0.6977115622963597, + "grad_norm": 1.0957111120224, + "learning_rate": 7.287826245912879e-05, + "loss": 0.7343, + "step": 109210 + }, + { + "epoch": 0.6977754494460984, + "grad_norm": 0.8726381063461304, + "learning_rate": 7.287380073861425e-05, + "loss": 1.0231, + "step": 109220 + }, + { + "epoch": 0.6978393365958371, + "grad_norm": 0.6815057992935181, + "learning_rate": 7.286933878774075e-05, + "loss": 0.8475, + "step": 109230 + }, + { + "epoch": 0.6979032237455758, + "grad_norm": 1.1125048398971558, + "learning_rate": 7.286487660655323e-05, + "loss": 0.8779, + "step": 109240 + }, + { + "epoch": 0.6979671108953145, + "grad_norm": 0.725688636302948, + "learning_rate": 7.28604141950966e-05, + "loss": 0.9028, + "step": 109250 + }, + { + "epoch": 0.6980309980450532, + "grad_norm": 0.8986996412277222, + "learning_rate": 7.285595155341583e-05, + "loss": 0.9237, + "step": 109260 + }, + { + "epoch": 0.6980948851947919, + "grad_norm": 0.9736185073852539, + "learning_rate": 7.285148868155587e-05, + "loss": 0.8967, + "step": 109270 + }, + { + "epoch": 0.6981587723445306, + "grad_norm": 1.0567455291748047, + "learning_rate": 7.284702557956165e-05, + "loss": 1.0126, + "step": 109280 + }, + { + "epoch": 
0.6982226594942693, + "grad_norm": 1.032707691192627, + "learning_rate": 7.28425622474781e-05, + "loss": 1.0844, + "step": 109290 + }, + { + "epoch": 0.698286546644008, + "grad_norm": 0.6320337653160095, + "learning_rate": 7.283809868535018e-05, + "loss": 0.745, + "step": 109300 + }, + { + "epoch": 0.6983504337937467, + "grad_norm": 0.7750630974769592, + "learning_rate": 7.283363489322287e-05, + "loss": 1.0077, + "step": 109310 + }, + { + "epoch": 0.6984143209434854, + "grad_norm": 0.7525535821914673, + "learning_rate": 7.282917087114109e-05, + "loss": 0.9631, + "step": 109320 + }, + { + "epoch": 0.6984782080932241, + "grad_norm": 0.903925895690918, + "learning_rate": 7.282470661914982e-05, + "loss": 1.0631, + "step": 109330 + }, + { + "epoch": 0.6985420952429628, + "grad_norm": 0.6858085989952087, + "learning_rate": 7.282024213729399e-05, + "loss": 0.9775, + "step": 109340 + }, + { + "epoch": 0.6986059823927016, + "grad_norm": 1.176261067390442, + "learning_rate": 7.28157774256186e-05, + "loss": 0.8382, + "step": 109350 + }, + { + "epoch": 0.6986698695424403, + "grad_norm": 0.7239077091217041, + "learning_rate": 7.281131248416858e-05, + "loss": 0.8858, + "step": 109360 + }, + { + "epoch": 0.698733756692179, + "grad_norm": 1.3246084451675415, + "learning_rate": 7.280684731298892e-05, + "loss": 0.8572, + "step": 109370 + }, + { + "epoch": 0.6987976438419177, + "grad_norm": 0.7234712839126587, + "learning_rate": 7.280238191212455e-05, + "loss": 0.7359, + "step": 109380 + }, + { + "epoch": 0.6988615309916564, + "grad_norm": 1.1668168306350708, + "learning_rate": 7.27979162816205e-05, + "loss": 0.8897, + "step": 109390 + }, + { + "epoch": 0.6989254181413951, + "grad_norm": 0.9035739302635193, + "learning_rate": 7.279345042152167e-05, + "loss": 0.8598, + "step": 109400 + }, + { + "epoch": 0.6989893052911338, + "grad_norm": 0.9039598107337952, + "learning_rate": 7.278898433187311e-05, + "loss": 0.9865, + "step": 109410 + }, + { + "epoch": 0.6990531924408725, + 
"grad_norm": 0.9996391534805298, + "learning_rate": 7.278451801271975e-05, + "loss": 0.7356, + "step": 109420 + }, + { + "epoch": 0.6991170795906111, + "grad_norm": 0.8987241983413696, + "learning_rate": 7.27800514641066e-05, + "loss": 0.699, + "step": 109430 + }, + { + "epoch": 0.6991809667403498, + "grad_norm": 0.9513826370239258, + "learning_rate": 7.27755846860786e-05, + "loss": 1.0641, + "step": 109440 + }, + { + "epoch": 0.6992448538900885, + "grad_norm": 1.103652000427246, + "learning_rate": 7.277111767868076e-05, + "loss": 0.7386, + "step": 109450 + }, + { + "epoch": 0.6993087410398272, + "grad_norm": 0.7837316393852234, + "learning_rate": 7.276665044195808e-05, + "loss": 0.8191, + "step": 109460 + }, + { + "epoch": 0.6993726281895659, + "grad_norm": 0.8951888680458069, + "learning_rate": 7.276218297595553e-05, + "loss": 1.0341, + "step": 109470 + }, + { + "epoch": 0.6994365153393046, + "grad_norm": 0.8686550259590149, + "learning_rate": 7.275771528071811e-05, + "loss": 0.9451, + "step": 109480 + }, + { + "epoch": 0.6995004024890433, + "grad_norm": 0.7066924571990967, + "learning_rate": 7.27532473562908e-05, + "loss": 0.867, + "step": 109490 + }, + { + "epoch": 0.699564289638782, + "grad_norm": 1.1765514612197876, + "learning_rate": 7.274877920271861e-05, + "loss": 0.8446, + "step": 109500 + }, + { + "epoch": 0.6996281767885207, + "grad_norm": 1.2923158407211304, + "learning_rate": 7.274431082004652e-05, + "loss": 0.6812, + "step": 109510 + }, + { + "epoch": 0.6996920639382594, + "grad_norm": 1.4523509740829468, + "learning_rate": 7.273984220831956e-05, + "loss": 0.7639, + "step": 109520 + }, + { + "epoch": 0.6997559510879982, + "grad_norm": 1.0498130321502686, + "learning_rate": 7.273537336758272e-05, + "loss": 0.6902, + "step": 109530 + }, + { + "epoch": 0.6998198382377369, + "grad_norm": 0.7311170697212219, + "learning_rate": 7.273090429788098e-05, + "loss": 0.7766, + "step": 109540 + }, + { + "epoch": 0.6998837253874756, + "grad_norm": 
1.5511587858200073, + "learning_rate": 7.272643499925937e-05, + "loss": 0.8909, + "step": 109550 + }, + { + "epoch": 0.6999476125372143, + "grad_norm": 0.810479998588562, + "learning_rate": 7.27219654717629e-05, + "loss": 1.0596, + "step": 109560 + }, + { + "epoch": 0.700011499686953, + "grad_norm": 0.8483265042304993, + "learning_rate": 7.27174957154366e-05, + "loss": 0.5923, + "step": 109570 + }, + { + "epoch": 0.7000753868366917, + "grad_norm": 1.1115506887435913, + "learning_rate": 7.271302573032546e-05, + "loss": 0.7011, + "step": 109580 + }, + { + "epoch": 0.7001392739864304, + "grad_norm": 0.8002986907958984, + "learning_rate": 7.270855551647449e-05, + "loss": 1.025, + "step": 109590 + }, + { + "epoch": 0.7002031611361691, + "grad_norm": 0.8855366110801697, + "learning_rate": 7.270408507392872e-05, + "loss": 0.7358, + "step": 109600 + }, + { + "epoch": 0.7002670482859078, + "grad_norm": 1.6254435777664185, + "learning_rate": 7.269961440273317e-05, + "loss": 1.0024, + "step": 109610 + }, + { + "epoch": 0.7003309354356465, + "grad_norm": 0.809699535369873, + "learning_rate": 7.269514350293287e-05, + "loss": 0.8733, + "step": 109620 + }, + { + "epoch": 0.7003948225853852, + "grad_norm": 1.2254425287246704, + "learning_rate": 7.269111949769275e-05, + "loss": 1.0453, + "step": 109630 + }, + { + "epoch": 0.7004587097351239, + "grad_norm": 0.8027240037918091, + "learning_rate": 7.268664816366747e-05, + "loss": 0.7901, + "step": 109640 + }, + { + "epoch": 0.7005225968848626, + "grad_norm": 0.5763524770736694, + "learning_rate": 7.268217660116801e-05, + "loss": 0.9035, + "step": 109650 + }, + { + "epoch": 0.7005864840346013, + "grad_norm": 0.8503153920173645, + "learning_rate": 7.267770481023941e-05, + "loss": 1.1969, + "step": 109660 + }, + { + "epoch": 0.70065037118434, + "grad_norm": 1.0496258735656738, + "learning_rate": 7.26732327909267e-05, + "loss": 0.8281, + "step": 109670 + }, + { + "epoch": 0.7007142583340786, + "grad_norm": 0.9994955658912659, + 
"learning_rate": 7.266876054327491e-05, + "loss": 1.0602, + "step": 109680 + }, + { + "epoch": 0.7007781454838173, + "grad_norm": 0.8883937001228333, + "learning_rate": 7.266428806732913e-05, + "loss": 1.1059, + "step": 109690 + }, + { + "epoch": 0.700842032633556, + "grad_norm": 0.8434162735939026, + "learning_rate": 7.265981536313432e-05, + "loss": 0.7179, + "step": 109700 + }, + { + "epoch": 0.7009059197832948, + "grad_norm": 0.8952722549438477, + "learning_rate": 7.265534243073558e-05, + "loss": 0.831, + "step": 109710 + }, + { + "epoch": 0.7009698069330335, + "grad_norm": 0.49630191922187805, + "learning_rate": 7.265086927017795e-05, + "loss": 0.8177, + "step": 109720 + }, + { + "epoch": 0.7010336940827722, + "grad_norm": 1.5639264583587646, + "learning_rate": 7.264639588150646e-05, + "loss": 0.9209, + "step": 109730 + }, + { + "epoch": 0.7010975812325109, + "grad_norm": 0.8661503791809082, + "learning_rate": 7.264192226476617e-05, + "loss": 0.7273, + "step": 109740 + }, + { + "epoch": 0.7011614683822496, + "grad_norm": 1.6213655471801758, + "learning_rate": 7.263744842000214e-05, + "loss": 1.4111, + "step": 109750 + }, + { + "epoch": 0.7012253555319883, + "grad_norm": 1.2722123861312866, + "learning_rate": 7.263297434725941e-05, + "loss": 0.9177, + "step": 109760 + }, + { + "epoch": 0.701289242681727, + "grad_norm": 0.9025132060050964, + "learning_rate": 7.262850004658308e-05, + "loss": 0.8063, + "step": 109770 + }, + { + "epoch": 0.7013531298314657, + "grad_norm": 0.753537654876709, + "learning_rate": 7.262402551801815e-05, + "loss": 0.7072, + "step": 109780 + }, + { + "epoch": 0.7014170169812044, + "grad_norm": 0.7488612532615662, + "learning_rate": 7.261955076160972e-05, + "loss": 0.9609, + "step": 109790 + }, + { + "epoch": 0.7014809041309431, + "grad_norm": 0.7096448540687561, + "learning_rate": 7.261507577740283e-05, + "loss": 1.1737, + "step": 109800 + }, + { + "epoch": 0.7015447912806818, + "grad_norm": 1.065198540687561, + "learning_rate": 
7.261060056544258e-05, + "loss": 0.9114, + "step": 109810 + }, + { + "epoch": 0.7016086784304205, + "grad_norm": 0.7157565951347351, + "learning_rate": 7.260612512577402e-05, + "loss": 0.7947, + "step": 109820 + }, + { + "epoch": 0.7016725655801592, + "grad_norm": 0.6602898240089417, + "learning_rate": 7.260164945844222e-05, + "loss": 0.8586, + "step": 109830 + }, + { + "epoch": 0.7017364527298979, + "grad_norm": 1.0240232944488525, + "learning_rate": 7.259717356349224e-05, + "loss": 0.8433, + "step": 109840 + }, + { + "epoch": 0.7018003398796366, + "grad_norm": 0.7069511413574219, + "learning_rate": 7.25926974409692e-05, + "loss": 0.7815, + "step": 109850 + }, + { + "epoch": 0.7018642270293753, + "grad_norm": 0.8306097984313965, + "learning_rate": 7.258822109091813e-05, + "loss": 0.8288, + "step": 109860 + }, + { + "epoch": 0.701928114179114, + "grad_norm": 0.46350932121276855, + "learning_rate": 7.258374451338415e-05, + "loss": 0.846, + "step": 109870 + }, + { + "epoch": 0.7019920013288528, + "grad_norm": 0.7333908677101135, + "learning_rate": 7.257926770841231e-05, + "loss": 0.863, + "step": 109880 + }, + { + "epoch": 0.7020558884785915, + "grad_norm": 1.8804274797439575, + "learning_rate": 7.25747906760477e-05, + "loss": 1.2467, + "step": 109890 + }, + { + "epoch": 0.7021197756283302, + "grad_norm": 1.2987992763519287, + "learning_rate": 7.257031341633545e-05, + "loss": 0.8424, + "step": 109900 + }, + { + "epoch": 0.7021836627780689, + "grad_norm": 0.5555353164672852, + "learning_rate": 7.25658359293206e-05, + "loss": 0.6798, + "step": 109910 + }, + { + "epoch": 0.7022475499278075, + "grad_norm": 1.0028846263885498, + "learning_rate": 7.256135821504827e-05, + "loss": 0.8265, + "step": 109920 + }, + { + "epoch": 0.7023114370775462, + "grad_norm": 0.8981877565383911, + "learning_rate": 7.255688027356353e-05, + "loss": 1.0722, + "step": 109930 + }, + { + "epoch": 0.7023753242272849, + "grad_norm": 0.9332131147384644, + "learning_rate": 7.25524021049115e-05, + 
"loss": 0.8057, + "step": 109940 + }, + { + "epoch": 0.7024392113770236, + "grad_norm": 0.8092306852340698, + "learning_rate": 7.254792370913728e-05, + "loss": 0.9814, + "step": 109950 + }, + { + "epoch": 0.7025030985267623, + "grad_norm": 0.5844110250473022, + "learning_rate": 7.254344508628594e-05, + "loss": 0.9691, + "step": 109960 + }, + { + "epoch": 0.702566985676501, + "grad_norm": 0.7450523972511292, + "learning_rate": 7.253896623640262e-05, + "loss": 1.0341, + "step": 109970 + }, + { + "epoch": 0.7026308728262397, + "grad_norm": 1.2225341796875, + "learning_rate": 7.253448715953241e-05, + "loss": 0.9289, + "step": 109980 + }, + { + "epoch": 0.7026947599759784, + "grad_norm": 0.8283683061599731, + "learning_rate": 7.25300078557204e-05, + "loss": 0.9794, + "step": 109990 + }, + { + "epoch": 0.7027586471257171, + "grad_norm": 1.274552345275879, + "learning_rate": 7.252552832501174e-05, + "loss": 1.3319, + "step": 110000 + }, + { + "epoch": 0.7028225342754558, + "grad_norm": 1.2938295602798462, + "learning_rate": 7.252104856745153e-05, + "loss": 1.0077, + "step": 110010 + }, + { + "epoch": 0.7028864214251945, + "grad_norm": 0.6301164031028748, + "learning_rate": 7.251656858308484e-05, + "loss": 0.7968, + "step": 110020 + }, + { + "epoch": 0.7029503085749332, + "grad_norm": 1.0087745189666748, + "learning_rate": 7.251208837195686e-05, + "loss": 0.9097, + "step": 110030 + }, + { + "epoch": 0.7030141957246719, + "grad_norm": 1.1381967067718506, + "learning_rate": 7.250760793411265e-05, + "loss": 0.8822, + "step": 110040 + }, + { + "epoch": 0.7030780828744106, + "grad_norm": 1.071937918663025, + "learning_rate": 7.250312726959739e-05, + "loss": 0.8275, + "step": 110050 + }, + { + "epoch": 0.7031419700241494, + "grad_norm": 0.9654282927513123, + "learning_rate": 7.249864637845614e-05, + "loss": 0.8608, + "step": 110060 + }, + { + "epoch": 0.7032058571738881, + "grad_norm": 0.739723801612854, + "learning_rate": 7.249416526073405e-05, + "loss": 0.6858, + "step": 
110070 + }, + { + "epoch": 0.7032697443236268, + "grad_norm": 0.9041827917098999, + "learning_rate": 7.248968391647628e-05, + "loss": 0.8474, + "step": 110080 + }, + { + "epoch": 0.7033336314733655, + "grad_norm": 0.9711044430732727, + "learning_rate": 7.248520234572794e-05, + "loss": 0.7781, + "step": 110090 + }, + { + "epoch": 0.7033975186231042, + "grad_norm": 0.8251720666885376, + "learning_rate": 7.248072054853414e-05, + "loss": 1.1387, + "step": 110100 + }, + { + "epoch": 0.7034614057728429, + "grad_norm": 0.7342681288719177, + "learning_rate": 7.247623852494005e-05, + "loss": 0.8043, + "step": 110110 + }, + { + "epoch": 0.7035252929225816, + "grad_norm": 0.8310518264770508, + "learning_rate": 7.247175627499078e-05, + "loss": 0.935, + "step": 110120 + }, + { + "epoch": 0.7035891800723203, + "grad_norm": 0.8513674736022949, + "learning_rate": 7.24672737987315e-05, + "loss": 1.1539, + "step": 110130 + }, + { + "epoch": 0.703653067222059, + "grad_norm": 1.232692003250122, + "learning_rate": 7.246279109620733e-05, + "loss": 0.9609, + "step": 110140 + }, + { + "epoch": 0.7037169543717977, + "grad_norm": 0.6645367741584778, + "learning_rate": 7.245830816746342e-05, + "loss": 0.8672, + "step": 110150 + }, + { + "epoch": 0.7037808415215363, + "grad_norm": 0.7718523740768433, + "learning_rate": 7.245382501254491e-05, + "loss": 0.8136, + "step": 110160 + }, + { + "epoch": 0.703844728671275, + "grad_norm": 0.9974465370178223, + "learning_rate": 7.244934163149697e-05, + "loss": 0.8419, + "step": 110170 + }, + { + "epoch": 0.7039086158210137, + "grad_norm": 1.4745326042175293, + "learning_rate": 7.244485802436472e-05, + "loss": 0.7839, + "step": 110180 + }, + { + "epoch": 0.7039725029707524, + "grad_norm": 0.7670671343803406, + "learning_rate": 7.244037419119333e-05, + "loss": 0.7499, + "step": 110190 + }, + { + "epoch": 0.7040363901204911, + "grad_norm": 0.8539519906044006, + "learning_rate": 7.243589013202799e-05, + "loss": 0.9251, + "step": 110200 + }, + { + "epoch": 
0.7041002772702298, + "grad_norm": 0.8276684284210205, + "learning_rate": 7.24314058469138e-05, + "loss": 1.0614, + "step": 110210 + }, + { + "epoch": 0.7041641644199685, + "grad_norm": 1.3117198944091797, + "learning_rate": 7.242692133589596e-05, + "loss": 0.8776, + "step": 110220 + }, + { + "epoch": 0.7042280515697072, + "grad_norm": 1.0242599248886108, + "learning_rate": 7.242243659901961e-05, + "loss": 0.7777, + "step": 110230 + }, + { + "epoch": 0.704291938719446, + "grad_norm": 0.7861204743385315, + "learning_rate": 7.241795163632994e-05, + "loss": 0.9854, + "step": 110240 + }, + { + "epoch": 0.7043558258691847, + "grad_norm": 1.0489882230758667, + "learning_rate": 7.241346644787208e-05, + "loss": 0.9031, + "step": 110250 + }, + { + "epoch": 0.7044197130189234, + "grad_norm": 1.8665564060211182, + "learning_rate": 7.240898103369124e-05, + "loss": 0.762, + "step": 110260 + }, + { + "epoch": 0.7044836001686621, + "grad_norm": 0.9396441578865051, + "learning_rate": 7.240449539383257e-05, + "loss": 1.0817, + "step": 110270 + }, + { + "epoch": 0.7045474873184008, + "grad_norm": 0.686319887638092, + "learning_rate": 7.240000952834125e-05, + "loss": 1.1644, + "step": 110280 + }, + { + "epoch": 0.7046113744681395, + "grad_norm": 0.7401465773582458, + "learning_rate": 7.239552343726246e-05, + "loss": 0.7316, + "step": 110290 + }, + { + "epoch": 0.7046752616178782, + "grad_norm": 1.3043559789657593, + "learning_rate": 7.239103712064136e-05, + "loss": 1.0058, + "step": 110300 + }, + { + "epoch": 0.7047391487676169, + "grad_norm": 0.9462155699729919, + "learning_rate": 7.238655057852314e-05, + "loss": 0.9211, + "step": 110310 + }, + { + "epoch": 0.7048030359173556, + "grad_norm": 0.9175712466239929, + "learning_rate": 7.238206381095302e-05, + "loss": 0.9743, + "step": 110320 + }, + { + "epoch": 0.7048669230670943, + "grad_norm": 1.9088410139083862, + "learning_rate": 7.237757681797613e-05, + "loss": 0.9467, + "step": 110330 + }, + { + "epoch": 0.704930810216833, + 
"grad_norm": 0.5652353763580322, + "learning_rate": 7.237308959963769e-05, + "loss": 0.7986, + "step": 110340 + }, + { + "epoch": 0.7049946973665717, + "grad_norm": 0.8433083891868591, + "learning_rate": 7.236860215598288e-05, + "loss": 0.8457, + "step": 110350 + }, + { + "epoch": 0.7050585845163104, + "grad_norm": 0.7066556811332703, + "learning_rate": 7.236411448705689e-05, + "loss": 0.7944, + "step": 110360 + }, + { + "epoch": 0.7051224716660491, + "grad_norm": NaN, + "learning_rate": 7.2360075392454e-05, + "loss": 0.784, + "step": 110370 + }, + { + "epoch": 0.7051863588157878, + "grad_norm": 0.7894417643547058, + "learning_rate": 7.23555872956373e-05, + "loss": 0.6449, + "step": 110380 + }, + { + "epoch": 0.7052502459655265, + "grad_norm": 1.0463151931762695, + "learning_rate": 7.235109897368049e-05, + "loss": 0.8735, + "step": 110390 + }, + { + "epoch": 0.7053141331152651, + "grad_norm": 0.7032953500747681, + "learning_rate": 7.234661042662877e-05, + "loss": 0.8867, + "step": 110400 + }, + { + "epoch": 0.7053780202650038, + "grad_norm": 0.71566241979599, + "learning_rate": 7.234212165452736e-05, + "loss": 0.7895, + "step": 110410 + }, + { + "epoch": 0.7054419074147426, + "grad_norm": 2.098254680633545, + "learning_rate": 7.233763265742146e-05, + "loss": 1.1398, + "step": 110420 + }, + { + "epoch": 0.7055057945644813, + "grad_norm": 0.8573238253593445, + "learning_rate": 7.233314343535627e-05, + "loss": 0.778, + "step": 110430 + }, + { + "epoch": 0.70556968171422, + "grad_norm": 0.5292240381240845, + "learning_rate": 7.2328653988377e-05, + "loss": 0.7799, + "step": 110440 + }, + { + "epoch": 0.7056335688639587, + "grad_norm": 0.9991651773452759, + "learning_rate": 7.232416431652887e-05, + "loss": 0.9966, + "step": 110450 + }, + { + "epoch": 0.7056974560136974, + "grad_norm": 0.45349401235580444, + "learning_rate": 7.23196744198571e-05, + "loss": 0.6847, + "step": 110460 + }, + { + "epoch": 0.7057613431634361, + "grad_norm": 0.9464260935783386, + 
"learning_rate": 7.231518429840689e-05, + "loss": 0.979, + "step": 110470 + }, + { + "epoch": 0.7058252303131748, + "grad_norm": 1.0189330577850342, + "learning_rate": 7.231069395222347e-05, + "loss": 0.8901, + "step": 110480 + }, + { + "epoch": 0.7058891174629135, + "grad_norm": 1.015156865119934, + "learning_rate": 7.230620338135205e-05, + "loss": 0.5721, + "step": 110490 + }, + { + "epoch": 0.7059530046126522, + "grad_norm": 2.3729195594787598, + "learning_rate": 7.230171258583788e-05, + "loss": 0.8536, + "step": 110500 + }, + { + "epoch": 0.7060168917623909, + "grad_norm": 0.6549795269966125, + "learning_rate": 7.229722156572616e-05, + "loss": 0.743, + "step": 110510 + }, + { + "epoch": 0.7060807789121296, + "grad_norm": 0.968298614025116, + "learning_rate": 7.229273032106214e-05, + "loss": 0.965, + "step": 110520 + }, + { + "epoch": 0.7061446660618683, + "grad_norm": 1.6288883686065674, + "learning_rate": 7.228823885189103e-05, + "loss": 0.9272, + "step": 110530 + }, + { + "epoch": 0.706208553211607, + "grad_norm": 0.7440875172615051, + "learning_rate": 7.228374715825807e-05, + "loss": 1.0698, + "step": 110540 + }, + { + "epoch": 0.7062724403613457, + "grad_norm": 0.9549334645271301, + "learning_rate": 7.227925524020853e-05, + "loss": 0.8151, + "step": 110550 + }, + { + "epoch": 0.7063363275110844, + "grad_norm": 1.4166960716247559, + "learning_rate": 7.227476309778759e-05, + "loss": 0.8097, + "step": 110560 + }, + { + "epoch": 0.7064002146608231, + "grad_norm": 1.3286716938018799, + "learning_rate": 7.227027073104052e-05, + "loss": 1.0101, + "step": 110570 + }, + { + "epoch": 0.7064641018105619, + "grad_norm": 0.5131798386573792, + "learning_rate": 7.226577814001254e-05, + "loss": 1.0766, + "step": 110580 + }, + { + "epoch": 0.7065279889603006, + "grad_norm": 0.8524615168571472, + "learning_rate": 7.226128532474893e-05, + "loss": 1.2115, + "step": 110590 + }, + { + "epoch": 0.7065918761100393, + "grad_norm": 0.9582625031471252, + "learning_rate": 
7.225679228529491e-05, + "loss": 0.8322, + "step": 110600 + }, + { + "epoch": 0.706655763259778, + "grad_norm": 1.0330963134765625, + "learning_rate": 7.225229902169575e-05, + "loss": 0.8503, + "step": 110610 + }, + { + "epoch": 0.7067196504095167, + "grad_norm": 1.450884222984314, + "learning_rate": 7.224780553399667e-05, + "loss": 0.792, + "step": 110620 + }, + { + "epoch": 0.7067835375592554, + "grad_norm": 1.2359329462051392, + "learning_rate": 7.224331182224296e-05, + "loss": 0.9444, + "step": 110630 + }, + { + "epoch": 0.7068474247089941, + "grad_norm": 0.6814751029014587, + "learning_rate": 7.223881788647984e-05, + "loss": 0.8523, + "step": 110640 + }, + { + "epoch": 0.7069113118587327, + "grad_norm": 0.5562382340431213, + "learning_rate": 7.223432372675258e-05, + "loss": 0.9113, + "step": 110650 + }, + { + "epoch": 0.7069751990084714, + "grad_norm": 1.0418486595153809, + "learning_rate": 7.222982934310645e-05, + "loss": 0.8662, + "step": 110660 + }, + { + "epoch": 0.7070390861582101, + "grad_norm": 1.3430500030517578, + "learning_rate": 7.222533473558671e-05, + "loss": 0.9428, + "step": 110670 + }, + { + "epoch": 0.7071029733079488, + "grad_norm": 0.9535601139068604, + "learning_rate": 7.222083990423863e-05, + "loss": 0.8891, + "step": 110680 + }, + { + "epoch": 0.7071668604576875, + "grad_norm": 0.8033544421195984, + "learning_rate": 7.221634484910746e-05, + "loss": 0.7643, + "step": 110690 + }, + { + "epoch": 0.7072307476074262, + "grad_norm": 0.8913300037384033, + "learning_rate": 7.221184957023848e-05, + "loss": 0.8317, + "step": 110700 + }, + { + "epoch": 0.7072946347571649, + "grad_norm": 0.8205702304840088, + "learning_rate": 7.220735406767696e-05, + "loss": 0.9091, + "step": 110710 + }, + { + "epoch": 0.7073585219069036, + "grad_norm": 0.7440993189811707, + "learning_rate": 7.220285834146816e-05, + "loss": 1.0378, + "step": 110720 + }, + { + "epoch": 0.7074224090566423, + "grad_norm": 1.0705007314682007, + "learning_rate": 7.219836239165737e-05, + 
"loss": 0.914, + "step": 110730 + }, + { + "epoch": 0.707486296206381, + "grad_norm": 0.6237671375274658, + "learning_rate": 7.219386621828989e-05, + "loss": 0.7851, + "step": 110740 + }, + { + "epoch": 0.7075501833561197, + "grad_norm": 0.8966255784034729, + "learning_rate": 7.218936982141096e-05, + "loss": 0.8537, + "step": 110750 + }, + { + "epoch": 0.7076140705058585, + "grad_norm": 0.8700849413871765, + "learning_rate": 7.218487320106588e-05, + "loss": 1.0906, + "step": 110760 + }, + { + "epoch": 0.7076779576555972, + "grad_norm": 0.9361857771873474, + "learning_rate": 7.218037635729993e-05, + "loss": 1.1403, + "step": 110770 + }, + { + "epoch": 0.7077418448053359, + "grad_norm": 0.9435377717018127, + "learning_rate": 7.21758792901584e-05, + "loss": 1.1844, + "step": 110780 + }, + { + "epoch": 0.7078057319550746, + "grad_norm": 2.09425950050354, + "learning_rate": 7.21713819996866e-05, + "loss": 1.0254, + "step": 110790 + }, + { + "epoch": 0.7078696191048133, + "grad_norm": 1.783277988433838, + "learning_rate": 7.21668844859298e-05, + "loss": 1.2829, + "step": 110800 + }, + { + "epoch": 0.707933506254552, + "grad_norm": 0.7960687279701233, + "learning_rate": 7.216238674893328e-05, + "loss": 0.7562, + "step": 110810 + }, + { + "epoch": 0.7079973934042907, + "grad_norm": 0.8216058611869812, + "learning_rate": 7.215788878874237e-05, + "loss": 0.937, + "step": 110820 + }, + { + "epoch": 0.7080612805540294, + "grad_norm": 0.8827890753746033, + "learning_rate": 7.215339060540231e-05, + "loss": 0.9726, + "step": 110830 + }, + { + "epoch": 0.7081251677037681, + "grad_norm": 1.042328119277954, + "learning_rate": 7.214889219895849e-05, + "loss": 0.8115, + "step": 110840 + }, + { + "epoch": 0.7081890548535068, + "grad_norm": 0.9282635450363159, + "learning_rate": 7.214439356945614e-05, + "loss": 1.0414, + "step": 110850 + }, + { + "epoch": 0.7082529420032455, + "grad_norm": 0.7158064246177673, + "learning_rate": 7.213989471694059e-05, + "loss": 0.9759, + "step": 110860 + 
}, + { + "epoch": 0.7083168291529842, + "grad_norm": 0.8982157111167908, + "learning_rate": 7.213539564145715e-05, + "loss": 0.7747, + "step": 110870 + }, + { + "epoch": 0.7083807163027229, + "grad_norm": 0.5959254503250122, + "learning_rate": 7.213089634305112e-05, + "loss": 0.844, + "step": 110880 + }, + { + "epoch": 0.7084446034524615, + "grad_norm": 1.0131826400756836, + "learning_rate": 7.212639682176782e-05, + "loss": 0.8006, + "step": 110890 + }, + { + "epoch": 0.7085084906022002, + "grad_norm": 3.458534002304077, + "learning_rate": 7.212189707765257e-05, + "loss": 0.8292, + "step": 110900 + }, + { + "epoch": 0.7085723777519389, + "grad_norm": 1.091951847076416, + "learning_rate": 7.211739711075067e-05, + "loss": 0.8682, + "step": 110910 + }, + { + "epoch": 0.7086362649016776, + "grad_norm": 1.005330204963684, + "learning_rate": 7.211289692110746e-05, + "loss": 0.8157, + "step": 110920 + }, + { + "epoch": 0.7087001520514163, + "grad_norm": 0.5316596627235413, + "learning_rate": 7.210839650876824e-05, + "loss": 0.721, + "step": 110930 + }, + { + "epoch": 0.708764039201155, + "grad_norm": 0.904162585735321, + "learning_rate": 7.210389587377833e-05, + "loss": 0.8916, + "step": 110940 + }, + { + "epoch": 0.7088279263508938, + "grad_norm": 0.6802520155906677, + "learning_rate": 7.209939501618308e-05, + "loss": 1.0842, + "step": 110950 + }, + { + "epoch": 0.7088918135006325, + "grad_norm": 0.8692640066146851, + "learning_rate": 7.20948939360278e-05, + "loss": 1.13, + "step": 110960 + }, + { + "epoch": 0.7089557006503712, + "grad_norm": 0.9759564399719238, + "learning_rate": 7.209039263335782e-05, + "loss": 1.1396, + "step": 110970 + }, + { + "epoch": 0.7090195878001099, + "grad_norm": 0.8422264456748962, + "learning_rate": 7.208589110821848e-05, + "loss": 0.9562, + "step": 110980 + }, + { + "epoch": 0.7090834749498486, + "grad_norm": 0.9133360385894775, + "learning_rate": 7.208138936065509e-05, + "loss": 0.9616, + "step": 110990 + }, + { + "epoch": 
0.7091473620995873, + "grad_norm": 1.0148824453353882, + "learning_rate": 7.207688739071303e-05, + "loss": 0.8601, + "step": 111000 + }, + { + "epoch": 0.709211249249326, + "grad_norm": 0.9856421947479248, + "learning_rate": 7.207238519843761e-05, + "loss": 0.6616, + "step": 111010 + }, + { + "epoch": 0.7092751363990647, + "grad_norm": 1.0549514293670654, + "learning_rate": 7.206788278387417e-05, + "loss": 0.8587, + "step": 111020 + }, + { + "epoch": 0.7093390235488034, + "grad_norm": 0.8037036657333374, + "learning_rate": 7.206338014706806e-05, + "loss": 0.782, + "step": 111030 + }, + { + "epoch": 0.7094029106985421, + "grad_norm": 0.7747255563735962, + "learning_rate": 7.205887728806463e-05, + "loss": 0.8245, + "step": 111040 + }, + { + "epoch": 0.7094667978482808, + "grad_norm": 0.8603516221046448, + "learning_rate": 7.205437420690922e-05, + "loss": 0.884, + "step": 111050 + }, + { + "epoch": 0.7095306849980195, + "grad_norm": 0.5913922190666199, + "learning_rate": 7.204987090364717e-05, + "loss": 1.1146, + "step": 111060 + }, + { + "epoch": 0.7095945721477582, + "grad_norm": 1.0901986360549927, + "learning_rate": 7.204536737832385e-05, + "loss": 0.99, + "step": 111070 + }, + { + "epoch": 0.7096584592974969, + "grad_norm": 0.8610712885856628, + "learning_rate": 7.204086363098462e-05, + "loss": 1.1021, + "step": 111080 + }, + { + "epoch": 0.7097223464472356, + "grad_norm": 1.4266153573989868, + "learning_rate": 7.203635966167482e-05, + "loss": 0.9517, + "step": 111090 + }, + { + "epoch": 0.7097862335969743, + "grad_norm": 1.2162227630615234, + "learning_rate": 7.203185547043981e-05, + "loss": 0.8217, + "step": 111100 + }, + { + "epoch": 0.709850120746713, + "grad_norm": 1.2490782737731934, + "learning_rate": 7.202735105732497e-05, + "loss": 0.7838, + "step": 111110 + }, + { + "epoch": 0.7099140078964518, + "grad_norm": 0.9295443892478943, + "learning_rate": 7.202284642237563e-05, + "loss": 0.8437, + "step": 111120 + }, + { + "epoch": 0.7099778950461904, + 
"grad_norm": 1.247816562652588, + "learning_rate": 7.201834156563718e-05, + "loss": 0.7268, + "step": 111130 + }, + { + "epoch": 0.7100417821959291, + "grad_norm": 0.8253741264343262, + "learning_rate": 7.201383648715498e-05, + "loss": 0.8062, + "step": 111140 + }, + { + "epoch": 0.7101056693456678, + "grad_norm": 0.7604842185974121, + "learning_rate": 7.200933118697439e-05, + "loss": 0.9545, + "step": 111150 + }, + { + "epoch": 0.7101695564954065, + "grad_norm": 1.0069366693496704, + "learning_rate": 7.200482566514081e-05, + "loss": 0.8775, + "step": 111160 + }, + { + "epoch": 0.7102334436451452, + "grad_norm": 0.5719223022460938, + "learning_rate": 7.20003199216996e-05, + "loss": 0.8167, + "step": 111170 + }, + { + "epoch": 0.7102973307948839, + "grad_norm": 0.9412128925323486, + "learning_rate": 7.199581395669613e-05, + "loss": 0.7928, + "step": 111180 + }, + { + "epoch": 0.7103612179446226, + "grad_norm": 0.8376321196556091, + "learning_rate": 7.199130777017578e-05, + "loss": 0.9514, + "step": 111190 + }, + { + "epoch": 0.7104251050943613, + "grad_norm": 1.0094441175460815, + "learning_rate": 7.198680136218394e-05, + "loss": 0.9398, + "step": 111200 + }, + { + "epoch": 0.7104889922441, + "grad_norm": 2.3433310985565186, + "learning_rate": 7.1982294732766e-05, + "loss": 0.8579, + "step": 111210 + }, + { + "epoch": 0.7105528793938387, + "grad_norm": 0.6974323987960815, + "learning_rate": 7.197778788196732e-05, + "loss": 0.7997, + "step": 111220 + }, + { + "epoch": 0.7106167665435774, + "grad_norm": 0.4892445206642151, + "learning_rate": 7.197328080983331e-05, + "loss": 0.6908, + "step": 111230 + }, + { + "epoch": 0.7106806536933161, + "grad_norm": 0.5663550496101379, + "learning_rate": 7.196877351640934e-05, + "loss": 0.7576, + "step": 111240 + }, + { + "epoch": 0.7107445408430548, + "grad_norm": 1.5886496305465698, + "learning_rate": 7.196426600174083e-05, + "loss": 1.0888, + "step": 111250 + }, + { + "epoch": 0.7108084279927935, + "grad_norm": 
1.0448259115219116, + "learning_rate": 7.195975826587315e-05, + "loss": 0.9002, + "step": 111260 + }, + { + "epoch": 0.7108723151425322, + "grad_norm": 1.3552470207214355, + "learning_rate": 7.195525030885173e-05, + "loss": 1.0711, + "step": 111270 + }, + { + "epoch": 0.710936202292271, + "grad_norm": 0.8098346590995789, + "learning_rate": 7.195074213072192e-05, + "loss": 0.7809, + "step": 111280 + }, + { + "epoch": 0.7110000894420097, + "grad_norm": 0.7576483488082886, + "learning_rate": 7.194623373152916e-05, + "loss": 1.0012, + "step": 111290 + }, + { + "epoch": 0.7110639765917484, + "grad_norm": 1.0490570068359375, + "learning_rate": 7.194172511131883e-05, + "loss": 0.8137, + "step": 111300 + }, + { + "epoch": 0.7111278637414871, + "grad_norm": 1.080182671546936, + "learning_rate": 7.193721627013635e-05, + "loss": 1.161, + "step": 111310 + }, + { + "epoch": 0.7111917508912258, + "grad_norm": 0.9876430034637451, + "learning_rate": 7.193270720802713e-05, + "loss": 1.1287, + "step": 111320 + }, + { + "epoch": 0.7112556380409645, + "grad_norm": 0.8091046214103699, + "learning_rate": 7.192819792503656e-05, + "loss": 0.8741, + "step": 111330 + }, + { + "epoch": 0.7113195251907032, + "grad_norm": 0.6781719326972961, + "learning_rate": 7.192368842121008e-05, + "loss": 0.8945, + "step": 111340 + }, + { + "epoch": 0.7113834123404419, + "grad_norm": 2.1423685550689697, + "learning_rate": 7.191917869659307e-05, + "loss": 0.7859, + "step": 111350 + }, + { + "epoch": 0.7114472994901806, + "grad_norm": 0.6364580988883972, + "learning_rate": 7.191466875123099e-05, + "loss": 0.7873, + "step": 111360 + }, + { + "epoch": 0.7115111866399193, + "grad_norm": 0.8204523921012878, + "learning_rate": 7.191015858516921e-05, + "loss": 0.8482, + "step": 111370 + }, + { + "epoch": 0.7115750737896579, + "grad_norm": 0.5086873173713684, + "learning_rate": 7.19056481984532e-05, + "loss": 0.951, + "step": 111380 + }, + { + "epoch": 0.7116389609393966, + "grad_norm": 0.7575923204421997, + 
"learning_rate": 7.190113759112837e-05, + "loss": 1.0358, + "step": 111390 + }, + { + "epoch": 0.7117028480891353, + "grad_norm": 0.6204468607902527, + "learning_rate": 7.189662676324012e-05, + "loss": 1.0471, + "step": 111400 + }, + { + "epoch": 0.711766735238874, + "grad_norm": 0.7456675171852112, + "learning_rate": 7.18921157148339e-05, + "loss": 0.7964, + "step": 111410 + }, + { + "epoch": 0.7118306223886127, + "grad_norm": 0.8651039004325867, + "learning_rate": 7.188760444595513e-05, + "loss": 0.596, + "step": 111420 + }, + { + "epoch": 0.7118945095383514, + "grad_norm": 1.0460362434387207, + "learning_rate": 7.188309295664926e-05, + "loss": 0.9561, + "step": 111430 + }, + { + "epoch": 0.7119583966880901, + "grad_norm": 1.1043407917022705, + "learning_rate": 7.187858124696171e-05, + "loss": 0.95, + "step": 111440 + }, + { + "epoch": 0.7120222838378288, + "grad_norm": 1.1724969148635864, + "learning_rate": 7.187406931693791e-05, + "loss": 1.278, + "step": 111450 + }, + { + "epoch": 0.7120861709875675, + "grad_norm": 0.8911068439483643, + "learning_rate": 7.186955716662332e-05, + "loss": 0.9101, + "step": 111460 + }, + { + "epoch": 0.7121500581373063, + "grad_norm": 0.9001386761665344, + "learning_rate": 7.186504479606336e-05, + "loss": 0.6922, + "step": 111470 + }, + { + "epoch": 0.712213945287045, + "grad_norm": 0.7261586785316467, + "learning_rate": 7.186053220530349e-05, + "loss": 0.9494, + "step": 111480 + }, + { + "epoch": 0.7122778324367837, + "grad_norm": 1.1621700525283813, + "learning_rate": 7.185601939438914e-05, + "loss": 1.1188, + "step": 111490 + }, + { + "epoch": 0.7123417195865224, + "grad_norm": 1.0625040531158447, + "learning_rate": 7.185150636336578e-05, + "loss": 0.6796, + "step": 111500 + }, + { + "epoch": 0.7124056067362611, + "grad_norm": 0.5753048062324524, + "learning_rate": 7.184699311227883e-05, + "loss": 0.6958, + "step": 111510 + }, + { + "epoch": 0.7124694938859998, + "grad_norm": 0.8998501896858215, + "learning_rate": 
7.184247964117376e-05, + "loss": 0.765, + "step": 111520 + }, + { + "epoch": 0.7125333810357385, + "grad_norm": 1.5378433465957642, + "learning_rate": 7.183796595009604e-05, + "loss": 0.7618, + "step": 111530 + }, + { + "epoch": 0.7125972681854772, + "grad_norm": 0.5962892174720764, + "learning_rate": 7.18334520390911e-05, + "loss": 0.8256, + "step": 111540 + }, + { + "epoch": 0.7126611553352159, + "grad_norm": 0.8945503234863281, + "learning_rate": 7.182893790820441e-05, + "loss": 0.935, + "step": 111550 + }, + { + "epoch": 0.7127250424849546, + "grad_norm": 0.8816238045692444, + "learning_rate": 7.182442355748143e-05, + "loss": 0.6085, + "step": 111560 + }, + { + "epoch": 0.7127889296346933, + "grad_norm": 0.7212803959846497, + "learning_rate": 7.181990898696762e-05, + "loss": 0.7271, + "step": 111570 + }, + { + "epoch": 0.712852816784432, + "grad_norm": 0.9324487447738647, + "learning_rate": 7.181539419670847e-05, + "loss": 0.8302, + "step": 111580 + }, + { + "epoch": 0.7129167039341707, + "grad_norm": 0.7855546474456787, + "learning_rate": 7.18108791867494e-05, + "loss": 1.1443, + "step": 111590 + }, + { + "epoch": 0.7129805910839094, + "grad_norm": 1.1288433074951172, + "learning_rate": 7.180636395713592e-05, + "loss": 0.77, + "step": 111600 + }, + { + "epoch": 0.7130444782336481, + "grad_norm": 0.8179686665534973, + "learning_rate": 7.18018485079135e-05, + "loss": 0.9342, + "step": 111610 + }, + { + "epoch": 0.7131083653833867, + "grad_norm": 0.5550733804702759, + "learning_rate": 7.179733283912759e-05, + "loss": 1.1414, + "step": 111620 + }, + { + "epoch": 0.7131722525331254, + "grad_norm": 1.0934454202651978, + "learning_rate": 7.179281695082369e-05, + "loss": 1.1329, + "step": 111630 + }, + { + "epoch": 0.7132361396828641, + "grad_norm": 0.8720685243606567, + "learning_rate": 7.178830084304725e-05, + "loss": 0.9261, + "step": 111640 + }, + { + "epoch": 0.7133000268326029, + "grad_norm": 1.734836220741272, + "learning_rate": 7.17837845158438e-05, + "loss": 
1.0047, + "step": 111650 + }, + { + "epoch": 0.7133639139823416, + "grad_norm": 0.9459303021430969, + "learning_rate": 7.177926796925877e-05, + "loss": 1.0278, + "step": 111660 + }, + { + "epoch": 0.7134278011320803, + "grad_norm": 1.2539857625961304, + "learning_rate": 7.177475120333767e-05, + "loss": 0.7048, + "step": 111670 + }, + { + "epoch": 0.713491688281819, + "grad_norm": 1.275357723236084, + "learning_rate": 7.177023421812601e-05, + "loss": 0.7448, + "step": 111680 + }, + { + "epoch": 0.7135555754315577, + "grad_norm": 0.5253180265426636, + "learning_rate": 7.176571701366924e-05, + "loss": 0.9166, + "step": 111690 + }, + { + "epoch": 0.7136194625812964, + "grad_norm": 2.3519647121429443, + "learning_rate": 7.176119959001287e-05, + "loss": 0.7995, + "step": 111700 + }, + { + "epoch": 0.7136833497310351, + "grad_norm": 0.8496062755584717, + "learning_rate": 7.17566819472024e-05, + "loss": 0.9215, + "step": 111710 + }, + { + "epoch": 0.7137472368807738, + "grad_norm": 0.6206763386726379, + "learning_rate": 7.175216408528331e-05, + "loss": 0.7705, + "step": 111720 + }, + { + "epoch": 0.7138111240305125, + "grad_norm": 0.9494533538818359, + "learning_rate": 7.174764600430112e-05, + "loss": 0.9813, + "step": 111730 + }, + { + "epoch": 0.7138750111802512, + "grad_norm": 0.609451413154602, + "learning_rate": 7.174312770430131e-05, + "loss": 0.8221, + "step": 111740 + }, + { + "epoch": 0.7139388983299899, + "grad_norm": 0.6117620468139648, + "learning_rate": 7.17386091853294e-05, + "loss": 1.1774, + "step": 111750 + }, + { + "epoch": 0.7140027854797286, + "grad_norm": 0.881416916847229, + "learning_rate": 7.173409044743092e-05, + "loss": 1.1644, + "step": 111760 + }, + { + "epoch": 0.7140666726294673, + "grad_norm": 0.9561392068862915, + "learning_rate": 7.17295714906513e-05, + "loss": 1.0794, + "step": 111770 + }, + { + "epoch": 0.714130559779206, + "grad_norm": 0.9907708764076233, + "learning_rate": 7.172505231503613e-05, + "loss": 0.8979, + "step": 111780 + }, + 
{ + "epoch": 0.7141944469289447, + "grad_norm": 1.2147117853164673, + "learning_rate": 7.172053292063085e-05, + "loss": 0.8886, + "step": 111790 + }, + { + "epoch": 0.7142583340786834, + "grad_norm": 0.8002836108207703, + "learning_rate": 7.171601330748104e-05, + "loss": 0.9329, + "step": 111800 + }, + { + "epoch": 0.7143222212284221, + "grad_norm": 0.9203763604164124, + "learning_rate": 7.171149347563219e-05, + "loss": 0.7629, + "step": 111810 + }, + { + "epoch": 0.7143861083781609, + "grad_norm": 1.0033005475997925, + "learning_rate": 7.17069734251298e-05, + "loss": 0.5576, + "step": 111820 + }, + { + "epoch": 0.7144499955278996, + "grad_norm": 0.8255144357681274, + "learning_rate": 7.170245315601942e-05, + "loss": 0.8515, + "step": 111830 + }, + { + "epoch": 0.7145138826776383, + "grad_norm": 0.6121490597724915, + "learning_rate": 7.169793266834657e-05, + "loss": 0.7734, + "step": 111840 + }, + { + "epoch": 0.714577769827377, + "grad_norm": 0.9062861204147339, + "learning_rate": 7.169341196215675e-05, + "loss": 0.9229, + "step": 111850 + }, + { + "epoch": 0.7146416569771156, + "grad_norm": 1.0304701328277588, + "learning_rate": 7.168889103749552e-05, + "loss": 0.7766, + "step": 111860 + }, + { + "epoch": 0.7147055441268543, + "grad_norm": 0.49770671129226685, + "learning_rate": 7.168436989440838e-05, + "loss": 0.9304, + "step": 111870 + }, + { + "epoch": 0.714769431276593, + "grad_norm": 2.51269268989563, + "learning_rate": 7.167984853294087e-05, + "loss": 0.9755, + "step": 111880 + }, + { + "epoch": 0.7148333184263317, + "grad_norm": 0.7806798219680786, + "learning_rate": 7.167532695313855e-05, + "loss": 0.8631, + "step": 111890 + }, + { + "epoch": 0.7148972055760704, + "grad_norm": 0.9068145155906677, + "learning_rate": 7.167080515504692e-05, + "loss": 0.8938, + "step": 111900 + }, + { + "epoch": 0.7149610927258091, + "grad_norm": 0.778988778591156, + "learning_rate": 7.166628313871155e-05, + "loss": 1.0556, + "step": 111910 + }, + { + "epoch": 
0.7150249798755478, + "grad_norm": 1.0016741752624512, + "learning_rate": 7.166176090417794e-05, + "loss": 0.9363, + "step": 111920 + }, + { + "epoch": 0.7150888670252865, + "grad_norm": 1.0079103708267212, + "learning_rate": 7.165723845149169e-05, + "loss": 0.9302, + "step": 111930 + }, + { + "epoch": 0.7151527541750252, + "grad_norm": 1.5212242603302002, + "learning_rate": 7.165271578069827e-05, + "loss": 0.9895, + "step": 111940 + }, + { + "epoch": 0.7152166413247639, + "grad_norm": 0.8512176871299744, + "learning_rate": 7.16481928918433e-05, + "loss": 0.7161, + "step": 111950 + }, + { + "epoch": 0.7152805284745026, + "grad_norm": 1.4987943172454834, + "learning_rate": 7.16436697849723e-05, + "loss": 0.8515, + "step": 111960 + }, + { + "epoch": 0.7153444156242413, + "grad_norm": 1.0176150798797607, + "learning_rate": 7.163914646013082e-05, + "loss": 0.8803, + "step": 111970 + }, + { + "epoch": 0.71540830277398, + "grad_norm": 0.5044176578521729, + "learning_rate": 7.16346229173644e-05, + "loss": 0.7152, + "step": 111980 + }, + { + "epoch": 0.7154721899237187, + "grad_norm": 2.0591912269592285, + "learning_rate": 7.163009915671863e-05, + "loss": 0.8914, + "step": 111990 + }, + { + "epoch": 0.7155360770734575, + "grad_norm": 0.6870359778404236, + "learning_rate": 7.162557517823904e-05, + "loss": 0.9074, + "step": 112000 + }, + { + "epoch": 0.7155999642231962, + "grad_norm": 0.9231833815574646, + "learning_rate": 7.16210509819712e-05, + "loss": 0.9811, + "step": 112010 + }, + { + "epoch": 0.7156638513729349, + "grad_norm": 1.032814860343933, + "learning_rate": 7.161652656796068e-05, + "loss": 0.8585, + "step": 112020 + }, + { + "epoch": 0.7157277385226736, + "grad_norm": 0.6325913667678833, + "learning_rate": 7.161200193625302e-05, + "loss": 0.8663, + "step": 112030 + }, + { + "epoch": 0.7157916256724123, + "grad_norm": 0.9060227870941162, + "learning_rate": 7.16074770868938e-05, + "loss": 0.904, + "step": 112040 + }, + { + "epoch": 0.715855512822151, + 
"grad_norm": 1.0490593910217285, + "learning_rate": 7.16029520199286e-05, + "loss": 0.7224, + "step": 112050 + }, + { + "epoch": 0.7159193999718897, + "grad_norm": 0.7744579911231995, + "learning_rate": 7.1598426735403e-05, + "loss": 0.953, + "step": 112060 + }, + { + "epoch": 0.7159832871216284, + "grad_norm": 1.1592707633972168, + "learning_rate": 7.159390123336253e-05, + "loss": 0.8888, + "step": 112070 + }, + { + "epoch": 0.7160471742713671, + "grad_norm": 1.0290485620498657, + "learning_rate": 7.158937551385281e-05, + "loss": 0.7144, + "step": 112080 + }, + { + "epoch": 0.7161110614211058, + "grad_norm": 0.7626058459281921, + "learning_rate": 7.15848495769194e-05, + "loss": 0.8806, + "step": 112090 + }, + { + "epoch": 0.7161749485708444, + "grad_norm": 1.230587363243103, + "learning_rate": 7.158032342260787e-05, + "loss": 0.9783, + "step": 112100 + }, + { + "epoch": 0.7162388357205831, + "grad_norm": 1.150781273841858, + "learning_rate": 7.157579705096384e-05, + "loss": 0.8128, + "step": 112110 + }, + { + "epoch": 0.7163027228703218, + "grad_norm": 0.9989941716194153, + "learning_rate": 7.157127046203285e-05, + "loss": 0.982, + "step": 112120 + }, + { + "epoch": 0.7163666100200605, + "grad_norm": 0.5104489922523499, + "learning_rate": 7.15667436558605e-05, + "loss": 1.1264, + "step": 112130 + }, + { + "epoch": 0.7164304971697992, + "grad_norm": 0.8887497186660767, + "learning_rate": 7.156221663249238e-05, + "loss": 0.9689, + "step": 112140 + }, + { + "epoch": 0.7164943843195379, + "grad_norm": 0.9902895092964172, + "learning_rate": 7.155768939197411e-05, + "loss": 0.8488, + "step": 112150 + }, + { + "epoch": 0.7165582714692766, + "grad_norm": 0.9794628024101257, + "learning_rate": 7.155316193435123e-05, + "loss": 0.8772, + "step": 112160 + }, + { + "epoch": 0.7166221586190153, + "grad_norm": 1.3520082235336304, + "learning_rate": 7.154863425966938e-05, + "loss": 1.1314, + "step": 112170 + }, + { + "epoch": 0.716686045768754, + "grad_norm": 0.871411919593811, + 
"learning_rate": 7.154410636797413e-05, + "loss": 0.9373, + "step": 112180 + }, + { + "epoch": 0.7167499329184928, + "grad_norm": 1.0834548473358154, + "learning_rate": 7.15395782593111e-05, + "loss": 0.7362, + "step": 112190 + }, + { + "epoch": 0.7168138200682315, + "grad_norm": 0.9471587538719177, + "learning_rate": 7.153504993372587e-05, + "loss": 0.8928, + "step": 112200 + }, + { + "epoch": 0.7168777072179702, + "grad_norm": 1.114270806312561, + "learning_rate": 7.153052139126407e-05, + "loss": 0.7417, + "step": 112210 + }, + { + "epoch": 0.7169415943677089, + "grad_norm": 1.393097162246704, + "learning_rate": 7.152599263197128e-05, + "loss": 1.0208, + "step": 112220 + }, + { + "epoch": 0.7170054815174476, + "grad_norm": 0.7526148557662964, + "learning_rate": 7.152146365589313e-05, + "loss": 1.0583, + "step": 112230 + }, + { + "epoch": 0.7170693686671863, + "grad_norm": 1.4979679584503174, + "learning_rate": 7.151693446307524e-05, + "loss": 0.8657, + "step": 112240 + }, + { + "epoch": 0.717133255816925, + "grad_norm": 0.7757830619812012, + "learning_rate": 7.151240505356318e-05, + "loss": 0.7491, + "step": 112250 + }, + { + "epoch": 0.7171971429666637, + "grad_norm": 1.1512292623519897, + "learning_rate": 7.15078754274026e-05, + "loss": 0.818, + "step": 112260 + }, + { + "epoch": 0.7172610301164024, + "grad_norm": 1.370301365852356, + "learning_rate": 7.150334558463911e-05, + "loss": 0.8435, + "step": 112270 + }, + { + "epoch": 0.7173249172661411, + "grad_norm": 1.1701213121414185, + "learning_rate": 7.149881552531832e-05, + "loss": 1.1135, + "step": 112280 + }, + { + "epoch": 0.7173888044158798, + "grad_norm": 0.6550779342651367, + "learning_rate": 7.149428524948585e-05, + "loss": 0.8574, + "step": 112290 + }, + { + "epoch": 0.7174526915656185, + "grad_norm": 0.9960959553718567, + "learning_rate": 7.148975475718734e-05, + "loss": 0.9118, + "step": 112300 + }, + { + "epoch": 0.7175165787153572, + "grad_norm": 0.6779653429985046, + "learning_rate": 
7.148522404846841e-05, + "loss": 0.9284, + "step": 112310 + }, + { + "epoch": 0.7175804658650959, + "grad_norm": 0.8120177984237671, + "learning_rate": 7.14806931233747e-05, + "loss": 1.0227, + "step": 112320 + }, + { + "epoch": 0.7176443530148346, + "grad_norm": 1.1333248615264893, + "learning_rate": 7.147616198195181e-05, + "loss": 0.7896, + "step": 112330 + }, + { + "epoch": 0.7177082401645734, + "grad_norm": 0.5088018178939819, + "learning_rate": 7.147163062424539e-05, + "loss": 0.6615, + "step": 112340 + }, + { + "epoch": 0.717772127314312, + "grad_norm": 0.9224886894226074, + "learning_rate": 7.146709905030108e-05, + "loss": 0.9036, + "step": 112350 + }, + { + "epoch": 0.7178360144640507, + "grad_norm": 0.9467249512672424, + "learning_rate": 7.146256726016452e-05, + "loss": 0.8896, + "step": 112360 + }, + { + "epoch": 0.7178999016137894, + "grad_norm": 1.132169246673584, + "learning_rate": 7.145803525388132e-05, + "loss": 1.1764, + "step": 112370 + }, + { + "epoch": 0.7179637887635281, + "grad_norm": 0.9718843102455139, + "learning_rate": 7.145350303149715e-05, + "loss": 0.8263, + "step": 112380 + }, + { + "epoch": 0.7180276759132668, + "grad_norm": 0.5153815746307373, + "learning_rate": 7.144897059305764e-05, + "loss": 0.9022, + "step": 112390 + }, + { + "epoch": 0.7180915630630055, + "grad_norm": 1.2004663944244385, + "learning_rate": 7.144443793860845e-05, + "loss": 0.8924, + "step": 112400 + }, + { + "epoch": 0.7181554502127442, + "grad_norm": 0.48930367827415466, + "learning_rate": 7.14399050681952e-05, + "loss": 0.6762, + "step": 112410 + }, + { + "epoch": 0.7182193373624829, + "grad_norm": 0.9120885729789734, + "learning_rate": 7.143537198186356e-05, + "loss": 0.7038, + "step": 112420 + }, + { + "epoch": 0.7182832245122216, + "grad_norm": 0.7880602478981018, + "learning_rate": 7.14308386796592e-05, + "loss": 0.6403, + "step": 112430 + }, + { + "epoch": 0.7183471116619603, + "grad_norm": 0.8461630940437317, + "learning_rate": 7.142630516162774e-05, + 
"loss": 1.245, + "step": 112440 + }, + { + "epoch": 0.718410998811699, + "grad_norm": 0.7924696803092957, + "learning_rate": 7.142177142781485e-05, + "loss": 0.7825, + "step": 112450 + }, + { + "epoch": 0.7184748859614377, + "grad_norm": 1.4075403213500977, + "learning_rate": 7.14172374782662e-05, + "loss": 1.0038, + "step": 112460 + }, + { + "epoch": 0.7185387731111764, + "grad_norm": 0.6617513298988342, + "learning_rate": 7.141270331302743e-05, + "loss": 0.9936, + "step": 112470 + }, + { + "epoch": 0.7186026602609151, + "grad_norm": 0.9950401186943054, + "learning_rate": 7.140816893214421e-05, + "loss": 0.9455, + "step": 112480 + }, + { + "epoch": 0.7186665474106538, + "grad_norm": 1.0970572233200073, + "learning_rate": 7.140363433566224e-05, + "loss": 0.9689, + "step": 112490 + }, + { + "epoch": 0.7187304345603925, + "grad_norm": 0.8020222783088684, + "learning_rate": 7.139909952362712e-05, + "loss": 0.8451, + "step": 112500 + }, + { + "epoch": 0.7187943217101312, + "grad_norm": 1.0068836212158203, + "learning_rate": 7.139456449608458e-05, + "loss": 1.3056, + "step": 112510 + }, + { + "epoch": 0.71885820885987, + "grad_norm": 0.8701362013816833, + "learning_rate": 7.139002925308024e-05, + "loss": 0.8939, + "step": 112520 + }, + { + "epoch": 0.7189220960096087, + "grad_norm": 2.2286477088928223, + "learning_rate": 7.138549379465982e-05, + "loss": 1.0509, + "step": 112530 + }, + { + "epoch": 0.7189859831593474, + "grad_norm": 0.22156091034412384, + "learning_rate": 7.138095812086896e-05, + "loss": 0.6598, + "step": 112540 + }, + { + "epoch": 0.7190498703090861, + "grad_norm": 1.0287528038024902, + "learning_rate": 7.137642223175337e-05, + "loss": 0.9708, + "step": 112550 + }, + { + "epoch": 0.7191137574588248, + "grad_norm": 0.772317111492157, + "learning_rate": 7.13718861273587e-05, + "loss": 0.8922, + "step": 112560 + }, + { + "epoch": 0.7191776446085635, + "grad_norm": 1.1389001607894897, + "learning_rate": 7.136734980773066e-05, + "loss": 0.6469, + "step": 
112570 + }, + { + "epoch": 0.7192415317583022, + "grad_norm": 0.9356949329376221, + "learning_rate": 7.136281327291491e-05, + "loss": 0.8607, + "step": 112580 + }, + { + "epoch": 0.7193054189080408, + "grad_norm": 0.8384791016578674, + "learning_rate": 7.135827652295715e-05, + "loss": 0.8469, + "step": 112590 + }, + { + "epoch": 0.7193693060577795, + "grad_norm": 1.2862927913665771, + "learning_rate": 7.135373955790308e-05, + "loss": 0.8742, + "step": 112600 + }, + { + "epoch": 0.7194331932075182, + "grad_norm": 0.7916562557220459, + "learning_rate": 7.134920237779837e-05, + "loss": 0.9424, + "step": 112610 + }, + { + "epoch": 0.7194970803572569, + "grad_norm": 1.3547780513763428, + "learning_rate": 7.134466498268872e-05, + "loss": 0.9223, + "step": 112620 + }, + { + "epoch": 0.7195609675069956, + "grad_norm": 0.8931356072425842, + "learning_rate": 7.134012737261985e-05, + "loss": 0.9001, + "step": 112630 + }, + { + "epoch": 0.7196248546567343, + "grad_norm": 1.0962817668914795, + "learning_rate": 7.133558954763741e-05, + "loss": 0.6841, + "step": 112640 + }, + { + "epoch": 0.719688741806473, + "grad_norm": 0.8664074540138245, + "learning_rate": 7.133105150778714e-05, + "loss": 0.84, + "step": 112650 + }, + { + "epoch": 0.7197526289562117, + "grad_norm": 0.7376250624656677, + "learning_rate": 7.132651325311472e-05, + "loss": 1.0094, + "step": 112660 + }, + { + "epoch": 0.7198165161059504, + "grad_norm": 0.8420968055725098, + "learning_rate": 7.132197478366587e-05, + "loss": 1.052, + "step": 112670 + }, + { + "epoch": 0.7198804032556891, + "grad_norm": 1.0015677213668823, + "learning_rate": 7.131743609948628e-05, + "loss": 0.9265, + "step": 112680 + }, + { + "epoch": 0.7199442904054278, + "grad_norm": 0.7853860259056091, + "learning_rate": 7.131289720062167e-05, + "loss": 0.8856, + "step": 112690 + }, + { + "epoch": 0.7200081775551666, + "grad_norm": 1.2557804584503174, + "learning_rate": 7.130835808711773e-05, + "loss": 0.975, + "step": 112700 + }, + { + "epoch": 
0.7200720647049053, + "grad_norm": 0.8270767331123352, + "learning_rate": 7.130381875902021e-05, + "loss": 1.0227, + "step": 112710 + }, + { + "epoch": 0.720135951854644, + "grad_norm": 1.5142698287963867, + "learning_rate": 7.12992792163748e-05, + "loss": 0.9033, + "step": 112720 + }, + { + "epoch": 0.7201998390043827, + "grad_norm": 0.9564334750175476, + "learning_rate": 7.129473945922722e-05, + "loss": 0.6761, + "step": 112730 + }, + { + "epoch": 0.7202637261541214, + "grad_norm": 0.5844874382019043, + "learning_rate": 7.129019948762319e-05, + "loss": 0.7452, + "step": 112740 + }, + { + "epoch": 0.7203276133038601, + "grad_norm": 0.7936009764671326, + "learning_rate": 7.128565930160844e-05, + "loss": 0.8532, + "step": 112750 + }, + { + "epoch": 0.7203915004535988, + "grad_norm": 1.0036197900772095, + "learning_rate": 7.128111890122868e-05, + "loss": 0.6713, + "step": 112760 + }, + { + "epoch": 0.7204553876033375, + "grad_norm": 0.6564218997955322, + "learning_rate": 7.127657828652964e-05, + "loss": 0.8136, + "step": 112770 + }, + { + "epoch": 0.7205192747530762, + "grad_norm": 0.7329919338226318, + "learning_rate": 7.127203745755705e-05, + "loss": 0.9779, + "step": 112780 + }, + { + "epoch": 0.7205831619028149, + "grad_norm": 0.9217239022254944, + "learning_rate": 7.126749641435664e-05, + "loss": 1.0803, + "step": 112790 + }, + { + "epoch": 0.7206470490525536, + "grad_norm": 1.523088812828064, + "learning_rate": 7.126295515697414e-05, + "loss": 0.8893, + "step": 112800 + }, + { + "epoch": 0.7207109362022923, + "grad_norm": 1.220182180404663, + "learning_rate": 7.125841368545529e-05, + "loss": 0.7783, + "step": 112810 + }, + { + "epoch": 0.720774823352031, + "grad_norm": 0.7349340319633484, + "learning_rate": 7.125387199984583e-05, + "loss": 0.8759, + "step": 112820 + }, + { + "epoch": 0.7208387105017696, + "grad_norm": 1.1350910663604736, + "learning_rate": 7.124933010019148e-05, + "loss": 0.8982, + "step": 112830 + }, + { + "epoch": 0.7209025976515083, + 
"grad_norm": 1.2227561473846436, + "learning_rate": 7.124478798653801e-05, + "loss": 0.6953, + "step": 112840 + }, + { + "epoch": 0.720966484801247, + "grad_norm": 1.2927758693695068, + "learning_rate": 7.124024565893112e-05, + "loss": 0.8957, + "step": 112850 + }, + { + "epoch": 0.7210303719509857, + "grad_norm": 0.8586512804031372, + "learning_rate": 7.12357031174166e-05, + "loss": 1.1933, + "step": 112860 + }, + { + "epoch": 0.7210942591007244, + "grad_norm": 1.5274994373321533, + "learning_rate": 7.123116036204017e-05, + "loss": 0.8434, + "step": 112870 + }, + { + "epoch": 0.7211581462504632, + "grad_norm": 0.8376038670539856, + "learning_rate": 7.122661739284759e-05, + "loss": 0.7802, + "step": 112880 + }, + { + "epoch": 0.7212220334002019, + "grad_norm": 0.9995211362838745, + "learning_rate": 7.122207420988462e-05, + "loss": 0.7681, + "step": 112890 + }, + { + "epoch": 0.7212859205499406, + "grad_norm": 1.5679831504821777, + "learning_rate": 7.121753081319699e-05, + "loss": 0.9598, + "step": 112900 + }, + { + "epoch": 0.7213498076996793, + "grad_norm": 1.7187330722808838, + "learning_rate": 7.121298720283048e-05, + "loss": 1.0863, + "step": 112910 + }, + { + "epoch": 0.721413694849418, + "grad_norm": 0.8918151259422302, + "learning_rate": 7.120844337883082e-05, + "loss": 1.0098, + "step": 112920 + }, + { + "epoch": 0.7214775819991567, + "grad_norm": 0.8101955056190491, + "learning_rate": 7.120389934124379e-05, + "loss": 1.092, + "step": 112930 + }, + { + "epoch": 0.7215414691488954, + "grad_norm": 0.7652488946914673, + "learning_rate": 7.119935509011516e-05, + "loss": 0.8215, + "step": 112940 + }, + { + "epoch": 0.7216053562986341, + "grad_norm": 1.972680687904358, + "learning_rate": 7.119481062549067e-05, + "loss": 0.8574, + "step": 112950 + }, + { + "epoch": 0.7216692434483728, + "grad_norm": 0.8050053715705872, + "learning_rate": 7.11902659474161e-05, + "loss": 1.347, + "step": 112960 + }, + { + "epoch": 0.7217331305981115, + "grad_norm": 
1.1817753314971924, + "learning_rate": 7.118572105593725e-05, + "loss": 0.7859, + "step": 112970 + }, + { + "epoch": 0.7217970177478502, + "grad_norm": 0.73277747631073, + "learning_rate": 7.118117595109984e-05, + "loss": 0.8999, + "step": 112980 + }, + { + "epoch": 0.7218609048975889, + "grad_norm": 0.7769888639450073, + "learning_rate": 7.117663063294965e-05, + "loss": 0.9172, + "step": 112990 + }, + { + "epoch": 0.7219247920473276, + "grad_norm": 1.4832031726837158, + "learning_rate": 7.117253966426993e-05, + "loss": 1.0567, + "step": 113000 + }, + { + "epoch": 0.7219886791970663, + "grad_norm": 0.9582386612892151, + "learning_rate": 7.11679939409516e-05, + "loss": 0.6723, + "step": 113010 + }, + { + "epoch": 0.722052566346805, + "grad_norm": 0.8009851574897766, + "learning_rate": 7.116344800445327e-05, + "loss": 0.8508, + "step": 113020 + }, + { + "epoch": 0.7221164534965437, + "grad_norm": 0.7712252736091614, + "learning_rate": 7.115890185482071e-05, + "loss": 0.7957, + "step": 113030 + }, + { + "epoch": 0.7221803406462824, + "grad_norm": 0.7053341865539551, + "learning_rate": 7.11543554920997e-05, + "loss": 0.8246, + "step": 113040 + }, + { + "epoch": 0.7222442277960212, + "grad_norm": 0.8619422912597656, + "learning_rate": 7.114980891633602e-05, + "loss": 1.0908, + "step": 113050 + }, + { + "epoch": 0.7223081149457599, + "grad_norm": 0.6670997738838196, + "learning_rate": 7.114526212757549e-05, + "loss": 0.8958, + "step": 113060 + }, + { + "epoch": 0.7223720020954986, + "grad_norm": 0.7335458397865295, + "learning_rate": 7.114071512586385e-05, + "loss": 1.0648, + "step": 113070 + }, + { + "epoch": 0.7224358892452372, + "grad_norm": 1.3357338905334473, + "learning_rate": 7.113616791124694e-05, + "loss": 0.7619, + "step": 113080 + }, + { + "epoch": 0.7224997763949759, + "grad_norm": 0.960849404335022, + "learning_rate": 7.113162048377053e-05, + "loss": 0.9608, + "step": 113090 + }, + { + "epoch": 0.7225636635447146, + "grad_norm": 0.47955894470214844, + 
"learning_rate": 7.112707284348042e-05, + "loss": 0.7415, + "step": 113100 + }, + { + "epoch": 0.7226275506944533, + "grad_norm": 0.6127536296844482, + "learning_rate": 7.11225249904224e-05, + "loss": 0.8654, + "step": 113110 + }, + { + "epoch": 0.722691437844192, + "grad_norm": 0.9961467385292053, + "learning_rate": 7.11179769246423e-05, + "loss": 0.8188, + "step": 113120 + }, + { + "epoch": 0.7227553249939307, + "grad_norm": 0.8931620121002197, + "learning_rate": 7.11134286461859e-05, + "loss": 0.7527, + "step": 113130 + }, + { + "epoch": 0.7228192121436694, + "grad_norm": 1.2289701700210571, + "learning_rate": 7.1108880155099e-05, + "loss": 0.7977, + "step": 113140 + }, + { + "epoch": 0.7228830992934081, + "grad_norm": 0.7578348517417908, + "learning_rate": 7.110433145142741e-05, + "loss": 1.0557, + "step": 113150 + }, + { + "epoch": 0.7229469864431468, + "grad_norm": 0.5664851069450378, + "learning_rate": 7.109978253521694e-05, + "loss": 0.6821, + "step": 113160 + }, + { + "epoch": 0.7230108735928855, + "grad_norm": 1.7400668859481812, + "learning_rate": 7.109523340651342e-05, + "loss": 0.993, + "step": 113170 + }, + { + "epoch": 0.7230747607426242, + "grad_norm": 0.9611324071884155, + "learning_rate": 7.109068406536265e-05, + "loss": 0.9319, + "step": 113180 + }, + { + "epoch": 0.7231386478923629, + "grad_norm": 0.6872121691703796, + "learning_rate": 7.108613451181043e-05, + "loss": 1.1291, + "step": 113190 + }, + { + "epoch": 0.7232025350421016, + "grad_norm": 0.8210169076919556, + "learning_rate": 7.108158474590261e-05, + "loss": 0.8069, + "step": 113200 + }, + { + "epoch": 0.7232664221918403, + "grad_norm": 1.7412443161010742, + "learning_rate": 7.107703476768497e-05, + "loss": 0.8441, + "step": 113210 + }, + { + "epoch": 0.723330309341579, + "grad_norm": 0.7331858277320862, + "learning_rate": 7.107248457720337e-05, + "loss": 0.7081, + "step": 113220 + }, + { + "epoch": 0.7233941964913178, + "grad_norm": 1.0155048370361328, + "learning_rate": 
7.106793417450362e-05, + "loss": 0.8548, + "step": 113230 + }, + { + "epoch": 0.7234580836410565, + "grad_norm": 0.6159857511520386, + "learning_rate": 7.106338355963155e-05, + "loss": 0.7694, + "step": 113240 + }, + { + "epoch": 0.7235219707907952, + "grad_norm": 0.9287189841270447, + "learning_rate": 7.105883273263298e-05, + "loss": 0.965, + "step": 113250 + }, + { + "epoch": 0.7235858579405339, + "grad_norm": 0.7032301425933838, + "learning_rate": 7.105428169355375e-05, + "loss": 0.7095, + "step": 113260 + }, + { + "epoch": 0.7236497450902726, + "grad_norm": 0.7181684970855713, + "learning_rate": 7.104973044243969e-05, + "loss": 1.1339, + "step": 113270 + }, + { + "epoch": 0.7237136322400113, + "grad_norm": 0.703015148639679, + "learning_rate": 7.104517897933662e-05, + "loss": 0.8453, + "step": 113280 + }, + { + "epoch": 0.72377751938975, + "grad_norm": 2.4534735679626465, + "learning_rate": 7.10406273042904e-05, + "loss": 0.8108, + "step": 113290 + }, + { + "epoch": 0.7238414065394887, + "grad_norm": 0.7906631231307983, + "learning_rate": 7.103607541734688e-05, + "loss": 1.0326, + "step": 113300 + }, + { + "epoch": 0.7239052936892274, + "grad_norm": 0.8502821922302246, + "learning_rate": 7.103152331855187e-05, + "loss": 0.8985, + "step": 113310 + }, + { + "epoch": 0.723969180838966, + "grad_norm": 1.1090575456619263, + "learning_rate": 7.102697100795122e-05, + "loss": 0.8659, + "step": 113320 + }, + { + "epoch": 0.7240330679887047, + "grad_norm": 1.2656605243682861, + "learning_rate": 7.102241848559077e-05, + "loss": 0.7826, + "step": 113330 + }, + { + "epoch": 0.7240969551384434, + "grad_norm": 0.6625283360481262, + "learning_rate": 7.101786575151639e-05, + "loss": 0.7639, + "step": 113340 + }, + { + "epoch": 0.7241608422881821, + "grad_norm": 0.988913357257843, + "learning_rate": 7.101331280577392e-05, + "loss": 1.0447, + "step": 113350 + }, + { + "epoch": 0.7242247294379208, + "grad_norm": 1.8037471771240234, + "learning_rate": 7.100875964840922e-05, + 
"loss": 1.0335, + "step": 113360 + }, + { + "epoch": 0.7242886165876595, + "grad_norm": 0.8089826107025146, + "learning_rate": 7.100420627946812e-05, + "loss": 0.8345, + "step": 113370 + }, + { + "epoch": 0.7243525037373982, + "grad_norm": 0.894549548625946, + "learning_rate": 7.099965269899648e-05, + "loss": 0.9204, + "step": 113380 + }, + { + "epoch": 0.7244163908871369, + "grad_norm": 0.7039246559143066, + "learning_rate": 7.099509890704019e-05, + "loss": 0.6761, + "step": 113390 + }, + { + "epoch": 0.7244802780368756, + "grad_norm": 0.907172679901123, + "learning_rate": 7.09905449036451e-05, + "loss": 0.6746, + "step": 113400 + }, + { + "epoch": 0.7245441651866144, + "grad_norm": 0.6476519107818604, + "learning_rate": 7.098599068885704e-05, + "loss": 0.9781, + "step": 113410 + }, + { + "epoch": 0.7246080523363531, + "grad_norm": 0.5977177023887634, + "learning_rate": 7.098143626272192e-05, + "loss": 0.856, + "step": 113420 + }, + { + "epoch": 0.7246719394860918, + "grad_norm": 1.3149287700653076, + "learning_rate": 7.097688162528556e-05, + "loss": 0.7815, + "step": 113430 + }, + { + "epoch": 0.7247358266358305, + "grad_norm": 0.7405048608779907, + "learning_rate": 7.097232677659387e-05, + "loss": 0.895, + "step": 113440 + }, + { + "epoch": 0.7247997137855692, + "grad_norm": 0.8446990847587585, + "learning_rate": 7.09677717166927e-05, + "loss": 0.6864, + "step": 113450 + }, + { + "epoch": 0.7248636009353079, + "grad_norm": 0.761848509311676, + "learning_rate": 7.096321644562793e-05, + "loss": 0.8818, + "step": 113460 + }, + { + "epoch": 0.7249274880850466, + "grad_norm": 0.8329225778579712, + "learning_rate": 7.095866096344544e-05, + "loss": 0.8544, + "step": 113470 + }, + { + "epoch": 0.7249913752347853, + "grad_norm": 1.2276092767715454, + "learning_rate": 7.095410527019111e-05, + "loss": 0.8486, + "step": 113480 + }, + { + "epoch": 0.725055262384524, + "grad_norm": 1.3161121606826782, + "learning_rate": 7.094954936591081e-05, + "loss": 0.7856, + "step": 
113490 + }, + { + "epoch": 0.7251191495342627, + "grad_norm": 1.220802664756775, + "learning_rate": 7.09449932506504e-05, + "loss": 0.8293, + "step": 113500 + }, + { + "epoch": 0.7251830366840014, + "grad_norm": 1.623045802116394, + "learning_rate": 7.094043692445581e-05, + "loss": 1.4337, + "step": 113510 + }, + { + "epoch": 0.7252469238337401, + "grad_norm": 1.979931116104126, + "learning_rate": 7.09358803873729e-05, + "loss": 0.7465, + "step": 113520 + }, + { + "epoch": 0.7253108109834788, + "grad_norm": 1.0702683925628662, + "learning_rate": 7.093132363944756e-05, + "loss": 0.7137, + "step": 113530 + }, + { + "epoch": 0.7253746981332175, + "grad_norm": 0.940356969833374, + "learning_rate": 7.092676668072569e-05, + "loss": 0.9097, + "step": 113540 + }, + { + "epoch": 0.7254385852829562, + "grad_norm": 0.7708202004432678, + "learning_rate": 7.092220951125315e-05, + "loss": 1.0073, + "step": 113550 + }, + { + "epoch": 0.7255024724326948, + "grad_norm": 1.922377347946167, + "learning_rate": 7.091765213107589e-05, + "loss": 0.77, + "step": 113560 + }, + { + "epoch": 0.7255663595824335, + "grad_norm": 0.671876847743988, + "learning_rate": 7.091309454023976e-05, + "loss": 0.6341, + "step": 113570 + }, + { + "epoch": 0.7256302467321722, + "grad_norm": 0.9415796995162964, + "learning_rate": 7.090853673879068e-05, + "loss": 0.6898, + "step": 113580 + }, + { + "epoch": 0.725694133881911, + "grad_norm": 0.5804014205932617, + "learning_rate": 7.090397872677455e-05, + "loss": 0.8133, + "step": 113590 + }, + { + "epoch": 0.7257580210316497, + "grad_norm": 1.0379526615142822, + "learning_rate": 7.089942050423725e-05, + "loss": 0.855, + "step": 113600 + }, + { + "epoch": 0.7258219081813884, + "grad_norm": 0.6411370635032654, + "learning_rate": 7.089486207122474e-05, + "loss": 0.7423, + "step": 113610 + }, + { + "epoch": 0.7258857953311271, + "grad_norm": 2.031083106994629, + "learning_rate": 7.089030342778288e-05, + "loss": 0.7442, + "step": 113620 + }, + { + "epoch": 
0.7259496824808658, + "grad_norm": 0.9208039045333862, + "learning_rate": 7.088574457395758e-05, + "loss": 0.8881, + "step": 113630 + }, + { + "epoch": 0.7260135696306045, + "grad_norm": 1.0437136888504028, + "learning_rate": 7.088118550979477e-05, + "loss": 1.015, + "step": 113640 + }, + { + "epoch": 0.7260774567803432, + "grad_norm": 1.4243861436843872, + "learning_rate": 7.087662623534036e-05, + "loss": 0.7728, + "step": 113650 + }, + { + "epoch": 0.7261413439300819, + "grad_norm": 1.2670698165893555, + "learning_rate": 7.087206675064026e-05, + "loss": 0.9942, + "step": 113660 + }, + { + "epoch": 0.7262052310798206, + "grad_norm": 0.9617191553115845, + "learning_rate": 7.086750705574038e-05, + "loss": 1.0011, + "step": 113670 + }, + { + "epoch": 0.7262691182295593, + "grad_norm": 1.161468505859375, + "learning_rate": 7.086294715068667e-05, + "loss": 1.22, + "step": 113680 + }, + { + "epoch": 0.726333005379298, + "grad_norm": 1.2413885593414307, + "learning_rate": 7.085838703552503e-05, + "loss": 0.9355, + "step": 113690 + }, + { + "epoch": 0.7263968925290367, + "grad_norm": 0.607776403427124, + "learning_rate": 7.085382671030138e-05, + "loss": 1.0288, + "step": 113700 + }, + { + "epoch": 0.7264607796787754, + "grad_norm": 1.1839098930358887, + "learning_rate": 7.084926617506166e-05, + "loss": 1.0052, + "step": 113710 + }, + { + "epoch": 0.7265246668285141, + "grad_norm": 0.5576828718185425, + "learning_rate": 7.084470542985178e-05, + "loss": 1.1483, + "step": 113720 + }, + { + "epoch": 0.7265885539782528, + "grad_norm": 0.6610636711120605, + "learning_rate": 7.084014447471769e-05, + "loss": 0.777, + "step": 113730 + }, + { + "epoch": 0.7266524411279915, + "grad_norm": 0.726379930973053, + "learning_rate": 7.083558330970532e-05, + "loss": 0.7009, + "step": 113740 + }, + { + "epoch": 0.7267163282777303, + "grad_norm": 0.8821578621864319, + "learning_rate": 7.083102193486058e-05, + "loss": 1.1292, + "step": 113750 + }, + { + "epoch": 0.726780215427469, + 
"grad_norm": 0.8552307486534119, + "learning_rate": 7.082646035022946e-05, + "loss": 0.936, + "step": 113760 + }, + { + "epoch": 0.7268441025772077, + "grad_norm": 0.9332167506217957, + "learning_rate": 7.082189855585784e-05, + "loss": 0.9092, + "step": 113770 + }, + { + "epoch": 0.7269079897269464, + "grad_norm": 0.7814182043075562, + "learning_rate": 7.081733655179171e-05, + "loss": 1.0203, + "step": 113780 + }, + { + "epoch": 0.7269718768766851, + "grad_norm": 1.0449836254119873, + "learning_rate": 7.081277433807697e-05, + "loss": 0.8905, + "step": 113790 + }, + { + "epoch": 0.7270357640264237, + "grad_norm": 0.9948442578315735, + "learning_rate": 7.080821191475962e-05, + "loss": 0.8614, + "step": 113800 + }, + { + "epoch": 0.7270996511761624, + "grad_norm": 0.6735957860946655, + "learning_rate": 7.080364928188555e-05, + "loss": 0.7434, + "step": 113810 + }, + { + "epoch": 0.7271635383259011, + "grad_norm": 1.021897554397583, + "learning_rate": 7.079908643950072e-05, + "loss": 0.8579, + "step": 113820 + }, + { + "epoch": 0.7272274254756398, + "grad_norm": 0.7740781903266907, + "learning_rate": 7.079452338765112e-05, + "loss": 0.9425, + "step": 113830 + }, + { + "epoch": 0.7272913126253785, + "grad_norm": 0.5648607015609741, + "learning_rate": 7.078996012638268e-05, + "loss": 0.8408, + "step": 113840 + }, + { + "epoch": 0.7273551997751172, + "grad_norm": 0.5744165182113647, + "learning_rate": 7.078539665574135e-05, + "loss": 0.8827, + "step": 113850 + }, + { + "epoch": 0.7274190869248559, + "grad_norm": 0.5075027346611023, + "learning_rate": 7.07808329757731e-05, + "loss": 0.741, + "step": 113860 + }, + { + "epoch": 0.7274829740745946, + "grad_norm": 2.1645233631134033, + "learning_rate": 7.077626908652387e-05, + "loss": 0.8138, + "step": 113870 + }, + { + "epoch": 0.7275468612243333, + "grad_norm": 1.198243498802185, + "learning_rate": 7.077170498803964e-05, + "loss": 1.0056, + "step": 113880 + }, + { + "epoch": 0.727610748374072, + "grad_norm": 
0.8382686376571655, + "learning_rate": 7.076714068036639e-05, + "loss": 0.8976, + "step": 113890 + }, + { + "epoch": 0.7276746355238107, + "grad_norm": 0.751120388507843, + "learning_rate": 7.076257616355003e-05, + "loss": 0.918, + "step": 113900 + }, + { + "epoch": 0.7277385226735494, + "grad_norm": 0.7622794508934021, + "learning_rate": 7.075801143763658e-05, + "loss": 1.125, + "step": 113910 + }, + { + "epoch": 0.7278024098232881, + "grad_norm": 0.7073959112167358, + "learning_rate": 7.0753446502672e-05, + "loss": 0.8341, + "step": 113920 + }, + { + "epoch": 0.7278662969730268, + "grad_norm": 0.6458455324172974, + "learning_rate": 7.074888135870227e-05, + "loss": 1.1074, + "step": 113930 + }, + { + "epoch": 0.7279301841227656, + "grad_norm": 0.837853729724884, + "learning_rate": 7.074431600577335e-05, + "loss": 0.8365, + "step": 113940 + }, + { + "epoch": 0.7279940712725043, + "grad_norm": 0.8658714890480042, + "learning_rate": 7.073975044393121e-05, + "loss": 0.9752, + "step": 113950 + }, + { + "epoch": 0.728057958422243, + "grad_norm": 1.0802479982376099, + "learning_rate": 7.073518467322186e-05, + "loss": 0.7144, + "step": 113960 + }, + { + "epoch": 0.7281218455719817, + "grad_norm": 0.8560570478439331, + "learning_rate": 7.073061869369124e-05, + "loss": 0.6664, + "step": 113970 + }, + { + "epoch": 0.7281857327217204, + "grad_norm": 0.9239840507507324, + "learning_rate": 7.072605250538536e-05, + "loss": 1.1588, + "step": 113980 + }, + { + "epoch": 0.7282496198714591, + "grad_norm": 0.7781822085380554, + "learning_rate": 7.07214861083502e-05, + "loss": 1.1213, + "step": 113990 + }, + { + "epoch": 0.7283135070211978, + "grad_norm": 0.6489850282669067, + "learning_rate": 7.071691950263177e-05, + "loss": 0.9027, + "step": 114000 + }, + { + "epoch": 0.7283773941709365, + "grad_norm": 0.8504793047904968, + "learning_rate": 7.071235268827601e-05, + "loss": 1.1383, + "step": 114010 + }, + { + "epoch": 0.7284412813206752, + "grad_norm": 0.9675794243812561, + 
"learning_rate": 7.070778566532896e-05, + "loss": 1.1451, + "step": 114020 + }, + { + "epoch": 0.7285051684704139, + "grad_norm": 0.9038040041923523, + "learning_rate": 7.070321843383659e-05, + "loss": 0.9998, + "step": 114030 + }, + { + "epoch": 0.7285690556201526, + "grad_norm": 0.6558981537818909, + "learning_rate": 7.06986509938449e-05, + "loss": 0.8363, + "step": 114040 + }, + { + "epoch": 0.7286329427698912, + "grad_norm": 0.9019342660903931, + "learning_rate": 7.069408334539987e-05, + "loss": 0.992, + "step": 114050 + }, + { + "epoch": 0.7286968299196299, + "grad_norm": 0.7221403121948242, + "learning_rate": 7.068951548854755e-05, + "loss": 0.8516, + "step": 114060 + }, + { + "epoch": 0.7287607170693686, + "grad_norm": 1.507217288017273, + "learning_rate": 7.068494742333388e-05, + "loss": 0.7527, + "step": 114070 + }, + { + "epoch": 0.7288246042191073, + "grad_norm": 0.8516684770584106, + "learning_rate": 7.06803791498049e-05, + "loss": 0.6731, + "step": 114080 + }, + { + "epoch": 0.728888491368846, + "grad_norm": 1.2646251916885376, + "learning_rate": 7.067581066800661e-05, + "loss": 0.8718, + "step": 114090 + }, + { + "epoch": 0.7289523785185847, + "grad_norm": 0.7315905094146729, + "learning_rate": 7.067124197798504e-05, + "loss": 1.0181, + "step": 114100 + }, + { + "epoch": 0.7290162656683234, + "grad_norm": 1.3411294221878052, + "learning_rate": 7.066667307978617e-05, + "loss": 0.8343, + "step": 114110 + }, + { + "epoch": 0.7290801528180622, + "grad_norm": 1.737226963043213, + "learning_rate": 7.0662103973456e-05, + "loss": 0.9154, + "step": 114120 + }, + { + "epoch": 0.7291440399678009, + "grad_norm": 0.9302464127540588, + "learning_rate": 7.065753465904059e-05, + "loss": 0.7609, + "step": 114130 + }, + { + "epoch": 0.7292079271175396, + "grad_norm": 0.7784197926521301, + "learning_rate": 7.065296513658594e-05, + "loss": 1.1499, + "step": 114140 + }, + { + "epoch": 0.7292718142672783, + "grad_norm": 1.0966049432754517, + "learning_rate": 
7.064839540613805e-05, + "loss": 0.8757, + "step": 114150 + }, + { + "epoch": 0.729335701417017, + "grad_norm": 0.7327684760093689, + "learning_rate": 7.064382546774297e-05, + "loss": 0.8503, + "step": 114160 + }, + { + "epoch": 0.7293995885667557, + "grad_norm": 0.817319393157959, + "learning_rate": 7.063925532144668e-05, + "loss": 1.0078, + "step": 114170 + }, + { + "epoch": 0.7294634757164944, + "grad_norm": 0.5275333523750305, + "learning_rate": 7.063468496729526e-05, + "loss": 1.1289, + "step": 114180 + }, + { + "epoch": 0.7295273628662331, + "grad_norm": 0.9112656712532043, + "learning_rate": 7.06301144053347e-05, + "loss": 0.9515, + "step": 114190 + }, + { + "epoch": 0.7295912500159718, + "grad_norm": 0.7202227711677551, + "learning_rate": 7.062554363561105e-05, + "loss": 1.0629, + "step": 114200 + }, + { + "epoch": 0.7296551371657105, + "grad_norm": 0.6695742011070251, + "learning_rate": 7.062097265817031e-05, + "loss": 0.8514, + "step": 114210 + }, + { + "epoch": 0.7297190243154492, + "grad_norm": 1.3881930112838745, + "learning_rate": 7.061640147305856e-05, + "loss": 0.8752, + "step": 114220 + }, + { + "epoch": 0.7297829114651879, + "grad_norm": 0.9712892770767212, + "learning_rate": 7.06118300803218e-05, + "loss": 0.8757, + "step": 114230 + }, + { + "epoch": 0.7298467986149266, + "grad_norm": 0.7274371981620789, + "learning_rate": 7.060725848000607e-05, + "loss": 0.8209, + "step": 114240 + }, + { + "epoch": 0.7299106857646653, + "grad_norm": 1.0749263763427734, + "learning_rate": 7.060268667215743e-05, + "loss": 1.059, + "step": 114250 + }, + { + "epoch": 0.729974572914404, + "grad_norm": 0.828329861164093, + "learning_rate": 7.059811465682192e-05, + "loss": 1.3196, + "step": 114260 + }, + { + "epoch": 0.7300384600641427, + "grad_norm": 1.0756471157073975, + "learning_rate": 7.059354243404555e-05, + "loss": 0.9988, + "step": 114270 + }, + { + "epoch": 0.7301023472138815, + "grad_norm": 1.016433835029602, + "learning_rate": 7.05889700038744e-05, + "loss": 
0.6839, + "step": 114280 + }, + { + "epoch": 0.73016623436362, + "grad_norm": 0.9101114273071289, + "learning_rate": 7.058439736635454e-05, + "loss": 1.0044, + "step": 114290 + }, + { + "epoch": 0.7302301215133588, + "grad_norm": 1.0833832025527954, + "learning_rate": 7.057982452153196e-05, + "loss": 1.0722, + "step": 114300 + }, + { + "epoch": 0.7302940086630975, + "grad_norm": 1.2768323421478271, + "learning_rate": 7.057525146945276e-05, + "loss": 1.0266, + "step": 114310 + }, + { + "epoch": 0.7303578958128362, + "grad_norm": 0.776053786277771, + "learning_rate": 7.057067821016297e-05, + "loss": 0.9476, + "step": 114320 + }, + { + "epoch": 0.7304217829625749, + "grad_norm": 0.6913490295410156, + "learning_rate": 7.056610474370865e-05, + "loss": 0.8089, + "step": 114330 + }, + { + "epoch": 0.7304856701123136, + "grad_norm": 0.6486290693283081, + "learning_rate": 7.056153107013588e-05, + "loss": 0.813, + "step": 114340 + }, + { + "epoch": 0.7305495572620523, + "grad_norm": 0.7323374152183533, + "learning_rate": 7.05569571894907e-05, + "loss": 0.9397, + "step": 114350 + }, + { + "epoch": 0.730613444411791, + "grad_norm": 1.247586727142334, + "learning_rate": 7.055238310181915e-05, + "loss": 0.8101, + "step": 114360 + }, + { + "epoch": 0.7306773315615297, + "grad_norm": 1.2062445878982544, + "learning_rate": 7.054780880716733e-05, + "loss": 0.7138, + "step": 114370 + }, + { + "epoch": 0.7307412187112684, + "grad_norm": 2.156268358230591, + "learning_rate": 7.054323430558132e-05, + "loss": 0.8173, + "step": 114380 + }, + { + "epoch": 0.7308051058610071, + "grad_norm": 1.4800580739974976, + "learning_rate": 7.053865959710717e-05, + "loss": 0.8313, + "step": 114390 + }, + { + "epoch": 0.7308689930107458, + "grad_norm": 0.7692814469337463, + "learning_rate": 7.053408468179093e-05, + "loss": 0.7696, + "step": 114400 + }, + { + "epoch": 0.7309328801604845, + "grad_norm": 1.617647409439087, + "learning_rate": 7.052950955967869e-05, + "loss": 0.8307, + "step": 114410 + }, + 
{ + "epoch": 0.7309967673102232, + "grad_norm": 1.1893336772918701, + "learning_rate": 7.052493423081655e-05, + "loss": 0.8407, + "step": 114420 + }, + { + "epoch": 0.7310606544599619, + "grad_norm": 1.0456907749176025, + "learning_rate": 7.052035869525053e-05, + "loss": 1.0971, + "step": 114430 + }, + { + "epoch": 0.7311245416097006, + "grad_norm": 0.8813536167144775, + "learning_rate": 7.051578295302676e-05, + "loss": 0.9145, + "step": 114440 + }, + { + "epoch": 0.7311884287594393, + "grad_norm": 0.9212775826454163, + "learning_rate": 7.051120700419131e-05, + "loss": 0.9651, + "step": 114450 + }, + { + "epoch": 0.731252315909178, + "grad_norm": 0.7198718786239624, + "learning_rate": 7.050663084879027e-05, + "loss": 0.844, + "step": 114460 + }, + { + "epoch": 0.7313162030589168, + "grad_norm": 0.91295325756073, + "learning_rate": 7.050205448686971e-05, + "loss": 0.7403, + "step": 114470 + }, + { + "epoch": 0.7313800902086555, + "grad_norm": 1.0438035726547241, + "learning_rate": 7.049747791847574e-05, + "loss": 0.893, + "step": 114480 + }, + { + "epoch": 0.7314439773583942, + "grad_norm": 0.6840182542800903, + "learning_rate": 7.049290114365441e-05, + "loss": 0.7915, + "step": 114490 + }, + { + "epoch": 0.7315078645081329, + "grad_norm": 0.7132487893104553, + "learning_rate": 7.048832416245185e-05, + "loss": 1.0361, + "step": 114500 + }, + { + "epoch": 0.7315717516578716, + "grad_norm": 1.2934064865112305, + "learning_rate": 7.048374697491414e-05, + "loss": 0.9494, + "step": 114510 + }, + { + "epoch": 0.7316356388076103, + "grad_norm": 1.7626994848251343, + "learning_rate": 7.047916958108737e-05, + "loss": 1.0967, + "step": 114520 + }, + { + "epoch": 0.7316995259573489, + "grad_norm": 1.077722191810608, + "learning_rate": 7.047459198101766e-05, + "loss": 0.8701, + "step": 114530 + }, + { + "epoch": 0.7317634131070876, + "grad_norm": 0.6629199981689453, + "learning_rate": 7.047001417475109e-05, + "loss": 0.9575, + "step": 114540 + }, + { + "epoch": 
0.7318273002568263, + "grad_norm": 0.9705789089202881, + "learning_rate": 7.046543616233376e-05, + "loss": 0.6918, + "step": 114550 + }, + { + "epoch": 0.731891187406565, + "grad_norm": 0.897789478302002, + "learning_rate": 7.046085794381179e-05, + "loss": 0.9453, + "step": 114560 + }, + { + "epoch": 0.7319550745563037, + "grad_norm": 0.7040274143218994, + "learning_rate": 7.045627951923127e-05, + "loss": 0.9701, + "step": 114570 + }, + { + "epoch": 0.7320189617060424, + "grad_norm": 1.2493051290512085, + "learning_rate": 7.045170088863834e-05, + "loss": 1.0576, + "step": 114580 + }, + { + "epoch": 0.7320828488557811, + "grad_norm": 1.6483665704727173, + "learning_rate": 7.044712205207907e-05, + "loss": 0.9102, + "step": 114590 + }, + { + "epoch": 0.7321467360055198, + "grad_norm": 0.8068165183067322, + "learning_rate": 7.044254300959958e-05, + "loss": 0.9693, + "step": 114600 + }, + { + "epoch": 0.7322106231552585, + "grad_norm": 1.0065369606018066, + "learning_rate": 7.043796376124602e-05, + "loss": 1.0352, + "step": 114610 + }, + { + "epoch": 0.7322745103049972, + "grad_norm": 0.5376549959182739, + "learning_rate": 7.043338430706448e-05, + "loss": 1.0502, + "step": 114620 + }, + { + "epoch": 0.732338397454736, + "grad_norm": 0.7183248400688171, + "learning_rate": 7.042880464710106e-05, + "loss": 0.6899, + "step": 114630 + }, + { + "epoch": 0.7324022846044747, + "grad_norm": 0.9804075956344604, + "learning_rate": 7.042422478140194e-05, + "loss": 0.9902, + "step": 114640 + }, + { + "epoch": 0.7324661717542134, + "grad_norm": 1.1558316946029663, + "learning_rate": 7.041964471001318e-05, + "loss": 0.8413, + "step": 114650 + }, + { + "epoch": 0.7325300589039521, + "grad_norm": 0.6205730438232422, + "learning_rate": 7.041506443298093e-05, + "loss": 0.8944, + "step": 114660 + }, + { + "epoch": 0.7325939460536908, + "grad_norm": 1.1276192665100098, + "learning_rate": 7.041048395035135e-05, + "loss": 0.7895, + "step": 114670 + }, + { + "epoch": 0.7326578332034295, + 
"grad_norm": 0.9537439942359924, + "learning_rate": 7.040590326217052e-05, + "loss": 0.875, + "step": 114680 + }, + { + "epoch": 0.7327217203531682, + "grad_norm": 0.7265706062316895, + "learning_rate": 7.040132236848457e-05, + "loss": 1.1706, + "step": 114690 + }, + { + "epoch": 0.7327856075029069, + "grad_norm": 0.9442006945610046, + "learning_rate": 7.039674126933969e-05, + "loss": 0.9093, + "step": 114700 + }, + { + "epoch": 0.7328494946526456, + "grad_norm": 1.7052466869354248, + "learning_rate": 7.039215996478195e-05, + "loss": 0.7726, + "step": 114710 + }, + { + "epoch": 0.7329133818023843, + "grad_norm": 0.691369891166687, + "learning_rate": 7.038757845485754e-05, + "loss": 0.7376, + "step": 114720 + }, + { + "epoch": 0.732977268952123, + "grad_norm": 1.4131755828857422, + "learning_rate": 7.038299673961258e-05, + "loss": 1.1266, + "step": 114730 + }, + { + "epoch": 0.7330411561018617, + "grad_norm": 1.0800511837005615, + "learning_rate": 7.037841481909319e-05, + "loss": 0.8857, + "step": 114740 + }, + { + "epoch": 0.7331050432516004, + "grad_norm": 0.9274218678474426, + "learning_rate": 7.037383269334555e-05, + "loss": 0.9862, + "step": 114750 + }, + { + "epoch": 0.7331689304013391, + "grad_norm": 1.05172860622406, + "learning_rate": 7.036925036241578e-05, + "loss": 1.024, + "step": 114760 + }, + { + "epoch": 0.7332328175510778, + "grad_norm": 1.1139484643936157, + "learning_rate": 7.036466782635003e-05, + "loss": 0.7455, + "step": 114770 + }, + { + "epoch": 0.7332967047008164, + "grad_norm": 0.9190326929092407, + "learning_rate": 7.036008508519446e-05, + "loss": 0.7979, + "step": 114780 + }, + { + "epoch": 0.7333605918505551, + "grad_norm": 0.7148388028144836, + "learning_rate": 7.03555021389952e-05, + "loss": 0.8942, + "step": 114790 + }, + { + "epoch": 0.7334244790002938, + "grad_norm": 1.5679404735565186, + "learning_rate": 7.035091898779846e-05, + "loss": 0.7687, + "step": 114800 + }, + { + "epoch": 0.7334883661500325, + "grad_norm": 
1.5291404724121094, + "learning_rate": 7.034633563165034e-05, + "loss": 0.7391, + "step": 114810 + }, + { + "epoch": 0.7335522532997713, + "grad_norm": 1.019393801689148, + "learning_rate": 7.034175207059704e-05, + "loss": 0.7867, + "step": 114820 + }, + { + "epoch": 0.73361614044951, + "grad_norm": 0.5086638331413269, + "learning_rate": 7.033716830468467e-05, + "loss": 0.8308, + "step": 114830 + }, + { + "epoch": 0.7336800275992487, + "grad_norm": 0.7575461268424988, + "learning_rate": 7.033258433395944e-05, + "loss": 0.6507, + "step": 114840 + }, + { + "epoch": 0.7337439147489874, + "grad_norm": 1.0149903297424316, + "learning_rate": 7.032800015846749e-05, + "loss": 0.9033, + "step": 114850 + }, + { + "epoch": 0.7338078018987261, + "grad_norm": 1.2490330934524536, + "learning_rate": 7.032341577825499e-05, + "loss": 0.7165, + "step": 114860 + }, + { + "epoch": 0.7338716890484648, + "grad_norm": 0.9309149384498596, + "learning_rate": 7.031883119336811e-05, + "loss": 0.7779, + "step": 114870 + }, + { + "epoch": 0.7339355761982035, + "grad_norm": 1.0880928039550781, + "learning_rate": 7.031424640385303e-05, + "loss": 0.8678, + "step": 114880 + }, + { + "epoch": 0.7339994633479422, + "grad_norm": 1.2819446325302124, + "learning_rate": 7.03096614097559e-05, + "loss": 0.9925, + "step": 114890 + }, + { + "epoch": 0.7340633504976809, + "grad_norm": 0.588641345500946, + "learning_rate": 7.030507621112293e-05, + "loss": 0.6344, + "step": 114900 + }, + { + "epoch": 0.7341272376474196, + "grad_norm": 0.7878732085227966, + "learning_rate": 7.030049080800025e-05, + "loss": 0.7343, + "step": 114910 + }, + { + "epoch": 0.7341911247971583, + "grad_norm": 1.4143778085708618, + "learning_rate": 7.029590520043409e-05, + "loss": 0.7973, + "step": 114920 + }, + { + "epoch": 0.734255011946897, + "grad_norm": 2.662449598312378, + "learning_rate": 7.02913193884706e-05, + "loss": 0.8654, + "step": 114930 + }, + { + "epoch": 0.7343188990966357, + "grad_norm": 0.9117518663406372, + 
"learning_rate": 7.028673337215596e-05, + "loss": 0.8811, + "step": 114940 + }, + { + "epoch": 0.7343827862463744, + "grad_norm": 1.0859434604644775, + "learning_rate": 7.028214715153636e-05, + "loss": 0.8603, + "step": 114950 + }, + { + "epoch": 0.7344466733961131, + "grad_norm": 0.9327182173728943, + "learning_rate": 7.027756072665798e-05, + "loss": 1.0731, + "step": 114960 + }, + { + "epoch": 0.7345105605458518, + "grad_norm": 0.760019838809967, + "learning_rate": 7.027297409756706e-05, + "loss": 0.8329, + "step": 114970 + }, + { + "epoch": 0.7345744476955905, + "grad_norm": 1.3158267736434937, + "learning_rate": 7.026838726430972e-05, + "loss": 1.0469, + "step": 114980 + }, + { + "epoch": 0.7346383348453293, + "grad_norm": 0.855215311050415, + "learning_rate": 7.026380022693219e-05, + "loss": 1.0875, + "step": 114990 + }, + { + "epoch": 0.734702221995068, + "grad_norm": 0.7679759860038757, + "learning_rate": 7.025921298548069e-05, + "loss": 0.7983, + "step": 115000 + }, + { + "epoch": 0.7347661091448067, + "grad_norm": 0.7040578722953796, + "learning_rate": 7.025462554000136e-05, + "loss": 0.8196, + "step": 115010 + }, + { + "epoch": 0.7348299962945453, + "grad_norm": 0.7212196588516235, + "learning_rate": 7.025003789054044e-05, + "loss": 0.9989, + "step": 115020 + }, + { + "epoch": 0.734893883444284, + "grad_norm": 0.892850935459137, + "learning_rate": 7.024545003714411e-05, + "loss": 1.2155, + "step": 115030 + }, + { + "epoch": 0.7349577705940227, + "grad_norm": 0.9270761013031006, + "learning_rate": 7.02408619798586e-05, + "loss": 0.8714, + "step": 115040 + }, + { + "epoch": 0.7350216577437614, + "grad_norm": 0.827022910118103, + "learning_rate": 7.023627371873008e-05, + "loss": 0.7664, + "step": 115050 + }, + { + "epoch": 0.7350855448935001, + "grad_norm": 0.9258151650428772, + "learning_rate": 7.023168525380479e-05, + "loss": 0.8016, + "step": 115060 + }, + { + "epoch": 0.7351494320432388, + "grad_norm": 0.732083797454834, + "learning_rate": 
7.022709658512892e-05, + "loss": 0.7217, + "step": 115070 + }, + { + "epoch": 0.7352133191929775, + "grad_norm": 1.1232346296310425, + "learning_rate": 7.02225077127487e-05, + "loss": 0.9083, + "step": 115080 + }, + { + "epoch": 0.7352772063427162, + "grad_norm": 0.9207686185836792, + "learning_rate": 7.021791863671032e-05, + "loss": 0.9492, + "step": 115090 + }, + { + "epoch": 0.7353410934924549, + "grad_norm": 1.1235870122909546, + "learning_rate": 7.021332935706e-05, + "loss": 1.0061, + "step": 115100 + }, + { + "epoch": 0.7354049806421936, + "grad_norm": 0.5921577215194702, + "learning_rate": 7.020873987384398e-05, + "loss": 1.0309, + "step": 115110 + }, + { + "epoch": 0.7354688677919323, + "grad_norm": 0.9022099375724792, + "learning_rate": 7.020415018710846e-05, + "loss": 0.956, + "step": 115120 + }, + { + "epoch": 0.735532754941671, + "grad_norm": 0.7757014036178589, + "learning_rate": 7.019956029689968e-05, + "loss": 1.2207, + "step": 115130 + }, + { + "epoch": 0.7355966420914097, + "grad_norm": 0.7293660640716553, + "learning_rate": 7.019497020326384e-05, + "loss": 0.8932, + "step": 115140 + }, + { + "epoch": 0.7356605292411484, + "grad_norm": 0.7669858932495117, + "learning_rate": 7.019037990624718e-05, + "loss": 0.8368, + "step": 115150 + }, + { + "epoch": 0.7357244163908871, + "grad_norm": 1.5827726125717163, + "learning_rate": 7.018578940589592e-05, + "loss": 0.7371, + "step": 115160 + }, + { + "epoch": 0.7357883035406259, + "grad_norm": 1.0692715644836426, + "learning_rate": 7.018119870225632e-05, + "loss": 0.9331, + "step": 115170 + }, + { + "epoch": 0.7358521906903646, + "grad_norm": 1.1816247701644897, + "learning_rate": 7.017660779537458e-05, + "loss": 0.7824, + "step": 115180 + }, + { + "epoch": 0.7359160778401033, + "grad_norm": 1.0669434070587158, + "learning_rate": 7.017201668529695e-05, + "loss": 0.8516, + "step": 115190 + }, + { + "epoch": 0.735979964989842, + "grad_norm": 1.2087671756744385, + "learning_rate": 7.016742537206965e-05, + 
"loss": 1.0443, + "step": 115200 + }, + { + "epoch": 0.7360438521395807, + "grad_norm": 0.8520811200141907, + "learning_rate": 7.016283385573893e-05, + "loss": 0.7544, + "step": 115210 + }, + { + "epoch": 0.7361077392893194, + "grad_norm": 0.6800863146781921, + "learning_rate": 7.015824213635104e-05, + "loss": 1.0469, + "step": 115220 + }, + { + "epoch": 0.7361716264390581, + "grad_norm": 1.1106778383255005, + "learning_rate": 7.01536502139522e-05, + "loss": 1.0884, + "step": 115230 + }, + { + "epoch": 0.7362355135887968, + "grad_norm": 1.0716586112976074, + "learning_rate": 7.014905808858868e-05, + "loss": 0.8972, + "step": 115240 + }, + { + "epoch": 0.7362994007385355, + "grad_norm": 0.961650550365448, + "learning_rate": 7.01444657603067e-05, + "loss": 0.7001, + "step": 115250 + }, + { + "epoch": 0.7363632878882741, + "grad_norm": 0.7025936245918274, + "learning_rate": 7.013987322915252e-05, + "loss": 0.9839, + "step": 115260 + }, + { + "epoch": 0.7364271750380128, + "grad_norm": 0.7885773181915283, + "learning_rate": 7.013528049517241e-05, + "loss": 0.8556, + "step": 115270 + }, + { + "epoch": 0.7364910621877515, + "grad_norm": 0.9782485365867615, + "learning_rate": 7.013068755841258e-05, + "loss": 0.7966, + "step": 115280 + }, + { + "epoch": 0.7365549493374902, + "grad_norm": 1.2593889236450195, + "learning_rate": 7.012609441891934e-05, + "loss": 0.7664, + "step": 115290 + }, + { + "epoch": 0.7366188364872289, + "grad_norm": 0.7124470472335815, + "learning_rate": 7.01215010767389e-05, + "loss": 1.2127, + "step": 115300 + }, + { + "epoch": 0.7366827236369676, + "grad_norm": 1.3822424411773682, + "learning_rate": 7.011690753191754e-05, + "loss": 1.2461, + "step": 115310 + }, + { + "epoch": 0.7367466107867063, + "grad_norm": 1.3084379434585571, + "learning_rate": 7.011231378450152e-05, + "loss": 0.7628, + "step": 115320 + }, + { + "epoch": 0.736810497936445, + "grad_norm": 1.2158416509628296, + "learning_rate": 7.01077198345371e-05, + "loss": 0.7993, + "step": 
115330 + }, + { + "epoch": 0.7368743850861837, + "grad_norm": 1.6002458333969116, + "learning_rate": 7.010312568207055e-05, + "loss": 0.9804, + "step": 115340 + }, + { + "epoch": 0.7369382722359225, + "grad_norm": 1.1068108081817627, + "learning_rate": 7.009853132714812e-05, + "loss": 0.7037, + "step": 115350 + }, + { + "epoch": 0.7370021593856612, + "grad_norm": 0.4763220548629761, + "learning_rate": 7.00939367698161e-05, + "loss": 0.6586, + "step": 115360 + }, + { + "epoch": 0.7370660465353999, + "grad_norm": 0.9020888805389404, + "learning_rate": 7.008934201012076e-05, + "loss": 1.0595, + "step": 115370 + }, + { + "epoch": 0.7371299336851386, + "grad_norm": 1.2904754877090454, + "learning_rate": 7.008474704810835e-05, + "loss": 0.8614, + "step": 115380 + }, + { + "epoch": 0.7371938208348773, + "grad_norm": 1.1419029235839844, + "learning_rate": 7.008015188382517e-05, + "loss": 0.7458, + "step": 115390 + }, + { + "epoch": 0.737257707984616, + "grad_norm": 0.887784481048584, + "learning_rate": 7.00755565173175e-05, + "loss": 0.7534, + "step": 115400 + }, + { + "epoch": 0.7373215951343547, + "grad_norm": 0.8305013179779053, + "learning_rate": 7.007096094863159e-05, + "loss": 0.9282, + "step": 115410 + }, + { + "epoch": 0.7373854822840934, + "grad_norm": 0.9823849201202393, + "learning_rate": 7.006636517781376e-05, + "loss": 1.0846, + "step": 115420 + }, + { + "epoch": 0.7374493694338321, + "grad_norm": 1.4131437540054321, + "learning_rate": 7.006176920491025e-05, + "loss": 0.7773, + "step": 115430 + }, + { + "epoch": 0.7375132565835708, + "grad_norm": 0.7678616642951965, + "learning_rate": 7.005717302996739e-05, + "loss": 1.084, + "step": 115440 + }, + { + "epoch": 0.7375771437333095, + "grad_norm": 0.655906617641449, + "learning_rate": 7.005257665303142e-05, + "loss": 0.9624, + "step": 115450 + }, + { + "epoch": 0.7376410308830482, + "grad_norm": 0.9372738003730774, + "learning_rate": 7.004798007414867e-05, + "loss": 0.8082, + "step": 115460 + }, + { + "epoch": 
0.7377049180327869, + "grad_norm": 1.036291480064392, + "learning_rate": 7.004338329336541e-05, + "loss": 1.0762, + "step": 115470 + }, + { + "epoch": 0.7377688051825256, + "grad_norm": 0.8642080426216125, + "learning_rate": 7.003878631072794e-05, + "loss": 0.9366, + "step": 115480 + }, + { + "epoch": 0.7378326923322643, + "grad_norm": 1.0046378374099731, + "learning_rate": 7.003418912628257e-05, + "loss": 0.9384, + "step": 115490 + }, + { + "epoch": 0.7378965794820029, + "grad_norm": 0.8233842849731445, + "learning_rate": 7.002959174007558e-05, + "loss": 0.6712, + "step": 115500 + }, + { + "epoch": 0.7379604666317416, + "grad_norm": 1.354340672492981, + "learning_rate": 7.002499415215325e-05, + "loss": 0.9442, + "step": 115510 + }, + { + "epoch": 0.7380243537814803, + "grad_norm": 1.4941532611846924, + "learning_rate": 7.002039636256192e-05, + "loss": 0.7647, + "step": 115520 + }, + { + "epoch": 0.738088240931219, + "grad_norm": 1.0750895738601685, + "learning_rate": 7.001579837134789e-05, + "loss": 1.0533, + "step": 115530 + }, + { + "epoch": 0.7381521280809578, + "grad_norm": 1.1024094820022583, + "learning_rate": 7.001120017855745e-05, + "loss": 0.959, + "step": 115540 + }, + { + "epoch": 0.7382160152306965, + "grad_norm": 1.5375107526779175, + "learning_rate": 7.000660178423691e-05, + "loss": 0.9801, + "step": 115550 + }, + { + "epoch": 0.7382799023804352, + "grad_norm": 0.8644910454750061, + "learning_rate": 7.000200318843258e-05, + "loss": 0.9112, + "step": 115560 + }, + { + "epoch": 0.7383437895301739, + "grad_norm": 0.8466353416442871, + "learning_rate": 6.999740439119078e-05, + "loss": 1.028, + "step": 115570 + }, + { + "epoch": 0.7384076766799126, + "grad_norm": 0.8331221342086792, + "learning_rate": 6.99928053925578e-05, + "loss": 0.7816, + "step": 115580 + }, + { + "epoch": 0.7384715638296513, + "grad_norm": 1.330575704574585, + "learning_rate": 6.998820619257999e-05, + "loss": 1.1076, + "step": 115590 + }, + { + "epoch": 0.73853545097939, + 
"grad_norm": 1.0668553113937378, + "learning_rate": 6.998360679130364e-05, + "loss": 0.7225, + "step": 115600 + }, + { + "epoch": 0.7385993381291287, + "grad_norm": 2.099946975708008, + "learning_rate": 6.997900718877509e-05, + "loss": 0.8623, + "step": 115610 + }, + { + "epoch": 0.7386632252788674, + "grad_norm": 1.1908918619155884, + "learning_rate": 6.997440738504065e-05, + "loss": 0.9463, + "step": 115620 + }, + { + "epoch": 0.7387271124286061, + "grad_norm": 0.8096728920936584, + "learning_rate": 6.996980738014665e-05, + "loss": 0.7725, + "step": 115630 + }, + { + "epoch": 0.7387909995783448, + "grad_norm": 0.7145434617996216, + "learning_rate": 6.996520717413939e-05, + "loss": 0.9579, + "step": 115640 + }, + { + "epoch": 0.7388548867280835, + "grad_norm": 2.0472443103790283, + "learning_rate": 6.996060676706525e-05, + "loss": 1.1305, + "step": 115650 + }, + { + "epoch": 0.7389187738778222, + "grad_norm": 0.7645730376243591, + "learning_rate": 6.995600615897052e-05, + "loss": 1.1646, + "step": 115660 + }, + { + "epoch": 0.7389826610275609, + "grad_norm": 0.8512725234031677, + "learning_rate": 6.995140534990155e-05, + "loss": 0.9598, + "step": 115670 + }, + { + "epoch": 0.7390465481772996, + "grad_norm": 1.5269567966461182, + "learning_rate": 6.994680433990466e-05, + "loss": 0.6954, + "step": 115680 + }, + { + "epoch": 0.7391104353270384, + "grad_norm": 1.0922000408172607, + "learning_rate": 6.99422031290262e-05, + "loss": 0.6855, + "step": 115690 + }, + { + "epoch": 0.7391743224767771, + "grad_norm": 0.9279537796974182, + "learning_rate": 6.993760171731251e-05, + "loss": 1.0426, + "step": 115700 + }, + { + "epoch": 0.7392382096265158, + "grad_norm": 1.6063908338546753, + "learning_rate": 6.993300010480991e-05, + "loss": 0.779, + "step": 115710 + }, + { + "epoch": 0.7393020967762545, + "grad_norm": 2.612882614135742, + "learning_rate": 6.992839829156475e-05, + "loss": 0.9478, + "step": 115720 + }, + { + "epoch": 0.7393659839259932, + "grad_norm": 
1.1300225257873535, + "learning_rate": 6.992379627762339e-05, + "loss": 0.8534, + "step": 115730 + }, + { + "epoch": 0.7394298710757319, + "grad_norm": 0.7757554650306702, + "learning_rate": 6.991919406303216e-05, + "loss": 1.0868, + "step": 115740 + }, + { + "epoch": 0.7394937582254705, + "grad_norm": 0.7098391652107239, + "learning_rate": 6.991459164783741e-05, + "loss": 0.8308, + "step": 115750 + }, + { + "epoch": 0.7395576453752092, + "grad_norm": 0.7299323678016663, + "learning_rate": 6.99099890320855e-05, + "loss": 0.9169, + "step": 115760 + }, + { + "epoch": 0.7396215325249479, + "grad_norm": 0.8165879249572754, + "learning_rate": 6.990538621582278e-05, + "loss": 0.8462, + "step": 115770 + }, + { + "epoch": 0.7396854196746866, + "grad_norm": 0.8184595108032227, + "learning_rate": 6.990078319909559e-05, + "loss": 1.0554, + "step": 115780 + }, + { + "epoch": 0.7397493068244253, + "grad_norm": 0.7778357863426208, + "learning_rate": 6.989617998195032e-05, + "loss": 0.9903, + "step": 115790 + }, + { + "epoch": 0.739813193974164, + "grad_norm": 0.9583641290664673, + "learning_rate": 6.989157656443327e-05, + "loss": 0.9248, + "step": 115800 + }, + { + "epoch": 0.7398770811239027, + "grad_norm": 1.5333011150360107, + "learning_rate": 6.988697294659085e-05, + "loss": 0.9296, + "step": 115810 + }, + { + "epoch": 0.7399409682736414, + "grad_norm": 0.9242226481437683, + "learning_rate": 6.98823691284694e-05, + "loss": 0.9196, + "step": 115820 + }, + { + "epoch": 0.7400048554233801, + "grad_norm": 1.562110185623169, + "learning_rate": 6.98777651101153e-05, + "loss": 0.8195, + "step": 115830 + }, + { + "epoch": 0.7400687425731188, + "grad_norm": 0.8381466865539551, + "learning_rate": 6.987316089157492e-05, + "loss": 0.757, + "step": 115840 + }, + { + "epoch": 0.7401326297228575, + "grad_norm": 0.7180655002593994, + "learning_rate": 6.986855647289461e-05, + "loss": 0.7828, + "step": 115850 + }, + { + "epoch": 0.7401965168725962, + "grad_norm": 1.6034409999847412, + 
"learning_rate": 6.986395185412073e-05, + "loss": 1.4035, + "step": 115860 + }, + { + "epoch": 0.740260404022335, + "grad_norm": 1.0359718799591064, + "learning_rate": 6.985934703529969e-05, + "loss": 0.8757, + "step": 115870 + }, + { + "epoch": 0.7403242911720737, + "grad_norm": 0.6328555345535278, + "learning_rate": 6.985474201647784e-05, + "loss": 0.6099, + "step": 115880 + }, + { + "epoch": 0.7403881783218124, + "grad_norm": 0.6946358680725098, + "learning_rate": 6.985013679770156e-05, + "loss": 0.8574, + "step": 115890 + }, + { + "epoch": 0.7404520654715511, + "grad_norm": 1.2363545894622803, + "learning_rate": 6.984553137901722e-05, + "loss": 0.8605, + "step": 115900 + }, + { + "epoch": 0.7405159526212898, + "grad_norm": 1.1174249649047852, + "learning_rate": 6.984092576047123e-05, + "loss": 0.7995, + "step": 115910 + }, + { + "epoch": 0.7405798397710285, + "grad_norm": 0.9834713339805603, + "learning_rate": 6.983631994210994e-05, + "loss": 1.31, + "step": 115920 + }, + { + "epoch": 0.7406437269207672, + "grad_norm": 1.397002935409546, + "learning_rate": 6.983171392397975e-05, + "loss": 0.8187, + "step": 115930 + }, + { + "epoch": 0.7407076140705059, + "grad_norm": 1.3270031213760376, + "learning_rate": 6.982710770612704e-05, + "loss": 1.112, + "step": 115940 + }, + { + "epoch": 0.7407715012202446, + "grad_norm": 0.6648245453834534, + "learning_rate": 6.98225012885982e-05, + "loss": 0.8693, + "step": 115950 + }, + { + "epoch": 0.7408353883699833, + "grad_norm": 0.7026130557060242, + "learning_rate": 6.981789467143965e-05, + "loss": 0.8299, + "step": 115960 + }, + { + "epoch": 0.740899275519722, + "grad_norm": 0.9298969507217407, + "learning_rate": 6.981328785469772e-05, + "loss": 0.8988, + "step": 115970 + }, + { + "epoch": 0.7409631626694607, + "grad_norm": 0.921608567237854, + "learning_rate": 6.980868083841887e-05, + "loss": 0.7417, + "step": 115980 + }, + { + "epoch": 0.7410270498191993, + "grad_norm": 0.960408091545105, + "learning_rate": 
6.980407362264945e-05, + "loss": 0.8521, + "step": 115990 + }, + { + "epoch": 0.741090936968938, + "grad_norm": 0.6170063018798828, + "learning_rate": 6.979946620743587e-05, + "loss": 0.8537, + "step": 116000 + }, + { + "epoch": 0.7411548241186767, + "grad_norm": 1.5411709547042847, + "learning_rate": 6.979485859282453e-05, + "loss": 1.0145, + "step": 116010 + }, + { + "epoch": 0.7412187112684154, + "grad_norm": 1.529691457748413, + "learning_rate": 6.979025077886185e-05, + "loss": 0.8084, + "step": 116020 + }, + { + "epoch": 0.7412825984181541, + "grad_norm": 1.5334669351577759, + "learning_rate": 6.978564276559423e-05, + "loss": 0.6795, + "step": 116030 + }, + { + "epoch": 0.7413464855678928, + "grad_norm": 0.7906901240348816, + "learning_rate": 6.978103455306808e-05, + "loss": 0.8419, + "step": 116040 + }, + { + "epoch": 0.7414103727176315, + "grad_norm": 1.2242873907089233, + "learning_rate": 6.977642614132979e-05, + "loss": 0.8119, + "step": 116050 + }, + { + "epoch": 0.7414742598673703, + "grad_norm": 0.705220639705658, + "learning_rate": 6.977181753042577e-05, + "loss": 1.2762, + "step": 116060 + }, + { + "epoch": 0.741538147017109, + "grad_norm": 0.8187665343284607, + "learning_rate": 6.976720872040245e-05, + "loss": 0.8109, + "step": 116070 + }, + { + "epoch": 0.7416020341668477, + "grad_norm": 1.5236612558364868, + "learning_rate": 6.976259971130624e-05, + "loss": 0.9235, + "step": 116080 + }, + { + "epoch": 0.7416659213165864, + "grad_norm": 3.65586519241333, + "learning_rate": 6.975799050318355e-05, + "loss": 0.8398, + "step": 116090 + }, + { + "epoch": 0.7417298084663251, + "grad_norm": 0.6140323877334595, + "learning_rate": 6.97533810960808e-05, + "loss": 0.9414, + "step": 116100 + }, + { + "epoch": 0.7417936956160638, + "grad_norm": 0.6180686950683594, + "learning_rate": 6.974877149004441e-05, + "loss": 0.8651, + "step": 116110 + }, + { + "epoch": 0.7418575827658025, + "grad_norm": 0.8425277471542358, + "learning_rate": 6.97441616851208e-05, + 
"loss": 1.0168, + "step": 116120 + }, + { + "epoch": 0.7419214699155412, + "grad_norm": 0.7344639897346497, + "learning_rate": 6.973955168135642e-05, + "loss": 0.6021, + "step": 116130 + }, + { + "epoch": 0.7419853570652799, + "grad_norm": 0.8152681589126587, + "learning_rate": 6.973494147879767e-05, + "loss": 0.9441, + "step": 116140 + }, + { + "epoch": 0.7420492442150186, + "grad_norm": 0.810077428817749, + "learning_rate": 6.973033107749098e-05, + "loss": 0.7308, + "step": 116150 + }, + { + "epoch": 0.7421131313647573, + "grad_norm": 0.8788096308708191, + "learning_rate": 6.972572047748281e-05, + "loss": 0.7761, + "step": 116160 + }, + { + "epoch": 0.742177018514496, + "grad_norm": 1.0663613080978394, + "learning_rate": 6.972110967881953e-05, + "loss": 1.0319, + "step": 116170 + }, + { + "epoch": 0.7422409056642347, + "grad_norm": 1.0700383186340332, + "learning_rate": 6.971649868154764e-05, + "loss": 0.7867, + "step": 116180 + }, + { + "epoch": 0.7423047928139734, + "grad_norm": 1.0407042503356934, + "learning_rate": 6.971188748571355e-05, + "loss": 1.0002, + "step": 116190 + }, + { + "epoch": 0.7423686799637121, + "grad_norm": 0.9793998599052429, + "learning_rate": 6.97072760913637e-05, + "loss": 1.1765, + "step": 116200 + }, + { + "epoch": 0.7424325671134508, + "grad_norm": 1.643677830696106, + "learning_rate": 6.970266449854452e-05, + "loss": 0.9574, + "step": 116210 + }, + { + "epoch": 0.7424964542631896, + "grad_norm": 0.7675092220306396, + "learning_rate": 6.969805270730248e-05, + "loss": 0.7953, + "step": 116220 + }, + { + "epoch": 0.7425603414129281, + "grad_norm": 1.0452982187271118, + "learning_rate": 6.969344071768398e-05, + "loss": 0.9938, + "step": 116230 + }, + { + "epoch": 0.7426242285626669, + "grad_norm": 1.2801587581634521, + "learning_rate": 6.968882852973553e-05, + "loss": 0.8495, + "step": 116240 + }, + { + "epoch": 0.7426881157124056, + "grad_norm": 0.7818521857261658, + "learning_rate": 6.968421614350352e-05, + "loss": 0.9984, + "step": 
116250 + }, + { + "epoch": 0.7427520028621443, + "grad_norm": 1.2182539701461792, + "learning_rate": 6.967960355903442e-05, + "loss": 0.8587, + "step": 116260 + }, + { + "epoch": 0.742815890011883, + "grad_norm": 0.7654426097869873, + "learning_rate": 6.96749907763747e-05, + "loss": 0.8998, + "step": 116270 + }, + { + "epoch": 0.7428797771616217, + "grad_norm": 4.754692554473877, + "learning_rate": 6.967037779557082e-05, + "loss": 0.8934, + "step": 116280 + }, + { + "epoch": 0.7429436643113604, + "grad_norm": 0.9145668745040894, + "learning_rate": 6.966576461666919e-05, + "loss": 1.082, + "step": 116290 + }, + { + "epoch": 0.7430075514610991, + "grad_norm": 0.6794173717498779, + "learning_rate": 6.96611512397163e-05, + "loss": 0.82, + "step": 116300 + }, + { + "epoch": 0.7430714386108378, + "grad_norm": 1.012447714805603, + "learning_rate": 6.965653766475862e-05, + "loss": 0.9966, + "step": 116310 + }, + { + "epoch": 0.7431353257605765, + "grad_norm": 0.8482702374458313, + "learning_rate": 6.96519238918426e-05, + "loss": 0.8475, + "step": 116320 + }, + { + "epoch": 0.7431992129103152, + "grad_norm": 0.8447071313858032, + "learning_rate": 6.964730992101468e-05, + "loss": 0.9462, + "step": 116330 + }, + { + "epoch": 0.7432631000600539, + "grad_norm": 0.6218491792678833, + "learning_rate": 6.964269575232138e-05, + "loss": 0.8291, + "step": 116340 + }, + { + "epoch": 0.7433269872097926, + "grad_norm": 1.0346623659133911, + "learning_rate": 6.963808138580912e-05, + "loss": 0.9003, + "step": 116350 + }, + { + "epoch": 0.7433908743595313, + "grad_norm": 0.7092266082763672, + "learning_rate": 6.96334668215244e-05, + "loss": 0.906, + "step": 116360 + }, + { + "epoch": 0.74345476150927, + "grad_norm": 2.0959596633911133, + "learning_rate": 6.962885205951369e-05, + "loss": 1.2469, + "step": 116370 + }, + { + "epoch": 0.7435186486590087, + "grad_norm": 0.5212934613227844, + "learning_rate": 6.962423709982345e-05, + "loss": 0.6491, + "step": 116380 + }, + { + "epoch": 
0.7435825358087474, + "grad_norm": 0.7486282587051392, + "learning_rate": 6.961962194250017e-05, + "loss": 0.8276, + "step": 116390 + }, + { + "epoch": 0.7436464229584862, + "grad_norm": 0.8907299041748047, + "learning_rate": 6.961500658759033e-05, + "loss": 0.8261, + "step": 116400 + }, + { + "epoch": 0.7437103101082249, + "grad_norm": 1.0088621377944946, + "learning_rate": 6.961039103514039e-05, + "loss": 0.8501, + "step": 116410 + }, + { + "epoch": 0.7437741972579636, + "grad_norm": 0.852279007434845, + "learning_rate": 6.960577528519685e-05, + "loss": 1.0004, + "step": 116420 + }, + { + "epoch": 0.7438380844077023, + "grad_norm": 0.6919979453086853, + "learning_rate": 6.96011593378062e-05, + "loss": 0.9841, + "step": 116430 + }, + { + "epoch": 0.743901971557441, + "grad_norm": 1.0231778621673584, + "learning_rate": 6.959654319301492e-05, + "loss": 0.9452, + "step": 116440 + }, + { + "epoch": 0.7439658587071797, + "grad_norm": 1.6192152500152588, + "learning_rate": 6.959238849396364e-05, + "loss": 0.8734, + "step": 116450 + }, + { + "epoch": 0.7440297458569184, + "grad_norm": 1.1661548614501953, + "learning_rate": 6.958777197423922e-05, + "loss": 0.8547, + "step": 116460 + }, + { + "epoch": 0.7440936330066571, + "grad_norm": 1.9804883003234863, + "learning_rate": 6.958315525724901e-05, + "loss": 0.8883, + "step": 116470 + }, + { + "epoch": 0.7441575201563957, + "grad_norm": 0.9184843897819519, + "learning_rate": 6.957853834303946e-05, + "loss": 1.0803, + "step": 116480 + }, + { + "epoch": 0.7442214073061344, + "grad_norm": 0.9524401426315308, + "learning_rate": 6.957392123165711e-05, + "loss": 0.823, + "step": 116490 + }, + { + "epoch": 0.7442852944558731, + "grad_norm": 0.7346342206001282, + "learning_rate": 6.956930392314845e-05, + "loss": 0.842, + "step": 116500 + }, + { + "epoch": 0.7443491816056118, + "grad_norm": 0.6176126003265381, + "learning_rate": 6.956468641755994e-05, + "loss": 0.824, + "step": 116510 + }, + { + "epoch": 0.7444130687553505, + 
"grad_norm": 1.0610926151275635, + "learning_rate": 6.956006871493814e-05, + "loss": 0.8781, + "step": 116520 + }, + { + "epoch": 0.7444769559050892, + "grad_norm": 1.190373420715332, + "learning_rate": 6.95554508153295e-05, + "loss": 0.9477, + "step": 116530 + }, + { + "epoch": 0.7445408430548279, + "grad_norm": 1.2164260149002075, + "learning_rate": 6.955083271878056e-05, + "loss": 0.7542, + "step": 116540 + }, + { + "epoch": 0.7446047302045666, + "grad_norm": 0.9188566207885742, + "learning_rate": 6.954621442533784e-05, + "loss": 0.9441, + "step": 116550 + }, + { + "epoch": 0.7446686173543053, + "grad_norm": 1.2796574831008911, + "learning_rate": 6.954159593504781e-05, + "loss": 0.738, + "step": 116560 + }, + { + "epoch": 0.744732504504044, + "grad_norm": 0.8466264605522156, + "learning_rate": 6.953697724795702e-05, + "loss": 1.0705, + "step": 116570 + }, + { + "epoch": 0.7447963916537828, + "grad_norm": 0.9667829275131226, + "learning_rate": 6.953235836411194e-05, + "loss": 0.9622, + "step": 116580 + }, + { + "epoch": 0.7448602788035215, + "grad_norm": 3.007852792739868, + "learning_rate": 6.952773928355913e-05, + "loss": 0.8599, + "step": 116590 + }, + { + "epoch": 0.7449241659532602, + "grad_norm": 0.6320720314979553, + "learning_rate": 6.95231200063451e-05, + "loss": 0.9257, + "step": 116600 + }, + { + "epoch": 0.7449880531029989, + "grad_norm": 0.8152862787246704, + "learning_rate": 6.951850053251636e-05, + "loss": 0.7299, + "step": 116610 + }, + { + "epoch": 0.7450519402527376, + "grad_norm": 0.709783673286438, + "learning_rate": 6.951388086211943e-05, + "loss": 0.8396, + "step": 116620 + }, + { + "epoch": 0.7451158274024763, + "grad_norm": 0.844637393951416, + "learning_rate": 6.950926099520084e-05, + "loss": 0.5487, + "step": 116630 + }, + { + "epoch": 0.745179714552215, + "grad_norm": 1.2991611957550049, + "learning_rate": 6.95046409318071e-05, + "loss": 0.8171, + "step": 116640 + }, + { + "epoch": 0.7452436017019537, + "grad_norm": 1.085801124572754, + 
"learning_rate": 6.950002067198475e-05, + "loss": 0.9065, + "step": 116650 + }, + { + "epoch": 0.7453074888516924, + "grad_norm": 0.8042502999305725, + "learning_rate": 6.949540021578034e-05, + "loss": 0.9323, + "step": 116660 + }, + { + "epoch": 0.7453713760014311, + "grad_norm": 0.6853629946708679, + "learning_rate": 6.949077956324038e-05, + "loss": 0.8285, + "step": 116670 + }, + { + "epoch": 0.7454352631511698, + "grad_norm": 0.7794731259346008, + "learning_rate": 6.94861587144114e-05, + "loss": 0.9293, + "step": 116680 + }, + { + "epoch": 0.7454991503009085, + "grad_norm": 0.9132157564163208, + "learning_rate": 6.948153766933995e-05, + "loss": 1.0174, + "step": 116690 + }, + { + "epoch": 0.7455630374506472, + "grad_norm": 1.2286196947097778, + "learning_rate": 6.947691642807256e-05, + "loss": 1.0193, + "step": 116700 + }, + { + "epoch": 0.7456269246003859, + "grad_norm": 0.5221213102340698, + "learning_rate": 6.947229499065578e-05, + "loss": 0.7766, + "step": 116710 + }, + { + "epoch": 0.7456908117501245, + "grad_norm": 1.4980177879333496, + "learning_rate": 6.946767335713613e-05, + "loss": 0.8005, + "step": 116720 + }, + { + "epoch": 0.7457546988998632, + "grad_norm": 0.8484123349189758, + "learning_rate": 6.946305152756017e-05, + "loss": 0.6168, + "step": 116730 + }, + { + "epoch": 0.7458185860496019, + "grad_norm": 0.9975723028182983, + "learning_rate": 6.945842950197446e-05, + "loss": 0.9931, + "step": 116740 + }, + { + "epoch": 0.7458824731993406, + "grad_norm": 1.0813270807266235, + "learning_rate": 6.945380728042549e-05, + "loss": 0.8407, + "step": 116750 + }, + { + "epoch": 0.7459463603490794, + "grad_norm": 1.2126818895339966, + "learning_rate": 6.944918486295989e-05, + "loss": 0.8995, + "step": 116760 + }, + { + "epoch": 0.7460102474988181, + "grad_norm": 0.9279122948646545, + "learning_rate": 6.944456224962417e-05, + "loss": 0.819, + "step": 116770 + }, + { + "epoch": 0.7460741346485568, + "grad_norm": 0.9613460302352905, + "learning_rate": 
6.943993944046487e-05, + "loss": 1.41, + "step": 116780 + }, + { + "epoch": 0.7461380217982955, + "grad_norm": 0.6390883922576904, + "learning_rate": 6.943531643552857e-05, + "loss": 0.7246, + "step": 116790 + }, + { + "epoch": 0.7462019089480342, + "grad_norm": 1.0150036811828613, + "learning_rate": 6.943069323486183e-05, + "loss": 0.7269, + "step": 116800 + }, + { + "epoch": 0.7462657960977729, + "grad_norm": 0.7378376722335815, + "learning_rate": 6.942606983851116e-05, + "loss": 0.9508, + "step": 116810 + }, + { + "epoch": 0.7463296832475116, + "grad_norm": 0.9831222295761108, + "learning_rate": 6.94214462465232e-05, + "loss": 0.9079, + "step": 116820 + }, + { + "epoch": 0.7463935703972503, + "grad_norm": 1.4024206399917603, + "learning_rate": 6.941682245894446e-05, + "loss": 0.8302, + "step": 116830 + }, + { + "epoch": 0.746457457546989, + "grad_norm": 2.6313674449920654, + "learning_rate": 6.94121984758215e-05, + "loss": 0.9034, + "step": 116840 + }, + { + "epoch": 0.7465213446967277, + "grad_norm": 0.7993502616882324, + "learning_rate": 6.940757429720094e-05, + "loss": 0.681, + "step": 116850 + }, + { + "epoch": 0.7465852318464664, + "grad_norm": 1.0172010660171509, + "learning_rate": 6.940294992312932e-05, + "loss": 1.0379, + "step": 116860 + }, + { + "epoch": 0.7466491189962051, + "grad_norm": 0.764042854309082, + "learning_rate": 6.939832535365319e-05, + "loss": 0.8982, + "step": 116870 + }, + { + "epoch": 0.7467130061459438, + "grad_norm": 1.4996678829193115, + "learning_rate": 6.939370058881914e-05, + "loss": 1.0452, + "step": 116880 + }, + { + "epoch": 0.7467768932956825, + "grad_norm": 0.6007105708122253, + "learning_rate": 6.938907562867374e-05, + "loss": 0.8955, + "step": 116890 + }, + { + "epoch": 0.7468407804454212, + "grad_norm": 0.753452718257904, + "learning_rate": 6.93844504732636e-05, + "loss": 0.8096, + "step": 116900 + }, + { + "epoch": 0.7469046675951599, + "grad_norm": 0.8642897009849548, + "learning_rate": 6.937982512263528e-05, + "loss": 
0.7916, + "step": 116910 + }, + { + "epoch": 0.7469685547448986, + "grad_norm": 0.8521572947502136, + "learning_rate": 6.937519957683534e-05, + "loss": 0.7633, + "step": 116920 + }, + { + "epoch": 0.7470324418946374, + "grad_norm": 1.336063027381897, + "learning_rate": 6.937057383591037e-05, + "loss": 0.8625, + "step": 116930 + }, + { + "epoch": 0.7470963290443761, + "grad_norm": 2.280611276626587, + "learning_rate": 6.936594789990696e-05, + "loss": 1.225, + "step": 116940 + }, + { + "epoch": 0.7471602161941148, + "grad_norm": 0.5119796991348267, + "learning_rate": 6.936132176887171e-05, + "loss": 0.7675, + "step": 116950 + }, + { + "epoch": 0.7472241033438534, + "grad_norm": 1.0470057725906372, + "learning_rate": 6.93566954428512e-05, + "loss": 0.9467, + "step": 116960 + }, + { + "epoch": 0.7472879904935921, + "grad_norm": 0.823755145072937, + "learning_rate": 6.935206892189202e-05, + "loss": 0.7573, + "step": 116970 + }, + { + "epoch": 0.7473518776433308, + "grad_norm": 0.7342879176139832, + "learning_rate": 6.934744220604076e-05, + "loss": 0.926, + "step": 116980 + }, + { + "epoch": 0.7474157647930695, + "grad_norm": 0.5635343790054321, + "learning_rate": 6.934281529534403e-05, + "loss": 0.8136, + "step": 116990 + }, + { + "epoch": 0.7474796519428082, + "grad_norm": 0.515261173248291, + "learning_rate": 6.93381881898484e-05, + "loss": 0.6738, + "step": 117000 + }, + { + "epoch": 0.7475435390925469, + "grad_norm": 1.1176193952560425, + "learning_rate": 6.93335608896005e-05, + "loss": 0.7295, + "step": 117010 + }, + { + "epoch": 0.7476074262422856, + "grad_norm": 0.7528888583183289, + "learning_rate": 6.93289333946469e-05, + "loss": 0.9808, + "step": 117020 + }, + { + "epoch": 0.7476713133920243, + "grad_norm": 0.6792475581169128, + "learning_rate": 6.932430570503423e-05, + "loss": 0.7732, + "step": 117030 + }, + { + "epoch": 0.747735200541763, + "grad_norm": 0.49816229939460754, + "learning_rate": 6.931967782080908e-05, + "loss": 1.0765, + "step": 117040 + }, + { 
+ "epoch": 0.7477990876915017, + "grad_norm": 0.6919913291931152, + "learning_rate": 6.931504974201806e-05, + "loss": 0.9868, + "step": 117050 + }, + { + "epoch": 0.7478629748412404, + "grad_norm": 1.421985387802124, + "learning_rate": 6.931042146870779e-05, + "loss": 1.0408, + "step": 117060 + }, + { + "epoch": 0.7479268619909791, + "grad_norm": 0.6791272163391113, + "learning_rate": 6.930579300092487e-05, + "loss": 1.0113, + "step": 117070 + }, + { + "epoch": 0.7479907491407178, + "grad_norm": 0.6858437657356262, + "learning_rate": 6.93011643387159e-05, + "loss": 0.7507, + "step": 117080 + }, + { + "epoch": 0.7480546362904565, + "grad_norm": 0.5481752753257751, + "learning_rate": 6.92965354821275e-05, + "loss": 0.7424, + "step": 117090 + }, + { + "epoch": 0.7481185234401952, + "grad_norm": 0.6619887948036194, + "learning_rate": 6.929190643120632e-05, + "loss": 0.8167, + "step": 117100 + }, + { + "epoch": 0.748182410589934, + "grad_norm": 1.6470102071762085, + "learning_rate": 6.928727718599893e-05, + "loss": 0.9335, + "step": 117110 + }, + { + "epoch": 0.7482462977396727, + "grad_norm": 0.7294175028800964, + "learning_rate": 6.928264774655198e-05, + "loss": 0.8117, + "step": 117120 + }, + { + "epoch": 0.7483101848894114, + "grad_norm": 1.0997979640960693, + "learning_rate": 6.927801811291209e-05, + "loss": 1.0709, + "step": 117130 + }, + { + "epoch": 0.7483740720391501, + "grad_norm": 0.9277620911598206, + "learning_rate": 6.927338828512588e-05, + "loss": 0.8636, + "step": 117140 + }, + { + "epoch": 0.7484379591888888, + "grad_norm": 1.3911194801330566, + "learning_rate": 6.926875826323997e-05, + "loss": 0.9657, + "step": 117150 + }, + { + "epoch": 0.7485018463386275, + "grad_norm": 0.7593978047370911, + "learning_rate": 6.9264128047301e-05, + "loss": 0.8314, + "step": 117160 + }, + { + "epoch": 0.7485657334883662, + "grad_norm": 0.8902554512023926, + "learning_rate": 6.92594976373556e-05, + "loss": 1.0123, + "step": 117170 + }, + { + "epoch": 0.7486296206381049, 
+ "grad_norm": 0.8889968991279602, + "learning_rate": 6.925486703345038e-05, + "loss": 0.9378, + "step": 117180 + }, + { + "epoch": 0.7486935077878436, + "grad_norm": 0.7080551981925964, + "learning_rate": 6.925023623563201e-05, + "loss": 1.1536, + "step": 117190 + }, + { + "epoch": 0.7487573949375823, + "grad_norm": 0.7877635955810547, + "learning_rate": 6.924560524394709e-05, + "loss": 0.9396, + "step": 117200 + }, + { + "epoch": 0.7488212820873209, + "grad_norm": 3.540712594985962, + "learning_rate": 6.924097405844227e-05, + "loss": 0.9817, + "step": 117210 + }, + { + "epoch": 0.7488851692370596, + "grad_norm": 0.6759431958198547, + "learning_rate": 6.923634267916422e-05, + "loss": 0.9781, + "step": 117220 + }, + { + "epoch": 0.7489490563867983, + "grad_norm": 0.7441065311431885, + "learning_rate": 6.923171110615954e-05, + "loss": 0.6876, + "step": 117230 + }, + { + "epoch": 0.749012943536537, + "grad_norm": 0.90581214427948, + "learning_rate": 6.92270793394749e-05, + "loss": 0.9416, + "step": 117240 + }, + { + "epoch": 0.7490768306862757, + "grad_norm": 0.991303563117981, + "learning_rate": 6.922244737915692e-05, + "loss": 0.9986, + "step": 117250 + }, + { + "epoch": 0.7491407178360144, + "grad_norm": 0.9509017467498779, + "learning_rate": 6.921781522525229e-05, + "loss": 0.9976, + "step": 117260 + }, + { + "epoch": 0.7492046049857531, + "grad_norm": 1.060160756111145, + "learning_rate": 6.921318287780763e-05, + "loss": 0.9884, + "step": 117270 + }, + { + "epoch": 0.7492684921354918, + "grad_norm": 0.8051804900169373, + "learning_rate": 6.920855033686959e-05, + "loss": 1.0251, + "step": 117280 + }, + { + "epoch": 0.7493323792852306, + "grad_norm": 0.6516896486282349, + "learning_rate": 6.920391760248482e-05, + "loss": 1.0851, + "step": 117290 + }, + { + "epoch": 0.7493962664349693, + "grad_norm": 1.0302486419677734, + "learning_rate": 6.91992846747e-05, + "loss": 0.9056, + "step": 117300 + }, + { + "epoch": 0.749460153584708, + "grad_norm": 0.5640853047370911, 
+ "learning_rate": 6.919465155356177e-05, + "loss": 0.8081, + "step": 117310 + }, + { + "epoch": 0.7495240407344467, + "grad_norm": 0.6302241086959839, + "learning_rate": 6.91900182391168e-05, + "loss": 0.8306, + "step": 117320 + }, + { + "epoch": 0.7495879278841854, + "grad_norm": 1.006842851638794, + "learning_rate": 6.918538473141174e-05, + "loss": 0.7887, + "step": 117330 + }, + { + "epoch": 0.7496518150339241, + "grad_norm": 0.8143184781074524, + "learning_rate": 6.918075103049325e-05, + "loss": 1.0579, + "step": 117340 + }, + { + "epoch": 0.7497157021836628, + "grad_norm": 0.6692333817481995, + "learning_rate": 6.9176117136408e-05, + "loss": 0.7448, + "step": 117350 + }, + { + "epoch": 0.7497795893334015, + "grad_norm": 1.042280912399292, + "learning_rate": 6.917148304920267e-05, + "loss": 0.8737, + "step": 117360 + }, + { + "epoch": 0.7498434764831402, + "grad_norm": 1.1402003765106201, + "learning_rate": 6.916684876892391e-05, + "loss": 0.8044, + "step": 117370 + }, + { + "epoch": 0.7499073636328789, + "grad_norm": 1.2697054147720337, + "learning_rate": 6.916221429561843e-05, + "loss": 0.7297, + "step": 117380 + }, + { + "epoch": 0.7499712507826176, + "grad_norm": 0.7075098156929016, + "learning_rate": 6.915757962933284e-05, + "loss": 0.696, + "step": 117390 + }, + { + "epoch": 0.7500351379323563, + "grad_norm": 1.1511503458023071, + "learning_rate": 6.915294477011389e-05, + "loss": 0.8506, + "step": 117400 + }, + { + "epoch": 0.750099025082095, + "grad_norm": 0.8749009370803833, + "learning_rate": 6.914830971800818e-05, + "loss": 1.041, + "step": 117410 + }, + { + "epoch": 0.7501629122318337, + "grad_norm": 0.7011004686355591, + "learning_rate": 6.914367447306244e-05, + "loss": 0.8774, + "step": 117420 + }, + { + "epoch": 0.7502267993815724, + "grad_norm": 1.408872365951538, + "learning_rate": 6.913903903532334e-05, + "loss": 0.8093, + "step": 117430 + }, + { + "epoch": 0.7502906865313111, + "grad_norm": 0.79103022813797, + "learning_rate": 
6.913440340483755e-05, + "loss": 0.6807, + "step": 117440 + }, + { + "epoch": 0.7503545736810497, + "grad_norm": 1.1001317501068115, + "learning_rate": 6.912976758165177e-05, + "loss": 0.8683, + "step": 117450 + }, + { + "epoch": 0.7504184608307884, + "grad_norm": 0.7966405749320984, + "learning_rate": 6.912513156581267e-05, + "loss": 0.8673, + "step": 117460 + }, + { + "epoch": 0.7504823479805272, + "grad_norm": 0.5426264405250549, + "learning_rate": 6.912049535736697e-05, + "loss": 0.6471, + "step": 117470 + }, + { + "epoch": 0.7505462351302659, + "grad_norm": 0.8461551070213318, + "learning_rate": 6.911585895636132e-05, + "loss": 0.9877, + "step": 117480 + }, + { + "epoch": 0.7506101222800046, + "grad_norm": 0.7813534736633301, + "learning_rate": 6.911122236284244e-05, + "loss": 0.8591, + "step": 117490 + }, + { + "epoch": 0.7506740094297433, + "grad_norm": 2.3332297801971436, + "learning_rate": 6.910658557685701e-05, + "loss": 1.2769, + "step": 117500 + }, + { + "epoch": 0.750737896579482, + "grad_norm": 0.9277547597885132, + "learning_rate": 6.910194859845174e-05, + "loss": 0.7398, + "step": 117510 + }, + { + "epoch": 0.7508017837292207, + "grad_norm": 0.5283322930335999, + "learning_rate": 6.909731142767333e-05, + "loss": 0.8047, + "step": 117520 + }, + { + "epoch": 0.7508656708789594, + "grad_norm": 0.9669355154037476, + "learning_rate": 6.909267406456847e-05, + "loss": 1.0771, + "step": 117530 + }, + { + "epoch": 0.7509295580286981, + "grad_norm": 0.8970743417739868, + "learning_rate": 6.908803650918385e-05, + "loss": 0.9949, + "step": 117540 + }, + { + "epoch": 0.7509934451784368, + "grad_norm": 0.9849328398704529, + "learning_rate": 6.90833987615662e-05, + "loss": 0.6818, + "step": 117550 + }, + { + "epoch": 0.7510573323281755, + "grad_norm": 0.4582323431968689, + "learning_rate": 6.907876082176222e-05, + "loss": 0.8551, + "step": 117560 + }, + { + "epoch": 0.7511212194779142, + "grad_norm": 0.8646737933158875, + "learning_rate": 6.90741226898186e-05, + 
"loss": 0.8972, + "step": 117570 + }, + { + "epoch": 0.7511851066276529, + "grad_norm": 1.1300363540649414, + "learning_rate": 6.906948436578206e-05, + "loss": 0.8249, + "step": 117580 + }, + { + "epoch": 0.7512489937773916, + "grad_norm": 0.7908068895339966, + "learning_rate": 6.906484584969934e-05, + "loss": 0.8863, + "step": 117590 + }, + { + "epoch": 0.7513128809271303, + "grad_norm": 1.099071741104126, + "learning_rate": 6.906020714161711e-05, + "loss": 0.9949, + "step": 117600 + }, + { + "epoch": 0.751376768076869, + "grad_norm": 0.874218761920929, + "learning_rate": 6.905556824158212e-05, + "loss": 0.8182, + "step": 117610 + }, + { + "epoch": 0.7514406552266077, + "grad_norm": 1.2314951419830322, + "learning_rate": 6.905092914964105e-05, + "loss": 1.1335, + "step": 117620 + }, + { + "epoch": 0.7515045423763465, + "grad_norm": 0.8381962180137634, + "learning_rate": 6.904628986584066e-05, + "loss": 0.8751, + "step": 117630 + }, + { + "epoch": 0.7515684295260852, + "grad_norm": 1.2576111555099487, + "learning_rate": 6.904165039022766e-05, + "loss": 0.8728, + "step": 117640 + }, + { + "epoch": 0.7516323166758239, + "grad_norm": 0.8034125566482544, + "learning_rate": 6.903701072284875e-05, + "loss": 0.8502, + "step": 117650 + }, + { + "epoch": 0.7516962038255626, + "grad_norm": 0.6873534917831421, + "learning_rate": 6.903237086375068e-05, + "loss": 0.8783, + "step": 117660 + }, + { + "epoch": 0.7517600909753013, + "grad_norm": 0.8254218101501465, + "learning_rate": 6.902773081298015e-05, + "loss": 0.8267, + "step": 117670 + }, + { + "epoch": 0.75182397812504, + "grad_norm": 2.0020554065704346, + "learning_rate": 6.902309057058393e-05, + "loss": 0.8673, + "step": 117680 + }, + { + "epoch": 0.7518878652747786, + "grad_norm": 1.3021756410598755, + "learning_rate": 6.901845013660873e-05, + "loss": 0.9579, + "step": 117690 + }, + { + "epoch": 0.7519517524245173, + "grad_norm": 0.6825690865516663, + "learning_rate": 6.901380951110128e-05, + "loss": 0.6835, + "step": 
117700 + }, + { + "epoch": 0.752015639574256, + "grad_norm": 1.2867560386657715, + "learning_rate": 6.900916869410831e-05, + "loss": 0.9329, + "step": 117710 + }, + { + "epoch": 0.7520795267239947, + "grad_norm": 0.7000182271003723, + "learning_rate": 6.900452768567657e-05, + "loss": 0.7614, + "step": 117720 + }, + { + "epoch": 0.7521434138737334, + "grad_norm": 0.7229273319244385, + "learning_rate": 6.89998864858528e-05, + "loss": 1.082, + "step": 117730 + }, + { + "epoch": 0.7522073010234721, + "grad_norm": 0.7700644135475159, + "learning_rate": 6.899524509468375e-05, + "loss": 1.1019, + "step": 117740 + }, + { + "epoch": 0.7522711881732108, + "grad_norm": 1.3923498392105103, + "learning_rate": 6.899060351221613e-05, + "loss": 0.8848, + "step": 117750 + }, + { + "epoch": 0.7523350753229495, + "grad_norm": 0.6365454792976379, + "learning_rate": 6.898596173849672e-05, + "loss": 1.1148, + "step": 117760 + }, + { + "epoch": 0.7523989624726882, + "grad_norm": 1.0675365924835205, + "learning_rate": 6.898131977357223e-05, + "loss": 0.9215, + "step": 117770 + }, + { + "epoch": 0.7524628496224269, + "grad_norm": 1.0569766759872437, + "learning_rate": 6.897667761748943e-05, + "loss": 0.7153, + "step": 117780 + }, + { + "epoch": 0.7525267367721656, + "grad_norm": 1.0126324892044067, + "learning_rate": 6.897203527029508e-05, + "loss": 0.925, + "step": 117790 + }, + { + "epoch": 0.7525906239219043, + "grad_norm": 0.9633859992027283, + "learning_rate": 6.896739273203592e-05, + "loss": 1.0008, + "step": 117800 + }, + { + "epoch": 0.752654511071643, + "grad_norm": 1.014870047569275, + "learning_rate": 6.896275000275872e-05, + "loss": 0.8294, + "step": 117810 + }, + { + "epoch": 0.7527183982213818, + "grad_norm": 1.1315689086914062, + "learning_rate": 6.895810708251019e-05, + "loss": 0.8227, + "step": 117820 + }, + { + "epoch": 0.7527822853711205, + "grad_norm": 2.6392271518707275, + "learning_rate": 6.895346397133714e-05, + "loss": 0.8076, + "step": 117830 + }, + { + "epoch": 
0.7528461725208592, + "grad_norm": 0.7529024481773376, + "learning_rate": 6.89488206692863e-05, + "loss": 0.8401, + "step": 117840 + }, + { + "epoch": 0.7529100596705979, + "grad_norm": 0.9664776921272278, + "learning_rate": 6.894417717640447e-05, + "loss": 1.1103, + "step": 117850 + }, + { + "epoch": 0.7529739468203366, + "grad_norm": 0.732601523399353, + "learning_rate": 6.893953349273836e-05, + "loss": 0.7659, + "step": 117860 + }, + { + "epoch": 0.7530378339700753, + "grad_norm": 0.8885082006454468, + "learning_rate": 6.893488961833477e-05, + "loss": 0.7957, + "step": 117870 + }, + { + "epoch": 0.753101721119814, + "grad_norm": 1.1969749927520752, + "learning_rate": 6.893024555324045e-05, + "loss": 1.0971, + "step": 117880 + }, + { + "epoch": 0.7531656082695527, + "grad_norm": 1.1240543127059937, + "learning_rate": 6.892560129750221e-05, + "loss": 0.7384, + "step": 117890 + }, + { + "epoch": 0.7532294954192914, + "grad_norm": 1.059037208557129, + "learning_rate": 6.892095685116677e-05, + "loss": 0.7629, + "step": 117900 + }, + { + "epoch": 0.7532933825690301, + "grad_norm": 0.8897120356559753, + "learning_rate": 6.891631221428092e-05, + "loss": 1.11, + "step": 117910 + }, + { + "epoch": 0.7533572697187688, + "grad_norm": 1.0134267807006836, + "learning_rate": 6.891166738689146e-05, + "loss": 0.8046, + "step": 117920 + }, + { + "epoch": 0.7534211568685074, + "grad_norm": 1.6655761003494263, + "learning_rate": 6.890702236904514e-05, + "loss": 0.9587, + "step": 117930 + }, + { + "epoch": 0.7534850440182461, + "grad_norm": 1.1611841917037964, + "learning_rate": 6.890237716078874e-05, + "loss": 1.0593, + "step": 117940 + }, + { + "epoch": 0.7535489311679848, + "grad_norm": 0.8009079098701477, + "learning_rate": 6.889773176216905e-05, + "loss": 0.901, + "step": 117950 + }, + { + "epoch": 0.7536128183177235, + "grad_norm": 1.4743709564208984, + "learning_rate": 6.889308617323286e-05, + "loss": 1.0526, + "step": 117960 + }, + { + "epoch": 0.7536767054674622, + 
"grad_norm": 0.7194849252700806, + "learning_rate": 6.888844039402695e-05, + "loss": 0.7397, + "step": 117970 + }, + { + "epoch": 0.7537405926172009, + "grad_norm": 1.0453287363052368, + "learning_rate": 6.88837944245981e-05, + "loss": 1.104, + "step": 117980 + }, + { + "epoch": 0.7538044797669397, + "grad_norm": 0.8043333292007446, + "learning_rate": 6.88791482649931e-05, + "loss": 0.798, + "step": 117990 + }, + { + "epoch": 0.7538683669166784, + "grad_norm": 0.8196787238121033, + "learning_rate": 6.887450191525875e-05, + "loss": 0.8353, + "step": 118000 + }, + { + "epoch": 0.7539322540664171, + "grad_norm": 0.6180843114852905, + "learning_rate": 6.886985537544183e-05, + "loss": 1.0143, + "step": 118010 + }, + { + "epoch": 0.7539961412161558, + "grad_norm": 1.4987810850143433, + "learning_rate": 6.886520864558914e-05, + "loss": 0.7404, + "step": 118020 + }, + { + "epoch": 0.7540600283658945, + "grad_norm": 0.7595736384391785, + "learning_rate": 6.886056172574747e-05, + "loss": 1.0103, + "step": 118030 + }, + { + "epoch": 0.7541239155156332, + "grad_norm": 0.7522437572479248, + "learning_rate": 6.885591461596364e-05, + "loss": 1.1119, + "step": 118040 + }, + { + "epoch": 0.7541878026653719, + "grad_norm": 0.7806057333946228, + "learning_rate": 6.885126731628445e-05, + "loss": 0.921, + "step": 118050 + }, + { + "epoch": 0.7542516898151106, + "grad_norm": 0.722546398639679, + "learning_rate": 6.884661982675666e-05, + "loss": 0.9464, + "step": 118060 + }, + { + "epoch": 0.7543155769648493, + "grad_norm": 1.262364149093628, + "learning_rate": 6.884197214742713e-05, + "loss": 0.9972, + "step": 118070 + }, + { + "epoch": 0.754379464114588, + "grad_norm": 0.8939563035964966, + "learning_rate": 6.883732427834263e-05, + "loss": 1.019, + "step": 118080 + }, + { + "epoch": 0.7544433512643267, + "grad_norm": 1.2244893312454224, + "learning_rate": 6.883267621954998e-05, + "loss": 0.965, + "step": 118090 + }, + { + "epoch": 0.7545072384140654, + "grad_norm": 0.7305797338485718, 
+ "learning_rate": 6.882802797109599e-05, + "loss": 0.976, + "step": 118100 + }, + { + "epoch": 0.7545711255638041, + "grad_norm": 0.7178799510002136, + "learning_rate": 6.882337953302747e-05, + "loss": 0.781, + "step": 118110 + }, + { + "epoch": 0.7546350127135428, + "grad_norm": 1.3888294696807861, + "learning_rate": 6.881873090539121e-05, + "loss": 1.1481, + "step": 118120 + }, + { + "epoch": 0.7546988998632815, + "grad_norm": 1.024906873703003, + "learning_rate": 6.881408208823409e-05, + "loss": 0.8872, + "step": 118130 + }, + { + "epoch": 0.7547627870130202, + "grad_norm": 1.0983256101608276, + "learning_rate": 6.880943308160287e-05, + "loss": 0.7869, + "step": 118140 + }, + { + "epoch": 0.754826674162759, + "grad_norm": 0.6162832975387573, + "learning_rate": 6.880478388554438e-05, + "loss": 0.5942, + "step": 118150 + }, + { + "epoch": 0.7548905613124977, + "grad_norm": 0.8574840426445007, + "learning_rate": 6.880013450010545e-05, + "loss": 0.8025, + "step": 118160 + }, + { + "epoch": 0.7549544484622364, + "grad_norm": 0.7543234825134277, + "learning_rate": 6.87954849253329e-05, + "loss": 0.6696, + "step": 118170 + }, + { + "epoch": 0.755018335611975, + "grad_norm": 0.6971087455749512, + "learning_rate": 6.879083516127356e-05, + "loss": 1.0234, + "step": 118180 + }, + { + "epoch": 0.7550822227617137, + "grad_norm": 1.7899150848388672, + "learning_rate": 6.878618520797424e-05, + "loss": 0.791, + "step": 118190 + }, + { + "epoch": 0.7551461099114524, + "grad_norm": 0.8531982898712158, + "learning_rate": 6.87815350654818e-05, + "loss": 0.9281, + "step": 118200 + }, + { + "epoch": 0.7552099970611911, + "grad_norm": 1.196276068687439, + "learning_rate": 6.877688473384304e-05, + "loss": 0.8512, + "step": 118210 + }, + { + "epoch": 0.7552738842109298, + "grad_norm": 0.9306949377059937, + "learning_rate": 6.877223421310481e-05, + "loss": 1.0111, + "step": 118220 + }, + { + "epoch": 0.7553377713606685, + "grad_norm": 0.6089597940444946, + "learning_rate": 
6.876758350331395e-05, + "loss": 1.1216, + "step": 118230 + }, + { + "epoch": 0.7554016585104072, + "grad_norm": 0.9803066253662109, + "learning_rate": 6.876293260451728e-05, + "loss": 0.8555, + "step": 118240 + }, + { + "epoch": 0.7554655456601459, + "grad_norm": 1.0859441757202148, + "learning_rate": 6.875828151676165e-05, + "loss": 0.9888, + "step": 118250 + }, + { + "epoch": 0.7555294328098846, + "grad_norm": 2.5297441482543945, + "learning_rate": 6.875363024009389e-05, + "loss": 1.0641, + "step": 118260 + }, + { + "epoch": 0.7555933199596233, + "grad_norm": 0.7911348938941956, + "learning_rate": 6.874897877456086e-05, + "loss": 0.9526, + "step": 118270 + }, + { + "epoch": 0.755657207109362, + "grad_norm": 0.6605247855186462, + "learning_rate": 6.874432712020938e-05, + "loss": 0.7285, + "step": 118280 + }, + { + "epoch": 0.7557210942591007, + "grad_norm": 1.0286058187484741, + "learning_rate": 6.873967527708633e-05, + "loss": 0.8865, + "step": 118290 + }, + { + "epoch": 0.7557849814088394, + "grad_norm": 0.8798809051513672, + "learning_rate": 6.873502324523852e-05, + "loss": 0.8385, + "step": 118300 + }, + { + "epoch": 0.7558488685585781, + "grad_norm": 0.9660366177558899, + "learning_rate": 6.873037102471283e-05, + "loss": 0.8482, + "step": 118310 + }, + { + "epoch": 0.7559127557083168, + "grad_norm": 0.5325950980186462, + "learning_rate": 6.872571861555609e-05, + "loss": 0.903, + "step": 118320 + }, + { + "epoch": 0.7559766428580555, + "grad_norm": 0.715363085269928, + "learning_rate": 6.872106601781518e-05, + "loss": 1.1637, + "step": 118330 + }, + { + "epoch": 0.7560405300077943, + "grad_norm": 0.5783165097236633, + "learning_rate": 6.871641323153692e-05, + "loss": 1.0579, + "step": 118340 + }, + { + "epoch": 0.756104417157533, + "grad_norm": 0.7666789889335632, + "learning_rate": 6.871176025676818e-05, + "loss": 0.673, + "step": 118350 + }, + { + "epoch": 0.7561683043072717, + "grad_norm": 0.9602919816970825, + "learning_rate": 6.870710709355584e-05, + 
"loss": 0.795, + "step": 118360 + }, + { + "epoch": 0.7562321914570104, + "grad_norm": 0.9613474011421204, + "learning_rate": 6.870245374194675e-05, + "loss": 0.8152, + "step": 118370 + }, + { + "epoch": 0.7562960786067491, + "grad_norm": 0.9112039804458618, + "learning_rate": 6.869780020198777e-05, + "loss": 0.797, + "step": 118380 + }, + { + "epoch": 0.7563599657564878, + "grad_norm": 1.0254278182983398, + "learning_rate": 6.869314647372577e-05, + "loss": 1.0978, + "step": 118390 + }, + { + "epoch": 0.7564238529062265, + "grad_norm": 1.1823453903198242, + "learning_rate": 6.86884925572076e-05, + "loss": 1.1565, + "step": 118400 + }, + { + "epoch": 0.7564877400559652, + "grad_norm": 0.7849447727203369, + "learning_rate": 6.868383845248015e-05, + "loss": 0.949, + "step": 118410 + }, + { + "epoch": 0.7565516272057038, + "grad_norm": 1.0870212316513062, + "learning_rate": 6.867918415959028e-05, + "loss": 1.1851, + "step": 118420 + }, + { + "epoch": 0.7566155143554425, + "grad_norm": 1.099289894104004, + "learning_rate": 6.867452967858487e-05, + "loss": 0.6001, + "step": 118430 + }, + { + "epoch": 0.7566794015051812, + "grad_norm": 0.7557351589202881, + "learning_rate": 6.866987500951079e-05, + "loss": 1.1415, + "step": 118440 + }, + { + "epoch": 0.7567432886549199, + "grad_norm": 1.02070152759552, + "learning_rate": 6.866522015241493e-05, + "loss": 0.8612, + "step": 118450 + }, + { + "epoch": 0.7568071758046586, + "grad_norm": 1.3030376434326172, + "learning_rate": 6.866056510734414e-05, + "loss": 1.0833, + "step": 118460 + }, + { + "epoch": 0.7568710629543973, + "grad_norm": 1.1571980714797974, + "learning_rate": 6.86559098743453e-05, + "loss": 1.0281, + "step": 118470 + }, + { + "epoch": 0.756934950104136, + "grad_norm": 0.5520379543304443, + "learning_rate": 6.865125445346533e-05, + "loss": 0.7683, + "step": 118480 + }, + { + "epoch": 0.7569988372538747, + "grad_norm": 1.1610947847366333, + "learning_rate": 6.864659884475108e-05, + "loss": 0.8432, + "step": 118490 
+ }, + { + "epoch": 0.7570627244036134, + "grad_norm": 0.5098273754119873, + "learning_rate": 6.864194304824946e-05, + "loss": 0.9143, + "step": 118500 + }, + { + "epoch": 0.7571266115533521, + "grad_norm": 0.9760499596595764, + "learning_rate": 6.863728706400734e-05, + "loss": 0.8953, + "step": 118510 + }, + { + "epoch": 0.7571904987030909, + "grad_norm": 0.7442782521247864, + "learning_rate": 6.863263089207162e-05, + "loss": 1.1744, + "step": 118520 + }, + { + "epoch": 0.7572543858528296, + "grad_norm": 1.004907488822937, + "learning_rate": 6.862797453248918e-05, + "loss": 0.8577, + "step": 118530 + }, + { + "epoch": 0.7573182730025683, + "grad_norm": 1.782226800918579, + "learning_rate": 6.862331798530692e-05, + "loss": 0.8857, + "step": 118540 + }, + { + "epoch": 0.757382160152307, + "grad_norm": 0.9573549032211304, + "learning_rate": 6.861866125057175e-05, + "loss": 0.8601, + "step": 118550 + }, + { + "epoch": 0.7574460473020457, + "grad_norm": 1.518573522567749, + "learning_rate": 6.861400432833053e-05, + "loss": 1.1323, + "step": 118560 + }, + { + "epoch": 0.7575099344517844, + "grad_norm": 0.9608773589134216, + "learning_rate": 6.86093472186302e-05, + "loss": 0.9014, + "step": 118570 + }, + { + "epoch": 0.7575738216015231, + "grad_norm": 1.326219081878662, + "learning_rate": 6.860468992151764e-05, + "loss": 0.8637, + "step": 118580 + }, + { + "epoch": 0.7576377087512618, + "grad_norm": 0.6975284218788147, + "learning_rate": 6.860003243703976e-05, + "loss": 0.8435, + "step": 118590 + }, + { + "epoch": 0.7577015959010005, + "grad_norm": 0.8688485026359558, + "learning_rate": 6.859537476524346e-05, + "loss": 0.7777, + "step": 118600 + }, + { + "epoch": 0.7577654830507392, + "grad_norm": 1.3222057819366455, + "learning_rate": 6.859071690617565e-05, + "loss": 0.8211, + "step": 118610 + }, + { + "epoch": 0.7578293702004779, + "grad_norm": 0.7221174836158752, + "learning_rate": 6.858605885988325e-05, + "loss": 0.9684, + "step": 118620 + }, + { + "epoch": 
0.7578932573502166, + "grad_norm": 1.1914112567901611, + "learning_rate": 6.858140062641313e-05, + "loss": 0.7354, + "step": 118630 + }, + { + "epoch": 0.7579571444999553, + "grad_norm": 0.7366169095039368, + "learning_rate": 6.857674220581225e-05, + "loss": 0.8568, + "step": 118640 + }, + { + "epoch": 0.758021031649694, + "grad_norm": 0.91144859790802, + "learning_rate": 6.85720835981275e-05, + "loss": 0.8024, + "step": 118650 + }, + { + "epoch": 0.7580849187994326, + "grad_norm": 1.1600289344787598, + "learning_rate": 6.856742480340581e-05, + "loss": 0.7997, + "step": 118660 + }, + { + "epoch": 0.7581488059491713, + "grad_norm": 1.0003736019134521, + "learning_rate": 6.856276582169408e-05, + "loss": 0.8877, + "step": 118670 + }, + { + "epoch": 0.75821269309891, + "grad_norm": 1.289528489112854, + "learning_rate": 6.855810665303923e-05, + "loss": 0.6617, + "step": 118680 + }, + { + "epoch": 0.7582765802486487, + "grad_norm": 0.9788122773170471, + "learning_rate": 6.85534472974882e-05, + "loss": 0.7778, + "step": 118690 + }, + { + "epoch": 0.7583404673983875, + "grad_norm": 0.6570330858230591, + "learning_rate": 6.854878775508792e-05, + "loss": 0.9725, + "step": 118700 + }, + { + "epoch": 0.7584043545481262, + "grad_norm": 0.911495566368103, + "learning_rate": 6.854412802588528e-05, + "loss": 0.83, + "step": 118710 + }, + { + "epoch": 0.7584682416978649, + "grad_norm": 0.729153037071228, + "learning_rate": 6.853946810992722e-05, + "loss": 1.1369, + "step": 118720 + }, + { + "epoch": 0.7585321288476036, + "grad_norm": 1.3419857025146484, + "learning_rate": 6.853480800726069e-05, + "loss": 0.7661, + "step": 118730 + }, + { + "epoch": 0.7585960159973423, + "grad_norm": 0.9699906706809998, + "learning_rate": 6.853014771793261e-05, + "loss": 0.9556, + "step": 118740 + }, + { + "epoch": 0.758659903147081, + "grad_norm": 0.9490513205528259, + "learning_rate": 6.852548724198992e-05, + "loss": 0.8333, + "step": 118750 + }, + { + "epoch": 0.7587237902968197, + "grad_norm": 
0.9197043776512146, + "learning_rate": 6.852082657947953e-05, + "loss": 0.888, + "step": 118760 + }, + { + "epoch": 0.7587876774465584, + "grad_norm": 0.6110111474990845, + "learning_rate": 6.85161657304484e-05, + "loss": 0.8897, + "step": 118770 + }, + { + "epoch": 0.7588515645962971, + "grad_norm": 0.7827733159065247, + "learning_rate": 6.851150469494347e-05, + "loss": 0.6686, + "step": 118780 + }, + { + "epoch": 0.7589154517460358, + "grad_norm": 0.9810250997543335, + "learning_rate": 6.850684347301166e-05, + "loss": 0.9115, + "step": 118790 + }, + { + "epoch": 0.7589793388957745, + "grad_norm": 0.816536545753479, + "learning_rate": 6.850218206469993e-05, + "loss": 1.0305, + "step": 118800 + }, + { + "epoch": 0.7590432260455132, + "grad_norm": 0.9732635617256165, + "learning_rate": 6.849752047005522e-05, + "loss": 0.8531, + "step": 118810 + }, + { + "epoch": 0.7591071131952519, + "grad_norm": 1.23931086063385, + "learning_rate": 6.849285868912448e-05, + "loss": 0.9552, + "step": 118820 + }, + { + "epoch": 0.7591710003449906, + "grad_norm": 1.2935327291488647, + "learning_rate": 6.848819672195466e-05, + "loss": 0.8719, + "step": 118830 + }, + { + "epoch": 0.7592348874947293, + "grad_norm": 1.2548699378967285, + "learning_rate": 6.84835345685927e-05, + "loss": 0.9801, + "step": 118840 + }, + { + "epoch": 0.759298774644468, + "grad_norm": 1.043545126914978, + "learning_rate": 6.847887222908555e-05, + "loss": 1.1205, + "step": 118850 + }, + { + "epoch": 0.7593626617942068, + "grad_norm": 0.5109646320343018, + "learning_rate": 6.847420970348018e-05, + "loss": 0.8246, + "step": 118860 + }, + { + "epoch": 0.7594265489439455, + "grad_norm": 0.5622779130935669, + "learning_rate": 6.846954699182352e-05, + "loss": 0.7426, + "step": 118870 + }, + { + "epoch": 0.7594904360936842, + "grad_norm": 0.672226071357727, + "learning_rate": 6.846488409416256e-05, + "loss": 0.7645, + "step": 118880 + }, + { + "epoch": 0.7595543232434229, + "grad_norm": 0.6986059546470642, + 
"learning_rate": 6.846022101054422e-05, + "loss": 1.0862, + "step": 118890 + }, + { + "epoch": 0.7596182103931616, + "grad_norm": 1.0945719480514526, + "learning_rate": 6.84555577410155e-05, + "loss": 1.0323, + "step": 118900 + }, + { + "epoch": 0.7596820975429002, + "grad_norm": 0.9224714040756226, + "learning_rate": 6.845089428562336e-05, + "loss": 0.8719, + "step": 118910 + }, + { + "epoch": 0.7597459846926389, + "grad_norm": 0.9247092008590698, + "learning_rate": 6.844623064441473e-05, + "loss": 0.8067, + "step": 118920 + }, + { + "epoch": 0.7598098718423776, + "grad_norm": 0.732523500919342, + "learning_rate": 6.84415668174366e-05, + "loss": 1.2519, + "step": 118930 + }, + { + "epoch": 0.7598737589921163, + "grad_norm": 0.9560425281524658, + "learning_rate": 6.843690280473596e-05, + "loss": 0.903, + "step": 118940 + }, + { + "epoch": 0.759937646141855, + "grad_norm": 0.913837194442749, + "learning_rate": 6.843223860635974e-05, + "loss": 0.9495, + "step": 118950 + }, + { + "epoch": 0.7600015332915937, + "grad_norm": 1.002140998840332, + "learning_rate": 6.842757422235494e-05, + "loss": 0.8853, + "step": 118960 + }, + { + "epoch": 0.7600654204413324, + "grad_norm": 0.9161799550056458, + "learning_rate": 6.842290965276852e-05, + "loss": 0.8618, + "step": 118970 + }, + { + "epoch": 0.7601293075910711, + "grad_norm": 0.6475778222084045, + "learning_rate": 6.841824489764746e-05, + "loss": 0.809, + "step": 118980 + }, + { + "epoch": 0.7601931947408098, + "grad_norm": 1.090684413909912, + "learning_rate": 6.841357995703874e-05, + "loss": 1.0392, + "step": 118990 + }, + { + "epoch": 0.7602570818905485, + "grad_norm": 0.6572669148445129, + "learning_rate": 6.840891483098935e-05, + "loss": 1.0299, + "step": 119000 + }, + { + "epoch": 0.7603209690402872, + "grad_norm": 0.9290599226951599, + "learning_rate": 6.840424951954625e-05, + "loss": 0.9194, + "step": 119010 + }, + { + "epoch": 0.7603848561900259, + "grad_norm": 0.7174366116523743, + "learning_rate": 
6.839958402275643e-05, + "loss": 0.8349, + "step": 119020 + }, + { + "epoch": 0.7604487433397646, + "grad_norm": 0.9106315970420837, + "learning_rate": 6.839491834066691e-05, + "loss": 0.8991, + "step": 119030 + }, + { + "epoch": 0.7605126304895033, + "grad_norm": 1.6887893676757812, + "learning_rate": 6.839025247332462e-05, + "loss": 0.8116, + "step": 119040 + }, + { + "epoch": 0.7605765176392421, + "grad_norm": 0.7932513356208801, + "learning_rate": 6.838558642077658e-05, + "loss": 0.952, + "step": 119050 + }, + { + "epoch": 0.7606404047889808, + "grad_norm": 0.733961284160614, + "learning_rate": 6.838092018306979e-05, + "loss": 0.9746, + "step": 119060 + }, + { + "epoch": 0.7607042919387195, + "grad_norm": 0.9534251093864441, + "learning_rate": 6.837625376025123e-05, + "loss": 0.9129, + "step": 119070 + }, + { + "epoch": 0.7607681790884582, + "grad_norm": 0.867732048034668, + "learning_rate": 6.837158715236789e-05, + "loss": 1.0368, + "step": 119080 + }, + { + "epoch": 0.7608320662381969, + "grad_norm": 0.9323291778564453, + "learning_rate": 6.836692035946677e-05, + "loss": 0.7597, + "step": 119090 + }, + { + "epoch": 0.7608959533879356, + "grad_norm": 1.1437997817993164, + "learning_rate": 6.83622533815949e-05, + "loss": 0.9955, + "step": 119100 + }, + { + "epoch": 0.7609598405376743, + "grad_norm": 0.6462964415550232, + "learning_rate": 6.835758621879922e-05, + "loss": 1.2323, + "step": 119110 + }, + { + "epoch": 0.761023727687413, + "grad_norm": 0.7670947313308716, + "learning_rate": 6.835291887112678e-05, + "loss": 0.762, + "step": 119120 + }, + { + "epoch": 0.7610876148371517, + "grad_norm": 1.1615947484970093, + "learning_rate": 6.834825133862457e-05, + "loss": 0.8781, + "step": 119130 + }, + { + "epoch": 0.7611515019868904, + "grad_norm": 0.6855329871177673, + "learning_rate": 6.834358362133959e-05, + "loss": 0.9, + "step": 119140 + }, + { + "epoch": 0.761215389136629, + "grad_norm": 1.5815876722335815, + "learning_rate": 6.833891571931886e-05, + "loss": 
0.6895, + "step": 119150 + }, + { + "epoch": 0.7612792762863677, + "grad_norm": 0.804578423500061, + "learning_rate": 6.833424763260938e-05, + "loss": 0.8916, + "step": 119160 + }, + { + "epoch": 0.7613431634361064, + "grad_norm": 0.6342503428459167, + "learning_rate": 6.832957936125816e-05, + "loss": 1.0142, + "step": 119170 + }, + { + "epoch": 0.7614070505858451, + "grad_norm": 1.194042682647705, + "learning_rate": 6.832491090531223e-05, + "loss": 0.7734, + "step": 119180 + }, + { + "epoch": 0.7614709377355838, + "grad_norm": 0.8138452172279358, + "learning_rate": 6.83202422648186e-05, + "loss": 0.833, + "step": 119190 + }, + { + "epoch": 0.7615348248853225, + "grad_norm": 0.6419638395309448, + "learning_rate": 6.831557343982427e-05, + "loss": 0.8826, + "step": 119200 + }, + { + "epoch": 0.7615987120350612, + "grad_norm": 0.9119747281074524, + "learning_rate": 6.831090443037626e-05, + "loss": 0.8635, + "step": 119210 + }, + { + "epoch": 0.7616625991848, + "grad_norm": 1.2391308546066284, + "learning_rate": 6.83062352365216e-05, + "loss": 1.0166, + "step": 119220 + }, + { + "epoch": 0.7617264863345387, + "grad_norm": 1.1494985818862915, + "learning_rate": 6.830156585830734e-05, + "loss": 1.0373, + "step": 119230 + }, + { + "epoch": 0.7617903734842774, + "grad_norm": 0.8222819566726685, + "learning_rate": 6.829689629578046e-05, + "loss": 0.7228, + "step": 119240 + }, + { + "epoch": 0.7618542606340161, + "grad_norm": 0.60460364818573, + "learning_rate": 6.829222654898799e-05, + "loss": 0.8322, + "step": 119250 + }, + { + "epoch": 0.7619181477837548, + "grad_norm": 1.7040772438049316, + "learning_rate": 6.828755661797699e-05, + "loss": 1.1171, + "step": 119260 + }, + { + "epoch": 0.7619820349334935, + "grad_norm": 0.7591485977172852, + "learning_rate": 6.828288650279448e-05, + "loss": 0.8535, + "step": 119270 + }, + { + "epoch": 0.7620459220832322, + "grad_norm": 1.0769449472427368, + "learning_rate": 6.827821620348749e-05, + "loss": 1.0974, + "step": 119280 + }, + { 
+ "epoch": 0.7621098092329709, + "grad_norm": 0.7819190621376038, + "learning_rate": 6.827354572010303e-05, + "loss": 0.9247, + "step": 119290 + }, + { + "epoch": 0.7621736963827096, + "grad_norm": 0.7512619495391846, + "learning_rate": 6.826887505268818e-05, + "loss": 1.4029, + "step": 119300 + }, + { + "epoch": 0.7622375835324483, + "grad_norm": 0.7051581740379333, + "learning_rate": 6.826420420128993e-05, + "loss": 0.7934, + "step": 119310 + }, + { + "epoch": 0.762301470682187, + "grad_norm": 2.504819393157959, + "learning_rate": 6.825953316595535e-05, + "loss": 0.9571, + "step": 119320 + }, + { + "epoch": 0.7623653578319257, + "grad_norm": 1.1130508184432983, + "learning_rate": 6.825486194673148e-05, + "loss": 0.8393, + "step": 119330 + }, + { + "epoch": 0.7624292449816644, + "grad_norm": 0.8944107890129089, + "learning_rate": 6.825019054366536e-05, + "loss": 0.7698, + "step": 119340 + }, + { + "epoch": 0.7624931321314031, + "grad_norm": 1.020553469657898, + "learning_rate": 6.824551895680404e-05, + "loss": 0.8631, + "step": 119350 + }, + { + "epoch": 0.7625570192811418, + "grad_norm": 0.9540114402770996, + "learning_rate": 6.824084718619454e-05, + "loss": 0.8874, + "step": 119360 + }, + { + "epoch": 0.7626209064308805, + "grad_norm": 0.6075379252433777, + "learning_rate": 6.823617523188394e-05, + "loss": 0.8942, + "step": 119370 + }, + { + "epoch": 0.7626847935806192, + "grad_norm": 1.7551565170288086, + "learning_rate": 6.823150309391928e-05, + "loss": 1.041, + "step": 119380 + }, + { + "epoch": 0.7627486807303578, + "grad_norm": 0.7924169898033142, + "learning_rate": 6.82268307723476e-05, + "loss": 0.8415, + "step": 119390 + }, + { + "epoch": 0.7628125678800965, + "grad_norm": 0.9569699168205261, + "learning_rate": 6.822215826721597e-05, + "loss": 0.8566, + "step": 119400 + }, + { + "epoch": 0.7628764550298353, + "grad_norm": 0.8898611068725586, + "learning_rate": 6.821748557857144e-05, + "loss": 0.6289, + "step": 119410 + }, + { + "epoch": 
0.762940342179574, + "grad_norm": 1.3441417217254639, + "learning_rate": 6.821281270646106e-05, + "loss": 0.7926, + "step": 119420 + }, + { + "epoch": 0.7630042293293127, + "grad_norm": 0.8085065484046936, + "learning_rate": 6.820813965093193e-05, + "loss": 1.0383, + "step": 119430 + }, + { + "epoch": 0.7630681164790514, + "grad_norm": 1.1543138027191162, + "learning_rate": 6.820346641203106e-05, + "loss": 0.769, + "step": 119440 + }, + { + "epoch": 0.7631320036287901, + "grad_norm": 1.0326635837554932, + "learning_rate": 6.819879298980553e-05, + "loss": 0.6679, + "step": 119450 + }, + { + "epoch": 0.7631958907785288, + "grad_norm": 0.7548609972000122, + "learning_rate": 6.819411938430243e-05, + "loss": 0.8651, + "step": 119460 + }, + { + "epoch": 0.7632597779282675, + "grad_norm": 0.6115458607673645, + "learning_rate": 6.818944559556879e-05, + "loss": 0.9125, + "step": 119470 + }, + { + "epoch": 0.7633236650780062, + "grad_norm": 0.8484747409820557, + "learning_rate": 6.818477162365172e-05, + "loss": 0.94, + "step": 119480 + }, + { + "epoch": 0.7633875522277449, + "grad_norm": 0.9800739288330078, + "learning_rate": 6.818009746859823e-05, + "loss": 0.6768, + "step": 119490 + }, + { + "epoch": 0.7634514393774836, + "grad_norm": 1.5265213251113892, + "learning_rate": 6.817542313045547e-05, + "loss": 1.0567, + "step": 119500 + }, + { + "epoch": 0.7635153265272223, + "grad_norm": 1.5931601524353027, + "learning_rate": 6.817074860927045e-05, + "loss": 1.4033, + "step": 119510 + }, + { + "epoch": 0.763579213676961, + "grad_norm": 0.976694643497467, + "learning_rate": 6.816607390509028e-05, + "loss": 0.8279, + "step": 119520 + }, + { + "epoch": 0.7636431008266997, + "grad_norm": 0.9799617528915405, + "learning_rate": 6.816139901796202e-05, + "loss": 1.0672, + "step": 119530 + }, + { + "epoch": 0.7637069879764384, + "grad_norm": 1.1323072910308838, + "learning_rate": 6.815672394793277e-05, + "loss": 0.9828, + "step": 119540 + }, + { + "epoch": 0.7637708751261771, + 
"grad_norm": 1.2014492750167847, + "learning_rate": 6.815204869504961e-05, + "loss": 1.0682, + "step": 119550 + }, + { + "epoch": 0.7638347622759158, + "grad_norm": 3.5408363342285156, + "learning_rate": 6.81473732593596e-05, + "loss": 1.0801, + "step": 119560 + }, + { + "epoch": 0.7638986494256546, + "grad_norm": 0.9492976069450378, + "learning_rate": 6.814269764090986e-05, + "loss": 0.9406, + "step": 119570 + }, + { + "epoch": 0.7639625365753933, + "grad_norm": 0.7474743723869324, + "learning_rate": 6.813802183974745e-05, + "loss": 0.8298, + "step": 119580 + }, + { + "epoch": 0.764026423725132, + "grad_norm": 1.4195810556411743, + "learning_rate": 6.813334585591946e-05, + "loss": 1.1686, + "step": 119590 + }, + { + "epoch": 0.7640903108748707, + "grad_norm": 1.6396797895431519, + "learning_rate": 6.8128669689473e-05, + "loss": 0.7154, + "step": 119600 + }, + { + "epoch": 0.7641541980246094, + "grad_norm": 1.0308012962341309, + "learning_rate": 6.812399334045514e-05, + "loss": 0.6851, + "step": 119610 + }, + { + "epoch": 0.7642180851743481, + "grad_norm": 0.7701680064201355, + "learning_rate": 6.8119316808913e-05, + "loss": 0.987, + "step": 119620 + }, + { + "epoch": 0.7642819723240867, + "grad_norm": 0.8354985117912292, + "learning_rate": 6.811464009489365e-05, + "loss": 0.8276, + "step": 119630 + }, + { + "epoch": 0.7643458594738254, + "grad_norm": 1.0913121700286865, + "learning_rate": 6.810996319844422e-05, + "loss": 0.8687, + "step": 119640 + }, + { + "epoch": 0.7644097466235641, + "grad_norm": 0.845710039138794, + "learning_rate": 6.81052861196118e-05, + "loss": 0.9144, + "step": 119650 + }, + { + "epoch": 0.7644736337733028, + "grad_norm": 0.9504249095916748, + "learning_rate": 6.810060885844346e-05, + "loss": 0.9089, + "step": 119660 + }, + { + "epoch": 0.7645375209230415, + "grad_norm": 0.846555769443512, + "learning_rate": 6.809593141498633e-05, + "loss": 0.9722, + "step": 119670 + }, + { + "epoch": 0.7646014080727802, + "grad_norm": 1.736290693283081, + 
"learning_rate": 6.809125378928754e-05, + "loss": 0.8593, + "step": 119680 + }, + { + "epoch": 0.7646652952225189, + "grad_norm": 0.732244610786438, + "learning_rate": 6.808657598139416e-05, + "loss": 0.8176, + "step": 119690 + }, + { + "epoch": 0.7647291823722576, + "grad_norm": 0.5221996307373047, + "learning_rate": 6.80818979913533e-05, + "loss": 0.7639, + "step": 119700 + }, + { + "epoch": 0.7647930695219963, + "grad_norm": 0.8514750003814697, + "learning_rate": 6.80772198192121e-05, + "loss": 0.9472, + "step": 119710 + }, + { + "epoch": 0.764856956671735, + "grad_norm": 0.9706042408943176, + "learning_rate": 6.807254146501766e-05, + "loss": 0.8994, + "step": 119720 + }, + { + "epoch": 0.7649208438214737, + "grad_norm": 0.792775571346283, + "learning_rate": 6.806786292881708e-05, + "loss": 0.9944, + "step": 119730 + }, + { + "epoch": 0.7649847309712124, + "grad_norm": 0.786178469657898, + "learning_rate": 6.80631842106575e-05, + "loss": 0.8212, + "step": 119740 + }, + { + "epoch": 0.7650486181209512, + "grad_norm": 0.7634421586990356, + "learning_rate": 6.805850531058604e-05, + "loss": 0.896, + "step": 119750 + }, + { + "epoch": 0.7651125052706899, + "grad_norm": 1.2600396871566772, + "learning_rate": 6.805382622864978e-05, + "loss": 0.8976, + "step": 119760 + }, + { + "epoch": 0.7651763924204286, + "grad_norm": 0.9852913618087769, + "learning_rate": 6.804914696489587e-05, + "loss": 0.807, + "step": 119770 + }, + { + "epoch": 0.7652402795701673, + "grad_norm": 0.7352543473243713, + "learning_rate": 6.804446751937146e-05, + "loss": 0.9483, + "step": 119780 + }, + { + "epoch": 0.765304166719906, + "grad_norm": 0.6477217674255371, + "learning_rate": 6.803978789212363e-05, + "loss": 0.7509, + "step": 119790 + }, + { + "epoch": 0.7653680538696447, + "grad_norm": 0.7691764831542969, + "learning_rate": 6.803510808319954e-05, + "loss": 0.9045, + "step": 119800 + }, + { + "epoch": 0.7654319410193834, + "grad_norm": 1.1947227716445923, + "learning_rate": 
6.803042809264632e-05, + "loss": 1.0757, + "step": 119810 + }, + { + "epoch": 0.7654958281691221, + "grad_norm": 0.9047258496284485, + "learning_rate": 6.802574792051107e-05, + "loss": 0.8635, + "step": 119820 + }, + { + "epoch": 0.7655597153188608, + "grad_norm": 0.8005874752998352, + "learning_rate": 6.802106756684096e-05, + "loss": 0.7446, + "step": 119830 + }, + { + "epoch": 0.7656236024685995, + "grad_norm": 0.7462660670280457, + "learning_rate": 6.80163870316831e-05, + "loss": 0.6892, + "step": 119840 + }, + { + "epoch": 0.7656874896183382, + "grad_norm": 0.7342929244041443, + "learning_rate": 6.801170631508465e-05, + "loss": 0.8575, + "step": 119850 + }, + { + "epoch": 0.7657513767680769, + "grad_norm": 0.6299241781234741, + "learning_rate": 6.800702541709272e-05, + "loss": 1.0322, + "step": 119860 + }, + { + "epoch": 0.7658152639178156, + "grad_norm": 2.7845346927642822, + "learning_rate": 6.800234433775448e-05, + "loss": 0.9482, + "step": 119870 + }, + { + "epoch": 0.7658791510675542, + "grad_norm": 0.7888658046722412, + "learning_rate": 6.799766307711704e-05, + "loss": 0.9034, + "step": 119880 + }, + { + "epoch": 0.7659430382172929, + "grad_norm": 1.2494713068008423, + "learning_rate": 6.799298163522757e-05, + "loss": 0.7792, + "step": 119890 + }, + { + "epoch": 0.7660069253670316, + "grad_norm": 0.8245709538459778, + "learning_rate": 6.79883000121332e-05, + "loss": 0.7697, + "step": 119900 + }, + { + "epoch": 0.7660708125167703, + "grad_norm": 0.6942414045333862, + "learning_rate": 6.79836182078811e-05, + "loss": 0.9865, + "step": 119910 + }, + { + "epoch": 0.766134699666509, + "grad_norm": 0.8170196413993835, + "learning_rate": 6.797893622251841e-05, + "loss": 0.8344, + "step": 119920 + }, + { + "epoch": 0.7661985868162478, + "grad_norm": 1.3100725412368774, + "learning_rate": 6.797425405609225e-05, + "loss": 0.8997, + "step": 119930 + }, + { + "epoch": 0.7662624739659865, + "grad_norm": 0.8601463437080383, + "learning_rate": 6.796957170864984e-05, + 
"loss": 0.7987, + "step": 119940 + }, + { + "epoch": 0.7663263611157252, + "grad_norm": 0.9720836877822876, + "learning_rate": 6.796488918023827e-05, + "loss": 0.725, + "step": 119950 + }, + { + "epoch": 0.7663902482654639, + "grad_norm": 0.8214823007583618, + "learning_rate": 6.796020647090472e-05, + "loss": 0.6698, + "step": 119960 + }, + { + "epoch": 0.7664541354152026, + "grad_norm": 1.443429946899414, + "learning_rate": 6.795552358069637e-05, + "loss": 0.9442, + "step": 119970 + }, + { + "epoch": 0.7665180225649413, + "grad_norm": 0.8205900192260742, + "learning_rate": 6.795084050966038e-05, + "loss": 0.9229, + "step": 119980 + }, + { + "epoch": 0.76658190971468, + "grad_norm": 0.7707697153091431, + "learning_rate": 6.794615725784386e-05, + "loss": 1.2311, + "step": 119990 + }, + { + "epoch": 0.7666457968644187, + "grad_norm": 0.9643944501876831, + "learning_rate": 6.794147382529403e-05, + "loss": 0.8979, + "step": 120000 + }, + { + "epoch": 0.7667096840141574, + "grad_norm": 0.6266613006591797, + "learning_rate": 6.793679021205804e-05, + "loss": 0.8486, + "step": 120010 + }, + { + "epoch": 0.7667735711638961, + "grad_norm": 0.7396105527877808, + "learning_rate": 6.793210641818305e-05, + "loss": 0.6949, + "step": 120020 + }, + { + "epoch": 0.7668374583136348, + "grad_norm": 1.156002402305603, + "learning_rate": 6.792742244371624e-05, + "loss": 0.8869, + "step": 120030 + }, + { + "epoch": 0.7669013454633735, + "grad_norm": 0.8425427079200745, + "learning_rate": 6.792273828870477e-05, + "loss": 0.6861, + "step": 120040 + }, + { + "epoch": 0.7669652326131122, + "grad_norm": 0.8769024610519409, + "learning_rate": 6.791805395319582e-05, + "loss": 0.7712, + "step": 120050 + }, + { + "epoch": 0.7670291197628509, + "grad_norm": 0.7793144583702087, + "learning_rate": 6.791336943723657e-05, + "loss": 0.8332, + "step": 120060 + }, + { + "epoch": 0.7670930069125896, + "grad_norm": 0.6223095655441284, + "learning_rate": 6.790868474087419e-05, + "loss": 0.8934, + "step": 
120070 + }, + { + "epoch": 0.7671568940623283, + "grad_norm": 1.2079180479049683, + "learning_rate": 6.790399986415587e-05, + "loss": 1.0223, + "step": 120080 + }, + { + "epoch": 0.767220781212067, + "grad_norm": 3.4866483211517334, + "learning_rate": 6.789931480712876e-05, + "loss": 0.9603, + "step": 120090 + }, + { + "epoch": 0.7672846683618058, + "grad_norm": 0.7265200614929199, + "learning_rate": 6.789462956984008e-05, + "loss": 0.8821, + "step": 120100 + }, + { + "epoch": 0.7673485555115445, + "grad_norm": 0.47647228837013245, + "learning_rate": 6.788994415233699e-05, + "loss": 0.8709, + "step": 120110 + }, + { + "epoch": 0.7674124426612831, + "grad_norm": 0.7587248682975769, + "learning_rate": 6.78852585546667e-05, + "loss": 0.9765, + "step": 120120 + }, + { + "epoch": 0.7674763298110218, + "grad_norm": 1.0275150537490845, + "learning_rate": 6.788057277687638e-05, + "loss": 0.9257, + "step": 120130 + }, + { + "epoch": 0.7675402169607605, + "grad_norm": 2.4082839488983154, + "learning_rate": 6.787588681901321e-05, + "loss": 0.7645, + "step": 120140 + }, + { + "epoch": 0.7676041041104992, + "grad_norm": 0.8140583634376526, + "learning_rate": 6.78712006811244e-05, + "loss": 0.9093, + "step": 120150 + }, + { + "epoch": 0.7676679912602379, + "grad_norm": 0.5556838512420654, + "learning_rate": 6.786651436325715e-05, + "loss": 0.8601, + "step": 120160 + }, + { + "epoch": 0.7677318784099766, + "grad_norm": 0.8413486480712891, + "learning_rate": 6.786182786545863e-05, + "loss": 0.7334, + "step": 120170 + }, + { + "epoch": 0.7677957655597153, + "grad_norm": 0.7864370346069336, + "learning_rate": 6.785714118777607e-05, + "loss": 1.0277, + "step": 120180 + }, + { + "epoch": 0.767859652709454, + "grad_norm": 0.9981165528297424, + "learning_rate": 6.785245433025662e-05, + "loss": 0.7542, + "step": 120190 + }, + { + "epoch": 0.7679235398591927, + "grad_norm": 2.119781255722046, + "learning_rate": 6.784776729294752e-05, + "loss": 0.6569, + "step": 120200 + }, + { + "epoch": 
0.7679874270089314, + "grad_norm": 1.0580250024795532, + "learning_rate": 6.784308007589598e-05, + "loss": 0.7881, + "step": 120210 + }, + { + "epoch": 0.7680513141586701, + "grad_norm": 0.9876987338066101, + "learning_rate": 6.783839267914918e-05, + "loss": 0.7479, + "step": 120220 + }, + { + "epoch": 0.7681152013084088, + "grad_norm": 0.8951814770698547, + "learning_rate": 6.783370510275433e-05, + "loss": 0.8872, + "step": 120230 + }, + { + "epoch": 0.7681790884581475, + "grad_norm": 2.5809152126312256, + "learning_rate": 6.782901734675864e-05, + "loss": 0.8542, + "step": 120240 + }, + { + "epoch": 0.7682429756078862, + "grad_norm": 0.884162962436676, + "learning_rate": 6.782432941120932e-05, + "loss": 0.7915, + "step": 120250 + }, + { + "epoch": 0.7683068627576249, + "grad_norm": 0.7958625555038452, + "learning_rate": 6.781964129615359e-05, + "loss": 0.9709, + "step": 120260 + }, + { + "epoch": 0.7683707499073636, + "grad_norm": 0.8712575435638428, + "learning_rate": 6.781495300163865e-05, + "loss": 0.8752, + "step": 120270 + }, + { + "epoch": 0.7684346370571024, + "grad_norm": 0.8485830426216125, + "learning_rate": 6.781026452771172e-05, + "loss": 1.0295, + "step": 120280 + }, + { + "epoch": 0.7684985242068411, + "grad_norm": 0.7899221777915955, + "learning_rate": 6.780557587442001e-05, + "loss": 0.7579, + "step": 120290 + }, + { + "epoch": 0.7685624113565798, + "grad_norm": 0.5380288362503052, + "learning_rate": 6.780088704181075e-05, + "loss": 1.2273, + "step": 120300 + }, + { + "epoch": 0.7686262985063185, + "grad_norm": 0.8067999482154846, + "learning_rate": 6.779619802993118e-05, + "loss": 0.9209, + "step": 120310 + }, + { + "epoch": 0.7686901856560572, + "grad_norm": 0.7237452268600464, + "learning_rate": 6.779150883882848e-05, + "loss": 1.0752, + "step": 120320 + }, + { + "epoch": 0.7687540728057959, + "grad_norm": 0.6322763562202454, + "learning_rate": 6.77868194685499e-05, + "loss": 0.7969, + "step": 120330 + }, + { + "epoch": 0.7688179599555346, + 
"grad_norm": 1.1552351713180542, + "learning_rate": 6.778212991914266e-05, + "loss": 0.9154, + "step": 120340 + }, + { + "epoch": 0.7688818471052733, + "grad_norm": 1.1436083316802979, + "learning_rate": 6.777744019065399e-05, + "loss": 0.9167, + "step": 120350 + }, + { + "epoch": 0.7689457342550119, + "grad_norm": 1.0631415843963623, + "learning_rate": 6.77727502831311e-05, + "loss": 0.8223, + "step": 120360 + }, + { + "epoch": 0.7690096214047506, + "grad_norm": 0.9322156310081482, + "learning_rate": 6.776806019662127e-05, + "loss": 0.9355, + "step": 120370 + }, + { + "epoch": 0.7690735085544893, + "grad_norm": 0.9718419909477234, + "learning_rate": 6.776336993117168e-05, + "loss": 0.8536, + "step": 120380 + }, + { + "epoch": 0.769137395704228, + "grad_norm": 1.3241702318191528, + "learning_rate": 6.775867948682959e-05, + "loss": 0.9899, + "step": 120390 + }, + { + "epoch": 0.7692012828539667, + "grad_norm": 1.2391200065612793, + "learning_rate": 6.775398886364224e-05, + "loss": 0.7317, + "step": 120400 + }, + { + "epoch": 0.7692651700037054, + "grad_norm": 0.8078621029853821, + "learning_rate": 6.774929806165686e-05, + "loss": 1.015, + "step": 120410 + }, + { + "epoch": 0.7693290571534441, + "grad_norm": 0.7837865948677063, + "learning_rate": 6.77446070809207e-05, + "loss": 0.8237, + "step": 120420 + }, + { + "epoch": 0.7693929443031828, + "grad_norm": 0.9741398096084595, + "learning_rate": 6.773991592148098e-05, + "loss": 0.8702, + "step": 120430 + }, + { + "epoch": 0.7694568314529215, + "grad_norm": 1.1501209735870361, + "learning_rate": 6.773522458338497e-05, + "loss": 0.9291, + "step": 120440 + }, + { + "epoch": 0.7695207186026602, + "grad_norm": 0.5776450634002686, + "learning_rate": 6.77305330666799e-05, + "loss": 1.0505, + "step": 120450 + }, + { + "epoch": 0.769584605752399, + "grad_norm": 0.7684696316719055, + "learning_rate": 6.772584137141302e-05, + "loss": 0.7328, + "step": 120460 + }, + { + "epoch": 0.7696484929021377, + "grad_norm": 
0.680523157119751, + "learning_rate": 6.772114949763158e-05, + "loss": 0.9261, + "step": 120470 + }, + { + "epoch": 0.7697123800518764, + "grad_norm": 0.8536334037780762, + "learning_rate": 6.771645744538284e-05, + "loss": 1.0571, + "step": 120480 + }, + { + "epoch": 0.7697762672016151, + "grad_norm": 0.7580819129943848, + "learning_rate": 6.771176521471405e-05, + "loss": 0.8517, + "step": 120490 + }, + { + "epoch": 0.7698401543513538, + "grad_norm": 0.9832444190979004, + "learning_rate": 6.770707280567247e-05, + "loss": 0.9181, + "step": 120500 + }, + { + "epoch": 0.7699040415010925, + "grad_norm": 0.7702086567878723, + "learning_rate": 6.770238021830532e-05, + "loss": 0.9504, + "step": 120510 + }, + { + "epoch": 0.7699679286508312, + "grad_norm": 1.5449334383010864, + "learning_rate": 6.769768745265991e-05, + "loss": 0.7662, + "step": 120520 + }, + { + "epoch": 0.7700318158005699, + "grad_norm": 1.5101966857910156, + "learning_rate": 6.769299450878349e-05, + "loss": 0.8513, + "step": 120530 + }, + { + "epoch": 0.7700957029503086, + "grad_norm": 0.6847185492515564, + "learning_rate": 6.768830138672327e-05, + "loss": 0.8803, + "step": 120540 + }, + { + "epoch": 0.7701595901000473, + "grad_norm": 0.6780611276626587, + "learning_rate": 6.768360808652659e-05, + "loss": 0.7456, + "step": 120550 + }, + { + "epoch": 0.770223477249786, + "grad_norm": 1.0173395872116089, + "learning_rate": 6.767891460824066e-05, + "loss": 0.7139, + "step": 120560 + }, + { + "epoch": 0.7702873643995247, + "grad_norm": 0.6809027791023254, + "learning_rate": 6.767422095191277e-05, + "loss": 1.1505, + "step": 120570 + }, + { + "epoch": 0.7703512515492634, + "grad_norm": 0.9474708437919617, + "learning_rate": 6.766952711759018e-05, + "loss": 0.8336, + "step": 120580 + }, + { + "epoch": 0.7704151386990021, + "grad_norm": 0.5041736960411072, + "learning_rate": 6.766483310532017e-05, + "loss": 0.7056, + "step": 120590 + }, + { + "epoch": 0.7704790258487408, + "grad_norm": 1.975868582725525, + 
"learning_rate": 6.766013891515e-05, + "loss": 0.9774, + "step": 120600 + }, + { + "epoch": 0.7705429129984794, + "grad_norm": 0.8999959826469421, + "learning_rate": 6.765544454712696e-05, + "loss": 0.8933, + "step": 120610 + }, + { + "epoch": 0.7706068001482181, + "grad_norm": 0.839335560798645, + "learning_rate": 6.765075000129831e-05, + "loss": 0.8328, + "step": 120620 + }, + { + "epoch": 0.7706706872979568, + "grad_norm": 0.8537786602973938, + "learning_rate": 6.764605527771133e-05, + "loss": 0.8143, + "step": 120630 + }, + { + "epoch": 0.7707345744476956, + "grad_norm": 0.7219642996788025, + "learning_rate": 6.764136037641333e-05, + "loss": 0.7989, + "step": 120640 + }, + { + "epoch": 0.7707984615974343, + "grad_norm": 0.6712138056755066, + "learning_rate": 6.763666529745156e-05, + "loss": 1.1548, + "step": 120650 + }, + { + "epoch": 0.770862348747173, + "grad_norm": 0.8392811417579651, + "learning_rate": 6.763197004087331e-05, + "loss": 0.7134, + "step": 120660 + }, + { + "epoch": 0.7709262358969117, + "grad_norm": 0.8870442509651184, + "learning_rate": 6.762727460672586e-05, + "loss": 0.7751, + "step": 120670 + }, + { + "epoch": 0.7709901230466504, + "grad_norm": 1.1646859645843506, + "learning_rate": 6.762257899505653e-05, + "loss": 1.0547, + "step": 120680 + }, + { + "epoch": 0.7710540101963891, + "grad_norm": 1.5529083013534546, + "learning_rate": 6.761788320591257e-05, + "loss": 0.8419, + "step": 120690 + }, + { + "epoch": 0.7711178973461278, + "grad_norm": 2.7997336387634277, + "learning_rate": 6.761318723934128e-05, + "loss": 0.9536, + "step": 120700 + }, + { + "epoch": 0.7711817844958665, + "grad_norm": 1.0143194198608398, + "learning_rate": 6.760849109538996e-05, + "loss": 0.9442, + "step": 120710 + }, + { + "epoch": 0.7712456716456052, + "grad_norm": 1.0322544574737549, + "learning_rate": 6.76037947741059e-05, + "loss": 0.951, + "step": 120720 + }, + { + "epoch": 0.7713095587953439, + "grad_norm": 0.9332642555236816, + "learning_rate": 
6.759956793336986e-05, + "loss": 0.7949, + "step": 120730 + }, + { + "epoch": 0.7713734459450826, + "grad_norm": 0.8280695080757141, + "learning_rate": 6.759487127528388e-05, + "loss": 0.7483, + "step": 120740 + }, + { + "epoch": 0.7714373330948213, + "grad_norm": 0.6056891679763794, + "learning_rate": 6.759017444000235e-05, + "loss": 0.8854, + "step": 120750 + }, + { + "epoch": 0.77150122024456, + "grad_norm": 1.0320556163787842, + "learning_rate": 6.758547742757254e-05, + "loss": 0.9241, + "step": 120760 + }, + { + "epoch": 0.7715651073942987, + "grad_norm": 1.820793867111206, + "learning_rate": 6.758078023804176e-05, + "loss": 1.0678, + "step": 120770 + }, + { + "epoch": 0.7716289945440374, + "grad_norm": 2.8395168781280518, + "learning_rate": 6.757608287145731e-05, + "loss": 0.7885, + "step": 120780 + }, + { + "epoch": 0.7716928816937761, + "grad_norm": 0.7798264026641846, + "learning_rate": 6.75713853278665e-05, + "loss": 0.9502, + "step": 120790 + }, + { + "epoch": 0.7717567688435149, + "grad_norm": 0.8113306164741516, + "learning_rate": 6.756668760731665e-05, + "loss": 0.7035, + "step": 120800 + }, + { + "epoch": 0.7718206559932536, + "grad_norm": 0.7610470056533813, + "learning_rate": 6.756198970985506e-05, + "loss": 0.9429, + "step": 120810 + }, + { + "epoch": 0.7718845431429923, + "grad_norm": 0.8831712603569031, + "learning_rate": 6.755729163552902e-05, + "loss": 0.9622, + "step": 120820 + }, + { + "epoch": 0.771948430292731, + "grad_norm": 0.9428032040596008, + "learning_rate": 6.755259338438588e-05, + "loss": 0.9375, + "step": 120830 + }, + { + "epoch": 0.7720123174424697, + "grad_norm": 1.5266450643539429, + "learning_rate": 6.754789495647293e-05, + "loss": 0.9392, + "step": 120840 + }, + { + "epoch": 0.7720762045922083, + "grad_norm": 1.08087956905365, + "learning_rate": 6.75431963518375e-05, + "loss": 1.0333, + "step": 120850 + }, + { + "epoch": 0.772140091741947, + "grad_norm": 1.0593822002410889, + "learning_rate": 6.75384975705269e-05, + "loss": 
0.9502, + "step": 120860 + }, + { + "epoch": 0.7722039788916857, + "grad_norm": 0.902668297290802, + "learning_rate": 6.753379861258846e-05, + "loss": 0.5924, + "step": 120870 + }, + { + "epoch": 0.7722678660414244, + "grad_norm": 1.1227551698684692, + "learning_rate": 6.752909947806951e-05, + "loss": 0.7154, + "step": 120880 + }, + { + "epoch": 0.7723317531911631, + "grad_norm": 0.7121851444244385, + "learning_rate": 6.752440016701736e-05, + "loss": 0.8883, + "step": 120890 + }, + { + "epoch": 0.7723956403409018, + "grad_norm": 0.878093421459198, + "learning_rate": 6.751970067947932e-05, + "loss": 1.0066, + "step": 120900 + }, + { + "epoch": 0.7724595274906405, + "grad_norm": 2.005844831466675, + "learning_rate": 6.751500101550275e-05, + "loss": 0.8736, + "step": 120910 + }, + { + "epoch": 0.7725234146403792, + "grad_norm": 1.369321346282959, + "learning_rate": 6.751030117513497e-05, + "loss": 1.1788, + "step": 120920 + }, + { + "epoch": 0.7725873017901179, + "grad_norm": 0.6035107374191284, + "learning_rate": 6.750560115842332e-05, + "loss": 1.1607, + "step": 120930 + }, + { + "epoch": 0.7726511889398566, + "grad_norm": 1.0282695293426514, + "learning_rate": 6.750090096541511e-05, + "loss": 0.7348, + "step": 120940 + }, + { + "epoch": 0.7727150760895953, + "grad_norm": 0.5575137734413147, + "learning_rate": 6.749620059615768e-05, + "loss": 0.8886, + "step": 120950 + }, + { + "epoch": 0.772778963239334, + "grad_norm": 0.8261436223983765, + "learning_rate": 6.749150005069838e-05, + "loss": 0.928, + "step": 120960 + }, + { + "epoch": 0.7728428503890727, + "grad_norm": 0.8338256478309631, + "learning_rate": 6.748679932908454e-05, + "loss": 0.646, + "step": 120970 + }, + { + "epoch": 0.7729067375388115, + "grad_norm": 0.7634387612342834, + "learning_rate": 6.74820984313635e-05, + "loss": 0.9501, + "step": 120980 + }, + { + "epoch": 0.7729706246885502, + "grad_norm": 0.8158954977989197, + "learning_rate": 6.747739735758262e-05, + "loss": 0.937, + "step": 120990 + }, + 
{ + "epoch": 0.7730345118382889, + "grad_norm": 0.8353099226951599, + "learning_rate": 6.747269610778922e-05, + "loss": 0.9787, + "step": 121000 + }, + { + "epoch": 0.7730983989880276, + "grad_norm": 1.1733039617538452, + "learning_rate": 6.746799468203064e-05, + "loss": 0.924, + "step": 121010 + }, + { + "epoch": 0.7731622861377663, + "grad_norm": 0.6754859685897827, + "learning_rate": 6.746329308035426e-05, + "loss": 1.1289, + "step": 121020 + }, + { + "epoch": 0.773226173287505, + "grad_norm": 0.7313271164894104, + "learning_rate": 6.745859130280741e-05, + "loss": 0.8438, + "step": 121030 + }, + { + "epoch": 0.7732900604372437, + "grad_norm": 1.6041016578674316, + "learning_rate": 6.745388934943743e-05, + "loss": 0.7458, + "step": 121040 + }, + { + "epoch": 0.7733539475869824, + "grad_norm": 0.7553384900093079, + "learning_rate": 6.744918722029169e-05, + "loss": 0.8966, + "step": 121050 + }, + { + "epoch": 0.7734178347367211, + "grad_norm": 0.4830940365791321, + "learning_rate": 6.744448491541754e-05, + "loss": 0.6584, + "step": 121060 + }, + { + "epoch": 0.7734817218864598, + "grad_norm": 0.8653696179389954, + "learning_rate": 6.743978243486233e-05, + "loss": 1.2337, + "step": 121070 + }, + { + "epoch": 0.7735456090361985, + "grad_norm": 0.8902184963226318, + "learning_rate": 6.743507977867342e-05, + "loss": 0.8364, + "step": 121080 + }, + { + "epoch": 0.7736094961859371, + "grad_norm": 0.9658520817756653, + "learning_rate": 6.74303769468982e-05, + "loss": 0.9397, + "step": 121090 + }, + { + "epoch": 0.7736733833356758, + "grad_norm": 0.7507711052894592, + "learning_rate": 6.742567393958398e-05, + "loss": 0.723, + "step": 121100 + }, + { + "epoch": 0.7737372704854145, + "grad_norm": 0.6307206153869629, + "learning_rate": 6.742097075677815e-05, + "loss": 0.7924, + "step": 121110 + }, + { + "epoch": 0.7738011576351532, + "grad_norm": 1.159859538078308, + "learning_rate": 6.741626739852806e-05, + "loss": 0.8277, + "step": 121120 + }, + { + "epoch": 
0.7738650447848919, + "grad_norm": 0.7750802636146545, + "learning_rate": 6.741156386488112e-05, + "loss": 1.0919, + "step": 121130 + }, + { + "epoch": 0.7739289319346306, + "grad_norm": 0.9529350399971008, + "learning_rate": 6.740686015588465e-05, + "loss": 1.0912, + "step": 121140 + }, + { + "epoch": 0.7739928190843693, + "grad_norm": 0.8599395751953125, + "learning_rate": 6.740215627158605e-05, + "loss": 1.0332, + "step": 121150 + }, + { + "epoch": 0.774056706234108, + "grad_norm": 0.9793898463249207, + "learning_rate": 6.739745221203268e-05, + "loss": 0.7607, + "step": 121160 + }, + { + "epoch": 0.7741205933838468, + "grad_norm": 0.8916863799095154, + "learning_rate": 6.739274797727191e-05, + "loss": 1.0146, + "step": 121170 + }, + { + "epoch": 0.7741844805335855, + "grad_norm": 1.1108578443527222, + "learning_rate": 6.738804356735113e-05, + "loss": 0.9828, + "step": 121180 + }, + { + "epoch": 0.7742483676833242, + "grad_norm": 1.299629807472229, + "learning_rate": 6.73833389823177e-05, + "loss": 1.1273, + "step": 121190 + }, + { + "epoch": 0.7743122548330629, + "grad_norm": 0.9776250123977661, + "learning_rate": 6.737863422221902e-05, + "loss": 0.7339, + "step": 121200 + }, + { + "epoch": 0.7743761419828016, + "grad_norm": 0.8308240175247192, + "learning_rate": 6.737392928710245e-05, + "loss": 0.9728, + "step": 121210 + }, + { + "epoch": 0.7744400291325403, + "grad_norm": 1.0700846910476685, + "learning_rate": 6.736922417701537e-05, + "loss": 0.8842, + "step": 121220 + }, + { + "epoch": 0.774503916282279, + "grad_norm": 0.8977962732315063, + "learning_rate": 6.736451889200518e-05, + "loss": 0.8738, + "step": 121230 + }, + { + "epoch": 0.7745678034320177, + "grad_norm": 0.8981806039810181, + "learning_rate": 6.735981343211927e-05, + "loss": 0.82, + "step": 121240 + }, + { + "epoch": 0.7746316905817564, + "grad_norm": 1.0636060237884521, + "learning_rate": 6.735510779740502e-05, + "loss": 0.8949, + "step": 121250 + }, + { + "epoch": 0.7746955777314951, + 
"grad_norm": 0.8184270858764648, + "learning_rate": 6.735040198790982e-05, + "loss": 0.8559, + "step": 121260 + }, + { + "epoch": 0.7747594648812338, + "grad_norm": 1.0047454833984375, + "learning_rate": 6.734569600368105e-05, + "loss": 1.2097, + "step": 121270 + }, + { + "epoch": 0.7748233520309725, + "grad_norm": 1.6020781993865967, + "learning_rate": 6.734098984476612e-05, + "loss": 0.757, + "step": 121280 + }, + { + "epoch": 0.7748872391807112, + "grad_norm": 0.7962193489074707, + "learning_rate": 6.733628351121243e-05, + "loss": 0.8267, + "step": 121290 + }, + { + "epoch": 0.7749511263304499, + "grad_norm": 1.1019634008407593, + "learning_rate": 6.733157700306737e-05, + "loss": 0.869, + "step": 121300 + }, + { + "epoch": 0.7750150134801886, + "grad_norm": 0.6633391976356506, + "learning_rate": 6.732687032037832e-05, + "loss": 0.843, + "step": 121310 + }, + { + "epoch": 0.7750789006299273, + "grad_norm": 1.0275635719299316, + "learning_rate": 6.73221634631927e-05, + "loss": 0.9104, + "step": 121320 + }, + { + "epoch": 0.7751427877796659, + "grad_norm": 0.7791745662689209, + "learning_rate": 6.73174564315579e-05, + "loss": 0.9885, + "step": 121330 + }, + { + "epoch": 0.7752066749294046, + "grad_norm": 1.2673611640930176, + "learning_rate": 6.731274922552135e-05, + "loss": 0.6765, + "step": 121340 + }, + { + "epoch": 0.7752705620791434, + "grad_norm": 0.8525099754333496, + "learning_rate": 6.730804184513044e-05, + "loss": 0.8447, + "step": 121350 + }, + { + "epoch": 0.7753344492288821, + "grad_norm": 0.8787998557090759, + "learning_rate": 6.730333429043256e-05, + "loss": 0.8673, + "step": 121360 + }, + { + "epoch": 0.7753983363786208, + "grad_norm": 0.7278786897659302, + "learning_rate": 6.729862656147514e-05, + "loss": 0.7846, + "step": 121370 + }, + { + "epoch": 0.7754622235283595, + "grad_norm": 1.0714443922042847, + "learning_rate": 6.729391865830559e-05, + "loss": 0.809, + "step": 121380 + }, + { + "epoch": 0.7755261106780982, + "grad_norm": 
0.820010244846344, + "learning_rate": 6.72892105809713e-05, + "loss": 0.8847, + "step": 121390 + }, + { + "epoch": 0.7755899978278369, + "grad_norm": 1.3069791793823242, + "learning_rate": 6.728450232951972e-05, + "loss": 0.8478, + "step": 121400 + }, + { + "epoch": 0.7756538849775756, + "grad_norm": 0.992739737033844, + "learning_rate": 6.727979390399825e-05, + "loss": 1.0541, + "step": 121410 + }, + { + "epoch": 0.7757177721273143, + "grad_norm": 0.8804332613945007, + "learning_rate": 6.72750853044543e-05, + "loss": 0.9293, + "step": 121420 + }, + { + "epoch": 0.775781659277053, + "grad_norm": 0.9958817958831787, + "learning_rate": 6.72703765309353e-05, + "loss": 0.959, + "step": 121430 + }, + { + "epoch": 0.7758455464267917, + "grad_norm": 0.8248307704925537, + "learning_rate": 6.726566758348867e-05, + "loss": 0.8786, + "step": 121440 + }, + { + "epoch": 0.7759094335765304, + "grad_norm": 0.9788550138473511, + "learning_rate": 6.726095846216181e-05, + "loss": 0.6713, + "step": 121450 + }, + { + "epoch": 0.7759733207262691, + "grad_norm": 1.0201257467269897, + "learning_rate": 6.725624916700218e-05, + "loss": 1.1351, + "step": 121460 + }, + { + "epoch": 0.7760372078760078, + "grad_norm": 0.550969123840332, + "learning_rate": 6.72515396980572e-05, + "loss": 0.9425, + "step": 121470 + }, + { + "epoch": 0.7761010950257465, + "grad_norm": 0.7929662466049194, + "learning_rate": 6.724683005537427e-05, + "loss": 1.0115, + "step": 121480 + }, + { + "epoch": 0.7761649821754852, + "grad_norm": 1.1260933876037598, + "learning_rate": 6.724212023900086e-05, + "loss": 0.8828, + "step": 121490 + }, + { + "epoch": 0.776228869325224, + "grad_norm": 1.8724021911621094, + "learning_rate": 6.723741024898438e-05, + "loss": 1.1478, + "step": 121500 + }, + { + "epoch": 0.7762927564749627, + "grad_norm": 0.7166033387184143, + "learning_rate": 6.723270008537225e-05, + "loss": 0.9861, + "step": 121510 + }, + { + "epoch": 0.7763566436247014, + "grad_norm": 0.6689345240592957, + 
"learning_rate": 6.722798974821193e-05, + "loss": 1.0694, + "step": 121520 + }, + { + "epoch": 0.7764205307744401, + "grad_norm": 0.7042884230613708, + "learning_rate": 6.722327923755086e-05, + "loss": 1.0029, + "step": 121530 + }, + { + "epoch": 0.7764844179241788, + "grad_norm": 0.6996636390686035, + "learning_rate": 6.721856855343647e-05, + "loss": 0.7168, + "step": 121540 + }, + { + "epoch": 0.7765483050739175, + "grad_norm": 0.7664635181427002, + "learning_rate": 6.721385769591618e-05, + "loss": 1.13, + "step": 121550 + }, + { + "epoch": 0.7766121922236562, + "grad_norm": 1.2732270956039429, + "learning_rate": 6.720914666503746e-05, + "loss": 0.7654, + "step": 121560 + }, + { + "epoch": 0.7766760793733949, + "grad_norm": 1.799688458442688, + "learning_rate": 6.720443546084775e-05, + "loss": 0.9057, + "step": 121570 + }, + { + "epoch": 0.7767399665231335, + "grad_norm": 0.6729174852371216, + "learning_rate": 6.719972408339447e-05, + "loss": 0.8531, + "step": 121580 + }, + { + "epoch": 0.7768038536728722, + "grad_norm": 0.8907155394554138, + "learning_rate": 6.719501253272513e-05, + "loss": 0.6519, + "step": 121590 + }, + { + "epoch": 0.7768677408226109, + "grad_norm": 0.7867339253425598, + "learning_rate": 6.71903008088871e-05, + "loss": 0.7657, + "step": 121600 + }, + { + "epoch": 0.7769316279723496, + "grad_norm": 0.8111919164657593, + "learning_rate": 6.718558891192788e-05, + "loss": 0.89, + "step": 121610 + }, + { + "epoch": 0.7769955151220883, + "grad_norm": 1.4637147188186646, + "learning_rate": 6.718087684189491e-05, + "loss": 0.7084, + "step": 121620 + }, + { + "epoch": 0.777059402271827, + "grad_norm": 0.9606701731681824, + "learning_rate": 6.717616459883564e-05, + "loss": 0.8442, + "step": 121630 + }, + { + "epoch": 0.7771232894215657, + "grad_norm": 0.5550901293754578, + "learning_rate": 6.717145218279755e-05, + "loss": 0.785, + "step": 121640 + }, + { + "epoch": 0.7771871765713044, + "grad_norm": 0.8698827028274536, + "learning_rate": 
6.716673959382806e-05, + "loss": 0.9919, + "step": 121650 + }, + { + "epoch": 0.7772510637210431, + "grad_norm": 1.0764882564544678, + "learning_rate": 6.716202683197468e-05, + "loss": 0.8303, + "step": 121660 + }, + { + "epoch": 0.7773149508707818, + "grad_norm": 0.6857743263244629, + "learning_rate": 6.715731389728484e-05, + "loss": 0.8815, + "step": 121670 + }, + { + "epoch": 0.7773788380205205, + "grad_norm": 1.6733158826828003, + "learning_rate": 6.715260078980599e-05, + "loss": 0.7927, + "step": 121680 + }, + { + "epoch": 0.7774427251702593, + "grad_norm": 0.7211989164352417, + "learning_rate": 6.714788750958561e-05, + "loss": 0.9503, + "step": 121690 + }, + { + "epoch": 0.777506612319998, + "grad_norm": 1.002265453338623, + "learning_rate": 6.714317405667118e-05, + "loss": 1.0882, + "step": 121700 + }, + { + "epoch": 0.7775704994697367, + "grad_norm": 0.9164408445358276, + "learning_rate": 6.713846043111014e-05, + "loss": 0.886, + "step": 121710 + }, + { + "epoch": 0.7776343866194754, + "grad_norm": 0.5526295304298401, + "learning_rate": 6.713374663294999e-05, + "loss": 0.7163, + "step": 121720 + }, + { + "epoch": 0.7776982737692141, + "grad_norm": 1.0541777610778809, + "learning_rate": 6.712903266223818e-05, + "loss": 1.219, + "step": 121730 + }, + { + "epoch": 0.7777621609189528, + "grad_norm": 1.3423256874084473, + "learning_rate": 6.71243185190222e-05, + "loss": 1.2439, + "step": 121740 + }, + { + "epoch": 0.7778260480686915, + "grad_norm": 0.900256335735321, + "learning_rate": 6.711960420334951e-05, + "loss": 0.9215, + "step": 121750 + }, + { + "epoch": 0.7778899352184302, + "grad_norm": 0.7287362813949585, + "learning_rate": 6.71148897152676e-05, + "loss": 0.7935, + "step": 121760 + }, + { + "epoch": 0.7779538223681689, + "grad_norm": 0.6165835857391357, + "learning_rate": 6.711017505482395e-05, + "loss": 0.8651, + "step": 121770 + }, + { + "epoch": 0.7780177095179076, + "grad_norm": 1.222276210784912, + "learning_rate": 6.710546022206603e-05, + 
"loss": 0.7607, + "step": 121780 + }, + { + "epoch": 0.7780815966676463, + "grad_norm": 0.9571607112884521, + "learning_rate": 6.71007452170413e-05, + "loss": 0.9691, + "step": 121790 + }, + { + "epoch": 0.778145483817385, + "grad_norm": 0.7661402821540833, + "learning_rate": 6.709603003979729e-05, + "loss": 0.8724, + "step": 121800 + }, + { + "epoch": 0.7782093709671237, + "grad_norm": 0.9721023440361023, + "learning_rate": 6.709131469038149e-05, + "loss": 0.6902, + "step": 121810 + }, + { + "epoch": 0.7782732581168623, + "grad_norm": 0.9388052821159363, + "learning_rate": 6.708659916884135e-05, + "loss": 0.8722, + "step": 121820 + }, + { + "epoch": 0.778337145266601, + "grad_norm": 1.3385436534881592, + "learning_rate": 6.708188347522438e-05, + "loss": 0.9258, + "step": 121830 + }, + { + "epoch": 0.7784010324163397, + "grad_norm": 1.1021685600280762, + "learning_rate": 6.707716760957805e-05, + "loss": 1.1272, + "step": 121840 + }, + { + "epoch": 0.7784649195660784, + "grad_norm": 0.7849000096321106, + "learning_rate": 6.707245157194987e-05, + "loss": 0.8731, + "step": 121850 + }, + { + "epoch": 0.7785288067158171, + "grad_norm": 0.7973129153251648, + "learning_rate": 6.706773536238734e-05, + "loss": 1.0259, + "step": 121860 + }, + { + "epoch": 0.7785926938655559, + "grad_norm": 0.8367007970809937, + "learning_rate": 6.706301898093795e-05, + "loss": 0.8157, + "step": 121870 + }, + { + "epoch": 0.7786565810152946, + "grad_norm": 0.8635137677192688, + "learning_rate": 6.70583024276492e-05, + "loss": 0.907, + "step": 121880 + }, + { + "epoch": 0.7787204681650333, + "grad_norm": 0.8511916399002075, + "learning_rate": 6.705358570256858e-05, + "loss": 0.9413, + "step": 121890 + }, + { + "epoch": 0.778784355314772, + "grad_norm": 0.8517649173736572, + "learning_rate": 6.70488688057436e-05, + "loss": 0.9937, + "step": 121900 + }, + { + "epoch": 0.7788482424645107, + "grad_norm": 0.8689191341400146, + "learning_rate": 6.704415173722176e-05, + "loss": 0.9242, + "step": 
121910 + }, + { + "epoch": 0.7789121296142494, + "grad_norm": 0.7940566539764404, + "learning_rate": 6.70394344970506e-05, + "loss": 0.7244, + "step": 121920 + }, + { + "epoch": 0.7789760167639881, + "grad_norm": 1.276955008506775, + "learning_rate": 6.703471708527756e-05, + "loss": 1.1358, + "step": 121930 + }, + { + "epoch": 0.7790399039137268, + "grad_norm": 1.2477837800979614, + "learning_rate": 6.702999950195017e-05, + "loss": 1.0286, + "step": 121940 + }, + { + "epoch": 0.7791037910634655, + "grad_norm": 0.9349541664123535, + "learning_rate": 6.702528174711597e-05, + "loss": 0.9723, + "step": 121950 + }, + { + "epoch": 0.7791676782132042, + "grad_norm": 0.8674134612083435, + "learning_rate": 6.702056382082245e-05, + "loss": 1.0462, + "step": 121960 + }, + { + "epoch": 0.7792315653629429, + "grad_norm": 1.3984166383743286, + "learning_rate": 6.701584572311712e-05, + "loss": 0.9276, + "step": 121970 + }, + { + "epoch": 0.7792954525126816, + "grad_norm": 1.5692760944366455, + "learning_rate": 6.701112745404752e-05, + "loss": 0.7578, + "step": 121980 + }, + { + "epoch": 0.7793593396624203, + "grad_norm": 1.1319806575775146, + "learning_rate": 6.700640901366113e-05, + "loss": 0.927, + "step": 121990 + }, + { + "epoch": 0.779423226812159, + "grad_norm": 1.1491388082504272, + "learning_rate": 6.700169040200551e-05, + "loss": 0.8779, + "step": 122000 + }, + { + "epoch": 0.7794871139618977, + "grad_norm": 0.9306029081344604, + "learning_rate": 6.699697161912815e-05, + "loss": 0.7728, + "step": 122010 + }, + { + "epoch": 0.7795510011116364, + "grad_norm": 2.3695061206817627, + "learning_rate": 6.699225266507658e-05, + "loss": 1.0062, + "step": 122020 + }, + { + "epoch": 0.7796148882613751, + "grad_norm": 0.8340387344360352, + "learning_rate": 6.698753353989831e-05, + "loss": 0.7666, + "step": 122030 + }, + { + "epoch": 0.7796787754111139, + "grad_norm": 1.7601200342178345, + "learning_rate": 6.69828142436409e-05, + "loss": 0.9149, + "step": 122040 + }, + { + "epoch": 
0.7797426625608526, + "grad_norm": 0.7318232655525208, + "learning_rate": 6.697809477635187e-05, + "loss": 0.8447, + "step": 122050 + }, + { + "epoch": 0.7798065497105912, + "grad_norm": 1.044084906578064, + "learning_rate": 6.697384710959896e-05, + "loss": 1.3071, + "step": 122060 + }, + { + "epoch": 0.7798704368603299, + "grad_norm": 0.5054813623428345, + "learning_rate": 6.696912731748075e-05, + "loss": 0.8569, + "step": 122070 + }, + { + "epoch": 0.7799343240100686, + "grad_norm": 0.8287333846092224, + "learning_rate": 6.696440735446876e-05, + "loss": 1.0033, + "step": 122080 + }, + { + "epoch": 0.7799982111598073, + "grad_norm": 0.8262792825698853, + "learning_rate": 6.695968722061052e-05, + "loss": 0.8806, + "step": 122090 + }, + { + "epoch": 0.780062098309546, + "grad_norm": 0.9939790964126587, + "learning_rate": 6.695496691595354e-05, + "loss": 1.0262, + "step": 122100 + }, + { + "epoch": 0.7801259854592847, + "grad_norm": 0.7839484810829163, + "learning_rate": 6.695024644054537e-05, + "loss": 1.0321, + "step": 122110 + }, + { + "epoch": 0.7801898726090234, + "grad_norm": 0.7679548859596252, + "learning_rate": 6.694552579443358e-05, + "loss": 0.8247, + "step": 122120 + }, + { + "epoch": 0.7802537597587621, + "grad_norm": 1.0078667402267456, + "learning_rate": 6.694080497766567e-05, + "loss": 0.8459, + "step": 122130 + }, + { + "epoch": 0.7803176469085008, + "grad_norm": 0.964644730091095, + "learning_rate": 6.69360839902892e-05, + "loss": 0.9135, + "step": 122140 + }, + { + "epoch": 0.7803815340582395, + "grad_norm": 1.4966347217559814, + "learning_rate": 6.69313628323517e-05, + "loss": 0.756, + "step": 122150 + }, + { + "epoch": 0.7804454212079782, + "grad_norm": 0.8424573540687561, + "learning_rate": 6.692664150390073e-05, + "loss": 0.8184, + "step": 122160 + }, + { + "epoch": 0.7805093083577169, + "grad_norm": 0.6761122345924377, + "learning_rate": 6.692192000498385e-05, + "loss": 1.1223, + "step": 122170 + }, + { + "epoch": 0.7805731955074556, + 
"grad_norm": 1.4543062448501587, + "learning_rate": 6.69171983356486e-05, + "loss": 0.7813, + "step": 122180 + }, + { + "epoch": 0.7806370826571943, + "grad_norm": 0.830903172492981, + "learning_rate": 6.691247649594251e-05, + "loss": 0.9245, + "step": 122190 + }, + { + "epoch": 0.780700969806933, + "grad_norm": 0.994420051574707, + "learning_rate": 6.690775448591316e-05, + "loss": 0.8715, + "step": 122200 + }, + { + "epoch": 0.7807648569566717, + "grad_norm": 0.7622717618942261, + "learning_rate": 6.69030323056081e-05, + "loss": 0.9868, + "step": 122210 + }, + { + "epoch": 0.7808287441064105, + "grad_norm": 0.8121097683906555, + "learning_rate": 6.689830995507487e-05, + "loss": 1.1996, + "step": 122220 + }, + { + "epoch": 0.7808926312561492, + "grad_norm": 0.6650993227958679, + "learning_rate": 6.689358743436105e-05, + "loss": 0.6315, + "step": 122230 + }, + { + "epoch": 0.7809565184058879, + "grad_norm": 1.005804419517517, + "learning_rate": 6.68888647435142e-05, + "loss": 0.9496, + "step": 122240 + }, + { + "epoch": 0.7810204055556266, + "grad_norm": 0.9154719114303589, + "learning_rate": 6.688414188258185e-05, + "loss": 0.7727, + "step": 122250 + }, + { + "epoch": 0.7810842927053653, + "grad_norm": 0.7355092167854309, + "learning_rate": 6.687941885161158e-05, + "loss": 0.84, + "step": 122260 + }, + { + "epoch": 0.781148179855104, + "grad_norm": 0.5671817064285278, + "learning_rate": 6.687469565065096e-05, + "loss": 0.9975, + "step": 122270 + }, + { + "epoch": 0.7812120670048427, + "grad_norm": 1.7897629737854004, + "learning_rate": 6.686997227974756e-05, + "loss": 0.9906, + "step": 122280 + }, + { + "epoch": 0.7812759541545814, + "grad_norm": 0.9264022707939148, + "learning_rate": 6.686524873894894e-05, + "loss": 0.9234, + "step": 122290 + }, + { + "epoch": 0.7813398413043201, + "grad_norm": 1.2895312309265137, + "learning_rate": 6.68605250283027e-05, + "loss": 0.8688, + "step": 122300 + }, + { + "epoch": 0.7814037284540587, + "grad_norm": 1.5715874433517456, + 
"learning_rate": 6.685580114785638e-05, + "loss": 0.951, + "step": 122310 + }, + { + "epoch": 0.7814676156037974, + "grad_norm": 0.6482036709785461, + "learning_rate": 6.685107709765755e-05, + "loss": 0.9561, + "step": 122320 + }, + { + "epoch": 0.7815315027535361, + "grad_norm": 0.810217559337616, + "learning_rate": 6.684635287775381e-05, + "loss": 1.0825, + "step": 122330 + }, + { + "epoch": 0.7815953899032748, + "grad_norm": 0.8282271027565002, + "learning_rate": 6.68416284881927e-05, + "loss": 0.8896, + "step": 122340 + }, + { + "epoch": 0.7816592770530135, + "grad_norm": 0.6604433059692383, + "learning_rate": 6.683690392902184e-05, + "loss": 0.8456, + "step": 122350 + }, + { + "epoch": 0.7817231642027522, + "grad_norm": 0.8533942699432373, + "learning_rate": 6.683217920028876e-05, + "loss": 0.8331, + "step": 122360 + }, + { + "epoch": 0.7817870513524909, + "grad_norm": 1.0768920183181763, + "learning_rate": 6.68274543020411e-05, + "loss": 0.9044, + "step": 122370 + }, + { + "epoch": 0.7818509385022296, + "grad_norm": 1.2052894830703735, + "learning_rate": 6.682272923432643e-05, + "loss": 0.9181, + "step": 122380 + }, + { + "epoch": 0.7819148256519683, + "grad_norm": 1.163033127784729, + "learning_rate": 6.681800399719229e-05, + "loss": 0.9498, + "step": 122390 + }, + { + "epoch": 0.781978712801707, + "grad_norm": 1.0256508588790894, + "learning_rate": 6.681327859068633e-05, + "loss": 0.9087, + "step": 122400 + }, + { + "epoch": 0.7820425999514458, + "grad_norm": 1.141444444656372, + "learning_rate": 6.680855301485609e-05, + "loss": 0.8819, + "step": 122410 + }, + { + "epoch": 0.7821064871011845, + "grad_norm": 1.1306743621826172, + "learning_rate": 6.680382726974918e-05, + "loss": 0.9268, + "step": 122420 + }, + { + "epoch": 0.7821703742509232, + "grad_norm": 0.9543069005012512, + "learning_rate": 6.67991013554132e-05, + "loss": 0.8443, + "step": 122430 + }, + { + "epoch": 0.7822342614006619, + "grad_norm": 0.49687138199806213, + "learning_rate": 
6.679437527189571e-05, + "loss": 0.9003, + "step": 122440 + }, + { + "epoch": 0.7822981485504006, + "grad_norm": 1.3681707382202148, + "learning_rate": 6.678964901924435e-05, + "loss": 1.0405, + "step": 122450 + }, + { + "epoch": 0.7823620357001393, + "grad_norm": 2.7566139698028564, + "learning_rate": 6.678492259750672e-05, + "loss": 1.1397, + "step": 122460 + }, + { + "epoch": 0.782425922849878, + "grad_norm": 1.4756008386611938, + "learning_rate": 6.678019600673037e-05, + "loss": 0.8729, + "step": 122470 + }, + { + "epoch": 0.7824898099996167, + "grad_norm": 1.502285122871399, + "learning_rate": 6.677546924696295e-05, + "loss": 0.7695, + "step": 122480 + }, + { + "epoch": 0.7825536971493554, + "grad_norm": 0.9394874572753906, + "learning_rate": 6.677074231825203e-05, + "loss": 0.8499, + "step": 122490 + }, + { + "epoch": 0.7826175842990941, + "grad_norm": 0.6078043580055237, + "learning_rate": 6.676601522064522e-05, + "loss": 0.7862, + "step": 122500 + }, + { + "epoch": 0.7826814714488328, + "grad_norm": 0.6710939407348633, + "learning_rate": 6.676128795419015e-05, + "loss": 1.0033, + "step": 122510 + }, + { + "epoch": 0.7827453585985715, + "grad_norm": 1.7409946918487549, + "learning_rate": 6.67565605189344e-05, + "loss": 0.7387, + "step": 122520 + }, + { + "epoch": 0.7828092457483102, + "grad_norm": 1.1592122316360474, + "learning_rate": 6.67518329149256e-05, + "loss": 0.8663, + "step": 122530 + }, + { + "epoch": 0.7828731328980489, + "grad_norm": 0.7959754467010498, + "learning_rate": 6.674710514221133e-05, + "loss": 0.9003, + "step": 122540 + }, + { + "epoch": 0.7829370200477875, + "grad_norm": 1.363761305809021, + "learning_rate": 6.674237720083924e-05, + "loss": 0.7244, + "step": 122550 + }, + { + "epoch": 0.7830009071975262, + "grad_norm": 0.603500247001648, + "learning_rate": 6.673764909085692e-05, + "loss": 0.7391, + "step": 122560 + }, + { + "epoch": 0.783064794347265, + "grad_norm": 1.3545539379119873, + "learning_rate": 6.6732920812312e-05, + "loss": 
0.8183, + "step": 122570 + }, + { + "epoch": 0.7831286814970037, + "grad_norm": 0.8553158044815063, + "learning_rate": 6.672819236525208e-05, + "loss": 0.8083, + "step": 122580 + }, + { + "epoch": 0.7831925686467424, + "grad_norm": 1.2060186862945557, + "learning_rate": 6.67234637497248e-05, + "loss": 1.0407, + "step": 122590 + }, + { + "epoch": 0.7832564557964811, + "grad_norm": 0.7966336607933044, + "learning_rate": 6.671873496577777e-05, + "loss": 1.004, + "step": 122600 + }, + { + "epoch": 0.7833203429462198, + "grad_norm": 1.0027638673782349, + "learning_rate": 6.671400601345861e-05, + "loss": 1.013, + "step": 122610 + }, + { + "epoch": 0.7833842300959585, + "grad_norm": 0.8682675957679749, + "learning_rate": 6.670927689281494e-05, + "loss": 1.0311, + "step": 122620 + }, + { + "epoch": 0.7834481172456972, + "grad_norm": 1.4984140396118164, + "learning_rate": 6.670454760389442e-05, + "loss": 0.6875, + "step": 122630 + }, + { + "epoch": 0.7835120043954359, + "grad_norm": 0.7457048892974854, + "learning_rate": 6.669981814674464e-05, + "loss": 0.6236, + "step": 122640 + }, + { + "epoch": 0.7835758915451746, + "grad_norm": 0.7693182826042175, + "learning_rate": 6.669508852141325e-05, + "loss": 0.7395, + "step": 122650 + }, + { + "epoch": 0.7836397786949133, + "grad_norm": 2.3714489936828613, + "learning_rate": 6.669035872794786e-05, + "loss": 1.1034, + "step": 122660 + }, + { + "epoch": 0.783703665844652, + "grad_norm": 0.9617331027984619, + "learning_rate": 6.668562876639614e-05, + "loss": 0.9657, + "step": 122670 + }, + { + "epoch": 0.7837675529943907, + "grad_norm": 1.0759632587432861, + "learning_rate": 6.66808986368057e-05, + "loss": 0.6798, + "step": 122680 + }, + { + "epoch": 0.7838314401441294, + "grad_norm": 0.8608376383781433, + "learning_rate": 6.667616833922416e-05, + "loss": 0.8573, + "step": 122690 + }, + { + "epoch": 0.7838953272938681, + "grad_norm": 0.802783727645874, + "learning_rate": 6.66714378736992e-05, + "loss": 0.8653, + "step": 122700 + }, 
+ { + "epoch": 0.7839592144436068, + "grad_norm": 1.2049453258514404, + "learning_rate": 6.666670724027844e-05, + "loss": 0.8514, + "step": 122710 + }, + { + "epoch": 0.7840231015933455, + "grad_norm": 1.9265187978744507, + "learning_rate": 6.66619764390095e-05, + "loss": 0.8429, + "step": 122720 + }, + { + "epoch": 0.7840869887430842, + "grad_norm": 1.0026494264602661, + "learning_rate": 6.665724546994005e-05, + "loss": 1.0991, + "step": 122730 + }, + { + "epoch": 0.784150875892823, + "grad_norm": 1.0881091356277466, + "learning_rate": 6.665251433311773e-05, + "loss": 0.8073, + "step": 122740 + }, + { + "epoch": 0.7842147630425617, + "grad_norm": 0.8942010998725891, + "learning_rate": 6.664778302859018e-05, + "loss": 0.9164, + "step": 122750 + }, + { + "epoch": 0.7842786501923004, + "grad_norm": 1.0254307985305786, + "learning_rate": 6.664305155640507e-05, + "loss": 0.8306, + "step": 122760 + }, + { + "epoch": 0.7843425373420391, + "grad_norm": 1.6653152704238892, + "learning_rate": 6.663831991661002e-05, + "loss": 1.1331, + "step": 122770 + }, + { + "epoch": 0.7844064244917778, + "grad_norm": 0.8588860034942627, + "learning_rate": 6.663358810925269e-05, + "loss": 0.9881, + "step": 122780 + }, + { + "epoch": 0.7844703116415164, + "grad_norm": 0.5761985778808594, + "learning_rate": 6.662885613438074e-05, + "loss": 0.6635, + "step": 122790 + }, + { + "epoch": 0.7845341987912551, + "grad_norm": 0.762401282787323, + "learning_rate": 6.662412399204182e-05, + "loss": 0.9458, + "step": 122800 + }, + { + "epoch": 0.7845980859409938, + "grad_norm": 1.0655889511108398, + "learning_rate": 6.661939168228359e-05, + "loss": 0.9319, + "step": 122810 + }, + { + "epoch": 0.7846619730907325, + "grad_norm": 0.7457488775253296, + "learning_rate": 6.66146592051537e-05, + "loss": 0.6604, + "step": 122820 + }, + { + "epoch": 0.7847258602404712, + "grad_norm": 0.5781500935554504, + "learning_rate": 6.660992656069984e-05, + "loss": 0.9065, + "step": 122830 + }, + { + "epoch": 
0.7847897473902099, + "grad_norm": 1.0758085250854492, + "learning_rate": 6.660519374896964e-05, + "loss": 0.8102, + "step": 122840 + }, + { + "epoch": 0.7848536345399486, + "grad_norm": 3.073899745941162, + "learning_rate": 6.660046077001076e-05, + "loss": 0.961, + "step": 122850 + }, + { + "epoch": 0.7849175216896873, + "grad_norm": 1.818791389465332, + "learning_rate": 6.65957276238709e-05, + "loss": 0.7407, + "step": 122860 + }, + { + "epoch": 0.784981408839426, + "grad_norm": 1.0063798427581787, + "learning_rate": 6.65909943105977e-05, + "loss": 0.8685, + "step": 122870 + }, + { + "epoch": 0.7850452959891647, + "grad_norm": 0.8023037314414978, + "learning_rate": 6.658626083023883e-05, + "loss": 0.6783, + "step": 122880 + }, + { + "epoch": 0.7851091831389034, + "grad_norm": 0.8545927405357361, + "learning_rate": 6.658152718284197e-05, + "loss": 0.8662, + "step": 122890 + }, + { + "epoch": 0.7851730702886421, + "grad_norm": 0.9020628929138184, + "learning_rate": 6.657679336845478e-05, + "loss": 0.8088, + "step": 122900 + }, + { + "epoch": 0.7852369574383808, + "grad_norm": 1.1107025146484375, + "learning_rate": 6.657205938712492e-05, + "loss": 0.9401, + "step": 122910 + }, + { + "epoch": 0.7853008445881196, + "grad_norm": 0.7026088833808899, + "learning_rate": 6.656732523890012e-05, + "loss": 0.7771, + "step": 122920 + }, + { + "epoch": 0.7853647317378583, + "grad_norm": 1.3951656818389893, + "learning_rate": 6.656259092382801e-05, + "loss": 1.0259, + "step": 122930 + }, + { + "epoch": 0.785428618887597, + "grad_norm": 1.104836106300354, + "learning_rate": 6.655785644195627e-05, + "loss": 0.9918, + "step": 122940 + }, + { + "epoch": 0.7854925060373357, + "grad_norm": 0.6789342761039734, + "learning_rate": 6.655312179333259e-05, + "loss": 0.8964, + "step": 122950 + }, + { + "epoch": 0.7855563931870744, + "grad_norm": 0.7267434597015381, + "learning_rate": 6.654838697800467e-05, + "loss": 1.1021, + "step": 122960 + }, + { + "epoch": 0.7856202803368131, + 
"grad_norm": 0.9028590321540833, + "learning_rate": 6.654365199602016e-05, + "loss": 0.9037, + "step": 122970 + }, + { + "epoch": 0.7856841674865518, + "grad_norm": 1.3182995319366455, + "learning_rate": 6.653891684742677e-05, + "loss": 1.2179, + "step": 122980 + }, + { + "epoch": 0.7857480546362905, + "grad_norm": 0.6001270413398743, + "learning_rate": 6.653418153227218e-05, + "loss": 0.7268, + "step": 122990 + }, + { + "epoch": 0.7858119417860292, + "grad_norm": 0.8544936776161194, + "learning_rate": 6.652944605060409e-05, + "loss": 0.8634, + "step": 123000 + }, + { + "epoch": 0.7858758289357679, + "grad_norm": 0.7163207530975342, + "learning_rate": 6.652471040247016e-05, + "loss": 0.7325, + "step": 123010 + }, + { + "epoch": 0.7859397160855066, + "grad_norm": 1.0763355493545532, + "learning_rate": 6.65199745879181e-05, + "loss": 1.0254, + "step": 123020 + }, + { + "epoch": 0.7860036032352452, + "grad_norm": 0.8155850172042847, + "learning_rate": 6.651523860699562e-05, + "loss": 0.9767, + "step": 123030 + }, + { + "epoch": 0.7860674903849839, + "grad_norm": 0.845534086227417, + "learning_rate": 6.651050245975039e-05, + "loss": 0.9393, + "step": 123040 + }, + { + "epoch": 0.7861313775347226, + "grad_norm": 1.392137885093689, + "learning_rate": 6.650576614623012e-05, + "loss": 0.7875, + "step": 123050 + }, + { + "epoch": 0.7861952646844613, + "grad_norm": 0.9796945452690125, + "learning_rate": 6.65010296664825e-05, + "loss": 0.7574, + "step": 123060 + }, + { + "epoch": 0.7862591518342, + "grad_norm": 0.8178929686546326, + "learning_rate": 6.649629302055524e-05, + "loss": 0.746, + "step": 123070 + }, + { + "epoch": 0.7863230389839387, + "grad_norm": 0.6431455612182617, + "learning_rate": 6.649155620849605e-05, + "loss": 0.8506, + "step": 123080 + }, + { + "epoch": 0.7863869261336774, + "grad_norm": 0.8767764568328857, + "learning_rate": 6.648681923035261e-05, + "loss": 0.7546, + "step": 123090 + }, + { + "epoch": 0.7864508132834162, + "grad_norm": 
2.9324655532836914, + "learning_rate": 6.648208208617262e-05, + "loss": 0.7744, + "step": 123100 + }, + { + "epoch": 0.7865147004331549, + "grad_norm": 1.0613309144973755, + "learning_rate": 6.647734477600383e-05, + "loss": 0.7883, + "step": 123110 + }, + { + "epoch": 0.7865785875828936, + "grad_norm": 1.001646637916565, + "learning_rate": 6.647260729989391e-05, + "loss": 1.2337, + "step": 123120 + }, + { + "epoch": 0.7866424747326323, + "grad_norm": 1.7810345888137817, + "learning_rate": 6.646786965789057e-05, + "loss": 0.8567, + "step": 123130 + }, + { + "epoch": 0.786706361882371, + "grad_norm": 0.8368769288063049, + "learning_rate": 6.646313185004155e-05, + "loss": 0.935, + "step": 123140 + }, + { + "epoch": 0.7867702490321097, + "grad_norm": 0.6469975709915161, + "learning_rate": 6.645839387639456e-05, + "loss": 0.7664, + "step": 123150 + }, + { + "epoch": 0.7868341361818484, + "grad_norm": 0.7839746475219727, + "learning_rate": 6.645365573699729e-05, + "loss": 1.0381, + "step": 123160 + }, + { + "epoch": 0.7868980233315871, + "grad_norm": 1.2214899063110352, + "learning_rate": 6.644891743189749e-05, + "loss": 0.94, + "step": 123170 + }, + { + "epoch": 0.7869619104813258, + "grad_norm": 1.0104191303253174, + "learning_rate": 6.644417896114285e-05, + "loss": 0.9163, + "step": 123180 + }, + { + "epoch": 0.7870257976310645, + "grad_norm": 0.9507836699485779, + "learning_rate": 6.643944032478109e-05, + "loss": 0.9046, + "step": 123190 + }, + { + "epoch": 0.7870896847808032, + "grad_norm": 1.0643188953399658, + "learning_rate": 6.643470152285995e-05, + "loss": 0.9449, + "step": 123200 + }, + { + "epoch": 0.7871535719305419, + "grad_norm": 0.8711426258087158, + "learning_rate": 6.642996255542717e-05, + "loss": 0.7908, + "step": 123210 + }, + { + "epoch": 0.7872174590802806, + "grad_norm": 1.0922789573669434, + "learning_rate": 6.642522342253042e-05, + "loss": 0.8448, + "step": 123220 + }, + { + "epoch": 0.7872813462300193, + "grad_norm": 0.9862490892410278, + 
"learning_rate": 6.642048412421749e-05, + "loss": 0.758, + "step": 123230 + }, + { + "epoch": 0.787345233379758, + "grad_norm": 0.6439590454101562, + "learning_rate": 6.641574466053607e-05, + "loss": 0.6805, + "step": 123240 + }, + { + "epoch": 0.7874091205294967, + "grad_norm": 0.9654756784439087, + "learning_rate": 6.641100503153388e-05, + "loss": 0.7238, + "step": 123250 + }, + { + "epoch": 0.7874730076792354, + "grad_norm": 0.7384721636772156, + "learning_rate": 6.64062652372587e-05, + "loss": 0.9328, + "step": 123260 + }, + { + "epoch": 0.7875368948289742, + "grad_norm": 1.083748698234558, + "learning_rate": 6.640152527775821e-05, + "loss": 0.9526, + "step": 123270 + }, + { + "epoch": 0.7876007819787127, + "grad_norm": 1.286015510559082, + "learning_rate": 6.63967851530802e-05, + "loss": 0.9245, + "step": 123280 + }, + { + "epoch": 0.7876646691284515, + "grad_norm": 0.5310730338096619, + "learning_rate": 6.639204486327236e-05, + "loss": 0.8381, + "step": 123290 + }, + { + "epoch": 0.7877285562781902, + "grad_norm": 1.1618332862854004, + "learning_rate": 6.638730440838244e-05, + "loss": 1.1095, + "step": 123300 + }, + { + "epoch": 0.7877924434279289, + "grad_norm": 0.8446438908576965, + "learning_rate": 6.63825637884582e-05, + "loss": 0.9877, + "step": 123310 + }, + { + "epoch": 0.7878563305776676, + "grad_norm": 0.6772144436836243, + "learning_rate": 6.637782300354737e-05, + "loss": 0.6984, + "step": 123320 + }, + { + "epoch": 0.7879202177274063, + "grad_norm": 0.6928181052207947, + "learning_rate": 6.63730820536977e-05, + "loss": 1.007, + "step": 123330 + }, + { + "epoch": 0.787984104877145, + "grad_norm": 1.1651209592819214, + "learning_rate": 6.63683409389569e-05, + "loss": 0.889, + "step": 123340 + }, + { + "epoch": 0.7880479920268837, + "grad_norm": 1.0764139890670776, + "learning_rate": 6.636359965937278e-05, + "loss": 0.8781, + "step": 123350 + }, + { + "epoch": 0.7881118791766224, + "grad_norm": 0.5415597558021545, + "learning_rate": 
6.635885821499304e-05, + "loss": 0.9395, + "step": 123360 + }, + { + "epoch": 0.7881757663263611, + "grad_norm": 1.080687165260315, + "learning_rate": 6.635411660586543e-05, + "loss": 0.9444, + "step": 123370 + }, + { + "epoch": 0.7882396534760998, + "grad_norm": 0.8479616045951843, + "learning_rate": 6.634937483203773e-05, + "loss": 0.914, + "step": 123380 + }, + { + "epoch": 0.7883035406258385, + "grad_norm": 0.7916926145553589, + "learning_rate": 6.634463289355768e-05, + "loss": 1.037, + "step": 123390 + }, + { + "epoch": 0.7883674277755772, + "grad_norm": 0.9710598587989807, + "learning_rate": 6.633989079047306e-05, + "loss": 0.8785, + "step": 123400 + }, + { + "epoch": 0.7884313149253159, + "grad_norm": 1.142552137374878, + "learning_rate": 6.633514852283159e-05, + "loss": 0.8654, + "step": 123410 + }, + { + "epoch": 0.7884952020750546, + "grad_norm": 0.7896512746810913, + "learning_rate": 6.633040609068103e-05, + "loss": 0.755, + "step": 123420 + }, + { + "epoch": 0.7885590892247933, + "grad_norm": 1.4209386110305786, + "learning_rate": 6.632566349406916e-05, + "loss": 0.797, + "step": 123430 + }, + { + "epoch": 0.788622976374532, + "grad_norm": 0.843025267124176, + "learning_rate": 6.632092073304374e-05, + "loss": 1.0926, + "step": 123440 + }, + { + "epoch": 0.7886868635242708, + "grad_norm": 0.7612596750259399, + "learning_rate": 6.631617780765252e-05, + "loss": 0.979, + "step": 123450 + }, + { + "epoch": 0.7887507506740095, + "grad_norm": 1.0904258489608765, + "learning_rate": 6.631143471794328e-05, + "loss": 0.9336, + "step": 123460 + }, + { + "epoch": 0.7888146378237482, + "grad_norm": 1.2260910272598267, + "learning_rate": 6.630669146396376e-05, + "loss": 1.0369, + "step": 123470 + }, + { + "epoch": 0.7888785249734869, + "grad_norm": 1.343691349029541, + "learning_rate": 6.630194804576177e-05, + "loss": 0.7612, + "step": 123480 + }, + { + "epoch": 0.7889424121232256, + "grad_norm": 0.8414347171783447, + "learning_rate": 6.629720446338506e-05, + "loss": 
0.788, + "step": 123490 + }, + { + "epoch": 0.7890062992729643, + "grad_norm": 0.814517080783844, + "learning_rate": 6.62924607168814e-05, + "loss": 0.931, + "step": 123500 + }, + { + "epoch": 0.789070186422703, + "grad_norm": 1.7236671447753906, + "learning_rate": 6.628771680629856e-05, + "loss": 0.8326, + "step": 123510 + }, + { + "epoch": 0.7891340735724416, + "grad_norm": 0.738429605960846, + "learning_rate": 6.628297273168433e-05, + "loss": 0.8309, + "step": 123520 + }, + { + "epoch": 0.7891979607221803, + "grad_norm": 0.8377928137779236, + "learning_rate": 6.627822849308648e-05, + "loss": 0.8578, + "step": 123530 + }, + { + "epoch": 0.789261847871919, + "grad_norm": 3.6376450061798096, + "learning_rate": 6.627348409055278e-05, + "loss": 0.8637, + "step": 123540 + }, + { + "epoch": 0.7893257350216577, + "grad_norm": 1.038316011428833, + "learning_rate": 6.626873952413102e-05, + "loss": 1.0122, + "step": 123550 + }, + { + "epoch": 0.7893896221713964, + "grad_norm": 0.8108795881271362, + "learning_rate": 6.626399479386898e-05, + "loss": 1.0062, + "step": 123560 + }, + { + "epoch": 0.7894535093211351, + "grad_norm": 0.9439957737922668, + "learning_rate": 6.625924989981444e-05, + "loss": 0.8456, + "step": 123570 + }, + { + "epoch": 0.7895173964708738, + "grad_norm": 0.6934586763381958, + "learning_rate": 6.625450484201519e-05, + "loss": 0.7537, + "step": 123580 + }, + { + "epoch": 0.7895812836206125, + "grad_norm": 0.6399053335189819, + "learning_rate": 6.6249759620519e-05, + "loss": 0.6623, + "step": 123590 + }, + { + "epoch": 0.7896451707703512, + "grad_norm": 0.7249269485473633, + "learning_rate": 6.624501423537368e-05, + "loss": 0.8404, + "step": 123600 + }, + { + "epoch": 0.7897090579200899, + "grad_norm": 0.655685305595398, + "learning_rate": 6.624026868662701e-05, + "loss": 0.8336, + "step": 123610 + }, + { + "epoch": 0.7897729450698286, + "grad_norm": 0.9134299159049988, + "learning_rate": 6.623552297432679e-05, + "loss": 0.6717, + "step": 123620 + }, + { 
+ "epoch": 0.7898368322195674, + "grad_norm": 0.7788447737693787, + "learning_rate": 6.623077709852081e-05, + "loss": 0.8576, + "step": 123630 + }, + { + "epoch": 0.7899007193693061, + "grad_norm": 0.6765179634094238, + "learning_rate": 6.622603105925686e-05, + "loss": 1.0578, + "step": 123640 + }, + { + "epoch": 0.7899646065190448, + "grad_norm": 0.7470600605010986, + "learning_rate": 6.622128485658273e-05, + "loss": 0.7213, + "step": 123650 + }, + { + "epoch": 0.7900284936687835, + "grad_norm": 0.9108386635780334, + "learning_rate": 6.621653849054623e-05, + "loss": 1.0726, + "step": 123660 + }, + { + "epoch": 0.7900923808185222, + "grad_norm": 1.1410408020019531, + "learning_rate": 6.621179196119518e-05, + "loss": 0.8654, + "step": 123670 + }, + { + "epoch": 0.7901562679682609, + "grad_norm": 1.901923656463623, + "learning_rate": 6.620704526857734e-05, + "loss": 1.0014, + "step": 123680 + }, + { + "epoch": 0.7902201551179996, + "grad_norm": 1.0020592212677002, + "learning_rate": 6.620229841274054e-05, + "loss": 0.7709, + "step": 123690 + }, + { + "epoch": 0.7902840422677383, + "grad_norm": 1.4192286729812622, + "learning_rate": 6.619755139373257e-05, + "loss": 0.8418, + "step": 123700 + }, + { + "epoch": 0.790347929417477, + "grad_norm": 0.7686444520950317, + "learning_rate": 6.619280421160125e-05, + "loss": 0.8845, + "step": 123710 + }, + { + "epoch": 0.7904118165672157, + "grad_norm": 1.1990658044815063, + "learning_rate": 6.618805686639439e-05, + "loss": 0.9405, + "step": 123720 + }, + { + "epoch": 0.7904757037169544, + "grad_norm": 1.1820399761199951, + "learning_rate": 6.618330935815979e-05, + "loss": 0.9327, + "step": 123730 + }, + { + "epoch": 0.7905395908666931, + "grad_norm": 1.1175780296325684, + "learning_rate": 6.617856168694526e-05, + "loss": 0.756, + "step": 123740 + }, + { + "epoch": 0.7906034780164318, + "grad_norm": 1.217108130455017, + "learning_rate": 6.617381385279862e-05, + "loss": 0.9991, + "step": 123750 + }, + { + "epoch": 
0.7906673651661704, + "grad_norm": 0.42774638533592224, + "learning_rate": 6.616906585576768e-05, + "loss": 0.7166, + "step": 123760 + }, + { + "epoch": 0.7907312523159091, + "grad_norm": 0.831774890422821, + "learning_rate": 6.616431769590027e-05, + "loss": 0.9865, + "step": 123770 + }, + { + "epoch": 0.7907951394656478, + "grad_norm": 0.9391918778419495, + "learning_rate": 6.615956937324418e-05, + "loss": 0.8336, + "step": 123780 + }, + { + "epoch": 0.7908590266153865, + "grad_norm": 1.3029203414916992, + "learning_rate": 6.615482088784726e-05, + "loss": 1.0872, + "step": 123790 + }, + { + "epoch": 0.7909229137651252, + "grad_norm": 0.6850435733795166, + "learning_rate": 6.615007223975732e-05, + "loss": 0.6906, + "step": 123800 + }, + { + "epoch": 0.790986800914864, + "grad_norm": 1.2866231203079224, + "learning_rate": 6.614532342902216e-05, + "loss": 0.9472, + "step": 123810 + }, + { + "epoch": 0.7910506880646027, + "grad_norm": 2.2075138092041016, + "learning_rate": 6.614057445568961e-05, + "loss": 0.9311, + "step": 123820 + }, + { + "epoch": 0.7911145752143414, + "grad_norm": 0.7719504833221436, + "learning_rate": 6.613582531980755e-05, + "loss": 0.8272, + "step": 123830 + }, + { + "epoch": 0.7911784623640801, + "grad_norm": 0.922818660736084, + "learning_rate": 6.613107602142376e-05, + "loss": 0.9421, + "step": 123840 + }, + { + "epoch": 0.7912423495138188, + "grad_norm": 0.7339285612106323, + "learning_rate": 6.612632656058608e-05, + "loss": 0.8638, + "step": 123850 + }, + { + "epoch": 0.7913062366635575, + "grad_norm": 0.9554028511047363, + "learning_rate": 6.612157693734233e-05, + "loss": 0.7373, + "step": 123860 + }, + { + "epoch": 0.7913701238132962, + "grad_norm": 1.1785390377044678, + "learning_rate": 6.611682715174036e-05, + "loss": 0.6176, + "step": 123870 + }, + { + "epoch": 0.7914340109630349, + "grad_norm": 0.7205845713615417, + "learning_rate": 6.6112077203828e-05, + "loss": 0.844, + "step": 123880 + }, + { + "epoch": 0.7914978981127736, + 
"grad_norm": 0.773068368434906, + "learning_rate": 6.610732709365308e-05, + "loss": 0.7221, + "step": 123890 + }, + { + "epoch": 0.7915617852625123, + "grad_norm": 0.8639894127845764, + "learning_rate": 6.610257682126344e-05, + "loss": 0.8949, + "step": 123900 + }, + { + "epoch": 0.791625672412251, + "grad_norm": 0.8907663822174072, + "learning_rate": 6.609782638670692e-05, + "loss": 1.0061, + "step": 123910 + }, + { + "epoch": 0.7916895595619897, + "grad_norm": 0.8259413242340088, + "learning_rate": 6.609307579003136e-05, + "loss": 1.1113, + "step": 123920 + }, + { + "epoch": 0.7917534467117284, + "grad_norm": 0.9798697233200073, + "learning_rate": 6.608832503128461e-05, + "loss": 0.607, + "step": 123930 + }, + { + "epoch": 0.7918173338614671, + "grad_norm": 0.7070125937461853, + "learning_rate": 6.608357411051451e-05, + "loss": 0.7719, + "step": 123940 + }, + { + "epoch": 0.7918812210112058, + "grad_norm": 1.0658955574035645, + "learning_rate": 6.607882302776892e-05, + "loss": 0.9488, + "step": 123950 + }, + { + "epoch": 0.7919451081609445, + "grad_norm": 0.6668451428413391, + "learning_rate": 6.607407178309564e-05, + "loss": 0.8527, + "step": 123960 + }, + { + "epoch": 0.7920089953106833, + "grad_norm": 0.6440159678459167, + "learning_rate": 6.606932037654256e-05, + "loss": 0.9539, + "step": 123970 + }, + { + "epoch": 0.792072882460422, + "grad_norm": 0.7459390759468079, + "learning_rate": 6.606456880815754e-05, + "loss": 0.8549, + "step": 123980 + }, + { + "epoch": 0.7921367696101607, + "grad_norm": 0.9598776698112488, + "learning_rate": 6.60598170779884e-05, + "loss": 1.0577, + "step": 123990 + }, + { + "epoch": 0.7922006567598994, + "grad_norm": 1.091723084449768, + "learning_rate": 6.6055065186083e-05, + "loss": 0.8536, + "step": 124000 + }, + { + "epoch": 0.792264543909638, + "grad_norm": 1.7928060293197632, + "learning_rate": 6.605031313248922e-05, + "loss": 0.8274, + "step": 124010 + }, + { + "epoch": 0.7923284310593767, + "grad_norm": 1.0446691513061523, 
+ "learning_rate": 6.604556091725489e-05, + "loss": 1.192, + "step": 124020 + }, + { + "epoch": 0.7923923182091154, + "grad_norm": 0.7123937010765076, + "learning_rate": 6.604080854042789e-05, + "loss": 0.9224, + "step": 124030 + }, + { + "epoch": 0.7924562053588541, + "grad_norm": 0.9317911863327026, + "learning_rate": 6.603605600205606e-05, + "loss": 0.8837, + "step": 124040 + }, + { + "epoch": 0.7925200925085928, + "grad_norm": 0.8845491409301758, + "learning_rate": 6.603130330218727e-05, + "loss": 0.9611, + "step": 124050 + }, + { + "epoch": 0.7925839796583315, + "grad_norm": 1.66157865524292, + "learning_rate": 6.60265504408694e-05, + "loss": 0.6554, + "step": 124060 + }, + { + "epoch": 0.7926478668080702, + "grad_norm": 0.9395459294319153, + "learning_rate": 6.60217974181503e-05, + "loss": 0.8756, + "step": 124070 + }, + { + "epoch": 0.7927117539578089, + "grad_norm": 1.385488748550415, + "learning_rate": 6.601704423407784e-05, + "loss": 0.9138, + "step": 124080 + }, + { + "epoch": 0.7927756411075476, + "grad_norm": 0.8289713263511658, + "learning_rate": 6.601229088869988e-05, + "loss": 0.8807, + "step": 124090 + }, + { + "epoch": 0.7928395282572863, + "grad_norm": 0.7335384488105774, + "learning_rate": 6.60075373820643e-05, + "loss": 0.8432, + "step": 124100 + }, + { + "epoch": 0.792903415407025, + "grad_norm": 0.8279372453689575, + "learning_rate": 6.600278371421898e-05, + "loss": 0.9776, + "step": 124110 + }, + { + "epoch": 0.7929673025567637, + "grad_norm": 0.7263229489326477, + "learning_rate": 6.599802988521178e-05, + "loss": 0.7568, + "step": 124120 + }, + { + "epoch": 0.7930311897065024, + "grad_norm": 0.8773966431617737, + "learning_rate": 6.599327589509056e-05, + "loss": 0.7536, + "step": 124130 + }, + { + "epoch": 0.7930950768562411, + "grad_norm": 0.7500774264335632, + "learning_rate": 6.598852174390324e-05, + "loss": 0.8422, + "step": 124140 + }, + { + "epoch": 0.7931589640059798, + "grad_norm": 0.6933243274688721, + "learning_rate": 
6.598376743169767e-05, + "loss": 0.8359, + "step": 124150 + }, + { + "epoch": 0.7932228511557186, + "grad_norm": 0.848579466342926, + "learning_rate": 6.597901295852172e-05, + "loss": 0.8249, + "step": 124160 + }, + { + "epoch": 0.7932867383054573, + "grad_norm": 1.0843689441680908, + "learning_rate": 6.59742583244233e-05, + "loss": 0.8095, + "step": 124170 + }, + { + "epoch": 0.793350625455196, + "grad_norm": 0.6765615940093994, + "learning_rate": 6.596950352945026e-05, + "loss": 0.8884, + "step": 124180 + }, + { + "epoch": 0.7934145126049347, + "grad_norm": 0.7143245935440063, + "learning_rate": 6.596474857365052e-05, + "loss": 0.8451, + "step": 124190 + }, + { + "epoch": 0.7934783997546734, + "grad_norm": 0.9047801494598389, + "learning_rate": 6.595999345707195e-05, + "loss": 1.2198, + "step": 124200 + }, + { + "epoch": 0.7935422869044121, + "grad_norm": 0.6656885147094727, + "learning_rate": 6.595523817976243e-05, + "loss": 0.8917, + "step": 124210 + }, + { + "epoch": 0.7936061740541508, + "grad_norm": 0.7534079551696777, + "learning_rate": 6.595048274176986e-05, + "loss": 1.0604, + "step": 124220 + }, + { + "epoch": 0.7936700612038895, + "grad_norm": 1.1482276916503906, + "learning_rate": 6.594572714314213e-05, + "loss": 0.8904, + "step": 124230 + }, + { + "epoch": 0.7937339483536282, + "grad_norm": 0.8140796422958374, + "learning_rate": 6.594097138392715e-05, + "loss": 0.7813, + "step": 124240 + }, + { + "epoch": 0.7937978355033668, + "grad_norm": 1.066030502319336, + "learning_rate": 6.593621546417279e-05, + "loss": 0.6392, + "step": 124250 + }, + { + "epoch": 0.7938617226531055, + "grad_norm": 0.5749644041061401, + "learning_rate": 6.593145938392694e-05, + "loss": 1.107, + "step": 124260 + }, + { + "epoch": 0.7939256098028442, + "grad_norm": 0.8076421022415161, + "learning_rate": 6.592670314323753e-05, + "loss": 0.7277, + "step": 124270 + }, + { + "epoch": 0.7939894969525829, + "grad_norm": 0.9243035316467285, + "learning_rate": 6.592194674215242e-05, + 
"loss": 0.8893, + "step": 124280 + }, + { + "epoch": 0.7940533841023216, + "grad_norm": 0.696916401386261, + "learning_rate": 6.591719018071955e-05, + "loss": 0.848, + "step": 124290 + }, + { + "epoch": 0.7941172712520603, + "grad_norm": 1.0850187540054321, + "learning_rate": 6.591243345898679e-05, + "loss": 0.9137, + "step": 124300 + }, + { + "epoch": 0.794181158401799, + "grad_norm": 0.7324548363685608, + "learning_rate": 6.590767657700207e-05, + "loss": 0.9306, + "step": 124310 + }, + { + "epoch": 0.7942450455515377, + "grad_norm": 1.1049119234085083, + "learning_rate": 6.590291953481326e-05, + "loss": 0.9547, + "step": 124320 + }, + { + "epoch": 0.7943089327012764, + "grad_norm": 0.8178719878196716, + "learning_rate": 6.589816233246832e-05, + "loss": 0.9391, + "step": 124330 + }, + { + "epoch": 0.7943728198510152, + "grad_norm": 0.793376088142395, + "learning_rate": 6.589340497001511e-05, + "loss": 0.9583, + "step": 124340 + }, + { + "epoch": 0.7944367070007539, + "grad_norm": 0.9736217856407166, + "learning_rate": 6.588864744750158e-05, + "loss": 0.624, + "step": 124350 + }, + { + "epoch": 0.7945005941504926, + "grad_norm": 1.2875404357910156, + "learning_rate": 6.588388976497563e-05, + "loss": 0.866, + "step": 124360 + }, + { + "epoch": 0.7945644813002313, + "grad_norm": 1.0183568000793457, + "learning_rate": 6.587913192248515e-05, + "loss": 0.8561, + "step": 124370 + }, + { + "epoch": 0.79462836844997, + "grad_norm": 0.6129552125930786, + "learning_rate": 6.587437392007809e-05, + "loss": 0.8037, + "step": 124380 + }, + { + "epoch": 0.7946922555997087, + "grad_norm": 1.3483234643936157, + "learning_rate": 6.586961575780233e-05, + "loss": 0.7024, + "step": 124390 + }, + { + "epoch": 0.7947561427494474, + "grad_norm": 0.7584971189498901, + "learning_rate": 6.586485743570583e-05, + "loss": 0.9416, + "step": 124400 + }, + { + "epoch": 0.7948200298991861, + "grad_norm": 0.6718287467956543, + "learning_rate": 6.58600989538365e-05, + "loss": 0.9615, + "step": 124410 
+ }, + { + "epoch": 0.7948839170489248, + "grad_norm": 1.4930849075317383, + "learning_rate": 6.585534031224223e-05, + "loss": 1.3212, + "step": 124420 + }, + { + "epoch": 0.7949478041986635, + "grad_norm": 0.9690805673599243, + "learning_rate": 6.585058151097097e-05, + "loss": 0.9029, + "step": 124430 + }, + { + "epoch": 0.7950116913484022, + "grad_norm": 0.528648316860199, + "learning_rate": 6.584582255007065e-05, + "loss": 0.6965, + "step": 124440 + }, + { + "epoch": 0.7950755784981409, + "grad_norm": 0.8390137553215027, + "learning_rate": 6.584106342958917e-05, + "loss": 0.6836, + "step": 124450 + }, + { + "epoch": 0.7951394656478796, + "grad_norm": 0.4811376929283142, + "learning_rate": 6.583630414957449e-05, + "loss": 1.126, + "step": 124460 + }, + { + "epoch": 0.7952033527976183, + "grad_norm": 0.8852686882019043, + "learning_rate": 6.583154471007453e-05, + "loss": 0.8745, + "step": 124470 + }, + { + "epoch": 0.795267239947357, + "grad_norm": 1.2516735792160034, + "learning_rate": 6.582678511113722e-05, + "loss": 1.1301, + "step": 124480 + }, + { + "epoch": 0.7953311270970956, + "grad_norm": 0.831717848777771, + "learning_rate": 6.58220253528105e-05, + "loss": 0.8727, + "step": 124490 + }, + { + "epoch": 0.7953950142468343, + "grad_norm": 0.7440255284309387, + "learning_rate": 6.581726543514227e-05, + "loss": 0.8061, + "step": 124500 + }, + { + "epoch": 0.795458901396573, + "grad_norm": 1.1145399808883667, + "learning_rate": 6.581250535818051e-05, + "loss": 1.0605, + "step": 124510 + }, + { + "epoch": 0.7955227885463118, + "grad_norm": 0.678453266620636, + "learning_rate": 6.580774512197314e-05, + "loss": 0.9662, + "step": 124520 + }, + { + "epoch": 0.7955866756960505, + "grad_norm": 1.0213743448257446, + "learning_rate": 6.58029847265681e-05, + "loss": 0.915, + "step": 124530 + }, + { + "epoch": 0.7956505628457892, + "grad_norm": 0.8792978525161743, + "learning_rate": 6.579822417201333e-05, + "loss": 0.8723, + "step": 124540 + }, + { + "epoch": 
0.7957144499955279, + "grad_norm": 0.9746803045272827, + "learning_rate": 6.579346345835677e-05, + "loss": 0.8153, + "step": 124550 + }, + { + "epoch": 0.7957783371452666, + "grad_norm": 0.7419812083244324, + "learning_rate": 6.578870258564637e-05, + "loss": 0.7329, + "step": 124560 + }, + { + "epoch": 0.7958422242950053, + "grad_norm": 0.9181807041168213, + "learning_rate": 6.57839415539301e-05, + "loss": 0.8517, + "step": 124570 + }, + { + "epoch": 0.795906111444744, + "grad_norm": 0.7871003746986389, + "learning_rate": 6.577918036325586e-05, + "loss": 0.7752, + "step": 124580 + }, + { + "epoch": 0.7959699985944827, + "grad_norm": 0.8576268553733826, + "learning_rate": 6.577441901367163e-05, + "loss": 0.94, + "step": 124590 + }, + { + "epoch": 0.7960338857442214, + "grad_norm": 1.1811336278915405, + "learning_rate": 6.576965750522534e-05, + "loss": 0.9644, + "step": 124600 + }, + { + "epoch": 0.7960977728939601, + "grad_norm": 1.184383511543274, + "learning_rate": 6.576489583796498e-05, + "loss": 1.1323, + "step": 124610 + }, + { + "epoch": 0.7961616600436988, + "grad_norm": 0.9622499346733093, + "learning_rate": 6.576013401193846e-05, + "loss": 0.8139, + "step": 124620 + }, + { + "epoch": 0.7962255471934375, + "grad_norm": 1.1531530618667603, + "learning_rate": 6.575537202719377e-05, + "loss": 0.7081, + "step": 124630 + }, + { + "epoch": 0.7962894343431762, + "grad_norm": 1.1562443971633911, + "learning_rate": 6.575060988377885e-05, + "loss": 0.9157, + "step": 124640 + }, + { + "epoch": 0.7963533214929149, + "grad_norm": 0.8580940365791321, + "learning_rate": 6.574584758174166e-05, + "loss": 1.0154, + "step": 124650 + }, + { + "epoch": 0.7964172086426536, + "grad_norm": 0.7232387065887451, + "learning_rate": 6.574108512113016e-05, + "loss": 0.8085, + "step": 124660 + }, + { + "epoch": 0.7964810957923923, + "grad_norm": 0.9032987952232361, + "learning_rate": 6.573632250199234e-05, + "loss": 1.0046, + "step": 124670 + }, + { + "epoch": 0.796544982942131, + 
"grad_norm": 0.9355868697166443, + "learning_rate": 6.57315597243761e-05, + "loss": 0.7971, + "step": 124680 + }, + { + "epoch": 0.7966088700918698, + "grad_norm": 0.8538186550140381, + "learning_rate": 6.572679678832946e-05, + "loss": 0.9047, + "step": 124690 + }, + { + "epoch": 0.7966727572416085, + "grad_norm": 0.8761003017425537, + "learning_rate": 6.572203369390038e-05, + "loss": 0.7487, + "step": 124700 + }, + { + "epoch": 0.7967366443913472, + "grad_norm": 0.643221914768219, + "learning_rate": 6.571727044113679e-05, + "loss": 0.8214, + "step": 124710 + }, + { + "epoch": 0.7968005315410859, + "grad_norm": 0.8245800137519836, + "learning_rate": 6.571250703008671e-05, + "loss": 0.996, + "step": 124720 + }, + { + "epoch": 0.7968644186908246, + "grad_norm": 0.692182719707489, + "learning_rate": 6.57077434607981e-05, + "loss": 0.8932, + "step": 124730 + }, + { + "epoch": 0.7969283058405632, + "grad_norm": 0.6998267769813538, + "learning_rate": 6.570297973331892e-05, + "loss": 0.8643, + "step": 124740 + }, + { + "epoch": 0.7969921929903019, + "grad_norm": 0.6680889129638672, + "learning_rate": 6.569821584769714e-05, + "loss": 0.8156, + "step": 124750 + }, + { + "epoch": 0.7970560801400406, + "grad_norm": 0.7822675704956055, + "learning_rate": 6.569345180398075e-05, + "loss": 0.8655, + "step": 124760 + }, + { + "epoch": 0.7971199672897793, + "grad_norm": 0.9974295496940613, + "learning_rate": 6.568868760221773e-05, + "loss": 0.6725, + "step": 124770 + }, + { + "epoch": 0.797183854439518, + "grad_norm": 0.6198078989982605, + "learning_rate": 6.568392324245605e-05, + "loss": 0.6848, + "step": 124780 + }, + { + "epoch": 0.7972477415892567, + "grad_norm": 1.088592767715454, + "learning_rate": 6.567915872474368e-05, + "loss": 1.1632, + "step": 124790 + }, + { + "epoch": 0.7973116287389954, + "grad_norm": 0.636913537979126, + "learning_rate": 6.567439404912864e-05, + "loss": 0.8826, + "step": 124800 + }, + { + "epoch": 0.7973755158887341, + "grad_norm": 
0.7936016321182251, + "learning_rate": 6.566962921565886e-05, + "loss": 0.737, + "step": 124810 + }, + { + "epoch": 0.7974394030384728, + "grad_norm": 0.8261633515357971, + "learning_rate": 6.566486422438238e-05, + "loss": 0.8341, + "step": 124820 + }, + { + "epoch": 0.7975032901882115, + "grad_norm": 0.8050313591957092, + "learning_rate": 6.566009907534717e-05, + "loss": 0.7059, + "step": 124830 + }, + { + "epoch": 0.7975671773379502, + "grad_norm": 1.183261513710022, + "learning_rate": 6.565533376860121e-05, + "loss": 0.9832, + "step": 124840 + }, + { + "epoch": 0.797631064487689, + "grad_norm": 0.7606709003448486, + "learning_rate": 6.565056830419249e-05, + "loss": 0.9096, + "step": 124850 + }, + { + "epoch": 0.7976949516374277, + "grad_norm": 1.1363462209701538, + "learning_rate": 6.564580268216901e-05, + "loss": 0.6826, + "step": 124860 + }, + { + "epoch": 0.7977588387871664, + "grad_norm": 1.563015103340149, + "learning_rate": 6.564103690257875e-05, + "loss": 1.0033, + "step": 124870 + }, + { + "epoch": 0.7978227259369051, + "grad_norm": 1.6602332592010498, + "learning_rate": 6.563627096546973e-05, + "loss": 0.7878, + "step": 124880 + }, + { + "epoch": 0.7978866130866438, + "grad_norm": 1.8670592308044434, + "learning_rate": 6.563150487088994e-05, + "loss": 1.1101, + "step": 124890 + }, + { + "epoch": 0.7979505002363825, + "grad_norm": 0.8670040369033813, + "learning_rate": 6.562673861888735e-05, + "loss": 1.1995, + "step": 124900 + }, + { + "epoch": 0.7980143873861212, + "grad_norm": 0.7766642570495605, + "learning_rate": 6.562197220951e-05, + "loss": 0.8249, + "step": 124910 + }, + { + "epoch": 0.7980782745358599, + "grad_norm": 1.183617115020752, + "learning_rate": 6.561720564280588e-05, + "loss": 0.8269, + "step": 124920 + }, + { + "epoch": 0.7981421616855986, + "grad_norm": 0.9451962113380432, + "learning_rate": 6.561243891882298e-05, + "loss": 0.8455, + "step": 124930 + }, + { + "epoch": 0.7982060488353373, + "grad_norm": 1.272316813468933, + 
"learning_rate": 6.560767203760932e-05, + "loss": 0.9672, + "step": 124940 + }, + { + "epoch": 0.798269935985076, + "grad_norm": 0.775259256362915, + "learning_rate": 6.560290499921288e-05, + "loss": 0.7095, + "step": 124950 + }, + { + "epoch": 0.7983338231348147, + "grad_norm": 0.8193401098251343, + "learning_rate": 6.559813780368172e-05, + "loss": 0.8389, + "step": 124960 + }, + { + "epoch": 0.7983977102845534, + "grad_norm": 0.6283045411109924, + "learning_rate": 6.55933704510638e-05, + "loss": 0.969, + "step": 124970 + }, + { + "epoch": 0.798461597434292, + "grad_norm": 0.7653422951698303, + "learning_rate": 6.558860294140715e-05, + "loss": 0.9878, + "step": 124980 + }, + { + "epoch": 0.7985254845840307, + "grad_norm": 0.6775907874107361, + "learning_rate": 6.558383527475978e-05, + "loss": 0.8479, + "step": 124990 + }, + { + "epoch": 0.7985893717337694, + "grad_norm": 1.2088565826416016, + "learning_rate": 6.557906745116972e-05, + "loss": 1.0976, + "step": 125000 + }, + { + "epoch": 0.7986532588835081, + "grad_norm": 1.290498971939087, + "learning_rate": 6.557429947068496e-05, + "loss": 1.044, + "step": 125010 + }, + { + "epoch": 0.7987171460332468, + "grad_norm": 1.1517268419265747, + "learning_rate": 6.556953133335353e-05, + "loss": 1.1142, + "step": 125020 + }, + { + "epoch": 0.7987810331829855, + "grad_norm": 0.6305286884307861, + "learning_rate": 6.556476303922344e-05, + "loss": 0.9341, + "step": 125030 + }, + { + "epoch": 0.7988449203327243, + "grad_norm": 1.3918240070343018, + "learning_rate": 6.555999458834273e-05, + "loss": 0.7964, + "step": 125040 + }, + { + "epoch": 0.798908807482463, + "grad_norm": 1.885688304901123, + "learning_rate": 6.555522598075943e-05, + "loss": 0.9877, + "step": 125050 + }, + { + "epoch": 0.7989726946322017, + "grad_norm": 0.8548856973648071, + "learning_rate": 6.555045721652153e-05, + "loss": 0.8515, + "step": 125060 + }, + { + "epoch": 0.7990365817819404, + "grad_norm": 1.1888582706451416, + "learning_rate": 
6.554568829567708e-05, + "loss": 0.8533, + "step": 125070 + }, + { + "epoch": 0.7991004689316791, + "grad_norm": 0.8727964162826538, + "learning_rate": 6.554091921827409e-05, + "loss": 0.6094, + "step": 125080 + }, + { + "epoch": 0.7991643560814178, + "grad_norm": 3.3259003162384033, + "learning_rate": 6.55361499843606e-05, + "loss": 1.0581, + "step": 125090 + }, + { + "epoch": 0.7992282432311565, + "grad_norm": 0.6152466535568237, + "learning_rate": 6.553138059398465e-05, + "loss": 0.6899, + "step": 125100 + }, + { + "epoch": 0.7992921303808952, + "grad_norm": 1.0475343465805054, + "learning_rate": 6.552661104719426e-05, + "loss": 0.9867, + "step": 125110 + }, + { + "epoch": 0.7993560175306339, + "grad_norm": 0.5357400178909302, + "learning_rate": 6.552184134403745e-05, + "loss": 0.8811, + "step": 125120 + }, + { + "epoch": 0.7994199046803726, + "grad_norm": 0.9087369441986084, + "learning_rate": 6.551707148456229e-05, + "loss": 0.8129, + "step": 125130 + }, + { + "epoch": 0.7994837918301113, + "grad_norm": 0.9281877279281616, + "learning_rate": 6.551230146881678e-05, + "loss": 0.8924, + "step": 125140 + }, + { + "epoch": 0.79954767897985, + "grad_norm": 1.1757920980453491, + "learning_rate": 6.550753129684897e-05, + "loss": 1.0507, + "step": 125150 + }, + { + "epoch": 0.7996115661295887, + "grad_norm": 1.0355859994888306, + "learning_rate": 6.550276096870692e-05, + "loss": 0.7105, + "step": 125160 + }, + { + "epoch": 0.7996754532793274, + "grad_norm": 1.1129631996154785, + "learning_rate": 6.549799048443865e-05, + "loss": 0.9878, + "step": 125170 + }, + { + "epoch": 0.7997393404290661, + "grad_norm": 0.9657943248748779, + "learning_rate": 6.549321984409221e-05, + "loss": 0.8496, + "step": 125180 + }, + { + "epoch": 0.7998032275788048, + "grad_norm": 0.8863055109977722, + "learning_rate": 6.548844904771564e-05, + "loss": 0.8804, + "step": 125190 + }, + { + "epoch": 0.7998671147285435, + "grad_norm": 0.6890332698822021, + "learning_rate": 6.548367809535699e-05, + 
"loss": 0.766, + "step": 125200 + }, + { + "epoch": 0.7999310018782823, + "grad_norm": 1.1075465679168701, + "learning_rate": 6.54789069870643e-05, + "loss": 0.8441, + "step": 125210 + }, + { + "epoch": 0.7999948890280209, + "grad_norm": 0.981423020362854, + "learning_rate": 6.547413572288564e-05, + "loss": 0.7874, + "step": 125220 + }, + { + "epoch": 0.8000587761777596, + "grad_norm": 0.9776214957237244, + "learning_rate": 6.546936430286903e-05, + "loss": 1.1112, + "step": 125230 + }, + { + "epoch": 0.8001226633274983, + "grad_norm": 0.6384032964706421, + "learning_rate": 6.546459272706254e-05, + "loss": 0.781, + "step": 125240 + }, + { + "epoch": 0.800186550477237, + "grad_norm": 1.0786020755767822, + "learning_rate": 6.545982099551422e-05, + "loss": 0.9097, + "step": 125250 + }, + { + "epoch": 0.8002504376269757, + "grad_norm": 1.0218867063522339, + "learning_rate": 6.545504910827214e-05, + "loss": 0.8466, + "step": 125260 + }, + { + "epoch": 0.8003143247767144, + "grad_norm": 0.5271647572517395, + "learning_rate": 6.545027706538434e-05, + "loss": 0.8278, + "step": 125270 + }, + { + "epoch": 0.8003782119264531, + "grad_norm": 0.5940924286842346, + "learning_rate": 6.544550486689889e-05, + "loss": 0.7146, + "step": 125280 + }, + { + "epoch": 0.8004420990761918, + "grad_norm": 1.223508596420288, + "learning_rate": 6.544073251286383e-05, + "loss": 0.9559, + "step": 125290 + }, + { + "epoch": 0.8005059862259305, + "grad_norm": 0.8251738548278809, + "learning_rate": 6.543596000332724e-05, + "loss": 0.8241, + "step": 125300 + }, + { + "epoch": 0.8005698733756692, + "grad_norm": 1.2967746257781982, + "learning_rate": 6.543118733833719e-05, + "loss": 0.7866, + "step": 125310 + }, + { + "epoch": 0.8006337605254079, + "grad_norm": 1.4892044067382812, + "learning_rate": 6.542641451794172e-05, + "loss": 0.785, + "step": 125320 + }, + { + "epoch": 0.8006976476751466, + "grad_norm": 0.7800642848014832, + "learning_rate": 6.54216415421889e-05, + "loss": 1.0806, + "step": 
125330 + }, + { + "epoch": 0.8007615348248853, + "grad_norm": 1.0607541799545288, + "learning_rate": 6.541686841112685e-05, + "loss": 0.8438, + "step": 125340 + }, + { + "epoch": 0.800825421974624, + "grad_norm": 1.6348508596420288, + "learning_rate": 6.541209512480355e-05, + "loss": 0.9009, + "step": 125350 + }, + { + "epoch": 0.8008893091243627, + "grad_norm": 0.7919349670410156, + "learning_rate": 6.540732168326715e-05, + "loss": 0.9583, + "step": 125360 + }, + { + "epoch": 0.8009531962741014, + "grad_norm": 0.8712650537490845, + "learning_rate": 6.540254808656567e-05, + "loss": 0.7806, + "step": 125370 + }, + { + "epoch": 0.8010170834238401, + "grad_norm": 0.9894066452980042, + "learning_rate": 6.539825171690796e-05, + "loss": 0.9936, + "step": 125380 + }, + { + "epoch": 0.8010809705735789, + "grad_norm": 1.3204667568206787, + "learning_rate": 6.539347782552532e-05, + "loss": 0.9534, + "step": 125390 + }, + { + "epoch": 0.8011448577233176, + "grad_norm": 0.7939335107803345, + "learning_rate": 6.538870377911706e-05, + "loss": 0.9815, + "step": 125400 + }, + { + "epoch": 0.8012087448730563, + "grad_norm": 0.7755239605903625, + "learning_rate": 6.538392957773122e-05, + "loss": 0.8787, + "step": 125410 + }, + { + "epoch": 0.801272632022795, + "grad_norm": 1.006554126739502, + "learning_rate": 6.53791552214159e-05, + "loss": 0.9379, + "step": 125420 + }, + { + "epoch": 0.8013365191725337, + "grad_norm": 0.7018999457359314, + "learning_rate": 6.53743807102192e-05, + "loss": 0.8513, + "step": 125430 + }, + { + "epoch": 0.8014004063222724, + "grad_norm": 0.9612287878990173, + "learning_rate": 6.536960604418918e-05, + "loss": 1.0788, + "step": 125440 + }, + { + "epoch": 0.8014642934720111, + "grad_norm": 0.7687857151031494, + "learning_rate": 6.536483122337391e-05, + "loss": 0.9172, + "step": 125450 + }, + { + "epoch": 0.8015281806217497, + "grad_norm": 1.8492335081100464, + "learning_rate": 6.536005624782152e-05, + "loss": 0.8897, + "step": 125460 + }, + { + "epoch": 
0.8015920677714884, + "grad_norm": 0.7783719301223755, + "learning_rate": 6.535528111758006e-05, + "loss": 1.1489, + "step": 125470 + }, + { + "epoch": 0.8016559549212271, + "grad_norm": 1.056986927986145, + "learning_rate": 6.535050583269764e-05, + "loss": 0.7073, + "step": 125480 + }, + { + "epoch": 0.8017198420709658, + "grad_norm": 0.8337403535842896, + "learning_rate": 6.534573039322235e-05, + "loss": 0.8441, + "step": 125490 + }, + { + "epoch": 0.8017837292207045, + "grad_norm": 0.8885868191719055, + "learning_rate": 6.534095479920227e-05, + "loss": 0.7835, + "step": 125500 + }, + { + "epoch": 0.8018476163704432, + "grad_norm": 1.2602735757827759, + "learning_rate": 6.533617905068549e-05, + "loss": 0.7678, + "step": 125510 + }, + { + "epoch": 0.8019115035201819, + "grad_norm": 1.2273060083389282, + "learning_rate": 6.533140314772015e-05, + "loss": 1.3602, + "step": 125520 + }, + { + "epoch": 0.8019753906699206, + "grad_norm": 0.9865765571594238, + "learning_rate": 6.532662709035431e-05, + "loss": 1.0048, + "step": 125530 + }, + { + "epoch": 0.8020392778196593, + "grad_norm": 0.8420624136924744, + "learning_rate": 6.532185087863607e-05, + "loss": 0.7444, + "step": 125540 + }, + { + "epoch": 0.802103164969398, + "grad_norm": 4.6297712326049805, + "learning_rate": 6.531707451261354e-05, + "loss": 0.921, + "step": 125550 + }, + { + "epoch": 0.8021670521191367, + "grad_norm": 0.9108629822731018, + "learning_rate": 6.531229799233482e-05, + "loss": 1.1591, + "step": 125560 + }, + { + "epoch": 0.8022309392688755, + "grad_norm": 1.6921429634094238, + "learning_rate": 6.530752131784801e-05, + "loss": 0.8015, + "step": 125570 + }, + { + "epoch": 0.8022948264186142, + "grad_norm": 0.6055482029914856, + "learning_rate": 6.530274448920122e-05, + "loss": 0.8499, + "step": 125580 + }, + { + "epoch": 0.8023587135683529, + "grad_norm": 1.0739818811416626, + "learning_rate": 6.529796750644255e-05, + "loss": 0.8019, + "step": 125590 + }, + { + "epoch": 0.8024226007180916, + 
"grad_norm": 1.0607513189315796, + "learning_rate": 6.52931903696201e-05, + "loss": 0.9353, + "step": 125600 + }, + { + "epoch": 0.8024864878678303, + "grad_norm": 0.9030999541282654, + "learning_rate": 6.528841307878201e-05, + "loss": 1.1511, + "step": 125610 + }, + { + "epoch": 0.802550375017569, + "grad_norm": 1.075486183166504, + "learning_rate": 6.528363563397638e-05, + "loss": 0.8964, + "step": 125620 + }, + { + "epoch": 0.8026142621673077, + "grad_norm": 1.8785415887832642, + "learning_rate": 6.527885803525131e-05, + "loss": 0.7627, + "step": 125630 + }, + { + "epoch": 0.8026781493170464, + "grad_norm": 0.9460232853889465, + "learning_rate": 6.527408028265491e-05, + "loss": 0.8828, + "step": 125640 + }, + { + "epoch": 0.8027420364667851, + "grad_norm": 0.7924548387527466, + "learning_rate": 6.526930237623533e-05, + "loss": 0.7436, + "step": 125650 + }, + { + "epoch": 0.8028059236165238, + "grad_norm": 0.8257904052734375, + "learning_rate": 6.526452431604065e-05, + "loss": 1.0438, + "step": 125660 + }, + { + "epoch": 0.8028698107662625, + "grad_norm": 1.1398284435272217, + "learning_rate": 6.5259746102119e-05, + "loss": 0.8756, + "step": 125670 + }, + { + "epoch": 0.8029336979160012, + "grad_norm": 0.955585777759552, + "learning_rate": 6.52549677345185e-05, + "loss": 1.0752, + "step": 125680 + }, + { + "epoch": 0.8029975850657399, + "grad_norm": 0.8351637721061707, + "learning_rate": 6.525018921328729e-05, + "loss": 0.9187, + "step": 125690 + }, + { + "epoch": 0.8030614722154786, + "grad_norm": 0.9746791124343872, + "learning_rate": 6.524541053847349e-05, + "loss": 0.6942, + "step": 125700 + }, + { + "epoch": 0.8031253593652172, + "grad_norm": 0.697482705116272, + "learning_rate": 6.52406317101252e-05, + "loss": 0.7528, + "step": 125710 + }, + { + "epoch": 0.8031892465149559, + "grad_norm": 0.9149326682090759, + "learning_rate": 6.523585272829056e-05, + "loss": 0.85, + "step": 125720 + }, + { + "epoch": 0.8032531336646946, + "grad_norm": 0.9170807003974915, + 
"learning_rate": 6.52310735930177e-05, + "loss": 1.1702, + "step": 125730 + }, + { + "epoch": 0.8033170208144333, + "grad_norm": 0.8044551014900208, + "learning_rate": 6.522629430435479e-05, + "loss": 1.0825, + "step": 125740 + }, + { + "epoch": 0.803380907964172, + "grad_norm": 1.228047490119934, + "learning_rate": 6.522151486234989e-05, + "loss": 0.7574, + "step": 125750 + }, + { + "epoch": 0.8034447951139108, + "grad_norm": 0.9429476857185364, + "learning_rate": 6.521673526705116e-05, + "loss": 1.0447, + "step": 125760 + }, + { + "epoch": 0.8035086822636495, + "grad_norm": 1.0148427486419678, + "learning_rate": 6.521195551850676e-05, + "loss": 1.0113, + "step": 125770 + }, + { + "epoch": 0.8035725694133882, + "grad_norm": 0.9460819959640503, + "learning_rate": 6.520717561676481e-05, + "loss": 1.0225, + "step": 125780 + }, + { + "epoch": 0.8036364565631269, + "grad_norm": 1.2216135263442993, + "learning_rate": 6.520239556187345e-05, + "loss": 0.999, + "step": 125790 + }, + { + "epoch": 0.8037003437128656, + "grad_norm": 0.7542139887809753, + "learning_rate": 6.519761535388079e-05, + "loss": 1.1307, + "step": 125800 + }, + { + "epoch": 0.8037642308626043, + "grad_norm": 0.6314334273338318, + "learning_rate": 6.519283499283502e-05, + "loss": 0.8114, + "step": 125810 + }, + { + "epoch": 0.803828118012343, + "grad_norm": 1.1564096212387085, + "learning_rate": 6.518805447878425e-05, + "loss": 0.8931, + "step": 125820 + }, + { + "epoch": 0.8038920051620817, + "grad_norm": 0.7837060689926147, + "learning_rate": 6.518327381177663e-05, + "loss": 0.7861, + "step": 125830 + }, + { + "epoch": 0.8039558923118204, + "grad_norm": 0.8681246042251587, + "learning_rate": 6.51784929918603e-05, + "loss": 1.194, + "step": 125840 + }, + { + "epoch": 0.8040197794615591, + "grad_norm": 1.4381413459777832, + "learning_rate": 6.517371201908342e-05, + "loss": 0.8307, + "step": 125850 + }, + { + "epoch": 0.8040836666112978, + "grad_norm": 1.4342834949493408, + "learning_rate": 
6.516893089349414e-05, + "loss": 0.7483, + "step": 125860 + }, + { + "epoch": 0.8041475537610365, + "grad_norm": 0.9879970550537109, + "learning_rate": 6.516414961514059e-05, + "loss": 1.1164, + "step": 125870 + }, + { + "epoch": 0.8042114409107752, + "grad_norm": 1.0189735889434814, + "learning_rate": 6.515936818407095e-05, + "loss": 0.9046, + "step": 125880 + }, + { + "epoch": 0.8042753280605139, + "grad_norm": 1.1108025312423706, + "learning_rate": 6.515458660033335e-05, + "loss": 0.913, + "step": 125890 + }, + { + "epoch": 0.8043392152102526, + "grad_norm": 0.862022876739502, + "learning_rate": 6.514980486397595e-05, + "loss": 0.913, + "step": 125900 + }, + { + "epoch": 0.8044031023599914, + "grad_norm": 0.8951718807220459, + "learning_rate": 6.51450229750469e-05, + "loss": 0.8194, + "step": 125910 + }, + { + "epoch": 0.8044669895097301, + "grad_norm": 0.9488630890846252, + "learning_rate": 6.514024093359438e-05, + "loss": 0.9198, + "step": 125920 + }, + { + "epoch": 0.8045308766594688, + "grad_norm": 1.038546085357666, + "learning_rate": 6.513545873966654e-05, + "loss": 0.8252, + "step": 125930 + }, + { + "epoch": 0.8045947638092075, + "grad_norm": 0.8957170844078064, + "learning_rate": 6.513067639331151e-05, + "loss": 0.9968, + "step": 125940 + }, + { + "epoch": 0.8046586509589461, + "grad_norm": 0.7613710761070251, + "learning_rate": 6.512589389457751e-05, + "loss": 0.9103, + "step": 125950 + }, + { + "epoch": 0.8047225381086848, + "grad_norm": 1.0033246278762817, + "learning_rate": 6.512111124351265e-05, + "loss": 0.9965, + "step": 125960 + }, + { + "epoch": 0.8047864252584235, + "grad_norm": 0.5386576652526855, + "learning_rate": 6.511632844016512e-05, + "loss": 0.7118, + "step": 125970 + }, + { + "epoch": 0.8048503124081622, + "grad_norm": 0.74485844373703, + "learning_rate": 6.511154548458312e-05, + "loss": 0.7851, + "step": 125980 + }, + { + "epoch": 0.8049141995579009, + "grad_norm": 0.9282761812210083, + "learning_rate": 6.510676237681475e-05, + 
"loss": 0.9678, + "step": 125990 + }, + { + "epoch": 0.8049780867076396, + "grad_norm": 2.0792996883392334, + "learning_rate": 6.510197911690822e-05, + "loss": 1.4649, + "step": 126000 + }, + { + "epoch": 0.8050419738573783, + "grad_norm": 0.6187208294868469, + "learning_rate": 6.50971957049117e-05, + "loss": 0.9499, + "step": 126010 + }, + { + "epoch": 0.805105861007117, + "grad_norm": 0.8118966221809387, + "learning_rate": 6.509241214087334e-05, + "loss": 0.7766, + "step": 126020 + }, + { + "epoch": 0.8051697481568557, + "grad_norm": 0.8239946365356445, + "learning_rate": 6.508762842484135e-05, + "loss": 0.8789, + "step": 126030 + }, + { + "epoch": 0.8052336353065944, + "grad_norm": 1.1240622997283936, + "learning_rate": 6.508284455686388e-05, + "loss": 0.6257, + "step": 126040 + }, + { + "epoch": 0.8052975224563331, + "grad_norm": 1.1769168376922607, + "learning_rate": 6.507806053698912e-05, + "loss": 0.8184, + "step": 126050 + }, + { + "epoch": 0.8053614096060718, + "grad_norm": 1.071930170059204, + "learning_rate": 6.507327636526526e-05, + "loss": 0.778, + "step": 126060 + }, + { + "epoch": 0.8054252967558105, + "grad_norm": 0.9074715375900269, + "learning_rate": 6.506849204174045e-05, + "loss": 1.0664, + "step": 126070 + }, + { + "epoch": 0.8054891839055492, + "grad_norm": 1.1464279890060425, + "learning_rate": 6.50637075664629e-05, + "loss": 0.9875, + "step": 126080 + }, + { + "epoch": 0.805553071055288, + "grad_norm": 1.2106982469558716, + "learning_rate": 6.505892293948077e-05, + "loss": 0.9013, + "step": 126090 + }, + { + "epoch": 0.8056169582050267, + "grad_norm": 0.763820230960846, + "learning_rate": 6.505413816084227e-05, + "loss": 0.8071, + "step": 126100 + }, + { + "epoch": 0.8056808453547654, + "grad_norm": 1.4662244319915771, + "learning_rate": 6.504935323059558e-05, + "loss": 0.7963, + "step": 126110 + }, + { + "epoch": 0.8057447325045041, + "grad_norm": 1.1400254964828491, + "learning_rate": 6.504456814878888e-05, + "loss": 0.8126, + "step": 
126120 + }, + { + "epoch": 0.8058086196542428, + "grad_norm": 0.844118595123291, + "learning_rate": 6.503978291547035e-05, + "loss": 0.8508, + "step": 126130 + }, + { + "epoch": 0.8058725068039815, + "grad_norm": 0.9051877856254578, + "learning_rate": 6.50349975306882e-05, + "loss": 0.788, + "step": 126140 + }, + { + "epoch": 0.8059363939537202, + "grad_norm": 0.8042912483215332, + "learning_rate": 6.503021199449063e-05, + "loss": 0.9271, + "step": 126150 + }, + { + "epoch": 0.8060002811034589, + "grad_norm": 0.8122944235801697, + "learning_rate": 6.50254263069258e-05, + "loss": 1.2381, + "step": 126160 + }, + { + "epoch": 0.8060641682531976, + "grad_norm": 0.9089512228965759, + "learning_rate": 6.502064046804193e-05, + "loss": 0.8603, + "step": 126170 + }, + { + "epoch": 0.8061280554029363, + "grad_norm": 0.9631441235542297, + "learning_rate": 6.501585447788724e-05, + "loss": 1.0211, + "step": 126180 + }, + { + "epoch": 0.8061919425526749, + "grad_norm": 0.5467532873153687, + "learning_rate": 6.501106833650989e-05, + "loss": 0.7911, + "step": 126190 + }, + { + "epoch": 0.8062558297024136, + "grad_norm": 0.8144317269325256, + "learning_rate": 6.500628204395809e-05, + "loss": 1.0271, + "step": 126200 + }, + { + "epoch": 0.8063197168521523, + "grad_norm": 0.6521165370941162, + "learning_rate": 6.500149560028005e-05, + "loss": 0.9794, + "step": 126210 + }, + { + "epoch": 0.806383604001891, + "grad_norm": 0.6328021287918091, + "learning_rate": 6.499670900552397e-05, + "loss": 0.8287, + "step": 126220 + }, + { + "epoch": 0.8064474911516297, + "grad_norm": 0.565212607383728, + "learning_rate": 6.499192225973806e-05, + "loss": 0.8106, + "step": 126230 + }, + { + "epoch": 0.8065113783013684, + "grad_norm": 0.8968755602836609, + "learning_rate": 6.498713536297053e-05, + "loss": 0.7972, + "step": 126240 + }, + { + "epoch": 0.8065752654511071, + "grad_norm": 0.9558743834495544, + "learning_rate": 6.498234831526957e-05, + "loss": 0.8166, + "step": 126250 + }, + { + "epoch": 
0.8066391526008458, + "grad_norm": 0.8803595900535583, + "learning_rate": 6.497756111668342e-05, + "loss": 0.7788, + "step": 126260 + }, + { + "epoch": 0.8067030397505845, + "grad_norm": 0.6762875914573669, + "learning_rate": 6.497277376726025e-05, + "loss": 0.9889, + "step": 126270 + }, + { + "epoch": 0.8067669269003233, + "grad_norm": 0.7682203650474548, + "learning_rate": 6.496798626704831e-05, + "loss": 1.016, + "step": 126280 + }, + { + "epoch": 0.806830814050062, + "grad_norm": 0.8153054714202881, + "learning_rate": 6.496319861609579e-05, + "loss": 0.8005, + "step": 126290 + }, + { + "epoch": 0.8068947011998007, + "grad_norm": 0.767785906791687, + "learning_rate": 6.495841081445091e-05, + "loss": 0.678, + "step": 126300 + }, + { + "epoch": 0.8069585883495394, + "grad_norm": 1.054632544517517, + "learning_rate": 6.495362286216191e-05, + "loss": 0.8752, + "step": 126310 + }, + { + "epoch": 0.8070224754992781, + "grad_norm": 1.0451246500015259, + "learning_rate": 6.494883475927698e-05, + "loss": 0.8354, + "step": 126320 + }, + { + "epoch": 0.8070863626490168, + "grad_norm": 0.6930572986602783, + "learning_rate": 6.494404650584435e-05, + "loss": 0.9319, + "step": 126330 + }, + { + "epoch": 0.8071502497987555, + "grad_norm": 0.9584304094314575, + "learning_rate": 6.493925810191226e-05, + "loss": 0.924, + "step": 126340 + }, + { + "epoch": 0.8072141369484942, + "grad_norm": 1.0455424785614014, + "learning_rate": 6.49344695475289e-05, + "loss": 1.0299, + "step": 126350 + }, + { + "epoch": 0.8072780240982329, + "grad_norm": 1.3468433618545532, + "learning_rate": 6.49296808427425e-05, + "loss": 0.7191, + "step": 126360 + }, + { + "epoch": 0.8073419112479716, + "grad_norm": 1.8125096559524536, + "learning_rate": 6.492489198760131e-05, + "loss": 0.8762, + "step": 126370 + }, + { + "epoch": 0.8074057983977103, + "grad_norm": 1.148374080657959, + "learning_rate": 6.492010298215355e-05, + "loss": 0.8672, + "step": 126380 + }, + { + "epoch": 0.807469685547449, + 
"grad_norm": 0.7599702477455139, + "learning_rate": 6.491531382644744e-05, + "loss": 0.9156, + "step": 126390 + }, + { + "epoch": 0.8075335726971877, + "grad_norm": 1.1603766679763794, + "learning_rate": 6.491052452053123e-05, + "loss": 0.7981, + "step": 126400 + }, + { + "epoch": 0.8075974598469264, + "grad_norm": 0.9405614733695984, + "learning_rate": 6.490573506445312e-05, + "loss": 0.8393, + "step": 126410 + }, + { + "epoch": 0.8076613469966651, + "grad_norm": 1.048951268196106, + "learning_rate": 6.490094545826137e-05, + "loss": 1.0174, + "step": 126420 + }, + { + "epoch": 0.8077252341464038, + "grad_norm": 2.152139663696289, + "learning_rate": 6.48961557020042e-05, + "loss": 1.0473, + "step": 126430 + }, + { + "epoch": 0.8077891212961424, + "grad_norm": 0.6861464977264404, + "learning_rate": 6.489136579572987e-05, + "loss": 0.7224, + "step": 126440 + }, + { + "epoch": 0.8078530084458811, + "grad_norm": 0.8665691018104553, + "learning_rate": 6.48865757394866e-05, + "loss": 1.0713, + "step": 126450 + }, + { + "epoch": 0.8079168955956199, + "grad_norm": 0.651671826839447, + "learning_rate": 6.488178553332262e-05, + "loss": 1.0617, + "step": 126460 + }, + { + "epoch": 0.8079807827453586, + "grad_norm": 1.4242401123046875, + "learning_rate": 6.487699517728621e-05, + "loss": 0.8041, + "step": 126470 + }, + { + "epoch": 0.8080446698950973, + "grad_norm": 0.5821726322174072, + "learning_rate": 6.487220467142556e-05, + "loss": 0.9239, + "step": 126480 + }, + { + "epoch": 0.808108557044836, + "grad_norm": 0.5187436938285828, + "learning_rate": 6.486741401578897e-05, + "loss": 0.7294, + "step": 126490 + }, + { + "epoch": 0.8081724441945747, + "grad_norm": 0.7180354595184326, + "learning_rate": 6.486262321042465e-05, + "loss": 0.8105, + "step": 126500 + }, + { + "epoch": 0.8082363313443134, + "grad_norm": 1.0905053615570068, + "learning_rate": 6.485783225538084e-05, + "loss": 0.7558, + "step": 126510 + }, + { + "epoch": 0.8083002184940521, + "grad_norm": 
0.7358648777008057, + "learning_rate": 6.485304115070582e-05, + "loss": 0.7867, + "step": 126520 + }, + { + "epoch": 0.8083641056437908, + "grad_norm": 0.6395271420478821, + "learning_rate": 6.484824989644783e-05, + "loss": 0.8508, + "step": 126530 + }, + { + "epoch": 0.8084279927935295, + "grad_norm": 0.877444863319397, + "learning_rate": 6.48434584926551e-05, + "loss": 1.0844, + "step": 126540 + }, + { + "epoch": 0.8084918799432682, + "grad_norm": 0.5219199061393738, + "learning_rate": 6.483866693937591e-05, + "loss": 0.7814, + "step": 126550 + }, + { + "epoch": 0.8085557670930069, + "grad_norm": 2.268413543701172, + "learning_rate": 6.483387523665852e-05, + "loss": 0.8243, + "step": 126560 + }, + { + "epoch": 0.8086196542427456, + "grad_norm": 0.6467793583869934, + "learning_rate": 6.482908338455113e-05, + "loss": 0.8682, + "step": 126570 + }, + { + "epoch": 0.8086835413924843, + "grad_norm": 1.109560489654541, + "learning_rate": 6.48242913831021e-05, + "loss": 0.7325, + "step": 126580 + }, + { + "epoch": 0.808747428542223, + "grad_norm": 2.1238925457000732, + "learning_rate": 6.48194992323596e-05, + "loss": 0.7621, + "step": 126590 + }, + { + "epoch": 0.8088113156919617, + "grad_norm": 0.7818292379379272, + "learning_rate": 6.481470693237193e-05, + "loss": 0.8513, + "step": 126600 + }, + { + "epoch": 0.8088752028417004, + "grad_norm": 0.8651500344276428, + "learning_rate": 6.480991448318735e-05, + "loss": 1.1925, + "step": 126610 + }, + { + "epoch": 0.8089390899914392, + "grad_norm": 1.0120964050292969, + "learning_rate": 6.48051218848541e-05, + "loss": 0.7734, + "step": 126620 + }, + { + "epoch": 0.8090029771411779, + "grad_norm": 0.6055595278739929, + "learning_rate": 6.480032913742047e-05, + "loss": 0.8752, + "step": 126630 + }, + { + "epoch": 0.8090668642909166, + "grad_norm": 0.9761593341827393, + "learning_rate": 6.479553624093473e-05, + "loss": 1.0153, + "step": 126640 + }, + { + "epoch": 0.8091307514406553, + "grad_norm": 0.926140308380127, + 
"learning_rate": 6.479074319544513e-05, + "loss": 0.7519, + "step": 126650 + }, + { + "epoch": 0.809194638590394, + "grad_norm": 1.0344536304473877, + "learning_rate": 6.478595000099996e-05, + "loss": 0.8786, + "step": 126660 + }, + { + "epoch": 0.8092585257401327, + "grad_norm": 1.2882970571517944, + "learning_rate": 6.478115665764748e-05, + "loss": 0.8631, + "step": 126670 + }, + { + "epoch": 0.8093224128898713, + "grad_norm": 0.98709636926651, + "learning_rate": 6.477636316543596e-05, + "loss": 0.9382, + "step": 126680 + }, + { + "epoch": 0.80938630003961, + "grad_norm": 0.9741780161857605, + "learning_rate": 6.477156952441368e-05, + "loss": 0.7388, + "step": 126690 + }, + { + "epoch": 0.8094501871893487, + "grad_norm": 0.7120775580406189, + "learning_rate": 6.476677573462893e-05, + "loss": 0.8167, + "step": 126700 + }, + { + "epoch": 0.8095140743390874, + "grad_norm": 0.9984919428825378, + "learning_rate": 6.476198179612995e-05, + "loss": 0.897, + "step": 126710 + }, + { + "epoch": 0.8095779614888261, + "grad_norm": 2.221468925476074, + "learning_rate": 6.475718770896505e-05, + "loss": 1.0463, + "step": 126720 + }, + { + "epoch": 0.8096418486385648, + "grad_norm": 0.9233216643333435, + "learning_rate": 6.47523934731825e-05, + "loss": 0.8333, + "step": 126730 + }, + { + "epoch": 0.8097057357883035, + "grad_norm": 0.7584207057952881, + "learning_rate": 6.47475990888306e-05, + "loss": 1.0903, + "step": 126740 + }, + { + "epoch": 0.8097696229380422, + "grad_norm": 0.913167417049408, + "learning_rate": 6.474280455595761e-05, + "loss": 0.9977, + "step": 126750 + }, + { + "epoch": 0.8098335100877809, + "grad_norm": 0.8217071890830994, + "learning_rate": 6.473800987461182e-05, + "loss": 0.8709, + "step": 126760 + }, + { + "epoch": 0.8098973972375196, + "grad_norm": 0.8127371072769165, + "learning_rate": 6.473321504484152e-05, + "loss": 0.9532, + "step": 126770 + }, + { + "epoch": 0.8099612843872583, + "grad_norm": 0.5659823417663574, + "learning_rate": 
6.4728420066695e-05, + "loss": 0.9879, + "step": 126780 + }, + { + "epoch": 0.810025171536997, + "grad_norm": 4.89599084854126, + "learning_rate": 6.472362494022055e-05, + "loss": 0.973, + "step": 126790 + }, + { + "epoch": 0.8100890586867358, + "grad_norm": 2.1333658695220947, + "learning_rate": 6.471882966546647e-05, + "loss": 0.6452, + "step": 126800 + }, + { + "epoch": 0.8101529458364745, + "grad_norm": 0.8865774869918823, + "learning_rate": 6.471403424248102e-05, + "loss": 0.7954, + "step": 126810 + }, + { + "epoch": 0.8102168329862132, + "grad_norm": 0.8974156975746155, + "learning_rate": 6.470923867131254e-05, + "loss": 0.9977, + "step": 126820 + }, + { + "epoch": 0.8102807201359519, + "grad_norm": 1.3754866123199463, + "learning_rate": 6.47044429520093e-05, + "loss": 0.8972, + "step": 126830 + }, + { + "epoch": 0.8103446072856906, + "grad_norm": 0.8997700214385986, + "learning_rate": 6.469964708461957e-05, + "loss": 0.6717, + "step": 126840 + }, + { + "epoch": 0.8104084944354293, + "grad_norm": 0.7010866403579712, + "learning_rate": 6.469485106919171e-05, + "loss": 0.688, + "step": 126850 + }, + { + "epoch": 0.810472381585168, + "grad_norm": 1.2997609376907349, + "learning_rate": 6.469005490577397e-05, + "loss": 1.0006, + "step": 126860 + }, + { + "epoch": 0.8105362687349067, + "grad_norm": 0.8053306937217712, + "learning_rate": 6.468525859441466e-05, + "loss": 0.8973, + "step": 126870 + }, + { + "epoch": 0.8106001558846454, + "grad_norm": 0.7065293192863464, + "learning_rate": 6.46804621351621e-05, + "loss": 1.0594, + "step": 126880 + }, + { + "epoch": 0.8106640430343841, + "grad_norm": 1.1768196821212769, + "learning_rate": 6.467566552806458e-05, + "loss": 0.9089, + "step": 126890 + }, + { + "epoch": 0.8107279301841228, + "grad_norm": 1.5799227952957153, + "learning_rate": 6.467086877317042e-05, + "loss": 0.7268, + "step": 126900 + }, + { + "epoch": 0.8107918173338615, + "grad_norm": 0.7918219566345215, + "learning_rate": 6.466607187052791e-05, + "loss": 
1.016, + "step": 126910 + }, + { + "epoch": 0.8108557044836001, + "grad_norm": 1.188558578491211, + "learning_rate": 6.466127482018538e-05, + "loss": 1.0635, + "step": 126920 + }, + { + "epoch": 0.8109195916333388, + "grad_norm": 0.8027870059013367, + "learning_rate": 6.465647762219113e-05, + "loss": 0.7027, + "step": 126930 + }, + { + "epoch": 0.8109834787830775, + "grad_norm": 0.84566330909729, + "learning_rate": 6.465168027659347e-05, + "loss": 0.8105, + "step": 126940 + }, + { + "epoch": 0.8110473659328162, + "grad_norm": 0.6688374280929565, + "learning_rate": 6.46468827834407e-05, + "loss": 0.7993, + "step": 126950 + }, + { + "epoch": 0.8111112530825549, + "grad_norm": 0.7672613263130188, + "learning_rate": 6.464208514278117e-05, + "loss": 0.9798, + "step": 126960 + }, + { + "epoch": 0.8111751402322936, + "grad_norm": 1.0003461837768555, + "learning_rate": 6.463728735466316e-05, + "loss": 0.8659, + "step": 126970 + }, + { + "epoch": 0.8112390273820324, + "grad_norm": 1.0807254314422607, + "learning_rate": 6.4632489419135e-05, + "loss": 0.9423, + "step": 126980 + }, + { + "epoch": 0.8113029145317711, + "grad_norm": 0.6084434390068054, + "learning_rate": 6.462769133624502e-05, + "loss": 0.6477, + "step": 126990 + }, + { + "epoch": 0.8113668016815098, + "grad_norm": 0.7331100106239319, + "learning_rate": 6.462289310604152e-05, + "loss": 1.0194, + "step": 127000 + }, + { + "epoch": 0.8114306888312485, + "grad_norm": 1.0678889751434326, + "learning_rate": 6.461809472857287e-05, + "loss": 0.7349, + "step": 127010 + }, + { + "epoch": 0.8114945759809872, + "grad_norm": 1.4180760383605957, + "learning_rate": 6.461329620388733e-05, + "loss": 0.7278, + "step": 127020 + }, + { + "epoch": 0.8115584631307259, + "grad_norm": 0.904155433177948, + "learning_rate": 6.460849753203326e-05, + "loss": 0.8996, + "step": 127030 + }, + { + "epoch": 0.8116223502804646, + "grad_norm": 0.8179849982261658, + "learning_rate": 6.460369871305899e-05, + "loss": 0.8029, + "step": 127040 + }, + 
{ + "epoch": 0.8116862374302033, + "grad_norm": 0.8025046586990356, + "learning_rate": 6.459889974701284e-05, + "loss": 1.1434, + "step": 127050 + }, + { + "epoch": 0.811750124579942, + "grad_norm": 0.9315536022186279, + "learning_rate": 6.459410063394314e-05, + "loss": 0.7199, + "step": 127060 + }, + { + "epoch": 0.8118140117296807, + "grad_norm": 1.0310189723968506, + "learning_rate": 6.458930137389821e-05, + "loss": 0.8107, + "step": 127070 + }, + { + "epoch": 0.8118778988794194, + "grad_norm": 0.897158682346344, + "learning_rate": 6.45845019669264e-05, + "loss": 0.9007, + "step": 127080 + }, + { + "epoch": 0.8119417860291581, + "grad_norm": 0.8485071659088135, + "learning_rate": 6.457970241307603e-05, + "loss": 0.9143, + "step": 127090 + }, + { + "epoch": 0.8120056731788968, + "grad_norm": 1.0846539735794067, + "learning_rate": 6.457490271239546e-05, + "loss": 0.5329, + "step": 127100 + }, + { + "epoch": 0.8120695603286355, + "grad_norm": 1.0660455226898193, + "learning_rate": 6.457010286493299e-05, + "loss": 0.8025, + "step": 127110 + }, + { + "epoch": 0.8121334474783742, + "grad_norm": 0.7800552248954773, + "learning_rate": 6.4565302870737e-05, + "loss": 0.7744, + "step": 127120 + }, + { + "epoch": 0.8121973346281129, + "grad_norm": 1.9460397958755493, + "learning_rate": 6.45605027298558e-05, + "loss": 1.0076, + "step": 127130 + }, + { + "epoch": 0.8122612217778516, + "grad_norm": 0.7242342233657837, + "learning_rate": 6.455570244233774e-05, + "loss": 0.878, + "step": 127140 + }, + { + "epoch": 0.8123251089275904, + "grad_norm": 1.0195945501327515, + "learning_rate": 6.455090200823117e-05, + "loss": 1.0594, + "step": 127150 + }, + { + "epoch": 0.812388996077329, + "grad_norm": 0.4291139841079712, + "learning_rate": 6.454610142758442e-05, + "loss": 0.8271, + "step": 127160 + }, + { + "epoch": 0.8124528832270677, + "grad_norm": 0.8189600110054016, + "learning_rate": 6.454130070044584e-05, + "loss": 1.0931, + "step": 127170 + }, + { + "epoch": 
0.8125167703768064, + "grad_norm": 0.6839133501052856, + "learning_rate": 6.45364998268638e-05, + "loss": 0.9161, + "step": 127180 + }, + { + "epoch": 0.8125806575265451, + "grad_norm": 0.835392951965332, + "learning_rate": 6.45316988068866e-05, + "loss": 0.9721, + "step": 127190 + }, + { + "epoch": 0.8126445446762838, + "grad_norm": 0.5460143089294434, + "learning_rate": 6.452689764056265e-05, + "loss": 0.9177, + "step": 127200 + }, + { + "epoch": 0.8127084318260225, + "grad_norm": 1.0494486093521118, + "learning_rate": 6.452209632794027e-05, + "loss": 0.8844, + "step": 127210 + }, + { + "epoch": 0.8127723189757612, + "grad_norm": 0.6247775554656982, + "learning_rate": 6.451729486906781e-05, + "loss": 0.9528, + "step": 127220 + }, + { + "epoch": 0.8128362061254999, + "grad_norm": 1.8002761602401733, + "learning_rate": 6.451249326399364e-05, + "loss": 1.1712, + "step": 127230 + }, + { + "epoch": 0.8129000932752386, + "grad_norm": 0.9478850960731506, + "learning_rate": 6.45076915127661e-05, + "loss": 0.8959, + "step": 127240 + }, + { + "epoch": 0.8129639804249773, + "grad_norm": 0.707378089427948, + "learning_rate": 6.450288961543355e-05, + "loss": 0.8968, + "step": 127250 + }, + { + "epoch": 0.813027867574716, + "grad_norm": 0.9674128890037537, + "learning_rate": 6.449808757204435e-05, + "loss": 0.862, + "step": 127260 + }, + { + "epoch": 0.8130917547244547, + "grad_norm": 1.1867669820785522, + "learning_rate": 6.449328538264687e-05, + "loss": 0.808, + "step": 127270 + }, + { + "epoch": 0.8131556418741934, + "grad_norm": 1.1251099109649658, + "learning_rate": 6.448848304728949e-05, + "loss": 0.8379, + "step": 127280 + }, + { + "epoch": 0.8132195290239321, + "grad_norm": 0.891304612159729, + "learning_rate": 6.448368056602053e-05, + "loss": 0.9116, + "step": 127290 + }, + { + "epoch": 0.8132834161736708, + "grad_norm": 1.0595531463623047, + "learning_rate": 6.447887793888838e-05, + "loss": 0.8859, + "step": 127300 + }, + { + "epoch": 0.8133473033234095, + 
"grad_norm": 0.8898464441299438, + "learning_rate": 6.447407516594142e-05, + "loss": 0.982, + "step": 127310 + }, + { + "epoch": 0.8134111904731482, + "grad_norm": 2.1470937728881836, + "learning_rate": 6.446927224722799e-05, + "loss": 0.8127, + "step": 127320 + }, + { + "epoch": 0.813475077622887, + "grad_norm": 1.043031096458435, + "learning_rate": 6.446446918279647e-05, + "loss": 0.8647, + "step": 127330 + }, + { + "epoch": 0.8135389647726257, + "grad_norm": 0.8971779942512512, + "learning_rate": 6.445966597269522e-05, + "loss": 1.0, + "step": 127340 + }, + { + "epoch": 0.8136028519223644, + "grad_norm": 0.8842697739601135, + "learning_rate": 6.445486261697263e-05, + "loss": 0.8011, + "step": 127350 + }, + { + "epoch": 0.8136667390721031, + "grad_norm": 0.8753737211227417, + "learning_rate": 6.445005911567707e-05, + "loss": 0.9761, + "step": 127360 + }, + { + "epoch": 0.8137306262218418, + "grad_norm": 0.7797544598579407, + "learning_rate": 6.444525546885692e-05, + "loss": 0.7789, + "step": 127370 + }, + { + "epoch": 0.8137945133715805, + "grad_norm": 0.49460268020629883, + "learning_rate": 6.444045167656055e-05, + "loss": 0.836, + "step": 127380 + }, + { + "epoch": 0.8138584005213192, + "grad_norm": 1.005393385887146, + "learning_rate": 6.443564773883634e-05, + "loss": 0.8948, + "step": 127390 + }, + { + "epoch": 0.8139222876710579, + "grad_norm": 0.7220799922943115, + "learning_rate": 6.443084365573265e-05, + "loss": 0.8677, + "step": 127400 + }, + { + "epoch": 0.8139861748207965, + "grad_norm": 0.8531742691993713, + "learning_rate": 6.44260394272979e-05, + "loss": 0.7786, + "step": 127410 + }, + { + "epoch": 0.8140500619705352, + "grad_norm": 1.4867233037948608, + "learning_rate": 6.442123505358043e-05, + "loss": 0.8496, + "step": 127420 + }, + { + "epoch": 0.8141139491202739, + "grad_norm": 0.6640691161155701, + "learning_rate": 6.441643053462867e-05, + "loss": 0.847, + "step": 127430 + }, + { + "epoch": 0.8141778362700126, + "grad_norm": 0.5438361763954163, 
+ "learning_rate": 6.441162587049096e-05, + "loss": 0.7101, + "step": 127440 + }, + { + "epoch": 0.8142417234197513, + "grad_norm": 0.879038393497467, + "learning_rate": 6.440682106121574e-05, + "loss": 0.9942, + "step": 127450 + }, + { + "epoch": 0.81430561056949, + "grad_norm": 0.6721540689468384, + "learning_rate": 6.440201610685135e-05, + "loss": 0.6765, + "step": 127460 + }, + { + "epoch": 0.8143694977192287, + "grad_norm": 0.627669095993042, + "learning_rate": 6.43972110074462e-05, + "loss": 0.8154, + "step": 127470 + }, + { + "epoch": 0.8144333848689674, + "grad_norm": 1.5187098979949951, + "learning_rate": 6.439240576304868e-05, + "loss": 1.1218, + "step": 127480 + }, + { + "epoch": 0.8144972720187061, + "grad_norm": 0.519985556602478, + "learning_rate": 6.438760037370719e-05, + "loss": 0.7047, + "step": 127490 + }, + { + "epoch": 0.8145611591684448, + "grad_norm": 0.7375752329826355, + "learning_rate": 6.43827948394701e-05, + "loss": 0.8807, + "step": 127500 + }, + { + "epoch": 0.8146250463181836, + "grad_norm": 0.5669057369232178, + "learning_rate": 6.437798916038584e-05, + "loss": 0.8591, + "step": 127510 + }, + { + "epoch": 0.8146889334679223, + "grad_norm": 0.9515382051467896, + "learning_rate": 6.437318333650279e-05, + "loss": 0.8639, + "step": 127520 + }, + { + "epoch": 0.814752820617661, + "grad_norm": 0.9715726971626282, + "learning_rate": 6.436837736786934e-05, + "loss": 0.6827, + "step": 127530 + }, + { + "epoch": 0.8148167077673997, + "grad_norm": 1.2894679307937622, + "learning_rate": 6.43635712545339e-05, + "loss": 0.7838, + "step": 127540 + }, + { + "epoch": 0.8148805949171384, + "grad_norm": 0.9113032817840576, + "learning_rate": 6.43587649965449e-05, + "loss": 0.9437, + "step": 127550 + }, + { + "epoch": 0.8149444820668771, + "grad_norm": 0.8050090074539185, + "learning_rate": 6.435395859395068e-05, + "loss": 1.0804, + "step": 127560 + }, + { + "epoch": 0.8150083692166158, + "grad_norm": 1.1734331846237183, + "learning_rate": 
6.434915204679969e-05, + "loss": 0.9166, + "step": 127570 + }, + { + "epoch": 0.8150722563663545, + "grad_norm": 1.3602896928787231, + "learning_rate": 6.434434535514031e-05, + "loss": 0.8164, + "step": 127580 + }, + { + "epoch": 0.8151361435160932, + "grad_norm": 0.9085065722465515, + "learning_rate": 6.433953851902097e-05, + "loss": 1.0613, + "step": 127590 + }, + { + "epoch": 0.8152000306658319, + "grad_norm": 0.8501441478729248, + "learning_rate": 6.433473153849007e-05, + "loss": 0.7548, + "step": 127600 + }, + { + "epoch": 0.8152639178155706, + "grad_norm": 1.4549124240875244, + "learning_rate": 6.432992441359605e-05, + "loss": 0.8768, + "step": 127610 + }, + { + "epoch": 0.8153278049653093, + "grad_norm": 0.9004676938056946, + "learning_rate": 6.432511714438727e-05, + "loss": 0.9546, + "step": 127620 + }, + { + "epoch": 0.815391692115048, + "grad_norm": 0.6986418962478638, + "learning_rate": 6.432030973091216e-05, + "loss": 0.8329, + "step": 127630 + }, + { + "epoch": 0.8154555792647867, + "grad_norm": 1.1968231201171875, + "learning_rate": 6.431550217321916e-05, + "loss": 0.8781, + "step": 127640 + }, + { + "epoch": 0.8155194664145253, + "grad_norm": 0.9904518723487854, + "learning_rate": 6.431069447135665e-05, + "loss": 0.7686, + "step": 127650 + }, + { + "epoch": 0.815583353564264, + "grad_norm": 0.846964955329895, + "learning_rate": 6.43058866253731e-05, + "loss": 0.7053, + "step": 127660 + }, + { + "epoch": 0.8156472407140027, + "grad_norm": 0.9893980026245117, + "learning_rate": 6.430107863531685e-05, + "loss": 0.7232, + "step": 127670 + }, + { + "epoch": 0.8157111278637414, + "grad_norm": 0.9716863632202148, + "learning_rate": 6.42962705012364e-05, + "loss": 0.9921, + "step": 127680 + }, + { + "epoch": 0.8157750150134802, + "grad_norm": 0.5748932361602783, + "learning_rate": 6.429146222318013e-05, + "loss": 0.7242, + "step": 127690 + }, + { + "epoch": 0.8158389021632189, + "grad_norm": 0.6904158592224121, + "learning_rate": 6.428665380119648e-05, + 
"loss": 1.1946, + "step": 127700 + }, + { + "epoch": 0.8159027893129576, + "grad_norm": 0.8382551074028015, + "learning_rate": 6.428184523533384e-05, + "loss": 0.9143, + "step": 127710 + }, + { + "epoch": 0.8159666764626963, + "grad_norm": 1.1233938932418823, + "learning_rate": 6.427703652564067e-05, + "loss": 1.239, + "step": 127720 + }, + { + "epoch": 0.816030563612435, + "grad_norm": 0.6471089720726013, + "learning_rate": 6.42722276721654e-05, + "loss": 0.9955, + "step": 127730 + }, + { + "epoch": 0.8160944507621737, + "grad_norm": 0.796449601650238, + "learning_rate": 6.426741867495645e-05, + "loss": 1.1798, + "step": 127740 + }, + { + "epoch": 0.8161583379119124, + "grad_norm": 1.216551423072815, + "learning_rate": 6.426260953406225e-05, + "loss": 0.9472, + "step": 127750 + }, + { + "epoch": 0.8162222250616511, + "grad_norm": 0.4935864508152008, + "learning_rate": 6.425780024953124e-05, + "loss": 1.0413, + "step": 127760 + }, + { + "epoch": 0.8162861122113898, + "grad_norm": 1.2737202644348145, + "learning_rate": 6.425299082141184e-05, + "loss": 1.1372, + "step": 127770 + }, + { + "epoch": 0.8163499993611285, + "grad_norm": 1.7472068071365356, + "learning_rate": 6.424818124975248e-05, + "loss": 0.7832, + "step": 127780 + }, + { + "epoch": 0.8164138865108672, + "grad_norm": 0.7582964897155762, + "learning_rate": 6.424337153460162e-05, + "loss": 0.8762, + "step": 127790 + }, + { + "epoch": 0.8164777736606059, + "grad_norm": 1.1670618057250977, + "learning_rate": 6.42385616760077e-05, + "loss": 1.2763, + "step": 127800 + }, + { + "epoch": 0.8165416608103446, + "grad_norm": 0.6307504773139954, + "learning_rate": 6.423375167401912e-05, + "loss": 0.6937, + "step": 127810 + }, + { + "epoch": 0.8166055479600833, + "grad_norm": 1.9756511449813843, + "learning_rate": 6.422894152868437e-05, + "loss": 0.7304, + "step": 127820 + }, + { + "epoch": 0.816669435109822, + "grad_norm": 1.0273828506469727, + "learning_rate": 6.422413124005185e-05, + "loss": 0.7999, + "step": 
127830 + }, + { + "epoch": 0.8167333222595607, + "grad_norm": 1.1379588842391968, + "learning_rate": 6.421932080817003e-05, + "loss": 0.8498, + "step": 127840 + }, + { + "epoch": 0.8167972094092995, + "grad_norm": 0.7161405086517334, + "learning_rate": 6.421451023308735e-05, + "loss": 1.4902, + "step": 127850 + }, + { + "epoch": 0.8168610965590382, + "grad_norm": 1.0441093444824219, + "learning_rate": 6.420969951485225e-05, + "loss": 0.8677, + "step": 127860 + }, + { + "epoch": 0.8169249837087769, + "grad_norm": 0.8484379053115845, + "learning_rate": 6.42048886535132e-05, + "loss": 0.7921, + "step": 127870 + }, + { + "epoch": 0.8169888708585156, + "grad_norm": 0.8539422750473022, + "learning_rate": 6.420007764911861e-05, + "loss": 0.7991, + "step": 127880 + }, + { + "epoch": 0.8170527580082542, + "grad_norm": 0.5614151954650879, + "learning_rate": 6.419526650171697e-05, + "loss": 1.0511, + "step": 127890 + }, + { + "epoch": 0.8171166451579929, + "grad_norm": 0.9628438949584961, + "learning_rate": 6.41904552113567e-05, + "loss": 0.6749, + "step": 127900 + }, + { + "epoch": 0.8171805323077316, + "grad_norm": 0.8185387253761292, + "learning_rate": 6.418564377808627e-05, + "loss": 0.8436, + "step": 127910 + }, + { + "epoch": 0.8172444194574703, + "grad_norm": 0.6179929971694946, + "learning_rate": 6.418083220195414e-05, + "loss": 0.7563, + "step": 127920 + }, + { + "epoch": 0.817308306607209, + "grad_norm": 0.7823129892349243, + "learning_rate": 6.417602048300877e-05, + "loss": 0.8868, + "step": 127930 + }, + { + "epoch": 0.8173721937569477, + "grad_norm": 1.1083999872207642, + "learning_rate": 6.41712086212986e-05, + "loss": 0.8661, + "step": 127940 + }, + { + "epoch": 0.8174360809066864, + "grad_norm": 2.1160151958465576, + "learning_rate": 6.41663966168721e-05, + "loss": 0.8088, + "step": 127950 + }, + { + "epoch": 0.8174999680564251, + "grad_norm": 0.6990865468978882, + "learning_rate": 6.416158446977772e-05, + "loss": 1.0852, + "step": 127960 + }, + { + "epoch": 
0.8175638552061638, + "grad_norm": 0.8294287919998169, + "learning_rate": 6.415677218006395e-05, + "loss": 0.8674, + "step": 127970 + }, + { + "epoch": 0.8176277423559025, + "grad_norm": 1.1241607666015625, + "learning_rate": 6.415195974777923e-05, + "loss": 0.939, + "step": 127980 + }, + { + "epoch": 0.8176916295056412, + "grad_norm": 0.8454298377037048, + "learning_rate": 6.414714717297203e-05, + "loss": 0.8675, + "step": 127990 + }, + { + "epoch": 0.8177555166553799, + "grad_norm": 0.7560257315635681, + "learning_rate": 6.414233445569083e-05, + "loss": 1.0024, + "step": 128000 + }, + { + "epoch": 0.8178194038051186, + "grad_norm": 0.7482271790504456, + "learning_rate": 6.413752159598408e-05, + "loss": 0.8029, + "step": 128010 + }, + { + "epoch": 0.8178832909548573, + "grad_norm": 1.5129786729812622, + "learning_rate": 6.413270859390026e-05, + "loss": 1.0656, + "step": 128020 + }, + { + "epoch": 0.817947178104596, + "grad_norm": 1.2773970365524292, + "learning_rate": 6.412789544948782e-05, + "loss": 0.9819, + "step": 128030 + }, + { + "epoch": 0.8180110652543348, + "grad_norm": 0.8020282983779907, + "learning_rate": 6.41230821627953e-05, + "loss": 0.6803, + "step": 128040 + }, + { + "epoch": 0.8180749524040735, + "grad_norm": 1.2904086112976074, + "learning_rate": 6.411826873387108e-05, + "loss": 1.1785, + "step": 128050 + }, + { + "epoch": 0.8181388395538122, + "grad_norm": 0.6891657710075378, + "learning_rate": 6.41134551627637e-05, + "loss": 0.8503, + "step": 128060 + }, + { + "epoch": 0.8182027267035509, + "grad_norm": 0.9676710963249207, + "learning_rate": 6.41086414495216e-05, + "loss": 0.8886, + "step": 128070 + }, + { + "epoch": 0.8182666138532896, + "grad_norm": 0.881633460521698, + "learning_rate": 6.410382759419328e-05, + "loss": 0.8703, + "step": 128080 + }, + { + "epoch": 0.8183305010030283, + "grad_norm": 2.246070146560669, + "learning_rate": 6.409901359682722e-05, + "loss": 0.9637, + "step": 128090 + }, + { + "epoch": 0.818394388152767, + 
"grad_norm": 0.9771053194999695, + "learning_rate": 6.409419945747189e-05, + "loss": 0.8403, + "step": 128100 + }, + { + "epoch": 0.8184582753025057, + "grad_norm": 0.9186480641365051, + "learning_rate": 6.408938517617576e-05, + "loss": 0.8156, + "step": 128110 + }, + { + "epoch": 0.8185221624522444, + "grad_norm": 0.9104690551757812, + "learning_rate": 6.408457075298734e-05, + "loss": 0.9832, + "step": 128120 + }, + { + "epoch": 0.8185860496019831, + "grad_norm": 0.849088191986084, + "learning_rate": 6.407975618795514e-05, + "loss": 1.0383, + "step": 128130 + }, + { + "epoch": 0.8186499367517217, + "grad_norm": 0.667122483253479, + "learning_rate": 6.40749414811276e-05, + "loss": 1.0298, + "step": 128140 + }, + { + "epoch": 0.8187138239014604, + "grad_norm": 0.7279898524284363, + "learning_rate": 6.407012663255321e-05, + "loss": 0.8889, + "step": 128150 + }, + { + "epoch": 0.8187777110511991, + "grad_norm": 0.6628199219703674, + "learning_rate": 6.406531164228048e-05, + "loss": 1.0062, + "step": 128160 + }, + { + "epoch": 0.8188415982009378, + "grad_norm": 0.8573051691055298, + "learning_rate": 6.406049651035789e-05, + "loss": 0.8815, + "step": 128170 + }, + { + "epoch": 0.8189054853506765, + "grad_norm": 0.6094196438789368, + "learning_rate": 6.405568123683395e-05, + "loss": 1.0032, + "step": 128180 + }, + { + "epoch": 0.8189693725004152, + "grad_norm": 1.0880091190338135, + "learning_rate": 6.405086582175712e-05, + "loss": 0.7361, + "step": 128190 + }, + { + "epoch": 0.8190332596501539, + "grad_norm": 4.45365571975708, + "learning_rate": 6.404605026517592e-05, + "loss": 0.925, + "step": 128200 + }, + { + "epoch": 0.8190971467998927, + "grad_norm": 1.8221222162246704, + "learning_rate": 6.404123456713884e-05, + "loss": 0.8888, + "step": 128210 + }, + { + "epoch": 0.8191610339496314, + "grad_norm": 0.8215370774269104, + "learning_rate": 6.403641872769439e-05, + "loss": 0.806, + "step": 128220 + }, + { + "epoch": 0.8192249210993701, + "grad_norm": 
0.7593247890472412, + "learning_rate": 6.403160274689107e-05, + "loss": 0.7808, + "step": 128230 + }, + { + "epoch": 0.8192888082491088, + "grad_norm": 0.8794440031051636, + "learning_rate": 6.402678662477735e-05, + "loss": 0.9681, + "step": 128240 + }, + { + "epoch": 0.8193526953988475, + "grad_norm": 0.9610484838485718, + "learning_rate": 6.402197036140176e-05, + "loss": 0.6683, + "step": 128250 + }, + { + "epoch": 0.8194165825485862, + "grad_norm": 1.84084153175354, + "learning_rate": 6.40171539568128e-05, + "loss": 0.8565, + "step": 128260 + }, + { + "epoch": 0.8194804696983249, + "grad_norm": 0.8622168898582458, + "learning_rate": 6.401233741105898e-05, + "loss": 0.8958, + "step": 128270 + }, + { + "epoch": 0.8195443568480636, + "grad_norm": 0.8887889981269836, + "learning_rate": 6.400752072418878e-05, + "loss": 0.6639, + "step": 128280 + }, + { + "epoch": 0.8196082439978023, + "grad_norm": 1.2179750204086304, + "learning_rate": 6.400270389625075e-05, + "loss": 0.8348, + "step": 128290 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.7495495080947876, + "learning_rate": 6.399788692729337e-05, + "loss": 0.732, + "step": 128300 + }, + { + "epoch": 0.8197360182972797, + "grad_norm": 0.9939058423042297, + "learning_rate": 6.399306981736515e-05, + "loss": 0.9292, + "step": 128310 + }, + { + "epoch": 0.8197999054470184, + "grad_norm": 1.0423085689544678, + "learning_rate": 6.398825256651463e-05, + "loss": 1.0162, + "step": 128320 + }, + { + "epoch": 0.8198637925967571, + "grad_norm": 1.109744668006897, + "learning_rate": 6.398343517479029e-05, + "loss": 0.8332, + "step": 128330 + }, + { + "epoch": 0.8199276797464958, + "grad_norm": 0.8410364389419556, + "learning_rate": 6.397861764224067e-05, + "loss": 0.6665, + "step": 128340 + }, + { + "epoch": 0.8199915668962345, + "grad_norm": 1.0010278224945068, + "learning_rate": 6.397379996891426e-05, + "loss": 0.8106, + "step": 128350 + }, + { + "epoch": 0.8200554540459732, + "grad_norm": 0.9419941902160645, + 
"learning_rate": 6.396898215485962e-05, + "loss": 0.768, + "step": 128360 + }, + { + "epoch": 0.820119341195712, + "grad_norm": 0.6813206076622009, + "learning_rate": 6.396416420012523e-05, + "loss": 0.7858, + "step": 128370 + }, + { + "epoch": 0.8201832283454505, + "grad_norm": 1.2011756896972656, + "learning_rate": 6.395934610475963e-05, + "loss": 0.8183, + "step": 128380 + }, + { + "epoch": 0.8202471154951892, + "grad_norm": 5.552373886108398, + "learning_rate": 6.395452786881133e-05, + "loss": 1.1759, + "step": 128390 + }, + { + "epoch": 0.820311002644928, + "grad_norm": 0.658540666103363, + "learning_rate": 6.394970949232887e-05, + "loss": 0.7854, + "step": 128400 + }, + { + "epoch": 0.8203748897946667, + "grad_norm": 1.2087280750274658, + "learning_rate": 6.394489097536076e-05, + "loss": 0.9018, + "step": 128410 + }, + { + "epoch": 0.8204387769444054, + "grad_norm": 0.8210545182228088, + "learning_rate": 6.394007231795554e-05, + "loss": 0.809, + "step": 128420 + }, + { + "epoch": 0.8205026640941441, + "grad_norm": 1.1109236478805542, + "learning_rate": 6.393525352016174e-05, + "loss": 0.8139, + "step": 128430 + }, + { + "epoch": 0.8205665512438828, + "grad_norm": 0.5671748518943787, + "learning_rate": 6.393043458202787e-05, + "loss": 0.7467, + "step": 128440 + }, + { + "epoch": 0.8206304383936215, + "grad_norm": 0.7894589304924011, + "learning_rate": 6.392561550360247e-05, + "loss": 0.7735, + "step": 128450 + }, + { + "epoch": 0.8206943255433602, + "grad_norm": 1.4838154315948486, + "learning_rate": 6.392079628493407e-05, + "loss": 0.8314, + "step": 128460 + }, + { + "epoch": 0.8207582126930989, + "grad_norm": 0.8588756918907166, + "learning_rate": 6.391597692607121e-05, + "loss": 0.9544, + "step": 128470 + }, + { + "epoch": 0.8208220998428376, + "grad_norm": 1.500656247138977, + "learning_rate": 6.391115742706243e-05, + "loss": 0.7166, + "step": 128480 + }, + { + "epoch": 0.8208859869925763, + "grad_norm": 0.9322123527526855, + "learning_rate": 
6.390633778795626e-05, + "loss": 0.8543, + "step": 128490 + }, + { + "epoch": 0.820949874142315, + "grad_norm": 0.8133841753005981, + "learning_rate": 6.390151800880124e-05, + "loss": 0.935, + "step": 128500 + }, + { + "epoch": 0.8210137612920537, + "grad_norm": 1.0100511312484741, + "learning_rate": 6.38966980896459e-05, + "loss": 0.7153, + "step": 128510 + }, + { + "epoch": 0.8210776484417924, + "grad_norm": 1.2132251262664795, + "learning_rate": 6.38918780305388e-05, + "loss": 0.8784, + "step": 128520 + }, + { + "epoch": 0.8211415355915311, + "grad_norm": 0.8456812500953674, + "learning_rate": 6.388705783152846e-05, + "loss": 1.0612, + "step": 128530 + }, + { + "epoch": 0.8212054227412698, + "grad_norm": 0.9363921284675598, + "learning_rate": 6.388223749266344e-05, + "loss": 1.0582, + "step": 128540 + }, + { + "epoch": 0.8212693098910085, + "grad_norm": 2.2319607734680176, + "learning_rate": 6.387741701399228e-05, + "loss": 0.717, + "step": 128550 + }, + { + "epoch": 0.8213331970407473, + "grad_norm": 1.1460880041122437, + "learning_rate": 6.387259639556352e-05, + "loss": 0.8053, + "step": 128560 + }, + { + "epoch": 0.821397084190486, + "grad_norm": 0.7786633372306824, + "learning_rate": 6.386777563742571e-05, + "loss": 0.9295, + "step": 128570 + }, + { + "epoch": 0.8214609713402247, + "grad_norm": 0.8172239661216736, + "learning_rate": 6.38629547396274e-05, + "loss": 0.8758, + "step": 128580 + }, + { + "epoch": 0.8215248584899634, + "grad_norm": 0.6958255171775818, + "learning_rate": 6.385813370221716e-05, + "loss": 0.8538, + "step": 128590 + }, + { + "epoch": 0.8215887456397021, + "grad_norm": 0.9050196409225464, + "learning_rate": 6.38533125252435e-05, + "loss": 0.8245, + "step": 128600 + }, + { + "epoch": 0.8216526327894408, + "grad_norm": 0.9731044769287109, + "learning_rate": 6.384849120875502e-05, + "loss": 0.7545, + "step": 128610 + }, + { + "epoch": 0.8217165199391794, + "grad_norm": 1.5019901990890503, + "learning_rate": 6.384366975280024e-05, + 
"loss": 0.8528, + "step": 128620 + }, + { + "epoch": 0.8217804070889181, + "grad_norm": 0.9366435408592224, + "learning_rate": 6.383884815742772e-05, + "loss": 0.8599, + "step": 128630 + }, + { + "epoch": 0.8218442942386568, + "grad_norm": 0.6195958852767944, + "learning_rate": 6.383402642268603e-05, + "loss": 0.8433, + "step": 128640 + }, + { + "epoch": 0.8219081813883955, + "grad_norm": 0.6187912225723267, + "learning_rate": 6.382920454862374e-05, + "loss": 0.9079, + "step": 128650 + }, + { + "epoch": 0.8219720685381342, + "grad_norm": 1.1059194803237915, + "learning_rate": 6.382438253528939e-05, + "loss": 0.7321, + "step": 128660 + }, + { + "epoch": 0.8220359556878729, + "grad_norm": 0.8085727095603943, + "learning_rate": 6.381956038273156e-05, + "loss": 0.9373, + "step": 128670 + }, + { + "epoch": 0.8220998428376116, + "grad_norm": 2.0732922554016113, + "learning_rate": 6.381473809099878e-05, + "loss": 0.8538, + "step": 128680 + }, + { + "epoch": 0.8221637299873503, + "grad_norm": 0.6032936573028564, + "learning_rate": 6.380991566013966e-05, + "loss": 0.7414, + "step": 128690 + }, + { + "epoch": 0.822227617137089, + "grad_norm": 1.1853028535842896, + "learning_rate": 6.380509309020272e-05, + "loss": 0.8288, + "step": 128700 + }, + { + "epoch": 0.8222915042868277, + "grad_norm": 0.6363354325294495, + "learning_rate": 6.380027038123654e-05, + "loss": 0.8664, + "step": 128710 + }, + { + "epoch": 0.8223553914365664, + "grad_norm": 1.0098567008972168, + "learning_rate": 6.379544753328973e-05, + "loss": 1.0163, + "step": 128720 + }, + { + "epoch": 0.8224192785863051, + "grad_norm": 2.520233631134033, + "learning_rate": 6.379062454641081e-05, + "loss": 0.7209, + "step": 128730 + }, + { + "epoch": 0.8224831657360439, + "grad_norm": 1.0344457626342773, + "learning_rate": 6.378580142064838e-05, + "loss": 0.8217, + "step": 128740 + }, + { + "epoch": 0.8225470528857826, + "grad_norm": 0.8147494792938232, + "learning_rate": 6.378097815605099e-05, + "loss": 0.7642, + "step": 
128750 + }, + { + "epoch": 0.8226109400355213, + "grad_norm": 1.220113754272461, + "learning_rate": 6.377615475266724e-05, + "loss": 0.9745, + "step": 128760 + }, + { + "epoch": 0.82267482718526, + "grad_norm": 0.7933838367462158, + "learning_rate": 6.377133121054571e-05, + "loss": 0.6175, + "step": 128770 + }, + { + "epoch": 0.8227387143349987, + "grad_norm": 1.4493178129196167, + "learning_rate": 6.376650752973493e-05, + "loss": 0.8895, + "step": 128780 + }, + { + "epoch": 0.8228026014847374, + "grad_norm": 0.9888495802879333, + "learning_rate": 6.376168371028351e-05, + "loss": 0.9483, + "step": 128790 + }, + { + "epoch": 0.8228664886344761, + "grad_norm": 1.066873550415039, + "learning_rate": 6.375685975224004e-05, + "loss": 0.9853, + "step": 128800 + }, + { + "epoch": 0.8229303757842148, + "grad_norm": 0.903079092502594, + "learning_rate": 6.375203565565308e-05, + "loss": 0.9505, + "step": 128810 + }, + { + "epoch": 0.8229942629339535, + "grad_norm": 1.5976263284683228, + "learning_rate": 6.374721142057125e-05, + "loss": 0.9695, + "step": 128820 + }, + { + "epoch": 0.8230581500836922, + "grad_norm": 0.6900495886802673, + "learning_rate": 6.374238704704308e-05, + "loss": 1.0342, + "step": 128830 + }, + { + "epoch": 0.8231220372334309, + "grad_norm": 0.9816431403160095, + "learning_rate": 6.37375625351172e-05, + "loss": 0.9036, + "step": 128840 + }, + { + "epoch": 0.8231859243831696, + "grad_norm": 1.096856713294983, + "learning_rate": 6.373273788484217e-05, + "loss": 0.7925, + "step": 128850 + }, + { + "epoch": 0.8232498115329082, + "grad_norm": 0.6943714022636414, + "learning_rate": 6.37279130962666e-05, + "loss": 0.836, + "step": 128860 + }, + { + "epoch": 0.8233136986826469, + "grad_norm": 1.0958727598190308, + "learning_rate": 6.372308816943908e-05, + "loss": 0.8033, + "step": 128870 + }, + { + "epoch": 0.8233775858323856, + "grad_norm": 0.9813776016235352, + "learning_rate": 6.371826310440816e-05, + "loss": 1.1422, + "step": 128880 + }, + { + "epoch": 
0.8234414729821243, + "grad_norm": 1.0429863929748535, + "learning_rate": 6.371343790122249e-05, + "loss": 0.9248, + "step": 128890 + }, + { + "epoch": 0.823505360131863, + "grad_norm": 1.0575244426727295, + "learning_rate": 6.370861255993062e-05, + "loss": 1.0187, + "step": 128900 + }, + { + "epoch": 0.8235692472816017, + "grad_norm": 1.6961395740509033, + "learning_rate": 6.370378708058115e-05, + "loss": 0.7942, + "step": 128910 + }, + { + "epoch": 0.8236331344313405, + "grad_norm": 1.0016497373580933, + "learning_rate": 6.36989614632227e-05, + "loss": 0.8602, + "step": 128920 + }, + { + "epoch": 0.8236970215810792, + "grad_norm": 1.1897591352462769, + "learning_rate": 6.369413570790386e-05, + "loss": 0.8869, + "step": 128930 + }, + { + "epoch": 0.8237609087308179, + "grad_norm": 1.0436471700668335, + "learning_rate": 6.368930981467323e-05, + "loss": 0.9429, + "step": 128940 + }, + { + "epoch": 0.8238247958805566, + "grad_norm": 0.9144713878631592, + "learning_rate": 6.368448378357941e-05, + "loss": 0.7866, + "step": 128950 + }, + { + "epoch": 0.8238886830302953, + "grad_norm": 1.058947205543518, + "learning_rate": 6.367965761467098e-05, + "loss": 0.8541, + "step": 128960 + }, + { + "epoch": 0.823952570180034, + "grad_norm": 0.9938645958900452, + "learning_rate": 6.367483130799659e-05, + "loss": 0.9613, + "step": 128970 + }, + { + "epoch": 0.8240164573297727, + "grad_norm": 0.8759385943412781, + "learning_rate": 6.36700048636048e-05, + "loss": 1.0366, + "step": 128980 + }, + { + "epoch": 0.8240803444795114, + "grad_norm": 0.4528246223926544, + "learning_rate": 6.366517828154424e-05, + "loss": 1.0067, + "step": 128990 + }, + { + "epoch": 0.8241442316292501, + "grad_norm": 0.7166324257850647, + "learning_rate": 6.36603515618635e-05, + "loss": 0.8826, + "step": 129000 + }, + { + "epoch": 0.8242081187789888, + "grad_norm": 0.7682775259017944, + "learning_rate": 6.365552470461122e-05, + "loss": 0.8674, + "step": 129010 + }, + { + "epoch": 0.8242720059287275, + 
"grad_norm": 1.0459059476852417, + "learning_rate": 6.3650697709836e-05, + "loss": 0.7448, + "step": 129020 + }, + { + "epoch": 0.8243358930784662, + "grad_norm": 0.9293308854103088, + "learning_rate": 6.364587057758642e-05, + "loss": 1.0024, + "step": 129030 + }, + { + "epoch": 0.8243997802282049, + "grad_norm": 0.9621434807777405, + "learning_rate": 6.364104330791113e-05, + "loss": 0.853, + "step": 129040 + }, + { + "epoch": 0.8244636673779436, + "grad_norm": 0.5361142158508301, + "learning_rate": 6.363621590085873e-05, + "loss": 0.904, + "step": 129050 + }, + { + "epoch": 0.8245275545276823, + "grad_norm": 1.060625672340393, + "learning_rate": 6.363138835647784e-05, + "loss": 0.9679, + "step": 129060 + }, + { + "epoch": 0.824591441677421, + "grad_norm": 0.9136219024658203, + "learning_rate": 6.362656067481708e-05, + "loss": 1.0068, + "step": 129070 + }, + { + "epoch": 0.8246553288271598, + "grad_norm": 0.9109070301055908, + "learning_rate": 6.362173285592507e-05, + "loss": 0.7127, + "step": 129080 + }, + { + "epoch": 0.8247192159768985, + "grad_norm": 0.6838903427124023, + "learning_rate": 6.361690489985041e-05, + "loss": 0.806, + "step": 129090 + }, + { + "epoch": 0.8247831031266372, + "grad_norm": 0.5887237787246704, + "learning_rate": 6.361207680664174e-05, + "loss": 0.6983, + "step": 129100 + }, + { + "epoch": 0.8248469902763758, + "grad_norm": 1.1329760551452637, + "learning_rate": 6.36072485763477e-05, + "loss": 1.0387, + "step": 129110 + }, + { + "epoch": 0.8249108774261145, + "grad_norm": 1.2478314638137817, + "learning_rate": 6.360242020901688e-05, + "loss": 0.8467, + "step": 129120 + }, + { + "epoch": 0.8249747645758532, + "grad_norm": 1.1845048666000366, + "learning_rate": 6.359759170469791e-05, + "loss": 0.9406, + "step": 129130 + }, + { + "epoch": 0.8250386517255919, + "grad_norm": 0.9721893668174744, + "learning_rate": 6.359276306343944e-05, + "loss": 0.9239, + "step": 129140 + }, + { + "epoch": 0.8251025388753306, + "grad_norm": 
0.7293797135353088, + "learning_rate": 6.358793428529008e-05, + "loss": 1.0397, + "step": 129150 + }, + { + "epoch": 0.8251664260250693, + "grad_norm": 1.1368262767791748, + "learning_rate": 6.358310537029847e-05, + "loss": 0.7218, + "step": 129160 + }, + { + "epoch": 0.825230313174808, + "grad_norm": 1.327905535697937, + "learning_rate": 6.357827631851324e-05, + "loss": 0.716, + "step": 129170 + }, + { + "epoch": 0.8252942003245467, + "grad_norm": 1.3848083019256592, + "learning_rate": 6.357344712998302e-05, + "loss": 0.8165, + "step": 129180 + }, + { + "epoch": 0.8253580874742854, + "grad_norm": 0.8067914843559265, + "learning_rate": 6.356861780475645e-05, + "loss": 0.8678, + "step": 129190 + }, + { + "epoch": 0.8254219746240241, + "grad_norm": 0.8428422212600708, + "learning_rate": 6.356378834288216e-05, + "loss": 0.9308, + "step": 129200 + }, + { + "epoch": 0.8254858617737628, + "grad_norm": 0.8563640713691711, + "learning_rate": 6.355895874440878e-05, + "loss": 1.295, + "step": 129210 + }, + { + "epoch": 0.8255497489235015, + "grad_norm": 0.9378432035446167, + "learning_rate": 6.355412900938496e-05, + "loss": 1.1254, + "step": 129220 + }, + { + "epoch": 0.8256136360732402, + "grad_norm": 1.2021596431732178, + "learning_rate": 6.354929913785932e-05, + "loss": 1.1068, + "step": 129230 + }, + { + "epoch": 0.8256775232229789, + "grad_norm": 0.6594062447547913, + "learning_rate": 6.354446912988053e-05, + "loss": 0.7613, + "step": 129240 + }, + { + "epoch": 0.8257414103727176, + "grad_norm": 0.6651197671890259, + "learning_rate": 6.353963898549723e-05, + "loss": 0.684, + "step": 129250 + }, + { + "epoch": 0.8258052975224563, + "grad_norm": 0.7577025890350342, + "learning_rate": 6.353480870475805e-05, + "loss": 0.639, + "step": 129260 + }, + { + "epoch": 0.8258691846721951, + "grad_norm": 1.147277593612671, + "learning_rate": 6.352997828771162e-05, + "loss": 1.1541, + "step": 129270 + }, + { + "epoch": 0.8259330718219338, + "grad_norm": 0.7437235116958618, + 
"learning_rate": 6.35251477344066e-05, + "loss": 0.9665, + "step": 129280 + }, + { + "epoch": 0.8259969589716725, + "grad_norm": 1.1867269277572632, + "learning_rate": 6.352031704489166e-05, + "loss": 1.0024, + "step": 129290 + }, + { + "epoch": 0.8260608461214112, + "grad_norm": 0.9756374359130859, + "learning_rate": 6.351548621921542e-05, + "loss": 0.9174, + "step": 129300 + }, + { + "epoch": 0.8261247332711499, + "grad_norm": 0.6619752049446106, + "learning_rate": 6.351065525742655e-05, + "loss": 1.0489, + "step": 129310 + }, + { + "epoch": 0.8261886204208886, + "grad_norm": 1.198135256767273, + "learning_rate": 6.350582415957367e-05, + "loss": 0.8207, + "step": 129320 + }, + { + "epoch": 0.8262525075706273, + "grad_norm": 1.183821678161621, + "learning_rate": 6.350099292570547e-05, + "loss": 1.1444, + "step": 129330 + }, + { + "epoch": 0.826316394720366, + "grad_norm": 0.8684186935424805, + "learning_rate": 6.349616155587059e-05, + "loss": 0.7594, + "step": 129340 + }, + { + "epoch": 0.8263802818701046, + "grad_norm": 0.7701572775840759, + "learning_rate": 6.34913300501177e-05, + "loss": 0.7771, + "step": 129350 + }, + { + "epoch": 0.8264441690198433, + "grad_norm": 0.8787345290184021, + "learning_rate": 6.348649840849543e-05, + "loss": 0.9283, + "step": 129360 + }, + { + "epoch": 0.826508056169582, + "grad_norm": 0.7705711126327515, + "learning_rate": 6.348166663105247e-05, + "loss": 0.908, + "step": 129370 + }, + { + "epoch": 0.8265719433193207, + "grad_norm": 0.7562994956970215, + "learning_rate": 6.347683471783744e-05, + "loss": 0.8175, + "step": 129380 + }, + { + "epoch": 0.8266358304690594, + "grad_norm": 0.6645038723945618, + "learning_rate": 6.347200266889904e-05, + "loss": 1.1337, + "step": 129390 + }, + { + "epoch": 0.8266997176187981, + "grad_norm": 0.8476589322090149, + "learning_rate": 6.346717048428592e-05, + "loss": 0.8446, + "step": 129400 + }, + { + "epoch": 0.8267636047685368, + "grad_norm": 0.7244347333908081, + "learning_rate": 
6.346233816404674e-05, + "loss": 0.8702, + "step": 129410 + }, + { + "epoch": 0.8268274919182755, + "grad_norm": 1.6287295818328857, + "learning_rate": 6.345750570823017e-05, + "loss": 0.7862, + "step": 129420 + }, + { + "epoch": 0.8268913790680142, + "grad_norm": 0.9641634821891785, + "learning_rate": 6.345267311688486e-05, + "loss": 0.8193, + "step": 129430 + }, + { + "epoch": 0.826955266217753, + "grad_norm": 0.7857980728149414, + "learning_rate": 6.344784039005951e-05, + "loss": 1.0366, + "step": 129440 + }, + { + "epoch": 0.8270191533674917, + "grad_norm": 1.0319744348526, + "learning_rate": 6.344300752780277e-05, + "loss": 0.72, + "step": 129450 + }, + { + "epoch": 0.8270830405172304, + "grad_norm": 1.3282662630081177, + "learning_rate": 6.343817453016332e-05, + "loss": 0.7655, + "step": 129460 + }, + { + "epoch": 0.8271469276669691, + "grad_norm": 0.7755960822105408, + "learning_rate": 6.343334139718982e-05, + "loss": 0.7769, + "step": 129470 + }, + { + "epoch": 0.8272108148167078, + "grad_norm": 1.4779821634292603, + "learning_rate": 6.342850812893094e-05, + "loss": 0.7363, + "step": 129480 + }, + { + "epoch": 0.8272747019664465, + "grad_norm": 0.5803090929985046, + "learning_rate": 6.342367472543537e-05, + "loss": 0.7171, + "step": 129490 + }, + { + "epoch": 0.8273385891161852, + "grad_norm": 0.5924243330955505, + "learning_rate": 6.34188411867518e-05, + "loss": 0.8442, + "step": 129500 + }, + { + "epoch": 0.8274024762659239, + "grad_norm": 1.4736064672470093, + "learning_rate": 6.341400751292888e-05, + "loss": 0.8249, + "step": 129510 + }, + { + "epoch": 0.8274663634156626, + "grad_norm": 0.9478740692138672, + "learning_rate": 6.34091737040153e-05, + "loss": 0.9925, + "step": 129520 + }, + { + "epoch": 0.8275302505654013, + "grad_norm": 1.2648018598556519, + "learning_rate": 6.340433976005975e-05, + "loss": 0.8581, + "step": 129530 + }, + { + "epoch": 0.82759413771514, + "grad_norm": 0.4751732349395752, + "learning_rate": 6.339950568111088e-05, + "loss": 
0.8622, + "step": 129540 + }, + { + "epoch": 0.8276580248648787, + "grad_norm": 0.8093522191047668, + "learning_rate": 6.339467146721741e-05, + "loss": 0.9958, + "step": 129550 + }, + { + "epoch": 0.8277219120146174, + "grad_norm": 0.8439111709594727, + "learning_rate": 6.3389837118428e-05, + "loss": 0.9307, + "step": 129560 + }, + { + "epoch": 0.8277857991643561, + "grad_norm": 0.615764319896698, + "learning_rate": 6.338500263479136e-05, + "loss": 0.9045, + "step": 129570 + }, + { + "epoch": 0.8278496863140948, + "grad_norm": 0.8498043417930603, + "learning_rate": 6.338016801635615e-05, + "loss": 0.9691, + "step": 129580 + }, + { + "epoch": 0.8279135734638334, + "grad_norm": 1.2179980278015137, + "learning_rate": 6.337533326317108e-05, + "loss": 0.6736, + "step": 129590 + }, + { + "epoch": 0.8279774606135721, + "grad_norm": 0.5712013244628906, + "learning_rate": 6.337049837528483e-05, + "loss": 0.8732, + "step": 129600 + }, + { + "epoch": 0.8280413477633108, + "grad_norm": 1.027066946029663, + "learning_rate": 6.336566335274609e-05, + "loss": 0.8487, + "step": 129610 + }, + { + "epoch": 0.8281052349130495, + "grad_norm": 0.8761221170425415, + "learning_rate": 6.336082819560357e-05, + "loss": 0.8251, + "step": 129620 + }, + { + "epoch": 0.8281691220627883, + "grad_norm": 0.9286890625953674, + "learning_rate": 6.335599290390595e-05, + "loss": 0.8774, + "step": 129630 + }, + { + "epoch": 0.828233009212527, + "grad_norm": 1.0241472721099854, + "learning_rate": 6.335115747770192e-05, + "loss": 1.0086, + "step": 129640 + }, + { + "epoch": 0.8282968963622657, + "grad_norm": 2.582179069519043, + "learning_rate": 6.334632191704018e-05, + "loss": 1.0137, + "step": 129650 + }, + { + "epoch": 0.8283607835120044, + "grad_norm": 0.8017721176147461, + "learning_rate": 6.334148622196945e-05, + "loss": 0.8961, + "step": 129660 + }, + { + "epoch": 0.8284246706617431, + "grad_norm": 1.0686819553375244, + "learning_rate": 6.33366503925384e-05, + "loss": 1.0203, + "step": 129670 + }, 
+ { + "epoch": 0.8284885578114818, + "grad_norm": 0.6839226484298706, + "learning_rate": 6.333181442879573e-05, + "loss": 1.018, + "step": 129680 + }, + { + "epoch": 0.8285524449612205, + "grad_norm": 1.2881569862365723, + "learning_rate": 6.332697833079017e-05, + "loss": 0.7736, + "step": 129690 + }, + { + "epoch": 0.8286163321109592, + "grad_norm": 1.0879745483398438, + "learning_rate": 6.33221420985704e-05, + "loss": 0.8662, + "step": 129700 + }, + { + "epoch": 0.8286802192606979, + "grad_norm": 0.7041977643966675, + "learning_rate": 6.331730573218514e-05, + "loss": 0.6425, + "step": 129710 + }, + { + "epoch": 0.8287441064104366, + "grad_norm": 1.2785156965255737, + "learning_rate": 6.33124692316831e-05, + "loss": 1.1758, + "step": 129720 + }, + { + "epoch": 0.8288079935601753, + "grad_norm": 1.3215516805648804, + "learning_rate": 6.330763259711295e-05, + "loss": 0.901, + "step": 129730 + }, + { + "epoch": 0.828871880709914, + "grad_norm": 0.8521394729614258, + "learning_rate": 6.330279582852347e-05, + "loss": 0.9565, + "step": 129740 + }, + { + "epoch": 0.8289357678596527, + "grad_norm": 0.8731998205184937, + "learning_rate": 6.32979589259633e-05, + "loss": 0.9697, + "step": 129750 + }, + { + "epoch": 0.8289996550093914, + "grad_norm": 0.8813868761062622, + "learning_rate": 6.329312188948118e-05, + "loss": 1.3262, + "step": 129760 + }, + { + "epoch": 0.8290635421591301, + "grad_norm": 0.9669440984725952, + "learning_rate": 6.328828471912582e-05, + "loss": 1.0292, + "step": 129770 + }, + { + "epoch": 0.8291274293088688, + "grad_norm": 1.7891266345977783, + "learning_rate": 6.328344741494594e-05, + "loss": 1.0843, + "step": 129780 + }, + { + "epoch": 0.8291913164586076, + "grad_norm": 2.168109655380249, + "learning_rate": 6.327860997699025e-05, + "loss": 0.9686, + "step": 129790 + }, + { + "epoch": 0.8292552036083463, + "grad_norm": 1.232346773147583, + "learning_rate": 6.327377240530747e-05, + "loss": 0.8645, + "step": 129800 + }, + { + "epoch": 
0.829319090758085, + "grad_norm": 0.9927831292152405, + "learning_rate": 6.326893469994633e-05, + "loss": 1.1181, + "step": 129810 + }, + { + "epoch": 0.8293829779078237, + "grad_norm": 0.6466804146766663, + "learning_rate": 6.326409686095553e-05, + "loss": 0.9161, + "step": 129820 + }, + { + "epoch": 0.8294468650575624, + "grad_norm": 1.5198026895523071, + "learning_rate": 6.325925888838379e-05, + "loss": 1.1528, + "step": 129830 + }, + { + "epoch": 0.829510752207301, + "grad_norm": 0.6202889084815979, + "learning_rate": 6.325442078227986e-05, + "loss": 0.9987, + "step": 129840 + }, + { + "epoch": 0.8295746393570397, + "grad_norm": 0.7182409167289734, + "learning_rate": 6.324958254269243e-05, + "loss": 0.9878, + "step": 129850 + }, + { + "epoch": 0.8296385265067784, + "grad_norm": 0.626463770866394, + "learning_rate": 6.324474416967024e-05, + "loss": 1.0748, + "step": 129860 + }, + { + "epoch": 0.8297024136565171, + "grad_norm": 1.071711540222168, + "learning_rate": 6.323990566326203e-05, + "loss": 1.0839, + "step": 129870 + }, + { + "epoch": 0.8297663008062558, + "grad_norm": 0.5460740327835083, + "learning_rate": 6.323506702351651e-05, + "loss": 1.0518, + "step": 129880 + }, + { + "epoch": 0.8298301879559945, + "grad_norm": 0.813046395778656, + "learning_rate": 6.323022825048243e-05, + "loss": 1.0735, + "step": 129890 + }, + { + "epoch": 0.8298940751057332, + "grad_norm": 1.204576849937439, + "learning_rate": 6.322538934420849e-05, + "loss": 0.9745, + "step": 129900 + }, + { + "epoch": 0.8299579622554719, + "grad_norm": 0.6870941519737244, + "learning_rate": 6.322055030474345e-05, + "loss": 0.7515, + "step": 129910 + }, + { + "epoch": 0.8300218494052106, + "grad_norm": 0.5987921357154846, + "learning_rate": 6.321571113213602e-05, + "loss": 0.8308, + "step": 129920 + }, + { + "epoch": 0.8300857365549493, + "grad_norm": 0.8666192889213562, + "learning_rate": 6.321087182643495e-05, + "loss": 1.0491, + "step": 129930 + }, + { + "epoch": 0.830149623704688, + 
"grad_norm": 0.699948251247406, + "learning_rate": 6.320603238768896e-05, + "loss": 0.8764, + "step": 129940 + }, + { + "epoch": 0.8302135108544267, + "grad_norm": 0.7432700991630554, + "learning_rate": 6.320119281594681e-05, + "loss": 0.7359, + "step": 129950 + }, + { + "epoch": 0.8302773980041654, + "grad_norm": 0.7470651268959045, + "learning_rate": 6.319635311125722e-05, + "loss": 0.9569, + "step": 129960 + }, + { + "epoch": 0.8303412851539042, + "grad_norm": 1.7274110317230225, + "learning_rate": 6.319151327366894e-05, + "loss": 0.6801, + "step": 129970 + }, + { + "epoch": 0.8304051723036429, + "grad_norm": 1.1418910026550293, + "learning_rate": 6.318667330323074e-05, + "loss": 0.9272, + "step": 129980 + }, + { + "epoch": 0.8304690594533816, + "grad_norm": 0.478630930185318, + "learning_rate": 6.31818331999913e-05, + "loss": 0.5905, + "step": 129990 + }, + { + "epoch": 0.8305329466031203, + "grad_norm": 1.0310248136520386, + "learning_rate": 6.317699296399939e-05, + "loss": 1.3088, + "step": 130000 + }, + { + "epoch": 0.830596833752859, + "grad_norm": 1.2773408889770508, + "learning_rate": 6.317215259530377e-05, + "loss": 0.7674, + "step": 130010 + }, + { + "epoch": 0.8306607209025977, + "grad_norm": 0.8546721339225769, + "learning_rate": 6.316731209395318e-05, + "loss": 0.8664, + "step": 130020 + }, + { + "epoch": 0.8307246080523364, + "grad_norm": 1.4127484560012817, + "learning_rate": 6.316247145999636e-05, + "loss": 0.7683, + "step": 130030 + }, + { + "epoch": 0.8307884952020751, + "grad_norm": 1.9064104557037354, + "learning_rate": 6.315763069348208e-05, + "loss": 0.865, + "step": 130040 + }, + { + "epoch": 0.8308523823518138, + "grad_norm": 1.1948989629745483, + "learning_rate": 6.315278979445906e-05, + "loss": 0.9118, + "step": 130050 + }, + { + "epoch": 0.8309162695015525, + "grad_norm": 1.745757818222046, + "learning_rate": 6.314794876297607e-05, + "loss": 0.8204, + "step": 130060 + }, + { + "epoch": 0.8309801566512912, + "grad_norm": 
0.840900719165802, + "learning_rate": 6.314310759908187e-05, + "loss": 1.0959, + "step": 130070 + }, + { + "epoch": 0.8310440438010298, + "grad_norm": 1.8418869972229004, + "learning_rate": 6.313826630282521e-05, + "loss": 1.1114, + "step": 130080 + }, + { + "epoch": 0.8311079309507685, + "grad_norm": 1.024966835975647, + "learning_rate": 6.313342487425483e-05, + "loss": 0.7167, + "step": 130090 + }, + { + "epoch": 0.8311718181005072, + "grad_norm": 0.7477736473083496, + "learning_rate": 6.312858331341951e-05, + "loss": 0.971, + "step": 130100 + }, + { + "epoch": 0.8312357052502459, + "grad_norm": 0.9230465888977051, + "learning_rate": 6.312374162036798e-05, + "loss": 1.0544, + "step": 130110 + }, + { + "epoch": 0.8312995923999846, + "grad_norm": 0.986064612865448, + "learning_rate": 6.311889979514904e-05, + "loss": 1.332, + "step": 130120 + }, + { + "epoch": 0.8313634795497233, + "grad_norm": 0.894123375415802, + "learning_rate": 6.311405783781141e-05, + "loss": 1.1246, + "step": 130130 + }, + { + "epoch": 0.831427366699462, + "grad_norm": 0.7910020351409912, + "learning_rate": 6.310921574840389e-05, + "loss": 0.7583, + "step": 130140 + }, + { + "epoch": 0.8314912538492008, + "grad_norm": 1.064899206161499, + "learning_rate": 6.310437352697522e-05, + "loss": 0.7065, + "step": 130150 + }, + { + "epoch": 0.8315551409989395, + "grad_norm": 0.8578174710273743, + "learning_rate": 6.309953117357416e-05, + "loss": 0.7956, + "step": 130160 + }, + { + "epoch": 0.8316190281486782, + "grad_norm": 0.7391040325164795, + "learning_rate": 6.30946886882495e-05, + "loss": 0.801, + "step": 130170 + }, + { + "epoch": 0.8316829152984169, + "grad_norm": 0.6705387830734253, + "learning_rate": 6.308984607104999e-05, + "loss": 0.7235, + "step": 130180 + }, + { + "epoch": 0.8317468024481556, + "grad_norm": 0.5788788199424744, + "learning_rate": 6.308500332202443e-05, + "loss": 0.7199, + "step": 130190 + }, + { + "epoch": 0.8318106895978943, + "grad_norm": 0.9009729027748108, + 
"learning_rate": 6.308016044122153e-05, + "loss": 0.7121, + "step": 130200 + }, + { + "epoch": 0.831874576747633, + "grad_norm": 0.7175112962722778, + "learning_rate": 6.307531742869012e-05, + "loss": 0.7884, + "step": 130210 + }, + { + "epoch": 0.8319384638973717, + "grad_norm": 0.9665817022323608, + "learning_rate": 6.307047428447894e-05, + "loss": 0.9997, + "step": 130220 + }, + { + "epoch": 0.8320023510471104, + "grad_norm": 1.334688663482666, + "learning_rate": 6.306563100863679e-05, + "loss": 0.8198, + "step": 130230 + }, + { + "epoch": 0.8320662381968491, + "grad_norm": 0.5525422096252441, + "learning_rate": 6.306078760121243e-05, + "loss": 0.8491, + "step": 130240 + }, + { + "epoch": 0.8321301253465878, + "grad_norm": 0.7009544968605042, + "learning_rate": 6.305594406225464e-05, + "loss": 0.7649, + "step": 130250 + }, + { + "epoch": 0.8321940124963265, + "grad_norm": 1.0132756233215332, + "learning_rate": 6.305110039181219e-05, + "loss": 0.8507, + "step": 130260 + }, + { + "epoch": 0.8322578996460652, + "grad_norm": 1.0832351446151733, + "learning_rate": 6.304625658993388e-05, + "loss": 0.9625, + "step": 130270 + }, + { + "epoch": 0.8323217867958039, + "grad_norm": 1.351954698562622, + "learning_rate": 6.304141265666846e-05, + "loss": 1.1904, + "step": 130280 + }, + { + "epoch": 0.8323856739455426, + "grad_norm": 0.9040460586547852, + "learning_rate": 6.303656859206475e-05, + "loss": 1.0508, + "step": 130290 + }, + { + "epoch": 0.8324495610952813, + "grad_norm": 2.278919219970703, + "learning_rate": 6.30317243961715e-05, + "loss": 0.8939, + "step": 130300 + }, + { + "epoch": 0.83251344824502, + "grad_norm": 1.002038836479187, + "learning_rate": 6.302688006903753e-05, + "loss": 0.9378, + "step": 130310 + }, + { + "epoch": 0.8325773353947586, + "grad_norm": 1.055732250213623, + "learning_rate": 6.30220356107116e-05, + "loss": 1.185, + "step": 130320 + }, + { + "epoch": 0.8326412225444974, + "grad_norm": 0.8173093199729919, + "learning_rate": 
6.301719102124251e-05, + "loss": 0.8708, + "step": 130330 + }, + { + "epoch": 0.8327051096942361, + "grad_norm": 0.8317503333091736, + "learning_rate": 6.301234630067902e-05, + "loss": 0.8327, + "step": 130340 + }, + { + "epoch": 0.8327689968439748, + "grad_norm": 0.7437106370925903, + "learning_rate": 6.300750144906997e-05, + "loss": 0.8685, + "step": 130350 + }, + { + "epoch": 0.8328328839937135, + "grad_norm": 0.8775268793106079, + "learning_rate": 6.300265646646413e-05, + "loss": 0.7828, + "step": 130360 + }, + { + "epoch": 0.8328967711434522, + "grad_norm": 0.8213568329811096, + "learning_rate": 6.299781135291028e-05, + "loss": 0.8267, + "step": 130370 + }, + { + "epoch": 0.8329606582931909, + "grad_norm": 0.6800155639648438, + "learning_rate": 6.299296610845721e-05, + "loss": 0.752, + "step": 130380 + }, + { + "epoch": 0.8330245454429296, + "grad_norm": 0.9931345582008362, + "learning_rate": 6.298812073315375e-05, + "loss": 0.7927, + "step": 130390 + }, + { + "epoch": 0.8330884325926683, + "grad_norm": 0.6396980881690979, + "learning_rate": 6.298327522704869e-05, + "loss": 0.8008, + "step": 130400 + }, + { + "epoch": 0.833152319742407, + "grad_norm": 1.1802109479904175, + "learning_rate": 6.29784295901908e-05, + "loss": 0.8763, + "step": 130410 + }, + { + "epoch": 0.8332162068921457, + "grad_norm": 0.64454185962677, + "learning_rate": 6.29735838226289e-05, + "loss": 0.8682, + "step": 130420 + }, + { + "epoch": 0.8332800940418844, + "grad_norm": 0.9595576524734497, + "learning_rate": 6.296873792441179e-05, + "loss": 0.7091, + "step": 130430 + }, + { + "epoch": 0.8333439811916231, + "grad_norm": 0.9316465854644775, + "learning_rate": 6.296389189558825e-05, + "loss": 0.9267, + "step": 130440 + }, + { + "epoch": 0.8334078683413618, + "grad_norm": 0.6795920133590698, + "learning_rate": 6.295904573620712e-05, + "loss": 0.9662, + "step": 130450 + }, + { + "epoch": 0.8334717554911005, + "grad_norm": 0.8197570443153381, + "learning_rate": 6.29541994463172e-05, + 
"loss": 0.9202, + "step": 130460 + }, + { + "epoch": 0.8335356426408392, + "grad_norm": 1.0630313158035278, + "learning_rate": 6.294935302596727e-05, + "loss": 1.089, + "step": 130470 + }, + { + "epoch": 0.8335995297905779, + "grad_norm": 1.3294517993927002, + "learning_rate": 6.294450647520616e-05, + "loss": 0.9093, + "step": 130480 + }, + { + "epoch": 0.8336634169403166, + "grad_norm": 1.1934936046600342, + "learning_rate": 6.293965979408267e-05, + "loss": 0.7484, + "step": 130490 + }, + { + "epoch": 0.8337273040900554, + "grad_norm": 0.4786630868911743, + "learning_rate": 6.29348129826456e-05, + "loss": 0.8751, + "step": 130500 + }, + { + "epoch": 0.8337911912397941, + "grad_norm": 1.0088316202163696, + "learning_rate": 6.292996604094378e-05, + "loss": 0.9007, + "step": 130510 + }, + { + "epoch": 0.8338550783895328, + "grad_norm": 0.8918789029121399, + "learning_rate": 6.292511896902602e-05, + "loss": 0.625, + "step": 130520 + }, + { + "epoch": 0.8339189655392715, + "grad_norm": 0.9836956262588501, + "learning_rate": 6.292027176694112e-05, + "loss": 1.0436, + "step": 130530 + }, + { + "epoch": 0.8339828526890102, + "grad_norm": 0.6663161516189575, + "learning_rate": 6.29154244347379e-05, + "loss": 1.0225, + "step": 130540 + }, + { + "epoch": 0.8340467398387489, + "grad_norm": 0.9562898278236389, + "learning_rate": 6.29105769724652e-05, + "loss": 1.0247, + "step": 130550 + }, + { + "epoch": 0.8341106269884875, + "grad_norm": 0.8911415338516235, + "learning_rate": 6.29057293801718e-05, + "loss": 0.7822, + "step": 130560 + }, + { + "epoch": 0.8341745141382262, + "grad_norm": 0.6790681481361389, + "learning_rate": 6.290088165790658e-05, + "loss": 0.8417, + "step": 130570 + }, + { + "epoch": 0.8342384012879649, + "grad_norm": 0.6850691437721252, + "learning_rate": 6.289603380571828e-05, + "loss": 0.808, + "step": 130580 + }, + { + "epoch": 0.8343022884377036, + "grad_norm": 0.5608751773834229, + "learning_rate": 6.289118582365578e-05, + "loss": 0.9613, + "step": 
130590 + }, + { + "epoch": 0.8343661755874423, + "grad_norm": 1.013020634651184, + "learning_rate": 6.288633771176789e-05, + "loss": 1.102, + "step": 130600 + }, + { + "epoch": 0.834430062737181, + "grad_norm": 1.1625406742095947, + "learning_rate": 6.288148947010342e-05, + "loss": 0.7877, + "step": 130610 + }, + { + "epoch": 0.8344939498869197, + "grad_norm": 1.0570250749588013, + "learning_rate": 6.287664109871121e-05, + "loss": 0.9154, + "step": 130620 + }, + { + "epoch": 0.8345578370366584, + "grad_norm": 0.5707595348358154, + "learning_rate": 6.287179259764008e-05, + "loss": 0.7431, + "step": 130630 + }, + { + "epoch": 0.8346217241863971, + "grad_norm": 1.400903344154358, + "learning_rate": 6.286694396693888e-05, + "loss": 0.648, + "step": 130640 + }, + { + "epoch": 0.8346856113361358, + "grad_norm": 0.6360903382301331, + "learning_rate": 6.286209520665641e-05, + "loss": 0.9326, + "step": 130650 + }, + { + "epoch": 0.8347494984858745, + "grad_norm": 1.020822286605835, + "learning_rate": 6.285724631684153e-05, + "loss": 0.9831, + "step": 130660 + }, + { + "epoch": 0.8348133856356132, + "grad_norm": 0.9532088041305542, + "learning_rate": 6.285239729754304e-05, + "loss": 0.9734, + "step": 130670 + }, + { + "epoch": 0.834877272785352, + "grad_norm": 1.0500184297561646, + "learning_rate": 6.284754814880979e-05, + "loss": 1.1216, + "step": 130680 + }, + { + "epoch": 0.8349411599350907, + "grad_norm": 0.7444097399711609, + "learning_rate": 6.284269887069061e-05, + "loss": 0.6931, + "step": 130690 + }, + { + "epoch": 0.8350050470848294, + "grad_norm": 1.272265076637268, + "learning_rate": 6.283784946323435e-05, + "loss": 0.9984, + "step": 130700 + }, + { + "epoch": 0.8350689342345681, + "grad_norm": 3.133178949356079, + "learning_rate": 6.283299992648985e-05, + "loss": 1.0528, + "step": 130710 + }, + { + "epoch": 0.8351328213843068, + "grad_norm": 0.9269975423812866, + "learning_rate": 6.282815026050593e-05, + "loss": 0.7731, + "step": 130720 + }, + { + "epoch": 
0.8351967085340455, + "grad_norm": 1.0282938480377197, + "learning_rate": 6.282330046533144e-05, + "loss": 0.9324, + "step": 130730 + }, + { + "epoch": 0.8352605956837842, + "grad_norm": 0.8756586909294128, + "learning_rate": 6.281845054101522e-05, + "loss": 0.7738, + "step": 130740 + }, + { + "epoch": 0.8353244828335229, + "grad_norm": 0.8821702599525452, + "learning_rate": 6.28136004876061e-05, + "loss": 0.9636, + "step": 130750 + }, + { + "epoch": 0.8353883699832616, + "grad_norm": 0.8497743010520935, + "learning_rate": 6.280875030515295e-05, + "loss": 0.8596, + "step": 130760 + }, + { + "epoch": 0.8354522571330003, + "grad_norm": 0.6755285263061523, + "learning_rate": 6.28038999937046e-05, + "loss": 0.8027, + "step": 130770 + }, + { + "epoch": 0.835516144282739, + "grad_norm": 1.5908513069152832, + "learning_rate": 6.279904955330991e-05, + "loss": 1.274, + "step": 130780 + }, + { + "epoch": 0.8355800314324777, + "grad_norm": 0.8962165713310242, + "learning_rate": 6.279419898401772e-05, + "loss": 0.8612, + "step": 130790 + }, + { + "epoch": 0.8356439185822164, + "grad_norm": 1.3172128200531006, + "learning_rate": 6.278934828587686e-05, + "loss": 0.8991, + "step": 130800 + }, + { + "epoch": 0.835707805731955, + "grad_norm": 0.7222731113433838, + "learning_rate": 6.278449745893621e-05, + "loss": 1.0712, + "step": 130810 + }, + { + "epoch": 0.8357716928816937, + "grad_norm": 0.9182828664779663, + "learning_rate": 6.27796465032446e-05, + "loss": 1.2312, + "step": 130820 + }, + { + "epoch": 0.8358355800314324, + "grad_norm": 0.9249597191810608, + "learning_rate": 6.277479541885091e-05, + "loss": 0.9577, + "step": 130830 + }, + { + "epoch": 0.8358994671811711, + "grad_norm": 0.8450763821601868, + "learning_rate": 6.276994420580397e-05, + "loss": 0.8475, + "step": 130840 + }, + { + "epoch": 0.8359633543309098, + "grad_norm": 0.8186871409416199, + "learning_rate": 6.276509286415265e-05, + "loss": 0.677, + "step": 130850 + }, + { + "epoch": 0.8360272414806486, + 
"grad_norm": 0.7687765955924988, + "learning_rate": 6.276024139394578e-05, + "loss": 0.7434, + "step": 130860 + }, + { + "epoch": 0.8360911286303873, + "grad_norm": 0.8875823616981506, + "learning_rate": 6.275538979523227e-05, + "loss": 1.1177, + "step": 130870 + }, + { + "epoch": 0.836155015780126, + "grad_norm": 1.5310128927230835, + "learning_rate": 6.275053806806093e-05, + "loss": 0.8904, + "step": 130880 + }, + { + "epoch": 0.8362189029298647, + "grad_norm": 0.9724732041358948, + "learning_rate": 6.274568621248065e-05, + "loss": 0.901, + "step": 130890 + }, + { + "epoch": 0.8362827900796034, + "grad_norm": 0.7733339071273804, + "learning_rate": 6.274083422854026e-05, + "loss": 0.8989, + "step": 130900 + }, + { + "epoch": 0.8363466772293421, + "grad_norm": 0.7655275464057922, + "learning_rate": 6.273598211628867e-05, + "loss": 0.9544, + "step": 130910 + }, + { + "epoch": 0.8364105643790808, + "grad_norm": 0.7450165748596191, + "learning_rate": 6.273112987577472e-05, + "loss": 0.5685, + "step": 130920 + }, + { + "epoch": 0.8364744515288195, + "grad_norm": 0.9278712868690491, + "learning_rate": 6.272627750704727e-05, + "loss": 0.9439, + "step": 130930 + }, + { + "epoch": 0.8365383386785582, + "grad_norm": 0.9437885284423828, + "learning_rate": 6.272142501015521e-05, + "loss": 0.8225, + "step": 130940 + }, + { + "epoch": 0.8366022258282969, + "grad_norm": 2.503553628921509, + "learning_rate": 6.27165723851474e-05, + "loss": 1.0931, + "step": 130950 + }, + { + "epoch": 0.8366661129780356, + "grad_norm": 0.7560781836509705, + "learning_rate": 6.271171963207269e-05, + "loss": 0.8088, + "step": 130960 + }, + { + "epoch": 0.8367300001277743, + "grad_norm": 0.9613698124885559, + "learning_rate": 6.270686675097997e-05, + "loss": 0.8562, + "step": 130970 + }, + { + "epoch": 0.836793887277513, + "grad_norm": 1.2895517349243164, + "learning_rate": 6.27020137419181e-05, + "loss": 0.9257, + "step": 130980 + }, + { + "epoch": 0.8368577744272517, + "grad_norm": 
0.9719459414482117, + "learning_rate": 6.269716060493597e-05, + "loss": 0.9031, + "step": 130990 + }, + { + "epoch": 0.8369216615769904, + "grad_norm": 1.973030924797058, + "learning_rate": 6.269230734008245e-05, + "loss": 0.8468, + "step": 131000 + }, + { + "epoch": 0.8369855487267291, + "grad_norm": 0.9847891330718994, + "learning_rate": 6.26874539474064e-05, + "loss": 0.7736, + "step": 131010 + }, + { + "epoch": 0.8370494358764679, + "grad_norm": 0.9893288016319275, + "learning_rate": 6.268260042695672e-05, + "loss": 0.7408, + "step": 131020 + }, + { + "epoch": 0.8371133230262066, + "grad_norm": 0.742874026298523, + "learning_rate": 6.267823214934596e-05, + "loss": 1.013, + "step": 131030 + }, + { + "epoch": 0.8371772101759453, + "grad_norm": 1.0840404033660889, + "learning_rate": 6.267337838626103e-05, + "loss": 1.0696, + "step": 131040 + }, + { + "epoch": 0.8372410973256839, + "grad_norm": 1.0095386505126953, + "learning_rate": 6.266852449554422e-05, + "loss": 0.9302, + "step": 131050 + }, + { + "epoch": 0.8373049844754226, + "grad_norm": 1.0238926410675049, + "learning_rate": 6.266367047724442e-05, + "loss": 0.6575, + "step": 131060 + }, + { + "epoch": 0.8373688716251613, + "grad_norm": 1.125106930732727, + "learning_rate": 6.265881633141049e-05, + "loss": 0.891, + "step": 131070 + }, + { + "epoch": 0.8374327587749, + "grad_norm": 0.716673731803894, + "learning_rate": 6.265396205809132e-05, + "loss": 0.7425, + "step": 131080 + }, + { + "epoch": 0.8374966459246387, + "grad_norm": 1.2633848190307617, + "learning_rate": 6.264910765733582e-05, + "loss": 0.9292, + "step": 131090 + }, + { + "epoch": 0.8375605330743774, + "grad_norm": 1.3997453451156616, + "learning_rate": 6.264425312919288e-05, + "loss": 0.6964, + "step": 131100 + }, + { + "epoch": 0.8376244202241161, + "grad_norm": 1.081032156944275, + "learning_rate": 6.263939847371134e-05, + "loss": 0.885, + "step": 131110 + }, + { + "epoch": 0.8376883073738548, + "grad_norm": 0.6516674757003784, + 
"learning_rate": 6.263454369094014e-05, + "loss": 0.7648, + "step": 131120 + }, + { + "epoch": 0.8377521945235935, + "grad_norm": 0.8595327138900757, + "learning_rate": 6.262968878092814e-05, + "loss": 0.7661, + "step": 131130 + }, + { + "epoch": 0.8378160816733322, + "grad_norm": 1.0388331413269043, + "learning_rate": 6.262483374372426e-05, + "loss": 0.6961, + "step": 131140 + }, + { + "epoch": 0.8378799688230709, + "grad_norm": 1.5679370164871216, + "learning_rate": 6.261997857937738e-05, + "loss": 0.8494, + "step": 131150 + }, + { + "epoch": 0.8379438559728096, + "grad_norm": 1.0286293029785156, + "learning_rate": 6.261512328793639e-05, + "loss": 0.8269, + "step": 131160 + }, + { + "epoch": 0.8380077431225483, + "grad_norm": 1.3701919317245483, + "learning_rate": 6.261026786945021e-05, + "loss": 0.7509, + "step": 131170 + }, + { + "epoch": 0.838071630272287, + "grad_norm": 0.7950007915496826, + "learning_rate": 6.260541232396771e-05, + "loss": 1.3381, + "step": 131180 + }, + { + "epoch": 0.8381355174220257, + "grad_norm": 0.7836694717407227, + "learning_rate": 6.26005566515378e-05, + "loss": 0.9361, + "step": 131190 + }, + { + "epoch": 0.8381994045717645, + "grad_norm": 0.7753838300704956, + "learning_rate": 6.259570085220939e-05, + "loss": 0.8256, + "step": 131200 + }, + { + "epoch": 0.8382632917215032, + "grad_norm": 0.8219996690750122, + "learning_rate": 6.259084492603138e-05, + "loss": 0.7421, + "step": 131210 + }, + { + "epoch": 0.8383271788712419, + "grad_norm": 1.0222669839859009, + "learning_rate": 6.258598887305265e-05, + "loss": 0.6588, + "step": 131220 + }, + { + "epoch": 0.8383910660209806, + "grad_norm": 0.9310179352760315, + "learning_rate": 6.258113269332215e-05, + "loss": 0.9836, + "step": 131230 + }, + { + "epoch": 0.8384549531707193, + "grad_norm": 0.8922819495201111, + "learning_rate": 6.257627638688875e-05, + "loss": 0.8864, + "step": 131240 + }, + { + "epoch": 0.838518840320458, + "grad_norm": 1.2268577814102173, + "learning_rate": 
6.257141995380136e-05, + "loss": 1.1079, + "step": 131250 + }, + { + "epoch": 0.8385827274701967, + "grad_norm": 0.7743911743164062, + "learning_rate": 6.25665633941089e-05, + "loss": 0.8269, + "step": 131260 + }, + { + "epoch": 0.8386466146199354, + "grad_norm": 2.4467434883117676, + "learning_rate": 6.256170670786028e-05, + "loss": 1.1068, + "step": 131270 + }, + { + "epoch": 0.8387105017696741, + "grad_norm": 1.4067353010177612, + "learning_rate": 6.25568498951044e-05, + "loss": 0.7124, + "step": 131280 + }, + { + "epoch": 0.8387743889194127, + "grad_norm": 0.9311875700950623, + "learning_rate": 6.255199295589018e-05, + "loss": 1.0478, + "step": 131290 + }, + { + "epoch": 0.8388382760691514, + "grad_norm": 0.8435894250869751, + "learning_rate": 6.254713589026652e-05, + "loss": 0.758, + "step": 131300 + }, + { + "epoch": 0.8389021632188901, + "grad_norm": 1.4599822759628296, + "learning_rate": 6.254227869828237e-05, + "loss": 1.0697, + "step": 131310 + }, + { + "epoch": 0.8389660503686288, + "grad_norm": 0.8313419818878174, + "learning_rate": 6.253742137998661e-05, + "loss": 1.1126, + "step": 131320 + }, + { + "epoch": 0.8390299375183675, + "grad_norm": 0.9159870743751526, + "learning_rate": 6.253256393542817e-05, + "loss": 0.8222, + "step": 131330 + }, + { + "epoch": 0.8390938246681062, + "grad_norm": 1.2503025531768799, + "learning_rate": 6.252770636465597e-05, + "loss": 0.9694, + "step": 131340 + }, + { + "epoch": 0.8391577118178449, + "grad_norm": 0.7653509974479675, + "learning_rate": 6.252284866771894e-05, + "loss": 0.9817, + "step": 131350 + }, + { + "epoch": 0.8392215989675836, + "grad_norm": 0.9600273370742798, + "learning_rate": 6.251799084466596e-05, + "loss": 0.9824, + "step": 131360 + }, + { + "epoch": 0.8392854861173223, + "grad_norm": 0.5949135422706604, + "learning_rate": 6.251313289554601e-05, + "loss": 0.9494, + "step": 131370 + }, + { + "epoch": 0.839349373267061, + "grad_norm": 0.6030875444412231, + "learning_rate": 6.250827482040797e-05, + 
"loss": 0.7904, + "step": 131380 + }, + { + "epoch": 0.8394132604167998, + "grad_norm": 0.9871885180473328, + "learning_rate": 6.25034166193008e-05, + "loss": 0.7854, + "step": 131390 + }, + { + "epoch": 0.8394771475665385, + "grad_norm": 1.0872247219085693, + "learning_rate": 6.24985582922734e-05, + "loss": 1.0537, + "step": 131400 + }, + { + "epoch": 0.8395410347162772, + "grad_norm": 0.9975283741950989, + "learning_rate": 6.24936998393747e-05, + "loss": 0.8564, + "step": 131410 + }, + { + "epoch": 0.8396049218660159, + "grad_norm": 1.5955655574798584, + "learning_rate": 6.248884126065364e-05, + "loss": 0.7626, + "step": 131420 + }, + { + "epoch": 0.8396688090157546, + "grad_norm": 0.8411442041397095, + "learning_rate": 6.248398255615913e-05, + "loss": 1.0739, + "step": 131430 + }, + { + "epoch": 0.8397326961654933, + "grad_norm": 0.9352008700370789, + "learning_rate": 6.247912372594013e-05, + "loss": 0.7532, + "step": 131440 + }, + { + "epoch": 0.839796583315232, + "grad_norm": 1.1712517738342285, + "learning_rate": 6.247426477004555e-05, + "loss": 0.7926, + "step": 131450 + }, + { + "epoch": 0.8398604704649707, + "grad_norm": 0.6964886784553528, + "learning_rate": 6.246940568852435e-05, + "loss": 0.7795, + "step": 131460 + }, + { + "epoch": 0.8399243576147094, + "grad_norm": 0.7911357283592224, + "learning_rate": 6.246454648142542e-05, + "loss": 1.0756, + "step": 131470 + }, + { + "epoch": 0.8399882447644481, + "grad_norm": 0.9874993562698364, + "learning_rate": 6.245968714879773e-05, + "loss": 1.0366, + "step": 131480 + }, + { + "epoch": 0.8400521319141868, + "grad_norm": 0.5494539141654968, + "learning_rate": 6.245482769069023e-05, + "loss": 0.8724, + "step": 131490 + }, + { + "epoch": 0.8401160190639255, + "grad_norm": 0.962289571762085, + "learning_rate": 6.244996810715183e-05, + "loss": 0.8694, + "step": 131500 + }, + { + "epoch": 0.8401799062136642, + "grad_norm": 0.762205183506012, + "learning_rate": 6.244510839823147e-05, + "loss": 0.8485, + "step": 
131510 + }, + { + "epoch": 0.8402437933634029, + "grad_norm": 0.9968754649162292, + "learning_rate": 6.244024856397812e-05, + "loss": 0.7702, + "step": 131520 + }, + { + "epoch": 0.8403076805131416, + "grad_norm": 1.0157196521759033, + "learning_rate": 6.24353886044407e-05, + "loss": 0.7992, + "step": 131530 + }, + { + "epoch": 0.8403715676628802, + "grad_norm": 1.0793529748916626, + "learning_rate": 6.243052851966816e-05, + "loss": 0.8393, + "step": 131540 + }, + { + "epoch": 0.8404354548126189, + "grad_norm": 0.6391186118125916, + "learning_rate": 6.242566830970941e-05, + "loss": 1.1761, + "step": 131550 + }, + { + "epoch": 0.8404993419623576, + "grad_norm": 0.727288544178009, + "learning_rate": 6.242080797461346e-05, + "loss": 1.3424, + "step": 131560 + }, + { + "epoch": 0.8405632291120964, + "grad_norm": 0.7789543271064758, + "learning_rate": 6.241594751442923e-05, + "loss": 0.8127, + "step": 131570 + }, + { + "epoch": 0.8406271162618351, + "grad_norm": 0.8413047790527344, + "learning_rate": 6.241108692920566e-05, + "loss": 0.8366, + "step": 131580 + }, + { + "epoch": 0.8406910034115738, + "grad_norm": 1.12662672996521, + "learning_rate": 6.240622621899173e-05, + "loss": 0.7906, + "step": 131590 + }, + { + "epoch": 0.8407548905613125, + "grad_norm": 1.1771347522735596, + "learning_rate": 6.240136538383635e-05, + "loss": 0.9557, + "step": 131600 + }, + { + "epoch": 0.8408187777110512, + "grad_norm": 1.3687745332717896, + "learning_rate": 6.239650442378848e-05, + "loss": 1.0635, + "step": 131610 + }, + { + "epoch": 0.8408826648607899, + "grad_norm": 0.5779715180397034, + "learning_rate": 6.239164333889711e-05, + "loss": 0.6679, + "step": 131620 + }, + { + "epoch": 0.8409465520105286, + "grad_norm": 0.9042608737945557, + "learning_rate": 6.238678212921115e-05, + "loss": 0.9131, + "step": 131630 + }, + { + "epoch": 0.8410104391602673, + "grad_norm": 0.7568997144699097, + "learning_rate": 6.238192079477959e-05, + "loss": 0.8784, + "step": 131640 + }, + { + "epoch": 
0.841074326310006, + "grad_norm": 0.7763069868087769, + "learning_rate": 6.237705933565137e-05, + "loss": 1.0966, + "step": 131650 + }, + { + "epoch": 0.8411382134597447, + "grad_norm": 0.7040805220603943, + "learning_rate": 6.237219775187545e-05, + "loss": 0.9219, + "step": 131660 + }, + { + "epoch": 0.8412021006094834, + "grad_norm": 1.0088971853256226, + "learning_rate": 6.236733604350081e-05, + "loss": 0.8971, + "step": 131670 + }, + { + "epoch": 0.8412659877592221, + "grad_norm": 1.101572036743164, + "learning_rate": 6.236247421057639e-05, + "loss": 0.7638, + "step": 131680 + }, + { + "epoch": 0.8413298749089608, + "grad_norm": 0.8880965709686279, + "learning_rate": 6.235761225315117e-05, + "loss": 1.1101, + "step": 131690 + }, + { + "epoch": 0.8413937620586995, + "grad_norm": 0.6575754284858704, + "learning_rate": 6.235275017127409e-05, + "loss": 0.7849, + "step": 131700 + }, + { + "epoch": 0.8414576492084382, + "grad_norm": 1.0469022989273071, + "learning_rate": 6.234788796499411e-05, + "loss": 1.1646, + "step": 131710 + }, + { + "epoch": 0.841521536358177, + "grad_norm": 0.6429979801177979, + "learning_rate": 6.234302563436024e-05, + "loss": 0.6877, + "step": 131720 + }, + { + "epoch": 0.8415854235079157, + "grad_norm": 0.8447049260139465, + "learning_rate": 6.233816317942143e-05, + "loss": 0.836, + "step": 131730 + }, + { + "epoch": 0.8416493106576544, + "grad_norm": 1.907248854637146, + "learning_rate": 6.233330060022662e-05, + "loss": 0.7056, + "step": 131740 + }, + { + "epoch": 0.8417131978073931, + "grad_norm": 0.9654474854469299, + "learning_rate": 6.232843789682483e-05, + "loss": 0.8356, + "step": 131750 + }, + { + "epoch": 0.8417770849571318, + "grad_norm": 1.3447314500808716, + "learning_rate": 6.2323575069265e-05, + "loss": 0.7557, + "step": 131760 + }, + { + "epoch": 0.8418409721068705, + "grad_norm": 2.194685220718384, + "learning_rate": 6.231871211759609e-05, + "loss": 0.885, + "step": 131770 + }, + { + "epoch": 0.8419048592566091, + 
"grad_norm": 1.1542408466339111, + "learning_rate": 6.23138490418671e-05, + "loss": 0.7755, + "step": 131780 + }, + { + "epoch": 0.8419687464063478, + "grad_norm": 0.7625725865364075, + "learning_rate": 6.2308985842127e-05, + "loss": 1.0261, + "step": 131790 + }, + { + "epoch": 0.8420326335560865, + "grad_norm": 1.0486656427383423, + "learning_rate": 6.230412251842477e-05, + "loss": 0.9115, + "step": 131800 + }, + { + "epoch": 0.8420965207058252, + "grad_norm": 1.7022581100463867, + "learning_rate": 6.229925907080937e-05, + "loss": 0.853, + "step": 131810 + }, + { + "epoch": 0.8421604078555639, + "grad_norm": 0.8428347110748291, + "learning_rate": 6.229439549932979e-05, + "loss": 0.9491, + "step": 131820 + }, + { + "epoch": 0.8422242950053026, + "grad_norm": 1.126781940460205, + "learning_rate": 6.228953180403503e-05, + "loss": 0.7966, + "step": 131830 + }, + { + "epoch": 0.8422881821550413, + "grad_norm": 0.7675802111625671, + "learning_rate": 6.228466798497403e-05, + "loss": 0.8985, + "step": 131840 + }, + { + "epoch": 0.84235206930478, + "grad_norm": 0.7653781771659851, + "learning_rate": 6.227980404219581e-05, + "loss": 0.8768, + "step": 131850 + }, + { + "epoch": 0.8424159564545187, + "grad_norm": 0.9524831175804138, + "learning_rate": 6.227493997574933e-05, + "loss": 0.93, + "step": 131860 + }, + { + "epoch": 0.8424798436042574, + "grad_norm": 0.7941969037055969, + "learning_rate": 6.227007578568358e-05, + "loss": 0.9524, + "step": 131870 + }, + { + "epoch": 0.8425437307539961, + "grad_norm": 1.5713688135147095, + "learning_rate": 6.226521147204757e-05, + "loss": 0.8452, + "step": 131880 + }, + { + "epoch": 0.8426076179037348, + "grad_norm": 0.7123538255691528, + "learning_rate": 6.226034703489025e-05, + "loss": 0.962, + "step": 131890 + }, + { + "epoch": 0.8426715050534735, + "grad_norm": 0.9128240942955017, + "learning_rate": 6.225548247426064e-05, + "loss": 0.6589, + "step": 131900 + }, + { + "epoch": 0.8427353922032123, + "grad_norm": 1.0599886178970337, 
+ "learning_rate": 6.225061779020773e-05, + "loss": 0.7505, + "step": 131910 + }, + { + "epoch": 0.842799279352951, + "grad_norm": 0.7445502877235413, + "learning_rate": 6.224575298278048e-05, + "loss": 0.9809, + "step": 131920 + }, + { + "epoch": 0.8428631665026897, + "grad_norm": 1.1872540712356567, + "learning_rate": 6.224088805202791e-05, + "loss": 0.9065, + "step": 131930 + }, + { + "epoch": 0.8429270536524284, + "grad_norm": 1.1298291683197021, + "learning_rate": 6.2236022997999e-05, + "loss": 1.4187, + "step": 131940 + }, + { + "epoch": 0.8429909408021671, + "grad_norm": 0.8236765265464783, + "learning_rate": 6.223115782074278e-05, + "loss": 0.7978, + "step": 131950 + }, + { + "epoch": 0.8430548279519058, + "grad_norm": 0.953170895576477, + "learning_rate": 6.22262925203082e-05, + "loss": 0.8399, + "step": 131960 + }, + { + "epoch": 0.8431187151016445, + "grad_norm": 0.5813484787940979, + "learning_rate": 6.222142709674428e-05, + "loss": 0.7039, + "step": 131970 + }, + { + "epoch": 0.8431826022513832, + "grad_norm": 0.963085412979126, + "learning_rate": 6.22165615501e-05, + "loss": 0.7868, + "step": 131980 + }, + { + "epoch": 0.8432464894011219, + "grad_norm": 0.8191744089126587, + "learning_rate": 6.22116958804244e-05, + "loss": 0.9115, + "step": 131990 + }, + { + "epoch": 0.8433103765508606, + "grad_norm": 0.9577305912971497, + "learning_rate": 6.220683008776645e-05, + "loss": 0.7448, + "step": 132000 + }, + { + "epoch": 0.8433742637005993, + "grad_norm": 1.459100604057312, + "learning_rate": 6.220196417217516e-05, + "loss": 0.9975, + "step": 132010 + }, + { + "epoch": 0.8434381508503379, + "grad_norm": 0.7961175441741943, + "learning_rate": 6.219709813369953e-05, + "loss": 1.0328, + "step": 132020 + }, + { + "epoch": 0.8435020380000766, + "grad_norm": 0.9462569355964661, + "learning_rate": 6.219223197238858e-05, + "loss": 0.8782, + "step": 132030 + }, + { + "epoch": 0.8435659251498153, + "grad_norm": 1.1649507284164429, + "learning_rate": 
6.21873656882913e-05, + "loss": 0.8889, + "step": 132040 + }, + { + "epoch": 0.843629812299554, + "grad_norm": 0.6289653182029724, + "learning_rate": 6.218249928145671e-05, + "loss": 1.052, + "step": 132050 + }, + { + "epoch": 0.8436936994492927, + "grad_norm": 0.8261928558349609, + "learning_rate": 6.21776327519338e-05, + "loss": 0.8439, + "step": 132060 + }, + { + "epoch": 0.8437575865990314, + "grad_norm": 1.1747194528579712, + "learning_rate": 6.21727660997716e-05, + "loss": 1.1163, + "step": 132070 + }, + { + "epoch": 0.8438214737487701, + "grad_norm": 0.8616983294487, + "learning_rate": 6.216789932501912e-05, + "loss": 0.7076, + "step": 132080 + }, + { + "epoch": 0.8438853608985089, + "grad_norm": 1.5313186645507812, + "learning_rate": 6.216303242772535e-05, + "loss": 0.763, + "step": 132090 + }, + { + "epoch": 0.8439492480482476, + "grad_norm": 0.84382164478302, + "learning_rate": 6.215816540793934e-05, + "loss": 0.9195, + "step": 132100 + }, + { + "epoch": 0.8440131351979863, + "grad_norm": 1.5501306056976318, + "learning_rate": 6.215329826571008e-05, + "loss": 1.2185, + "step": 132110 + }, + { + "epoch": 0.844077022347725, + "grad_norm": 1.0050925016403198, + "learning_rate": 6.214843100108659e-05, + "loss": 0.8055, + "step": 132120 + }, + { + "epoch": 0.8441409094974637, + "grad_norm": 1.1478122472763062, + "learning_rate": 6.214356361411788e-05, + "loss": 0.7929, + "step": 132130 + }, + { + "epoch": 0.8442047966472024, + "grad_norm": 1.0138068199157715, + "learning_rate": 6.2138696104853e-05, + "loss": 0.8049, + "step": 132140 + }, + { + "epoch": 0.8442686837969411, + "grad_norm": 3.256178379058838, + "learning_rate": 6.213382847334094e-05, + "loss": 0.8957, + "step": 132150 + }, + { + "epoch": 0.8443325709466798, + "grad_norm": 0.8341447710990906, + "learning_rate": 6.212896071963072e-05, + "loss": 1.0626, + "step": 132160 + }, + { + "epoch": 0.8443964580964185, + "grad_norm": 0.8610829710960388, + "learning_rate": 6.212409284377138e-05, + "loss": 
0.8818, + "step": 132170 + }, + { + "epoch": 0.8444603452461572, + "grad_norm": 0.9030302166938782, + "learning_rate": 6.211922484581194e-05, + "loss": 0.9542, + "step": 132180 + }, + { + "epoch": 0.8445242323958959, + "grad_norm": 0.8101795315742493, + "learning_rate": 6.211435672580143e-05, + "loss": 1.069, + "step": 132190 + }, + { + "epoch": 0.8445881195456346, + "grad_norm": 0.8379024267196655, + "learning_rate": 6.210948848378884e-05, + "loss": 1.0042, + "step": 132200 + }, + { + "epoch": 0.8446520066953733, + "grad_norm": 1.2797821760177612, + "learning_rate": 6.210462011982325e-05, + "loss": 1.0052, + "step": 132210 + }, + { + "epoch": 0.844715893845112, + "grad_norm": 0.7084786295890808, + "learning_rate": 6.209975163395365e-05, + "loss": 0.9553, + "step": 132220 + }, + { + "epoch": 0.8447797809948507, + "grad_norm": 1.345314860343933, + "learning_rate": 6.209488302622909e-05, + "loss": 1.0062, + "step": 132230 + }, + { + "epoch": 0.8448436681445894, + "grad_norm": 1.0037018060684204, + "learning_rate": 6.209001429669859e-05, + "loss": 0.8894, + "step": 132240 + }, + { + "epoch": 0.8449075552943281, + "grad_norm": 0.7273179292678833, + "learning_rate": 6.208514544541118e-05, + "loss": 0.7736, + "step": 132250 + }, + { + "epoch": 0.8449714424440669, + "grad_norm": 1.832634687423706, + "learning_rate": 6.208027647241591e-05, + "loss": 0.79, + "step": 132260 + }, + { + "epoch": 0.8450353295938055, + "grad_norm": 0.895901083946228, + "learning_rate": 6.207540737776179e-05, + "loss": 0.8047, + "step": 132270 + }, + { + "epoch": 0.8450992167435442, + "grad_norm": 0.9929360151290894, + "learning_rate": 6.207053816149789e-05, + "loss": 0.9768, + "step": 132280 + }, + { + "epoch": 0.8451631038932829, + "grad_norm": 2.9557993412017822, + "learning_rate": 6.206566882367323e-05, + "loss": 0.6881, + "step": 132290 + }, + { + "epoch": 0.8452269910430216, + "grad_norm": 1.2809102535247803, + "learning_rate": 6.206079936433685e-05, + "loss": 0.7503, + "step": 132300 + }, 
+ { + "epoch": 0.8452908781927603, + "grad_norm": 0.7714309692382812, + "learning_rate": 6.205592978353776e-05, + "loss": 0.8082, + "step": 132310 + }, + { + "epoch": 0.845354765342499, + "grad_norm": 1.0882459878921509, + "learning_rate": 6.205106008132505e-05, + "loss": 0.7081, + "step": 132320 + }, + { + "epoch": 0.8454186524922377, + "grad_norm": 1.8441599607467651, + "learning_rate": 6.204619025774774e-05, + "loss": 0.8468, + "step": 132330 + }, + { + "epoch": 0.8454825396419764, + "grad_norm": 0.8650910258293152, + "learning_rate": 6.204132031285485e-05, + "loss": 0.7917, + "step": 132340 + }, + { + "epoch": 0.8455464267917151, + "grad_norm": 0.9829961061477661, + "learning_rate": 6.203645024669548e-05, + "loss": 0.861, + "step": 132350 + }, + { + "epoch": 0.8456103139414538, + "grad_norm": 1.0663328170776367, + "learning_rate": 6.203158005931861e-05, + "loss": 0.9636, + "step": 132360 + }, + { + "epoch": 0.8456742010911925, + "grad_norm": 0.7675907611846924, + "learning_rate": 6.202670975077334e-05, + "loss": 0.985, + "step": 132370 + }, + { + "epoch": 0.8457380882409312, + "grad_norm": 0.8641635179519653, + "learning_rate": 6.20218393211087e-05, + "loss": 0.7433, + "step": 132380 + }, + { + "epoch": 0.8458019753906699, + "grad_norm": 1.2303731441497803, + "learning_rate": 6.201696877037373e-05, + "loss": 0.8609, + "step": 132390 + }, + { + "epoch": 0.8458658625404086, + "grad_norm": 0.7559641599655151, + "learning_rate": 6.201209809861747e-05, + "loss": 0.879, + "step": 132400 + }, + { + "epoch": 0.8459297496901473, + "grad_norm": 0.9814415574073792, + "learning_rate": 6.200722730588901e-05, + "loss": 0.9192, + "step": 132410 + }, + { + "epoch": 0.845993636839886, + "grad_norm": 0.9890975952148438, + "learning_rate": 6.20023563922374e-05, + "loss": 0.9853, + "step": 132420 + }, + { + "epoch": 0.8460575239896247, + "grad_norm": 0.8274144530296326, + "learning_rate": 6.199748535771165e-05, + "loss": 0.8835, + "step": 132430 + }, + { + "epoch": 
0.8461214111393635, + "grad_norm": 0.6956206560134888, + "learning_rate": 6.199261420236086e-05, + "loss": 0.9952, + "step": 132440 + }, + { + "epoch": 0.8461852982891022, + "grad_norm": 0.6252676844596863, + "learning_rate": 6.198774292623406e-05, + "loss": 0.8467, + "step": 132450 + }, + { + "epoch": 0.8462491854388409, + "grad_norm": 1.0025311708450317, + "learning_rate": 6.198287152938031e-05, + "loss": 0.9938, + "step": 132460 + }, + { + "epoch": 0.8463130725885796, + "grad_norm": 1.1646289825439453, + "learning_rate": 6.197800001184869e-05, + "loss": 1.0137, + "step": 132470 + }, + { + "epoch": 0.8463769597383183, + "grad_norm": 0.8148912191390991, + "learning_rate": 6.197312837368825e-05, + "loss": 0.937, + "step": 132480 + }, + { + "epoch": 0.846440846888057, + "grad_norm": 1.7211027145385742, + "learning_rate": 6.196825661494805e-05, + "loss": 0.852, + "step": 132490 + }, + { + "epoch": 0.8465047340377957, + "grad_norm": 0.7814443707466125, + "learning_rate": 6.196338473567714e-05, + "loss": 1.0702, + "step": 132500 + }, + { + "epoch": 0.8465686211875343, + "grad_norm": 1.2225807905197144, + "learning_rate": 6.19585127359246e-05, + "loss": 0.7909, + "step": 132510 + }, + { + "epoch": 0.846632508337273, + "grad_norm": 0.9283073544502258, + "learning_rate": 6.19536406157395e-05, + "loss": 0.8267, + "step": 132520 + }, + { + "epoch": 0.8466963954870117, + "grad_norm": 0.9126421809196472, + "learning_rate": 6.194876837517089e-05, + "loss": 0.7771, + "step": 132530 + }, + { + "epoch": 0.8467602826367504, + "grad_norm": 1.5700596570968628, + "learning_rate": 6.194389601426784e-05, + "loss": 1.0307, + "step": 132540 + }, + { + "epoch": 0.8468241697864891, + "grad_norm": 0.8613380193710327, + "learning_rate": 6.193902353307943e-05, + "loss": 0.7906, + "step": 132550 + }, + { + "epoch": 0.8468880569362278, + "grad_norm": 0.9143043756484985, + "learning_rate": 6.193415093165473e-05, + "loss": 0.9351, + "step": 132560 + }, + { + "epoch": 0.8469519440859665, + 
"grad_norm": 1.215604543685913, + "learning_rate": 6.192927821004281e-05, + "loss": 0.8207, + "step": 132570 + }, + { + "epoch": 0.8470158312357052, + "grad_norm": 1.1382776498794556, + "learning_rate": 6.192440536829272e-05, + "loss": 0.6806, + "step": 132580 + }, + { + "epoch": 0.8470797183854439, + "grad_norm": 0.8516457080841064, + "learning_rate": 6.191953240645356e-05, + "loss": 0.9752, + "step": 132590 + }, + { + "epoch": 0.8471436055351826, + "grad_norm": 0.6986536383628845, + "learning_rate": 6.191465932457439e-05, + "loss": 0.7651, + "step": 132600 + }, + { + "epoch": 0.8472074926849213, + "grad_norm": 1.2519372701644897, + "learning_rate": 6.19097861227043e-05, + "loss": 0.8898, + "step": 132610 + }, + { + "epoch": 0.84727137983466, + "grad_norm": 0.8474778532981873, + "learning_rate": 6.190491280089236e-05, + "loss": 0.9425, + "step": 132620 + }, + { + "epoch": 0.8473352669843988, + "grad_norm": 0.9403843879699707, + "learning_rate": 6.190003935918766e-05, + "loss": 0.8585, + "step": 132630 + }, + { + "epoch": 0.8473991541341375, + "grad_norm": 0.9674764275550842, + "learning_rate": 6.189516579763925e-05, + "loss": 1.0548, + "step": 132640 + }, + { + "epoch": 0.8474630412838762, + "grad_norm": 1.1590251922607422, + "learning_rate": 6.189029211629625e-05, + "loss": 0.8096, + "step": 132650 + }, + { + "epoch": 0.8475269284336149, + "grad_norm": 1.0220919847488403, + "learning_rate": 6.188541831520772e-05, + "loss": 0.7432, + "step": 132660 + }, + { + "epoch": 0.8475908155833536, + "grad_norm": 0.7467325329780579, + "learning_rate": 6.188054439442273e-05, + "loss": 0.8427, + "step": 132670 + }, + { + "epoch": 0.8476547027330923, + "grad_norm": 1.00656259059906, + "learning_rate": 6.187567035399038e-05, + "loss": 1.0052, + "step": 132680 + }, + { + "epoch": 0.847718589882831, + "grad_norm": 1.0796363353729248, + "learning_rate": 6.187079619395976e-05, + "loss": 0.7158, + "step": 132690 + }, + { + "epoch": 0.8477824770325697, + "grad_norm": 
0.8648567795753479, + "learning_rate": 6.186592191437995e-05, + "loss": 0.8839, + "step": 132700 + }, + { + "epoch": 0.8478463641823084, + "grad_norm": 1.2238774299621582, + "learning_rate": 6.186104751530004e-05, + "loss": 0.9307, + "step": 132710 + }, + { + "epoch": 0.8479102513320471, + "grad_norm": 1.1575216054916382, + "learning_rate": 6.185617299676913e-05, + "loss": 0.9198, + "step": 132720 + }, + { + "epoch": 0.8479741384817858, + "grad_norm": 0.6900240778923035, + "learning_rate": 6.18512983588363e-05, + "loss": 0.9578, + "step": 132730 + }, + { + "epoch": 0.8480380256315245, + "grad_norm": 1.0454217195510864, + "learning_rate": 6.184642360155062e-05, + "loss": 0.7726, + "step": 132740 + }, + { + "epoch": 0.8481019127812631, + "grad_norm": 1.078119158744812, + "learning_rate": 6.184154872496124e-05, + "loss": 0.8945, + "step": 132750 + }, + { + "epoch": 0.8481657999310018, + "grad_norm": 1.2794424295425415, + "learning_rate": 6.18366737291172e-05, + "loss": 0.8144, + "step": 132760 + }, + { + "epoch": 0.8482296870807405, + "grad_norm": 1.1177339553833008, + "learning_rate": 6.18317986140676e-05, + "loss": 0.711, + "step": 132770 + }, + { + "epoch": 0.8482935742304792, + "grad_norm": 0.9638439416885376, + "learning_rate": 6.182692337986157e-05, + "loss": 0.712, + "step": 132780 + }, + { + "epoch": 0.848357461380218, + "grad_norm": 0.7159675359725952, + "learning_rate": 6.18220480265482e-05, + "loss": 0.6036, + "step": 132790 + }, + { + "epoch": 0.8484213485299567, + "grad_norm": 0.9956109523773193, + "learning_rate": 6.181717255417658e-05, + "loss": 0.8205, + "step": 132800 + }, + { + "epoch": 0.8484852356796954, + "grad_norm": 0.7932936549186707, + "learning_rate": 6.18122969627958e-05, + "loss": 0.9146, + "step": 132810 + }, + { + "epoch": 0.8485491228294341, + "grad_norm": 1.288333535194397, + "learning_rate": 6.180742125245497e-05, + "loss": 0.8525, + "step": 132820 + }, + { + "epoch": 0.8486130099791728, + "grad_norm": 0.8285970687866211, + 
"learning_rate": 6.180254542320319e-05, + "loss": 0.9556, + "step": 132830 + }, + { + "epoch": 0.8486768971289115, + "grad_norm": 0.8148375153541565, + "learning_rate": 6.179766947508957e-05, + "loss": 0.822, + "step": 132840 + }, + { + "epoch": 0.8487407842786502, + "grad_norm": 0.717943549156189, + "learning_rate": 6.17927934081632e-05, + "loss": 0.9766, + "step": 132850 + }, + { + "epoch": 0.8488046714283889, + "grad_norm": 0.7930614948272705, + "learning_rate": 6.178791722247321e-05, + "loss": 0.8497, + "step": 132860 + }, + { + "epoch": 0.8488685585781276, + "grad_norm": 1.0193837881088257, + "learning_rate": 6.17830409180687e-05, + "loss": 0.9573, + "step": 132870 + }, + { + "epoch": 0.8489324457278663, + "grad_norm": 1.764022946357727, + "learning_rate": 6.177816449499878e-05, + "loss": 0.9453, + "step": 132880 + }, + { + "epoch": 0.848996332877605, + "grad_norm": 1.1653591394424438, + "learning_rate": 6.177328795331253e-05, + "loss": 0.7105, + "step": 132890 + }, + { + "epoch": 0.8490602200273437, + "grad_norm": 0.9944359064102173, + "learning_rate": 6.176841129305911e-05, + "loss": 0.8217, + "step": 132900 + }, + { + "epoch": 0.8491241071770824, + "grad_norm": 0.7695066928863525, + "learning_rate": 6.176353451428758e-05, + "loss": 0.8568, + "step": 132910 + }, + { + "epoch": 0.8491879943268211, + "grad_norm": 0.8422155976295471, + "learning_rate": 6.17586576170471e-05, + "loss": 0.7569, + "step": 132920 + }, + { + "epoch": 0.8492518814765598, + "grad_norm": 0.6993025541305542, + "learning_rate": 6.175378060138674e-05, + "loss": 0.8512, + "step": 132930 + }, + { + "epoch": 0.8493157686262985, + "grad_norm": 0.7745869755744934, + "learning_rate": 6.174890346735566e-05, + "loss": 0.8467, + "step": 132940 + }, + { + "epoch": 0.8493796557760372, + "grad_norm": 0.8811076283454895, + "learning_rate": 6.174402621500297e-05, + "loss": 0.6894, + "step": 132950 + }, + { + "epoch": 0.849443542925776, + "grad_norm": 1.6308493614196777, + "learning_rate": 
6.173914884437777e-05, + "loss": 1.1379, + "step": 132960 + }, + { + "epoch": 0.8495074300755147, + "grad_norm": 0.7950766682624817, + "learning_rate": 6.173427135552917e-05, + "loss": 0.8812, + "step": 132970 + }, + { + "epoch": 0.8495713172252534, + "grad_norm": 0.5945727229118347, + "learning_rate": 6.172939374850633e-05, + "loss": 0.7214, + "step": 132980 + }, + { + "epoch": 0.849635204374992, + "grad_norm": 0.9707133173942566, + "learning_rate": 6.172451602335833e-05, + "loss": 0.8997, + "step": 132990 + }, + { + "epoch": 0.8496990915247307, + "grad_norm": 0.5925787091255188, + "learning_rate": 6.17196381801343e-05, + "loss": 0.6871, + "step": 133000 + }, + { + "epoch": 0.8497629786744694, + "grad_norm": 0.6003409624099731, + "learning_rate": 6.171476021888341e-05, + "loss": 0.8482, + "step": 133010 + }, + { + "epoch": 0.8498268658242081, + "grad_norm": 0.6482333540916443, + "learning_rate": 6.170988213965471e-05, + "loss": 0.912, + "step": 133020 + }, + { + "epoch": 0.8498907529739468, + "grad_norm": 0.6910186409950256, + "learning_rate": 6.170500394249739e-05, + "loss": 0.804, + "step": 133030 + }, + { + "epoch": 0.8499546401236855, + "grad_norm": 0.9655617475509644, + "learning_rate": 6.170012562746056e-05, + "loss": 0.8942, + "step": 133040 + }, + { + "epoch": 0.8500185272734242, + "grad_norm": 0.7532824873924255, + "learning_rate": 6.169524719459334e-05, + "loss": 0.6668, + "step": 133050 + }, + { + "epoch": 0.8500824144231629, + "grad_norm": 1.318796992301941, + "learning_rate": 6.169036864394485e-05, + "loss": 0.9058, + "step": 133060 + }, + { + "epoch": 0.8501463015729016, + "grad_norm": 0.89380943775177, + "learning_rate": 6.168548997556425e-05, + "loss": 0.8642, + "step": 133070 + }, + { + "epoch": 0.8502101887226403, + "grad_norm": 1.309441328048706, + "learning_rate": 6.168061118950063e-05, + "loss": 0.9597, + "step": 133080 + }, + { + "epoch": 0.850274075872379, + "grad_norm": 0.8857962489128113, + "learning_rate": 6.167573228580317e-05, + "loss": 
0.7761, + "step": 133090 + }, + { + "epoch": 0.8503379630221177, + "grad_norm": 0.7629507780075073, + "learning_rate": 6.167085326452098e-05, + "loss": 0.9627, + "step": 133100 + }, + { + "epoch": 0.8504018501718564, + "grad_norm": 0.7469977140426636, + "learning_rate": 6.16659741257032e-05, + "loss": 0.8373, + "step": 133110 + }, + { + "epoch": 0.8504657373215951, + "grad_norm": 0.9111135005950928, + "learning_rate": 6.166109486939898e-05, + "loss": 0.8188, + "step": 133120 + }, + { + "epoch": 0.8505296244713338, + "grad_norm": 0.9041001796722412, + "learning_rate": 6.165621549565742e-05, + "loss": 1.0147, + "step": 133130 + }, + { + "epoch": 0.8505935116210726, + "grad_norm": 0.7008116245269775, + "learning_rate": 6.16513360045277e-05, + "loss": 0.8531, + "step": 133140 + }, + { + "epoch": 0.8506573987708113, + "grad_norm": NaN, + "learning_rate": 6.164694436218468e-05, + "loss": 0.9906, + "step": 133150 + }, + { + "epoch": 0.85072128592055, + "grad_norm": 0.6200425028800964, + "learning_rate": 6.164206464815282e-05, + "loss": 0.9371, + "step": 133160 + }, + { + "epoch": 0.8507851730702887, + "grad_norm": 1.1376943588256836, + "learning_rate": 6.16371848168753e-05, + "loss": 0.7415, + "step": 133170 + }, + { + "epoch": 0.8508490602200274, + "grad_norm": 0.8951854109764099, + "learning_rate": 6.163230486840124e-05, + "loss": 0.9746, + "step": 133180 + }, + { + "epoch": 0.8509129473697661, + "grad_norm": 0.6957682967185974, + "learning_rate": 6.162742480277984e-05, + "loss": 0.8415, + "step": 133190 + }, + { + "epoch": 0.8509768345195048, + "grad_norm": 0.7306455373764038, + "learning_rate": 6.162254462006018e-05, + "loss": 1.1689, + "step": 133200 + }, + { + "epoch": 0.8510407216692435, + "grad_norm": 0.8031535744667053, + "learning_rate": 6.161766432029146e-05, + "loss": 0.9466, + "step": 133210 + }, + { + "epoch": 0.8511046088189822, + "grad_norm": 0.6348420977592468, + "learning_rate": 6.16127839035228e-05, + "loss": 1.0381, + "step": 133220 + }, + { + "epoch": 
0.8511684959687209, + "grad_norm": 1.0873243808746338, + "learning_rate": 6.160790336980335e-05, + "loss": 0.9977, + "step": 133230 + }, + { + "epoch": 0.8512323831184595, + "grad_norm": 0.8845553994178772, + "learning_rate": 6.160302271918229e-05, + "loss": 0.8587, + "step": 133240 + }, + { + "epoch": 0.8512962702681982, + "grad_norm": 0.7557221055030823, + "learning_rate": 6.159814195170876e-05, + "loss": 0.7288, + "step": 133250 + }, + { + "epoch": 0.8513601574179369, + "grad_norm": 0.9131662845611572, + "learning_rate": 6.159326106743188e-05, + "loss": 0.8813, + "step": 133260 + }, + { + "epoch": 0.8514240445676756, + "grad_norm": 1.1992850303649902, + "learning_rate": 6.158838006640086e-05, + "loss": 0.8098, + "step": 133270 + }, + { + "epoch": 0.8514879317174143, + "grad_norm": 0.9900951385498047, + "learning_rate": 6.15834989486648e-05, + "loss": 0.8796, + "step": 133280 + }, + { + "epoch": 0.851551818867153, + "grad_norm": 1.0529921054840088, + "learning_rate": 6.15786177142729e-05, + "loss": 1.2292, + "step": 133290 + }, + { + "epoch": 0.8516157060168917, + "grad_norm": 0.8537276983261108, + "learning_rate": 6.15737363632743e-05, + "loss": 0.7625, + "step": 133300 + }, + { + "epoch": 0.8516795931666304, + "grad_norm": 0.9666260480880737, + "learning_rate": 6.156885489571816e-05, + "loss": 1.091, + "step": 133310 + }, + { + "epoch": 0.8517434803163692, + "grad_norm": 0.8413388729095459, + "learning_rate": 6.156397331165364e-05, + "loss": 0.8328, + "step": 133320 + }, + { + "epoch": 0.8518073674661079, + "grad_norm": 0.5855796933174133, + "learning_rate": 6.155909161112992e-05, + "loss": 0.9068, + "step": 133330 + }, + { + "epoch": 0.8518712546158466, + "grad_norm": 1.3463701009750366, + "learning_rate": 6.155420979419612e-05, + "loss": 0.88, + "step": 133340 + }, + { + "epoch": 0.8519351417655853, + "grad_norm": 0.8949428796768188, + "learning_rate": 6.154932786090146e-05, + "loss": 0.9982, + "step": 133350 + }, + { + "epoch": 0.851999028915324, + 
"grad_norm": 1.747206687927246, + "learning_rate": 6.154444581129506e-05, + "loss": 1.1201, + "step": 133360 + }, + { + "epoch": 0.8520629160650627, + "grad_norm": 2.57167911529541, + "learning_rate": 6.153956364542612e-05, + "loss": 0.5995, + "step": 133370 + }, + { + "epoch": 0.8521268032148014, + "grad_norm": 0.8123181462287903, + "learning_rate": 6.153468136334377e-05, + "loss": 0.7424, + "step": 133380 + }, + { + "epoch": 0.8521906903645401, + "grad_norm": 0.9312867522239685, + "learning_rate": 6.15297989650972e-05, + "loss": 0.7271, + "step": 133390 + }, + { + "epoch": 0.8522545775142788, + "grad_norm": 0.9896630644798279, + "learning_rate": 6.15249164507356e-05, + "loss": 0.7866, + "step": 133400 + }, + { + "epoch": 0.8523184646640175, + "grad_norm": 1.394911527633667, + "learning_rate": 6.152003382030809e-05, + "loss": 0.9573, + "step": 133410 + }, + { + "epoch": 0.8523823518137562, + "grad_norm": 0.9811311960220337, + "learning_rate": 6.151515107386389e-05, + "loss": 1.0954, + "step": 133420 + }, + { + "epoch": 0.8524462389634949, + "grad_norm": 1.180068016052246, + "learning_rate": 6.15107565029105e-05, + "loss": 1.2012, + "step": 133430 + }, + { + "epoch": 0.8525101261132336, + "grad_norm": 1.4226782321929932, + "learning_rate": 6.150587353617002e-05, + "loss": 0.6689, + "step": 133440 + }, + { + "epoch": 0.8525740132629723, + "grad_norm": 0.7661568522453308, + "learning_rate": 6.150099045355547e-05, + "loss": 1.1808, + "step": 133450 + }, + { + "epoch": 0.852637900412711, + "grad_norm": 1.3695670366287231, + "learning_rate": 6.149610725511597e-05, + "loss": 0.8527, + "step": 133460 + }, + { + "epoch": 0.8527017875624497, + "grad_norm": 1.424774408340454, + "learning_rate": 6.149122394090073e-05, + "loss": 0.7823, + "step": 133470 + }, + { + "epoch": 0.8527656747121883, + "grad_norm": 1.2621248960494995, + "learning_rate": 6.148634051095893e-05, + "loss": 0.9556, + "step": 133480 + }, + { + "epoch": 0.852829561861927, + "grad_norm": 0.7116201519966125, + 
"learning_rate": 6.148145696533973e-05, + "loss": 0.8582, + "step": 133490 + }, + { + "epoch": 0.8528934490116657, + "grad_norm": 1.219441533088684, + "learning_rate": 6.147657330409234e-05, + "loss": 0.8675, + "step": 133500 + }, + { + "epoch": 0.8529573361614045, + "grad_norm": 0.8610523343086243, + "learning_rate": 6.147168952726593e-05, + "loss": 0.9913, + "step": 133510 + }, + { + "epoch": 0.8530212233111432, + "grad_norm": 1.2600919008255005, + "learning_rate": 6.146680563490968e-05, + "loss": 1.2457, + "step": 133520 + }, + { + "epoch": 0.8530851104608819, + "grad_norm": 0.7178075909614563, + "learning_rate": 6.146192162707275e-05, + "loss": 1.0738, + "step": 133530 + }, + { + "epoch": 0.8531489976106206, + "grad_norm": 0.7833428382873535, + "learning_rate": 6.145703750380439e-05, + "loss": 0.8051, + "step": 133540 + }, + { + "epoch": 0.8532128847603593, + "grad_norm": 0.6498239636421204, + "learning_rate": 6.145215326515375e-05, + "loss": 0.836, + "step": 133550 + }, + { + "epoch": 0.853276771910098, + "grad_norm": 1.1565107107162476, + "learning_rate": 6.144726891117e-05, + "loss": 0.6755, + "step": 133560 + }, + { + "epoch": 0.8533406590598367, + "grad_norm": 0.6840099692344666, + "learning_rate": 6.144238444190236e-05, + "loss": 0.9921, + "step": 133570 + }, + { + "epoch": 0.8534045462095754, + "grad_norm": 0.7863107323646545, + "learning_rate": 6.143749985740001e-05, + "loss": 0.8842, + "step": 133580 + }, + { + "epoch": 0.8534684333593141, + "grad_norm": 1.3972042798995972, + "learning_rate": 6.143261515771214e-05, + "loss": 0.7173, + "step": 133590 + }, + { + "epoch": 0.8535323205090528, + "grad_norm": 0.5784814357757568, + "learning_rate": 6.142773034288794e-05, + "loss": 1.0078, + "step": 133600 + }, + { + "epoch": 0.8535962076587915, + "grad_norm": 0.8995794057846069, + "learning_rate": 6.14228454129766e-05, + "loss": 0.7556, + "step": 133610 + }, + { + "epoch": 0.8536600948085302, + "grad_norm": 1.020451307296753, + "learning_rate": 
6.141796036802734e-05, + "loss": 0.917, + "step": 133620 + }, + { + "epoch": 0.8537239819582689, + "grad_norm": 0.7589079141616821, + "learning_rate": 6.141307520808934e-05, + "loss": 0.7353, + "step": 133630 + }, + { + "epoch": 0.8537878691080076, + "grad_norm": 0.9039588570594788, + "learning_rate": 6.14081899332118e-05, + "loss": 1.0483, + "step": 133640 + }, + { + "epoch": 0.8538517562577463, + "grad_norm": 0.9937171339988708, + "learning_rate": 6.140330454344391e-05, + "loss": 0.7808, + "step": 133650 + }, + { + "epoch": 0.853915643407485, + "grad_norm": 2.9743402004241943, + "learning_rate": 6.139841903883488e-05, + "loss": 0.6187, + "step": 133660 + }, + { + "epoch": 0.8539795305572238, + "grad_norm": 0.9731509685516357, + "learning_rate": 6.139353341943391e-05, + "loss": 1.0048, + "step": 133670 + }, + { + "epoch": 0.8540434177069625, + "grad_norm": 0.7726428508758545, + "learning_rate": 6.13886476852902e-05, + "loss": 0.8544, + "step": 133680 + }, + { + "epoch": 0.8541073048567012, + "grad_norm": 1.0278613567352295, + "learning_rate": 6.138376183645295e-05, + "loss": 0.8038, + "step": 133690 + }, + { + "epoch": 0.8541711920064399, + "grad_norm": 0.9102327823638916, + "learning_rate": 6.137887587297138e-05, + "loss": 0.8712, + "step": 133700 + }, + { + "epoch": 0.8542350791561786, + "grad_norm": 0.6600117683410645, + "learning_rate": 6.137398979489468e-05, + "loss": 1.0407, + "step": 133710 + }, + { + "epoch": 0.8542989663059172, + "grad_norm": 1.2443808317184448, + "learning_rate": 6.136910360227207e-05, + "loss": 0.9258, + "step": 133720 + }, + { + "epoch": 0.8543628534556559, + "grad_norm": 0.6546837687492371, + "learning_rate": 6.136421729515275e-05, + "loss": 0.8981, + "step": 133730 + }, + { + "epoch": 0.8544267406053946, + "grad_norm": 0.5999804139137268, + "learning_rate": 6.135933087358591e-05, + "loss": 0.8701, + "step": 133740 + }, + { + "epoch": 0.8544906277551333, + "grad_norm": 0.6947192549705505, + "learning_rate": 6.135444433762081e-05, + 
"loss": 0.8316, + "step": 133750 + }, + { + "epoch": 0.854554514904872, + "grad_norm": 1.474822759628296, + "learning_rate": 6.134955768730663e-05, + "loss": 0.806, + "step": 133760 + }, + { + "epoch": 0.8546184020546107, + "grad_norm": 0.8612034320831299, + "learning_rate": 6.134467092269257e-05, + "loss": 0.7773, + "step": 133770 + }, + { + "epoch": 0.8546822892043494, + "grad_norm": 1.0290131568908691, + "learning_rate": 6.133978404382786e-05, + "loss": 0.8817, + "step": 133780 + }, + { + "epoch": 0.8547461763540881, + "grad_norm": 0.7579601407051086, + "learning_rate": 6.133489705076172e-05, + "loss": 0.9595, + "step": 133790 + }, + { + "epoch": 0.8548100635038268, + "grad_norm": 0.7710822224617004, + "learning_rate": 6.133000994354337e-05, + "loss": 0.8086, + "step": 133800 + }, + { + "epoch": 0.8548739506535655, + "grad_norm": 0.9965303540229797, + "learning_rate": 6.1325122722222e-05, + "loss": 0.8848, + "step": 133810 + }, + { + "epoch": 0.8549378378033042, + "grad_norm": 0.9291059970855713, + "learning_rate": 6.132023538684687e-05, + "loss": 0.7339, + "step": 133820 + }, + { + "epoch": 0.8550017249530429, + "grad_norm": 0.7711076140403748, + "learning_rate": 6.131534793746716e-05, + "loss": 0.9099, + "step": 133830 + }, + { + "epoch": 0.8550656121027816, + "grad_norm": 0.9747552871704102, + "learning_rate": 6.131046037413211e-05, + "loss": 1.076, + "step": 133840 + }, + { + "epoch": 0.8551294992525204, + "grad_norm": 0.6776683330535889, + "learning_rate": 6.130557269689092e-05, + "loss": 0.9466, + "step": 133850 + }, + { + "epoch": 0.8551933864022591, + "grad_norm": 2.5817158222198486, + "learning_rate": 6.130068490579286e-05, + "loss": 0.7916, + "step": 133860 + }, + { + "epoch": 0.8552572735519978, + "grad_norm": 0.8578413724899292, + "learning_rate": 6.129579700088711e-05, + "loss": 0.8225, + "step": 133870 + }, + { + "epoch": 0.8553211607017365, + "grad_norm": 0.8771921992301941, + "learning_rate": 6.129090898222291e-05, + "loss": 0.8542, + "step": 
133880 + }, + { + "epoch": 0.8553850478514752, + "grad_norm": 0.8163079023361206, + "learning_rate": 6.128602084984951e-05, + "loss": 0.7394, + "step": 133890 + }, + { + "epoch": 0.8554489350012139, + "grad_norm": 1.6026225090026855, + "learning_rate": 6.128113260381611e-05, + "loss": 0.7461, + "step": 133900 + }, + { + "epoch": 0.8555128221509526, + "grad_norm": 1.152044415473938, + "learning_rate": 6.127624424417193e-05, + "loss": 0.9446, + "step": 133910 + }, + { + "epoch": 0.8555767093006913, + "grad_norm": 0.7472025156021118, + "learning_rate": 6.127135577096623e-05, + "loss": 0.7461, + "step": 133920 + }, + { + "epoch": 0.85564059645043, + "grad_norm": 0.7701200246810913, + "learning_rate": 6.126646718424822e-05, + "loss": 0.9566, + "step": 133930 + }, + { + "epoch": 0.8557044836001687, + "grad_norm": 0.626395583152771, + "learning_rate": 6.126157848406712e-05, + "loss": 0.6947, + "step": 133940 + }, + { + "epoch": 0.8557683707499074, + "grad_norm": 1.1391795873641968, + "learning_rate": 6.12566896704722e-05, + "loss": 0.8568, + "step": 133950 + }, + { + "epoch": 0.8558322578996461, + "grad_norm": 1.0339782238006592, + "learning_rate": 6.125180074351269e-05, + "loss": 0.9725, + "step": 133960 + }, + { + "epoch": 0.8558961450493847, + "grad_norm": 0.8343575596809387, + "learning_rate": 6.12469117032378e-05, + "loss": 1.1068, + "step": 133970 + }, + { + "epoch": 0.8559600321991234, + "grad_norm": 0.4701806604862213, + "learning_rate": 6.124202254969678e-05, + "loss": 0.7032, + "step": 133980 + }, + { + "epoch": 0.8560239193488621, + "grad_norm": 0.6901923418045044, + "learning_rate": 6.123713328293887e-05, + "loss": 0.9095, + "step": 133990 + }, + { + "epoch": 0.8560878064986008, + "grad_norm": 1.137757658958435, + "learning_rate": 6.123224390301329e-05, + "loss": 0.8695, + "step": 134000 + }, + { + "epoch": 0.8561516936483395, + "grad_norm": 1.1090092658996582, + "learning_rate": 6.122735440996931e-05, + "loss": 1.1309, + "step": 134010 + }, + { + "epoch": 
0.8562155807980782, + "grad_norm": 0.6597867608070374, + "learning_rate": 6.122246480385616e-05, + "loss": 1.0033, + "step": 134020 + }, + { + "epoch": 0.856279467947817, + "grad_norm": 0.6848984360694885, + "learning_rate": 6.121757508472308e-05, + "loss": 0.9416, + "step": 134030 + }, + { + "epoch": 0.8563433550975557, + "grad_norm": 0.9123812913894653, + "learning_rate": 6.12126852526193e-05, + "loss": 0.98, + "step": 134040 + }, + { + "epoch": 0.8564072422472944, + "grad_norm": 0.5774307250976562, + "learning_rate": 6.120779530759409e-05, + "loss": 0.7818, + "step": 134050 + }, + { + "epoch": 0.8564711293970331, + "grad_norm": 1.0986335277557373, + "learning_rate": 6.120290524969668e-05, + "loss": 0.7268, + "step": 134060 + }, + { + "epoch": 0.8565350165467718, + "grad_norm": 4.184600830078125, + "learning_rate": 6.119801507897634e-05, + "loss": 1.0409, + "step": 134070 + }, + { + "epoch": 0.8565989036965105, + "grad_norm": 0.8018998503684998, + "learning_rate": 6.119312479548229e-05, + "loss": 0.9884, + "step": 134080 + }, + { + "epoch": 0.8566627908462492, + "grad_norm": 0.6833622455596924, + "learning_rate": 6.118823439926379e-05, + "loss": 0.7314, + "step": 134090 + }, + { + "epoch": 0.8567266779959879, + "grad_norm": 3.289335012435913, + "learning_rate": 6.118334389037008e-05, + "loss": 1.3275, + "step": 134100 + }, + { + "epoch": 0.8567905651457266, + "grad_norm": 2.1931798458099365, + "learning_rate": 6.117845326885043e-05, + "loss": 1.194, + "step": 134110 + }, + { + "epoch": 0.8568544522954653, + "grad_norm": 0.985336422920227, + "learning_rate": 6.11735625347541e-05, + "loss": 0.9866, + "step": 134120 + }, + { + "epoch": 0.856918339445204, + "grad_norm": 1.144832968711853, + "learning_rate": 6.116867168813031e-05, + "loss": 0.7581, + "step": 134130 + }, + { + "epoch": 0.8569822265949427, + "grad_norm": 0.8992478251457214, + "learning_rate": 6.116378072902833e-05, + "loss": 0.7862, + "step": 134140 + }, + { + "epoch": 0.8570461137446814, + "grad_norm": 
0.4670437276363373, + "learning_rate": 6.115888965749744e-05, + "loss": 1.1326, + "step": 134150 + }, + { + "epoch": 0.8571100008944201, + "grad_norm": 0.688841700553894, + "learning_rate": 6.115399847358685e-05, + "loss": 0.7873, + "step": 134160 + }, + { + "epoch": 0.8571738880441588, + "grad_norm": 1.1751261949539185, + "learning_rate": 6.114910717734586e-05, + "loss": 0.8015, + "step": 134170 + }, + { + "epoch": 0.8572377751938975, + "grad_norm": 0.8896322846412659, + "learning_rate": 6.114421576882372e-05, + "loss": 1.0137, + "step": 134180 + }, + { + "epoch": 0.8573016623436363, + "grad_norm": 0.692790150642395, + "learning_rate": 6.113932424806969e-05, + "loss": 0.8688, + "step": 134190 + }, + { + "epoch": 0.857365549493375, + "grad_norm": 0.8335268497467041, + "learning_rate": 6.113443261513302e-05, + "loss": 0.9654, + "step": 134200 + }, + { + "epoch": 0.8574294366431136, + "grad_norm": 0.880637526512146, + "learning_rate": 6.112954087006297e-05, + "loss": 0.769, + "step": 134210 + }, + { + "epoch": 0.8574933237928523, + "grad_norm": 0.6039393544197083, + "learning_rate": 6.112464901290882e-05, + "loss": 0.9527, + "step": 134220 + }, + { + "epoch": 0.857557210942591, + "grad_norm": 1.0236138105392456, + "learning_rate": 6.111975704371984e-05, + "loss": 1.0147, + "step": 134230 + }, + { + "epoch": 0.8576210980923297, + "grad_norm": 1.451583743095398, + "learning_rate": 6.111486496254528e-05, + "loss": 0.7267, + "step": 134240 + }, + { + "epoch": 0.8576849852420684, + "grad_norm": 0.8510944843292236, + "learning_rate": 6.110997276943442e-05, + "loss": 1.272, + "step": 134250 + }, + { + "epoch": 0.8577488723918071, + "grad_norm": 0.7636389136314392, + "learning_rate": 6.110508046443652e-05, + "loss": 0.7339, + "step": 134260 + }, + { + "epoch": 0.8578127595415458, + "grad_norm": 0.7402799725532532, + "learning_rate": 6.110018804760085e-05, + "loss": 0.7794, + "step": 134270 + }, + { + "epoch": 0.8578766466912845, + "grad_norm": 0.8340638279914856, + 
"learning_rate": 6.109529551897669e-05, + "loss": 0.8052, + "step": 134280 + }, + { + "epoch": 0.8579405338410232, + "grad_norm": 1.0177710056304932, + "learning_rate": 6.109040287861331e-05, + "loss": 0.9876, + "step": 134290 + }, + { + "epoch": 0.8580044209907619, + "grad_norm": 0.8329386711120605, + "learning_rate": 6.108551012655996e-05, + "loss": 0.8806, + "step": 134300 + }, + { + "epoch": 0.8580683081405006, + "grad_norm": 0.7960025072097778, + "learning_rate": 6.108061726286596e-05, + "loss": 0.6743, + "step": 134310 + }, + { + "epoch": 0.8581321952902393, + "grad_norm": 1.751345157623291, + "learning_rate": 6.107572428758053e-05, + "loss": 0.9132, + "step": 134320 + }, + { + "epoch": 0.858196082439978, + "grad_norm": 0.8473448753356934, + "learning_rate": 6.1070831200753e-05, + "loss": 0.8131, + "step": 134330 + }, + { + "epoch": 0.8582599695897167, + "grad_norm": 0.8581190705299377, + "learning_rate": 6.10659380024326e-05, + "loss": 0.8037, + "step": 134340 + }, + { + "epoch": 0.8583238567394554, + "grad_norm": 0.964256227016449, + "learning_rate": 6.106104469266865e-05, + "loss": 0.6935, + "step": 134350 + }, + { + "epoch": 0.8583877438891941, + "grad_norm": 1.3757505416870117, + "learning_rate": 6.105615127151039e-05, + "loss": 0.7474, + "step": 134360 + }, + { + "epoch": 0.8584516310389328, + "grad_norm": 0.7548801898956299, + "learning_rate": 6.105125773900712e-05, + "loss": 0.8156, + "step": 134370 + }, + { + "epoch": 0.8585155181886716, + "grad_norm": 0.4941951036453247, + "learning_rate": 6.104636409520814e-05, + "loss": 0.7144, + "step": 134380 + }, + { + "epoch": 0.8585794053384103, + "grad_norm": 0.6011403799057007, + "learning_rate": 6.10414703401627e-05, + "loss": 1.0064, + "step": 134390 + }, + { + "epoch": 0.858643292488149, + "grad_norm": 1.6302911043167114, + "learning_rate": 6.103657647392012e-05, + "loss": 0.9264, + "step": 134400 + }, + { + "epoch": 0.8587071796378877, + "grad_norm": 0.9661455154418945, + "learning_rate": 
6.103168249652966e-05, + "loss": 1.068, + "step": 134410 + }, + { + "epoch": 0.8587710667876264, + "grad_norm": 1.3184993267059326, + "learning_rate": 6.1026788408040616e-05, + "loss": 0.8662, + "step": 134420 + }, + { + "epoch": 0.8588349539373651, + "grad_norm": 0.8495551943778992, + "learning_rate": 6.102189420850226e-05, + "loss": 0.8618, + "step": 134430 + }, + { + "epoch": 0.8588988410871038, + "grad_norm": 0.7962411642074585, + "learning_rate": 6.101699989796391e-05, + "loss": 1.0453, + "step": 134440 + }, + { + "epoch": 0.8589627282368424, + "grad_norm": 0.7486063241958618, + "learning_rate": 6.1012105476474835e-05, + "loss": 0.9581, + "step": 134450 + }, + { + "epoch": 0.8590266153865811, + "grad_norm": 0.8825688362121582, + "learning_rate": 6.100721094408434e-05, + "loss": 0.9215, + "step": 134460 + }, + { + "epoch": 0.8590905025363198, + "grad_norm": 0.9865175485610962, + "learning_rate": 6.100231630084169e-05, + "loss": 0.8866, + "step": 134470 + }, + { + "epoch": 0.8591543896860585, + "grad_norm": 0.7580648064613342, + "learning_rate": 6.099742154679621e-05, + "loss": 0.7733, + "step": 134480 + }, + { + "epoch": 0.8592182768357972, + "grad_norm": 0.9302807450294495, + "learning_rate": 6.099252668199718e-05, + "loss": 0.7856, + "step": 134490 + }, + { + "epoch": 0.8592821639855359, + "grad_norm": 0.8940306305885315, + "learning_rate": 6.098763170649389e-05, + "loss": 1.2527, + "step": 134500 + }, + { + "epoch": 0.8593460511352746, + "grad_norm": 1.2878268957138062, + "learning_rate": 6.0982736620335644e-05, + "loss": 0.8244, + "step": 134510 + }, + { + "epoch": 0.8594099382850133, + "grad_norm": 0.8814017176628113, + "learning_rate": 6.097784142357174e-05, + "loss": 1.0, + "step": 134520 + }, + { + "epoch": 0.859473825434752, + "grad_norm": 0.6660280823707581, + "learning_rate": 6.097294611625147e-05, + "loss": 0.7688, + "step": 134530 + }, + { + "epoch": 0.8595377125844907, + "grad_norm": 1.1975473165512085, + "learning_rate": 6.0968050698424154e-05, + 
"loss": 1.033, + "step": 134540 + }, + { + "epoch": 0.8596015997342294, + "grad_norm": 0.822115421295166, + "learning_rate": 6.0963155170139066e-05, + "loss": 0.8708, + "step": 134550 + }, + { + "epoch": 0.8596654868839682, + "grad_norm": 0.9180407524108887, + "learning_rate": 6.095825953144553e-05, + "loss": 0.9883, + "step": 134560 + }, + { + "epoch": 0.8597293740337069, + "grad_norm": 1.0190486907958984, + "learning_rate": 6.095336378239284e-05, + "loss": 0.8116, + "step": 134570 + }, + { + "epoch": 0.8597932611834456, + "grad_norm": 0.6985743045806885, + "learning_rate": 6.094846792303029e-05, + "loss": 0.8544, + "step": 134580 + }, + { + "epoch": 0.8598571483331843, + "grad_norm": 0.9220016598701477, + "learning_rate": 6.0943571953407205e-05, + "loss": 1.1432, + "step": 134590 + }, + { + "epoch": 0.859921035482923, + "grad_norm": 1.1213401556015015, + "learning_rate": 6.093867587357288e-05, + "loss": 1.1309, + "step": 134600 + }, + { + "epoch": 0.8599849226326617, + "grad_norm": 0.7928663492202759, + "learning_rate": 6.093377968357663e-05, + "loss": 0.8537, + "step": 134610 + }, + { + "epoch": 0.8600488097824004, + "grad_norm": 1.3430203199386597, + "learning_rate": 6.092888338346775e-05, + "loss": 0.8679, + "step": 134620 + }, + { + "epoch": 0.8601126969321391, + "grad_norm": 0.7503795027732849, + "learning_rate": 6.0923986973295564e-05, + "loss": 1.0493, + "step": 134630 + }, + { + "epoch": 0.8601765840818778, + "grad_norm": 0.9265238046646118, + "learning_rate": 6.091909045310938e-05, + "loss": 0.9348, + "step": 134640 + }, + { + "epoch": 0.8602404712316165, + "grad_norm": 0.69620680809021, + "learning_rate": 6.091419382295851e-05, + "loss": 0.8313, + "step": 134650 + }, + { + "epoch": 0.8603043583813552, + "grad_norm": 0.8834882974624634, + "learning_rate": 6.090929708289227e-05, + "loss": 0.7323, + "step": 134660 + }, + { + "epoch": 0.8603682455310939, + "grad_norm": 0.9060829281806946, + "learning_rate": 6.0904400232959965e-05, + "loss": 0.8098, + 
"step": 134670 + }, + { + "epoch": 0.8604321326808326, + "grad_norm": 0.8208954930305481, + "learning_rate": 6.089950327321092e-05, + "loss": 0.9883, + "step": 134680 + }, + { + "epoch": 0.8604960198305712, + "grad_norm": 0.8429823517799377, + "learning_rate": 6.089460620369444e-05, + "loss": 0.9104, + "step": 134690 + }, + { + "epoch": 0.8605599069803099, + "grad_norm": 0.753400981426239, + "learning_rate": 6.088970902445985e-05, + "loss": 0.8429, + "step": 134700 + }, + { + "epoch": 0.8606237941300486, + "grad_norm": 1.0955417156219482, + "learning_rate": 6.088481173555648e-05, + "loss": 1.0817, + "step": 134710 + }, + { + "epoch": 0.8606876812797873, + "grad_norm": 1.585567831993103, + "learning_rate": 6.087991433703363e-05, + "loss": 0.8062, + "step": 134720 + }, + { + "epoch": 0.860751568429526, + "grad_norm": 0.6598794460296631, + "learning_rate": 6.0875016828940635e-05, + "loss": 0.8811, + "step": 134730 + }, + { + "epoch": 0.8608154555792648, + "grad_norm": 0.5704881548881531, + "learning_rate": 6.08701192113268e-05, + "loss": 0.7517, + "step": 134740 + }, + { + "epoch": 0.8608793427290035, + "grad_norm": 0.8759427666664124, + "learning_rate": 6.086522148424148e-05, + "loss": 0.874, + "step": 134750 + }, + { + "epoch": 0.8609432298787422, + "grad_norm": 0.7344384789466858, + "learning_rate": 6.086032364773396e-05, + "loss": 1.0344, + "step": 134760 + }, + { + "epoch": 0.8610071170284809, + "grad_norm": 1.3539067506790161, + "learning_rate": 6.0855425701853596e-05, + "loss": 1.1221, + "step": 134770 + }, + { + "epoch": 0.8610710041782196, + "grad_norm": 0.85466068983078, + "learning_rate": 6.08505276466497e-05, + "loss": 0.9896, + "step": 134780 + }, + { + "epoch": 0.8611348913279583, + "grad_norm": 1.4604058265686035, + "learning_rate": 6.0845629482171626e-05, + "loss": 0.762, + "step": 134790 + }, + { + "epoch": 0.861198778477697, + "grad_norm": 0.6832066178321838, + "learning_rate": 6.084073120846866e-05, + "loss": 0.8548, + "step": 134800 + }, + { + 
"epoch": 0.8612626656274357, + "grad_norm": 0.6687494516372681, + "learning_rate": 6.083583282559016e-05, + "loss": 1.1136, + "step": 134810 + }, + { + "epoch": 0.8613265527771744, + "grad_norm": 0.9443023800849915, + "learning_rate": 6.083093433358544e-05, + "loss": 1.0468, + "step": 134820 + }, + { + "epoch": 0.8613904399269131, + "grad_norm": 0.6644616723060608, + "learning_rate": 6.082603573250384e-05, + "loss": 0.7965, + "step": 134830 + }, + { + "epoch": 0.8614543270766518, + "grad_norm": 0.6500226855278015, + "learning_rate": 6.0821137022394705e-05, + "loss": 1.0983, + "step": 134840 + }, + { + "epoch": 0.8615182142263905, + "grad_norm": 1.1436829566955566, + "learning_rate": 6.0816238203307355e-05, + "loss": 1.0032, + "step": 134850 + }, + { + "epoch": 0.8615821013761292, + "grad_norm": 0.5248997211456299, + "learning_rate": 6.081133927529112e-05, + "loss": 0.7732, + "step": 134860 + }, + { + "epoch": 0.8616459885258679, + "grad_norm": 0.8002848625183105, + "learning_rate": 6.0806440238395347e-05, + "loss": 0.8625, + "step": 134870 + }, + { + "epoch": 0.8617098756756066, + "grad_norm": 1.1468842029571533, + "learning_rate": 6.080154109266938e-05, + "loss": 0.6841, + "step": 134880 + }, + { + "epoch": 0.8617737628253453, + "grad_norm": 1.0037697553634644, + "learning_rate": 6.0796641838162546e-05, + "loss": 0.83, + "step": 134890 + }, + { + "epoch": 0.861837649975084, + "grad_norm": 1.227607250213623, + "learning_rate": 6.0791742474924175e-05, + "loss": 0.9535, + "step": 134900 + }, + { + "epoch": 0.8619015371248228, + "grad_norm": 0.9437126517295837, + "learning_rate": 6.0786843003003636e-05, + "loss": 1.2439, + "step": 134910 + }, + { + "epoch": 0.8619654242745615, + "grad_norm": 1.4104220867156982, + "learning_rate": 6.078194342245025e-05, + "loss": 1.1915, + "step": 134920 + }, + { + "epoch": 0.8620293114243002, + "grad_norm": 0.9898942112922668, + "learning_rate": 6.0777043733313375e-05, + "loss": 0.8426, + "step": 134930 + }, + { + "epoch": 
0.8620931985740388, + "grad_norm": 1.2425917387008667, + "learning_rate": 6.077214393564234e-05, + "loss": 0.7399, + "step": 134940 + }, + { + "epoch": 0.8621570857237775, + "grad_norm": 0.8567221760749817, + "learning_rate": 6.07672440294865e-05, + "loss": 0.8017, + "step": 134950 + }, + { + "epoch": 0.8622209728735162, + "grad_norm": 0.8979184031486511, + "learning_rate": 6.07623440148952e-05, + "loss": 1.1169, + "step": 134960 + }, + { + "epoch": 0.8622848600232549, + "grad_norm": 0.6733188033103943, + "learning_rate": 6.075744389191778e-05, + "loss": 0.9605, + "step": 134970 + }, + { + "epoch": 0.8623487471729936, + "grad_norm": 1.6359256505966187, + "learning_rate": 6.0752543660603587e-05, + "loss": 0.7852, + "step": 134980 + }, + { + "epoch": 0.8624126343227323, + "grad_norm": 1.0377789735794067, + "learning_rate": 6.074764332100199e-05, + "loss": 1.1418, + "step": 134990 + }, + { + "epoch": 0.862476521472471, + "grad_norm": 0.5584946274757385, + "learning_rate": 6.074274287316232e-05, + "loss": 0.6663, + "step": 135000 + }, + { + "epoch": 0.8625404086222097, + "grad_norm": 0.936705470085144, + "learning_rate": 6.073784231713393e-05, + "loss": 0.9404, + "step": 135010 + }, + { + "epoch": 0.8626042957719484, + "grad_norm": 0.8763816356658936, + "learning_rate": 6.0732941652966194e-05, + "loss": 0.7721, + "step": 135020 + }, + { + "epoch": 0.8626681829216871, + "grad_norm": 2.3810360431671143, + "learning_rate": 6.072804088070844e-05, + "loss": 0.6623, + "step": 135030 + }, + { + "epoch": 0.8627320700714258, + "grad_norm": 1.851711392402649, + "learning_rate": 6.0723140000410036e-05, + "loss": 0.8857, + "step": 135040 + }, + { + "epoch": 0.8627959572211645, + "grad_norm": 0.9787930846214294, + "learning_rate": 6.0718239012120334e-05, + "loss": 0.9424, + "step": 135050 + }, + { + "epoch": 0.8628598443709032, + "grad_norm": 1.1540073156356812, + "learning_rate": 6.071333791588868e-05, + "loss": 0.8247, + "step": 135060 + }, + { + "epoch": 0.862923731520642, + 
"grad_norm": 0.7594394087791443, + "learning_rate": 6.0708436711764464e-05, + "loss": 1.098, + "step": 135070 + }, + { + "epoch": 0.8629876186703807, + "grad_norm": 0.9598045349121094, + "learning_rate": 6.070353539979702e-05, + "loss": 0.7815, + "step": 135080 + }, + { + "epoch": 0.8630515058201194, + "grad_norm": 0.8562808632850647, + "learning_rate": 6.069863398003571e-05, + "loss": 0.9166, + "step": 135090 + }, + { + "epoch": 0.8631153929698581, + "grad_norm": 1.306075930595398, + "learning_rate": 6.0693732452529906e-05, + "loss": 0.7203, + "step": 135100 + }, + { + "epoch": 0.8631792801195968, + "grad_norm": 0.9534823298454285, + "learning_rate": 6.0688830817328955e-05, + "loss": 0.9033, + "step": 135110 + }, + { + "epoch": 0.8632431672693355, + "grad_norm": 0.8284388184547424, + "learning_rate": 6.068392907448224e-05, + "loss": 0.7617, + "step": 135120 + }, + { + "epoch": 0.8633070544190742, + "grad_norm": 0.9172964096069336, + "learning_rate": 6.067902722403912e-05, + "loss": 0.7122, + "step": 135130 + }, + { + "epoch": 0.8633709415688129, + "grad_norm": 1.129095435142517, + "learning_rate": 6.067412526604894e-05, + "loss": 1.0913, + "step": 135140 + }, + { + "epoch": 0.8634348287185516, + "grad_norm": 0.673621416091919, + "learning_rate": 6.06692232005611e-05, + "loss": 0.9686, + "step": 135150 + }, + { + "epoch": 0.8634987158682903, + "grad_norm": 0.5581203699111938, + "learning_rate": 6.066432102762495e-05, + "loss": 1.0076, + "step": 135160 + }, + { + "epoch": 0.863562603018029, + "grad_norm": 0.8796345591545105, + "learning_rate": 6.0659418747289864e-05, + "loss": 0.8523, + "step": 135170 + }, + { + "epoch": 0.8636264901677676, + "grad_norm": 0.9877477288246155, + "learning_rate": 6.06545163596052e-05, + "loss": 0.8171, + "step": 135180 + }, + { + "epoch": 0.8636903773175063, + "grad_norm": 1.176950216293335, + "learning_rate": 6.0649613864620345e-05, + "loss": 0.7933, + "step": 135190 + }, + { + "epoch": 0.863754264467245, + "grad_norm": 
1.119374394416809, + "learning_rate": 6.064471126238467e-05, + "loss": 0.9705, + "step": 135200 + }, + { + "epoch": 0.8638181516169837, + "grad_norm": 0.757959246635437, + "learning_rate": 6.063980855294753e-05, + "loss": 0.9029, + "step": 135210 + }, + { + "epoch": 0.8638820387667224, + "grad_norm": 0.6737072467803955, + "learning_rate": 6.0634905736358326e-05, + "loss": 0.9489, + "step": 135220 + }, + { + "epoch": 0.8639459259164611, + "grad_norm": 0.7668609619140625, + "learning_rate": 6.063000281266641e-05, + "loss": 0.9851, + "step": 135230 + }, + { + "epoch": 0.8640098130661998, + "grad_norm": 0.9595603346824646, + "learning_rate": 6.062509978192118e-05, + "loss": 1.0468, + "step": 135240 + }, + { + "epoch": 0.8640737002159385, + "grad_norm": 1.4497737884521484, + "learning_rate": 6.062019664417199e-05, + "loss": 0.9572, + "step": 135250 + }, + { + "epoch": 0.8641375873656773, + "grad_norm": 0.8604928851127625, + "learning_rate": 6.061529339946824e-05, + "loss": 0.9146, + "step": 135260 + }, + { + "epoch": 0.864201474515416, + "grad_norm": 0.8765950798988342, + "learning_rate": 6.061039004785929e-05, + "loss": 0.7102, + "step": 135270 + }, + { + "epoch": 0.8642653616651547, + "grad_norm": 0.523587703704834, + "learning_rate": 6.060548658939456e-05, + "loss": 0.7571, + "step": 135280 + }, + { + "epoch": 0.8643292488148934, + "grad_norm": 0.93330979347229, + "learning_rate": 6.0600583024123394e-05, + "loss": 0.859, + "step": 135290 + }, + { + "epoch": 0.8643931359646321, + "grad_norm": 0.9948300123214722, + "learning_rate": 6.059567935209518e-05, + "loss": 0.8673, + "step": 135300 + }, + { + "epoch": 0.8644570231143708, + "grad_norm": 0.9706994295120239, + "learning_rate": 6.059077557335931e-05, + "loss": 0.7599, + "step": 135310 + }, + { + "epoch": 0.8645209102641095, + "grad_norm": 1.1021097898483276, + "learning_rate": 6.058587168796517e-05, + "loss": 0.7995, + "step": 135320 + }, + { + "epoch": 0.8645847974138482, + "grad_norm": 0.8516930937767029, + 
"learning_rate": 6.058096769596213e-05, + "loss": 0.9658, + "step": 135330 + }, + { + "epoch": 0.8646486845635869, + "grad_norm": 0.8828617930412292, + "learning_rate": 6.0576063597399615e-05, + "loss": 0.7387, + "step": 135340 + }, + { + "epoch": 0.8647125717133256, + "grad_norm": 0.8962679505348206, + "learning_rate": 6.0571159392326974e-05, + "loss": 1.0775, + "step": 135350 + }, + { + "epoch": 0.8647764588630643, + "grad_norm": 0.678126871585846, + "learning_rate": 6.056625508079361e-05, + "loss": 0.9041, + "step": 135360 + }, + { + "epoch": 0.864840346012803, + "grad_norm": 1.034623146057129, + "learning_rate": 6.056135066284893e-05, + "loss": 1.1352, + "step": 135370 + }, + { + "epoch": 0.8649042331625417, + "grad_norm": 0.7552897930145264, + "learning_rate": 6.05564461385423e-05, + "loss": 0.9906, + "step": 135380 + }, + { + "epoch": 0.8649681203122804, + "grad_norm": 1.2172116041183472, + "learning_rate": 6.055154150792313e-05, + "loss": 0.8446, + "step": 135390 + }, + { + "epoch": 0.8650320074620191, + "grad_norm": 0.5961598753929138, + "learning_rate": 6.054663677104081e-05, + "loss": 0.7268, + "step": 135400 + }, + { + "epoch": 0.8650958946117578, + "grad_norm": 0.595866322517395, + "learning_rate": 6.0541731927944734e-05, + "loss": 0.7465, + "step": 135410 + }, + { + "epoch": 0.8651597817614964, + "grad_norm": 0.8760963678359985, + "learning_rate": 6.0536826978684294e-05, + "loss": 0.8472, + "step": 135420 + }, + { + "epoch": 0.8652236689112351, + "grad_norm": 0.7106996774673462, + "learning_rate": 6.0531921923308874e-05, + "loss": 0.979, + "step": 135430 + }, + { + "epoch": 0.8652875560609739, + "grad_norm": 0.6956402063369751, + "learning_rate": 6.052701676186791e-05, + "loss": 0.9007, + "step": 135440 + }, + { + "epoch": 0.8653514432107126, + "grad_norm": 2.090533971786499, + "learning_rate": 6.0522111494410785e-05, + "loss": 0.8266, + "step": 135450 + }, + { + "epoch": 0.8654153303604513, + "grad_norm": 0.7869872450828552, + "learning_rate": 
6.051720612098688e-05, + "loss": 0.6341, + "step": 135460 + }, + { + "epoch": 0.86547921751019, + "grad_norm": 0.6018970012664795, + "learning_rate": 6.051230064164561e-05, + "loss": 0.7308, + "step": 135470 + }, + { + "epoch": 0.8655431046599287, + "grad_norm": 0.8730195164680481, + "learning_rate": 6.050739505643639e-05, + "loss": 1.086, + "step": 135480 + }, + { + "epoch": 0.8656069918096674, + "grad_norm": 0.8639483451843262, + "learning_rate": 6.050248936540861e-05, + "loss": 1.0237, + "step": 135490 + }, + { + "epoch": 0.8656708789594061, + "grad_norm": 0.7378480434417725, + "learning_rate": 6.0497583568611674e-05, + "loss": 0.8411, + "step": 135500 + }, + { + "epoch": 0.8657347661091448, + "grad_norm": 0.876330554485321, + "learning_rate": 6.049267766609499e-05, + "loss": 1.0058, + "step": 135510 + }, + { + "epoch": 0.8657986532588835, + "grad_norm": 0.6682674884796143, + "learning_rate": 6.0487771657907974e-05, + "loss": 0.8205, + "step": 135520 + }, + { + "epoch": 0.8658625404086222, + "grad_norm": 1.085636019706726, + "learning_rate": 6.048286554410001e-05, + "loss": 0.9317, + "step": 135530 + }, + { + "epoch": 0.8659264275583609, + "grad_norm": 0.9509056210517883, + "learning_rate": 6.047795932472052e-05, + "loss": 0.7042, + "step": 135540 + }, + { + "epoch": 0.8659903147080996, + "grad_norm": 1.0311496257781982, + "learning_rate": 6.0473052999818925e-05, + "loss": 1.0667, + "step": 135550 + }, + { + "epoch": 0.8660542018578383, + "grad_norm": 1.0395874977111816, + "learning_rate": 6.0468146569444615e-05, + "loss": 1.0347, + "step": 135560 + }, + { + "epoch": 0.866118089007577, + "grad_norm": 0.933964729309082, + "learning_rate": 6.0463240033647025e-05, + "loss": 0.9422, + "step": 135570 + }, + { + "epoch": 0.8661819761573157, + "grad_norm": 0.8697935342788696, + "learning_rate": 6.045833339247555e-05, + "loss": 1.1477, + "step": 135580 + }, + { + "epoch": 0.8662458633070544, + "grad_norm": 0.7333647012710571, + "learning_rate": 6.045342664597959e-05, + 
"loss": 0.8173, + "step": 135590 + }, + { + "epoch": 0.8663097504567931, + "grad_norm": 0.761461079120636, + "learning_rate": 6.04485197942086e-05, + "loss": 0.9989, + "step": 135600 + }, + { + "epoch": 0.8663736376065319, + "grad_norm": 0.7777496576309204, + "learning_rate": 6.0443612837211984e-05, + "loss": 0.9211, + "step": 135610 + }, + { + "epoch": 0.8664375247562706, + "grad_norm": 0.8241527080535889, + "learning_rate": 6.043870577503914e-05, + "loss": 0.8573, + "step": 135620 + }, + { + "epoch": 0.8665014119060093, + "grad_norm": 1.33556067943573, + "learning_rate": 6.04337986077395e-05, + "loss": 0.88, + "step": 135630 + }, + { + "epoch": 0.866565299055748, + "grad_norm": 0.8477666974067688, + "learning_rate": 6.0428891335362484e-05, + "loss": 0.9298, + "step": 135640 + }, + { + "epoch": 0.8666291862054867, + "grad_norm": 1.5744928121566772, + "learning_rate": 6.0423983957957505e-05, + "loss": 0.9288, + "step": 135650 + }, + { + "epoch": 0.8666930733552254, + "grad_norm": 1.1519935131072998, + "learning_rate": 6.041907647557399e-05, + "loss": 0.9664, + "step": 135660 + }, + { + "epoch": 0.866756960504964, + "grad_norm": 0.649913489818573, + "learning_rate": 6.041416888826137e-05, + "loss": 0.9266, + "step": 135670 + }, + { + "epoch": 0.8668208476547027, + "grad_norm": 0.9466597437858582, + "learning_rate": 6.040926119606906e-05, + "loss": 0.8896, + "step": 135680 + }, + { + "epoch": 0.8668847348044414, + "grad_norm": 0.5539588332176208, + "learning_rate": 6.040435339904646e-05, + "loss": 0.9554, + "step": 135690 + }, + { + "epoch": 0.8669486219541801, + "grad_norm": 1.099380373954773, + "learning_rate": 6.039944549724305e-05, + "loss": 0.8488, + "step": 135700 + }, + { + "epoch": 0.8670125091039188, + "grad_norm": 0.6397864818572998, + "learning_rate": 6.0394537490708216e-05, + "loss": 0.9452, + "step": 135710 + }, + { + "epoch": 0.8670763962536575, + "grad_norm": 0.7527474164962769, + "learning_rate": 6.0389629379491395e-05, + "loss": 1.1831, + "step": 
135720 + }, + { + "epoch": 0.8671402834033962, + "grad_norm": 1.0391385555267334, + "learning_rate": 6.0384721163642024e-05, + "loss": 0.951, + "step": 135730 + }, + { + "epoch": 0.8672041705531349, + "grad_norm": 0.9291607737541199, + "learning_rate": 6.0379812843209515e-05, + "loss": 0.8248, + "step": 135740 + }, + { + "epoch": 0.8672680577028736, + "grad_norm": 0.8267570734024048, + "learning_rate": 6.0374904418243315e-05, + "loss": 0.7667, + "step": 135750 + }, + { + "epoch": 0.8673319448526123, + "grad_norm": 1.5941237211227417, + "learning_rate": 6.0369995888792863e-05, + "loss": 0.8021, + "step": 135760 + }, + { + "epoch": 0.867395832002351, + "grad_norm": 0.5440096259117126, + "learning_rate": 6.036508725490757e-05, + "loss": 0.6538, + "step": 135770 + }, + { + "epoch": 0.8674597191520897, + "grad_norm": 1.1164497137069702, + "learning_rate": 6.036017851663689e-05, + "loss": 0.719, + "step": 135780 + }, + { + "epoch": 0.8675236063018285, + "grad_norm": 0.8377860188484192, + "learning_rate": 6.035526967403023e-05, + "loss": 0.9899, + "step": 135790 + }, + { + "epoch": 0.8675874934515672, + "grad_norm": 0.7305436730384827, + "learning_rate": 6.035036072713707e-05, + "loss": 0.744, + "step": 135800 + }, + { + "epoch": 0.8676513806013059, + "grad_norm": 1.4228670597076416, + "learning_rate": 6.034545167600682e-05, + "loss": 0.8398, + "step": 135810 + }, + { + "epoch": 0.8677152677510446, + "grad_norm": 0.7904695868492126, + "learning_rate": 6.0340542520688904e-05, + "loss": 0.8431, + "step": 135820 + }, + { + "epoch": 0.8677791549007833, + "grad_norm": 1.2709144353866577, + "learning_rate": 6.03356332612328e-05, + "loss": 0.8764, + "step": 135830 + }, + { + "epoch": 0.867843042050522, + "grad_norm": 0.8756301999092102, + "learning_rate": 6.03307238976879e-05, + "loss": 0.782, + "step": 135840 + }, + { + "epoch": 0.8679069292002607, + "grad_norm": 0.5899653434753418, + "learning_rate": 6.03258144301037e-05, + "loss": 0.7944, + "step": 135850 + }, + { + "epoch": 
0.8679708163499994, + "grad_norm": 0.976030707359314, + "learning_rate": 6.03209048585296e-05, + "loss": 0.8582, + "step": 135860 + }, + { + "epoch": 0.8680347034997381, + "grad_norm": 1.095521092414856, + "learning_rate": 6.0315995183015064e-05, + "loss": 0.9034, + "step": 135870 + }, + { + "epoch": 0.8680985906494768, + "grad_norm": 0.8074119091033936, + "learning_rate": 6.031108540360954e-05, + "loss": 0.8167, + "step": 135880 + }, + { + "epoch": 0.8681624777992155, + "grad_norm": 1.544575810432434, + "learning_rate": 6.0306175520362454e-05, + "loss": 0.8362, + "step": 135890 + }, + { + "epoch": 0.8682263649489542, + "grad_norm": 0.7311546802520752, + "learning_rate": 6.030126553332327e-05, + "loss": 1.0513, + "step": 135900 + }, + { + "epoch": 0.8682902520986928, + "grad_norm": 1.0786371231079102, + "learning_rate": 6.029635544254143e-05, + "loss": 1.0181, + "step": 135910 + }, + { + "epoch": 0.8683541392484315, + "grad_norm": 1.0580967664718628, + "learning_rate": 6.029144524806638e-05, + "loss": 0.7061, + "step": 135920 + }, + { + "epoch": 0.8684180263981702, + "grad_norm": 0.9200608730316162, + "learning_rate": 6.028653494994757e-05, + "loss": 0.9578, + "step": 135930 + }, + { + "epoch": 0.8684819135479089, + "grad_norm": 1.2529308795928955, + "learning_rate": 6.028162454823446e-05, + "loss": 0.878, + "step": 135940 + }, + { + "epoch": 0.8685458006976476, + "grad_norm": 0.6350985765457153, + "learning_rate": 6.0276714042976504e-05, + "loss": 0.8528, + "step": 135950 + }, + { + "epoch": 0.8686096878473863, + "grad_norm": 1.1729838848114014, + "learning_rate": 6.0271803434223115e-05, + "loss": 0.8492, + "step": 135960 + }, + { + "epoch": 0.868673574997125, + "grad_norm": 0.679898738861084, + "learning_rate": 6.02668927220238e-05, + "loss": 1.0879, + "step": 135970 + }, + { + "epoch": 0.8687374621468638, + "grad_norm": 0.9746125936508179, + "learning_rate": 6.0261981906428e-05, + "loss": 0.9354, + "step": 135980 + }, + { + "epoch": 0.8688013492966025, + 
"grad_norm": 0.8682552576065063, + "learning_rate": 6.0257070987485166e-05, + "loss": 0.9114, + "step": 135990 + }, + { + "epoch": 0.8688652364463412, + "grad_norm": 0.879461944103241, + "learning_rate": 6.025215996524474e-05, + "loss": 0.7752, + "step": 136000 + }, + { + "epoch": 0.8689291235960799, + "grad_norm": 1.0179787874221802, + "learning_rate": 6.024724883975621e-05, + "loss": 0.9302, + "step": 136010 + }, + { + "epoch": 0.8689930107458186, + "grad_norm": 0.7405498623847961, + "learning_rate": 6.024233761106901e-05, + "loss": 0.8306, + "step": 136020 + }, + { + "epoch": 0.8690568978955573, + "grad_norm": 0.9917730689048767, + "learning_rate": 6.023742627923261e-05, + "loss": 0.8827, + "step": 136030 + }, + { + "epoch": 0.869120785045296, + "grad_norm": 1.0026957988739014, + "learning_rate": 6.023251484429647e-05, + "loss": 0.9303, + "step": 136040 + }, + { + "epoch": 0.8691846721950347, + "grad_norm": 0.6799507141113281, + "learning_rate": 6.022760330631005e-05, + "loss": 0.7576, + "step": 136050 + }, + { + "epoch": 0.8692485593447734, + "grad_norm": 0.7701660990715027, + "learning_rate": 6.0222691665322815e-05, + "loss": 0.7309, + "step": 136060 + }, + { + "epoch": 0.8693124464945121, + "grad_norm": 0.7990044355392456, + "learning_rate": 6.0217779921384246e-05, + "loss": 0.9324, + "step": 136070 + }, + { + "epoch": 0.8693763336442508, + "grad_norm": 0.8976256251335144, + "learning_rate": 6.0212868074543785e-05, + "loss": 0.855, + "step": 136080 + }, + { + "epoch": 0.8694402207939895, + "grad_norm": 1.0746898651123047, + "learning_rate": 6.02079561248509e-05, + "loss": 0.8045, + "step": 136090 + }, + { + "epoch": 0.8695041079437282, + "grad_norm": 1.292189359664917, + "learning_rate": 6.0203044072355065e-05, + "loss": 0.8419, + "step": 136100 + }, + { + "epoch": 0.8695679950934669, + "grad_norm": 0.673413097858429, + "learning_rate": 6.019813191710576e-05, + "loss": 0.7643, + "step": 136110 + }, + { + "epoch": 0.8696318822432056, + "grad_norm": 
0.834862232208252, + "learning_rate": 6.0193219659152424e-05, + "loss": 0.8355, + "step": 136120 + }, + { + "epoch": 0.8696957693929444, + "grad_norm": 0.6725580096244812, + "learning_rate": 6.018830729854457e-05, + "loss": 1.1082, + "step": 136130 + }, + { + "epoch": 0.8697596565426831, + "grad_norm": 1.442153811454773, + "learning_rate": 6.018339483533163e-05, + "loss": 0.9068, + "step": 136140 + }, + { + "epoch": 0.8698235436924217, + "grad_norm": 1.0553643703460693, + "learning_rate": 6.017848226956311e-05, + "loss": 0.7979, + "step": 136150 + }, + { + "epoch": 0.8698874308421604, + "grad_norm": 1.0946028232574463, + "learning_rate": 6.017356960128846e-05, + "loss": 0.8663, + "step": 136160 + }, + { + "epoch": 0.8699513179918991, + "grad_norm": 1.3556715250015259, + "learning_rate": 6.0168656830557165e-05, + "loss": 0.9034, + "step": 136170 + }, + { + "epoch": 0.8700152051416378, + "grad_norm": 0.47752645611763, + "learning_rate": 6.016374395741869e-05, + "loss": 0.9813, + "step": 136180 + }, + { + "epoch": 0.8700790922913765, + "grad_norm": 1.1327354907989502, + "learning_rate": 6.0158830981922544e-05, + "loss": 0.838, + "step": 136190 + }, + { + "epoch": 0.8701429794411152, + "grad_norm": 0.8484867811203003, + "learning_rate": 6.0153917904118164e-05, + "loss": 0.8683, + "step": 136200 + }, + { + "epoch": 0.8702068665908539, + "grad_norm": 0.7545785307884216, + "learning_rate": 6.0149004724055046e-05, + "loss": 0.9844, + "step": 136210 + }, + { + "epoch": 0.8702707537405926, + "grad_norm": 0.7116890549659729, + "learning_rate": 6.0144091441782666e-05, + "loss": 0.8028, + "step": 136220 + }, + { + "epoch": 0.8703346408903313, + "grad_norm": 0.5172243714332581, + "learning_rate": 6.013917805735052e-05, + "loss": 0.8459, + "step": 136230 + }, + { + "epoch": 0.87039852804007, + "grad_norm": 0.9006187319755554, + "learning_rate": 6.0134264570808076e-05, + "loss": 1.1869, + "step": 136240 + }, + { + "epoch": 0.8704624151898087, + "grad_norm": 1.2033772468566895, + 
"learning_rate": 6.012935098220483e-05, + "loss": 0.7985, + "step": 136250 + }, + { + "epoch": 0.8705263023395474, + "grad_norm": 0.8718836903572083, + "learning_rate": 6.012443729159025e-05, + "loss": 0.8137, + "step": 136260 + }, + { + "epoch": 0.8705901894892861, + "grad_norm": 1.3549836874008179, + "learning_rate": 6.011952349901382e-05, + "loss": 1.1543, + "step": 136270 + }, + { + "epoch": 0.8706540766390248, + "grad_norm": 1.342417597770691, + "learning_rate": 6.011460960452503e-05, + "loss": 0.8184, + "step": 136280 + }, + { + "epoch": 0.8707179637887635, + "grad_norm": 0.5869442224502563, + "learning_rate": 6.010969560817338e-05, + "loss": 0.7865, + "step": 136290 + }, + { + "epoch": 0.8707818509385022, + "grad_norm": 1.237336277961731, + "learning_rate": 6.0104781510008345e-05, + "loss": 0.8941, + "step": 136300 + }, + { + "epoch": 0.870845738088241, + "grad_norm": 0.9919825792312622, + "learning_rate": 6.0099867310079416e-05, + "loss": 0.9696, + "step": 136310 + }, + { + "epoch": 0.8709096252379797, + "grad_norm": 0.6661075949668884, + "learning_rate": 6.0094953008436094e-05, + "loss": 0.7271, + "step": 136320 + }, + { + "epoch": 0.8709735123877184, + "grad_norm": 1.1574594974517822, + "learning_rate": 6.009003860512785e-05, + "loss": 0.6527, + "step": 136330 + }, + { + "epoch": 0.8710373995374571, + "grad_norm": 0.8676467537879944, + "learning_rate": 6.0085124100204205e-05, + "loss": 0.7372, + "step": 136340 + }, + { + "epoch": 0.8711012866871958, + "grad_norm": 0.6834307312965393, + "learning_rate": 6.0080209493714626e-05, + "loss": 0.9976, + "step": 136350 + }, + { + "epoch": 0.8711651738369345, + "grad_norm": 0.8142191767692566, + "learning_rate": 6.0075294785708617e-05, + "loss": 0.9738, + "step": 136360 + }, + { + "epoch": 0.8712290609866732, + "grad_norm": 1.038397192955017, + "learning_rate": 6.007037997623567e-05, + "loss": 0.78, + "step": 136370 + }, + { + "epoch": 0.8712929481364119, + "grad_norm": 1.5472460985183716, + "learning_rate": 
6.006546506534529e-05, + "loss": 0.6741, + "step": 136380 + }, + { + "epoch": 0.8713568352861505, + "grad_norm": 0.9952694177627563, + "learning_rate": 6.006055005308697e-05, + "loss": 0.7892, + "step": 136390 + }, + { + "epoch": 0.8714207224358892, + "grad_norm": 1.1230021715164185, + "learning_rate": 6.005563493951021e-05, + "loss": 0.9274, + "step": 136400 + }, + { + "epoch": 0.8714846095856279, + "grad_norm": 1.2984684705734253, + "learning_rate": 6.005071972466449e-05, + "loss": 0.8563, + "step": 136410 + }, + { + "epoch": 0.8715484967353666, + "grad_norm": 0.9113028049468994, + "learning_rate": 6.004580440859934e-05, + "loss": 0.8594, + "step": 136420 + }, + { + "epoch": 0.8716123838851053, + "grad_norm": 1.0389131307601929, + "learning_rate": 6.0040888991364255e-05, + "loss": 1.0333, + "step": 136430 + }, + { + "epoch": 0.871676271034844, + "grad_norm": 0.7670816779136658, + "learning_rate": 6.003597347300872e-05, + "loss": 0.833, + "step": 136440 + }, + { + "epoch": 0.8717401581845827, + "grad_norm": 0.7277560234069824, + "learning_rate": 6.003105785358225e-05, + "loss": 1.0034, + "step": 136450 + }, + { + "epoch": 0.8718040453343214, + "grad_norm": 0.8484509587287903, + "learning_rate": 6.0026142133134354e-05, + "loss": 1.1544, + "step": 136460 + }, + { + "epoch": 0.8718679324840601, + "grad_norm": 1.0106443166732788, + "learning_rate": 6.0021226311714526e-05, + "loss": 0.7397, + "step": 136470 + }, + { + "epoch": 0.8719318196337988, + "grad_norm": 2.2280375957489014, + "learning_rate": 6.0016310389372275e-05, + "loss": 0.987, + "step": 136480 + }, + { + "epoch": 0.8719957067835375, + "grad_norm": 0.9950495958328247, + "learning_rate": 6.001139436615713e-05, + "loss": 0.7946, + "step": 136490 + }, + { + "epoch": 0.8720595939332763, + "grad_norm": 0.8028036952018738, + "learning_rate": 6.000647824211858e-05, + "loss": 0.8753, + "step": 136500 + }, + { + "epoch": 0.872123481083015, + "grad_norm": 1.1068840026855469, + "learning_rate": 6.000156201730614e-05, 
+ "loss": 1.0608, + "step": 136510 + }, + { + "epoch": 0.8721873682327537, + "grad_norm": 0.5927395224571228, + "learning_rate": 5.9996645691769305e-05, + "loss": 0.7914, + "step": 136520 + }, + { + "epoch": 0.8722512553824924, + "grad_norm": 1.3745521306991577, + "learning_rate": 5.9991729265557605e-05, + "loss": 1.0867, + "step": 136530 + }, + { + "epoch": 0.8723151425322311, + "grad_norm": 1.0866520404815674, + "learning_rate": 5.998681273872055e-05, + "loss": 0.7932, + "step": 136540 + }, + { + "epoch": 0.8723790296819698, + "grad_norm": 1.1197307109832764, + "learning_rate": 5.998189611130764e-05, + "loss": 1.0212, + "step": 136550 + }, + { + "epoch": 0.8724429168317085, + "grad_norm": 0.9691267609596252, + "learning_rate": 5.9976979383368414e-05, + "loss": 0.7832, + "step": 136560 + }, + { + "epoch": 0.8725068039814472, + "grad_norm": 2.4745099544525146, + "learning_rate": 5.997206255495237e-05, + "loss": 0.9366, + "step": 136570 + }, + { + "epoch": 0.8725706911311859, + "grad_norm": 1.1133451461791992, + "learning_rate": 5.9967145626109035e-05, + "loss": 1.0052, + "step": 136580 + }, + { + "epoch": 0.8726345782809246, + "grad_norm": 0.8237787485122681, + "learning_rate": 5.996222859688791e-05, + "loss": 0.9783, + "step": 136590 + }, + { + "epoch": 0.8726984654306633, + "grad_norm": 0.960617184638977, + "learning_rate": 5.995731146733853e-05, + "loss": 0.7924, + "step": 136600 + }, + { + "epoch": 0.872762352580402, + "grad_norm": 0.8641276955604553, + "learning_rate": 5.99523942375104e-05, + "loss": 0.9258, + "step": 136610 + }, + { + "epoch": 0.8728262397301407, + "grad_norm": 0.6767961382865906, + "learning_rate": 5.994747690745306e-05, + "loss": 0.8195, + "step": 136620 + }, + { + "epoch": 0.8728901268798794, + "grad_norm": 0.8257598280906677, + "learning_rate": 5.9942559477216024e-05, + "loss": 1.0143, + "step": 136630 + }, + { + "epoch": 0.872954014029618, + "grad_norm": 0.9936842918395996, + "learning_rate": 5.99376419468488e-05, + "loss": 1.14, + 
"step": 136640 + }, + { + "epoch": 0.8730179011793567, + "grad_norm": 1.0033197402954102, + "learning_rate": 5.993272431640093e-05, + "loss": 0.9506, + "step": 136650 + }, + { + "epoch": 0.8730817883290954, + "grad_norm": 0.9221176505088806, + "learning_rate": 5.992780658592193e-05, + "loss": 0.8055, + "step": 136660 + }, + { + "epoch": 0.8731456754788341, + "grad_norm": 1.5162618160247803, + "learning_rate": 5.9922888755461336e-05, + "loss": 0.8351, + "step": 136670 + }, + { + "epoch": 0.8732095626285729, + "grad_norm": 0.7005751132965088, + "learning_rate": 5.991797082506867e-05, + "loss": 0.8902, + "step": 136680 + }, + { + "epoch": 0.8732734497783116, + "grad_norm": 0.8630402684211731, + "learning_rate": 5.9913052794793453e-05, + "loss": 0.9382, + "step": 136690 + }, + { + "epoch": 0.8733373369280503, + "grad_norm": 0.7950804829597473, + "learning_rate": 5.990813466468522e-05, + "loss": 1.0334, + "step": 136700 + }, + { + "epoch": 0.873401224077789, + "grad_norm": 0.9526651501655579, + "learning_rate": 5.9903216434793494e-05, + "loss": 1.1781, + "step": 136710 + }, + { + "epoch": 0.8734651112275277, + "grad_norm": 1.170040488243103, + "learning_rate": 5.989829810516782e-05, + "loss": 1.0522, + "step": 136720 + }, + { + "epoch": 0.8735289983772664, + "grad_norm": 1.0779001712799072, + "learning_rate": 5.9893379675857706e-05, + "loss": 0.8159, + "step": 136730 + }, + { + "epoch": 0.8735928855270051, + "grad_norm": 0.806840181350708, + "learning_rate": 5.9888461146912736e-05, + "loss": 0.7811, + "step": 136740 + }, + { + "epoch": 0.8736567726767438, + "grad_norm": 0.7434895634651184, + "learning_rate": 5.988354251838237e-05, + "loss": 0.8606, + "step": 136750 + }, + { + "epoch": 0.8737206598264825, + "grad_norm": 1.0427266359329224, + "learning_rate": 5.987862379031619e-05, + "loss": 1.006, + "step": 136760 + }, + { + "epoch": 0.8737845469762212, + "grad_norm": 1.1790105104446411, + "learning_rate": 5.987370496276372e-05, + "loss": 0.81, + "step": 136770 + }, + { 
+ "epoch": 0.8738484341259599, + "grad_norm": 1.0513496398925781, + "learning_rate": 5.9868786035774504e-05, + "loss": 0.9206, + "step": 136780 + }, + { + "epoch": 0.8739123212756986, + "grad_norm": 1.0362788438796997, + "learning_rate": 5.986386700939808e-05, + "loss": 0.8835, + "step": 136790 + }, + { + "epoch": 0.8739762084254373, + "grad_norm": 1.3794645071029663, + "learning_rate": 5.985894788368397e-05, + "loss": 0.6992, + "step": 136800 + }, + { + "epoch": 0.874040095575176, + "grad_norm": 0.74660724401474, + "learning_rate": 5.9854028658681724e-05, + "loss": 1.0083, + "step": 136810 + }, + { + "epoch": 0.8741039827249147, + "grad_norm": 0.5785887837409973, + "learning_rate": 5.984910933444089e-05, + "loss": 0.9494, + "step": 136820 + }, + { + "epoch": 0.8741678698746534, + "grad_norm": 0.7972803115844727, + "learning_rate": 5.984418991101101e-05, + "loss": 1.1168, + "step": 136830 + }, + { + "epoch": 0.8742317570243922, + "grad_norm": 0.9159103631973267, + "learning_rate": 5.983927038844162e-05, + "loss": 1.1372, + "step": 136840 + }, + { + "epoch": 0.8742956441741309, + "grad_norm": 1.0228685140609741, + "learning_rate": 5.9834350766782255e-05, + "loss": 0.5965, + "step": 136850 + }, + { + "epoch": 0.8743595313238696, + "grad_norm": 1.0952800512313843, + "learning_rate": 5.982943104608247e-05, + "loss": 0.8082, + "step": 136860 + }, + { + "epoch": 0.8744234184736083, + "grad_norm": 0.7565765380859375, + "learning_rate": 5.982451122639182e-05, + "loss": 0.6736, + "step": 136870 + }, + { + "epoch": 0.8744873056233469, + "grad_norm": 0.7633196115493774, + "learning_rate": 5.981959130775985e-05, + "loss": 0.8517, + "step": 136880 + }, + { + "epoch": 0.8745511927730856, + "grad_norm": 2.1346609592437744, + "learning_rate": 5.981467129023609e-05, + "loss": 0.854, + "step": 136890 + }, + { + "epoch": 0.8746150799228243, + "grad_norm": 0.8160780668258667, + "learning_rate": 5.98097511738701e-05, + "loss": 1.1138, + "step": 136900 + }, + { + "epoch": 
0.874678967072563, + "grad_norm": 0.6529989838600159, + "learning_rate": 5.9804830958711425e-05, + "loss": 0.675, + "step": 136910 + }, + { + "epoch": 0.8747428542223017, + "grad_norm": 0.846062421798706, + "learning_rate": 5.979991064480962e-05, + "loss": 0.763, + "step": 136920 + }, + { + "epoch": 0.8748067413720404, + "grad_norm": 0.8752646446228027, + "learning_rate": 5.9794990232214244e-05, + "loss": 0.7932, + "step": 136930 + }, + { + "epoch": 0.8748706285217791, + "grad_norm": 0.695993959903717, + "learning_rate": 5.979006972097484e-05, + "loss": 0.7567, + "step": 136940 + }, + { + "epoch": 0.8749345156715178, + "grad_norm": 0.825805127620697, + "learning_rate": 5.978514911114096e-05, + "loss": 0.7572, + "step": 136950 + }, + { + "epoch": 0.8749984028212565, + "grad_norm": 1.5052249431610107, + "learning_rate": 5.9780228402762165e-05, + "loss": 0.944, + "step": 136960 + }, + { + "epoch": 0.8750622899709952, + "grad_norm": 1.3028863668441772, + "learning_rate": 5.9775307595888006e-05, + "loss": 1.0014, + "step": 136970 + }, + { + "epoch": 0.8751261771207339, + "grad_norm": 2.27508282661438, + "learning_rate": 5.977038669056805e-05, + "loss": 0.8975, + "step": 136980 + }, + { + "epoch": 0.8751900642704726, + "grad_norm": 0.5962340235710144, + "learning_rate": 5.9765465686851854e-05, + "loss": 0.8318, + "step": 136990 + }, + { + "epoch": 0.8752539514202113, + "grad_norm": 0.7743219137191772, + "learning_rate": 5.976054458478896e-05, + "loss": 0.9495, + "step": 137000 + }, + { + "epoch": 0.87531783856995, + "grad_norm": 0.840707540512085, + "learning_rate": 5.975562338442893e-05, + "loss": 0.8466, + "step": 137010 + }, + { + "epoch": 0.8753817257196888, + "grad_norm": 0.7525313496589661, + "learning_rate": 5.975070208582134e-05, + "loss": 0.5504, + "step": 137020 + }, + { + "epoch": 0.8754456128694275, + "grad_norm": 2.833361864089966, + "learning_rate": 5.974578068901575e-05, + "loss": 0.9305, + "step": 137030 + }, + { + "epoch": 0.8755095000191662, + 
"grad_norm": 0.896931529045105, + "learning_rate": 5.9740859194061717e-05, + "loss": 1.0519, + "step": 137040 + }, + { + "epoch": 0.8755733871689049, + "grad_norm": 0.6994075179100037, + "learning_rate": 5.97359376010088e-05, + "loss": 0.9204, + "step": 137050 + }, + { + "epoch": 0.8756372743186436, + "grad_norm": 0.8043060898780823, + "learning_rate": 5.9731015909906565e-05, + "loss": 0.7847, + "step": 137060 + }, + { + "epoch": 0.8757011614683823, + "grad_norm": 0.9698672294616699, + "learning_rate": 5.9726094120804585e-05, + "loss": 0.7268, + "step": 137070 + }, + { + "epoch": 0.875765048618121, + "grad_norm": 1.0071710348129272, + "learning_rate": 5.972117223375242e-05, + "loss": 0.7952, + "step": 137080 + }, + { + "epoch": 0.8758289357678597, + "grad_norm": 0.7718594074249268, + "learning_rate": 5.9716250248799644e-05, + "loss": 0.7514, + "step": 137090 + }, + { + "epoch": 0.8758928229175984, + "grad_norm": 0.8059403300285339, + "learning_rate": 5.971132816599583e-05, + "loss": 0.9773, + "step": 137100 + }, + { + "epoch": 0.8759567100673371, + "grad_norm": 0.6279333829879761, + "learning_rate": 5.970640598539052e-05, + "loss": 1.1655, + "step": 137110 + }, + { + "epoch": 0.8760205972170757, + "grad_norm": 0.5626464486122131, + "learning_rate": 5.970148370703332e-05, + "loss": 0.7618, + "step": 137120 + }, + { + "epoch": 0.8760844843668144, + "grad_norm": 0.6805403828620911, + "learning_rate": 5.969656133097379e-05, + "loss": 0.9308, + "step": 137130 + }, + { + "epoch": 0.8761483715165531, + "grad_norm": 1.129631519317627, + "learning_rate": 5.969163885726148e-05, + "loss": 0.8858, + "step": 137140 + }, + { + "epoch": 0.8762122586662918, + "grad_norm": 0.6671173572540283, + "learning_rate": 5.9686716285946e-05, + "loss": 0.9919, + "step": 137150 + }, + { + "epoch": 0.8762761458160305, + "grad_norm": 0.8226957321166992, + "learning_rate": 5.9681793617076895e-05, + "loss": 0.8594, + "step": 137160 + }, + { + "epoch": 0.8763400329657692, + "grad_norm": 
0.9677339792251587, + "learning_rate": 5.9676870850703747e-05, + "loss": 0.9001, + "step": 137170 + }, + { + "epoch": 0.8764039201155079, + "grad_norm": 1.0769922733306885, + "learning_rate": 5.967194798687615e-05, + "loss": 0.9104, + "step": 137180 + }, + { + "epoch": 0.8764678072652466, + "grad_norm": 0.9808753728866577, + "learning_rate": 5.966702502564366e-05, + "loss": 0.8969, + "step": 137190 + }, + { + "epoch": 0.8765316944149854, + "grad_norm": 0.8168275356292725, + "learning_rate": 5.9662101967055885e-05, + "loss": 0.8239, + "step": 137200 + }, + { + "epoch": 0.8765955815647241, + "grad_norm": 0.7705772519111633, + "learning_rate": 5.965717881116237e-05, + "loss": 0.7709, + "step": 137210 + }, + { + "epoch": 0.8766594687144628, + "grad_norm": 0.7873682975769043, + "learning_rate": 5.965225555801272e-05, + "loss": 0.8556, + "step": 137220 + }, + { + "epoch": 0.8767233558642015, + "grad_norm": 1.820673942565918, + "learning_rate": 5.9647332207656505e-05, + "loss": 1.1398, + "step": 137230 + }, + { + "epoch": 0.8767872430139402, + "grad_norm": 1.0492981672286987, + "learning_rate": 5.9642408760143296e-05, + "loss": 1.0855, + "step": 137240 + }, + { + "epoch": 0.8768511301636789, + "grad_norm": 1.0693048238754272, + "learning_rate": 5.9637485215522694e-05, + "loss": 1.043, + "step": 137250 + }, + { + "epoch": 0.8769150173134176, + "grad_norm": 1.1092848777770996, + "learning_rate": 5.963256157384427e-05, + "loss": 1.1529, + "step": 137260 + }, + { + "epoch": 0.8769789044631563, + "grad_norm": 1.148908257484436, + "learning_rate": 5.962763783515763e-05, + "loss": 0.7518, + "step": 137270 + }, + { + "epoch": 0.877042791612895, + "grad_norm": 1.0465582609176636, + "learning_rate": 5.9622713999512345e-05, + "loss": 0.8351, + "step": 137280 + }, + { + "epoch": 0.8771066787626337, + "grad_norm": 0.7496880888938904, + "learning_rate": 5.9617790066958e-05, + "loss": 1.0184, + "step": 137290 + }, + { + "epoch": 0.8771705659123724, + "grad_norm": 2.036813259124756, + 
"learning_rate": 5.96128660375442e-05, + "loss": 1.1939, + "step": 137300 + }, + { + "epoch": 0.8772344530621111, + "grad_norm": 0.8851515054702759, + "learning_rate": 5.9607941911320506e-05, + "loss": 0.8136, + "step": 137310 + }, + { + "epoch": 0.8772983402118498, + "grad_norm": 1.1349178552627563, + "learning_rate": 5.960301768833654e-05, + "loss": 0.8269, + "step": 137320 + }, + { + "epoch": 0.8773622273615885, + "grad_norm": 1.0561522245407104, + "learning_rate": 5.959809336864186e-05, + "loss": 1.0435, + "step": 137330 + }, + { + "epoch": 0.8774261145113272, + "grad_norm": 1.015069842338562, + "learning_rate": 5.959316895228609e-05, + "loss": 0.752, + "step": 137340 + }, + { + "epoch": 0.8774900016610659, + "grad_norm": 1.3497037887573242, + "learning_rate": 5.958824443931881e-05, + "loss": 0.7568, + "step": 137350 + }, + { + "epoch": 0.8775538888108046, + "grad_norm": 0.6360141038894653, + "learning_rate": 5.958331982978961e-05, + "loss": 0.8139, + "step": 137360 + }, + { + "epoch": 0.8776177759605432, + "grad_norm": 1.0269728899002075, + "learning_rate": 5.957839512374809e-05, + "loss": 0.9107, + "step": 137370 + }, + { + "epoch": 0.877681663110282, + "grad_norm": 0.7632153630256653, + "learning_rate": 5.957347032124384e-05, + "loss": 1.0206, + "step": 137380 + }, + { + "epoch": 0.8777455502600207, + "grad_norm": 0.7974910736083984, + "learning_rate": 5.9568545422326474e-05, + "loss": 0.8792, + "step": 137390 + }, + { + "epoch": 0.8778094374097594, + "grad_norm": 1.0210436582565308, + "learning_rate": 5.956362042704556e-05, + "loss": 1.0714, + "step": 137400 + }, + { + "epoch": 0.8778733245594981, + "grad_norm": 0.8050969839096069, + "learning_rate": 5.955869533545073e-05, + "loss": 0.9401, + "step": 137410 + }, + { + "epoch": 0.8779372117092368, + "grad_norm": 0.888954758644104, + "learning_rate": 5.955377014759156e-05, + "loss": 0.9508, + "step": 137420 + }, + { + "epoch": 0.8780010988589755, + "grad_norm": 0.69648677110672, + "learning_rate": 
5.954884486351766e-05, + "loss": 0.9033, + "step": 137430 + }, + { + "epoch": 0.8780649860087142, + "grad_norm": 1.9958151578903198, + "learning_rate": 5.954391948327864e-05, + "loss": 0.9938, + "step": 137440 + }, + { + "epoch": 0.8781288731584529, + "grad_norm": 0.7439517378807068, + "learning_rate": 5.9538994006924085e-05, + "loss": 0.9702, + "step": 137450 + }, + { + "epoch": 0.8781927603081916, + "grad_norm": 0.9544771313667297, + "learning_rate": 5.953406843450361e-05, + "loss": 1.0634, + "step": 137460 + }, + { + "epoch": 0.8782566474579303, + "grad_norm": 0.9266231656074524, + "learning_rate": 5.9529142766066823e-05, + "loss": 0.8061, + "step": 137470 + }, + { + "epoch": 0.878320534607669, + "grad_norm": 0.9102841019630432, + "learning_rate": 5.952421700166333e-05, + "loss": 0.7466, + "step": 137480 + }, + { + "epoch": 0.8783844217574077, + "grad_norm": 0.9724735021591187, + "learning_rate": 5.9519291141342714e-05, + "loss": 0.7188, + "step": 137490 + }, + { + "epoch": 0.8784483089071464, + "grad_norm": 0.6619033217430115, + "learning_rate": 5.951436518515461e-05, + "loss": 0.696, + "step": 137500 + }, + { + "epoch": 0.8785121960568851, + "grad_norm": 1.0394726991653442, + "learning_rate": 5.9509439133148616e-05, + "loss": 0.9148, + "step": 137510 + }, + { + "epoch": 0.8785760832066238, + "grad_norm": 0.9882583618164062, + "learning_rate": 5.950451298537434e-05, + "loss": 0.7845, + "step": 137520 + }, + { + "epoch": 0.8786399703563625, + "grad_norm": 1.2519365549087524, + "learning_rate": 5.94995867418814e-05, + "loss": 0.7793, + "step": 137530 + }, + { + "epoch": 0.8787038575061012, + "grad_norm": 0.7872567772865295, + "learning_rate": 5.9494660402719404e-05, + "loss": 1.0541, + "step": 137540 + }, + { + "epoch": 0.87876774465584, + "grad_norm": 0.8353559970855713, + "learning_rate": 5.948973396793795e-05, + "loss": 1.0608, + "step": 137550 + }, + { + "epoch": 0.8788316318055787, + "grad_norm": 0.48675644397735596, + "learning_rate": 5.948480743758669e-05, 
+ "loss": 0.9564, + "step": 137560 + }, + { + "epoch": 0.8788955189553174, + "grad_norm": 1.1137011051177979, + "learning_rate": 5.9479880811715195e-05, + "loss": 0.7974, + "step": 137570 + }, + { + "epoch": 0.8789594061050561, + "grad_norm": 1.0456700325012207, + "learning_rate": 5.9474954090373106e-05, + "loss": 0.9706, + "step": 137580 + }, + { + "epoch": 0.8790232932547948, + "grad_norm": 1.0245221853256226, + "learning_rate": 5.947002727361003e-05, + "loss": 0.8457, + "step": 137590 + }, + { + "epoch": 0.8790871804045335, + "grad_norm": 1.006938099861145, + "learning_rate": 5.9465593056979326e-05, + "loss": 0.7818, + "step": 137600 + }, + { + "epoch": 0.8791510675542721, + "grad_norm": 1.0359126329421997, + "learning_rate": 5.946066605905308e-05, + "loss": 0.7724, + "step": 137610 + }, + { + "epoch": 0.8792149547040108, + "grad_norm": 1.068823218345642, + "learning_rate": 5.945573896584974e-05, + "loss": 0.9845, + "step": 137620 + }, + { + "epoch": 0.8792788418537495, + "grad_norm": 0.46609166264533997, + "learning_rate": 5.945081177741892e-05, + "loss": 0.7789, + "step": 137630 + }, + { + "epoch": 0.8793427290034882, + "grad_norm": 1.1455978155136108, + "learning_rate": 5.9445884493810256e-05, + "loss": 0.7966, + "step": 137640 + }, + { + "epoch": 0.8794066161532269, + "grad_norm": 1.5680001974105835, + "learning_rate": 5.944095711507337e-05, + "loss": 0.9451, + "step": 137650 + }, + { + "epoch": 0.8794705033029656, + "grad_norm": 1.8977959156036377, + "learning_rate": 5.943602964125787e-05, + "loss": 0.8331, + "step": 137660 + }, + { + "epoch": 0.8795343904527043, + "grad_norm": 0.6231663227081299, + "learning_rate": 5.943110207241339e-05, + "loss": 0.8725, + "step": 137670 + }, + { + "epoch": 0.879598277602443, + "grad_norm": 0.8517551422119141, + "learning_rate": 5.942617440858955e-05, + "loss": 1.0001, + "step": 137680 + }, + { + "epoch": 0.8796621647521817, + "grad_norm": 0.9704746007919312, + "learning_rate": 5.9421246649835985e-05, + "loss": 1.2601, + 
"step": 137690 + }, + { + "epoch": 0.8797260519019204, + "grad_norm": 0.6457834839820862, + "learning_rate": 5.941631879620231e-05, + "loss": 0.6845, + "step": 137700 + }, + { + "epoch": 0.8797899390516591, + "grad_norm": 1.3771389722824097, + "learning_rate": 5.941139084773817e-05, + "loss": 1.0085, + "step": 137710 + }, + { + "epoch": 0.8798538262013978, + "grad_norm": 0.8982274532318115, + "learning_rate": 5.940646280449317e-05, + "loss": 0.754, + "step": 137720 + }, + { + "epoch": 0.8799177133511366, + "grad_norm": 1.1403874158859253, + "learning_rate": 5.9401534666516955e-05, + "loss": 0.9035, + "step": 137730 + }, + { + "epoch": 0.8799816005008753, + "grad_norm": 0.8235518932342529, + "learning_rate": 5.939660643385915e-05, + "loss": 0.9339, + "step": 137740 + }, + { + "epoch": 0.880045487650614, + "grad_norm": 0.8350309133529663, + "learning_rate": 5.939167810656939e-05, + "loss": 1.0703, + "step": 137750 + }, + { + "epoch": 0.8801093748003527, + "grad_norm": 1.7924656867980957, + "learning_rate": 5.938674968469731e-05, + "loss": 1.1085, + "step": 137760 + }, + { + "epoch": 0.8801732619500914, + "grad_norm": 1.2257702350616455, + "learning_rate": 5.9381821168292536e-05, + "loss": 0.9338, + "step": 137770 + }, + { + "epoch": 0.8802371490998301, + "grad_norm": 1.0357496738433838, + "learning_rate": 5.9376892557404704e-05, + "loss": 0.8123, + "step": 137780 + }, + { + "epoch": 0.8803010362495688, + "grad_norm": 0.990088164806366, + "learning_rate": 5.937196385208346e-05, + "loss": 1.004, + "step": 137790 + }, + { + "epoch": 0.8803649233993075, + "grad_norm": 0.97257000207901, + "learning_rate": 5.936703505237843e-05, + "loss": 0.6953, + "step": 137800 + }, + { + "epoch": 0.8804288105490462, + "grad_norm": 0.6690786480903625, + "learning_rate": 5.9362106158339245e-05, + "loss": 0.9455, + "step": 137810 + }, + { + "epoch": 0.8804926976987849, + "grad_norm": 1.3547656536102295, + "learning_rate": 5.935717717001556e-05, + "loss": 0.856, + "step": 137820 + }, + { + 
"epoch": 0.8805565848485236, + "grad_norm": 0.7319701313972473, + "learning_rate": 5.9352248087456994e-05, + "loss": 0.9645, + "step": 137830 + }, + { + "epoch": 0.8806204719982623, + "grad_norm": 0.9429260492324829, + "learning_rate": 5.934731891071321e-05, + "loss": 1.0002, + "step": 137840 + }, + { + "epoch": 0.8806843591480009, + "grad_norm": 0.8033540844917297, + "learning_rate": 5.934238963983384e-05, + "loss": 0.676, + "step": 137850 + }, + { + "epoch": 0.8807482462977396, + "grad_norm": 1.9178016185760498, + "learning_rate": 5.933746027486853e-05, + "loss": 0.727, + "step": 137860 + }, + { + "epoch": 0.8808121334474783, + "grad_norm": 0.7014086842536926, + "learning_rate": 5.9333023765997284e-05, + "loss": 1.0007, + "step": 137870 + }, + { + "epoch": 0.880876020597217, + "grad_norm": 1.6544643640518188, + "learning_rate": 5.9328094222405437e-05, + "loss": 0.8551, + "step": 137880 + }, + { + "epoch": 0.8809399077469557, + "grad_norm": 1.3481382131576538, + "learning_rate": 5.932316458487162e-05, + "loss": 0.7606, + "step": 137890 + }, + { + "epoch": 0.8810037948966944, + "grad_norm": 1.015859842300415, + "learning_rate": 5.931823485344545e-05, + "loss": 0.9227, + "step": 137900 + }, + { + "epoch": 0.8810676820464332, + "grad_norm": 1.004987359046936, + "learning_rate": 5.9313305028176606e-05, + "loss": 1.1689, + "step": 137910 + }, + { + "epoch": 0.8811315691961719, + "grad_norm": 0.8518670797348022, + "learning_rate": 5.930837510911471e-05, + "loss": 1.1651, + "step": 137920 + }, + { + "epoch": 0.8811954563459106, + "grad_norm": 0.6853091716766357, + "learning_rate": 5.930344509630943e-05, + "loss": 0.7861, + "step": 137930 + }, + { + "epoch": 0.8812593434956493, + "grad_norm": 1.4543042182922363, + "learning_rate": 5.929851498981041e-05, + "loss": 0.718, + "step": 137940 + }, + { + "epoch": 0.881323230645388, + "grad_norm": 0.791410505771637, + "learning_rate": 5.92935847896673e-05, + "loss": 0.8162, + "step": 137950 + }, + { + "epoch": 0.8813871177951267, 
+ "grad_norm": 0.8567259311676025, + "learning_rate": 5.928865449592976e-05, + "loss": 0.797, + "step": 137960 + }, + { + "epoch": 0.8814510049448654, + "grad_norm": 0.9072690010070801, + "learning_rate": 5.928372410864742e-05, + "loss": 0.9948, + "step": 137970 + }, + { + "epoch": 0.8815148920946041, + "grad_norm": 1.3205629587173462, + "learning_rate": 5.9278793627869955e-05, + "loss": 1.0577, + "step": 137980 + }, + { + "epoch": 0.8815787792443428, + "grad_norm": 0.8285039663314819, + "learning_rate": 5.9273863053647015e-05, + "loss": 0.9573, + "step": 137990 + }, + { + "epoch": 0.8816426663940815, + "grad_norm": 0.7097443342208862, + "learning_rate": 5.926893238602825e-05, + "loss": 0.927, + "step": 138000 + }, + { + "epoch": 0.8817065535438202, + "grad_norm": 0.7954055666923523, + "learning_rate": 5.926400162506331e-05, + "loss": 1.0107, + "step": 138010 + }, + { + "epoch": 0.8817704406935589, + "grad_norm": 0.9735956788063049, + "learning_rate": 5.9259070770801874e-05, + "loss": 0.8408, + "step": 138020 + }, + { + "epoch": 0.8818343278432976, + "grad_norm": 1.4425255060195923, + "learning_rate": 5.925413982329357e-05, + "loss": 1.0734, + "step": 138030 + }, + { + "epoch": 0.8818982149930363, + "grad_norm": 0.9620723724365234, + "learning_rate": 5.9249208782588076e-05, + "loss": 0.9398, + "step": 138040 + }, + { + "epoch": 0.881962102142775, + "grad_norm": 0.6004499793052673, + "learning_rate": 5.924427764873505e-05, + "loss": 0.9196, + "step": 138050 + }, + { + "epoch": 0.8820259892925137, + "grad_norm": 0.9870150685310364, + "learning_rate": 5.9239346421784135e-05, + "loss": 0.9864, + "step": 138060 + }, + { + "epoch": 0.8820898764422525, + "grad_norm": 0.897495687007904, + "learning_rate": 5.9234415101785026e-05, + "loss": 1.0131, + "step": 138070 + }, + { + "epoch": 0.8821537635919912, + "grad_norm": 0.9317723512649536, + "learning_rate": 5.922948368878736e-05, + "loss": 0.856, + "step": 138080 + }, + { + "epoch": 0.8822176507417298, + "grad_norm": 
0.9993261694908142, + "learning_rate": 5.922455218284081e-05, + "loss": 0.9035, + "step": 138090 + }, + { + "epoch": 0.8822815378914685, + "grad_norm": 0.8899745345115662, + "learning_rate": 5.921962058399504e-05, + "loss": 0.9287, + "step": 138100 + }, + { + "epoch": 0.8823454250412072, + "grad_norm": 0.59910649061203, + "learning_rate": 5.921468889229971e-05, + "loss": 1.0332, + "step": 138110 + }, + { + "epoch": 0.8824093121909459, + "grad_norm": 0.8276026248931885, + "learning_rate": 5.92097571078045e-05, + "loss": 0.8867, + "step": 138120 + }, + { + "epoch": 0.8824731993406846, + "grad_norm": 1.0071065425872803, + "learning_rate": 5.9204825230559056e-05, + "loss": 0.8795, + "step": 138130 + }, + { + "epoch": 0.8825370864904233, + "grad_norm": 1.0851963758468628, + "learning_rate": 5.919989326061307e-05, + "loss": 0.9147, + "step": 138140 + }, + { + "epoch": 0.882600973640162, + "grad_norm": 0.9725841879844666, + "learning_rate": 5.9194961198016196e-05, + "loss": 0.7633, + "step": 138150 + }, + { + "epoch": 0.8826648607899007, + "grad_norm": 0.5369303226470947, + "learning_rate": 5.9190029042818105e-05, + "loss": 0.9501, + "step": 138160 + }, + { + "epoch": 0.8827287479396394, + "grad_norm": 0.9604431390762329, + "learning_rate": 5.918509679506847e-05, + "loss": 0.785, + "step": 138170 + }, + { + "epoch": 0.8827926350893781, + "grad_norm": 1.0034009218215942, + "learning_rate": 5.918016445481698e-05, + "loss": 0.9936, + "step": 138180 + }, + { + "epoch": 0.8828565222391168, + "grad_norm": 0.8154608607292175, + "learning_rate": 5.917523202211328e-05, + "loss": 0.7805, + "step": 138190 + }, + { + "epoch": 0.8829204093888555, + "grad_norm": 1.2569918632507324, + "learning_rate": 5.9170299497007053e-05, + "loss": 0.6671, + "step": 138200 + }, + { + "epoch": 0.8829842965385942, + "grad_norm": 0.6132636666297913, + "learning_rate": 5.916536687954798e-05, + "loss": 0.7076, + "step": 138210 + }, + { + "epoch": 0.8830481836883329, + "grad_norm": 2.1591336727142334, + 
"learning_rate": 5.916043416978574e-05, + "loss": 1.1469, + "step": 138220 + }, + { + "epoch": 0.8831120708380716, + "grad_norm": 0.9249553084373474, + "learning_rate": 5.915550136776999e-05, + "loss": 0.8875, + "step": 138230 + }, + { + "epoch": 0.8831759579878103, + "grad_norm": 1.4961109161376953, + "learning_rate": 5.915056847355043e-05, + "loss": 0.7952, + "step": 138240 + }, + { + "epoch": 0.883239845137549, + "grad_norm": 1.0955626964569092, + "learning_rate": 5.914563548717673e-05, + "loss": 0.8794, + "step": 138250 + }, + { + "epoch": 0.8833037322872878, + "grad_norm": 0.5093604922294617, + "learning_rate": 5.9140702408698554e-05, + "loss": 0.7851, + "step": 138260 + }, + { + "epoch": 0.8833676194370265, + "grad_norm": 1.133516788482666, + "learning_rate": 5.913576923816562e-05, + "loss": 0.699, + "step": 138270 + }, + { + "epoch": 0.8834315065867652, + "grad_norm": 1.351069450378418, + "learning_rate": 5.9130835975627574e-05, + "loss": 0.9823, + "step": 138280 + }, + { + "epoch": 0.8834953937365039, + "grad_norm": 0.741649329662323, + "learning_rate": 5.912590262113411e-05, + "loss": 1.0134, + "step": 138290 + }, + { + "epoch": 0.8835592808862426, + "grad_norm": 0.5988890528678894, + "learning_rate": 5.912096917473491e-05, + "loss": 0.7114, + "step": 138300 + }, + { + "epoch": 0.8836231680359813, + "grad_norm": 0.9725940823554993, + "learning_rate": 5.911603563647966e-05, + "loss": 0.9138, + "step": 138310 + }, + { + "epoch": 0.88368705518572, + "grad_norm": 0.5736109018325806, + "learning_rate": 5.911110200641805e-05, + "loss": 0.7067, + "step": 138320 + }, + { + "epoch": 0.8837509423354587, + "grad_norm": 0.8855761885643005, + "learning_rate": 5.910616828459975e-05, + "loss": 1.2011, + "step": 138330 + }, + { + "epoch": 0.8838148294851973, + "grad_norm": 0.8970593810081482, + "learning_rate": 5.910123447107446e-05, + "loss": 0.9496, + "step": 138340 + }, + { + "epoch": 0.883878716634936, + "grad_norm": 0.8814042806625366, + "learning_rate": 
5.909630056589188e-05, + "loss": 0.957, + "step": 138350 + }, + { + "epoch": 0.8839426037846747, + "grad_norm": 1.0015789270401, + "learning_rate": 5.909136656910167e-05, + "loss": 0.8841, + "step": 138360 + }, + { + "epoch": 0.8840064909344134, + "grad_norm": 0.869117796421051, + "learning_rate": 5.908643248075354e-05, + "loss": 0.7216, + "step": 138370 + }, + { + "epoch": 0.8840703780841521, + "grad_norm": 0.8888474702835083, + "learning_rate": 5.9081498300897167e-05, + "loss": 0.8551, + "step": 138380 + }, + { + "epoch": 0.8841342652338908, + "grad_norm": 0.8927051424980164, + "learning_rate": 5.907656402958226e-05, + "loss": 0.9334, + "step": 138390 + }, + { + "epoch": 0.8841981523836295, + "grad_norm": 1.0528945922851562, + "learning_rate": 5.907162966685849e-05, + "loss": 0.7634, + "step": 138400 + }, + { + "epoch": 0.8842620395333682, + "grad_norm": 1.1046907901763916, + "learning_rate": 5.906669521277557e-05, + "loss": 0.715, + "step": 138410 + }, + { + "epoch": 0.8843259266831069, + "grad_norm": 0.8882020711898804, + "learning_rate": 5.906176066738317e-05, + "loss": 1.0122, + "step": 138420 + }, + { + "epoch": 0.8843898138328457, + "grad_norm": 0.9222348928451538, + "learning_rate": 5.905682603073102e-05, + "loss": 0.8114, + "step": 138430 + }, + { + "epoch": 0.8844537009825844, + "grad_norm": 0.5830926895141602, + "learning_rate": 5.905189130286879e-05, + "loss": 1.0322, + "step": 138440 + }, + { + "epoch": 0.8845175881323231, + "grad_norm": 1.4993237257003784, + "learning_rate": 5.904695648384617e-05, + "loss": 0.6984, + "step": 138450 + }, + { + "epoch": 0.8845814752820618, + "grad_norm": 4.328673839569092, + "learning_rate": 5.904202157371288e-05, + "loss": 0.8778, + "step": 138460 + }, + { + "epoch": 0.8846453624318005, + "grad_norm": 0.7862850427627563, + "learning_rate": 5.903708657251861e-05, + "loss": 0.8364, + "step": 138470 + }, + { + "epoch": 0.8847092495815392, + "grad_norm": 1.8233660459518433, + "learning_rate": 5.903215148031307e-05, + 
"loss": 0.8774, + "step": 138480 + }, + { + "epoch": 0.8847731367312779, + "grad_norm": 0.7515198588371277, + "learning_rate": 5.902721629714595e-05, + "loss": 1.2306, + "step": 138490 + }, + { + "epoch": 0.8848370238810166, + "grad_norm": 1.1745033264160156, + "learning_rate": 5.902228102306695e-05, + "loss": 0.9555, + "step": 138500 + }, + { + "epoch": 0.8849009110307553, + "grad_norm": 1.0315542221069336, + "learning_rate": 5.901734565812577e-05, + "loss": 0.987, + "step": 138510 + }, + { + "epoch": 0.884964798180494, + "grad_norm": 1.1841830015182495, + "learning_rate": 5.9012410202372114e-05, + "loss": 1.1246, + "step": 138520 + }, + { + "epoch": 0.8850286853302327, + "grad_norm": 1.072008490562439, + "learning_rate": 5.9007474655855696e-05, + "loss": 0.8357, + "step": 138530 + }, + { + "epoch": 0.8850925724799714, + "grad_norm": 1.0678666830062866, + "learning_rate": 5.900253901862621e-05, + "loss": 0.7345, + "step": 138540 + }, + { + "epoch": 0.8851564596297101, + "grad_norm": 0.83828204870224, + "learning_rate": 5.899760329073338e-05, + "loss": 0.8972, + "step": 138550 + }, + { + "epoch": 0.8852203467794488, + "grad_norm": 0.9922822713851929, + "learning_rate": 5.899266747222689e-05, + "loss": 0.7582, + "step": 138560 + }, + { + "epoch": 0.8852842339291875, + "grad_norm": 2.141287088394165, + "learning_rate": 5.8987731563156464e-05, + "loss": 1.1712, + "step": 138570 + }, + { + "epoch": 0.8853481210789261, + "grad_norm": 0.8751981258392334, + "learning_rate": 5.89827955635718e-05, + "loss": 0.9515, + "step": 138580 + }, + { + "epoch": 0.8854120082286648, + "grad_norm": 0.6795740723609924, + "learning_rate": 5.897785947352262e-05, + "loss": 0.7279, + "step": 138590 + }, + { + "epoch": 0.8854758953784035, + "grad_norm": 0.8922616839408875, + "learning_rate": 5.8972923293058636e-05, + "loss": 1.0773, + "step": 138600 + }, + { + "epoch": 0.8855397825281422, + "grad_norm": 0.8627411127090454, + "learning_rate": 5.896798702222953e-05, + "loss": 1.0776, + "step": 
138610 + }, + { + "epoch": 0.885603669677881, + "grad_norm": 0.6423478126525879, + "learning_rate": 5.896305066108504e-05, + "loss": 1.0121, + "step": 138620 + }, + { + "epoch": 0.8856675568276197, + "grad_norm": 0.9135613441467285, + "learning_rate": 5.895811420967489e-05, + "loss": 0.7514, + "step": 138630 + }, + { + "epoch": 0.8857314439773584, + "grad_norm": 1.0383354425430298, + "learning_rate": 5.895317766804877e-05, + "loss": 0.9648, + "step": 138640 + }, + { + "epoch": 0.8857953311270971, + "grad_norm": 1.2800050973892212, + "learning_rate": 5.89482410362564e-05, + "loss": 0.7818, + "step": 138650 + }, + { + "epoch": 0.8858592182768358, + "grad_norm": 0.8451805710792542, + "learning_rate": 5.894330431434751e-05, + "loss": 0.9926, + "step": 138660 + }, + { + "epoch": 0.8859231054265745, + "grad_norm": 0.9237948060035706, + "learning_rate": 5.893836750237181e-05, + "loss": 0.9855, + "step": 138670 + }, + { + "epoch": 0.8859869925763132, + "grad_norm": 0.6071786880493164, + "learning_rate": 5.893343060037902e-05, + "loss": 0.8057, + "step": 138680 + }, + { + "epoch": 0.8860508797260519, + "grad_norm": 1.0154786109924316, + "learning_rate": 5.892849360841886e-05, + "loss": 0.9599, + "step": 138690 + }, + { + "epoch": 0.8861147668757906, + "grad_norm": 1.8094230890274048, + "learning_rate": 5.892355652654102e-05, + "loss": 0.9918, + "step": 138700 + }, + { + "epoch": 0.8861786540255293, + "grad_norm": 0.8188693523406982, + "learning_rate": 5.891861935479527e-05, + "loss": 0.857, + "step": 138710 + }, + { + "epoch": 0.886242541175268, + "grad_norm": 0.6113899350166321, + "learning_rate": 5.891368209323129e-05, + "loss": 1.092, + "step": 138720 + }, + { + "epoch": 0.8863064283250067, + "grad_norm": 0.7425000667572021, + "learning_rate": 5.8908744741898846e-05, + "loss": 0.7412, + "step": 138730 + }, + { + "epoch": 0.8863703154747454, + "grad_norm": 1.9018378257751465, + "learning_rate": 5.8903807300847627e-05, + "loss": 0.889, + "step": 138740 + }, + { + "epoch": 
0.8864342026244841, + "grad_norm": 1.2525657415390015, + "learning_rate": 5.889886977012735e-05, + "loss": 0.901, + "step": 138750 + }, + { + "epoch": 0.8864980897742228, + "grad_norm": 0.6941089630126953, + "learning_rate": 5.8893932149787764e-05, + "loss": 0.9275, + "step": 138760 + }, + { + "epoch": 0.8865619769239615, + "grad_norm": 0.9113277196884155, + "learning_rate": 5.8888994439878584e-05, + "loss": 0.9316, + "step": 138770 + }, + { + "epoch": 0.8866258640737003, + "grad_norm": 1.0078089237213135, + "learning_rate": 5.888405664044953e-05, + "loss": 0.9862, + "step": 138780 + }, + { + "epoch": 0.886689751223439, + "grad_norm": 0.7824812531471252, + "learning_rate": 5.887911875155036e-05, + "loss": 0.8826, + "step": 138790 + }, + { + "epoch": 0.8867536383731777, + "grad_norm": 0.5827013254165649, + "learning_rate": 5.887418077323077e-05, + "loss": 0.6422, + "step": 138800 + }, + { + "epoch": 0.8868175255229164, + "grad_norm": 1.1789437532424927, + "learning_rate": 5.886924270554051e-05, + "loss": 1.123, + "step": 138810 + }, + { + "epoch": 0.886881412672655, + "grad_norm": 0.9129090905189514, + "learning_rate": 5.886430454852929e-05, + "loss": 0.9861, + "step": 138820 + }, + { + "epoch": 0.8869452998223937, + "grad_norm": 0.82326340675354, + "learning_rate": 5.885936630224686e-05, + "loss": 0.8269, + "step": 138830 + }, + { + "epoch": 0.8870091869721324, + "grad_norm": 2.5597972869873047, + "learning_rate": 5.885442796674295e-05, + "loss": 0.9155, + "step": 138840 + }, + { + "epoch": 0.8870730741218711, + "grad_norm": 1.1469552516937256, + "learning_rate": 5.8849489542067296e-05, + "loss": 0.806, + "step": 138850 + }, + { + "epoch": 0.8871369612716098, + "grad_norm": 0.7060733437538147, + "learning_rate": 5.8844551028269625e-05, + "loss": 1.0475, + "step": 138860 + }, + { + "epoch": 0.8872008484213485, + "grad_norm": 0.5902007222175598, + "learning_rate": 5.883961242539966e-05, + "loss": 0.9141, + "step": 138870 + }, + { + "epoch": 0.8872647355710872, + 
"grad_norm": 1.3339205980300903, + "learning_rate": 5.883467373350716e-05, + "loss": 0.8036, + "step": 138880 + }, + { + "epoch": 0.8873286227208259, + "grad_norm": 0.8260666728019714, + "learning_rate": 5.882973495264186e-05, + "loss": 0.7641, + "step": 138890 + }, + { + "epoch": 0.8873925098705646, + "grad_norm": 1.3517704010009766, + "learning_rate": 5.8824796082853485e-05, + "loss": 0.7486, + "step": 138900 + }, + { + "epoch": 0.8874563970203033, + "grad_norm": 0.7072157859802246, + "learning_rate": 5.8819857124191766e-05, + "loss": 1.1322, + "step": 138910 + }, + { + "epoch": 0.887520284170042, + "grad_norm": 0.7836194634437561, + "learning_rate": 5.881491807670647e-05, + "loss": 1.0416, + "step": 138920 + }, + { + "epoch": 0.8875841713197807, + "grad_norm": 0.8094397783279419, + "learning_rate": 5.880997894044732e-05, + "loss": 0.7803, + "step": 138930 + }, + { + "epoch": 0.8876480584695194, + "grad_norm": 0.9594300985336304, + "learning_rate": 5.880503971546406e-05, + "loss": 0.6825, + "step": 138940 + }, + { + "epoch": 0.8877119456192581, + "grad_norm": 0.7078715562820435, + "learning_rate": 5.8800100401806436e-05, + "loss": 0.7998, + "step": 138950 + }, + { + "epoch": 0.8877758327689969, + "grad_norm": 1.1923208236694336, + "learning_rate": 5.879516099952418e-05, + "loss": 1.1095, + "step": 138960 + }, + { + "epoch": 0.8878397199187356, + "grad_norm": 0.8840433955192566, + "learning_rate": 5.8790221508667045e-05, + "loss": 0.8077, + "step": 138970 + }, + { + "epoch": 0.8879036070684743, + "grad_norm": 1.028594732284546, + "learning_rate": 5.878528192928479e-05, + "loss": 0.8315, + "step": 138980 + }, + { + "epoch": 0.887967494218213, + "grad_norm": 0.873859703540802, + "learning_rate": 5.878034226142712e-05, + "loss": 0.8896, + "step": 138990 + }, + { + "epoch": 0.8880313813679517, + "grad_norm": 1.0538140535354614, + "learning_rate": 5.877540250514383e-05, + "loss": 0.7489, + "step": 139000 + }, + { + "epoch": 0.8880952685176904, + "grad_norm": 
1.347963571548462, + "learning_rate": 5.8770462660484625e-05, + "loss": 0.836, + "step": 139010 + }, + { + "epoch": 0.8881591556674291, + "grad_norm": 0.8959457874298096, + "learning_rate": 5.876552272749929e-05, + "loss": 0.7588, + "step": 139020 + }, + { + "epoch": 0.8882230428171678, + "grad_norm": 0.6587477922439575, + "learning_rate": 5.876058270623756e-05, + "loss": 0.7995, + "step": 139030 + }, + { + "epoch": 0.8882869299669065, + "grad_norm": 0.7410009503364563, + "learning_rate": 5.8755642596749164e-05, + "loss": 0.8671, + "step": 139040 + }, + { + "epoch": 0.8883508171166452, + "grad_norm": 0.96707683801651, + "learning_rate": 5.875070239908389e-05, + "loss": 0.7018, + "step": 139050 + }, + { + "epoch": 0.8884147042663839, + "grad_norm": 0.7956843972206116, + "learning_rate": 5.8745762113291455e-05, + "loss": 0.7706, + "step": 139060 + }, + { + "epoch": 0.8884785914161225, + "grad_norm": 0.9614824652671814, + "learning_rate": 5.874082173942165e-05, + "loss": 0.8501, + "step": 139070 + }, + { + "epoch": 0.8885424785658612, + "grad_norm": 1.660465121269226, + "learning_rate": 5.8735881277524195e-05, + "loss": 1.0422, + "step": 139080 + }, + { + "epoch": 0.8886063657155999, + "grad_norm": 0.7335018515586853, + "learning_rate": 5.8730940727648864e-05, + "loss": 1.0635, + "step": 139090 + }, + { + "epoch": 0.8886702528653386, + "grad_norm": 0.8188953399658203, + "learning_rate": 5.87260000898454e-05, + "loss": 0.9997, + "step": 139100 + }, + { + "epoch": 0.8887341400150773, + "grad_norm": 0.44217541813850403, + "learning_rate": 5.8721059364163564e-05, + "loss": 0.9746, + "step": 139110 + }, + { + "epoch": 0.888798027164816, + "grad_norm": 1.0299861431121826, + "learning_rate": 5.871611855065313e-05, + "loss": 1.0003, + "step": 139120 + }, + { + "epoch": 0.8888619143145547, + "grad_norm": 0.9896953105926514, + "learning_rate": 5.871117764936382e-05, + "loss": 0.8213, + "step": 139130 + }, + { + "epoch": 0.8889258014642935, + "grad_norm": 1.2226732969284058, + 
"learning_rate": 5.870623666034544e-05, + "loss": 0.8156, + "step": 139140 + }, + { + "epoch": 0.8889896886140322, + "grad_norm": 0.6431970000267029, + "learning_rate": 5.87012955836477e-05, + "loss": 0.9195, + "step": 139150 + }, + { + "epoch": 0.8890535757637709, + "grad_norm": 1.434300422668457, + "learning_rate": 5.86963544193204e-05, + "loss": 0.7808, + "step": 139160 + }, + { + "epoch": 0.8891174629135096, + "grad_norm": 0.8103228807449341, + "learning_rate": 5.869141316741328e-05, + "loss": 0.9496, + "step": 139170 + }, + { + "epoch": 0.8891813500632483, + "grad_norm": 0.9674835801124573, + "learning_rate": 5.868647182797612e-05, + "loss": 0.9964, + "step": 139180 + }, + { + "epoch": 0.889245237212987, + "grad_norm": 0.9061577916145325, + "learning_rate": 5.868153040105867e-05, + "loss": 0.9738, + "step": 139190 + }, + { + "epoch": 0.8893091243627257, + "grad_norm": 0.7973967790603638, + "learning_rate": 5.8676588886710695e-05, + "loss": 0.8598, + "step": 139200 + }, + { + "epoch": 0.8893730115124644, + "grad_norm": 1.2977885007858276, + "learning_rate": 5.867164728498197e-05, + "loss": 0.9625, + "step": 139210 + }, + { + "epoch": 0.8894368986622031, + "grad_norm": 0.897599458694458, + "learning_rate": 5.866670559592226e-05, + "loss": 1.0682, + "step": 139220 + }, + { + "epoch": 0.8895007858119418, + "grad_norm": 0.9015941023826599, + "learning_rate": 5.8661763819581314e-05, + "loss": 0.7841, + "step": 139230 + }, + { + "epoch": 0.8895646729616805, + "grad_norm": 1.129776120185852, + "learning_rate": 5.865682195600892e-05, + "loss": 1.0793, + "step": 139240 + }, + { + "epoch": 0.8896285601114192, + "grad_norm": 0.8388169407844543, + "learning_rate": 5.865188000525484e-05, + "loss": 0.9023, + "step": 139250 + }, + { + "epoch": 0.8896924472611579, + "grad_norm": 0.8551932573318481, + "learning_rate": 5.864693796736884e-05, + "loss": 0.7204, + "step": 139260 + }, + { + "epoch": 0.8897563344108966, + "grad_norm": 0.8071838021278381, + "learning_rate": 
5.86419958424007e-05, + "loss": 0.7368, + "step": 139270 + }, + { + "epoch": 0.8898202215606353, + "grad_norm": 0.644696831703186, + "learning_rate": 5.863705363040017e-05, + "loss": 0.8828, + "step": 139280 + }, + { + "epoch": 0.889884108710374, + "grad_norm": 0.895206868648529, + "learning_rate": 5.863211133141705e-05, + "loss": 0.8151, + "step": 139290 + }, + { + "epoch": 0.8899479958601128, + "grad_norm": 1.169389009475708, + "learning_rate": 5.8627168945501096e-05, + "loss": 1.0632, + "step": 139300 + }, + { + "epoch": 0.8900118830098513, + "grad_norm": 0.695212721824646, + "learning_rate": 5.862222647270208e-05, + "loss": 0.884, + "step": 139310 + }, + { + "epoch": 0.89007577015959, + "grad_norm": 0.8649401664733887, + "learning_rate": 5.8617283913069796e-05, + "loss": 0.9877, + "step": 139320 + }, + { + "epoch": 0.8901396573093288, + "grad_norm": 1.0308243036270142, + "learning_rate": 5.8612341266654015e-05, + "loss": 0.9435, + "step": 139330 + }, + { + "epoch": 0.8902035444590675, + "grad_norm": 0.9423206448554993, + "learning_rate": 5.86073985335045e-05, + "loss": 0.8339, + "step": 139340 + }, + { + "epoch": 0.8902674316088062, + "grad_norm": 0.6064127087593079, + "learning_rate": 5.860245571367102e-05, + "loss": 0.8482, + "step": 139350 + }, + { + "epoch": 0.8903313187585449, + "grad_norm": 0.6485791206359863, + "learning_rate": 5.8597512807203393e-05, + "loss": 0.8751, + "step": 139360 + }, + { + "epoch": 0.8903952059082836, + "grad_norm": 4.558902740478516, + "learning_rate": 5.859256981415135e-05, + "loss": 0.9094, + "step": 139370 + }, + { + "epoch": 0.8904590930580223, + "grad_norm": 0.827876091003418, + "learning_rate": 5.858762673456472e-05, + "loss": 1.1908, + "step": 139380 + }, + { + "epoch": 0.890522980207761, + "grad_norm": 0.8407139182090759, + "learning_rate": 5.858268356849325e-05, + "loss": 0.8698, + "step": 139390 + }, + { + "epoch": 0.8905868673574997, + "grad_norm": 1.8288122415542603, + "learning_rate": 5.857774031598673e-05, + "loss": 
1.018, + "step": 139400 + }, + { + "epoch": 0.8906507545072384, + "grad_norm": 0.8015510439872742, + "learning_rate": 5.8572796977094936e-05, + "loss": 0.803, + "step": 139410 + }, + { + "epoch": 0.8907146416569771, + "grad_norm": 1.08255934715271, + "learning_rate": 5.856785355186767e-05, + "loss": 0.8632, + "step": 139420 + }, + { + "epoch": 0.8907785288067158, + "grad_norm": 1.29863440990448, + "learning_rate": 5.8562910040354705e-05, + "loss": 0.8677, + "step": 139430 + }, + { + "epoch": 0.8908424159564545, + "grad_norm": 1.1884722709655762, + "learning_rate": 5.855796644260583e-05, + "loss": 0.8054, + "step": 139440 + }, + { + "epoch": 0.8909063031061932, + "grad_norm": 1.657820463180542, + "learning_rate": 5.8553022758670816e-05, + "loss": 0.866, + "step": 139450 + }, + { + "epoch": 0.8909701902559319, + "grad_norm": 1.095188021659851, + "learning_rate": 5.8548078988599484e-05, + "loss": 0.8458, + "step": 139460 + }, + { + "epoch": 0.8910340774056706, + "grad_norm": 0.8800215125083923, + "learning_rate": 5.8543135132441585e-05, + "loss": 0.7631, + "step": 139470 + }, + { + "epoch": 0.8910979645554093, + "grad_norm": 0.9168996214866638, + "learning_rate": 5.8538191190246924e-05, + "loss": 0.8652, + "step": 139480 + }, + { + "epoch": 0.8911618517051481, + "grad_norm": 0.6553764343261719, + "learning_rate": 5.85332471620653e-05, + "loss": 0.7113, + "step": 139490 + }, + { + "epoch": 0.8912257388548868, + "grad_norm": 1.1859967708587646, + "learning_rate": 5.85283030479465e-05, + "loss": 0.7221, + "step": 139500 + }, + { + "epoch": 0.8912896260046255, + "grad_norm": 0.7039145827293396, + "learning_rate": 5.852335884794029e-05, + "loss": 0.6689, + "step": 139510 + }, + { + "epoch": 0.8913535131543642, + "grad_norm": 2.0129079818725586, + "learning_rate": 5.85184145620965e-05, + "loss": 0.8524, + "step": 139520 + }, + { + "epoch": 0.8914174003041029, + "grad_norm": 0.9877476692199707, + "learning_rate": 5.8513470190464905e-05, + "loss": 0.8462, + "step": 139530 + 
}, + { + "epoch": 0.8914812874538416, + "grad_norm": 0.46770796179771423, + "learning_rate": 5.8508525733095285e-05, + "loss": 1.1589, + "step": 139540 + }, + { + "epoch": 0.8915451746035802, + "grad_norm": 1.1145647764205933, + "learning_rate": 5.8503581190037474e-05, + "loss": 1.1228, + "step": 139550 + }, + { + "epoch": 0.8916090617533189, + "grad_norm": 1.5091100931167603, + "learning_rate": 5.8498636561341224e-05, + "loss": 0.7566, + "step": 139560 + }, + { + "epoch": 0.8916729489030576, + "grad_norm": 0.8867336511611938, + "learning_rate": 5.849369184705635e-05, + "loss": 0.8833, + "step": 139570 + }, + { + "epoch": 0.8917368360527963, + "grad_norm": 1.0336995124816895, + "learning_rate": 5.8488747047232675e-05, + "loss": 0.9395, + "step": 139580 + }, + { + "epoch": 0.891800723202535, + "grad_norm": 1.3706622123718262, + "learning_rate": 5.848380216191995e-05, + "loss": 0.7776, + "step": 139590 + }, + { + "epoch": 0.8918646103522737, + "grad_norm": 1.2319433689117432, + "learning_rate": 5.8478857191168e-05, + "loss": 0.7916, + "step": 139600 + }, + { + "epoch": 0.8919284975020124, + "grad_norm": 0.5949766039848328, + "learning_rate": 5.847391213502663e-05, + "loss": 0.7991, + "step": 139610 + }, + { + "epoch": 0.8919923846517511, + "grad_norm": 0.7637538313865662, + "learning_rate": 5.846896699354564e-05, + "loss": 0.8839, + "step": 139620 + }, + { + "epoch": 0.8920562718014898, + "grad_norm": 1.117910385131836, + "learning_rate": 5.846402176677481e-05, + "loss": 0.7672, + "step": 139630 + }, + { + "epoch": 0.8921201589512285, + "grad_norm": 1.7316735982894897, + "learning_rate": 5.845907645476397e-05, + "loss": 1.0049, + "step": 139640 + }, + { + "epoch": 0.8921840461009672, + "grad_norm": 1.0572848320007324, + "learning_rate": 5.8454131057562914e-05, + "loss": 1.3189, + "step": 139650 + }, + { + "epoch": 0.892247933250706, + "grad_norm": 1.482458233833313, + "learning_rate": 5.844918557522143e-05, + "loss": 0.8126, + "step": 139660 + }, + { + "epoch": 
0.8923118204004447, + "grad_norm": 1.1422396898269653, + "learning_rate": 5.8444240007789343e-05, + "loss": 0.946, + "step": 139670 + }, + { + "epoch": 0.8923757075501834, + "grad_norm": 0.6669201254844666, + "learning_rate": 5.8439294355316455e-05, + "loss": 0.8283, + "step": 139680 + }, + { + "epoch": 0.8924395946999221, + "grad_norm": 0.7748156785964966, + "learning_rate": 5.8434348617852566e-05, + "loss": 0.8111, + "step": 139690 + }, + { + "epoch": 0.8925034818496608, + "grad_norm": 0.7147510051727295, + "learning_rate": 5.842940279544751e-05, + "loss": 0.7302, + "step": 139700 + }, + { + "epoch": 0.8925673689993995, + "grad_norm": 0.9036562442779541, + "learning_rate": 5.842445688815106e-05, + "loss": 0.8618, + "step": 139710 + }, + { + "epoch": 0.8926312561491382, + "grad_norm": 1.1501970291137695, + "learning_rate": 5.841951089601304e-05, + "loss": 0.7836, + "step": 139720 + }, + { + "epoch": 0.8926951432988769, + "grad_norm": 1.0131080150604248, + "learning_rate": 5.8414564819083275e-05, + "loss": 0.7891, + "step": 139730 + }, + { + "epoch": 0.8927590304486156, + "grad_norm": 0.6381675601005554, + "learning_rate": 5.8409618657411544e-05, + "loss": 0.9683, + "step": 139740 + }, + { + "epoch": 0.8928229175983543, + "grad_norm": 0.8520289063453674, + "learning_rate": 5.840467241104769e-05, + "loss": 0.8815, + "step": 139750 + }, + { + "epoch": 0.892886804748093, + "grad_norm": 0.7620411515235901, + "learning_rate": 5.8399726080041504e-05, + "loss": 0.8859, + "step": 139760 + }, + { + "epoch": 0.8929506918978317, + "grad_norm": 0.7203412652015686, + "learning_rate": 5.839477966444282e-05, + "loss": 0.863, + "step": 139770 + }, + { + "epoch": 0.8930145790475704, + "grad_norm": 1.159543752670288, + "learning_rate": 5.8389833164301445e-05, + "loss": 0.7974, + "step": 139780 + }, + { + "epoch": 0.8930784661973091, + "grad_norm": 0.6249431371688843, + "learning_rate": 5.838488657966717e-05, + "loss": 0.954, + "step": 139790 + }, + { + "epoch": 0.8931423533470477, + 
"grad_norm": 0.8362451195716858, + "learning_rate": 5.8379939910589854e-05, + "loss": 0.8083, + "step": 139800 + }, + { + "epoch": 0.8932062404967864, + "grad_norm": 1.5072931051254272, + "learning_rate": 5.8374993157119296e-05, + "loss": 1.3744, + "step": 139810 + }, + { + "epoch": 0.8932701276465251, + "grad_norm": 0.9383344054222107, + "learning_rate": 5.8370046319305296e-05, + "loss": 0.8008, + "step": 139820 + }, + { + "epoch": 0.8933340147962638, + "grad_norm": 0.8047425150871277, + "learning_rate": 5.8365099397197695e-05, + "loss": 1.0529, + "step": 139830 + }, + { + "epoch": 0.8933979019460025, + "grad_norm": 0.8353585600852966, + "learning_rate": 5.8360152390846304e-05, + "loss": 0.6732, + "step": 139840 + }, + { + "epoch": 0.8934617890957413, + "grad_norm": 1.0151777267456055, + "learning_rate": 5.835520530030094e-05, + "loss": 0.7437, + "step": 139850 + }, + { + "epoch": 0.89352567624548, + "grad_norm": 0.9449456930160522, + "learning_rate": 5.8350258125611436e-05, + "loss": 0.8322, + "step": 139860 + }, + { + "epoch": 0.8935895633952187, + "grad_norm": 1.3340734243392944, + "learning_rate": 5.834531086682762e-05, + "loss": 0.9176, + "step": 139870 + }, + { + "epoch": 0.8936534505449574, + "grad_norm": 0.7839272022247314, + "learning_rate": 5.834036352399929e-05, + "loss": 0.8046, + "step": 139880 + }, + { + "epoch": 0.8937173376946961, + "grad_norm": 1.2315632104873657, + "learning_rate": 5.833541609717629e-05, + "loss": 0.9361, + "step": 139890 + }, + { + "epoch": 0.8937812248444348, + "grad_norm": 1.0572025775909424, + "learning_rate": 5.833046858640844e-05, + "loss": 0.7237, + "step": 139900 + }, + { + "epoch": 0.8938451119941735, + "grad_norm": 0.9382676482200623, + "learning_rate": 5.832552099174556e-05, + "loss": 0.8231, + "step": 139910 + }, + { + "epoch": 0.8939089991439122, + "grad_norm": 1.3315147161483765, + "learning_rate": 5.832057331323748e-05, + "loss": 0.8058, + "step": 139920 + }, + { + "epoch": 0.8939728862936509, + "grad_norm": 
0.7122629284858704, + "learning_rate": 5.8316120330933764e-05, + "loss": 1.0295, + "step": 139930 + }, + { + "epoch": 0.8940367734433896, + "grad_norm": 0.9100248217582703, + "learning_rate": 5.831117249325708e-05, + "loss": 1.1005, + "step": 139940 + }, + { + "epoch": 0.8941006605931283, + "grad_norm": 1.913546085357666, + "learning_rate": 5.830622457187971e-05, + "loss": 0.9199, + "step": 139950 + }, + { + "epoch": 0.894164547742867, + "grad_norm": 0.9733704328536987, + "learning_rate": 5.830127656685145e-05, + "loss": 0.8767, + "step": 139960 + }, + { + "epoch": 0.8942284348926057, + "grad_norm": 1.0809566974639893, + "learning_rate": 5.8296328478222174e-05, + "loss": 0.7217, + "step": 139970 + }, + { + "epoch": 0.8942923220423444, + "grad_norm": 1.1782524585723877, + "learning_rate": 5.8291380306041685e-05, + "loss": 1.0244, + "step": 139980 + }, + { + "epoch": 0.8943562091920831, + "grad_norm": 0.9064955711364746, + "learning_rate": 5.828643205035982e-05, + "loss": 0.8093, + "step": 139990 + }, + { + "epoch": 0.8944200963418218, + "grad_norm": 1.3655163049697876, + "learning_rate": 5.828148371122643e-05, + "loss": 0.9088, + "step": 140000 + }, + { + "epoch": 0.8944839834915606, + "grad_norm": 0.760166585445404, + "learning_rate": 5.8276535288691325e-05, + "loss": 0.8999, + "step": 140010 + }, + { + "epoch": 0.8945478706412993, + "grad_norm": 1.0654029846191406, + "learning_rate": 5.8271586782804344e-05, + "loss": 0.9849, + "step": 140020 + }, + { + "epoch": 0.894611757791038, + "grad_norm": 0.7643817067146301, + "learning_rate": 5.826663819361534e-05, + "loss": 1.1517, + "step": 140030 + }, + { + "epoch": 0.8946756449407766, + "grad_norm": 0.8907740116119385, + "learning_rate": 5.8261689521174136e-05, + "loss": 1.0153, + "step": 140040 + }, + { + "epoch": 0.8947395320905153, + "grad_norm": 0.8669731616973877, + "learning_rate": 5.825674076553056e-05, + "loss": 0.9049, + "step": 140050 + }, + { + "epoch": 0.894803419240254, + "grad_norm": 0.9580491185188293, + 
"learning_rate": 5.8251791926734464e-05, + "loss": 0.885, + "step": 140060 + }, + { + "epoch": 0.8948673063899927, + "grad_norm": 1.1952874660491943, + "learning_rate": 5.8246843004835695e-05, + "loss": 0.8488, + "step": 140070 + }, + { + "epoch": 0.8949311935397314, + "grad_norm": 0.9386703372001648, + "learning_rate": 5.824189399988408e-05, + "loss": 0.8763, + "step": 140080 + }, + { + "epoch": 0.8949950806894701, + "grad_norm": 0.8566167950630188, + "learning_rate": 5.823694491192947e-05, + "loss": 0.7872, + "step": 140090 + }, + { + "epoch": 0.8950589678392088, + "grad_norm": 0.6722133755683899, + "learning_rate": 5.8231995741021685e-05, + "loss": 0.9128, + "step": 140100 + }, + { + "epoch": 0.8951228549889475, + "grad_norm": 0.688102662563324, + "learning_rate": 5.822704648721059e-05, + "loss": 0.8653, + "step": 140110 + }, + { + "epoch": 0.8951867421386862, + "grad_norm": 0.8262643814086914, + "learning_rate": 5.8222097150545996e-05, + "loss": 0.9295, + "step": 140120 + }, + { + "epoch": 0.8952506292884249, + "grad_norm": 0.7746517658233643, + "learning_rate": 5.821714773107779e-05, + "loss": 0.9269, + "step": 140130 + }, + { + "epoch": 0.8953145164381636, + "grad_norm": 0.5757784247398376, + "learning_rate": 5.82121982288558e-05, + "loss": 0.9843, + "step": 140140 + }, + { + "epoch": 0.8953784035879023, + "grad_norm": 0.8565959334373474, + "learning_rate": 5.8207248643929854e-05, + "loss": 0.9264, + "step": 140150 + }, + { + "epoch": 0.895442290737641, + "grad_norm": 0.7413806319236755, + "learning_rate": 5.820229897634983e-05, + "loss": 1.0038, + "step": 140160 + }, + { + "epoch": 0.8955061778873797, + "grad_norm": 0.862273633480072, + "learning_rate": 5.8197349226165556e-05, + "loss": 1.0254, + "step": 140170 + }, + { + "epoch": 0.8955700650371184, + "grad_norm": 1.1083346605300903, + "learning_rate": 5.8192399393426874e-05, + "loss": 1.0992, + "step": 140180 + }, + { + "epoch": 0.8956339521868572, + "grad_norm": 0.8349512219429016, + "learning_rate": 
5.818744947818367e-05, + "loss": 0.9434, + "step": 140190 + }, + { + "epoch": 0.8956978393365959, + "grad_norm": 0.8543719053268433, + "learning_rate": 5.818249948048573e-05, + "loss": 0.7931, + "step": 140200 + }, + { + "epoch": 0.8957617264863346, + "grad_norm": 0.930448055267334, + "learning_rate": 5.817754940038296e-05, + "loss": 0.9503, + "step": 140210 + }, + { + "epoch": 0.8958256136360733, + "grad_norm": 1.1186769008636475, + "learning_rate": 5.8172599237925195e-05, + "loss": 0.9436, + "step": 140220 + }, + { + "epoch": 0.895889500785812, + "grad_norm": 0.9526256322860718, + "learning_rate": 5.8167648993162285e-05, + "loss": 0.7081, + "step": 140230 + }, + { + "epoch": 0.8959533879355507, + "grad_norm": 1.0328584909439087, + "learning_rate": 5.816269866614408e-05, + "loss": 0.9017, + "step": 140240 + }, + { + "epoch": 0.8960172750852894, + "grad_norm": 0.8597428798675537, + "learning_rate": 5.815774825692044e-05, + "loss": 0.8572, + "step": 140250 + }, + { + "epoch": 0.8960811622350281, + "grad_norm": 1.493808388710022, + "learning_rate": 5.815279776554121e-05, + "loss": 0.728, + "step": 140260 + }, + { + "epoch": 0.8961450493847668, + "grad_norm": 0.6569556593894958, + "learning_rate": 5.814784719205626e-05, + "loss": 0.7934, + "step": 140270 + }, + { + "epoch": 0.8962089365345054, + "grad_norm": 1.5247915983200073, + "learning_rate": 5.814289653651544e-05, + "loss": 0.9852, + "step": 140280 + }, + { + "epoch": 0.8962728236842441, + "grad_norm": 0.8611108064651489, + "learning_rate": 5.8137945798968606e-05, + "loss": 0.7608, + "step": 140290 + }, + { + "epoch": 0.8963367108339828, + "grad_norm": 1.3073726892471313, + "learning_rate": 5.813299497946562e-05, + "loss": 1.0608, + "step": 140300 + }, + { + "epoch": 0.8964005979837215, + "grad_norm": 0.9002431631088257, + "learning_rate": 5.812804407805633e-05, + "loss": 0.9049, + "step": 140310 + }, + { + "epoch": 0.8964644851334602, + "grad_norm": 0.9179620742797852, + "learning_rate": 5.8123093094790603e-05, 
+ "loss": 0.761, + "step": 140320 + }, + { + "epoch": 0.8965283722831989, + "grad_norm": 1.2235110998153687, + "learning_rate": 5.8118142029718303e-05, + "loss": 0.7735, + "step": 140330 + }, + { + "epoch": 0.8965922594329376, + "grad_norm": 0.7875511646270752, + "learning_rate": 5.811319088288931e-05, + "loss": 1.0747, + "step": 140340 + }, + { + "epoch": 0.8966561465826763, + "grad_norm": 1.0100558996200562, + "learning_rate": 5.8108239654353444e-05, + "loss": 0.8439, + "step": 140350 + }, + { + "epoch": 0.896720033732415, + "grad_norm": 0.727079451084137, + "learning_rate": 5.81032883441606e-05, + "loss": 0.6591, + "step": 140360 + }, + { + "epoch": 0.8967839208821538, + "grad_norm": 0.9995219707489014, + "learning_rate": 5.809833695236063e-05, + "loss": 1.0365, + "step": 140370 + }, + { + "epoch": 0.8968478080318925, + "grad_norm": 1.349561095237732, + "learning_rate": 5.80933854790034e-05, + "loss": 0.8023, + "step": 140380 + }, + { + "epoch": 0.8969116951816312, + "grad_norm": 0.9812521934509277, + "learning_rate": 5.8088433924138785e-05, + "loss": 0.6831, + "step": 140390 + }, + { + "epoch": 0.8969755823313699, + "grad_norm": 0.8825498223304749, + "learning_rate": 5.808348228781662e-05, + "loss": 0.8826, + "step": 140400 + }, + { + "epoch": 0.8970394694811086, + "grad_norm": 1.0122778415679932, + "learning_rate": 5.807853057008682e-05, + "loss": 0.8666, + "step": 140410 + }, + { + "epoch": 0.8971033566308473, + "grad_norm": 0.6166019439697266, + "learning_rate": 5.807357877099922e-05, + "loss": 1.0452, + "step": 140420 + }, + { + "epoch": 0.897167243780586, + "grad_norm": 0.8858250379562378, + "learning_rate": 5.806862689060369e-05, + "loss": 1.0248, + "step": 140430 + }, + { + "epoch": 0.8972311309303247, + "grad_norm": 0.8427072167396545, + "learning_rate": 5.806367492895011e-05, + "loss": 0.7888, + "step": 140440 + }, + { + "epoch": 0.8972950180800634, + "grad_norm": 0.750184178352356, + "learning_rate": 5.805872288608834e-05, + "loss": 0.7918, + "step": 
140450 + }, + { + "epoch": 0.8973589052298021, + "grad_norm": 1.128303050994873, + "learning_rate": 5.805377076206828e-05, + "loss": 0.6939, + "step": 140460 + }, + { + "epoch": 0.8974227923795408, + "grad_norm": 0.7906418442726135, + "learning_rate": 5.804881855693976e-05, + "loss": 0.9361, + "step": 140470 + }, + { + "epoch": 0.8974866795292795, + "grad_norm": 0.7705846428871155, + "learning_rate": 5.804386627075268e-05, + "loss": 1.2284, + "step": 140480 + }, + { + "epoch": 0.8975505666790182, + "grad_norm": 0.898186445236206, + "learning_rate": 5.803891390355691e-05, + "loss": 0.7291, + "step": 140490 + }, + { + "epoch": 0.8976144538287569, + "grad_norm": 0.7422863841056824, + "learning_rate": 5.803396145540232e-05, + "loss": 0.7275, + "step": 140500 + }, + { + "epoch": 0.8976783409784956, + "grad_norm": 1.0972338914871216, + "learning_rate": 5.802900892633879e-05, + "loss": 0.8772, + "step": 140510 + }, + { + "epoch": 0.8977422281282342, + "grad_norm": 1.0843368768692017, + "learning_rate": 5.8024056316416197e-05, + "loss": 0.9729, + "step": 140520 + }, + { + "epoch": 0.8978061152779729, + "grad_norm": 1.3560301065444946, + "learning_rate": 5.801910362568441e-05, + "loss": 1.0922, + "step": 140530 + }, + { + "epoch": 0.8978700024277116, + "grad_norm": 0.6802355647087097, + "learning_rate": 5.801415085419332e-05, + "loss": 0.851, + "step": 140540 + }, + { + "epoch": 0.8979338895774504, + "grad_norm": 1.0492808818817139, + "learning_rate": 5.800919800199279e-05, + "loss": 0.7976, + "step": 140550 + }, + { + "epoch": 0.8979977767271891, + "grad_norm": 0.4455210864543915, + "learning_rate": 5.8004245069132714e-05, + "loss": 1.0168, + "step": 140560 + }, + { + "epoch": 0.8980616638769278, + "grad_norm": 1.461052417755127, + "learning_rate": 5.799929205566296e-05, + "loss": 0.7563, + "step": 140570 + }, + { + "epoch": 0.8981255510266665, + "grad_norm": 0.7567191123962402, + "learning_rate": 5.799433896163342e-05, + "loss": 0.7135, + "step": 140580 + }, + { + 
"epoch": 0.8981894381764052, + "grad_norm": 1.1161195039749146, + "learning_rate": 5.7989385787093965e-05, + "loss": 1.2382, + "step": 140590 + }, + { + "epoch": 0.8982533253261439, + "grad_norm": 1.0105892419815063, + "learning_rate": 5.798443253209449e-05, + "loss": 0.9698, + "step": 140600 + }, + { + "epoch": 0.8983172124758826, + "grad_norm": 0.8450389504432678, + "learning_rate": 5.797947919668486e-05, + "loss": 1.1442, + "step": 140610 + }, + { + "epoch": 0.8983810996256213, + "grad_norm": 1.1079736948013306, + "learning_rate": 5.797452578091498e-05, + "loss": 0.9697, + "step": 140620 + }, + { + "epoch": 0.89844498677536, + "grad_norm": 1.0912140607833862, + "learning_rate": 5.796957228483473e-05, + "loss": 0.9725, + "step": 140630 + }, + { + "epoch": 0.8985088739250987, + "grad_norm": 1.1677342653274536, + "learning_rate": 5.7964618708493966e-05, + "loss": 0.996, + "step": 140640 + }, + { + "epoch": 0.8985727610748374, + "grad_norm": 0.8014145493507385, + "learning_rate": 5.7959665051942626e-05, + "loss": 0.8948, + "step": 140650 + }, + { + "epoch": 0.8986366482245761, + "grad_norm": 0.5833203792572021, + "learning_rate": 5.795471131523057e-05, + "loss": 0.8698, + "step": 140660 + }, + { + "epoch": 0.8987005353743148, + "grad_norm": 1.0062291622161865, + "learning_rate": 5.7949757498407686e-05, + "loss": 0.8926, + "step": 140670 + }, + { + "epoch": 0.8987644225240535, + "grad_norm": 0.8668988943099976, + "learning_rate": 5.7944803601523866e-05, + "loss": 0.9216, + "step": 140680 + }, + { + "epoch": 0.8988283096737922, + "grad_norm": 0.9266487956047058, + "learning_rate": 5.793984962462901e-05, + "loss": 0.887, + "step": 140690 + }, + { + "epoch": 0.8988921968235309, + "grad_norm": 0.897591769695282, + "learning_rate": 5.793489556777299e-05, + "loss": 0.815, + "step": 140700 + }, + { + "epoch": 0.8989560839732696, + "grad_norm": 1.11785089969635, + "learning_rate": 5.792994143100571e-05, + "loss": 0.7505, + "step": 140710 + }, + { + "epoch": 
0.8990199711230084, + "grad_norm": 0.7704851627349854, + "learning_rate": 5.7924987214377056e-05, + "loss": 0.7002, + "step": 140720 + }, + { + "epoch": 0.8990838582727471, + "grad_norm": 1.0531551837921143, + "learning_rate": 5.7920032917936925e-05, + "loss": 0.8227, + "step": 140730 + }, + { + "epoch": 0.8991477454224858, + "grad_norm": 1.1784263849258423, + "learning_rate": 5.791507854173521e-05, + "loss": 1.0551, + "step": 140740 + }, + { + "epoch": 0.8992116325722245, + "grad_norm": 1.1204239130020142, + "learning_rate": 5.791012408582182e-05, + "loss": 0.7425, + "step": 140750 + }, + { + "epoch": 0.8992755197219632, + "grad_norm": 1.3130213022232056, + "learning_rate": 5.790516955024662e-05, + "loss": 0.6903, + "step": 140760 + }, + { + "epoch": 0.8993394068717018, + "grad_norm": 1.0173828601837158, + "learning_rate": 5.790021493505953e-05, + "loss": 1.0036, + "step": 140770 + }, + { + "epoch": 0.8994032940214405, + "grad_norm": 0.8122161030769348, + "learning_rate": 5.789526024031044e-05, + "loss": 0.909, + "step": 140780 + }, + { + "epoch": 0.8994671811711792, + "grad_norm": 1.2210887670516968, + "learning_rate": 5.7890305466049255e-05, + "loss": 0.9721, + "step": 140790 + }, + { + "epoch": 0.8995310683209179, + "grad_norm": 1.316361665725708, + "learning_rate": 5.788535061232586e-05, + "loss": 0.8888, + "step": 140800 + }, + { + "epoch": 0.8995949554706566, + "grad_norm": 0.7551913857460022, + "learning_rate": 5.788039567919017e-05, + "loss": 1.071, + "step": 140810 + }, + { + "epoch": 0.8996588426203953, + "grad_norm": 1.1682409048080444, + "learning_rate": 5.787544066669207e-05, + "loss": 0.7179, + "step": 140820 + }, + { + "epoch": 0.899722729770134, + "grad_norm": 1.1482545137405396, + "learning_rate": 5.787048557488147e-05, + "loss": 0.8796, + "step": 140830 + }, + { + "epoch": 0.8997866169198727, + "grad_norm": 0.6773545145988464, + "learning_rate": 5.786553040380828e-05, + "loss": 0.8058, + "step": 140840 + }, + { + "epoch": 0.8998505040696114, + 
"grad_norm": 1.3889111280441284, + "learning_rate": 5.7860575153522375e-05, + "loss": 0.7399, + "step": 140850 + }, + { + "epoch": 0.8999143912193501, + "grad_norm": 1.0887017250061035, + "learning_rate": 5.785561982407371e-05, + "loss": 0.7941, + "step": 140860 + }, + { + "epoch": 0.8999782783690888, + "grad_norm": 0.843041718006134, + "learning_rate": 5.785066441551212e-05, + "loss": 0.9538, + "step": 140870 + }, + { + "epoch": 0.9000421655188275, + "grad_norm": 0.9653436541557312, + "learning_rate": 5.784570892788758e-05, + "loss": 1.5095, + "step": 140880 + }, + { + "epoch": 0.9001060526685662, + "grad_norm": 0.824621319770813, + "learning_rate": 5.7840753361249945e-05, + "loss": 0.9549, + "step": 140890 + }, + { + "epoch": 0.900169939818305, + "grad_norm": 0.7692892551422119, + "learning_rate": 5.783579771564914e-05, + "loss": 0.8578, + "step": 140900 + }, + { + "epoch": 0.9002338269680437, + "grad_norm": 0.9946816563606262, + "learning_rate": 5.7830841991135086e-05, + "loss": 0.8378, + "step": 140910 + }, + { + "epoch": 0.9002977141177824, + "grad_norm": 0.8059331178665161, + "learning_rate": 5.782588618775766e-05, + "loss": 0.8517, + "step": 140920 + }, + { + "epoch": 0.9003616012675211, + "grad_norm": 0.7814688086509705, + "learning_rate": 5.782093030556681e-05, + "loss": 0.8913, + "step": 140930 + }, + { + "epoch": 0.9004254884172598, + "grad_norm": 0.7162201404571533, + "learning_rate": 5.781597434461241e-05, + "loss": 0.7852, + "step": 140940 + }, + { + "epoch": 0.9004893755669985, + "grad_norm": 0.7033975720405579, + "learning_rate": 5.78110183049444e-05, + "loss": 0.8596, + "step": 140950 + }, + { + "epoch": 0.9005532627167372, + "grad_norm": 1.4916331768035889, + "learning_rate": 5.7806062186612666e-05, + "loss": 0.9765, + "step": 140960 + }, + { + "epoch": 0.9006171498664759, + "grad_norm": 0.9655159115791321, + "learning_rate": 5.7801105989667134e-05, + "loss": 0.6591, + "step": 140970 + }, + { + "epoch": 0.9006810370162146, + "grad_norm": 
1.061277985572815, + "learning_rate": 5.7796149714157724e-05, + "loss": 1.0504, + "step": 140980 + }, + { + "epoch": 0.9007449241659533, + "grad_norm": 0.7868290543556213, + "learning_rate": 5.779119336013433e-05, + "loss": 0.8025, + "step": 140990 + }, + { + "epoch": 0.900808811315692, + "grad_norm": 0.7589150667190552, + "learning_rate": 5.7786236927646886e-05, + "loss": 0.704, + "step": 141000 + }, + { + "epoch": 0.9008726984654306, + "grad_norm": 0.7982025742530823, + "learning_rate": 5.77812804167453e-05, + "loss": 0.8338, + "step": 141010 + }, + { + "epoch": 0.9009365856151693, + "grad_norm": 1.031693458557129, + "learning_rate": 5.7776323827479484e-05, + "loss": 0.8807, + "step": 141020 + }, + { + "epoch": 0.901000472764908, + "grad_norm": 0.9753697514533997, + "learning_rate": 5.777136715989936e-05, + "loss": 0.7684, + "step": 141030 + }, + { + "epoch": 0.9010643599146467, + "grad_norm": 0.9821022152900696, + "learning_rate": 5.776641041405485e-05, + "loss": 0.7901, + "step": 141040 + }, + { + "epoch": 0.9011282470643854, + "grad_norm": 0.5876692533493042, + "learning_rate": 5.776145358999587e-05, + "loss": 0.8646, + "step": 141050 + }, + { + "epoch": 0.9011921342141241, + "grad_norm": 1.2983272075653076, + "learning_rate": 5.7756496687772346e-05, + "loss": 0.721, + "step": 141060 + }, + { + "epoch": 0.9012560213638628, + "grad_norm": 0.7570874691009521, + "learning_rate": 5.775153970743418e-05, + "loss": 0.776, + "step": 141070 + }, + { + "epoch": 0.9013199085136016, + "grad_norm": 0.7375748157501221, + "learning_rate": 5.77465826490313e-05, + "loss": 0.7279, + "step": 141080 + }, + { + "epoch": 0.9013837956633403, + "grad_norm": 1.1768232583999634, + "learning_rate": 5.774162551261363e-05, + "loss": 0.8846, + "step": 141090 + }, + { + "epoch": 0.901447682813079, + "grad_norm": 0.6223422884941101, + "learning_rate": 5.7736668298231103e-05, + "loss": 0.8627, + "step": 141100 + }, + { + "epoch": 0.9015115699628177, + "grad_norm": 0.9526621699333191, + 
"learning_rate": 5.773171100593362e-05, + "loss": 1.0312, + "step": 141110 + }, + { + "epoch": 0.9015754571125564, + "grad_norm": 1.0160198211669922, + "learning_rate": 5.772675363577112e-05, + "loss": 1.0077, + "step": 141120 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.5634738802909851, + "learning_rate": 5.772179618779354e-05, + "loss": 0.8606, + "step": 141130 + }, + { + "epoch": 0.9017032314120338, + "grad_norm": 0.93465656042099, + "learning_rate": 5.7716838662050784e-05, + "loss": 1.0614, + "step": 141140 + }, + { + "epoch": 0.9017671185617725, + "grad_norm": 0.9534331560134888, + "learning_rate": 5.7711881058592786e-05, + "loss": 0.796, + "step": 141150 + }, + { + "epoch": 0.9018310057115112, + "grad_norm": 1.255573034286499, + "learning_rate": 5.7706923377469477e-05, + "loss": 0.7282, + "step": 141160 + }, + { + "epoch": 0.9018948928612499, + "grad_norm": 0.8945760726928711, + "learning_rate": 5.770196561873077e-05, + "loss": 0.7917, + "step": 141170 + }, + { + "epoch": 0.9019587800109886, + "grad_norm": 1.1224944591522217, + "learning_rate": 5.769700778242661e-05, + "loss": 1.3042, + "step": 141180 + }, + { + "epoch": 0.9020226671607273, + "grad_norm": 1.421687126159668, + "learning_rate": 5.769204986860692e-05, + "loss": 0.8137, + "step": 141190 + }, + { + "epoch": 0.902086554310466, + "grad_norm": 0.8872746229171753, + "learning_rate": 5.7687091877321654e-05, + "loss": 0.7287, + "step": 141200 + }, + { + "epoch": 0.9021504414602047, + "grad_norm": 0.8511372804641724, + "learning_rate": 5.7682133808620706e-05, + "loss": 0.6813, + "step": 141210 + }, + { + "epoch": 0.9022143286099434, + "grad_norm": 0.7379246354103088, + "learning_rate": 5.7677175662554025e-05, + "loss": 0.9436, + "step": 141220 + }, + { + "epoch": 0.9022782157596821, + "grad_norm": 0.7902219891548157, + "learning_rate": 5.767221743917155e-05, + "loss": 1.0224, + "step": 141230 + }, + { + "epoch": 0.9023421029094209, + "grad_norm": 1.0444148778915405, + "learning_rate": 
5.766725913852321e-05, + "loss": 0.7918, + "step": 141240 + }, + { + "epoch": 0.9024059900591594, + "grad_norm": 0.641941249370575, + "learning_rate": 5.766230076065893e-05, + "loss": 0.749, + "step": 141250 + }, + { + "epoch": 0.9024698772088982, + "grad_norm": 0.4563290476799011, + "learning_rate": 5.7657342305628647e-05, + "loss": 0.6918, + "step": 141260 + }, + { + "epoch": 0.9025337643586369, + "grad_norm": 0.9786915183067322, + "learning_rate": 5.765238377348232e-05, + "loss": 0.6863, + "step": 141270 + }, + { + "epoch": 0.9025976515083756, + "grad_norm": 0.9505366086959839, + "learning_rate": 5.764742516426985e-05, + "loss": 0.8688, + "step": 141280 + }, + { + "epoch": 0.9026615386581143, + "grad_norm": 0.6907140612602234, + "learning_rate": 5.76424664780412e-05, + "loss": 0.6706, + "step": 141290 + }, + { + "epoch": 0.902725425807853, + "grad_norm": 0.9045562148094177, + "learning_rate": 5.7637507714846304e-05, + "loss": 0.9848, + "step": 141300 + }, + { + "epoch": 0.9027893129575917, + "grad_norm": 1.1987589597702026, + "learning_rate": 5.763254887473512e-05, + "loss": 0.8481, + "step": 141310 + }, + { + "epoch": 0.9028532001073304, + "grad_norm": 1.0016857385635376, + "learning_rate": 5.7627589957757535e-05, + "loss": 0.9294, + "step": 141320 + }, + { + "epoch": 0.9029170872570691, + "grad_norm": 0.8415845632553101, + "learning_rate": 5.762263096396351e-05, + "loss": 0.7345, + "step": 141330 + }, + { + "epoch": 0.9029809744068078, + "grad_norm": 0.8191704154014587, + "learning_rate": 5.761767189340302e-05, + "loss": 0.9255, + "step": 141340 + }, + { + "epoch": 0.9030448615565465, + "grad_norm": 0.7609559893608093, + "learning_rate": 5.761271274612597e-05, + "loss": 0.7549, + "step": 141350 + }, + { + "epoch": 0.9031087487062852, + "grad_norm": 0.6478745341300964, + "learning_rate": 5.7607753522182326e-05, + "loss": 0.882, + "step": 141360 + }, + { + "epoch": 0.9031726358560239, + "grad_norm": 1.0700238943099976, + "learning_rate": 5.7602794221622024e-05, 
+ "loss": 0.8373, + "step": 141370 + }, + { + "epoch": 0.9032365230057626, + "grad_norm": 0.638687252998352, + "learning_rate": 5.7597834844495005e-05, + "loss": 0.8512, + "step": 141380 + }, + { + "epoch": 0.9033004101555013, + "grad_norm": 0.8440176248550415, + "learning_rate": 5.759287539085121e-05, + "loss": 0.8574, + "step": 141390 + }, + { + "epoch": 0.90336429730524, + "grad_norm": 1.7291699647903442, + "learning_rate": 5.7587915860740596e-05, + "loss": 0.9808, + "step": 141400 + }, + { + "epoch": 0.9034281844549787, + "grad_norm": 1.2312055826187134, + "learning_rate": 5.758295625421311e-05, + "loss": 0.7749, + "step": 141410 + }, + { + "epoch": 0.9034920716047175, + "grad_norm": 0.8590479493141174, + "learning_rate": 5.757799657131868e-05, + "loss": 0.9986, + "step": 141420 + }, + { + "epoch": 0.9035559587544562, + "grad_norm": 0.7374904155731201, + "learning_rate": 5.757303681210728e-05, + "loss": 0.7731, + "step": 141430 + }, + { + "epoch": 0.9036198459041949, + "grad_norm": 1.1177715063095093, + "learning_rate": 5.756807697662885e-05, + "loss": 1.0543, + "step": 141440 + }, + { + "epoch": 0.9036837330539336, + "grad_norm": 0.9571028351783752, + "learning_rate": 5.7563117064933327e-05, + "loss": 0.9526, + "step": 141450 + }, + { + "epoch": 0.9037476202036723, + "grad_norm": 0.8054457902908325, + "learning_rate": 5.755815707707067e-05, + "loss": 1.079, + "step": 141460 + }, + { + "epoch": 0.903811507353411, + "grad_norm": 1.010640263557434, + "learning_rate": 5.755319701309084e-05, + "loss": 0.9706, + "step": 141470 + }, + { + "epoch": 0.9038753945031497, + "grad_norm": 0.7342219948768616, + "learning_rate": 5.7548236873043795e-05, + "loss": 0.6875, + "step": 141480 + }, + { + "epoch": 0.9039392816528884, + "grad_norm": 0.8823431730270386, + "learning_rate": 5.754327665697945e-05, + "loss": 0.8113, + "step": 141490 + }, + { + "epoch": 0.904003168802627, + "grad_norm": 0.9259976744651794, + "learning_rate": 5.75383163649478e-05, + "loss": 0.7433, + "step": 
141500 + }, + { + "epoch": 0.9040670559523657, + "grad_norm": 0.7435508370399475, + "learning_rate": 5.753335599699877e-05, + "loss": 0.725, + "step": 141510 + }, + { + "epoch": 0.9041309431021044, + "grad_norm": 1.2026573419570923, + "learning_rate": 5.752839555318235e-05, + "loss": 0.9269, + "step": 141520 + }, + { + "epoch": 0.9041948302518431, + "grad_norm": 0.9192835092544556, + "learning_rate": 5.752343503354844e-05, + "loss": 0.7876, + "step": 141530 + }, + { + "epoch": 0.9042587174015818, + "grad_norm": 0.9267190098762512, + "learning_rate": 5.7518474438147054e-05, + "loss": 0.9516, + "step": 141540 + }, + { + "epoch": 0.9043226045513205, + "grad_norm": 0.72882479429245, + "learning_rate": 5.7513513767028124e-05, + "loss": 0.8018, + "step": 141550 + }, + { + "epoch": 0.9043864917010592, + "grad_norm": 0.7397010326385498, + "learning_rate": 5.7508553020241606e-05, + "loss": 0.7555, + "step": 141560 + }, + { + "epoch": 0.9044503788507979, + "grad_norm": 1.5903676748275757, + "learning_rate": 5.750359219783746e-05, + "loss": 0.8326, + "step": 141570 + }, + { + "epoch": 0.9045142660005366, + "grad_norm": 1.2328916788101196, + "learning_rate": 5.749863129986566e-05, + "loss": 1.1779, + "step": 141580 + }, + { + "epoch": 0.9045781531502753, + "grad_norm": 1.1340621709823608, + "learning_rate": 5.7493670326376146e-05, + "loss": 1.0768, + "step": 141590 + }, + { + "epoch": 0.904642040300014, + "grad_norm": 0.6127023100852966, + "learning_rate": 5.74887092774189e-05, + "loss": 0.9385, + "step": 141600 + }, + { + "epoch": 0.9047059274497528, + "grad_norm": 1.0815454721450806, + "learning_rate": 5.748374815304386e-05, + "loss": 0.6736, + "step": 141610 + }, + { + "epoch": 0.9047698145994915, + "grad_norm": 1.3410221338272095, + "learning_rate": 5.7478786953301014e-05, + "loss": 0.8431, + "step": 141620 + }, + { + "epoch": 0.9048337017492302, + "grad_norm": 1.1525499820709229, + "learning_rate": 5.74738256782403e-05, + "loss": 0.872, + "step": 141630 + }, + { + 
"epoch": 0.9048975888989689, + "grad_norm": 0.8780061602592468, + "learning_rate": 5.74688643279117e-05, + "loss": 0.9206, + "step": 141640 + }, + { + "epoch": 0.9049614760487076, + "grad_norm": 1.385847568511963, + "learning_rate": 5.7463902902365174e-05, + "loss": 0.8192, + "step": 141650 + }, + { + "epoch": 0.9050253631984463, + "grad_norm": 1.0178450345993042, + "learning_rate": 5.745894140165069e-05, + "loss": 0.8582, + "step": 141660 + }, + { + "epoch": 0.905089250348185, + "grad_norm": 1.3906409740447998, + "learning_rate": 5.745397982581822e-05, + "loss": 0.79, + "step": 141670 + }, + { + "epoch": 0.9051531374979237, + "grad_norm": 0.7807207107543945, + "learning_rate": 5.7449018174917726e-05, + "loss": 0.721, + "step": 141680 + }, + { + "epoch": 0.9052170246476624, + "grad_norm": 0.7866250872612, + "learning_rate": 5.744405644899916e-05, + "loss": 0.7746, + "step": 141690 + }, + { + "epoch": 0.9052809117974011, + "grad_norm": 0.7729105949401855, + "learning_rate": 5.74390946481125e-05, + "loss": 0.682, + "step": 141700 + }, + { + "epoch": 0.9053447989471398, + "grad_norm": 1.1368606090545654, + "learning_rate": 5.7434132772307735e-05, + "loss": 1.2501, + "step": 141710 + }, + { + "epoch": 0.9054086860968785, + "grad_norm": 2.2718279361724854, + "learning_rate": 5.742917082163483e-05, + "loss": 0.8217, + "step": 141720 + }, + { + "epoch": 0.9054725732466172, + "grad_norm": 0.683323323726654, + "learning_rate": 5.742420879614373e-05, + "loss": 1.092, + "step": 141730 + }, + { + "epoch": 0.9055364603963558, + "grad_norm": 0.8295241594314575, + "learning_rate": 5.741924669588443e-05, + "loss": 0.6993, + "step": 141740 + }, + { + "epoch": 0.9056003475460945, + "grad_norm": 0.8456636667251587, + "learning_rate": 5.7414284520906905e-05, + "loss": 0.6898, + "step": 141750 + }, + { + "epoch": 0.9056642346958332, + "grad_norm": 0.8255101442337036, + "learning_rate": 5.7409322271261115e-05, + "loss": 0.9425, + "step": 141760 + }, + { + "epoch": 0.9057281218455719, + 
"grad_norm": 0.7259197235107422, + "learning_rate": 5.740435994699704e-05, + "loss": 0.7357, + "step": 141770 + }, + { + "epoch": 0.9057920089953106, + "grad_norm": 0.8073982000350952, + "learning_rate": 5.739939754816466e-05, + "loss": 1.0821, + "step": 141780 + }, + { + "epoch": 0.9058558961450494, + "grad_norm": 0.9572609663009644, + "learning_rate": 5.7394435074813944e-05, + "loss": 0.9942, + "step": 141790 + }, + { + "epoch": 0.9059197832947881, + "grad_norm": 0.7269392013549805, + "learning_rate": 5.738947252699487e-05, + "loss": 0.8068, + "step": 141800 + }, + { + "epoch": 0.9059836704445268, + "grad_norm": 0.710191547870636, + "learning_rate": 5.738450990475741e-05, + "loss": 0.8417, + "step": 141810 + }, + { + "epoch": 0.9060475575942655, + "grad_norm": 1.1868984699249268, + "learning_rate": 5.7379547208151554e-05, + "loss": 0.7552, + "step": 141820 + }, + { + "epoch": 0.9061114447440042, + "grad_norm": 0.8787739872932434, + "learning_rate": 5.737458443722726e-05, + "loss": 0.9598, + "step": 141830 + }, + { + "epoch": 0.9061753318937429, + "grad_norm": 0.5496005415916443, + "learning_rate": 5.736962159203453e-05, + "loss": 0.801, + "step": 141840 + }, + { + "epoch": 0.9062392190434816, + "grad_norm": 1.013084053993225, + "learning_rate": 5.736465867262333e-05, + "loss": 0.9996, + "step": 141850 + }, + { + "epoch": 0.9063031061932203, + "grad_norm": 0.8402630686759949, + "learning_rate": 5.735969567904363e-05, + "loss": 0.7736, + "step": 141860 + }, + { + "epoch": 0.906366993342959, + "grad_norm": 0.7525802850723267, + "learning_rate": 5.735473261134545e-05, + "loss": 0.6874, + "step": 141870 + }, + { + "epoch": 0.9064308804926977, + "grad_norm": 0.8910543918609619, + "learning_rate": 5.734976946957875e-05, + "loss": 0.8928, + "step": 141880 + }, + { + "epoch": 0.9064947676424364, + "grad_norm": 1.0312716960906982, + "learning_rate": 5.7344806253793504e-05, + "loss": 0.8406, + "step": 141890 + }, + { + "epoch": 0.9065586547921751, + "grad_norm": 
0.8210422396659851, + "learning_rate": 5.733984296403971e-05, + "loss": 1.0218, + "step": 141900 + }, + { + "epoch": 0.9066225419419138, + "grad_norm": 1.0540539026260376, + "learning_rate": 5.733487960036735e-05, + "loss": 0.9388, + "step": 141910 + }, + { + "epoch": 0.9066864290916525, + "grad_norm": 1.0182119607925415, + "learning_rate": 5.73299161628264e-05, + "loss": 0.9436, + "step": 141920 + }, + { + "epoch": 0.9067503162413912, + "grad_norm": 0.9228383898735046, + "learning_rate": 5.732495265146687e-05, + "loss": 0.7732, + "step": 141930 + }, + { + "epoch": 0.90681420339113, + "grad_norm": 0.8174379467964172, + "learning_rate": 5.731998906633871e-05, + "loss": 1.08, + "step": 141940 + }, + { + "epoch": 0.9068780905408687, + "grad_norm": 0.807985246181488, + "learning_rate": 5.731502540749194e-05, + "loss": 0.9834, + "step": 141950 + }, + { + "epoch": 0.9069419776906074, + "grad_norm": 0.8408271670341492, + "learning_rate": 5.7310061674976526e-05, + "loss": 0.958, + "step": 141960 + }, + { + "epoch": 0.9070058648403461, + "grad_norm": 0.7872259616851807, + "learning_rate": 5.730509786884247e-05, + "loss": 1.0426, + "step": 141970 + }, + { + "epoch": 0.9070697519900847, + "grad_norm": 0.7922796010971069, + "learning_rate": 5.730013398913976e-05, + "loss": 1.0874, + "step": 141980 + }, + { + "epoch": 0.9071336391398234, + "grad_norm": 0.8514977693557739, + "learning_rate": 5.729517003591839e-05, + "loss": 1.1897, + "step": 141990 + }, + { + "epoch": 0.9071975262895621, + "grad_norm": 1.1067577600479126, + "learning_rate": 5.729020600922833e-05, + "loss": 1.1056, + "step": 142000 + }, + { + "epoch": 0.9072614134393008, + "grad_norm": 1.1628286838531494, + "learning_rate": 5.7285241909119606e-05, + "loss": 1.0116, + "step": 142010 + }, + { + "epoch": 0.9073253005890395, + "grad_norm": 1.2236912250518799, + "learning_rate": 5.7280277735642184e-05, + "loss": 0.7426, + "step": 142020 + }, + { + "epoch": 0.9073891877387782, + "grad_norm": 1.162598729133606, + 
"learning_rate": 5.727531348884607e-05, + "loss": 0.8706, + "step": 142030 + }, + { + "epoch": 0.9074530748885169, + "grad_norm": 1.4383586645126343, + "learning_rate": 5.7270349168781256e-05, + "loss": 0.8572, + "step": 142040 + }, + { + "epoch": 0.9075169620382556, + "grad_norm": 0.8928152322769165, + "learning_rate": 5.726538477549774e-05, + "loss": 0.8158, + "step": 142050 + }, + { + "epoch": 0.9075808491879943, + "grad_norm": 0.8349987864494324, + "learning_rate": 5.7260420309045507e-05, + "loss": 1.1324, + "step": 142060 + }, + { + "epoch": 0.907644736337733, + "grad_norm": 0.7434611320495605, + "learning_rate": 5.725545576947456e-05, + "loss": 0.9721, + "step": 142070 + }, + { + "epoch": 0.9077086234874717, + "grad_norm": 0.7635177373886108, + "learning_rate": 5.72504911568349e-05, + "loss": 0.8982, + "step": 142080 + }, + { + "epoch": 0.9077725106372104, + "grad_norm": 0.7675304412841797, + "learning_rate": 5.724552647117653e-05, + "loss": 0.9434, + "step": 142090 + }, + { + "epoch": 0.9078363977869491, + "grad_norm": 0.9475330710411072, + "learning_rate": 5.724056171254942e-05, + "loss": 0.6869, + "step": 142100 + }, + { + "epoch": 0.9079002849366878, + "grad_norm": 1.1535788774490356, + "learning_rate": 5.7235596881003604e-05, + "loss": 0.7812, + "step": 142110 + }, + { + "epoch": 0.9079641720864265, + "grad_norm": 0.9977142214775085, + "learning_rate": 5.723063197658907e-05, + "loss": 1.0825, + "step": 142120 + }, + { + "epoch": 0.9080280592361653, + "grad_norm": 0.7173039317131042, + "learning_rate": 5.722566699935581e-05, + "loss": 0.843, + "step": 142130 + }, + { + "epoch": 0.908091946385904, + "grad_norm": 1.8091074228286743, + "learning_rate": 5.7220701949353825e-05, + "loss": 0.8146, + "step": 142140 + }, + { + "epoch": 0.9081558335356427, + "grad_norm": 1.055882453918457, + "learning_rate": 5.7215736826633135e-05, + "loss": 0.7627, + "step": 142150 + }, + { + "epoch": 0.9082197206853814, + "grad_norm": 0.6005893349647522, + "learning_rate": 
5.721077163124373e-05, + "loss": 1.0631, + "step": 142160 + }, + { + "epoch": 0.9082836078351201, + "grad_norm": 0.9407183527946472, + "learning_rate": 5.7205806363235616e-05, + "loss": 0.982, + "step": 142170 + }, + { + "epoch": 0.9083474949848588, + "grad_norm": 1.6951545476913452, + "learning_rate": 5.7200841022658804e-05, + "loss": 0.8731, + "step": 142180 + }, + { + "epoch": 0.9084113821345975, + "grad_norm": 1.102827787399292, + "learning_rate": 5.719587560956327e-05, + "loss": 0.9827, + "step": 142190 + }, + { + "epoch": 0.9084752692843362, + "grad_norm": 1.1529837846755981, + "learning_rate": 5.719091012399907e-05, + "loss": 0.8706, + "step": 142200 + }, + { + "epoch": 0.9085391564340749, + "grad_norm": 0.8350210785865784, + "learning_rate": 5.718594456601618e-05, + "loss": 0.8377, + "step": 142210 + }, + { + "epoch": 0.9086030435838135, + "grad_norm": 0.9454380869865417, + "learning_rate": 5.71809789356646e-05, + "loss": 0.893, + "step": 142220 + }, + { + "epoch": 0.9086669307335522, + "grad_norm": 0.8189147710800171, + "learning_rate": 5.7176013232994354e-05, + "loss": 0.9778, + "step": 142230 + }, + { + "epoch": 0.9087308178832909, + "grad_norm": 0.8861716389656067, + "learning_rate": 5.717104745805545e-05, + "loss": 1.0042, + "step": 142240 + }, + { + "epoch": 0.9087947050330296, + "grad_norm": 0.785626232624054, + "learning_rate": 5.716608161089789e-05, + "loss": 0.8079, + "step": 142250 + }, + { + "epoch": 0.9088585921827683, + "grad_norm": 1.1369320154190063, + "learning_rate": 5.716111569157169e-05, + "loss": 0.7916, + "step": 142260 + }, + { + "epoch": 0.908922479332507, + "grad_norm": 1.1900125741958618, + "learning_rate": 5.715614970012686e-05, + "loss": 0.9075, + "step": 142270 + }, + { + "epoch": 0.9089863664822457, + "grad_norm": 1.0034444332122803, + "learning_rate": 5.7151183636613425e-05, + "loss": 0.9664, + "step": 142280 + }, + { + "epoch": 0.9090502536319844, + "grad_norm": 0.9154879450798035, + "learning_rate": 5.714621750108138e-05, + 
"loss": 0.9571, + "step": 142290 + }, + { + "epoch": 0.9091141407817231, + "grad_norm": 0.6897664070129395, + "learning_rate": 5.714125129358072e-05, + "loss": 0.9097, + "step": 142300 + }, + { + "epoch": 0.9091780279314619, + "grad_norm": 0.6371667385101318, + "learning_rate": 5.7136285014161506e-05, + "loss": 0.7985, + "step": 142310 + }, + { + "epoch": 0.9092419150812006, + "grad_norm": 1.1131823062896729, + "learning_rate": 5.713131866287371e-05, + "loss": 0.9823, + "step": 142320 + }, + { + "epoch": 0.9093058022309393, + "grad_norm": 1.771092176437378, + "learning_rate": 5.712635223976738e-05, + "loss": 1.0425, + "step": 142330 + }, + { + "epoch": 0.909369689380678, + "grad_norm": 1.0553698539733887, + "learning_rate": 5.712138574489251e-05, + "loss": 0.9531, + "step": 142340 + }, + { + "epoch": 0.9094335765304167, + "grad_norm": 0.4885224401950836, + "learning_rate": 5.711641917829913e-05, + "loss": 1.0004, + "step": 142350 + }, + { + "epoch": 0.9094974636801554, + "grad_norm": 0.8409444093704224, + "learning_rate": 5.7111452540037245e-05, + "loss": 0.6038, + "step": 142360 + }, + { + "epoch": 0.9095613508298941, + "grad_norm": 0.7915950417518616, + "learning_rate": 5.7106485830156885e-05, + "loss": 0.72, + "step": 142370 + }, + { + "epoch": 0.9096252379796328, + "grad_norm": 0.8521913290023804, + "learning_rate": 5.710151904870806e-05, + "loss": 0.9703, + "step": 142380 + }, + { + "epoch": 0.9096891251293715, + "grad_norm": 0.759997546672821, + "learning_rate": 5.7096552195740797e-05, + "loss": 0.8757, + "step": 142390 + }, + { + "epoch": 0.9097530122791102, + "grad_norm": 1.0440067052841187, + "learning_rate": 5.7091585271305113e-05, + "loss": 0.9393, + "step": 142400 + }, + { + "epoch": 0.9098168994288489, + "grad_norm": 0.5940192341804504, + "learning_rate": 5.7086618275451034e-05, + "loss": 0.6164, + "step": 142410 + }, + { + "epoch": 0.9098807865785876, + "grad_norm": 0.9643594622612, + "learning_rate": 5.708165120822857e-05, + "loss": 0.9354, + "step": 
142420 + }, + { + "epoch": 0.9099446737283263, + "grad_norm": 1.3111646175384521, + "learning_rate": 5.707668406968776e-05, + "loss": 0.8814, + "step": 142430 + }, + { + "epoch": 0.910008560878065, + "grad_norm": 0.7643352746963501, + "learning_rate": 5.7071716859878624e-05, + "loss": 0.7996, + "step": 142440 + }, + { + "epoch": 0.9100724480278037, + "grad_norm": 0.6600208878517151, + "learning_rate": 5.7066749578851163e-05, + "loss": 0.9294, + "step": 142450 + }, + { + "epoch": 0.9101363351775424, + "grad_norm": 0.8577162027359009, + "learning_rate": 5.706178222665543e-05, + "loss": 0.8142, + "step": 142460 + }, + { + "epoch": 0.910200222327281, + "grad_norm": 0.6898266077041626, + "learning_rate": 5.7056814803341454e-05, + "loss": 0.7175, + "step": 142470 + }, + { + "epoch": 0.9102641094770197, + "grad_norm": 0.6253551840782166, + "learning_rate": 5.705184730895924e-05, + "loss": 0.8652, + "step": 142480 + }, + { + "epoch": 0.9103279966267585, + "grad_norm": 1.0927127599716187, + "learning_rate": 5.704687974355881e-05, + "loss": 0.9197, + "step": 142490 + }, + { + "epoch": 0.9103918837764972, + "grad_norm": 1.2770670652389526, + "learning_rate": 5.7041912107190223e-05, + "loss": 0.7483, + "step": 142500 + }, + { + "epoch": 0.9104557709262359, + "grad_norm": 0.7076462507247925, + "learning_rate": 5.703694439990348e-05, + "loss": 0.9537, + "step": 142510 + }, + { + "epoch": 0.9105196580759746, + "grad_norm": 1.3433187007904053, + "learning_rate": 5.703197662174863e-05, + "loss": 0.8636, + "step": 142520 + }, + { + "epoch": 0.9105835452257133, + "grad_norm": 1.6730338335037231, + "learning_rate": 5.702700877277568e-05, + "loss": 0.9131, + "step": 142530 + }, + { + "epoch": 0.910647432375452, + "grad_norm": 1.130669355392456, + "learning_rate": 5.702204085303468e-05, + "loss": 0.9168, + "step": 142540 + }, + { + "epoch": 0.9107113195251907, + "grad_norm": 0.9546806216239929, + "learning_rate": 5.7017072862575626e-05, + "loss": 0.8348, + "step": 142550 + }, + { + 
"epoch": 0.9107752066749294, + "grad_norm": 0.6770340800285339, + "learning_rate": 5.701210480144861e-05, + "loss": 0.9684, + "step": 142560 + }, + { + "epoch": 0.9108390938246681, + "grad_norm": 1.0904873609542847, + "learning_rate": 5.700713666970361e-05, + "loss": 0.8426, + "step": 142570 + }, + { + "epoch": 0.9109029809744068, + "grad_norm": 0.8043915629386902, + "learning_rate": 5.7002168467390694e-05, + "loss": 1.1309, + "step": 142580 + }, + { + "epoch": 0.9109668681241455, + "grad_norm": 0.7632177472114563, + "learning_rate": 5.699720019455989e-05, + "loss": 0.8359, + "step": 142590 + }, + { + "epoch": 0.9110307552738842, + "grad_norm": 0.8584951758384705, + "learning_rate": 5.699223185126121e-05, + "loss": 0.8166, + "step": 142600 + }, + { + "epoch": 0.9110946424236229, + "grad_norm": 0.704011082649231, + "learning_rate": 5.698726343754472e-05, + "loss": 0.7772, + "step": 142610 + }, + { + "epoch": 0.9111585295733616, + "grad_norm": 0.954410970211029, + "learning_rate": 5.698229495346044e-05, + "loss": 0.9824, + "step": 142620 + }, + { + "epoch": 0.9112224167231003, + "grad_norm": 1.6050761938095093, + "learning_rate": 5.697732639905841e-05, + "loss": 0.986, + "step": 142630 + }, + { + "epoch": 0.911286303872839, + "grad_norm": 0.8036152720451355, + "learning_rate": 5.697235777438866e-05, + "loss": 1.1253, + "step": 142640 + }, + { + "epoch": 0.9113501910225777, + "grad_norm": 1.0591254234313965, + "learning_rate": 5.6967389079501234e-05, + "loss": 0.7433, + "step": 142650 + }, + { + "epoch": 0.9114140781723165, + "grad_norm": 0.63569176197052, + "learning_rate": 5.6962420314446186e-05, + "loss": 0.7921, + "step": 142660 + }, + { + "epoch": 0.9114779653220552, + "grad_norm": 0.6590409278869629, + "learning_rate": 5.6957451479273526e-05, + "loss": 0.7715, + "step": 142670 + }, + { + "epoch": 0.9115418524717939, + "grad_norm": 0.8721647262573242, + "learning_rate": 5.695248257403332e-05, + "loss": 0.9248, + "step": 142680 + }, + { + "epoch": 
0.9116057396215326, + "grad_norm": 0.8576902151107788, + "learning_rate": 5.6947513598775605e-05, + "loss": 0.8463, + "step": 142690 + }, + { + "epoch": 0.9116696267712713, + "grad_norm": 1.4896668195724487, + "learning_rate": 5.69425445535504e-05, + "loss": 0.816, + "step": 142700 + }, + { + "epoch": 0.9117335139210099, + "grad_norm": 0.9425134062767029, + "learning_rate": 5.693757543840779e-05, + "loss": 0.9642, + "step": 142710 + }, + { + "epoch": 0.9117974010707486, + "grad_norm": 0.9548724889755249, + "learning_rate": 5.693260625339777e-05, + "loss": 0.6938, + "step": 142720 + }, + { + "epoch": 0.9118612882204873, + "grad_norm": 1.2961103916168213, + "learning_rate": 5.692763699857042e-05, + "loss": 0.993, + "step": 142730 + }, + { + "epoch": 0.911925175370226, + "grad_norm": 0.8667069673538208, + "learning_rate": 5.692266767397576e-05, + "loss": 0.8989, + "step": 142740 + }, + { + "epoch": 0.9119890625199647, + "grad_norm": 0.678893506526947, + "learning_rate": 5.691769827966386e-05, + "loss": 0.7032, + "step": 142750 + }, + { + "epoch": 0.9120529496697034, + "grad_norm": 0.5294567942619324, + "learning_rate": 5.6912728815684744e-05, + "loss": 0.9135, + "step": 142760 + }, + { + "epoch": 0.9121168368194421, + "grad_norm": 0.7864146828651428, + "learning_rate": 5.690775928208848e-05, + "loss": 0.7941, + "step": 142770 + }, + { + "epoch": 0.9121807239691808, + "grad_norm": 0.7031729817390442, + "learning_rate": 5.690278967892511e-05, + "loss": 0.8931, + "step": 142780 + }, + { + "epoch": 0.9122446111189195, + "grad_norm": 0.7443121671676636, + "learning_rate": 5.689782000624466e-05, + "loss": 0.8238, + "step": 142790 + }, + { + "epoch": 0.9123084982686582, + "grad_norm": 0.7489047050476074, + "learning_rate": 5.68928502640972e-05, + "loss": 0.731, + "step": 142800 + }, + { + "epoch": 0.9123723854183969, + "grad_norm": 1.1270229816436768, + "learning_rate": 5.688788045253277e-05, + "loss": 0.9834, + "step": 142810 + }, + { + "epoch": 0.9124362725681356, + 
"grad_norm": 0.6103757619857788, + "learning_rate": 5.688291057160143e-05, + "loss": 1.0271, + "step": 142820 + }, + { + "epoch": 0.9125001597178743, + "grad_norm": 0.7932960987091064, + "learning_rate": 5.687794062135322e-05, + "loss": 0.7008, + "step": 142830 + }, + { + "epoch": 0.912564046867613, + "grad_norm": 1.3297314643859863, + "learning_rate": 5.687297060183821e-05, + "loss": 0.9021, + "step": 142840 + }, + { + "epoch": 0.9126279340173518, + "grad_norm": 0.9981955289840698, + "learning_rate": 5.6868000513106435e-05, + "loss": 0.9884, + "step": 142850 + }, + { + "epoch": 0.9126918211670905, + "grad_norm": 1.4859464168548584, + "learning_rate": 5.6863030355207945e-05, + "loss": 1.0916, + "step": 142860 + }, + { + "epoch": 0.9127557083168292, + "grad_norm": 1.1081634759902954, + "learning_rate": 5.685806012819281e-05, + "loss": 0.7292, + "step": 142870 + }, + { + "epoch": 0.9128195954665679, + "grad_norm": 0.9023155570030212, + "learning_rate": 5.6853089832111076e-05, + "loss": 0.8272, + "step": 142880 + }, + { + "epoch": 0.9128834826163066, + "grad_norm": 0.712682843208313, + "learning_rate": 5.6848119467012795e-05, + "loss": 1.0256, + "step": 142890 + }, + { + "epoch": 0.9129473697660453, + "grad_norm": 1.0017153024673462, + "learning_rate": 5.684314903294803e-05, + "loss": 0.9368, + "step": 142900 + }, + { + "epoch": 0.913011256915784, + "grad_norm": 1.385031819343567, + "learning_rate": 5.6838178529966825e-05, + "loss": 0.8932, + "step": 142910 + }, + { + "epoch": 0.9130751440655227, + "grad_norm": 0.9716565608978271, + "learning_rate": 5.683320795811925e-05, + "loss": 1.127, + "step": 142920 + }, + { + "epoch": 0.9131390312152614, + "grad_norm": 2.391300916671753, + "learning_rate": 5.6828237317455365e-05, + "loss": 1.1424, + "step": 142930 + }, + { + "epoch": 0.9132029183650001, + "grad_norm": 2.1368186473846436, + "learning_rate": 5.682326660802523e-05, + "loss": 1.1169, + "step": 142940 + }, + { + "epoch": 0.9132668055147387, + "grad_norm": 
1.1230798959732056, + "learning_rate": 5.6818295829878874e-05, + "loss": 0.791, + "step": 142950 + }, + { + "epoch": 0.9133306926644774, + "grad_norm": 0.8756389617919922, + "learning_rate": 5.6813324983066404e-05, + "loss": 0.7168, + "step": 142960 + }, + { + "epoch": 0.9133945798142161, + "grad_norm": 4.043021202087402, + "learning_rate": 5.680835406763785e-05, + "loss": 0.8593, + "step": 142970 + }, + { + "epoch": 0.9134584669639548, + "grad_norm": 0.9189384579658508, + "learning_rate": 5.680338308364328e-05, + "loss": 0.7222, + "step": 142980 + }, + { + "epoch": 0.9135223541136935, + "grad_norm": 0.9267463088035583, + "learning_rate": 5.679841203113275e-05, + "loss": 0.9252, + "step": 142990 + }, + { + "epoch": 0.9135862412634322, + "grad_norm": 0.9476075768470764, + "learning_rate": 5.6793440910156336e-05, + "loss": 0.8154, + "step": 143000 + }, + { + "epoch": 0.913650128413171, + "grad_norm": 0.7349272966384888, + "learning_rate": 5.67884697207641e-05, + "loss": 1.0062, + "step": 143010 + }, + { + "epoch": 0.9137140155629097, + "grad_norm": 0.6401690244674683, + "learning_rate": 5.67834984630061e-05, + "loss": 0.9801, + "step": 143020 + }, + { + "epoch": 0.9137779027126484, + "grad_norm": 0.9111157655715942, + "learning_rate": 5.677852713693239e-05, + "loss": 0.8259, + "step": 143030 + }, + { + "epoch": 0.9138417898623871, + "grad_norm": 0.5553478598594666, + "learning_rate": 5.6773555742593065e-05, + "loss": 0.8509, + "step": 143040 + }, + { + "epoch": 0.9139056770121258, + "grad_norm": 0.6528088450431824, + "learning_rate": 5.676858428003815e-05, + "loss": 0.6361, + "step": 143050 + }, + { + "epoch": 0.9139695641618645, + "grad_norm": 0.7725486755371094, + "learning_rate": 5.676361274931775e-05, + "loss": 0.9128, + "step": 143060 + }, + { + "epoch": 0.9140334513116032, + "grad_norm": 0.8535653948783875, + "learning_rate": 5.67586411504819e-05, + "loss": 0.9194, + "step": 143070 + }, + { + "epoch": 0.9140973384613419, + "grad_norm": 0.4683364927768707, + 
"learning_rate": 5.675366948358072e-05, + "loss": 0.8434, + "step": 143080 + }, + { + "epoch": 0.9141612256110806, + "grad_norm": 1.8194103240966797, + "learning_rate": 5.6748697748664225e-05, + "loss": 1.1056, + "step": 143090 + }, + { + "epoch": 0.9142251127608193, + "grad_norm": 0.8043122887611389, + "learning_rate": 5.674372594578251e-05, + "loss": 0.7171, + "step": 143100 + }, + { + "epoch": 0.914288999910558, + "grad_norm": 0.7784401774406433, + "learning_rate": 5.673875407498563e-05, + "loss": 0.8041, + "step": 143110 + }, + { + "epoch": 0.9143528870602967, + "grad_norm": 1.7115349769592285, + "learning_rate": 5.673378213632368e-05, + "loss": 0.863, + "step": 143120 + }, + { + "epoch": 0.9144167742100354, + "grad_norm": 0.8383505940437317, + "learning_rate": 5.672881012984672e-05, + "loss": 0.8503, + "step": 143130 + }, + { + "epoch": 0.9144806613597741, + "grad_norm": 0.8597167134284973, + "learning_rate": 5.672383805560482e-05, + "loss": 0.9283, + "step": 143140 + }, + { + "epoch": 0.9145445485095128, + "grad_norm": 1.1670582294464111, + "learning_rate": 5.6718865913648044e-05, + "loss": 0.9148, + "step": 143150 + }, + { + "epoch": 0.9146084356592515, + "grad_norm": 0.7308951616287231, + "learning_rate": 5.671389370402648e-05, + "loss": 0.8466, + "step": 143160 + }, + { + "epoch": 0.9146723228089902, + "grad_norm": 0.7195169925689697, + "learning_rate": 5.6708921426790194e-05, + "loss": 0.9054, + "step": 143170 + }, + { + "epoch": 0.914736209958729, + "grad_norm": 0.6677785515785217, + "learning_rate": 5.670394908198927e-05, + "loss": 0.7878, + "step": 143180 + }, + { + "epoch": 0.9148000971084677, + "grad_norm": 1.0069258213043213, + "learning_rate": 5.669897666967378e-05, + "loss": 0.8408, + "step": 143190 + }, + { + "epoch": 0.9148639842582063, + "grad_norm": 0.9950356483459473, + "learning_rate": 5.66940041898938e-05, + "loss": 0.9091, + "step": 143200 + }, + { + "epoch": 0.914927871407945, + "grad_norm": 0.8431347012519836, + "learning_rate": 
5.6689031642699405e-05, + "loss": 0.7076, + "step": 143210 + }, + { + "epoch": 0.9149917585576837, + "grad_norm": 0.8251869678497314, + "learning_rate": 5.668405902814067e-05, + "loss": 0.9349, + "step": 143220 + }, + { + "epoch": 0.9150556457074224, + "grad_norm": 1.2934318780899048, + "learning_rate": 5.6679086346267685e-05, + "loss": 0.913, + "step": 143230 + }, + { + "epoch": 0.9151195328571611, + "grad_norm": 0.7144001722335815, + "learning_rate": 5.6674113597130515e-05, + "loss": 0.9816, + "step": 143240 + }, + { + "epoch": 0.9151834200068998, + "grad_norm": 0.8660211563110352, + "learning_rate": 5.666914078077926e-05, + "loss": 0.9166, + "step": 143250 + }, + { + "epoch": 0.9152473071566385, + "grad_norm": 0.8381525874137878, + "learning_rate": 5.6664167897263975e-05, + "loss": 0.8673, + "step": 143260 + }, + { + "epoch": 0.9153111943063772, + "grad_norm": 1.0956501960754395, + "learning_rate": 5.6659194946634764e-05, + "loss": 0.8773, + "step": 143270 + }, + { + "epoch": 0.9153750814561159, + "grad_norm": 0.6203900575637817, + "learning_rate": 5.6654221928941685e-05, + "loss": 0.9247, + "step": 143280 + }, + { + "epoch": 0.9154389686058546, + "grad_norm": 0.8943018317222595, + "learning_rate": 5.664924884423485e-05, + "loss": 0.8628, + "step": 143290 + }, + { + "epoch": 0.9155028557555933, + "grad_norm": 1.1351211071014404, + "learning_rate": 5.664427569256432e-05, + "loss": 0.7142, + "step": 143300 + }, + { + "epoch": 0.915566742905332, + "grad_norm": 0.7743847370147705, + "learning_rate": 5.663930247398018e-05, + "loss": 0.9053, + "step": 143310 + }, + { + "epoch": 0.9156306300550707, + "grad_norm": 0.7925605177879333, + "learning_rate": 5.663432918853253e-05, + "loss": 0.7516, + "step": 143320 + }, + { + "epoch": 0.9156945172048094, + "grad_norm": 0.9708541631698608, + "learning_rate": 5.6629355836271435e-05, + "loss": 1.1418, + "step": 143330 + }, + { + "epoch": 0.9157584043545481, + "grad_norm": 1.0878440141677856, + "learning_rate": 
5.6624382417247004e-05, + "loss": 0.8489, + "step": 143340 + }, + { + "epoch": 0.9158222915042868, + "grad_norm": 0.7541216015815735, + "learning_rate": 5.66194089315093e-05, + "loss": 1.2666, + "step": 143350 + }, + { + "epoch": 0.9158861786540256, + "grad_norm": 0.780955970287323, + "learning_rate": 5.6614435379108434e-05, + "loss": 0.6196, + "step": 143360 + }, + { + "epoch": 0.9159500658037643, + "grad_norm": 0.8869837522506714, + "learning_rate": 5.6609461760094476e-05, + "loss": 0.7971, + "step": 143370 + }, + { + "epoch": 0.916013952953503, + "grad_norm": 0.9061605930328369, + "learning_rate": 5.660448807451752e-05, + "loss": 1.1207, + "step": 143380 + }, + { + "epoch": 0.9160778401032417, + "grad_norm": 0.8924505710601807, + "learning_rate": 5.659951432242765e-05, + "loss": 0.8292, + "step": 143390 + }, + { + "epoch": 0.9161417272529804, + "grad_norm": 1.1079713106155396, + "learning_rate": 5.659454050387496e-05, + "loss": 0.7443, + "step": 143400 + }, + { + "epoch": 0.9162056144027191, + "grad_norm": 1.7120331525802612, + "learning_rate": 5.658956661890955e-05, + "loss": 0.9726, + "step": 143410 + }, + { + "epoch": 0.9162695015524578, + "grad_norm": 0.6569525003433228, + "learning_rate": 5.6584592667581494e-05, + "loss": 1.1294, + "step": 143420 + }, + { + "epoch": 0.9163333887021965, + "grad_norm": 1.015156626701355, + "learning_rate": 5.65796186499409e-05, + "loss": 0.9498, + "step": 143430 + }, + { + "epoch": 0.9163972758519351, + "grad_norm": 0.7955893874168396, + "learning_rate": 5.657464456603785e-05, + "loss": 0.7562, + "step": 143440 + }, + { + "epoch": 0.9164611630016738, + "grad_norm": 0.87772136926651, + "learning_rate": 5.6569670415922436e-05, + "loss": 0.8287, + "step": 143450 + }, + { + "epoch": 0.9165250501514125, + "grad_norm": 0.7367571592330933, + "learning_rate": 5.656469619964477e-05, + "loss": 0.9409, + "step": 143460 + }, + { + "epoch": 0.9165889373011512, + "grad_norm": 0.940991997718811, + "learning_rate": 5.6559721917254924e-05, + 
"loss": 1.0162, + "step": 143470 + }, + { + "epoch": 0.9166528244508899, + "grad_norm": 0.8817263841629028, + "learning_rate": 5.655474756880301e-05, + "loss": 0.8085, + "step": 143480 + }, + { + "epoch": 0.9167167116006286, + "grad_norm": 0.982813835144043, + "learning_rate": 5.654977315433914e-05, + "loss": 0.9593, + "step": 143490 + }, + { + "epoch": 0.9167805987503673, + "grad_norm": 1.1913138628005981, + "learning_rate": 5.6544798673913354e-05, + "loss": 0.8467, + "step": 143500 + }, + { + "epoch": 0.916844485900106, + "grad_norm": 0.7284004092216492, + "learning_rate": 5.653982412757579e-05, + "loss": 0.8006, + "step": 143510 + }, + { + "epoch": 0.9169083730498447, + "grad_norm": 1.0794998407363892, + "learning_rate": 5.653484951537655e-05, + "loss": 0.794, + "step": 143520 + }, + { + "epoch": 0.9169722601995834, + "grad_norm": 1.2948535680770874, + "learning_rate": 5.652987483736572e-05, + "loss": 0.7273, + "step": 143530 + }, + { + "epoch": 0.9170361473493222, + "grad_norm": 0.7950011491775513, + "learning_rate": 5.652490009359339e-05, + "loss": 0.9643, + "step": 143540 + }, + { + "epoch": 0.9171000344990609, + "grad_norm": 0.5536199808120728, + "learning_rate": 5.651992528410967e-05, + "loss": 0.8933, + "step": 143550 + }, + { + "epoch": 0.9171639216487996, + "grad_norm": 0.880750298500061, + "learning_rate": 5.6514950408964685e-05, + "loss": 1.0077, + "step": 143560 + }, + { + "epoch": 0.9172278087985383, + "grad_norm": 1.2838913202285767, + "learning_rate": 5.6509975468208484e-05, + "loss": 0.7484, + "step": 143570 + }, + { + "epoch": 0.917291695948277, + "grad_norm": 0.8207671046257019, + "learning_rate": 5.650500046189122e-05, + "loss": 0.8739, + "step": 143580 + }, + { + "epoch": 0.9173555830980157, + "grad_norm": 0.9268253445625305, + "learning_rate": 5.650002539006296e-05, + "loss": 1.0368, + "step": 143590 + }, + { + "epoch": 0.9174194702477544, + "grad_norm": 0.7340649962425232, + "learning_rate": 5.649505025277382e-05, + "loss": 0.9109, + "step": 
143600 + }, + { + "epoch": 0.9174833573974931, + "grad_norm": 1.0350323915481567, + "learning_rate": 5.649007505007391e-05, + "loss": 0.9756, + "step": 143610 + }, + { + "epoch": 0.9175472445472318, + "grad_norm": 0.784234344959259, + "learning_rate": 5.6485099782013326e-05, + "loss": 0.7373, + "step": 143620 + }, + { + "epoch": 0.9176111316969705, + "grad_norm": 0.8511738181114197, + "learning_rate": 5.648012444864219e-05, + "loss": 0.7554, + "step": 143630 + }, + { + "epoch": 0.9176750188467092, + "grad_norm": 0.9637295007705688, + "learning_rate": 5.647514905001059e-05, + "loss": 0.6863, + "step": 143640 + }, + { + "epoch": 0.9177389059964479, + "grad_norm": 0.6820176243782043, + "learning_rate": 5.6470173586168625e-05, + "loss": 0.974, + "step": 143650 + }, + { + "epoch": 0.9178027931461866, + "grad_norm": 0.7366352081298828, + "learning_rate": 5.646519805716643e-05, + "loss": 0.6999, + "step": 143660 + }, + { + "epoch": 0.9178666802959253, + "grad_norm": 0.7290107607841492, + "learning_rate": 5.646022246305409e-05, + "loss": 1.0206, + "step": 143670 + }, + { + "epoch": 0.9179305674456639, + "grad_norm": 1.0230958461761475, + "learning_rate": 5.645524680388172e-05, + "loss": 0.8786, + "step": 143680 + }, + { + "epoch": 0.9179944545954026, + "grad_norm": 1.1091904640197754, + "learning_rate": 5.645027107969942e-05, + "loss": 0.8807, + "step": 143690 + }, + { + "epoch": 0.9180583417451413, + "grad_norm": 1.9306013584136963, + "learning_rate": 5.644529529055733e-05, + "loss": 1.2533, + "step": 143700 + }, + { + "epoch": 0.91812222889488, + "grad_norm": 1.0572997331619263, + "learning_rate": 5.644031943650553e-05, + "loss": 0.7035, + "step": 143710 + }, + { + "epoch": 0.9181861160446187, + "grad_norm": 0.9027218818664551, + "learning_rate": 5.643534351759414e-05, + "loss": 0.8836, + "step": 143720 + }, + { + "epoch": 0.9182500031943575, + "grad_norm": 0.7371615767478943, + "learning_rate": 5.643036753387328e-05, + "loss": 0.6614, + "step": 143730 + }, + { + 
"epoch": 0.9183138903440962, + "grad_norm": 1.505537986755371, + "learning_rate": 5.642539148539306e-05, + "loss": 0.8683, + "step": 143740 + }, + { + "epoch": 0.9183777774938349, + "grad_norm": 0.8361949920654297, + "learning_rate": 5.64204153722036e-05, + "loss": 0.9926, + "step": 143750 + }, + { + "epoch": 0.9184416646435736, + "grad_norm": 1.110982894897461, + "learning_rate": 5.641543919435496e-05, + "loss": 0.8015, + "step": 143760 + }, + { + "epoch": 0.9185055517933123, + "grad_norm": 1.32961106300354, + "learning_rate": 5.641046295189733e-05, + "loss": 0.8, + "step": 143770 + }, + { + "epoch": 0.918569438943051, + "grad_norm": 0.8517293334007263, + "learning_rate": 5.640548664488078e-05, + "loss": 0.6818, + "step": 143780 + }, + { + "epoch": 0.9186333260927897, + "grad_norm": 0.8187686204910278, + "learning_rate": 5.6400510273355446e-05, + "loss": 0.793, + "step": 143790 + }, + { + "epoch": 0.9186972132425284, + "grad_norm": 2.4176223278045654, + "learning_rate": 5.639553383737143e-05, + "loss": 0.9948, + "step": 143800 + }, + { + "epoch": 0.9187611003922671, + "grad_norm": 0.7014775276184082, + "learning_rate": 5.6390557336978855e-05, + "loss": 1.1245, + "step": 143810 + }, + { + "epoch": 0.9188249875420058, + "grad_norm": 1.063392996788025, + "learning_rate": 5.638558077222784e-05, + "loss": 1.0894, + "step": 143820 + }, + { + "epoch": 0.9188888746917445, + "grad_norm": 0.49587613344192505, + "learning_rate": 5.63806041431685e-05, + "loss": 0.6732, + "step": 143830 + }, + { + "epoch": 0.9189527618414832, + "grad_norm": 0.848645031452179, + "learning_rate": 5.637562744985097e-05, + "loss": 0.9066, + "step": 143840 + }, + { + "epoch": 0.9190166489912219, + "grad_norm": 0.8816063404083252, + "learning_rate": 5.637065069232534e-05, + "loss": 0.9941, + "step": 143850 + }, + { + "epoch": 0.9190805361409606, + "grad_norm": 1.5030635595321655, + "learning_rate": 5.6365673870641755e-05, + "loss": 0.7683, + "step": 143860 + }, + { + "epoch": 0.9191444232906993, + 
"grad_norm": 1.574723482131958, + "learning_rate": 5.6360696984850324e-05, + "loss": 1.0703, + "step": 143870 + }, + { + "epoch": 0.919208310440438, + "grad_norm": 1.3411686420440674, + "learning_rate": 5.635572003500117e-05, + "loss": 0.9338, + "step": 143880 + }, + { + "epoch": 0.9192721975901768, + "grad_norm": 1.3527319431304932, + "learning_rate": 5.6350743021144416e-05, + "loss": 1.2366, + "step": 143890 + }, + { + "epoch": 0.9193360847399155, + "grad_norm": 0.8805691599845886, + "learning_rate": 5.634576594333019e-05, + "loss": 0.8299, + "step": 143900 + }, + { + "epoch": 0.9193999718896542, + "grad_norm": 0.6124188899993896, + "learning_rate": 5.634078880160861e-05, + "loss": 0.8055, + "step": 143910 + }, + { + "epoch": 0.9194638590393928, + "grad_norm": 1.0732533931732178, + "learning_rate": 5.63358115960298e-05, + "loss": 1.1149, + "step": 143920 + }, + { + "epoch": 0.9195277461891315, + "grad_norm": 0.5931747555732727, + "learning_rate": 5.633083432664389e-05, + "loss": 1.1673, + "step": 143930 + }, + { + "epoch": 0.9195916333388702, + "grad_norm": 1.0256335735321045, + "learning_rate": 5.632585699350099e-05, + "loss": 0.8732, + "step": 143940 + }, + { + "epoch": 0.9196555204886089, + "grad_norm": 0.9125732779502869, + "learning_rate": 5.632087959665124e-05, + "loss": 0.756, + "step": 143950 + }, + { + "epoch": 0.9197194076383476, + "grad_norm": 1.4520151615142822, + "learning_rate": 5.6315902136144784e-05, + "loss": 0.906, + "step": 143960 + }, + { + "epoch": 0.9197832947880863, + "grad_norm": 0.8130024671554565, + "learning_rate": 5.63109246120317e-05, + "loss": 0.7474, + "step": 143970 + }, + { + "epoch": 0.919847181937825, + "grad_norm": 0.40209460258483887, + "learning_rate": 5.630594702436217e-05, + "loss": 0.9289, + "step": 143980 + }, + { + "epoch": 0.9199110690875637, + "grad_norm": 0.8355494737625122, + "learning_rate": 5.630096937318629e-05, + "loss": 0.8839, + "step": 143990 + }, + { + "epoch": 0.9199749562373024, + "grad_norm": 
1.9247257709503174, + "learning_rate": 5.629599165855419e-05, + "loss": 0.9603, + "step": 144000 + }, + { + "epoch": 0.9200388433870411, + "grad_norm": 0.7350730299949646, + "learning_rate": 5.629101388051602e-05, + "loss": 1.0115, + "step": 144010 + }, + { + "epoch": 0.9201027305367798, + "grad_norm": 0.9462405443191528, + "learning_rate": 5.628603603912188e-05, + "loss": 1.123, + "step": 144020 + }, + { + "epoch": 0.9201666176865185, + "grad_norm": 0.7234540581703186, + "learning_rate": 5.628105813442194e-05, + "loss": 0.8485, + "step": 144030 + }, + { + "epoch": 0.9202305048362572, + "grad_norm": 1.3080463409423828, + "learning_rate": 5.6276080166466294e-05, + "loss": 1.0137, + "step": 144040 + }, + { + "epoch": 0.9202943919859959, + "grad_norm": 1.0539608001708984, + "learning_rate": 5.62711021353051e-05, + "loss": 0.6729, + "step": 144050 + }, + { + "epoch": 0.9203582791357346, + "grad_norm": 0.9095763564109802, + "learning_rate": 5.626612404098848e-05, + "loss": 0.6638, + "step": 144060 + }, + { + "epoch": 0.9204221662854734, + "grad_norm": 1.001478672027588, + "learning_rate": 5.626114588356657e-05, + "loss": 0.9633, + "step": 144070 + }, + { + "epoch": 0.9204860534352121, + "grad_norm": 0.8986124396324158, + "learning_rate": 5.62561676630895e-05, + "loss": 0.8985, + "step": 144080 + }, + { + "epoch": 0.9205499405849508, + "grad_norm": 0.4983496367931366, + "learning_rate": 5.6251189379607415e-05, + "loss": 0.8668, + "step": 144090 + }, + { + "epoch": 0.9206138277346895, + "grad_norm": 1.6429502964019775, + "learning_rate": 5.6246211033170434e-05, + "loss": 0.9987, + "step": 144100 + }, + { + "epoch": 0.9206777148844282, + "grad_norm": 1.0039671659469604, + "learning_rate": 5.624123262382872e-05, + "loss": 0.8202, + "step": 144110 + }, + { + "epoch": 0.9207416020341669, + "grad_norm": 1.0661791563034058, + "learning_rate": 5.6236254151632385e-05, + "loss": 0.7685, + "step": 144120 + }, + { + "epoch": 0.9208054891839056, + "grad_norm": 0.973341166973114, + 
"learning_rate": 5.623177347295643e-05, + "loss": 0.8032, + "step": 144130 + }, + { + "epoch": 0.9208693763336443, + "grad_norm": 1.365503191947937, + "learning_rate": 5.6226794881474464e-05, + "loss": 0.7741, + "step": 144140 + }, + { + "epoch": 0.920933263483383, + "grad_norm": 0.7111403942108154, + "learning_rate": 5.622181622728329e-05, + "loss": 0.7476, + "step": 144150 + }, + { + "epoch": 0.9209971506331217, + "grad_norm": 0.641816258430481, + "learning_rate": 5.621683751043304e-05, + "loss": 0.9546, + "step": 144160 + }, + { + "epoch": 0.9210610377828603, + "grad_norm": 0.6329175233840942, + "learning_rate": 5.6211858730973856e-05, + "loss": 0.9538, + "step": 144170 + }, + { + "epoch": 0.921124924932599, + "grad_norm": 1.5208319425582886, + "learning_rate": 5.620687988895589e-05, + "loss": 0.8583, + "step": 144180 + }, + { + "epoch": 0.9211888120823377, + "grad_norm": 1.1061300039291382, + "learning_rate": 5.6201900984429255e-05, + "loss": 0.8138, + "step": 144190 + }, + { + "epoch": 0.9212526992320764, + "grad_norm": 0.8774361610412598, + "learning_rate": 5.619692201744413e-05, + "loss": 0.7494, + "step": 144200 + }, + { + "epoch": 0.9213165863818151, + "grad_norm": 1.0457643270492554, + "learning_rate": 5.6191942988050626e-05, + "loss": 0.8806, + "step": 144210 + }, + { + "epoch": 0.9213804735315538, + "grad_norm": 0.7023540139198303, + "learning_rate": 5.618696389629892e-05, + "loss": 0.739, + "step": 144220 + }, + { + "epoch": 0.9214443606812925, + "grad_norm": 0.671416699886322, + "learning_rate": 5.6181984742239117e-05, + "loss": 0.6146, + "step": 144230 + }, + { + "epoch": 0.9215082478310312, + "grad_norm": 0.7087250351905823, + "learning_rate": 5.6177005525921376e-05, + "loss": 1.0222, + "step": 144240 + }, + { + "epoch": 0.92157213498077, + "grad_norm": 1.0113321542739868, + "learning_rate": 5.617202624739585e-05, + "loss": 0.7315, + "step": 144250 + }, + { + "epoch": 0.9216360221305087, + "grad_norm": 1.1815264225006104, + "learning_rate": 
5.616704690671267e-05, + "loss": 0.8089, + "step": 144260 + }, + { + "epoch": 0.9216999092802474, + "grad_norm": 0.7599946856498718, + "learning_rate": 5.616206750392201e-05, + "loss": 0.7554, + "step": 144270 + }, + { + "epoch": 0.9217637964299861, + "grad_norm": 1.0004082918167114, + "learning_rate": 5.6157088039074e-05, + "loss": 0.8352, + "step": 144280 + }, + { + "epoch": 0.9218276835797248, + "grad_norm": 1.208046317100525, + "learning_rate": 5.615210851221878e-05, + "loss": 0.9545, + "step": 144290 + }, + { + "epoch": 0.9218915707294635, + "grad_norm": 1.0685365200042725, + "learning_rate": 5.61471289234065e-05, + "loss": 1.0576, + "step": 144300 + }, + { + "epoch": 0.9219554578792022, + "grad_norm": 1.0764927864074707, + "learning_rate": 5.614214927268733e-05, + "loss": 0.946, + "step": 144310 + }, + { + "epoch": 0.9220193450289409, + "grad_norm": 1.0587629079818726, + "learning_rate": 5.613716956011139e-05, + "loss": 1.0278, + "step": 144320 + }, + { + "epoch": 0.9220832321786796, + "grad_norm": 1.0095889568328857, + "learning_rate": 5.613218978572884e-05, + "loss": 1.0609, + "step": 144330 + }, + { + "epoch": 0.9221471193284183, + "grad_norm": 0.8525044322013855, + "learning_rate": 5.6127209949589845e-05, + "loss": 1.2406, + "step": 144340 + }, + { + "epoch": 0.922211006478157, + "grad_norm": 0.7696043252944946, + "learning_rate": 5.612223005174454e-05, + "loss": 0.8016, + "step": 144350 + }, + { + "epoch": 0.9222748936278957, + "grad_norm": 0.8853589296340942, + "learning_rate": 5.6117250092243076e-05, + "loss": 0.929, + "step": 144360 + }, + { + "epoch": 0.9223387807776344, + "grad_norm": 1.2267171144485474, + "learning_rate": 5.611227007113563e-05, + "loss": 0.7504, + "step": 144370 + }, + { + "epoch": 0.9224026679273731, + "grad_norm": 0.9859304428100586, + "learning_rate": 5.6107289988472325e-05, + "loss": 0.9157, + "step": 144380 + }, + { + "epoch": 0.9224665550771118, + "grad_norm": 0.824668288230896, + "learning_rate": 5.6102309844303324e-05, + 
"loss": 1.1008, + "step": 144390 + }, + { + "epoch": 0.9225304422268505, + "grad_norm": 0.9470674991607666, + "learning_rate": 5.609732963867879e-05, + "loss": 0.852, + "step": 144400 + }, + { + "epoch": 0.9225943293765891, + "grad_norm": 0.7566165924072266, + "learning_rate": 5.609234937164886e-05, + "loss": 0.8084, + "step": 144410 + }, + { + "epoch": 0.9226582165263278, + "grad_norm": 0.963897168636322, + "learning_rate": 5.60873690432637e-05, + "loss": 1.0114, + "step": 144420 + }, + { + "epoch": 0.9227221036760666, + "grad_norm": 0.7856702208518982, + "learning_rate": 5.608238865357348e-05, + "loss": 0.9301, + "step": 144430 + }, + { + "epoch": 0.9227859908258053, + "grad_norm": 1.138763666152954, + "learning_rate": 5.6077408202628334e-05, + "loss": 0.7774, + "step": 144440 + }, + { + "epoch": 0.922849877975544, + "grad_norm": 1.0540574789047241, + "learning_rate": 5.607242769047843e-05, + "loss": 1.1135, + "step": 144450 + }, + { + "epoch": 0.9229137651252827, + "grad_norm": 0.8265545964241028, + "learning_rate": 5.606744711717393e-05, + "loss": 0.8917, + "step": 144460 + }, + { + "epoch": 0.9229776522750214, + "grad_norm": 0.5889626741409302, + "learning_rate": 5.6062466482765e-05, + "loss": 0.8798, + "step": 144470 + }, + { + "epoch": 0.9230415394247601, + "grad_norm": 0.7544617652893066, + "learning_rate": 5.6057485787301765e-05, + "loss": 0.9106, + "step": 144480 + }, + { + "epoch": 0.9231054265744988, + "grad_norm": 0.7426086664199829, + "learning_rate": 5.6052505030834425e-05, + "loss": 0.8295, + "step": 144490 + }, + { + "epoch": 0.9231693137242375, + "grad_norm": 0.8017638921737671, + "learning_rate": 5.6047524213413116e-05, + "loss": 0.7987, + "step": 144500 + }, + { + "epoch": 0.9232332008739762, + "grad_norm": 1.0374419689178467, + "learning_rate": 5.604254333508802e-05, + "loss": 0.885, + "step": 144510 + }, + { + "epoch": 0.9232970880237149, + "grad_norm": 1.0828471183776855, + "learning_rate": 5.603756239590926e-05, + "loss": 0.8636, + "step": 
144520 + }, + { + "epoch": 0.9233609751734536, + "grad_norm": 0.7223926186561584, + "learning_rate": 5.603258139592704e-05, + "loss": 0.8139, + "step": 144530 + }, + { + "epoch": 0.9234248623231923, + "grad_norm": 0.8874870538711548, + "learning_rate": 5.60276003351915e-05, + "loss": 0.7938, + "step": 144540 + }, + { + "epoch": 0.923488749472931, + "grad_norm": 0.7778692841529846, + "learning_rate": 5.6022619213752816e-05, + "loss": 0.7878, + "step": 144550 + }, + { + "epoch": 0.9235526366226697, + "grad_norm": 0.779015302658081, + "learning_rate": 5.6017638031661144e-05, + "loss": 1.0048, + "step": 144560 + }, + { + "epoch": 0.9236165237724084, + "grad_norm": 0.6786310076713562, + "learning_rate": 5.6012656788966656e-05, + "loss": 0.7291, + "step": 144570 + }, + { + "epoch": 0.9236804109221471, + "grad_norm": 0.8063673377037048, + "learning_rate": 5.6007675485719504e-05, + "loss": 1.0574, + "step": 144580 + }, + { + "epoch": 0.9237442980718858, + "grad_norm": 1.407467007637024, + "learning_rate": 5.600269412196986e-05, + "loss": 0.9079, + "step": 144590 + }, + { + "epoch": 0.9238081852216246, + "grad_norm": 0.8846006393432617, + "learning_rate": 5.59977126977679e-05, + "loss": 0.9646, + "step": 144600 + }, + { + "epoch": 0.9238720723713633, + "grad_norm": 0.824082612991333, + "learning_rate": 5.5992731213163785e-05, + "loss": 0.9403, + "step": 144610 + }, + { + "epoch": 0.923935959521102, + "grad_norm": 0.7869988083839417, + "learning_rate": 5.598774966820768e-05, + "loss": 0.9048, + "step": 144620 + }, + { + "epoch": 0.9239998466708407, + "grad_norm": 1.9946730136871338, + "learning_rate": 5.5982768062949755e-05, + "loss": 1.2423, + "step": 144630 + }, + { + "epoch": 0.9240637338205794, + "grad_norm": 0.8373878598213196, + "learning_rate": 5.597778639744018e-05, + "loss": 1.0149, + "step": 144640 + }, + { + "epoch": 0.924127620970318, + "grad_norm": 0.6496232151985168, + "learning_rate": 5.5972804671729116e-05, + "loss": 0.8685, + "step": 144650 + }, + { + 
"epoch": 0.9241915081200567, + "grad_norm": 0.8842912912368774, + "learning_rate": 5.596782288586676e-05, + "loss": 0.9352, + "step": 144660 + }, + { + "epoch": 0.9242553952697954, + "grad_norm": 0.9695751667022705, + "learning_rate": 5.596284103990326e-05, + "loss": 1.1568, + "step": 144670 + }, + { + "epoch": 0.9243192824195341, + "grad_norm": 1.1602424383163452, + "learning_rate": 5.595785913388878e-05, + "loss": 0.8338, + "step": 144680 + }, + { + "epoch": 0.9243831695692728, + "grad_norm": 0.8936864733695984, + "learning_rate": 5.595287716787351e-05, + "loss": 0.915, + "step": 144690 + }, + { + "epoch": 0.9244470567190115, + "grad_norm": 1.0186363458633423, + "learning_rate": 5.5947895141907624e-05, + "loss": 0.9493, + "step": 144700 + }, + { + "epoch": 0.9245109438687502, + "grad_norm": 0.8145350217819214, + "learning_rate": 5.594291305604128e-05, + "loss": 0.8706, + "step": 144710 + }, + { + "epoch": 0.9245748310184889, + "grad_norm": 1.0934886932373047, + "learning_rate": 5.5937930910324666e-05, + "loss": 0.8845, + "step": 144720 + }, + { + "epoch": 0.9246387181682276, + "grad_norm": 1.0088940858840942, + "learning_rate": 5.593294870480794e-05, + "loss": 1.0438, + "step": 144730 + }, + { + "epoch": 0.9247026053179663, + "grad_norm": 0.632199764251709, + "learning_rate": 5.5927966439541304e-05, + "loss": 0.7483, + "step": 144740 + }, + { + "epoch": 0.924766492467705, + "grad_norm": 1.0264745950698853, + "learning_rate": 5.5922984114574904e-05, + "loss": 1.069, + "step": 144750 + }, + { + "epoch": 0.9248303796174437, + "grad_norm": 0.7027975916862488, + "learning_rate": 5.591800172995894e-05, + "loss": 1.0355, + "step": 144760 + }, + { + "epoch": 0.9248942667671824, + "grad_norm": 1.1865230798721313, + "learning_rate": 5.591301928574355e-05, + "loss": 0.7578, + "step": 144770 + }, + { + "epoch": 0.9249581539169212, + "grad_norm": 0.8094105124473572, + "learning_rate": 5.5908036781978966e-05, + "loss": 0.9229, + "step": 144780 + }, + { + "epoch": 
0.9250220410666599, + "grad_norm": 1.438416600227356, + "learning_rate": 5.590305421871534e-05, + "loss": 1.0329, + "step": 144790 + }, + { + "epoch": 0.9250859282163986, + "grad_norm": 1.4989200830459595, + "learning_rate": 5.5898071596002855e-05, + "loss": 1.2769, + "step": 144800 + }, + { + "epoch": 0.9251498153661373, + "grad_norm": 1.0811078548431396, + "learning_rate": 5.589308891389168e-05, + "loss": 0.896, + "step": 144810 + }, + { + "epoch": 0.925213702515876, + "grad_norm": 0.8900967836380005, + "learning_rate": 5.5888106172431995e-05, + "loss": 0.8607, + "step": 144820 + }, + { + "epoch": 0.9252775896656147, + "grad_norm": 0.7876869440078735, + "learning_rate": 5.5883123371673995e-05, + "loss": 0.9373, + "step": 144830 + }, + { + "epoch": 0.9253414768153534, + "grad_norm": 1.5476784706115723, + "learning_rate": 5.5878140511667855e-05, + "loss": 0.9396, + "step": 144840 + }, + { + "epoch": 0.9254053639650921, + "grad_norm": 0.9162848591804504, + "learning_rate": 5.587315759246376e-05, + "loss": 0.9302, + "step": 144850 + }, + { + "epoch": 0.9254692511148308, + "grad_norm": 1.4244049787521362, + "learning_rate": 5.586817461411188e-05, + "loss": 0.8326, + "step": 144860 + }, + { + "epoch": 0.9255331382645695, + "grad_norm": 1.2231682538986206, + "learning_rate": 5.586319157666241e-05, + "loss": 0.6845, + "step": 144870 + }, + { + "epoch": 0.9255970254143082, + "grad_norm": 0.9394405484199524, + "learning_rate": 5.585820848016552e-05, + "loss": 0.6608, + "step": 144880 + }, + { + "epoch": 0.9256609125640469, + "grad_norm": 3.920628786087036, + "learning_rate": 5.585322532467141e-05, + "loss": 0.9313, + "step": 144890 + }, + { + "epoch": 0.9257247997137855, + "grad_norm": 0.7618585824966431, + "learning_rate": 5.5848242110230245e-05, + "loss": 0.8513, + "step": 144900 + }, + { + "epoch": 0.9257886868635242, + "grad_norm": 1.0813332796096802, + "learning_rate": 5.5843258836892234e-05, + "loss": 0.7353, + "step": 144910 + }, + { + "epoch": 0.9258525740132629, + 
"grad_norm": 1.2577931880950928, + "learning_rate": 5.583827550470755e-05, + "loss": 0.6659, + "step": 144920 + }, + { + "epoch": 0.9259164611630016, + "grad_norm": 1.2664730548858643, + "learning_rate": 5.583329211372637e-05, + "loss": 0.7771, + "step": 144930 + }, + { + "epoch": 0.9259803483127403, + "grad_norm": 0.9695154428482056, + "learning_rate": 5.582830866399888e-05, + "loss": 1.2447, + "step": 144940 + }, + { + "epoch": 0.926044235462479, + "grad_norm": 1.5105254650115967, + "learning_rate": 5.5823325155575314e-05, + "loss": 0.7378, + "step": 144950 + }, + { + "epoch": 0.9261081226122178, + "grad_norm": 0.6907379627227783, + "learning_rate": 5.5818341588505806e-05, + "loss": 1.249, + "step": 144960 + }, + { + "epoch": 0.9261720097619565, + "grad_norm": 0.8063596487045288, + "learning_rate": 5.581335796284057e-05, + "loss": 0.8475, + "step": 144970 + }, + { + "epoch": 0.9262358969116952, + "grad_norm": 1.5274707078933716, + "learning_rate": 5.5808374278629795e-05, + "loss": 1.2363, + "step": 144980 + }, + { + "epoch": 0.9262997840614339, + "grad_norm": 0.9278043508529663, + "learning_rate": 5.580339053592366e-05, + "loss": 0.9631, + "step": 144990 + }, + { + "epoch": 0.9263636712111726, + "grad_norm": 0.9493452310562134, + "learning_rate": 5.579840673477236e-05, + "loss": 0.8491, + "step": 145000 + }, + { + "epoch": 0.9264275583609113, + "grad_norm": 0.5410763621330261, + "learning_rate": 5.579342287522609e-05, + "loss": 0.9043, + "step": 145010 + }, + { + "epoch": 0.92649144551065, + "grad_norm": 0.8283954858779907, + "learning_rate": 5.578843895733504e-05, + "loss": 0.8748, + "step": 145020 + }, + { + "epoch": 0.9265553326603887, + "grad_norm": 0.9432761073112488, + "learning_rate": 5.57834549811494e-05, + "loss": 1.3108, + "step": 145030 + }, + { + "epoch": 0.9266192198101274, + "grad_norm": 0.6497068405151367, + "learning_rate": 5.5778470946719366e-05, + "loss": 0.8953, + "step": 145040 + }, + { + "epoch": 0.9266831069598661, + "grad_norm": 
0.8885396122932434, + "learning_rate": 5.5773486854095134e-05, + "loss": 0.7763, + "step": 145050 + }, + { + "epoch": 0.9267469941096048, + "grad_norm": 0.7947267293930054, + "learning_rate": 5.576850270332689e-05, + "loss": 0.6814, + "step": 145060 + }, + { + "epoch": 0.9268108812593435, + "grad_norm": 0.9257674813270569, + "learning_rate": 5.576351849446484e-05, + "loss": 0.8433, + "step": 145070 + }, + { + "epoch": 0.9268747684090822, + "grad_norm": 0.6727604866027832, + "learning_rate": 5.575853422755917e-05, + "loss": 0.8165, + "step": 145080 + }, + { + "epoch": 0.9269386555588209, + "grad_norm": 0.8554814457893372, + "learning_rate": 5.5753549902660076e-05, + "loss": 0.9877, + "step": 145090 + }, + { + "epoch": 0.9270025427085596, + "grad_norm": 1.5152359008789062, + "learning_rate": 5.574856551981775e-05, + "loss": 0.9839, + "step": 145100 + }, + { + "epoch": 0.9270664298582983, + "grad_norm": 1.0759176015853882, + "learning_rate": 5.5743581079082405e-05, + "loss": 0.6801, + "step": 145110 + }, + { + "epoch": 0.927130317008037, + "grad_norm": 1.2060096263885498, + "learning_rate": 5.573859658050423e-05, + "loss": 0.7048, + "step": 145120 + }, + { + "epoch": 0.9271942041577758, + "grad_norm": 2.348177671432495, + "learning_rate": 5.5733612024133416e-05, + "loss": 1.1837, + "step": 145130 + }, + { + "epoch": 0.9272580913075144, + "grad_norm": 0.8047994375228882, + "learning_rate": 5.572862741002017e-05, + "loss": 0.7672, + "step": 145140 + }, + { + "epoch": 0.9273219784572531, + "grad_norm": 1.255176305770874, + "learning_rate": 5.57236427382147e-05, + "loss": 0.7974, + "step": 145150 + }, + { + "epoch": 0.9273858656069918, + "grad_norm": 0.7443729639053345, + "learning_rate": 5.571865800876719e-05, + "loss": 1.0677, + "step": 145160 + }, + { + "epoch": 0.9274497527567305, + "grad_norm": 0.9563167691230774, + "learning_rate": 5.571367322172785e-05, + "loss": 0.7602, + "step": 145170 + }, + { + "epoch": 0.9275136399064692, + "grad_norm": 1.492719292640686, + 
"learning_rate": 5.5708688377146866e-05, + "loss": 0.8196, + "step": 145180 + }, + { + "epoch": 0.9275775270562079, + "grad_norm": 1.4552744626998901, + "learning_rate": 5.570370347507446e-05, + "loss": 0.7228, + "step": 145190 + }, + { + "epoch": 0.9276414142059466, + "grad_norm": 1.105858564376831, + "learning_rate": 5.569871851556082e-05, + "loss": 0.9919, + "step": 145200 + }, + { + "epoch": 0.9277053013556853, + "grad_norm": 0.6973342299461365, + "learning_rate": 5.5693733498656165e-05, + "loss": 0.6737, + "step": 145210 + }, + { + "epoch": 0.927769188505424, + "grad_norm": 0.8317189812660217, + "learning_rate": 5.5688748424410675e-05, + "loss": 0.7846, + "step": 145220 + }, + { + "epoch": 0.9278330756551627, + "grad_norm": 1.7505478858947754, + "learning_rate": 5.568376329287458e-05, + "loss": 0.8123, + "step": 145230 + }, + { + "epoch": 0.9278969628049014, + "grad_norm": 1.1510494947433472, + "learning_rate": 5.567877810409806e-05, + "loss": 0.9568, + "step": 145240 + }, + { + "epoch": 0.9279608499546401, + "grad_norm": 0.599578857421875, + "learning_rate": 5.567379285813135e-05, + "loss": 1.1179, + "step": 145250 + }, + { + "epoch": 0.9280247371043788, + "grad_norm": 0.6046319007873535, + "learning_rate": 5.566880755502462e-05, + "loss": 0.9872, + "step": 145260 + }, + { + "epoch": 0.9280886242541175, + "grad_norm": 0.9385390281677246, + "learning_rate": 5.5663822194828095e-05, + "loss": 0.6484, + "step": 145270 + }, + { + "epoch": 0.9281525114038562, + "grad_norm": 1.1661081314086914, + "learning_rate": 5.565883677759198e-05, + "loss": 0.7681, + "step": 145280 + }, + { + "epoch": 0.928216398553595, + "grad_norm": 0.9337068200111389, + "learning_rate": 5.565385130336649e-05, + "loss": 1.2074, + "step": 145290 + }, + { + "epoch": 0.9282802857033337, + "grad_norm": 0.7268047332763672, + "learning_rate": 5.564886577220181e-05, + "loss": 1.2321, + "step": 145300 + }, + { + "epoch": 0.9283441728530724, + "grad_norm": 0.8471580743789673, + "learning_rate": 
5.564388018414818e-05, + "loss": 0.7306, + "step": 145310 + }, + { + "epoch": 0.9284080600028111, + "grad_norm": 1.361351728439331, + "learning_rate": 5.563889453925579e-05, + "loss": 0.8582, + "step": 145320 + }, + { + "epoch": 0.9284719471525498, + "grad_norm": 0.8351898789405823, + "learning_rate": 5.563390883757485e-05, + "loss": 0.9428, + "step": 145330 + }, + { + "epoch": 0.9285358343022885, + "grad_norm": 0.5664870142936707, + "learning_rate": 5.562892307915559e-05, + "loss": 0.8726, + "step": 145340 + }, + { + "epoch": 0.9285997214520272, + "grad_norm": 0.9608287215232849, + "learning_rate": 5.56239372640482e-05, + "loss": 0.8297, + "step": 145350 + }, + { + "epoch": 0.9286636086017659, + "grad_norm": 0.5610537528991699, + "learning_rate": 5.5618951392302886e-05, + "loss": 1.07, + "step": 145360 + }, + { + "epoch": 0.9287274957515046, + "grad_norm": 1.0210973024368286, + "learning_rate": 5.561396546396988e-05, + "loss": 0.8373, + "step": 145370 + }, + { + "epoch": 0.9287913829012432, + "grad_norm": 0.8455613255500793, + "learning_rate": 5.560897947909938e-05, + "loss": 0.7514, + "step": 145380 + }, + { + "epoch": 0.9288552700509819, + "grad_norm": 0.7670028805732727, + "learning_rate": 5.56039934377416e-05, + "loss": 0.7576, + "step": 145390 + }, + { + "epoch": 0.9289191572007206, + "grad_norm": 0.9094594717025757, + "learning_rate": 5.559900733994676e-05, + "loss": 0.9407, + "step": 145400 + }, + { + "epoch": 0.9289830443504593, + "grad_norm": 0.8494235873222351, + "learning_rate": 5.559402118576508e-05, + "loss": 0.7876, + "step": 145410 + }, + { + "epoch": 0.929046931500198, + "grad_norm": 1.263748049736023, + "learning_rate": 5.558903497524676e-05, + "loss": 0.9246, + "step": 145420 + }, + { + "epoch": 0.9291108186499367, + "grad_norm": 0.7942246794700623, + "learning_rate": 5.558404870844201e-05, + "loss": 0.8941, + "step": 145430 + }, + { + "epoch": 0.9291747057996754, + "grad_norm": 1.4969481229782104, + "learning_rate": 5.557906238540108e-05, + 
"loss": 1.0584, + "step": 145440 + }, + { + "epoch": 0.9292385929494141, + "grad_norm": 1.607496738433838, + "learning_rate": 5.557407600617416e-05, + "loss": 1.0476, + "step": 145450 + }, + { + "epoch": 0.9293024800991528, + "grad_norm": 0.9495954513549805, + "learning_rate": 5.5569089570811464e-05, + "loss": 1.1759, + "step": 145460 + }, + { + "epoch": 0.9293663672488915, + "grad_norm": 1.1366685628890991, + "learning_rate": 5.556410307936322e-05, + "loss": 1.1819, + "step": 145470 + }, + { + "epoch": 0.9294302543986303, + "grad_norm": 1.110145926475525, + "learning_rate": 5.555911653187964e-05, + "loss": 0.8672, + "step": 145480 + }, + { + "epoch": 0.929494141548369, + "grad_norm": 0.8252993822097778, + "learning_rate": 5.5554129928410957e-05, + "loss": 0.7987, + "step": 145490 + }, + { + "epoch": 0.9295580286981077, + "grad_norm": 1.1678286790847778, + "learning_rate": 5.554914326900739e-05, + "loss": 0.8637, + "step": 145500 + }, + { + "epoch": 0.9296219158478464, + "grad_norm": 2.4226267337799072, + "learning_rate": 5.554415655371913e-05, + "loss": 1.0453, + "step": 145510 + }, + { + "epoch": 0.9296858029975851, + "grad_norm": 1.283092975616455, + "learning_rate": 5.553916978259642e-05, + "loss": 0.7881, + "step": 145520 + }, + { + "epoch": 0.9297496901473238, + "grad_norm": 1.1610386371612549, + "learning_rate": 5.55341829556895e-05, + "loss": 0.7663, + "step": 145530 + }, + { + "epoch": 0.9298135772970625, + "grad_norm": 1.2389222383499146, + "learning_rate": 5.552919607304854e-05, + "loss": 0.88, + "step": 145540 + }, + { + "epoch": 0.9298774644468012, + "grad_norm": 0.8321581482887268, + "learning_rate": 5.552420913472381e-05, + "loss": 0.7218, + "step": 145550 + }, + { + "epoch": 0.9299413515965399, + "grad_norm": 1.4073259830474854, + "learning_rate": 5.5519222140765514e-05, + "loss": 0.9816, + "step": 145560 + }, + { + "epoch": 0.9300052387462786, + "grad_norm": 0.8335140347480774, + "learning_rate": 5.5514235091223877e-05, + "loss": 1.096, + "step": 
145570 + }, + { + "epoch": 0.9300691258960173, + "grad_norm": 1.2483246326446533, + "learning_rate": 5.5509247986149126e-05, + "loss": 0.8831, + "step": 145580 + }, + { + "epoch": 0.930133013045756, + "grad_norm": 0.7681006193161011, + "learning_rate": 5.550426082559147e-05, + "loss": 0.7553, + "step": 145590 + }, + { + "epoch": 0.9301969001954947, + "grad_norm": 0.8035659193992615, + "learning_rate": 5.5499273609601154e-05, + "loss": 0.8508, + "step": 145600 + }, + { + "epoch": 0.9302607873452334, + "grad_norm": 1.2194147109985352, + "learning_rate": 5.5494286338228384e-05, + "loss": 0.9484, + "step": 145610 + }, + { + "epoch": 0.930324674494972, + "grad_norm": 1.6273916959762573, + "learning_rate": 5.54892990115234e-05, + "loss": 0.8937, + "step": 145620 + }, + { + "epoch": 0.9303885616447107, + "grad_norm": 0.48322996497154236, + "learning_rate": 5.5484311629536425e-05, + "loss": 0.6711, + "step": 145630 + }, + { + "epoch": 0.9304524487944494, + "grad_norm": 0.7854679822921753, + "learning_rate": 5.5479324192317694e-05, + "loss": 0.8923, + "step": 145640 + }, + { + "epoch": 0.9305163359441881, + "grad_norm": 0.9290236830711365, + "learning_rate": 5.547433669991743e-05, + "loss": 0.8368, + "step": 145650 + }, + { + "epoch": 0.9305802230939269, + "grad_norm": 1.1597179174423218, + "learning_rate": 5.546934915238585e-05, + "loss": 0.7749, + "step": 145660 + }, + { + "epoch": 0.9306441102436656, + "grad_norm": 1.3749240636825562, + "learning_rate": 5.54643615497732e-05, + "loss": 0.9082, + "step": 145670 + }, + { + "epoch": 0.9307079973934043, + "grad_norm": 0.7642197608947754, + "learning_rate": 5.54593738921297e-05, + "loss": 1.0149, + "step": 145680 + }, + { + "epoch": 0.930771884543143, + "grad_norm": 0.974104642868042, + "learning_rate": 5.545438617950558e-05, + "loss": 0.635, + "step": 145690 + }, + { + "epoch": 0.9308357716928817, + "grad_norm": 0.8045002818107605, + "learning_rate": 5.544939841195108e-05, + "loss": 0.7151, + "step": 145700 + }, + { + 
"epoch": 0.9308996588426204, + "grad_norm": 1.036468267440796, + "learning_rate": 5.544441058951641e-05, + "loss": 0.9897, + "step": 145710 + }, + { + "epoch": 0.9309635459923591, + "grad_norm": 0.5775062441825867, + "learning_rate": 5.5439422712251835e-05, + "loss": 0.7542, + "step": 145720 + }, + { + "epoch": 0.9310274331420978, + "grad_norm": 0.88588947057724, + "learning_rate": 5.543443478020754e-05, + "loss": 0.7627, + "step": 145730 + }, + { + "epoch": 0.9310913202918365, + "grad_norm": 1.2807823419570923, + "learning_rate": 5.5429446793433814e-05, + "loss": 1.1104, + "step": 145740 + }, + { + "epoch": 0.9311552074415752, + "grad_norm": 1.0863866806030273, + "learning_rate": 5.5424458751980844e-05, + "loss": 0.6739, + "step": 145750 + }, + { + "epoch": 0.9312190945913139, + "grad_norm": 1.238873839378357, + "learning_rate": 5.5419470655898883e-05, + "loss": 0.9149, + "step": 145760 + }, + { + "epoch": 0.9312829817410526, + "grad_norm": 0.8662456274032593, + "learning_rate": 5.541448250523817e-05, + "loss": 0.8491, + "step": 145770 + }, + { + "epoch": 0.9313468688907913, + "grad_norm": 1.59992253780365, + "learning_rate": 5.5409494300048935e-05, + "loss": 1.2597, + "step": 145780 + }, + { + "epoch": 0.93141075604053, + "grad_norm": 0.7041088342666626, + "learning_rate": 5.540450604038141e-05, + "loss": 0.8117, + "step": 145790 + }, + { + "epoch": 0.9314746431902687, + "grad_norm": 0.9120072722434998, + "learning_rate": 5.539951772628583e-05, + "loss": 0.8138, + "step": 145800 + }, + { + "epoch": 0.9315385303400074, + "grad_norm": 0.865118682384491, + "learning_rate": 5.539452935781244e-05, + "loss": 0.7262, + "step": 145810 + }, + { + "epoch": 0.9316024174897461, + "grad_norm": 0.5491125583648682, + "learning_rate": 5.5389540935011466e-05, + "loss": 0.7411, + "step": 145820 + }, + { + "epoch": 0.9316663046394849, + "grad_norm": 0.8732421398162842, + "learning_rate": 5.538455245793316e-05, + "loss": 0.8556, + "step": 145830 + }, + { + "epoch": 
0.9317301917892236, + "grad_norm": 0.7927635312080383, + "learning_rate": 5.5379563926627745e-05, + "loss": 1.0176, + "step": 145840 + }, + { + "epoch": 0.9317940789389623, + "grad_norm": 1.8838095664978027, + "learning_rate": 5.5374575341145476e-05, + "loss": 0.8793, + "step": 145850 + }, + { + "epoch": 0.931857966088701, + "grad_norm": 0.998735785484314, + "learning_rate": 5.536958670153658e-05, + "loss": 1.2281, + "step": 145860 + }, + { + "epoch": 0.9319218532384396, + "grad_norm": 0.7903896570205688, + "learning_rate": 5.53645980078513e-05, + "loss": 1.2174, + "step": 145870 + }, + { + "epoch": 0.9319857403881783, + "grad_norm": 0.8727949857711792, + "learning_rate": 5.535960926013987e-05, + "loss": 1.0978, + "step": 145880 + }, + { + "epoch": 0.932049627537917, + "grad_norm": 1.2408875226974487, + "learning_rate": 5.5354620458452546e-05, + "loss": 0.937, + "step": 145890 + }, + { + "epoch": 0.9321135146876557, + "grad_norm": 0.8686769604682922, + "learning_rate": 5.5349631602839557e-05, + "loss": 0.7722, + "step": 145900 + }, + { + "epoch": 0.9321774018373944, + "grad_norm": 0.914225697517395, + "learning_rate": 5.534464269335116e-05, + "loss": 0.7863, + "step": 145910 + }, + { + "epoch": 0.9322412889871331, + "grad_norm": 1.2484056949615479, + "learning_rate": 5.533965373003758e-05, + "loss": 0.8891, + "step": 145920 + }, + { + "epoch": 0.9323051761368718, + "grad_norm": 0.7330338358879089, + "learning_rate": 5.533466471294906e-05, + "loss": 0.9297, + "step": 145930 + }, + { + "epoch": 0.9323690632866105, + "grad_norm": 1.5863157510757446, + "learning_rate": 5.532967564213586e-05, + "loss": 0.926, + "step": 145940 + }, + { + "epoch": 0.9324329504363492, + "grad_norm": 1.0102980136871338, + "learning_rate": 5.532468651764822e-05, + "loss": 0.9029, + "step": 145950 + }, + { + "epoch": 0.9324968375860879, + "grad_norm": 1.0613675117492676, + "learning_rate": 5.531969733953637e-05, + "loss": 0.8485, + "step": 145960 + }, + { + "epoch": 0.9325607247358266, + 
"grad_norm": 0.8373472690582275, + "learning_rate": 5.531470810785057e-05, + "loss": 0.8047, + "step": 145970 + }, + { + "epoch": 0.9326246118855653, + "grad_norm": 3.7152907848358154, + "learning_rate": 5.5309718822641054e-05, + "loss": 0.8467, + "step": 145980 + }, + { + "epoch": 0.932688499035304, + "grad_norm": 1.0984721183776855, + "learning_rate": 5.5304729483958073e-05, + "loss": 0.6046, + "step": 145990 + }, + { + "epoch": 0.9327523861850427, + "grad_norm": 0.7963919043540955, + "learning_rate": 5.529974009185189e-05, + "loss": 0.9892, + "step": 146000 + }, + { + "epoch": 0.9328162733347815, + "grad_norm": 1.0803087949752808, + "learning_rate": 5.529475064637274e-05, + "loss": 0.965, + "step": 146010 + }, + { + "epoch": 0.9328801604845202, + "grad_norm": 0.7040061950683594, + "learning_rate": 5.528976114757086e-05, + "loss": 0.9054, + "step": 146020 + }, + { + "epoch": 0.9329440476342589, + "grad_norm": 1.2263482809066772, + "learning_rate": 5.528477159549652e-05, + "loss": 0.6155, + "step": 146030 + }, + { + "epoch": 0.9330079347839976, + "grad_norm": 0.532351016998291, + "learning_rate": 5.5279781990199954e-05, + "loss": 0.8038, + "step": 146040 + }, + { + "epoch": 0.9330718219337363, + "grad_norm": 0.8017789125442505, + "learning_rate": 5.527479233173142e-05, + "loss": 0.8907, + "step": 146050 + }, + { + "epoch": 0.933135709083475, + "grad_norm": 3.3494420051574707, + "learning_rate": 5.5269802620141155e-05, + "loss": 1.1866, + "step": 146060 + }, + { + "epoch": 0.9331995962332137, + "grad_norm": 0.9478211402893066, + "learning_rate": 5.526481285547943e-05, + "loss": 0.8094, + "step": 146070 + }, + { + "epoch": 0.9332634833829524, + "grad_norm": 0.8863970637321472, + "learning_rate": 5.525982303779648e-05, + "loss": 0.7409, + "step": 146080 + }, + { + "epoch": 0.9333273705326911, + "grad_norm": 1.6237622499465942, + "learning_rate": 5.525483316714256e-05, + "loss": 0.9142, + "step": 146090 + }, + { + "epoch": 0.9333912576824298, + "grad_norm": 
0.8216589689254761, + "learning_rate": 5.524984324356792e-05, + "loss": 0.7625, + "step": 146100 + }, + { + "epoch": 0.9334551448321684, + "grad_norm": 2.1370699405670166, + "learning_rate": 5.524485326712282e-05, + "loss": 0.9642, + "step": 146110 + }, + { + "epoch": 0.9335190319819071, + "grad_norm": 1.2563636302947998, + "learning_rate": 5.5239863237857516e-05, + "loss": 1.0334, + "step": 146120 + }, + { + "epoch": 0.9335829191316458, + "grad_norm": 0.9968759417533875, + "learning_rate": 5.5234873155822256e-05, + "loss": 1.0448, + "step": 146130 + }, + { + "epoch": 0.9336468062813845, + "grad_norm": 1.1890705823898315, + "learning_rate": 5.5229883021067286e-05, + "loss": 0.9123, + "step": 146140 + }, + { + "epoch": 0.9337106934311232, + "grad_norm": 0.9829585552215576, + "learning_rate": 5.522489283364286e-05, + "loss": 0.8136, + "step": 146150 + }, + { + "epoch": 0.9337745805808619, + "grad_norm": 1.1096529960632324, + "learning_rate": 5.521990259359925e-05, + "loss": 1.0181, + "step": 146160 + }, + { + "epoch": 0.9338384677306006, + "grad_norm": 0.9074599146842957, + "learning_rate": 5.521491230098671e-05, + "loss": 0.8634, + "step": 146170 + }, + { + "epoch": 0.9339023548803393, + "grad_norm": 0.8308917284011841, + "learning_rate": 5.520992195585549e-05, + "loss": 0.7422, + "step": 146180 + }, + { + "epoch": 0.933966242030078, + "grad_norm": 0.6945448517799377, + "learning_rate": 5.5204931558255857e-05, + "loss": 0.7908, + "step": 146190 + }, + { + "epoch": 0.9340301291798168, + "grad_norm": 1.085972547531128, + "learning_rate": 5.519994110823805e-05, + "loss": 1.0072, + "step": 146200 + }, + { + "epoch": 0.9340940163295555, + "grad_norm": 1.0287305116653442, + "learning_rate": 5.519495060585235e-05, + "loss": 1.133, + "step": 146210 + }, + { + "epoch": 0.9341579034792942, + "grad_norm": 0.679003894329071, + "learning_rate": 5.5189960051148995e-05, + "loss": 0.726, + "step": 146220 + }, + { + "epoch": 0.9342217906290329, + "grad_norm": 1.126184105873108, + 
"learning_rate": 5.5184969444178246e-05, + "loss": 1.1845, + "step": 146230 + }, + { + "epoch": 0.9342856777787716, + "grad_norm": 0.5712942481040955, + "learning_rate": 5.517997878499037e-05, + "loss": 0.7351, + "step": 146240 + }, + { + "epoch": 0.9343495649285103, + "grad_norm": 0.9215618371963501, + "learning_rate": 5.517498807363564e-05, + "loss": 0.9922, + "step": 146250 + }, + { + "epoch": 0.934413452078249, + "grad_norm": 1.0021543502807617, + "learning_rate": 5.516999731016429e-05, + "loss": 0.9386, + "step": 146260 + }, + { + "epoch": 0.9344773392279877, + "grad_norm": 0.5854945778846741, + "learning_rate": 5.516500649462659e-05, + "loss": 0.6703, + "step": 146270 + }, + { + "epoch": 0.9345412263777264, + "grad_norm": 0.7984985709190369, + "learning_rate": 5.5160015627072824e-05, + "loss": 0.8576, + "step": 146280 + }, + { + "epoch": 0.9346051135274651, + "grad_norm": 2.2100253105163574, + "learning_rate": 5.5155024707553226e-05, + "loss": 1.2271, + "step": 146290 + }, + { + "epoch": 0.9346690006772038, + "grad_norm": 0.6737677454948425, + "learning_rate": 5.5150033736118065e-05, + "loss": 0.617, + "step": 146300 + }, + { + "epoch": 0.9347328878269425, + "grad_norm": 0.48655664920806885, + "learning_rate": 5.514504271281762e-05, + "loss": 0.6696, + "step": 146310 + }, + { + "epoch": 0.9347967749766812, + "grad_norm": 2.0812628269195557, + "learning_rate": 5.514005163770214e-05, + "loss": 1.1076, + "step": 146320 + }, + { + "epoch": 0.9348606621264199, + "grad_norm": 0.6728872060775757, + "learning_rate": 5.513506051082189e-05, + "loss": 0.9832, + "step": 146330 + }, + { + "epoch": 0.9349245492761586, + "grad_norm": 1.2701008319854736, + "learning_rate": 5.513006933222714e-05, + "loss": 0.9593, + "step": 146340 + }, + { + "epoch": 0.9349884364258972, + "grad_norm": 1.0993520021438599, + "learning_rate": 5.5125078101968155e-05, + "loss": 0.846, + "step": 146350 + }, + { + "epoch": 0.935052323575636, + "grad_norm": 0.947177529335022, + "learning_rate": 
5.5120086820095195e-05, + "loss": 0.9143, + "step": 146360 + }, + { + "epoch": 0.9351162107253747, + "grad_norm": 0.9848998785018921, + "learning_rate": 5.511509548665854e-05, + "loss": 0.8256, + "step": 146370 + }, + { + "epoch": 0.9351800978751134, + "grad_norm": 1.219247579574585, + "learning_rate": 5.511010410170844e-05, + "loss": 0.7707, + "step": 146380 + }, + { + "epoch": 0.9352439850248521, + "grad_norm": 0.7544047832489014, + "learning_rate": 5.510511266529518e-05, + "loss": 0.9864, + "step": 146390 + }, + { + "epoch": 0.9353078721745908, + "grad_norm": 0.7642074227333069, + "learning_rate": 5.510012117746901e-05, + "loss": 0.7728, + "step": 146400 + }, + { + "epoch": 0.9353717593243295, + "grad_norm": 0.9915320873260498, + "learning_rate": 5.509512963828021e-05, + "loss": 1.2914, + "step": 146410 + }, + { + "epoch": 0.9354356464740682, + "grad_norm": 0.9371116757392883, + "learning_rate": 5.509013804777904e-05, + "loss": 0.8414, + "step": 146420 + }, + { + "epoch": 0.9354995336238069, + "grad_norm": 0.4904581904411316, + "learning_rate": 5.508514640601579e-05, + "loss": 0.9459, + "step": 146430 + }, + { + "epoch": 0.9355634207735456, + "grad_norm": 0.9960023760795593, + "learning_rate": 5.508015471304071e-05, + "loss": 0.8509, + "step": 146440 + }, + { + "epoch": 0.9356273079232843, + "grad_norm": 0.8758112788200378, + "learning_rate": 5.507516296890407e-05, + "loss": 0.8133, + "step": 146450 + }, + { + "epoch": 0.935691195073023, + "grad_norm": 0.8281605243682861, + "learning_rate": 5.507017117365616e-05, + "loss": 0.9745, + "step": 146460 + }, + { + "epoch": 0.9357550822227617, + "grad_norm": 0.9094333052635193, + "learning_rate": 5.5065179327347224e-05, + "loss": 0.9971, + "step": 146470 + }, + { + "epoch": 0.9358189693725004, + "grad_norm": 1.2272045612335205, + "learning_rate": 5.5060187430027565e-05, + "loss": 0.8394, + "step": 146480 + }, + { + "epoch": 0.9358828565222391, + "grad_norm": 1.1439415216445923, + "learning_rate": 5.505519548174745e-05, 
+ "loss": 0.9516, + "step": 146490 + }, + { + "epoch": 0.9359467436719778, + "grad_norm": 0.7094035744667053, + "learning_rate": 5.5050203482557115e-05, + "loss": 1.0136, + "step": 146500 + }, + { + "epoch": 0.9360106308217165, + "grad_norm": 0.6839352250099182, + "learning_rate": 5.5045211432506884e-05, + "loss": 0.7337, + "step": 146510 + }, + { + "epoch": 0.9360745179714552, + "grad_norm": 2.7014598846435547, + "learning_rate": 5.504021933164699e-05, + "loss": 1.0982, + "step": 146520 + }, + { + "epoch": 0.936138405121194, + "grad_norm": 1.0742672681808472, + "learning_rate": 5.503522718002774e-05, + "loss": 0.9552, + "step": 146530 + }, + { + "epoch": 0.9362022922709327, + "grad_norm": 0.7901598811149597, + "learning_rate": 5.5030234977699394e-05, + "loss": 0.8024, + "step": 146540 + }, + { + "epoch": 0.9362661794206714, + "grad_norm": 0.9857130646705627, + "learning_rate": 5.502524272471223e-05, + "loss": 1.0354, + "step": 146550 + }, + { + "epoch": 0.9363300665704101, + "grad_norm": 0.9056035280227661, + "learning_rate": 5.502025042111654e-05, + "loss": 0.9316, + "step": 146560 + }, + { + "epoch": 0.9363939537201488, + "grad_norm": 0.8060906529426575, + "learning_rate": 5.501525806696257e-05, + "loss": 0.9312, + "step": 146570 + }, + { + "epoch": 0.9364578408698875, + "grad_norm": 1.0485197305679321, + "learning_rate": 5.5010265662300606e-05, + "loss": 0.8028, + "step": 146580 + }, + { + "epoch": 0.9365217280196262, + "grad_norm": 0.9534648656845093, + "learning_rate": 5.500527320718094e-05, + "loss": 0.6943, + "step": 146590 + }, + { + "epoch": 0.9365856151693648, + "grad_norm": 0.6805192828178406, + "learning_rate": 5.500028070165385e-05, + "loss": 0.8013, + "step": 146600 + }, + { + "epoch": 0.9366495023191035, + "grad_norm": 0.996015191078186, + "learning_rate": 5.49952881457696e-05, + "loss": 0.7835, + "step": 146610 + }, + { + "epoch": 0.9367133894688422, + "grad_norm": 1.0089763402938843, + "learning_rate": 5.4990295539578474e-05, + "loss": 0.7471, + 
"step": 146620 + }, + { + "epoch": 0.9367772766185809, + "grad_norm": 1.1047331094741821, + "learning_rate": 5.498530288313075e-05, + "loss": 0.8188, + "step": 146630 + }, + { + "epoch": 0.9368411637683196, + "grad_norm": 1.0767524242401123, + "learning_rate": 5.4980310176476726e-05, + "loss": 0.7166, + "step": 146640 + }, + { + "epoch": 0.9369050509180583, + "grad_norm": 1.266998291015625, + "learning_rate": 5.497531741966666e-05, + "loss": 0.8517, + "step": 146650 + }, + { + "epoch": 0.936968938067797, + "grad_norm": 0.7340264320373535, + "learning_rate": 5.497032461275085e-05, + "loss": 0.8263, + "step": 146660 + }, + { + "epoch": 0.9370328252175357, + "grad_norm": 0.8520843386650085, + "learning_rate": 5.496533175577957e-05, + "loss": 1.0392, + "step": 146670 + }, + { + "epoch": 0.9370967123672744, + "grad_norm": 1.1523327827453613, + "learning_rate": 5.4960338848803084e-05, + "loss": 0.8068, + "step": 146680 + }, + { + "epoch": 0.9371605995170131, + "grad_norm": 1.1300572156906128, + "learning_rate": 5.4955345891871716e-05, + "loss": 0.7149, + "step": 146690 + }, + { + "epoch": 0.9372244866667518, + "grad_norm": 1.2153987884521484, + "learning_rate": 5.495035288503573e-05, + "loss": 0.8757, + "step": 146700 + }, + { + "epoch": 0.9372883738164905, + "grad_norm": 0.7240068912506104, + "learning_rate": 5.49453598283454e-05, + "loss": 0.7754, + "step": 146710 + }, + { + "epoch": 0.9373522609662293, + "grad_norm": 0.8426817059516907, + "learning_rate": 5.494036672185102e-05, + "loss": 0.8532, + "step": 146720 + }, + { + "epoch": 0.937416148115968, + "grad_norm": 0.8055009245872498, + "learning_rate": 5.4935373565602864e-05, + "loss": 0.8759, + "step": 146730 + }, + { + "epoch": 0.9374800352657067, + "grad_norm": 1.1292035579681396, + "learning_rate": 5.4930380359651244e-05, + "loss": 0.9746, + "step": 146740 + }, + { + "epoch": 0.9375439224154454, + "grad_norm": 0.588070809841156, + "learning_rate": 5.492538710404642e-05, + "loss": 0.8524, + "step": 146750 + }, + { 
+ "epoch": 0.9376078095651841, + "grad_norm": 0.8825819492340088, + "learning_rate": 5.492039379883869e-05, + "loss": 0.8543, + "step": 146760 + }, + { + "epoch": 0.9376716967149228, + "grad_norm": 0.8504408001899719, + "learning_rate": 5.491540044407833e-05, + "loss": 0.8576, + "step": 146770 + }, + { + "epoch": 0.9377355838646615, + "grad_norm": 0.845874011516571, + "learning_rate": 5.491040703981564e-05, + "loss": 0.6989, + "step": 146780 + }, + { + "epoch": 0.9377994710144002, + "grad_norm": 0.8666792511940002, + "learning_rate": 5.4905413586100904e-05, + "loss": 0.8939, + "step": 146790 + }, + { + "epoch": 0.9378633581641389, + "grad_norm": 1.1524525880813599, + "learning_rate": 5.4900420082984416e-05, + "loss": 1.1092, + "step": 146800 + }, + { + "epoch": 0.9379272453138776, + "grad_norm": 0.5927907824516296, + "learning_rate": 5.489542653051646e-05, + "loss": 0.6445, + "step": 146810 + }, + { + "epoch": 0.9379911324636163, + "grad_norm": 0.8610501885414124, + "learning_rate": 5.4890432928747306e-05, + "loss": 0.5961, + "step": 146820 + }, + { + "epoch": 0.938055019613355, + "grad_norm": 0.6616340279579163, + "learning_rate": 5.488543927772727e-05, + "loss": 0.952, + "step": 146830 + }, + { + "epoch": 0.9381189067630936, + "grad_norm": 0.5525166988372803, + "learning_rate": 5.488044557750662e-05, + "loss": 1.0917, + "step": 146840 + }, + { + "epoch": 0.9381827939128323, + "grad_norm": 1.3952975273132324, + "learning_rate": 5.487545182813568e-05, + "loss": 0.979, + "step": 146850 + }, + { + "epoch": 0.938246681062571, + "grad_norm": 0.9401289224624634, + "learning_rate": 5.4870458029664714e-05, + "loss": 0.8715, + "step": 146860 + }, + { + "epoch": 0.9383105682123097, + "grad_norm": 1.1923482418060303, + "learning_rate": 5.486546418214402e-05, + "loss": 1.0171, + "step": 146870 + }, + { + "epoch": 0.9383744553620484, + "grad_norm": 1.33669912815094, + "learning_rate": 5.486047028562391e-05, + "loss": 0.7589, + "step": 146880 + }, + { + "epoch": 
0.9384383425117871, + "grad_norm": 0.8979326486587524, + "learning_rate": 5.4855476340154647e-05, + "loss": 0.6521, + "step": 146890 + }, + { + "epoch": 0.9385022296615259, + "grad_norm": 0.5857362747192383, + "learning_rate": 5.4850482345786534e-05, + "loss": 0.6615, + "step": 146900 + }, + { + "epoch": 0.9385661168112646, + "grad_norm": 1.1938517093658447, + "learning_rate": 5.484548830256987e-05, + "loss": 0.7288, + "step": 146910 + }, + { + "epoch": 0.9386300039610033, + "grad_norm": 0.7541248202323914, + "learning_rate": 5.484049421055495e-05, + "loss": 0.613, + "step": 146920 + }, + { + "epoch": 0.938693891110742, + "grad_norm": 1.009811282157898, + "learning_rate": 5.483550006979206e-05, + "loss": 0.7539, + "step": 146930 + }, + { + "epoch": 0.9387577782604807, + "grad_norm": 1.518933653831482, + "learning_rate": 5.4830505880331496e-05, + "loss": 0.7572, + "step": 146940 + }, + { + "epoch": 0.9388216654102194, + "grad_norm": 1.1620988845825195, + "learning_rate": 5.482551164222357e-05, + "loss": 0.8322, + "step": 146950 + }, + { + "epoch": 0.9388855525599581, + "grad_norm": 0.6771840453147888, + "learning_rate": 5.482051735551856e-05, + "loss": 0.8801, + "step": 146960 + }, + { + "epoch": 0.9389494397096968, + "grad_norm": 1.18392813205719, + "learning_rate": 5.481552302026678e-05, + "loss": 0.6552, + "step": 146970 + }, + { + "epoch": 0.9390133268594355, + "grad_norm": 0.7388846278190613, + "learning_rate": 5.481052863651851e-05, + "loss": 0.8432, + "step": 146980 + }, + { + "epoch": 0.9390772140091742, + "grad_norm": 0.739513099193573, + "learning_rate": 5.480553420432405e-05, + "loss": 0.732, + "step": 146990 + }, + { + "epoch": 0.9391411011589129, + "grad_norm": 1.6718555688858032, + "learning_rate": 5.4800539723733714e-05, + "loss": 0.7453, + "step": 147000 + }, + { + "epoch": 0.9392049883086516, + "grad_norm": 1.3321524858474731, + "learning_rate": 5.479554519479778e-05, + "loss": 0.9719, + "step": 147010 + }, + { + "epoch": 0.9392688754583903, + 
"grad_norm": 0.7319386601448059, + "learning_rate": 5.479055061756656e-05, + "loss": 0.7486, + "step": 147020 + }, + { + "epoch": 0.939332762608129, + "grad_norm": 1.211982250213623, + "learning_rate": 5.478555599209035e-05, + "loss": 0.8766, + "step": 147030 + }, + { + "epoch": 0.9393966497578677, + "grad_norm": 1.0847975015640259, + "learning_rate": 5.478056131841947e-05, + "loss": 0.9276, + "step": 147040 + }, + { + "epoch": 0.9394605369076064, + "grad_norm": 1.0367181301116943, + "learning_rate": 5.477556659660418e-05, + "loss": 0.7712, + "step": 147050 + }, + { + "epoch": 0.9395244240573452, + "grad_norm": 1.7131720781326294, + "learning_rate": 5.4770571826694806e-05, + "loss": 0.9039, + "step": 147060 + }, + { + "epoch": 0.9395883112070839, + "grad_norm": 0.8213444948196411, + "learning_rate": 5.4765577008741644e-05, + "loss": 0.7636, + "step": 147070 + }, + { + "epoch": 0.9396521983568225, + "grad_norm": 0.7397197484970093, + "learning_rate": 5.4760582142795006e-05, + "loss": 0.8527, + "step": 147080 + }, + { + "epoch": 0.9397160855065612, + "grad_norm": 0.7857769131660461, + "learning_rate": 5.475558722890518e-05, + "loss": 0.8647, + "step": 147090 + }, + { + "epoch": 0.9397799726562999, + "grad_norm": 0.7695388197898865, + "learning_rate": 5.4750592267122494e-05, + "loss": 0.8865, + "step": 147100 + }, + { + "epoch": 0.9398438598060386, + "grad_norm": 0.926115870475769, + "learning_rate": 5.4745597257497215e-05, + "loss": 1.1938, + "step": 147110 + }, + { + "epoch": 0.9399077469557773, + "grad_norm": 0.6379870772361755, + "learning_rate": 5.474060220007967e-05, + "loss": 0.7275, + "step": 147120 + }, + { + "epoch": 0.939971634105516, + "grad_norm": 0.6762766242027283, + "learning_rate": 5.473560709492016e-05, + "loss": 1.1361, + "step": 147130 + }, + { + "epoch": 0.9400355212552547, + "grad_norm": 0.9891712069511414, + "learning_rate": 5.4730611942069e-05, + "loss": 0.7849, + "step": 147140 + }, + { + "epoch": 0.9400994084049934, + "grad_norm": 
1.2105982303619385, + "learning_rate": 5.472561674157647e-05, + "loss": 0.9796, + "step": 147150 + }, + { + "epoch": 0.9401632955547321, + "grad_norm": 0.9586398601531982, + "learning_rate": 5.47206214934929e-05, + "loss": 0.9349, + "step": 147160 + }, + { + "epoch": 0.9402271827044708, + "grad_norm": 1.1456085443496704, + "learning_rate": 5.471562619786858e-05, + "loss": 0.9046, + "step": 147170 + }, + { + "epoch": 0.9402910698542095, + "grad_norm": 0.9945286512374878, + "learning_rate": 5.471063085475383e-05, + "loss": 0.8429, + "step": 147180 + }, + { + "epoch": 0.9403549570039482, + "grad_norm": 0.9011921882629395, + "learning_rate": 5.4705635464198954e-05, + "loss": 0.8381, + "step": 147190 + }, + { + "epoch": 0.9404188441536869, + "grad_norm": 0.6775206327438354, + "learning_rate": 5.4700640026254246e-05, + "loss": 0.9295, + "step": 147200 + }, + { + "epoch": 0.9404827313034256, + "grad_norm": 0.9664705991744995, + "learning_rate": 5.469564454097004e-05, + "loss": 1.06, + "step": 147210 + }, + { + "epoch": 0.9405466184531643, + "grad_norm": 2.8239731788635254, + "learning_rate": 5.469064900839662e-05, + "loss": 0.8501, + "step": 147220 + }, + { + "epoch": 0.940610505602903, + "grad_norm": 1.0002690553665161, + "learning_rate": 5.4685653428584314e-05, + "loss": 0.9444, + "step": 147230 + }, + { + "epoch": 0.9406743927526418, + "grad_norm": 0.9000080227851868, + "learning_rate": 5.468065780158343e-05, + "loss": 0.912, + "step": 147240 + }, + { + "epoch": 0.9407382799023805, + "grad_norm": 1.004228115081787, + "learning_rate": 5.467566212744427e-05, + "loss": 0.8445, + "step": 147250 + }, + { + "epoch": 0.9408021670521192, + "grad_norm": 0.896317720413208, + "learning_rate": 5.467066640621714e-05, + "loss": 1.1256, + "step": 147260 + }, + { + "epoch": 0.9408660542018579, + "grad_norm": 1.0601049661636353, + "learning_rate": 5.466567063795237e-05, + "loss": 0.944, + "step": 147270 + }, + { + "epoch": 0.9409299413515966, + "grad_norm": 1.2464497089385986, + 
"learning_rate": 5.4660674822700264e-05, + "loss": 0.7599, + "step": 147280 + }, + { + "epoch": 0.9409938285013353, + "grad_norm": 0.9628870487213135, + "learning_rate": 5.4655678960511116e-05, + "loss": 0.9559, + "step": 147290 + }, + { + "epoch": 0.941057715651074, + "grad_norm": 0.6807940602302551, + "learning_rate": 5.465068305143526e-05, + "loss": 0.9153, + "step": 147300 + }, + { + "epoch": 0.9411216028008127, + "grad_norm": 0.7045243382453918, + "learning_rate": 5.4645687095523004e-05, + "loss": 0.6033, + "step": 147310 + }, + { + "epoch": 0.9411854899505514, + "grad_norm": 1.0960919857025146, + "learning_rate": 5.464069109282465e-05, + "loss": 0.9625, + "step": 147320 + }, + { + "epoch": 0.94124937710029, + "grad_norm": 0.8039271235466003, + "learning_rate": 5.4635695043390526e-05, + "loss": 1.0624, + "step": 147330 + }, + { + "epoch": 0.9413132642500287, + "grad_norm": 0.8753572106361389, + "learning_rate": 5.463069894727094e-05, + "loss": 0.7895, + "step": 147340 + }, + { + "epoch": 0.9413771513997674, + "grad_norm": 0.9138633608818054, + "learning_rate": 5.462570280451622e-05, + "loss": 1.1042, + "step": 147350 + }, + { + "epoch": 0.9414410385495061, + "grad_norm": 0.7882958054542542, + "learning_rate": 5.4620706615176645e-05, + "loss": 0.759, + "step": 147360 + }, + { + "epoch": 0.9415049256992448, + "grad_norm": 0.7644445896148682, + "learning_rate": 5.4615710379302574e-05, + "loss": 0.6677, + "step": 147370 + }, + { + "epoch": 0.9415688128489835, + "grad_norm": 0.8131184577941895, + "learning_rate": 5.461071409694432e-05, + "loss": 0.8138, + "step": 147380 + }, + { + "epoch": 0.9416326999987222, + "grad_norm": 1.2795737981796265, + "learning_rate": 5.460571776815216e-05, + "loss": 0.9862, + "step": 147390 + }, + { + "epoch": 0.9416965871484609, + "grad_norm": 0.9743437767028809, + "learning_rate": 5.460072139297646e-05, + "loss": 0.9635, + "step": 147400 + }, + { + "epoch": 0.9417604742981996, + "grad_norm": 0.8721929788589478, + "learning_rate": 
5.459572497146751e-05, + "loss": 0.9526, + "step": 147410 + }, + { + "epoch": 0.9418243614479384, + "grad_norm": 0.8396299481391907, + "learning_rate": 5.459072850367563e-05, + "loss": 0.9158, + "step": 147420 + }, + { + "epoch": 0.9418882485976771, + "grad_norm": 0.7873830795288086, + "learning_rate": 5.4585731989651144e-05, + "loss": 0.8138, + "step": 147430 + }, + { + "epoch": 0.9419521357474158, + "grad_norm": 1.116898775100708, + "learning_rate": 5.458073542944436e-05, + "loss": 0.9122, + "step": 147440 + }, + { + "epoch": 0.9420160228971545, + "grad_norm": 0.5282606482505798, + "learning_rate": 5.4575738823105626e-05, + "loss": 0.7972, + "step": 147450 + }, + { + "epoch": 0.9420799100468932, + "grad_norm": 0.8728241920471191, + "learning_rate": 5.457074217068523e-05, + "loss": 0.6552, + "step": 147460 + }, + { + "epoch": 0.9421437971966319, + "grad_norm": 0.9961313009262085, + "learning_rate": 5.456574547223351e-05, + "loss": 0.896, + "step": 147470 + }, + { + "epoch": 0.9422076843463706, + "grad_norm": 0.6567425727844238, + "learning_rate": 5.456074872780078e-05, + "loss": 0.6517, + "step": 147480 + }, + { + "epoch": 0.9422715714961093, + "grad_norm": 1.023400068283081, + "learning_rate": 5.455575193743737e-05, + "loss": 1.0409, + "step": 147490 + }, + { + "epoch": 0.942335458645848, + "grad_norm": 1.032383918762207, + "learning_rate": 5.455075510119359e-05, + "loss": 1.1724, + "step": 147500 + }, + { + "epoch": 0.9423993457955867, + "grad_norm": 1.1703791618347168, + "learning_rate": 5.454575821911978e-05, + "loss": 0.9594, + "step": 147510 + }, + { + "epoch": 0.9424632329453254, + "grad_norm": 0.9019293785095215, + "learning_rate": 5.454076129126624e-05, + "loss": 0.8071, + "step": 147520 + }, + { + "epoch": 0.9425271200950641, + "grad_norm": 0.9911213517189026, + "learning_rate": 5.4535764317683314e-05, + "loss": 0.856, + "step": 147530 + }, + { + "epoch": 0.9425910072448028, + "grad_norm": 1.0215911865234375, + "learning_rate": 5.4530767298421315e-05, + 
"loss": 0.9103, + "step": 147540 + }, + { + "epoch": 0.9426548943945415, + "grad_norm": 1.2681025266647339, + "learning_rate": 5.452577023353057e-05, + "loss": 0.9712, + "step": 147550 + }, + { + "epoch": 0.9427187815442802, + "grad_norm": 1.3751388788223267, + "learning_rate": 5.4520773123061406e-05, + "loss": 1.0203, + "step": 147560 + }, + { + "epoch": 0.9427826686940188, + "grad_norm": 0.6695000529289246, + "learning_rate": 5.4515775967064145e-05, + "loss": 0.9345, + "step": 147570 + }, + { + "epoch": 0.9428465558437575, + "grad_norm": 0.8419767618179321, + "learning_rate": 5.4510778765589096e-05, + "loss": 0.8482, + "step": 147580 + }, + { + "epoch": 0.9429104429934962, + "grad_norm": 0.9437578320503235, + "learning_rate": 5.4505781518686626e-05, + "loss": 0.9328, + "step": 147590 + }, + { + "epoch": 0.942974330143235, + "grad_norm": 0.9840037226676941, + "learning_rate": 5.450078422640703e-05, + "loss": 1.0883, + "step": 147600 + }, + { + "epoch": 0.9430382172929737, + "grad_norm": 0.8432072401046753, + "learning_rate": 5.449578688880064e-05, + "loss": 0.7759, + "step": 147610 + }, + { + "epoch": 0.9431021044427124, + "grad_norm": 0.9873720407485962, + "learning_rate": 5.44907895059178e-05, + "loss": 0.7869, + "step": 147620 + }, + { + "epoch": 0.9431659915924511, + "grad_norm": 1.06051504611969, + "learning_rate": 5.44857920778088e-05, + "loss": 0.6161, + "step": 147630 + }, + { + "epoch": 0.9432298787421898, + "grad_norm": 0.6839306950569153, + "learning_rate": 5.448079460452401e-05, + "loss": 0.8623, + "step": 147640 + }, + { + "epoch": 0.9432937658919285, + "grad_norm": 0.9109451174736023, + "learning_rate": 5.4475797086113736e-05, + "loss": 0.7093, + "step": 147650 + }, + { + "epoch": 0.9433576530416672, + "grad_norm": 0.6764558553695679, + "learning_rate": 5.447079952262831e-05, + "loss": 0.715, + "step": 147660 + }, + { + "epoch": 0.9434215401914059, + "grad_norm": 1.947048306465149, + "learning_rate": 5.446580191411808e-05, + "loss": 1.3239, + "step": 
147670 + }, + { + "epoch": 0.9434854273411446, + "grad_norm": 0.9166133403778076, + "learning_rate": 5.446080426063335e-05, + "loss": 1.124, + "step": 147680 + }, + { + "epoch": 0.9435493144908833, + "grad_norm": 0.7818967700004578, + "learning_rate": 5.4455806562224466e-05, + "loss": 0.9505, + "step": 147690 + }, + { + "epoch": 0.943613201640622, + "grad_norm": 1.0316641330718994, + "learning_rate": 5.445080881894174e-05, + "loss": 0.9351, + "step": 147700 + }, + { + "epoch": 0.9436770887903607, + "grad_norm": 0.9473081231117249, + "learning_rate": 5.444581103083553e-05, + "loss": 0.6593, + "step": 147710 + }, + { + "epoch": 0.9437409759400994, + "grad_norm": 2.7547168731689453, + "learning_rate": 5.4440813197956165e-05, + "loss": 0.8583, + "step": 147720 + }, + { + "epoch": 0.9438048630898381, + "grad_norm": 1.0668847560882568, + "learning_rate": 5.443581532035396e-05, + "loss": 0.8896, + "step": 147730 + }, + { + "epoch": 0.9438687502395768, + "grad_norm": 0.618463933467865, + "learning_rate": 5.443081739807926e-05, + "loss": 0.7973, + "step": 147740 + }, + { + "epoch": 0.9439326373893155, + "grad_norm": 0.8581937551498413, + "learning_rate": 5.442581943118239e-05, + "loss": 0.6358, + "step": 147750 + }, + { + "epoch": 0.9439965245390542, + "grad_norm": 0.7288326621055603, + "learning_rate": 5.44208214197137e-05, + "loss": 0.8207, + "step": 147760 + }, + { + "epoch": 0.944060411688793, + "grad_norm": 1.2387948036193848, + "learning_rate": 5.4415823363723515e-05, + "loss": 1.1969, + "step": 147770 + }, + { + "epoch": 0.9441242988385317, + "grad_norm": 0.8606761693954468, + "learning_rate": 5.441082526326217e-05, + "loss": 0.7147, + "step": 147780 + }, + { + "epoch": 0.9441881859882704, + "grad_norm": 0.8213832378387451, + "learning_rate": 5.4405827118379984e-05, + "loss": 0.8981, + "step": 147790 + }, + { + "epoch": 0.9442520731380091, + "grad_norm": 1.2024768590927124, + "learning_rate": 5.440082892912731e-05, + "loss": 0.7657, + "step": 147800 + }, + { + 
"epoch": 0.9443159602877477, + "grad_norm": 1.0111289024353027, + "learning_rate": 5.439583069555448e-05, + "loss": 0.9585, + "step": 147810 + }, + { + "epoch": 0.9443798474374864, + "grad_norm": 0.8769100904464722, + "learning_rate": 5.439083241771185e-05, + "loss": 0.8346, + "step": 147820 + }, + { + "epoch": 0.9444437345872251, + "grad_norm": 2.532590389251709, + "learning_rate": 5.438583409564972e-05, + "loss": 0.7687, + "step": 147830 + }, + { + "epoch": 0.9445076217369638, + "grad_norm": 0.7972337007522583, + "learning_rate": 5.4380835729418454e-05, + "loss": 0.9489, + "step": 147840 + }, + { + "epoch": 0.9445715088867025, + "grad_norm": 1.492598295211792, + "learning_rate": 5.437583731906838e-05, + "loss": 0.9452, + "step": 147850 + }, + { + "epoch": 0.9446353960364412, + "grad_norm": 0.8178605437278748, + "learning_rate": 5.4370838864649845e-05, + "loss": 0.8965, + "step": 147860 + }, + { + "epoch": 0.9446992831861799, + "grad_norm": 0.7244489789009094, + "learning_rate": 5.436584036621317e-05, + "loss": 0.7104, + "step": 147870 + }, + { + "epoch": 0.9447631703359186, + "grad_norm": 1.0198410749435425, + "learning_rate": 5.4360841823808715e-05, + "loss": 1.0496, + "step": 147880 + }, + { + "epoch": 0.9448270574856573, + "grad_norm": 0.7587177157402039, + "learning_rate": 5.435584323748679e-05, + "loss": 0.9377, + "step": 147890 + }, + { + "epoch": 0.944890944635396, + "grad_norm": 0.8727468252182007, + "learning_rate": 5.4350844607297776e-05, + "loss": 0.712, + "step": 147900 + }, + { + "epoch": 0.9449548317851347, + "grad_norm": 0.5229664444923401, + "learning_rate": 5.4345845933291984e-05, + "loss": 0.6849, + "step": 147910 + }, + { + "epoch": 0.9450187189348734, + "grad_norm": 0.65986567735672, + "learning_rate": 5.4340847215519776e-05, + "loss": 0.9049, + "step": 147920 + }, + { + "epoch": 0.9450826060846121, + "grad_norm": 1.3852014541625977, + "learning_rate": 5.4335848454031466e-05, + "loss": 0.7575, + "step": 147930 + }, + { + "epoch": 
0.9451464932343508, + "grad_norm": 1.0181684494018555, + "learning_rate": 5.433084964887742e-05, + "loss": 0.879, + "step": 147940 + }, + { + "epoch": 0.9452103803840896, + "grad_norm": 2.153229236602783, + "learning_rate": 5.432585080010797e-05, + "loss": 0.8431, + "step": 147950 + }, + { + "epoch": 0.9452742675338283, + "grad_norm": 0.9575214385986328, + "learning_rate": 5.432085190777346e-05, + "loss": 0.659, + "step": 147960 + }, + { + "epoch": 0.945338154683567, + "grad_norm": 0.5660369396209717, + "learning_rate": 5.431585297192423e-05, + "loss": 0.967, + "step": 147970 + }, + { + "epoch": 0.9454020418333057, + "grad_norm": 1.3022630214691162, + "learning_rate": 5.431085399261063e-05, + "loss": 0.8355, + "step": 147980 + }, + { + "epoch": 0.9454659289830444, + "grad_norm": 1.3143341541290283, + "learning_rate": 5.4305854969883006e-05, + "loss": 0.8243, + "step": 147990 + }, + { + "epoch": 0.9455298161327831, + "grad_norm": 1.7055933475494385, + "learning_rate": 5.4300855903791694e-05, + "loss": 1.1814, + "step": 148000 + }, + { + "epoch": 0.9455937032825218, + "grad_norm": 0.5489552021026611, + "learning_rate": 5.429585679438705e-05, + "loss": 0.761, + "step": 148010 + }, + { + "epoch": 0.9456575904322605, + "grad_norm": 1.1143733263015747, + "learning_rate": 5.429085764171939e-05, + "loss": 1.1932, + "step": 148020 + }, + { + "epoch": 0.9457214775819992, + "grad_norm": 1.6179147958755493, + "learning_rate": 5.42858584458391e-05, + "loss": 1.4239, + "step": 148030 + }, + { + "epoch": 0.9457853647317379, + "grad_norm": 0.9910181164741516, + "learning_rate": 5.4280859206796506e-05, + "loss": 0.8706, + "step": 148040 + }, + { + "epoch": 0.9458492518814765, + "grad_norm": 1.4709466695785522, + "learning_rate": 5.4275859924641936e-05, + "loss": 0.6839, + "step": 148050 + }, + { + "epoch": 0.9459131390312152, + "grad_norm": 0.8617550730705261, + "learning_rate": 5.4270860599425775e-05, + "loss": 1.1046, + "step": 148060 + }, + { + "epoch": 0.9459770261809539, + 
"grad_norm": 1.1911267042160034, + "learning_rate": 5.426586123119835e-05, + "loss": 0.8552, + "step": 148070 + }, + { + "epoch": 0.9460409133306926, + "grad_norm": 0.9309574365615845, + "learning_rate": 5.426086182001001e-05, + "loss": 0.8093, + "step": 148080 + }, + { + "epoch": 0.9461048004804313, + "grad_norm": 0.9242094159126282, + "learning_rate": 5.425586236591112e-05, + "loss": 0.8593, + "step": 148090 + }, + { + "epoch": 0.94616868763017, + "grad_norm": 0.7709175944328308, + "learning_rate": 5.4250862868951994e-05, + "loss": 0.9407, + "step": 148100 + }, + { + "epoch": 0.9462325747799087, + "grad_norm": 2.212761878967285, + "learning_rate": 5.424586332918301e-05, + "loss": 0.8686, + "step": 148110 + }, + { + "epoch": 0.9462964619296474, + "grad_norm": 0.9792400598526001, + "learning_rate": 5.424086374665451e-05, + "loss": 0.8417, + "step": 148120 + }, + { + "epoch": 0.9463603490793862, + "grad_norm": 0.8189619183540344, + "learning_rate": 5.423586412141685e-05, + "loss": 0.7651, + "step": 148130 + }, + { + "epoch": 0.9464242362291249, + "grad_norm": 0.6825985312461853, + "learning_rate": 5.423086445352036e-05, + "loss": 0.6932, + "step": 148140 + }, + { + "epoch": 0.9464881233788636, + "grad_norm": 1.7145543098449707, + "learning_rate": 5.422586474301541e-05, + "loss": 0.789, + "step": 148150 + }, + { + "epoch": 0.9465520105286023, + "grad_norm": 0.9677301645278931, + "learning_rate": 5.4220864989952345e-05, + "loss": 0.9292, + "step": 148160 + }, + { + "epoch": 0.946615897678341, + "grad_norm": 0.8834055662155151, + "learning_rate": 5.421586519438152e-05, + "loss": 0.7546, + "step": 148170 + }, + { + "epoch": 0.9466797848280797, + "grad_norm": 1.360954999923706, + "learning_rate": 5.421086535635328e-05, + "loss": 0.9588, + "step": 148180 + }, + { + "epoch": 0.9467436719778184, + "grad_norm": 0.9436541795730591, + "learning_rate": 5.4205865475918e-05, + "loss": 0.7112, + "step": 148190 + }, + { + "epoch": 0.9468075591275571, + "grad_norm": 
1.135512351989746, + "learning_rate": 5.420086555312599e-05, + "loss": 1.0807, + "step": 148200 + }, + { + "epoch": 0.9468714462772958, + "grad_norm": NaN, + "learning_rate": 5.419636558643983e-05, + "loss": 0.9952, + "step": 148210 + }, + { + "epoch": 0.9469353334270345, + "grad_norm": 0.8383281230926514, + "learning_rate": 5.4191365583308814e-05, + "loss": 0.9675, + "step": 148220 + }, + { + "epoch": 0.9469992205767732, + "grad_norm": 0.6859557032585144, + "learning_rate": 5.418636553796713e-05, + "loss": 0.9006, + "step": 148230 + }, + { + "epoch": 0.9470631077265119, + "grad_norm": 0.9620136022567749, + "learning_rate": 5.4181365450465125e-05, + "loss": 0.8653, + "step": 148240 + }, + { + "epoch": 0.9471269948762506, + "grad_norm": 0.5614147186279297, + "learning_rate": 5.417636532085315e-05, + "loss": 0.7507, + "step": 148250 + }, + { + "epoch": 0.9471908820259893, + "grad_norm": 1.3918198347091675, + "learning_rate": 5.417136514918156e-05, + "loss": 0.9124, + "step": 148260 + }, + { + "epoch": 0.947254769175728, + "grad_norm": 0.8057130575180054, + "learning_rate": 5.416636493550071e-05, + "loss": 0.8625, + "step": 148270 + }, + { + "epoch": 0.9473186563254667, + "grad_norm": 1.2097066640853882, + "learning_rate": 5.4161364679860974e-05, + "loss": 0.9413, + "step": 148280 + }, + { + "epoch": 0.9473825434752055, + "grad_norm": 0.8656195402145386, + "learning_rate": 5.415636438231269e-05, + "loss": 0.851, + "step": 148290 + }, + { + "epoch": 0.947446430624944, + "grad_norm": 0.9024816751480103, + "learning_rate": 5.4151364042906216e-05, + "loss": 0.6835, + "step": 148300 + }, + { + "epoch": 0.9475103177746828, + "grad_norm": 0.9172950983047485, + "learning_rate": 5.414636366169191e-05, + "loss": 0.9436, + "step": 148310 + }, + { + "epoch": 0.9475742049244215, + "grad_norm": 1.2229052782058716, + "learning_rate": 5.4141363238720144e-05, + "loss": 0.7715, + "step": 148320 + }, + { + "epoch": 0.9476380920741602, + "grad_norm": 0.7805074453353882, + 
"learning_rate": 5.4136362774041274e-05, + "loss": 0.7959, + "step": 148330 + }, + { + "epoch": 0.9477019792238989, + "grad_norm": 1.2638076543807983, + "learning_rate": 5.4131362267705635e-05, + "loss": 0.8026, + "step": 148340 + }, + { + "epoch": 0.9477658663736376, + "grad_norm": 0.8353046774864197, + "learning_rate": 5.412636171976362e-05, + "loss": 0.8111, + "step": 148350 + }, + { + "epoch": 0.9478297535233763, + "grad_norm": 0.7682856917381287, + "learning_rate": 5.4121361130265556e-05, + "loss": 1.0184, + "step": 148360 + }, + { + "epoch": 0.947893640673115, + "grad_norm": 1.0108745098114014, + "learning_rate": 5.411636049926183e-05, + "loss": 0.9613, + "step": 148370 + }, + { + "epoch": 0.9479575278228537, + "grad_norm": 1.050826907157898, + "learning_rate": 5.4111359826802785e-05, + "loss": 0.742, + "step": 148380 + }, + { + "epoch": 0.9480214149725924, + "grad_norm": 0.868506133556366, + "learning_rate": 5.41063591129388e-05, + "loss": 1.0165, + "step": 148390 + }, + { + "epoch": 0.9480853021223311, + "grad_norm": 0.8240692019462585, + "learning_rate": 5.410135835772023e-05, + "loss": 0.9583, + "step": 148400 + }, + { + "epoch": 0.9481491892720698, + "grad_norm": 0.7590421438217163, + "learning_rate": 5.409635756119742e-05, + "loss": 0.9101, + "step": 148410 + }, + { + "epoch": 0.9482130764218085, + "grad_norm": 0.9311391711235046, + "learning_rate": 5.409135672342076e-05, + "loss": 0.9968, + "step": 148420 + }, + { + "epoch": 0.9482769635715472, + "grad_norm": 0.7667366862297058, + "learning_rate": 5.408635584444058e-05, + "loss": 0.7345, + "step": 148430 + }, + { + "epoch": 0.9483408507212859, + "grad_norm": 2.864366292953491, + "learning_rate": 5.408135492430728e-05, + "loss": 0.9094, + "step": 148440 + }, + { + "epoch": 0.9484047378710246, + "grad_norm": 1.2095937728881836, + "learning_rate": 5.407635396307119e-05, + "loss": 0.9626, + "step": 148450 + }, + { + "epoch": 0.9484686250207633, + "grad_norm": 1.3204469680786133, + "learning_rate": 
5.4071352960782697e-05, + "loss": 0.9588, + "step": 148460 + }, + { + "epoch": 0.948532512170502, + "grad_norm": 0.5598198771476746, + "learning_rate": 5.406635191749215e-05, + "loss": 0.8083, + "step": 148470 + }, + { + "epoch": 0.9485963993202408, + "grad_norm": 1.0732495784759521, + "learning_rate": 5.406135083324993e-05, + "loss": 0.8825, + "step": 148480 + }, + { + "epoch": 0.9486602864699795, + "grad_norm": 0.9504900574684143, + "learning_rate": 5.405634970810639e-05, + "loss": 1.174, + "step": 148490 + }, + { + "epoch": 0.9487241736197182, + "grad_norm": 1.0843398571014404, + "learning_rate": 5.40513485421119e-05, + "loss": 1.0179, + "step": 148500 + }, + { + "epoch": 0.9487880607694569, + "grad_norm": 1.0680909156799316, + "learning_rate": 5.404634733531683e-05, + "loss": 1.0718, + "step": 148510 + }, + { + "epoch": 0.9488519479191956, + "grad_norm": 0.8766992092132568, + "learning_rate": 5.404134608777154e-05, + "loss": 0.8991, + "step": 148520 + }, + { + "epoch": 0.9489158350689343, + "grad_norm": 0.7806427478790283, + "learning_rate": 5.4036344799526396e-05, + "loss": 1.0212, + "step": 148530 + }, + { + "epoch": 0.9489797222186729, + "grad_norm": 0.9401262998580933, + "learning_rate": 5.4031343470631756e-05, + "loss": 0.8057, + "step": 148540 + }, + { + "epoch": 0.9490436093684116, + "grad_norm": 0.99031001329422, + "learning_rate": 5.402634210113801e-05, + "loss": 0.8273, + "step": 148550 + }, + { + "epoch": 0.9491074965181503, + "grad_norm": 0.5672215223312378, + "learning_rate": 5.402134069109551e-05, + "loss": 1.0363, + "step": 148560 + }, + { + "epoch": 0.949171383667889, + "grad_norm": 0.7566940784454346, + "learning_rate": 5.401633924055464e-05, + "loss": 0.8976, + "step": 148570 + }, + { + "epoch": 0.9492352708176277, + "grad_norm": 1.022157907485962, + "learning_rate": 5.401133774956576e-05, + "loss": 0.9522, + "step": 148580 + }, + { + "epoch": 0.9492991579673664, + "grad_norm": 0.8552069664001465, + "learning_rate": 5.400633621817923e-05, + 
"loss": 0.7844, + "step": 148590 + }, + { + "epoch": 0.9493630451171051, + "grad_norm": 1.0440733432769775, + "learning_rate": 5.4001334646445436e-05, + "loss": 0.6886, + "step": 148600 + }, + { + "epoch": 0.9494269322668438, + "grad_norm": 1.094836711883545, + "learning_rate": 5.399633303441474e-05, + "loss": 1.0184, + "step": 148610 + }, + { + "epoch": 0.9494908194165825, + "grad_norm": 0.8054597973823547, + "learning_rate": 5.399133138213751e-05, + "loss": 0.9208, + "step": 148620 + }, + { + "epoch": 0.9495547065663212, + "grad_norm": 0.6301440000534058, + "learning_rate": 5.398632968966412e-05, + "loss": 0.8177, + "step": 148630 + }, + { + "epoch": 0.9496185937160599, + "grad_norm": 1.095146894454956, + "learning_rate": 5.398182813211199e-05, + "loss": 1.1332, + "step": 148640 + }, + { + "epoch": 0.9496824808657987, + "grad_norm": 1.182285189628601, + "learning_rate": 5.3976826363404665e-05, + "loss": 0.8807, + "step": 148650 + }, + { + "epoch": 0.9497463680155374, + "grad_norm": 1.0031092166900635, + "learning_rate": 5.397182455464725e-05, + "loss": 1.1409, + "step": 148660 + }, + { + "epoch": 0.9498102551652761, + "grad_norm": 0.9576486945152283, + "learning_rate": 5.396682270589015e-05, + "loss": 0.9945, + "step": 148670 + }, + { + "epoch": 0.9498741423150148, + "grad_norm": 1.1300849914550781, + "learning_rate": 5.396182081718369e-05, + "loss": 0.9599, + "step": 148680 + }, + { + "epoch": 0.9499380294647535, + "grad_norm": 1.3322839736938477, + "learning_rate": 5.395681888857829e-05, + "loss": 0.8257, + "step": 148690 + }, + { + "epoch": 0.9500019166144922, + "grad_norm": 1.616481065750122, + "learning_rate": 5.3951816920124285e-05, + "loss": 0.9125, + "step": 148700 + }, + { + "epoch": 0.9500658037642309, + "grad_norm": 0.7100057601928711, + "learning_rate": 5.394681491187207e-05, + "loss": 1.3379, + "step": 148710 + }, + { + "epoch": 0.9501296909139696, + "grad_norm": 0.8348981738090515, + "learning_rate": 5.394181286387202e-05, + "loss": 0.9037, + 
"step": 148720 + }, + { + "epoch": 0.9501935780637083, + "grad_norm": 1.0880135297775269, + "learning_rate": 5.3936810776174497e-05, + "loss": 0.8901, + "step": 148730 + }, + { + "epoch": 0.950257465213447, + "grad_norm": 1.474798560142517, + "learning_rate": 5.3931808648829887e-05, + "loss": 1.0315, + "step": 148740 + }, + { + "epoch": 0.9503213523631857, + "grad_norm": 1.2487114667892456, + "learning_rate": 5.392680648188856e-05, + "loss": 0.9557, + "step": 148750 + }, + { + "epoch": 0.9503852395129244, + "grad_norm": 0.5806934833526611, + "learning_rate": 5.392180427540089e-05, + "loss": 0.8193, + "step": 148760 + }, + { + "epoch": 0.9504491266626631, + "grad_norm": 0.8657655715942383, + "learning_rate": 5.391680202941727e-05, + "loss": 0.9947, + "step": 148770 + }, + { + "epoch": 0.9505130138124017, + "grad_norm": 1.0239652395248413, + "learning_rate": 5.3911799743988054e-05, + "loss": 0.9196, + "step": 148780 + }, + { + "epoch": 0.9505769009621404, + "grad_norm": 0.862191379070282, + "learning_rate": 5.390679741916365e-05, + "loss": 1.0819, + "step": 148790 + }, + { + "epoch": 0.9506407881118791, + "grad_norm": 0.7813977599143982, + "learning_rate": 5.39017950549944e-05, + "loss": 0.8585, + "step": 148800 + }, + { + "epoch": 0.9507046752616178, + "grad_norm": 0.829288125038147, + "learning_rate": 5.389679265153069e-05, + "loss": 0.9753, + "step": 148810 + }, + { + "epoch": 0.9507685624113565, + "grad_norm": 1.4242595434188843, + "learning_rate": 5.389179020882291e-05, + "loss": 0.5942, + "step": 148820 + }, + { + "epoch": 0.9508324495610952, + "grad_norm": 0.8533827066421509, + "learning_rate": 5.388678772692144e-05, + "loss": 1.042, + "step": 148830 + }, + { + "epoch": 0.950896336710834, + "grad_norm": 0.6416082978248596, + "learning_rate": 5.388178520587666e-05, + "loss": 1.0584, + "step": 148840 + }, + { + "epoch": 0.9509602238605727, + "grad_norm": 1.0949831008911133, + "learning_rate": 5.387678264573893e-05, + "loss": 0.8983, + "step": 148850 + }, + { + 
"epoch": 0.9510241110103114, + "grad_norm": 1.0142227411270142, + "learning_rate": 5.3871780046558664e-05, + "loss": 0.7403, + "step": 148860 + }, + { + "epoch": 0.9510879981600501, + "grad_norm": 0.9484434127807617, + "learning_rate": 5.3866777408386217e-05, + "loss": 0.9695, + "step": 148870 + }, + { + "epoch": 0.9511518853097888, + "grad_norm": 0.765370786190033, + "learning_rate": 5.386177473127197e-05, + "loss": 0.7995, + "step": 148880 + }, + { + "epoch": 0.9512157724595275, + "grad_norm": 0.7197050452232361, + "learning_rate": 5.385677201526631e-05, + "loss": 0.8397, + "step": 148890 + }, + { + "epoch": 0.9512796596092662, + "grad_norm": 0.9084450602531433, + "learning_rate": 5.385176926041963e-05, + "loss": 1.113, + "step": 148900 + }, + { + "epoch": 0.9513435467590049, + "grad_norm": 1.0009987354278564, + "learning_rate": 5.3846766466782294e-05, + "loss": 0.9134, + "step": 148910 + }, + { + "epoch": 0.9514074339087436, + "grad_norm": 0.9077139496803284, + "learning_rate": 5.3841763634404695e-05, + "loss": 1.0847, + "step": 148920 + }, + { + "epoch": 0.9514713210584823, + "grad_norm": 0.6592074632644653, + "learning_rate": 5.383676076333721e-05, + "loss": 0.7027, + "step": 148930 + }, + { + "epoch": 0.951535208208221, + "grad_norm": 0.9967330694198608, + "learning_rate": 5.383175785363023e-05, + "loss": 0.8229, + "step": 148940 + }, + { + "epoch": 0.9515990953579597, + "grad_norm": 0.9206305146217346, + "learning_rate": 5.382675490533413e-05, + "loss": 0.8789, + "step": 148950 + }, + { + "epoch": 0.9516629825076984, + "grad_norm": 2.410933017730713, + "learning_rate": 5.38217519184993e-05, + "loss": 0.8032, + "step": 148960 + }, + { + "epoch": 0.9517268696574371, + "grad_norm": 1.0102331638336182, + "learning_rate": 5.381674889317612e-05, + "loss": 1.1499, + "step": 148970 + }, + { + "epoch": 0.9517907568071758, + "grad_norm": 1.321588397026062, + "learning_rate": 5.3811745829414975e-05, + "loss": 0.9381, + "step": 148980 + }, + { + "epoch": 
0.9518546439569145, + "grad_norm": 0.7851718664169312, + "learning_rate": 5.3806742727266245e-05, + "loss": 0.807, + "step": 148990 + }, + { + "epoch": 0.9519185311066533, + "grad_norm": 0.8672306537628174, + "learning_rate": 5.380173958678033e-05, + "loss": 0.8848, + "step": 149000 + }, + { + "epoch": 0.951982418256392, + "grad_norm": 1.5692130327224731, + "learning_rate": 5.379673640800761e-05, + "loss": 0.6902, + "step": 149010 + }, + { + "epoch": 0.9520463054061307, + "grad_norm": 0.9752570986747742, + "learning_rate": 5.379173319099845e-05, + "loss": 0.9114, + "step": 149020 + }, + { + "epoch": 0.9521101925558693, + "grad_norm": 0.89276123046875, + "learning_rate": 5.378672993580329e-05, + "loss": 0.7358, + "step": 149030 + }, + { + "epoch": 0.952174079705608, + "grad_norm": 0.9652552604675293, + "learning_rate": 5.378172664247246e-05, + "loss": 0.8794, + "step": 149040 + }, + { + "epoch": 0.9522379668553467, + "grad_norm": 0.7110236287117004, + "learning_rate": 5.377672331105639e-05, + "loss": 0.9261, + "step": 149050 + }, + { + "epoch": 0.9523018540050854, + "grad_norm": 0.5412469506263733, + "learning_rate": 5.3771719941605434e-05, + "loss": 0.8893, + "step": 149060 + }, + { + "epoch": 0.9523657411548241, + "grad_norm": 0.5765590071678162, + "learning_rate": 5.3766716534170004e-05, + "loss": 0.7548, + "step": 149070 + }, + { + "epoch": 0.9524296283045628, + "grad_norm": 0.9300665855407715, + "learning_rate": 5.376171308880047e-05, + "loss": 0.982, + "step": 149080 + }, + { + "epoch": 0.9524935154543015, + "grad_norm": 0.7088594436645508, + "learning_rate": 5.375670960554724e-05, + "loss": 0.7038, + "step": 149090 + }, + { + "epoch": 0.9525574026040402, + "grad_norm": 0.7242599129676819, + "learning_rate": 5.375170608446068e-05, + "loss": 0.8099, + "step": 149100 + }, + { + "epoch": 0.9526212897537789, + "grad_norm": 0.8361330628395081, + "learning_rate": 5.3746702525591205e-05, + "loss": 1.1563, + "step": 149110 + }, + { + "epoch": 0.9526851769035176, + 
"grad_norm": 1.0023174285888672, + "learning_rate": 5.3741698928989194e-05, + "loss": 0.8939, + "step": 149120 + }, + { + "epoch": 0.9527490640532563, + "grad_norm": 0.6952134966850281, + "learning_rate": 5.373669529470504e-05, + "loss": 0.9099, + "step": 149130 + }, + { + "epoch": 0.952812951202995, + "grad_norm": 0.7857193946838379, + "learning_rate": 5.373169162278913e-05, + "loss": 0.8369, + "step": 149140 + }, + { + "epoch": 0.9528768383527337, + "grad_norm": 0.8915547728538513, + "learning_rate": 5.372668791329185e-05, + "loss": 1.0329, + "step": 149150 + }, + { + "epoch": 0.9529407255024724, + "grad_norm": 0.9852863550186157, + "learning_rate": 5.372168416626361e-05, + "loss": 1.06, + "step": 149160 + }, + { + "epoch": 0.9530046126522111, + "grad_norm": 0.7365124225616455, + "learning_rate": 5.371668038175478e-05, + "loss": 0.7519, + "step": 149170 + }, + { + "epoch": 0.9530684998019499, + "grad_norm": 0.8659708499908447, + "learning_rate": 5.371167655981576e-05, + "loss": 0.8632, + "step": 149180 + }, + { + "epoch": 0.9531323869516886, + "grad_norm": 0.8980143070220947, + "learning_rate": 5.3706672700496954e-05, + "loss": 0.7586, + "step": 149190 + }, + { + "epoch": 0.9531962741014273, + "grad_norm": 0.9168791174888611, + "learning_rate": 5.370166880384875e-05, + "loss": 0.9606, + "step": 149200 + }, + { + "epoch": 0.953260161251166, + "grad_norm": 3.515974760055542, + "learning_rate": 5.369666486992153e-05, + "loss": 0.9327, + "step": 149210 + }, + { + "epoch": 0.9533240484009047, + "grad_norm": 0.5968050360679626, + "learning_rate": 5.3691660898765705e-05, + "loss": 0.7153, + "step": 149220 + }, + { + "epoch": 0.9533879355506434, + "grad_norm": 0.4954209625720978, + "learning_rate": 5.3686656890431665e-05, + "loss": 0.7143, + "step": 149230 + }, + { + "epoch": 0.9534518227003821, + "grad_norm": 0.938412070274353, + "learning_rate": 5.3681652844969785e-05, + "loss": 0.7915, + "step": 149240 + }, + { + "epoch": 0.9535157098501208, + "grad_norm": 
0.7088356614112854, + "learning_rate": 5.3676648762430495e-05, + "loss": 0.8184, + "step": 149250 + }, + { + "epoch": 0.9535795969998595, + "grad_norm": 1.936963677406311, + "learning_rate": 5.367164464286416e-05, + "loss": 0.7562, + "step": 149260 + }, + { + "epoch": 0.9536434841495981, + "grad_norm": 0.9665189385414124, + "learning_rate": 5.366664048632118e-05, + "loss": 0.9453, + "step": 149270 + }, + { + "epoch": 0.9537073712993368, + "grad_norm": 1.063223123550415, + "learning_rate": 5.366163629285198e-05, + "loss": 0.8095, + "step": 149280 + }, + { + "epoch": 0.9537712584490755, + "grad_norm": 1.0628734827041626, + "learning_rate": 5.365663206250693e-05, + "loss": 1.1281, + "step": 149290 + }, + { + "epoch": 0.9538351455988142, + "grad_norm": 0.9469806551933289, + "learning_rate": 5.365162779533641e-05, + "loss": 1.0392, + "step": 149300 + }, + { + "epoch": 0.9538990327485529, + "grad_norm": 0.8327413201332092, + "learning_rate": 5.3646623491390855e-05, + "loss": 0.5963, + "step": 149310 + }, + { + "epoch": 0.9539629198982916, + "grad_norm": 0.7886581420898438, + "learning_rate": 5.3641619150720646e-05, + "loss": 0.8496, + "step": 149320 + }, + { + "epoch": 0.9540268070480303, + "grad_norm": 0.7735728621482849, + "learning_rate": 5.363661477337618e-05, + "loss": 0.7483, + "step": 149330 + }, + { + "epoch": 0.954090694197769, + "grad_norm": 0.7688968777656555, + "learning_rate": 5.363161035940785e-05, + "loss": 0.9939, + "step": 149340 + }, + { + "epoch": 0.9541545813475077, + "grad_norm": 1.1434403657913208, + "learning_rate": 5.362660590886607e-05, + "loss": 1.1379, + "step": 149350 + }, + { + "epoch": 0.9542184684972465, + "grad_norm": 1.1655575037002563, + "learning_rate": 5.362160142180123e-05, + "loss": 0.9408, + "step": 149360 + }, + { + "epoch": 0.9542823556469852, + "grad_norm": 0.7211439609527588, + "learning_rate": 5.361659689826373e-05, + "loss": 0.6937, + "step": 149370 + }, + { + "epoch": 0.9543462427967239, + "grad_norm": 0.9510606527328491, + 
"learning_rate": 5.361159233830396e-05, + "loss": 1.044, + "step": 149380 + }, + { + "epoch": 0.9544101299464626, + "grad_norm": 0.7282498478889465, + "learning_rate": 5.360658774197235e-05, + "loss": 0.7549, + "step": 149390 + }, + { + "epoch": 0.9544740170962013, + "grad_norm": 1.0271804332733154, + "learning_rate": 5.3601583109319264e-05, + "loss": 1.0737, + "step": 149400 + }, + { + "epoch": 0.95453790424594, + "grad_norm": 0.7570605278015137, + "learning_rate": 5.359657844039514e-05, + "loss": 0.6157, + "step": 149410 + }, + { + "epoch": 0.9546017913956787, + "grad_norm": 0.8889819979667664, + "learning_rate": 5.3591573735250344e-05, + "loss": 1.0411, + "step": 149420 + }, + { + "epoch": 0.9546656785454174, + "grad_norm": 1.0955427885055542, + "learning_rate": 5.358656899393529e-05, + "loss": 1.0119, + "step": 149430 + }, + { + "epoch": 0.9547295656951561, + "grad_norm": 0.8745180368423462, + "learning_rate": 5.35815642165004e-05, + "loss": 0.9069, + "step": 149440 + }, + { + "epoch": 0.9547934528448948, + "grad_norm": 0.9385198354721069, + "learning_rate": 5.357655940299605e-05, + "loss": 0.8872, + "step": 149450 + }, + { + "epoch": 0.9548573399946335, + "grad_norm": 0.8840739130973816, + "learning_rate": 5.357155455347265e-05, + "loss": 0.6832, + "step": 149460 + }, + { + "epoch": 0.9549212271443722, + "grad_norm": 0.6228633522987366, + "learning_rate": 5.3566549667980614e-05, + "loss": 0.927, + "step": 149470 + }, + { + "epoch": 0.9549851142941109, + "grad_norm": 1.233453631401062, + "learning_rate": 5.356154474657033e-05, + "loss": 1.0322, + "step": 149480 + }, + { + "epoch": 0.9550490014438496, + "grad_norm": 1.1719436645507812, + "learning_rate": 5.355653978929222e-05, + "loss": 0.9269, + "step": 149490 + }, + { + "epoch": 0.9551128885935883, + "grad_norm": 0.7223755717277527, + "learning_rate": 5.3551534796196656e-05, + "loss": 0.8008, + "step": 149500 + }, + { + "epoch": 0.9551767757433269, + "grad_norm": 0.946700930595398, + "learning_rate": 
5.3546529767334085e-05, + "loss": 0.7767, + "step": 149510 + }, + { + "epoch": 0.9552406628930656, + "grad_norm": 1.2065627574920654, + "learning_rate": 5.3541524702754886e-05, + "loss": 1.0349, + "step": 149520 + }, + { + "epoch": 0.9553045500428043, + "grad_norm": 1.362046718597412, + "learning_rate": 5.353651960250946e-05, + "loss": 0.6447, + "step": 149530 + }, + { + "epoch": 0.955368437192543, + "grad_norm": 1.2870509624481201, + "learning_rate": 5.353151446664824e-05, + "loss": 0.7011, + "step": 149540 + }, + { + "epoch": 0.9554323243422818, + "grad_norm": 1.1624693870544434, + "learning_rate": 5.352650929522159e-05, + "loss": 1.0363, + "step": 149550 + }, + { + "epoch": 0.9554962114920205, + "grad_norm": 1.077800989151001, + "learning_rate": 5.352150408827996e-05, + "loss": 0.9021, + "step": 149560 + }, + { + "epoch": 0.9555600986417592, + "grad_norm": 1.4235363006591797, + "learning_rate": 5.351649884587373e-05, + "loss": 0.9034, + "step": 149570 + }, + { + "epoch": 0.9556239857914979, + "grad_norm": 0.7878531217575073, + "learning_rate": 5.351149356805332e-05, + "loss": 1.0202, + "step": 149580 + }, + { + "epoch": 0.9556878729412366, + "grad_norm": 0.9151926636695862, + "learning_rate": 5.3506488254869124e-05, + "loss": 0.7615, + "step": 149590 + }, + { + "epoch": 0.9557517600909753, + "grad_norm": 0.8599022626876831, + "learning_rate": 5.3501482906371556e-05, + "loss": 0.9448, + "step": 149600 + }, + { + "epoch": 0.955815647240714, + "grad_norm": 0.8533304929733276, + "learning_rate": 5.349647752261103e-05, + "loss": 1.0606, + "step": 149610 + }, + { + "epoch": 0.9558795343904527, + "grad_norm": 1.0226484537124634, + "learning_rate": 5.3491472103637955e-05, + "loss": 0.9859, + "step": 149620 + }, + { + "epoch": 0.9559434215401914, + "grad_norm": 0.7206386923789978, + "learning_rate": 5.3486466649502733e-05, + "loss": 0.9154, + "step": 149630 + }, + { + "epoch": 0.9560073086899301, + "grad_norm": 1.4392954111099243, + "learning_rate": 
5.3481461160255773e-05, + "loss": 0.74, + "step": 149640 + }, + { + "epoch": 0.9560711958396688, + "grad_norm": 1.0331584215164185, + "learning_rate": 5.3476455635947484e-05, + "loss": 0.8146, + "step": 149650 + }, + { + "epoch": 0.9561350829894075, + "grad_norm": 0.8725119829177856, + "learning_rate": 5.3471450076628294e-05, + "loss": 0.8874, + "step": 149660 + }, + { + "epoch": 0.9561989701391462, + "grad_norm": 0.8222038149833679, + "learning_rate": 5.346644448234859e-05, + "loss": 0.8343, + "step": 149670 + }, + { + "epoch": 0.9562628572888849, + "grad_norm": 1.1900924444198608, + "learning_rate": 5.3461438853158784e-05, + "loss": 0.8374, + "step": 149680 + }, + { + "epoch": 0.9563267444386236, + "grad_norm": 0.8432425260543823, + "learning_rate": 5.34564331891093e-05, + "loss": 1.0343, + "step": 149690 + }, + { + "epoch": 0.9563906315883623, + "grad_norm": 1.0364454984664917, + "learning_rate": 5.3451427490250535e-05, + "loss": 0.9026, + "step": 149700 + }, + { + "epoch": 0.9564545187381011, + "grad_norm": 0.8370749950408936, + "learning_rate": 5.344642175663292e-05, + "loss": 0.8584, + "step": 149710 + }, + { + "epoch": 0.9565184058878398, + "grad_norm": 0.720355212688446, + "learning_rate": 5.3441415988306856e-05, + "loss": 0.7958, + "step": 149720 + }, + { + "epoch": 0.9565822930375785, + "grad_norm": 0.9760454297065735, + "learning_rate": 5.343641018532275e-05, + "loss": 0.653, + "step": 149730 + }, + { + "epoch": 0.9566461801873172, + "grad_norm": 0.9241121411323547, + "learning_rate": 5.3431404347731015e-05, + "loss": 0.9755, + "step": 149740 + }, + { + "epoch": 0.9567100673370558, + "grad_norm": 1.0608195066452026, + "learning_rate": 5.3426398475582086e-05, + "loss": 0.9359, + "step": 149750 + }, + { + "epoch": 0.9567739544867945, + "grad_norm": 0.5845559239387512, + "learning_rate": 5.3421392568926363e-05, + "loss": 0.5841, + "step": 149760 + }, + { + "epoch": 0.9568378416365332, + "grad_norm": 1.0789985656738281, + "learning_rate": 
5.341638662781424e-05, + "loss": 0.8384, + "step": 149770 + }, + { + "epoch": 0.9569017287862719, + "grad_norm": 1.286792278289795, + "learning_rate": 5.341138065229616e-05, + "loss": 0.7882, + "step": 149780 + }, + { + "epoch": 0.9569656159360106, + "grad_norm": 0.814147412776947, + "learning_rate": 5.3406374642422516e-05, + "loss": 0.8117, + "step": 149790 + }, + { + "epoch": 0.9570295030857493, + "grad_norm": 0.7748686671257019, + "learning_rate": 5.340136859824374e-05, + "loss": 0.7822, + "step": 149800 + }, + { + "epoch": 0.957093390235488, + "grad_norm": 0.9128410220146179, + "learning_rate": 5.339636251981024e-05, + "loss": 0.8351, + "step": 149810 + }, + { + "epoch": 0.9571572773852267, + "grad_norm": 1.1304187774658203, + "learning_rate": 5.339135640717242e-05, + "loss": 0.66, + "step": 149820 + }, + { + "epoch": 0.9572211645349654, + "grad_norm": 3.093017816543579, + "learning_rate": 5.3386350260380724e-05, + "loss": 0.9673, + "step": 149830 + }, + { + "epoch": 0.9572850516847041, + "grad_norm": 0.8403214812278748, + "learning_rate": 5.338134407948554e-05, + "loss": 0.8767, + "step": 149840 + }, + { + "epoch": 0.9573489388344428, + "grad_norm": 0.7375771403312683, + "learning_rate": 5.33763378645373e-05, + "loss": 1.0156, + "step": 149850 + }, + { + "epoch": 0.9574128259841815, + "grad_norm": 1.0833170413970947, + "learning_rate": 5.3371331615586405e-05, + "loss": 0.7993, + "step": 149860 + }, + { + "epoch": 0.9574767131339202, + "grad_norm": 0.7025789618492126, + "learning_rate": 5.336632533268329e-05, + "loss": 0.8754, + "step": 149870 + }, + { + "epoch": 0.957540600283659, + "grad_norm": 1.240759253501892, + "learning_rate": 5.336131901587836e-05, + "loss": 1.0206, + "step": 149880 + }, + { + "epoch": 0.9576044874333977, + "grad_norm": 0.9655282497406006, + "learning_rate": 5.335631266522205e-05, + "loss": 0.8602, + "step": 149890 + }, + { + "epoch": 0.9576683745831364, + "grad_norm": 0.8315982818603516, + "learning_rate": 5.335130628076478e-05, + 
"loss": 0.6983, + "step": 149900 + }, + { + "epoch": 0.9577322617328751, + "grad_norm": 0.5027674436569214, + "learning_rate": 5.334629986255694e-05, + "loss": 0.8194, + "step": 149910 + }, + { + "epoch": 0.9577961488826138, + "grad_norm": 1.5360819101333618, + "learning_rate": 5.3341293410648964e-05, + "loss": 0.8028, + "step": 149920 + }, + { + "epoch": 0.9578600360323525, + "grad_norm": 0.6608107686042786, + "learning_rate": 5.333628692509128e-05, + "loss": 0.8155, + "step": 149930 + }, + { + "epoch": 0.9579239231820912, + "grad_norm": 0.8952988982200623, + "learning_rate": 5.33312804059343e-05, + "loss": 0.7765, + "step": 149940 + }, + { + "epoch": 0.9579878103318299, + "grad_norm": 0.556867241859436, + "learning_rate": 5.3326273853228435e-05, + "loss": 0.9709, + "step": 149950 + }, + { + "epoch": 0.9580516974815686, + "grad_norm": 1.2768900394439697, + "learning_rate": 5.332126726702413e-05, + "loss": 0.8193, + "step": 149960 + }, + { + "epoch": 0.9581155846313073, + "grad_norm": 1.526533603668213, + "learning_rate": 5.3316260647371785e-05, + "loss": 0.84, + "step": 149970 + }, + { + "epoch": 0.958179471781046, + "grad_norm": 0.7335947155952454, + "learning_rate": 5.3311253994321816e-05, + "loss": 0.7598, + "step": 149980 + }, + { + "epoch": 0.9582433589307847, + "grad_norm": 0.5931270718574524, + "learning_rate": 5.3306247307924676e-05, + "loss": 0.9405, + "step": 149990 + }, + { + "epoch": 0.9583072460805233, + "grad_norm": 0.8060052394866943, + "learning_rate": 5.330124058823074e-05, + "loss": 0.8914, + "step": 150000 + }, + { + "epoch": 0.958371133230262, + "grad_norm": 1.0445197820663452, + "learning_rate": 5.3296233835290485e-05, + "loss": 1.0033, + "step": 150010 + }, + { + "epoch": 0.9584350203800007, + "grad_norm": 1.0918605327606201, + "learning_rate": 5.329122704915428e-05, + "loss": 0.914, + "step": 150020 + }, + { + "epoch": 0.9584989075297394, + "grad_norm": 1.1090441942214966, + "learning_rate": 5.328622022987257e-05, + "loss": 1.0159, + "step": 
150030 + }, + { + "epoch": 0.9585627946794781, + "grad_norm": 0.9032556414604187, + "learning_rate": 5.328121337749579e-05, + "loss": 0.7714, + "step": 150040 + }, + { + "epoch": 0.9586266818292168, + "grad_norm": 0.678882896900177, + "learning_rate": 5.3276206492074344e-05, + "loss": 0.8174, + "step": 150050 + }, + { + "epoch": 0.9586905689789555, + "grad_norm": 1.054736852645874, + "learning_rate": 5.327119957365867e-05, + "loss": 0.7512, + "step": 150060 + }, + { + "epoch": 0.9587544561286943, + "grad_norm": 0.6916370987892151, + "learning_rate": 5.326619262229918e-05, + "loss": 1.21, + "step": 150070 + }, + { + "epoch": 0.958818343278433, + "grad_norm": 1.1509549617767334, + "learning_rate": 5.326118563804632e-05, + "loss": 0.9652, + "step": 150080 + }, + { + "epoch": 0.9588822304281717, + "grad_norm": 0.5059418082237244, + "learning_rate": 5.325617862095049e-05, + "loss": 0.8353, + "step": 150090 + }, + { + "epoch": 0.9589461175779104, + "grad_norm": 1.0998530387878418, + "learning_rate": 5.325117157106212e-05, + "loss": 1.0926, + "step": 150100 + }, + { + "epoch": 0.9590100047276491, + "grad_norm": 0.799642026424408, + "learning_rate": 5.324616448843165e-05, + "loss": 0.8287, + "step": 150110 + }, + { + "epoch": 0.9590738918773878, + "grad_norm": 0.6795291304588318, + "learning_rate": 5.3241157373109485e-05, + "loss": 0.8267, + "step": 150120 + }, + { + "epoch": 0.9591377790271265, + "grad_norm": 0.9579530954360962, + "learning_rate": 5.323615022514607e-05, + "loss": 0.8384, + "step": 150130 + }, + { + "epoch": 0.9592016661768652, + "grad_norm": 1.0019688606262207, + "learning_rate": 5.3231143044591816e-05, + "loss": 0.7948, + "step": 150140 + }, + { + "epoch": 0.9592655533266039, + "grad_norm": 1.0456329584121704, + "learning_rate": 5.322613583149715e-05, + "loss": 1.0149, + "step": 150150 + }, + { + "epoch": 0.9593294404763426, + "grad_norm": 1.1081980466842651, + "learning_rate": 5.322112858591252e-05, + "loss": 1.0692, + "step": 150160 + }, + { + "epoch": 
0.9593933276260813, + "grad_norm": 1.2669342756271362, + "learning_rate": 5.3216121307888336e-05, + "loss": 0.8101, + "step": 150170 + }, + { + "epoch": 0.95945721477582, + "grad_norm": 1.001755714416504, + "learning_rate": 5.3211113997475016e-05, + "loss": 0.6696, + "step": 150180 + }, + { + "epoch": 0.9595211019255587, + "grad_norm": 0.6659078001976013, + "learning_rate": 5.320610665472301e-05, + "loss": 0.713, + "step": 150190 + }, + { + "epoch": 0.9595849890752974, + "grad_norm": 0.7698034048080444, + "learning_rate": 5.320109927968273e-05, + "loss": 0.8432, + "step": 150200 + }, + { + "epoch": 0.9596488762250361, + "grad_norm": 0.7013608813285828, + "learning_rate": 5.319609187240462e-05, + "loss": 1.1162, + "step": 150210 + }, + { + "epoch": 0.9597127633747748, + "grad_norm": 0.7664918899536133, + "learning_rate": 5.319108443293909e-05, + "loss": 0.9773, + "step": 150220 + }, + { + "epoch": 0.9597766505245136, + "grad_norm": 1.7179349660873413, + "learning_rate": 5.3186076961336584e-05, + "loss": 0.8648, + "step": 150230 + }, + { + "epoch": 0.9598405376742521, + "grad_norm": 0.5642333030700684, + "learning_rate": 5.318106945764752e-05, + "loss": 0.8806, + "step": 150240 + }, + { + "epoch": 0.9599044248239909, + "grad_norm": 1.004900336265564, + "learning_rate": 5.317606192192235e-05, + "loss": 0.8323, + "step": 150250 + }, + { + "epoch": 0.9599683119737296, + "grad_norm": 0.6977973580360413, + "learning_rate": 5.317105435421148e-05, + "loss": 1.0794, + "step": 150260 + }, + { + "epoch": 0.9600321991234683, + "grad_norm": 0.8252426385879517, + "learning_rate": 5.316604675456535e-05, + "loss": 0.7651, + "step": 150270 + }, + { + "epoch": 0.960096086273207, + "grad_norm": 1.0018861293792725, + "learning_rate": 5.316103912303438e-05, + "loss": 0.9188, + "step": 150280 + }, + { + "epoch": 0.9601599734229457, + "grad_norm": 0.829059362411499, + "learning_rate": 5.3156031459669035e-05, + "loss": 0.756, + "step": 150290 + }, + { + "epoch": 0.9602238605726844, + 
"grad_norm": 1.1105847358703613, + "learning_rate": 5.31510237645197e-05, + "loss": 0.7177, + "step": 150300 + }, + { + "epoch": 0.9602877477224231, + "grad_norm": 0.7792565822601318, + "learning_rate": 5.314601603763684e-05, + "loss": 1.1403, + "step": 150310 + }, + { + "epoch": 0.9603516348721618, + "grad_norm": 0.5869541764259338, + "learning_rate": 5.314100827907087e-05, + "loss": 0.8808, + "step": 150320 + }, + { + "epoch": 0.9604155220219005, + "grad_norm": 0.8240609169006348, + "learning_rate": 5.313600048887224e-05, + "loss": 0.8968, + "step": 150330 + }, + { + "epoch": 0.9604794091716392, + "grad_norm": 0.8057375550270081, + "learning_rate": 5.313099266709136e-05, + "loss": 0.856, + "step": 150340 + }, + { + "epoch": 0.9605432963213779, + "grad_norm": 0.7334375977516174, + "learning_rate": 5.312598481377869e-05, + "loss": 1.004, + "step": 150350 + }, + { + "epoch": 0.9606071834711166, + "grad_norm": 0.95655757188797, + "learning_rate": 5.3120976928984635e-05, + "loss": 0.9142, + "step": 150360 + }, + { + "epoch": 0.9606710706208553, + "grad_norm": 0.6239562630653381, + "learning_rate": 5.311596901275965e-05, + "loss": 0.9041, + "step": 150370 + }, + { + "epoch": 0.960734957770594, + "grad_norm": 2.0877952575683594, + "learning_rate": 5.3110961065154154e-05, + "loss": 1.0505, + "step": 150380 + }, + { + "epoch": 0.9607988449203327, + "grad_norm": 1.8619037866592407, + "learning_rate": 5.31059530862186e-05, + "loss": 0.8509, + "step": 150390 + }, + { + "epoch": 0.9608627320700714, + "grad_norm": 1.0404013395309448, + "learning_rate": 5.310094507600338e-05, + "loss": 0.9482, + "step": 150400 + }, + { + "epoch": 0.9609266192198102, + "grad_norm": 0.5970574617385864, + "learning_rate": 5.3095937034558994e-05, + "loss": 1.0259, + "step": 150410 + }, + { + "epoch": 0.9609905063695489, + "grad_norm": 0.9825915098190308, + "learning_rate": 5.309092896193584e-05, + "loss": 1.0407, + "step": 150420 + }, + { + "epoch": 0.9610543935192876, + "grad_norm": 
0.7270873188972473, + "learning_rate": 5.308592085818435e-05, + "loss": 0.7591, + "step": 150430 + }, + { + "epoch": 0.9611182806690263, + "grad_norm": 1.053653597831726, + "learning_rate": 5.308091272335497e-05, + "loss": 0.8987, + "step": 150440 + }, + { + "epoch": 0.961182167818765, + "grad_norm": 0.8554880023002625, + "learning_rate": 5.307590455749812e-05, + "loss": 0.7091, + "step": 150450 + }, + { + "epoch": 0.9612460549685037, + "grad_norm": 0.5443610548973083, + "learning_rate": 5.307089636066427e-05, + "loss": 0.9389, + "step": 150460 + }, + { + "epoch": 0.9613099421182424, + "grad_norm": 0.6648747324943542, + "learning_rate": 5.306588813290383e-05, + "loss": 1.0063, + "step": 150470 + }, + { + "epoch": 0.961373829267981, + "grad_norm": 1.133631944656372, + "learning_rate": 5.306087987426725e-05, + "loss": 0.8192, + "step": 150480 + }, + { + "epoch": 0.9614377164177197, + "grad_norm": 1.77051842212677, + "learning_rate": 5.305587158480496e-05, + "loss": 0.7231, + "step": 150490 + }, + { + "epoch": 0.9615016035674584, + "grad_norm": 1.1798468828201294, + "learning_rate": 5.3050863264567396e-05, + "loss": 0.6667, + "step": 150500 + }, + { + "epoch": 0.9615654907171971, + "grad_norm": 1.529371976852417, + "learning_rate": 5.3045854913605e-05, + "loss": 0.6694, + "step": 150510 + }, + { + "epoch": 0.9616293778669358, + "grad_norm": 0.8843643665313721, + "learning_rate": 5.30408465319682e-05, + "loss": 0.9087, + "step": 150520 + }, + { + "epoch": 0.9616932650166745, + "grad_norm": 0.660963773727417, + "learning_rate": 5.303583811970746e-05, + "loss": 0.7277, + "step": 150530 + }, + { + "epoch": 0.9617571521664132, + "grad_norm": 1.6010621786117554, + "learning_rate": 5.3030829676873196e-05, + "loss": 0.89, + "step": 150540 + }, + { + "epoch": 0.9618210393161519, + "grad_norm": 1.0877952575683594, + "learning_rate": 5.3025821203515855e-05, + "loss": 0.7486, + "step": 150550 + }, + { + "epoch": 0.9618849264658906, + "grad_norm": 1.9828286170959473, + 
"learning_rate": 5.302081269968587e-05, + "loss": 0.8163, + "step": 150560 + }, + { + "epoch": 0.9619488136156293, + "grad_norm": 0.7645601630210876, + "learning_rate": 5.301580416543369e-05, + "loss": 0.7069, + "step": 150570 + }, + { + "epoch": 0.962012700765368, + "grad_norm": 0.8405600190162659, + "learning_rate": 5.301079560080976e-05, + "loss": 0.859, + "step": 150580 + }, + { + "epoch": 0.9620765879151068, + "grad_norm": 1.2271150350570679, + "learning_rate": 5.3005787005864515e-05, + "loss": 0.7004, + "step": 150590 + }, + { + "epoch": 0.9621404750648455, + "grad_norm": 1.4471722841262817, + "learning_rate": 5.3000778380648396e-05, + "loss": 0.8751, + "step": 150600 + }, + { + "epoch": 0.9622043622145842, + "grad_norm": 0.9273978471755981, + "learning_rate": 5.299576972521183e-05, + "loss": 0.9883, + "step": 150610 + }, + { + "epoch": 0.9622682493643229, + "grad_norm": 0.882642924785614, + "learning_rate": 5.299076103960528e-05, + "loss": 0.8676, + "step": 150620 + }, + { + "epoch": 0.9623321365140616, + "grad_norm": 3.830977201461792, + "learning_rate": 5.298575232387918e-05, + "loss": 0.8193, + "step": 150630 + }, + { + "epoch": 0.9623960236638003, + "grad_norm": 1.230293869972229, + "learning_rate": 5.298074357808397e-05, + "loss": 0.8701, + "step": 150640 + }, + { + "epoch": 0.962459910813539, + "grad_norm": 1.0408953428268433, + "learning_rate": 5.29757348022701e-05, + "loss": 1.0132, + "step": 150650 + }, + { + "epoch": 0.9625237979632777, + "grad_norm": 1.4850291013717651, + "learning_rate": 5.297072599648799e-05, + "loss": 0.747, + "step": 150660 + }, + { + "epoch": 0.9625876851130164, + "grad_norm": 1.36034095287323, + "learning_rate": 5.296571716078811e-05, + "loss": 0.8511, + "step": 150670 + }, + { + "epoch": 0.9626515722627551, + "grad_norm": 0.6303446292877197, + "learning_rate": 5.29607082952209e-05, + "loss": 0.8755, + "step": 150680 + }, + { + "epoch": 0.9627154594124938, + "grad_norm": 1.7142499685287476, + "learning_rate": 
5.2955699399836776e-05, + "loss": 0.8461, + "step": 150690 + }, + { + "epoch": 0.9627793465622325, + "grad_norm": 0.9409696459770203, + "learning_rate": 5.2950690474686215e-05, + "loss": 0.8414, + "step": 150700 + }, + { + "epoch": 0.9628432337119712, + "grad_norm": 1.2119961977005005, + "learning_rate": 5.2945681519819646e-05, + "loss": 0.7946, + "step": 150710 + }, + { + "epoch": 0.9629071208617099, + "grad_norm": 0.9473667740821838, + "learning_rate": 5.2940672535287516e-05, + "loss": 0.8334, + "step": 150720 + }, + { + "epoch": 0.9629710080114485, + "grad_norm": 0.7992385625839233, + "learning_rate": 5.2935663521140274e-05, + "loss": 0.8921, + "step": 150730 + }, + { + "epoch": 0.9630348951611872, + "grad_norm": 1.3898727893829346, + "learning_rate": 5.293065447742835e-05, + "loss": 0.6968, + "step": 150740 + }, + { + "epoch": 0.9630987823109259, + "grad_norm": 0.7555790543556213, + "learning_rate": 5.292564540420221e-05, + "loss": 0.9248, + "step": 150750 + }, + { + "epoch": 0.9631626694606646, + "grad_norm": 1.3342205286026, + "learning_rate": 5.292063630151228e-05, + "loss": 0.6645, + "step": 150760 + }, + { + "epoch": 0.9632265566104034, + "grad_norm": 1.6979509592056274, + "learning_rate": 5.2915627169409035e-05, + "loss": 1.0116, + "step": 150770 + }, + { + "epoch": 0.9632904437601421, + "grad_norm": 0.8660208582878113, + "learning_rate": 5.291061800794288e-05, + "loss": 0.8999, + "step": 150780 + }, + { + "epoch": 0.9633543309098808, + "grad_norm": 1.2357333898544312, + "learning_rate": 5.29056088171643e-05, + "loss": 0.6997, + "step": 150790 + }, + { + "epoch": 0.9634182180596195, + "grad_norm": 0.9448620676994324, + "learning_rate": 5.290059959712371e-05, + "loss": 0.7693, + "step": 150800 + }, + { + "epoch": 0.9634821052093582, + "grad_norm": 0.7834548354148865, + "learning_rate": 5.289559034787158e-05, + "loss": 0.7257, + "step": 150810 + }, + { + "epoch": 0.9635459923590969, + "grad_norm": 0.8070199489593506, + "learning_rate": 
5.2890581069458355e-05, + "loss": 0.7485, + "step": 150820 + }, + { + "epoch": 0.9636098795088356, + "grad_norm": 1.6272724866867065, + "learning_rate": 5.288557176193447e-05, + "loss": 1.0041, + "step": 150830 + }, + { + "epoch": 0.9636737666585743, + "grad_norm": 0.8953685164451599, + "learning_rate": 5.288056242535039e-05, + "loss": 0.9897, + "step": 150840 + }, + { + "epoch": 0.963737653808313, + "grad_norm": 0.6054562330245972, + "learning_rate": 5.2875553059756545e-05, + "loss": 0.8495, + "step": 150850 + }, + { + "epoch": 0.9638015409580517, + "grad_norm": 1.346152663230896, + "learning_rate": 5.28705436652034e-05, + "loss": 0.9919, + "step": 150860 + }, + { + "epoch": 0.9638654281077904, + "grad_norm": 1.1231837272644043, + "learning_rate": 5.286553424174139e-05, + "loss": 0.8157, + "step": 150870 + }, + { + "epoch": 0.9639293152575291, + "grad_norm": 3.220287561416626, + "learning_rate": 5.286052478942097e-05, + "loss": 0.8534, + "step": 150880 + }, + { + "epoch": 0.9639932024072678, + "grad_norm": 0.7934685945510864, + "learning_rate": 5.28555153082926e-05, + "loss": 0.8603, + "step": 150890 + }, + { + "epoch": 0.9640570895570065, + "grad_norm": 0.9011366963386536, + "learning_rate": 5.2850505798406716e-05, + "loss": 0.846, + "step": 150900 + }, + { + "epoch": 0.9641209767067452, + "grad_norm": 0.7348203659057617, + "learning_rate": 5.2845496259813773e-05, + "loss": 1.0849, + "step": 150910 + }, + { + "epoch": 0.9641848638564839, + "grad_norm": 1.1324224472045898, + "learning_rate": 5.284098765057728e-05, + "loss": 1.0193, + "step": 150920 + }, + { + "epoch": 0.9642487510062226, + "grad_norm": 1.1051008701324463, + "learning_rate": 5.283597805757992e-05, + "loss": 1.0603, + "step": 150930 + }, + { + "epoch": 0.9643126381559614, + "grad_norm": 1.0409125089645386, + "learning_rate": 5.28309684360218e-05, + "loss": 1.0198, + "step": 150940 + }, + { + "epoch": 0.9643765253057001, + "grad_norm": 1.0346943140029907, + "learning_rate": 5.282595878595338e-05, + 
"loss": 0.9523, + "step": 150950 + }, + { + "epoch": 0.9644404124554388, + "grad_norm": 0.9301409125328064, + "learning_rate": 5.282094910742511e-05, + "loss": 0.8966, + "step": 150960 + }, + { + "epoch": 0.9645042996051774, + "grad_norm": 1.0205413103103638, + "learning_rate": 5.281593940048745e-05, + "loss": 0.8136, + "step": 150970 + }, + { + "epoch": 0.9645681867549161, + "grad_norm": 0.4575611650943756, + "learning_rate": 5.2810929665190836e-05, + "loss": 0.9879, + "step": 150980 + }, + { + "epoch": 0.9646320739046548, + "grad_norm": 1.4380875825881958, + "learning_rate": 5.280591990158572e-05, + "loss": 0.8518, + "step": 150990 + }, + { + "epoch": 0.9646959610543935, + "grad_norm": 1.1791918277740479, + "learning_rate": 5.280091010972258e-05, + "loss": 1.0687, + "step": 151000 + }, + { + "epoch": 0.9647598482041322, + "grad_norm": 1.3037655353546143, + "learning_rate": 5.279590028965185e-05, + "loss": 1.2233, + "step": 151010 + }, + { + "epoch": 0.9648237353538709, + "grad_norm": 1.227778673171997, + "learning_rate": 5.2790890441423965e-05, + "loss": 0.9386, + "step": 151020 + }, + { + "epoch": 0.9648876225036096, + "grad_norm": 0.6218796968460083, + "learning_rate": 5.2785880565089416e-05, + "loss": 1.1117, + "step": 151030 + }, + { + "epoch": 0.9649515096533483, + "grad_norm": 1.0044218301773071, + "learning_rate": 5.2780870660698634e-05, + "loss": 0.8787, + "step": 151040 + }, + { + "epoch": 0.965015396803087, + "grad_norm": 0.7901304960250854, + "learning_rate": 5.2775860728302084e-05, + "loss": 0.8695, + "step": 151050 + }, + { + "epoch": 0.9650792839528257, + "grad_norm": 1.453648328781128, + "learning_rate": 5.277085076795021e-05, + "loss": 0.8621, + "step": 151060 + }, + { + "epoch": 0.9651431711025644, + "grad_norm": 0.5584386587142944, + "learning_rate": 5.2765840779693474e-05, + "loss": 0.8665, + "step": 151070 + }, + { + "epoch": 0.9652070582523031, + "grad_norm": 0.5596110224723816, + "learning_rate": 5.2760830763582326e-05, + "loss": 0.8572, + 
"step": 151080 + }, + { + "epoch": 0.9652709454020418, + "grad_norm": 1.8040897846221924, + "learning_rate": 5.275582071966723e-05, + "loss": 0.885, + "step": 151090 + }, + { + "epoch": 0.9653348325517805, + "grad_norm": 0.8948487043380737, + "learning_rate": 5.275081064799864e-05, + "loss": 1.2298, + "step": 151100 + }, + { + "epoch": 0.9653987197015192, + "grad_norm": 0.6678011417388916, + "learning_rate": 5.2745800548626986e-05, + "loss": 0.9738, + "step": 151110 + }, + { + "epoch": 0.965462606851258, + "grad_norm": 0.5756567716598511, + "learning_rate": 5.274079042160278e-05, + "loss": 0.7736, + "step": 151120 + }, + { + "epoch": 0.9655264940009967, + "grad_norm": 2.063096046447754, + "learning_rate": 5.273578026697642e-05, + "loss": 0.7518, + "step": 151130 + }, + { + "epoch": 0.9655903811507354, + "grad_norm": 0.6849361658096313, + "learning_rate": 5.2730770084798384e-05, + "loss": 0.7166, + "step": 151140 + }, + { + "epoch": 0.9656542683004741, + "grad_norm": 1.2403055429458618, + "learning_rate": 5.272575987511914e-05, + "loss": 0.9516, + "step": 151150 + }, + { + "epoch": 0.9657181554502128, + "grad_norm": 1.0753352642059326, + "learning_rate": 5.272074963798913e-05, + "loss": 0.6363, + "step": 151160 + }, + { + "epoch": 0.9657820425999515, + "grad_norm": 0.6305554509162903, + "learning_rate": 5.271573937345882e-05, + "loss": 0.74, + "step": 151170 + }, + { + "epoch": 0.9658459297496902, + "grad_norm": 0.7894411087036133, + "learning_rate": 5.271072908157866e-05, + "loss": 0.7682, + "step": 151180 + }, + { + "epoch": 0.9659098168994289, + "grad_norm": 0.7625752687454224, + "learning_rate": 5.270571876239911e-05, + "loss": 0.7678, + "step": 151190 + }, + { + "epoch": 0.9659737040491676, + "grad_norm": 0.777441143989563, + "learning_rate": 5.270070841597062e-05, + "loss": 0.939, + "step": 151200 + }, + { + "epoch": 0.9660375911989062, + "grad_norm": 0.7669252157211304, + "learning_rate": 5.269569804234369e-05, + "loss": 1.0347, + "step": 151210 + }, + { + 
"epoch": 0.9661014783486449, + "grad_norm": 1.6018074750900269, + "learning_rate": 5.2691188682866444e-05, + "loss": 0.9971, + "step": 151220 + }, + { + "epoch": 0.9661653654983836, + "grad_norm": 1.066701889038086, + "learning_rate": 5.268617825770142e-05, + "loss": 0.8429, + "step": 151230 + }, + { + "epoch": 0.9662292526481223, + "grad_norm": 1.0202710628509521, + "learning_rate": 5.268116780548426e-05, + "loss": 0.8482, + "step": 151240 + }, + { + "epoch": 0.966293139797861, + "grad_norm": 0.9673142433166504, + "learning_rate": 5.267615732626542e-05, + "loss": 0.9325, + "step": 151250 + }, + { + "epoch": 0.9663570269475997, + "grad_norm": 0.8143606781959534, + "learning_rate": 5.2671146820095365e-05, + "loss": 0.8128, + "step": 151260 + }, + { + "epoch": 0.9664209140973384, + "grad_norm": 1.2554757595062256, + "learning_rate": 5.266613628702456e-05, + "loss": 0.8891, + "step": 151270 + }, + { + "epoch": 0.9664848012470771, + "grad_norm": 1.1865863800048828, + "learning_rate": 5.2661125727103434e-05, + "loss": 0.9087, + "step": 151280 + }, + { + "epoch": 0.9665486883968158, + "grad_norm": 0.8015139102935791, + "learning_rate": 5.265611514038248e-05, + "loss": 0.9303, + "step": 151290 + }, + { + "epoch": 0.9666125755465546, + "grad_norm": 0.9343031048774719, + "learning_rate": 5.2651104526912145e-05, + "loss": 1.0272, + "step": 151300 + }, + { + "epoch": 0.9666764626962933, + "grad_norm": 0.8166914582252502, + "learning_rate": 5.26460938867429e-05, + "loss": 1.2548, + "step": 151310 + }, + { + "epoch": 0.966740349846032, + "grad_norm": 0.9215566515922546, + "learning_rate": 5.264108321992518e-05, + "loss": 0.8869, + "step": 151320 + }, + { + "epoch": 0.9668042369957707, + "grad_norm": 1.8220895528793335, + "learning_rate": 5.2636072526509486e-05, + "loss": 1.0963, + "step": 151330 + }, + { + "epoch": 0.9668681241455094, + "grad_norm": 1.234108805656433, + "learning_rate": 5.2631061806546255e-05, + "loss": 0.9705, + "step": 151340 + }, + { + "epoch": 
0.9669320112952481, + "grad_norm": 1.0234565734863281, + "learning_rate": 5.2626051060085956e-05, + "loss": 0.7106, + "step": 151350 + }, + { + "epoch": 0.9669958984449868, + "grad_norm": 0.9547996520996094, + "learning_rate": 5.262104028717906e-05, + "loss": 0.9427, + "step": 151360 + }, + { + "epoch": 0.9670597855947255, + "grad_norm": 0.819320559501648, + "learning_rate": 5.261602948787601e-05, + "loss": 0.8402, + "step": 151370 + }, + { + "epoch": 0.9671236727444642, + "grad_norm": 1.10805082321167, + "learning_rate": 5.261101866222728e-05, + "loss": 0.9087, + "step": 151380 + }, + { + "epoch": 0.9671875598942029, + "grad_norm": 0.9255377650260925, + "learning_rate": 5.260600781028334e-05, + "loss": 0.898, + "step": 151390 + }, + { + "epoch": 0.9672514470439416, + "grad_norm": 0.827276885509491, + "learning_rate": 5.2600996932094634e-05, + "loss": 0.8182, + "step": 151400 + }, + { + "epoch": 0.9673153341936803, + "grad_norm": 1.3312925100326538, + "learning_rate": 5.259598602771165e-05, + "loss": 0.8368, + "step": 151410 + }, + { + "epoch": 0.967379221343419, + "grad_norm": 2.995258092880249, + "learning_rate": 5.2590975097184844e-05, + "loss": 0.7184, + "step": 151420 + }, + { + "epoch": 0.9674431084931577, + "grad_norm": 0.8039790391921997, + "learning_rate": 5.258596414056467e-05, + "loss": 0.6866, + "step": 151430 + }, + { + "epoch": 0.9675069956428964, + "grad_norm": 1.1383510828018188, + "learning_rate": 5.25809531579016e-05, + "loss": 0.6508, + "step": 151440 + }, + { + "epoch": 0.967570882792635, + "grad_norm": 0.8737443089485168, + "learning_rate": 5.25759421492461e-05, + "loss": 0.8217, + "step": 151450 + }, + { + "epoch": 0.9676347699423737, + "grad_norm": 0.9597667455673218, + "learning_rate": 5.257093111464865e-05, + "loss": 0.8019, + "step": 151460 + }, + { + "epoch": 0.9676986570921124, + "grad_norm": 0.9724913239479065, + "learning_rate": 5.256592005415968e-05, + "loss": 1.0306, + "step": 151470 + }, + { + "epoch": 0.9677625442418512, + 
"grad_norm": 0.9813938736915588, + "learning_rate": 5.256090896782968e-05, + "loss": 1.2299, + "step": 151480 + }, + { + "epoch": 0.9678264313915899, + "grad_norm": 0.5077504515647888, + "learning_rate": 5.2555897855709114e-05, + "loss": 0.8514, + "step": 151490 + }, + { + "epoch": 0.9678903185413286, + "grad_norm": 0.8173817992210388, + "learning_rate": 5.2550886717848436e-05, + "loss": 1.0456, + "step": 151500 + }, + { + "epoch": 0.9679542056910673, + "grad_norm": 0.8337730765342712, + "learning_rate": 5.254587555429813e-05, + "loss": 0.9812, + "step": 151510 + }, + { + "epoch": 0.968018092840806, + "grad_norm": 0.9300865530967712, + "learning_rate": 5.254086436510866e-05, + "loss": 0.696, + "step": 151520 + }, + { + "epoch": 0.9680819799905447, + "grad_norm": 0.712742805480957, + "learning_rate": 5.253585315033047e-05, + "loss": 0.8063, + "step": 151530 + }, + { + "epoch": 0.9681458671402834, + "grad_norm": 0.685977578163147, + "learning_rate": 5.253084191001406e-05, + "loss": 0.9119, + "step": 151540 + }, + { + "epoch": 0.9682097542900221, + "grad_norm": 1.3182839155197144, + "learning_rate": 5.2525830644209885e-05, + "loss": 0.8322, + "step": 151550 + }, + { + "epoch": 0.9682736414397608, + "grad_norm": 1.8923838138580322, + "learning_rate": 5.25208193529684e-05, + "loss": 0.8107, + "step": 151560 + }, + { + "epoch": 0.9683375285894995, + "grad_norm": 0.7532184720039368, + "learning_rate": 5.251580803634008e-05, + "loss": 0.6918, + "step": 151570 + }, + { + "epoch": 0.9684014157392382, + "grad_norm": 0.9623368382453918, + "learning_rate": 5.2510796694375406e-05, + "loss": 0.8558, + "step": 151580 + }, + { + "epoch": 0.9684653028889769, + "grad_norm": 0.7997425198554993, + "learning_rate": 5.2505785327124836e-05, + "loss": 0.9148, + "step": 151590 + }, + { + "epoch": 0.9685291900387156, + "grad_norm": 0.9608021378517151, + "learning_rate": 5.250077393463884e-05, + "loss": 1.0758, + "step": 151600 + }, + { + "epoch": 0.9685930771884543, + "grad_norm": 
1.5573434829711914, + "learning_rate": 5.2495762516967886e-05, + "loss": 0.8347, + "step": 151610 + }, + { + "epoch": 0.968656964338193, + "grad_norm": 2.805736541748047, + "learning_rate": 5.2490751074162446e-05, + "loss": 0.8639, + "step": 151620 + }, + { + "epoch": 0.9687208514879317, + "grad_norm": 0.5936098098754883, + "learning_rate": 5.2485739606272985e-05, + "loss": 0.8002, + "step": 151630 + }, + { + "epoch": 0.9687847386376705, + "grad_norm": 0.6356973648071289, + "learning_rate": 5.248072811334997e-05, + "loss": 0.8728, + "step": 151640 + }, + { + "epoch": 0.9688486257874092, + "grad_norm": 1.3857841491699219, + "learning_rate": 5.2475716595443894e-05, + "loss": 1.0488, + "step": 151650 + }, + { + "epoch": 0.9689125129371479, + "grad_norm": 1.0003827810287476, + "learning_rate": 5.24707050526052e-05, + "loss": 0.8302, + "step": 151660 + }, + { + "epoch": 0.9689764000868866, + "grad_norm": 0.884917140007019, + "learning_rate": 5.246569348488436e-05, + "loss": 0.734, + "step": 151670 + }, + { + "epoch": 0.9690402872366253, + "grad_norm": 0.648077130317688, + "learning_rate": 5.246068189233186e-05, + "loss": 1.0681, + "step": 151680 + }, + { + "epoch": 0.969104174386364, + "grad_norm": 0.6451058387756348, + "learning_rate": 5.245567027499816e-05, + "loss": 0.9954, + "step": 151690 + }, + { + "epoch": 0.9691680615361026, + "grad_norm": 1.31290602684021, + "learning_rate": 5.2450658632933736e-05, + "loss": 0.9458, + "step": 151700 + }, + { + "epoch": 0.9692319486858413, + "grad_norm": 0.7243294715881348, + "learning_rate": 5.244564696618907e-05, + "loss": 0.8819, + "step": 151710 + }, + { + "epoch": 0.96929583583558, + "grad_norm": 1.8024976253509521, + "learning_rate": 5.244063527481462e-05, + "loss": 0.7875, + "step": 151720 + }, + { + "epoch": 0.9693597229853187, + "grad_norm": 1.2114737033843994, + "learning_rate": 5.243562355886086e-05, + "loss": 0.7601, + "step": 151730 + }, + { + "epoch": 0.9694236101350574, + "grad_norm": 1.0889559984207153, + 
"learning_rate": 5.243061181837826e-05, + "loss": 0.9074, + "step": 151740 + }, + { + "epoch": 0.9694874972847961, + "grad_norm": 0.7654042840003967, + "learning_rate": 5.24256000534173e-05, + "loss": 0.7559, + "step": 151750 + }, + { + "epoch": 0.9695513844345348, + "grad_norm": 0.9789912700653076, + "learning_rate": 5.242058826402846e-05, + "loss": 0.8855, + "step": 151760 + }, + { + "epoch": 0.9696152715842735, + "grad_norm": 2.210740089416504, + "learning_rate": 5.241557645026219e-05, + "loss": 0.9904, + "step": 151770 + }, + { + "epoch": 0.9696791587340122, + "grad_norm": 0.6800025105476379, + "learning_rate": 5.241056461216898e-05, + "loss": 0.9026, + "step": 151780 + }, + { + "epoch": 0.9697430458837509, + "grad_norm": 1.204058289527893, + "learning_rate": 5.240555274979929e-05, + "loss": 1.0395, + "step": 151790 + }, + { + "epoch": 0.9698069330334896, + "grad_norm": 0.8177311420440674, + "learning_rate": 5.240054086320361e-05, + "loss": 1.1084, + "step": 151800 + }, + { + "epoch": 0.9698708201832283, + "grad_norm": 0.714911699295044, + "learning_rate": 5.239552895243241e-05, + "loss": 0.8985, + "step": 151810 + }, + { + "epoch": 0.969934707332967, + "grad_norm": 2.143303155899048, + "learning_rate": 5.239051701753614e-05, + "loss": 1.1244, + "step": 151820 + }, + { + "epoch": 0.9699985944827058, + "grad_norm": 0.977552592754364, + "learning_rate": 5.2385505058565324e-05, + "loss": 1.0981, + "step": 151830 + }, + { + "epoch": 0.9700624816324445, + "grad_norm": 2.233025312423706, + "learning_rate": 5.2380493075570394e-05, + "loss": 1.1075, + "step": 151840 + }, + { + "epoch": 0.9701263687821832, + "grad_norm": 1.9697365760803223, + "learning_rate": 5.237548106860183e-05, + "loss": 0.8237, + "step": 151850 + }, + { + "epoch": 0.9701902559319219, + "grad_norm": 3.052886486053467, + "learning_rate": 5.237046903771012e-05, + "loss": 0.762, + "step": 151860 + }, + { + "epoch": 0.9702541430816606, + "grad_norm": 2.318892002105713, + "learning_rate": 
5.236545698294575e-05, + "loss": 0.7751, + "step": 151870 + }, + { + "epoch": 0.9703180302313993, + "grad_norm": 0.842376708984375, + "learning_rate": 5.2360444904359176e-05, + "loss": 0.8483, + "step": 151880 + }, + { + "epoch": 0.970381917381138, + "grad_norm": 1.4306411743164062, + "learning_rate": 5.235543280200088e-05, + "loss": 0.8895, + "step": 151890 + }, + { + "epoch": 0.9704458045308767, + "grad_norm": 4.7012939453125, + "learning_rate": 5.235042067592133e-05, + "loss": 1.0352, + "step": 151900 + }, + { + "epoch": 0.9705096916806154, + "grad_norm": 0.7660112977027893, + "learning_rate": 5.234540852617102e-05, + "loss": 0.6482, + "step": 151910 + }, + { + "epoch": 0.9705735788303541, + "grad_norm": 1.0199682712554932, + "learning_rate": 5.234039635280041e-05, + "loss": 0.9154, + "step": 151920 + }, + { + "epoch": 0.9706374659800928, + "grad_norm": 0.9796050190925598, + "learning_rate": 5.233538415585999e-05, + "loss": 0.9538, + "step": 151930 + }, + { + "epoch": 0.9707013531298314, + "grad_norm": 0.7912867069244385, + "learning_rate": 5.233037193540023e-05, + "loss": 0.7974, + "step": 151940 + }, + { + "epoch": 0.9707652402795701, + "grad_norm": 0.8988333344459534, + "learning_rate": 5.2325359691471606e-05, + "loss": 0.8347, + "step": 151950 + }, + { + "epoch": 0.9708291274293088, + "grad_norm": 0.6798145174980164, + "learning_rate": 5.2320347424124606e-05, + "loss": 0.8257, + "step": 151960 + }, + { + "epoch": 0.9708930145790475, + "grad_norm": 3.4491665363311768, + "learning_rate": 5.2315335133409694e-05, + "loss": 0.7976, + "step": 151970 + }, + { + "epoch": 0.9709569017287862, + "grad_norm": 0.5991364121437073, + "learning_rate": 5.2310322819377355e-05, + "loss": 0.9797, + "step": 151980 + }, + { + "epoch": 0.9710207888785249, + "grad_norm": 0.987819492816925, + "learning_rate": 5.2305310482078064e-05, + "loss": 0.9553, + "step": 151990 + }, + { + "epoch": 0.9710846760282636, + "grad_norm": 0.7812177538871765, + "learning_rate": 5.230029812156232e-05, 
+ "loss": 0.7681, + "step": 152000 + }, + { + "epoch": 0.9711485631780024, + "grad_norm": 0.9155138731002808, + "learning_rate": 5.229528573788055e-05, + "loss": 0.7462, + "step": 152010 + }, + { + "epoch": 0.9712124503277411, + "grad_norm": 1.2678287029266357, + "learning_rate": 5.229027333108328e-05, + "loss": 1.0377, + "step": 152020 + }, + { + "epoch": 0.9712763374774798, + "grad_norm": 0.624622106552124, + "learning_rate": 5.228526090122099e-05, + "loss": 0.9255, + "step": 152030 + }, + { + "epoch": 0.9713402246272185, + "grad_norm": 1.6265813112258911, + "learning_rate": 5.228024844834414e-05, + "loss": 0.7908, + "step": 152040 + }, + { + "epoch": 0.9714041117769572, + "grad_norm": 0.7967225313186646, + "learning_rate": 5.227523597250321e-05, + "loss": 0.9871, + "step": 152050 + }, + { + "epoch": 0.9714679989266959, + "grad_norm": 1.2453161478042603, + "learning_rate": 5.227022347374868e-05, + "loss": 0.876, + "step": 152060 + }, + { + "epoch": 0.9715318860764346, + "grad_norm": 1.3441709280014038, + "learning_rate": 5.226521095213105e-05, + "loss": 0.7978, + "step": 152070 + }, + { + "epoch": 0.9715957732261733, + "grad_norm": 1.4316071271896362, + "learning_rate": 5.2260198407700775e-05, + "loss": 0.7867, + "step": 152080 + }, + { + "epoch": 0.971659660375912, + "grad_norm": 2.101490020751953, + "learning_rate": 5.225518584050835e-05, + "loss": 0.9115, + "step": 152090 + }, + { + "epoch": 0.9717235475256507, + "grad_norm": 0.7814741730690002, + "learning_rate": 5.225017325060425e-05, + "loss": 0.8807, + "step": 152100 + }, + { + "epoch": 0.9717874346753894, + "grad_norm": 0.9148108959197998, + "learning_rate": 5.224516063803897e-05, + "loss": 0.876, + "step": 152110 + }, + { + "epoch": 0.9718513218251281, + "grad_norm": 0.9834555983543396, + "learning_rate": 5.2240148002862964e-05, + "loss": 0.8846, + "step": 152120 + }, + { + "epoch": 0.9719152089748668, + "grad_norm": 0.8105610013008118, + "learning_rate": 5.223513534512674e-05, + "loss": 0.8168, + 
"step": 152130 + }, + { + "epoch": 0.9719790961246055, + "grad_norm": 1.4332830905914307, + "learning_rate": 5.223012266488076e-05, + "loss": 0.8217, + "step": 152140 + }, + { + "epoch": 0.9720429832743442, + "grad_norm": 0.8803772330284119, + "learning_rate": 5.222510996217554e-05, + "loss": 0.8183, + "step": 152150 + }, + { + "epoch": 0.972106870424083, + "grad_norm": 1.188647985458374, + "learning_rate": 5.222009723706151e-05, + "loss": 0.9345, + "step": 152160 + }, + { + "epoch": 0.9721707575738217, + "grad_norm": 0.9444614052772522, + "learning_rate": 5.2215084489589194e-05, + "loss": 0.9522, + "step": 152170 + }, + { + "epoch": 0.9722346447235602, + "grad_norm": 1.4221155643463135, + "learning_rate": 5.2210071719809064e-05, + "loss": 0.977, + "step": 152180 + }, + { + "epoch": 0.972298531873299, + "grad_norm": 1.2089207172393799, + "learning_rate": 5.220505892777159e-05, + "loss": 0.8424, + "step": 152190 + }, + { + "epoch": 0.9723624190230377, + "grad_norm": 0.8503243327140808, + "learning_rate": 5.220004611352727e-05, + "loss": 0.96, + "step": 152200 + }, + { + "epoch": 0.9724263061727764, + "grad_norm": 0.7801720499992371, + "learning_rate": 5.219503327712656e-05, + "loss": 0.7537, + "step": 152210 + }, + { + "epoch": 0.9724901933225151, + "grad_norm": 0.8834246397018433, + "learning_rate": 5.219002041861999e-05, + "loss": 0.8585, + "step": 152220 + }, + { + "epoch": 0.9725540804722538, + "grad_norm": 0.9630089998245239, + "learning_rate": 5.218500753805802e-05, + "loss": 0.7415, + "step": 152230 + }, + { + "epoch": 0.9726179676219925, + "grad_norm": 0.6178570985794067, + "learning_rate": 5.217999463549113e-05, + "loss": 0.8743, + "step": 152240 + }, + { + "epoch": 0.9726818547717312, + "grad_norm": 1.1636070013046265, + "learning_rate": 5.217498171096982e-05, + "loss": 0.8063, + "step": 152250 + }, + { + "epoch": 0.9727457419214699, + "grad_norm": 0.6922101378440857, + "learning_rate": 5.216996876454454e-05, + "loss": 0.806, + "step": 152260 + }, + { + 
"epoch": 0.9728096290712086, + "grad_norm": 0.6797016859054565, + "learning_rate": 5.2164955796265814e-05, + "loss": 0.8855, + "step": 152270 + }, + { + "epoch": 0.9728735162209473, + "grad_norm": 0.6983970403671265, + "learning_rate": 5.21599428061841e-05, + "loss": 0.8559, + "step": 152280 + }, + { + "epoch": 0.972937403370686, + "grad_norm": 1.0314277410507202, + "learning_rate": 5.2154929794349894e-05, + "loss": 0.6021, + "step": 152290 + }, + { + "epoch": 0.9730012905204247, + "grad_norm": 0.8680412173271179, + "learning_rate": 5.214991676081369e-05, + "loss": 0.7651, + "step": 152300 + }, + { + "epoch": 0.9730651776701634, + "grad_norm": 0.8231766819953918, + "learning_rate": 5.214490370562596e-05, + "loss": 0.7858, + "step": 152310 + }, + { + "epoch": 0.9731290648199021, + "grad_norm": 0.8090435266494751, + "learning_rate": 5.2139890628837183e-05, + "loss": 1.0899, + "step": 152320 + }, + { + "epoch": 0.9731929519696408, + "grad_norm": 1.0858497619628906, + "learning_rate": 5.213487753049787e-05, + "loss": 0.8816, + "step": 152330 + }, + { + "epoch": 0.9732568391193795, + "grad_norm": 0.9355524778366089, + "learning_rate": 5.212986441065849e-05, + "loss": 1.0293, + "step": 152340 + }, + { + "epoch": 0.9733207262691183, + "grad_norm": 1.1359403133392334, + "learning_rate": 5.2124851269369534e-05, + "loss": 0.919, + "step": 152350 + }, + { + "epoch": 0.973384613418857, + "grad_norm": 1.9253411293029785, + "learning_rate": 5.211983810668148e-05, + "loss": 1.0969, + "step": 152360 + }, + { + "epoch": 0.9734485005685957, + "grad_norm": 0.8547667264938354, + "learning_rate": 5.2114824922644824e-05, + "loss": 0.6838, + "step": 152370 + }, + { + "epoch": 0.9735123877183344, + "grad_norm": 1.0332297086715698, + "learning_rate": 5.210981171731005e-05, + "loss": 0.8215, + "step": 152380 + }, + { + "epoch": 0.9735762748680731, + "grad_norm": 1.0458303689956665, + "learning_rate": 5.210479849072765e-05, + "loss": 0.7492, + "step": 152390 + }, + { + "epoch": 
0.9736401620178118, + "grad_norm": 0.9344658851623535, + "learning_rate": 5.209978524294811e-05, + "loss": 0.741, + "step": 152400 + }, + { + "epoch": 0.9737040491675505, + "grad_norm": 1.4250115156173706, + "learning_rate": 5.209477197402192e-05, + "loss": 0.6877, + "step": 152410 + }, + { + "epoch": 0.9737679363172892, + "grad_norm": 0.9061084985733032, + "learning_rate": 5.208975868399956e-05, + "loss": 1.0584, + "step": 152420 + }, + { + "epoch": 0.9738318234670278, + "grad_norm": 0.806816041469574, + "learning_rate": 5.208474537293152e-05, + "loss": 0.6887, + "step": 152430 + }, + { + "epoch": 0.9738957106167665, + "grad_norm": 0.933032751083374, + "learning_rate": 5.207973204086829e-05, + "loss": 0.8562, + "step": 152440 + }, + { + "epoch": 0.9739595977665052, + "grad_norm": 1.0426222085952759, + "learning_rate": 5.207471868786036e-05, + "loss": 0.871, + "step": 152450 + }, + { + "epoch": 0.9740234849162439, + "grad_norm": 1.1309046745300293, + "learning_rate": 5.206970531395822e-05, + "loss": 0.7954, + "step": 152460 + }, + { + "epoch": 0.9740873720659826, + "grad_norm": 0.8131570816040039, + "learning_rate": 5.2064691919212364e-05, + "loss": 0.91, + "step": 152470 + }, + { + "epoch": 0.9741512592157213, + "grad_norm": 0.9664103388786316, + "learning_rate": 5.205967850367326e-05, + "loss": 1.0082, + "step": 152480 + }, + { + "epoch": 0.97421514636546, + "grad_norm": 1.055430293083191, + "learning_rate": 5.205466506739143e-05, + "loss": 0.9536, + "step": 152490 + }, + { + "epoch": 0.9742790335151987, + "grad_norm": 0.8127159476280212, + "learning_rate": 5.2049651610417326e-05, + "loss": 0.7859, + "step": 152500 + }, + { + "epoch": 0.9743429206649374, + "grad_norm": 0.7035512924194336, + "learning_rate": 5.204463813280147e-05, + "loss": 0.9262, + "step": 152510 + }, + { + "epoch": 0.9744068078146761, + "grad_norm": 0.881603479385376, + "learning_rate": 5.203962463459433e-05, + "loss": 0.7748, + "step": 152520 + }, + { + "epoch": 0.9744706949644149, + 
"grad_norm": 0.8797648549079895, + "learning_rate": 5.203461111584641e-05, + "loss": 1.044, + "step": 152530 + }, + { + "epoch": 0.9745345821141536, + "grad_norm": 0.8272404670715332, + "learning_rate": 5.202959757660819e-05, + "loss": 0.6735, + "step": 152540 + }, + { + "epoch": 0.9745984692638923, + "grad_norm": 0.9870911240577698, + "learning_rate": 5.202458401693017e-05, + "loss": 0.811, + "step": 152550 + }, + { + "epoch": 0.974662356413631, + "grad_norm": 0.838467001914978, + "learning_rate": 5.2019570436862844e-05, + "loss": 1.024, + "step": 152560 + }, + { + "epoch": 0.9747262435633697, + "grad_norm": 0.5068958401679993, + "learning_rate": 5.2014556836456685e-05, + "loss": 1.1225, + "step": 152570 + }, + { + "epoch": 0.9747901307131084, + "grad_norm": 0.6512907147407532, + "learning_rate": 5.2009543215762204e-05, + "loss": 0.8789, + "step": 152580 + }, + { + "epoch": 0.9748540178628471, + "grad_norm": 0.9875491857528687, + "learning_rate": 5.200452957482988e-05, + "loss": 0.7917, + "step": 152590 + }, + { + "epoch": 0.9749179050125858, + "grad_norm": 0.5896627902984619, + "learning_rate": 5.199951591371022e-05, + "loss": 1.141, + "step": 152600 + }, + { + "epoch": 0.9749817921623245, + "grad_norm": 0.9969107508659363, + "learning_rate": 5.199450223245369e-05, + "loss": 0.8511, + "step": 152610 + }, + { + "epoch": 0.9750456793120632, + "grad_norm": 0.7358691096305847, + "learning_rate": 5.1989488531110794e-05, + "loss": 0.7029, + "step": 152620 + }, + { + "epoch": 0.9751095664618019, + "grad_norm": 0.5284615159034729, + "learning_rate": 5.198447480973204e-05, + "loss": 0.7927, + "step": 152630 + }, + { + "epoch": 0.9751734536115406, + "grad_norm": 1.0184814929962158, + "learning_rate": 5.1979461068367904e-05, + "loss": 0.9482, + "step": 152640 + }, + { + "epoch": 0.9752373407612793, + "grad_norm": 1.5319088697433472, + "learning_rate": 5.197444730706889e-05, + "loss": 0.871, + "step": 152650 + }, + { + "epoch": 0.975301227911018, + "grad_norm": 
1.3232473134994507, + "learning_rate": 5.196943352588548e-05, + "loss": 0.8189, + "step": 152660 + }, + { + "epoch": 0.9753651150607566, + "grad_norm": 0.698314368724823, + "learning_rate": 5.196441972486816e-05, + "loss": 0.8088, + "step": 152670 + }, + { + "epoch": 0.9754290022104953, + "grad_norm": 0.8359566926956177, + "learning_rate": 5.1959405904067446e-05, + "loss": 0.8434, + "step": 152680 + }, + { + "epoch": 0.975492889360234, + "grad_norm": 0.5400557518005371, + "learning_rate": 5.195439206353381e-05, + "loss": 0.8659, + "step": 152690 + }, + { + "epoch": 0.9755567765099727, + "grad_norm": 1.6195552349090576, + "learning_rate": 5.1949378203317764e-05, + "loss": 0.6596, + "step": 152700 + }, + { + "epoch": 0.9756206636597115, + "grad_norm": 1.1681946516036987, + "learning_rate": 5.1944364323469785e-05, + "loss": 0.8251, + "step": 152710 + }, + { + "epoch": 0.9756845508094502, + "grad_norm": 1.1966689825057983, + "learning_rate": 5.1939350424040376e-05, + "loss": 0.9019, + "step": 152720 + }, + { + "epoch": 0.9757484379591889, + "grad_norm": 0.7698038220405579, + "learning_rate": 5.193433650508004e-05, + "loss": 0.6619, + "step": 152730 + }, + { + "epoch": 0.9758123251089276, + "grad_norm": 1.1396318674087524, + "learning_rate": 5.192932256663925e-05, + "loss": 0.9543, + "step": 152740 + }, + { + "epoch": 0.9758762122586663, + "grad_norm": 0.9183258414268494, + "learning_rate": 5.1924308608768524e-05, + "loss": 1.0159, + "step": 152750 + }, + { + "epoch": 0.975940099408405, + "grad_norm": 0.7935616374015808, + "learning_rate": 5.1919294631518336e-05, + "loss": 0.9497, + "step": 152760 + }, + { + "epoch": 0.9760039865581437, + "grad_norm": 0.8395205140113831, + "learning_rate": 5.1914280634939195e-05, + "loss": 0.9478, + "step": 152770 + }, + { + "epoch": 0.9760678737078824, + "grad_norm": 1.5380463600158691, + "learning_rate": 5.190926661908159e-05, + "loss": 0.9063, + "step": 152780 + }, + { + "epoch": 0.9761317608576211, + "grad_norm": 0.8509006500244141, 
+ "learning_rate": 5.190425258399601e-05, + "loss": 1.1259, + "step": 152790 + }, + { + "epoch": 0.9761956480073598, + "grad_norm": 1.152294397354126, + "learning_rate": 5.189923852973297e-05, + "loss": 0.8142, + "step": 152800 + }, + { + "epoch": 0.9762595351570985, + "grad_norm": 0.7129946947097778, + "learning_rate": 5.1894224456342965e-05, + "loss": 0.9805, + "step": 152810 + }, + { + "epoch": 0.9763234223068372, + "grad_norm": 0.9357526302337646, + "learning_rate": 5.188921036387646e-05, + "loss": 0.9792, + "step": 152820 + }, + { + "epoch": 0.9763873094565759, + "grad_norm": 0.7643981575965881, + "learning_rate": 5.1884196252383986e-05, + "loss": 0.7641, + "step": 152830 + }, + { + "epoch": 0.9764511966063146, + "grad_norm": 0.860305666923523, + "learning_rate": 5.187918212191603e-05, + "loss": 0.8608, + "step": 152840 + }, + { + "epoch": 0.9765150837560533, + "grad_norm": 1.20055091381073, + "learning_rate": 5.187416797252307e-05, + "loss": 0.7898, + "step": 152850 + }, + { + "epoch": 0.976578970905792, + "grad_norm": 1.0174932479858398, + "learning_rate": 5.186915380425562e-05, + "loss": 0.7676, + "step": 152860 + }, + { + "epoch": 0.9766428580555307, + "grad_norm": 0.8433327674865723, + "learning_rate": 5.1864139617164174e-05, + "loss": 1.0166, + "step": 152870 + }, + { + "epoch": 0.9767067452052695, + "grad_norm": 0.8074188828468323, + "learning_rate": 5.185912541129924e-05, + "loss": 1.202, + "step": 152880 + }, + { + "epoch": 0.9767706323550082, + "grad_norm": 0.7383306622505188, + "learning_rate": 5.1854111186711295e-05, + "loss": 0.8643, + "step": 152890 + }, + { + "epoch": 0.9768345195047469, + "grad_norm": 0.704338014125824, + "learning_rate": 5.184909694345084e-05, + "loss": 0.7977, + "step": 152900 + }, + { + "epoch": 0.9768984066544855, + "grad_norm": 0.6669245362281799, + "learning_rate": 5.1844082681568386e-05, + "loss": 0.8619, + "step": 152910 + }, + { + "epoch": 0.9769622938042242, + "grad_norm": 0.9143712520599365, + "learning_rate": 
5.183906840111442e-05, + "loss": 0.9948, + "step": 152920 + }, + { + "epoch": 0.9770261809539629, + "grad_norm": 1.7431244850158691, + "learning_rate": 5.1834054102139454e-05, + "loss": 0.6948, + "step": 152930 + }, + { + "epoch": 0.9770900681037016, + "grad_norm": 0.8700932264328003, + "learning_rate": 5.182903978469398e-05, + "loss": 0.8935, + "step": 152940 + }, + { + "epoch": 0.9771539552534403, + "grad_norm": 0.7274859547615051, + "learning_rate": 5.182402544882847e-05, + "loss": 0.7769, + "step": 152950 + }, + { + "epoch": 0.977217842403179, + "grad_norm": 0.48052549362182617, + "learning_rate": 5.181901109459347e-05, + "loss": 0.7875, + "step": 152960 + }, + { + "epoch": 0.9772817295529177, + "grad_norm": 1.1585367918014526, + "learning_rate": 5.181399672203946e-05, + "loss": 0.7967, + "step": 152970 + }, + { + "epoch": 0.9773456167026564, + "grad_norm": 1.8060321807861328, + "learning_rate": 5.1808982331216915e-05, + "loss": 0.8072, + "step": 152980 + }, + { + "epoch": 0.9774095038523951, + "grad_norm": 0.683174729347229, + "learning_rate": 5.1803967922176354e-05, + "loss": 0.8027, + "step": 152990 + }, + { + "epoch": 0.9774733910021338, + "grad_norm": 0.8326396346092224, + "learning_rate": 5.1798953494968285e-05, + "loss": 0.6853, + "step": 153000 + }, + { + "epoch": 0.9775372781518725, + "grad_norm": 0.6510669589042664, + "learning_rate": 5.179393904964319e-05, + "loss": 0.9173, + "step": 153010 + }, + { + "epoch": 0.9776011653016112, + "grad_norm": 0.894503653049469, + "learning_rate": 5.1788924586251575e-05, + "loss": 0.9928, + "step": 153020 + }, + { + "epoch": 0.9776650524513499, + "grad_norm": 0.8102232217788696, + "learning_rate": 5.178391010484395e-05, + "loss": 1.1457, + "step": 153030 + }, + { + "epoch": 0.9777289396010886, + "grad_norm": 0.8255794644355774, + "learning_rate": 5.17788956054708e-05, + "loss": 0.7739, + "step": 153040 + }, + { + "epoch": 0.9777928267508273, + "grad_norm": 0.968258261680603, + "learning_rate": 5.177388108818263e-05, 
+ "loss": 0.8573, + "step": 153050 + }, + { + "epoch": 0.977856713900566, + "grad_norm": 0.6829890608787537, + "learning_rate": 5.176886655302994e-05, + "loss": 0.8869, + "step": 153060 + }, + { + "epoch": 0.9779206010503048, + "grad_norm": 3.4975321292877197, + "learning_rate": 5.176385200006324e-05, + "loss": 1.072, + "step": 153070 + }, + { + "epoch": 0.9779844882000435, + "grad_norm": 1.045479416847229, + "learning_rate": 5.1758837429333026e-05, + "loss": 0.6734, + "step": 153080 + }, + { + "epoch": 0.9780483753497822, + "grad_norm": 1.0568046569824219, + "learning_rate": 5.1753822840889796e-05, + "loss": 0.7752, + "step": 153090 + }, + { + "epoch": 0.9781122624995209, + "grad_norm": 0.843722403049469, + "learning_rate": 5.174880823478405e-05, + "loss": 0.9224, + "step": 153100 + }, + { + "epoch": 0.9781761496492596, + "grad_norm": 1.006016731262207, + "learning_rate": 5.174379361106629e-05, + "loss": 0.8705, + "step": 153110 + }, + { + "epoch": 0.9782400367989983, + "grad_norm": 2.0581963062286377, + "learning_rate": 5.173877896978703e-05, + "loss": 0.8711, + "step": 153120 + }, + { + "epoch": 0.978303923948737, + "grad_norm": 0.788337230682373, + "learning_rate": 5.173376431099676e-05, + "loss": 0.7674, + "step": 153130 + }, + { + "epoch": 0.9783678110984757, + "grad_norm": 1.791744351387024, + "learning_rate": 5.172874963474598e-05, + "loss": 0.9112, + "step": 153140 + }, + { + "epoch": 0.9784316982482143, + "grad_norm": 0.7716138362884521, + "learning_rate": 5.17237349410852e-05, + "loss": 0.7965, + "step": 153150 + }, + { + "epoch": 0.978495585397953, + "grad_norm": 0.616436779499054, + "learning_rate": 5.171872023006491e-05, + "loss": 1.0199, + "step": 153160 + }, + { + "epoch": 0.9785594725476917, + "grad_norm": 0.8398680686950684, + "learning_rate": 5.171370550173562e-05, + "loss": 1.0967, + "step": 153170 + }, + { + "epoch": 0.9786233596974304, + "grad_norm": 0.9748437404632568, + "learning_rate": 5.170869075614784e-05, + "loss": 0.9849, + "step": 
153180 + }, + { + "epoch": 0.9786872468471691, + "grad_norm": 0.8368244171142578, + "learning_rate": 5.1703675993352064e-05, + "loss": 0.8339, + "step": 153190 + }, + { + "epoch": 0.9787511339969078, + "grad_norm": 0.9211305975914001, + "learning_rate": 5.169866121339879e-05, + "loss": 0.7099, + "step": 153200 + }, + { + "epoch": 0.9788150211466465, + "grad_norm": 1.1104360818862915, + "learning_rate": 5.169364641633855e-05, + "loss": 0.8684, + "step": 153210 + }, + { + "epoch": 0.9788789082963852, + "grad_norm": 0.6493495106697083, + "learning_rate": 5.1688631602221794e-05, + "loss": 1.0052, + "step": 153220 + }, + { + "epoch": 0.978942795446124, + "grad_norm": 1.0354483127593994, + "learning_rate": 5.168361677109908e-05, + "loss": 0.9606, + "step": 153230 + }, + { + "epoch": 0.9790066825958627, + "grad_norm": 0.6334801912307739, + "learning_rate": 5.1678601923020876e-05, + "loss": 1.0056, + "step": 153240 + }, + { + "epoch": 0.9790705697456014, + "grad_norm": 1.2471381425857544, + "learning_rate": 5.16735870580377e-05, + "loss": 0.9594, + "step": 153250 + }, + { + "epoch": 0.9791344568953401, + "grad_norm": 1.8686987161636353, + "learning_rate": 5.166857217620006e-05, + "loss": 0.8422, + "step": 153260 + }, + { + "epoch": 0.9791983440450788, + "grad_norm": 0.7281016707420349, + "learning_rate": 5.1663557277558447e-05, + "loss": 1.0227, + "step": 153270 + }, + { + "epoch": 0.9792622311948175, + "grad_norm": 1.3672218322753906, + "learning_rate": 5.1658542362163385e-05, + "loss": 1.2125, + "step": 153280 + }, + { + "epoch": 0.9793261183445562, + "grad_norm": 0.6560665369033813, + "learning_rate": 5.165352743006536e-05, + "loss": 0.7462, + "step": 153290 + }, + { + "epoch": 0.9793900054942949, + "grad_norm": 0.8147808313369751, + "learning_rate": 5.164851248131488e-05, + "loss": 0.8436, + "step": 153300 + }, + { + "epoch": 0.9794538926440336, + "grad_norm": 1.1131194829940796, + "learning_rate": 5.1643497515962455e-05, + "loss": 0.8414, + "step": 153310 + }, + { + 
"epoch": 0.9795177797937723, + "grad_norm": 0.8697034120559692, + "learning_rate": 5.16384825340586e-05, + "loss": 0.8524, + "step": 153320 + }, + { + "epoch": 0.979581666943511, + "grad_norm": 0.9882239699363708, + "learning_rate": 5.163346753565379e-05, + "loss": 0.7327, + "step": 153330 + }, + { + "epoch": 0.9796455540932497, + "grad_norm": 0.7860538959503174, + "learning_rate": 5.162845252079855e-05, + "loss": 0.7041, + "step": 153340 + }, + { + "epoch": 0.9797094412429884, + "grad_norm": 1.2516241073608398, + "learning_rate": 5.16234374895434e-05, + "loss": 1.284, + "step": 153350 + }, + { + "epoch": 0.9797733283927271, + "grad_norm": 1.390798568725586, + "learning_rate": 5.161842244193882e-05, + "loss": 1.0494, + "step": 153360 + }, + { + "epoch": 0.9798372155424658, + "grad_norm": 0.6051182746887207, + "learning_rate": 5.1613407378035326e-05, + "loss": 0.8672, + "step": 153370 + }, + { + "epoch": 0.9799011026922045, + "grad_norm": 1.087631344795227, + "learning_rate": 5.1608392297883426e-05, + "loss": 1.1912, + "step": 153380 + }, + { + "epoch": 0.9799649898419432, + "grad_norm": 0.9267514944076538, + "learning_rate": 5.160337720153362e-05, + "loss": 0.8654, + "step": 153390 + }, + { + "epoch": 0.9800288769916818, + "grad_norm": 1.898051142692566, + "learning_rate": 5.1598362089036424e-05, + "loss": 0.8454, + "step": 153400 + }, + { + "epoch": 0.9800927641414205, + "grad_norm": 0.8334454894065857, + "learning_rate": 5.1593346960442336e-05, + "loss": 0.9431, + "step": 153410 + }, + { + "epoch": 0.9801566512911593, + "grad_norm": 1.1599924564361572, + "learning_rate": 5.158833181580186e-05, + "loss": 0.7981, + "step": 153420 + }, + { + "epoch": 0.980220538440898, + "grad_norm": 0.7810460925102234, + "learning_rate": 5.1583316655165506e-05, + "loss": 1.0807, + "step": 153430 + }, + { + "epoch": 0.9802844255906367, + "grad_norm": 1.1010318994522095, + "learning_rate": 5.157830147858379e-05, + "loss": 0.7419, + "step": 153440 + }, + { + "epoch": 
0.9803483127403754, + "grad_norm": 0.6888182163238525, + "learning_rate": 5.1573286286107216e-05, + "loss": 0.785, + "step": 153450 + }, + { + "epoch": 0.9804121998901141, + "grad_norm": 0.6871199011802673, + "learning_rate": 5.15682710777863e-05, + "loss": 0.8662, + "step": 153460 + }, + { + "epoch": 0.9804760870398528, + "grad_norm": 0.7507506012916565, + "learning_rate": 5.156325585367152e-05, + "loss": 0.8132, + "step": 153470 + }, + { + "epoch": 0.9805399741895915, + "grad_norm": 0.8823074698448181, + "learning_rate": 5.1558240613813416e-05, + "loss": 0.8198, + "step": 153480 + }, + { + "epoch": 0.9806038613393302, + "grad_norm": 0.6161057949066162, + "learning_rate": 5.155322535826246e-05, + "loss": 0.8405, + "step": 153490 + }, + { + "epoch": 0.9806677484890689, + "grad_norm": 0.7086304426193237, + "learning_rate": 5.1548210087069196e-05, + "loss": 0.7153, + "step": 153500 + }, + { + "epoch": 0.9807316356388076, + "grad_norm": 1.631806492805481, + "learning_rate": 5.154319480028411e-05, + "loss": 0.7625, + "step": 153510 + }, + { + "epoch": 0.9807955227885463, + "grad_norm": 0.7055345177650452, + "learning_rate": 5.153817949795772e-05, + "loss": 0.8304, + "step": 153520 + }, + { + "epoch": 0.980859409938285, + "grad_norm": 1.2235612869262695, + "learning_rate": 5.153316418014053e-05, + "loss": 0.9013, + "step": 153530 + }, + { + "epoch": 0.9809232970880237, + "grad_norm": 0.678455114364624, + "learning_rate": 5.152814884688305e-05, + "loss": 0.7831, + "step": 153540 + }, + { + "epoch": 0.9809871842377624, + "grad_norm": 1.0608277320861816, + "learning_rate": 5.152313349823579e-05, + "loss": 0.7629, + "step": 153550 + }, + { + "epoch": 0.9810510713875011, + "grad_norm": 0.8170053958892822, + "learning_rate": 5.151811813424926e-05, + "loss": 0.6033, + "step": 153560 + }, + { + "epoch": 0.9811149585372398, + "grad_norm": 0.7763380408287048, + "learning_rate": 5.151310275497396e-05, + "loss": 0.8555, + "step": 153570 + }, + { + "epoch": 0.9811788456869786, + 
"grad_norm": 1.2432817220687866, + "learning_rate": 5.150808736046042e-05, + "loss": 0.7298, + "step": 153580 + }, + { + "epoch": 0.9812427328367173, + "grad_norm": 0.9610840082168579, + "learning_rate": 5.150307195075912e-05, + "loss": 0.9684, + "step": 153590 + }, + { + "epoch": 0.981306619986456, + "grad_norm": 0.9366688132286072, + "learning_rate": 5.149805652592059e-05, + "loss": 0.7935, + "step": 153600 + }, + { + "epoch": 0.9813705071361947, + "grad_norm": 0.8396714329719543, + "learning_rate": 5.1493041085995334e-05, + "loss": 1.1084, + "step": 153610 + }, + { + "epoch": 0.9814343942859334, + "grad_norm": 0.9239795207977295, + "learning_rate": 5.148802563103387e-05, + "loss": 0.6051, + "step": 153620 + }, + { + "epoch": 0.9814982814356721, + "grad_norm": 0.866423487663269, + "learning_rate": 5.1483010161086695e-05, + "loss": 1.1068, + "step": 153630 + }, + { + "epoch": 0.9815621685854107, + "grad_norm": 1.2361878156661987, + "learning_rate": 5.147799467620432e-05, + "loss": 1.2309, + "step": 153640 + }, + { + "epoch": 0.9816260557351494, + "grad_norm": 0.9184995889663696, + "learning_rate": 5.147297917643728e-05, + "loss": 0.9739, + "step": 153650 + }, + { + "epoch": 0.9816899428848881, + "grad_norm": 0.8369823098182678, + "learning_rate": 5.146796366183604e-05, + "loss": 0.8687, + "step": 153660 + }, + { + "epoch": 0.9817538300346268, + "grad_norm": 0.8505387902259827, + "learning_rate": 5.146294813245115e-05, + "loss": 0.8085, + "step": 153670 + }, + { + "epoch": 0.9818177171843655, + "grad_norm": 1.1725592613220215, + "learning_rate": 5.14579325883331e-05, + "loss": 1.0974, + "step": 153680 + }, + { + "epoch": 0.9818816043341042, + "grad_norm": 1.1303421258926392, + "learning_rate": 5.145291702953241e-05, + "loss": 0.8779, + "step": 153690 + }, + { + "epoch": 0.9819454914838429, + "grad_norm": 1.3215134143829346, + "learning_rate": 5.144790145609961e-05, + "loss": 0.7935, + "step": 153700 + }, + { + "epoch": 0.9820093786335816, + "grad_norm": 
0.9500746130943298, + "learning_rate": 5.1442885868085166e-05, + "loss": 1.2262, + "step": 153710 + }, + { + "epoch": 0.9820732657833203, + "grad_norm": 0.8446482419967651, + "learning_rate": 5.143787026553962e-05, + "loss": 0.7815, + "step": 153720 + }, + { + "epoch": 0.982137152933059, + "grad_norm": 1.9297406673431396, + "learning_rate": 5.143285464851347e-05, + "loss": 0.8834, + "step": 153730 + }, + { + "epoch": 0.9822010400827977, + "grad_norm": 1.5801446437835693, + "learning_rate": 5.1427839017057234e-05, + "loss": 0.8194, + "step": 153740 + }, + { + "epoch": 0.9822649272325364, + "grad_norm": 0.723784327507019, + "learning_rate": 5.142282337122142e-05, + "loss": 0.9339, + "step": 153750 + }, + { + "epoch": 0.9823288143822752, + "grad_norm": 0.9597413539886475, + "learning_rate": 5.141780771105655e-05, + "loss": 1.4497, + "step": 153760 + }, + { + "epoch": 0.9823927015320139, + "grad_norm": 1.1143347024917603, + "learning_rate": 5.1412792036613136e-05, + "loss": 1.099, + "step": 153770 + }, + { + "epoch": 0.9824565886817526, + "grad_norm": 0.9566546082496643, + "learning_rate": 5.1407776347941674e-05, + "loss": 1.1144, + "step": 153780 + }, + { + "epoch": 0.9825204758314913, + "grad_norm": 1.0242767333984375, + "learning_rate": 5.1402760645092696e-05, + "loss": 0.8559, + "step": 153790 + }, + { + "epoch": 0.98258436298123, + "grad_norm": 1.4230691194534302, + "learning_rate": 5.13977449281167e-05, + "loss": 1.1065, + "step": 153800 + }, + { + "epoch": 0.9826482501309687, + "grad_norm": 0.743841826915741, + "learning_rate": 5.139272919706421e-05, + "loss": 0.8184, + "step": 153810 + }, + { + "epoch": 0.9827121372807074, + "grad_norm": 1.586731195449829, + "learning_rate": 5.138771345198572e-05, + "loss": 0.8454, + "step": 153820 + }, + { + "epoch": 0.9827760244304461, + "grad_norm": 0.9233693480491638, + "learning_rate": 5.138269769293176e-05, + "loss": 1.0317, + "step": 153830 + }, + { + "epoch": 0.9828399115801848, + "grad_norm": 0.9364616274833679, + 
"learning_rate": 5.137768191995284e-05, + "loss": 0.9424, + "step": 153840 + }, + { + "epoch": 0.9829037987299235, + "grad_norm": 0.9385852217674255, + "learning_rate": 5.137266613309947e-05, + "loss": 1.1349, + "step": 153850 + }, + { + "epoch": 0.9829676858796622, + "grad_norm": 0.8022464513778687, + "learning_rate": 5.1367650332422155e-05, + "loss": 0.8832, + "step": 153860 + }, + { + "epoch": 0.9830315730294009, + "grad_norm": 1.424519658088684, + "learning_rate": 5.136263451797143e-05, + "loss": 0.8496, + "step": 153870 + }, + { + "epoch": 0.9830954601791395, + "grad_norm": 0.835070013999939, + "learning_rate": 5.1357618689797795e-05, + "loss": 0.7952, + "step": 153880 + }, + { + "epoch": 0.9831593473288782, + "grad_norm": 1.1491596698760986, + "learning_rate": 5.135260284795176e-05, + "loss": 1.0079, + "step": 153890 + }, + { + "epoch": 0.9832232344786169, + "grad_norm": 1.1634465456008911, + "learning_rate": 5.134758699248386e-05, + "loss": 0.8044, + "step": 153900 + }, + { + "epoch": 0.9832871216283556, + "grad_norm": 0.9196897745132446, + "learning_rate": 5.134257112344457e-05, + "loss": 0.8414, + "step": 153910 + }, + { + "epoch": 0.9833510087780943, + "grad_norm": 0.801220715045929, + "learning_rate": 5.133755524088444e-05, + "loss": 0.725, + "step": 153920 + }, + { + "epoch": 0.983414895927833, + "grad_norm": 0.6582996845245361, + "learning_rate": 5.133253934485397e-05, + "loss": 0.6951, + "step": 153930 + }, + { + "epoch": 0.9834787830775717, + "grad_norm": 1.0240771770477295, + "learning_rate": 5.132752343540368e-05, + "loss": 0.9298, + "step": 153940 + }, + { + "epoch": 0.9835426702273105, + "grad_norm": 0.8002316951751709, + "learning_rate": 5.132250751258407e-05, + "loss": 1.0542, + "step": 153950 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 1.1618034839630127, + "learning_rate": 5.131749157644568e-05, + "loss": 0.7801, + "step": 153960 + }, + { + "epoch": 0.9836704445267879, + "grad_norm": 0.9297146201133728, + "learning_rate": 
5.1312475627039e-05, + "loss": 0.8619, + "step": 153970 + }, + { + "epoch": 0.9837343316765266, + "grad_norm": 0.7752575278282166, + "learning_rate": 5.130745966441456e-05, + "loss": 0.7556, + "step": 153980 + }, + { + "epoch": 0.9837982188262653, + "grad_norm": 1.0066181421279907, + "learning_rate": 5.130244368862286e-05, + "loss": 0.9884, + "step": 153990 + }, + { + "epoch": 0.983862105976004, + "grad_norm": 0.8494886159896851, + "learning_rate": 5.129742769971443e-05, + "loss": 0.8843, + "step": 154000 + }, + { + "epoch": 0.9839259931257427, + "grad_norm": 0.7732999920845032, + "learning_rate": 5.1292411697739786e-05, + "loss": 0.6683, + "step": 154010 + }, + { + "epoch": 0.9839898802754814, + "grad_norm": 1.17229425907135, + "learning_rate": 5.128739568274944e-05, + "loss": 0.7683, + "step": 154020 + }, + { + "epoch": 0.9840537674252201, + "grad_norm": 1.5680131912231445, + "learning_rate": 5.12823796547939e-05, + "loss": 1.2148, + "step": 154030 + }, + { + "epoch": 0.9841176545749588, + "grad_norm": 0.7837061882019043, + "learning_rate": 5.1277363613923676e-05, + "loss": 0.6788, + "step": 154040 + }, + { + "epoch": 0.9841815417246975, + "grad_norm": 1.022614598274231, + "learning_rate": 5.1272347560189314e-05, + "loss": 0.7454, + "step": 154050 + }, + { + "epoch": 0.9842454288744362, + "grad_norm": 1.2080639600753784, + "learning_rate": 5.12673314936413e-05, + "loss": 0.8185, + "step": 154060 + }, + { + "epoch": 0.9843093160241749, + "grad_norm": 1.0237927436828613, + "learning_rate": 5.126231541433018e-05, + "loss": 0.8095, + "step": 154070 + }, + { + "epoch": 0.9843732031739136, + "grad_norm": 0.5881984233856201, + "learning_rate": 5.125729932230643e-05, + "loss": 0.8783, + "step": 154080 + }, + { + "epoch": 0.9844370903236523, + "grad_norm": 0.7203654646873474, + "learning_rate": 5.12522832176206e-05, + "loss": 0.7445, + "step": 154090 + }, + { + "epoch": 0.984500977473391, + "grad_norm": 1.8319315910339355, + "learning_rate": 5.1247267100323195e-05, + 
"loss": 1.0696, + "step": 154100 + }, + { + "epoch": 0.9845648646231298, + "grad_norm": 0.8602228164672852, + "learning_rate": 5.124225097046472e-05, + "loss": 0.8352, + "step": 154110 + }, + { + "epoch": 0.9846287517728685, + "grad_norm": 0.7165752053260803, + "learning_rate": 5.12372348280957e-05, + "loss": 0.981, + "step": 154120 + }, + { + "epoch": 0.984692638922607, + "grad_norm": 0.9421229362487793, + "learning_rate": 5.123221867326666e-05, + "loss": 0.9251, + "step": 154130 + }, + { + "epoch": 0.9847565260723458, + "grad_norm": 1.153640866279602, + "learning_rate": 5.1227202506028117e-05, + "loss": 1.0239, + "step": 154140 + }, + { + "epoch": 0.9848204132220845, + "grad_norm": 0.8328729271888733, + "learning_rate": 5.122218632643059e-05, + "loss": 0.7723, + "step": 154150 + }, + { + "epoch": 0.9848843003718232, + "grad_norm": 3.4909727573394775, + "learning_rate": 5.1217170134524586e-05, + "loss": 0.9164, + "step": 154160 + }, + { + "epoch": 0.9849481875215619, + "grad_norm": 1.5162936449050903, + "learning_rate": 5.1212153930360615e-05, + "loss": 0.7539, + "step": 154170 + }, + { + "epoch": 0.9850120746713006, + "grad_norm": 0.8826167583465576, + "learning_rate": 5.1207137713989205e-05, + "loss": 0.7714, + "step": 154180 + }, + { + "epoch": 0.9850759618210393, + "grad_norm": 0.7749609351158142, + "learning_rate": 5.1202121485460894e-05, + "loss": 0.9825, + "step": 154190 + }, + { + "epoch": 0.985139848970778, + "grad_norm": 0.9990338683128357, + "learning_rate": 5.119710524482617e-05, + "loss": 0.7492, + "step": 154200 + }, + { + "epoch": 0.9852037361205167, + "grad_norm": Infinity, + "learning_rate": 5.119259061794569e-05, + "loss": 0.7843, + "step": 154210 + }, + { + "epoch": 0.9852676232702554, + "grad_norm": 0.6239207983016968, + "learning_rate": 5.118757435444798e-05, + "loss": 0.8757, + "step": 154220 + }, + { + "epoch": 0.9853315104199941, + "grad_norm": 0.8069121241569519, + "learning_rate": 5.118255807899036e-05, + "loss": 1.0016, + "step": 154230 
+ }, + { + "epoch": 0.9853953975697328, + "grad_norm": 0.8366886973381042, + "learning_rate": 5.117754179162335e-05, + "loss": 0.6895, + "step": 154240 + }, + { + "epoch": 0.9854592847194715, + "grad_norm": 0.8663131594657898, + "learning_rate": 5.1172525492397484e-05, + "loss": 0.683, + "step": 154250 + }, + { + "epoch": 0.9855231718692102, + "grad_norm": 1.0200055837631226, + "learning_rate": 5.116750918136327e-05, + "loss": 0.8902, + "step": 154260 + }, + { + "epoch": 0.9855870590189489, + "grad_norm": 1.0423588752746582, + "learning_rate": 5.116249285857123e-05, + "loss": 0.9948, + "step": 154270 + }, + { + "epoch": 0.9856509461686876, + "grad_norm": 1.235628366470337, + "learning_rate": 5.115747652407189e-05, + "loss": 0.9463, + "step": 154280 + }, + { + "epoch": 0.9857148333184264, + "grad_norm": 0.787745475769043, + "learning_rate": 5.115246017791575e-05, + "loss": 0.9099, + "step": 154290 + }, + { + "epoch": 0.9857787204681651, + "grad_norm": 0.8026257157325745, + "learning_rate": 5.114744382015334e-05, + "loss": 0.8882, + "step": 154300 + }, + { + "epoch": 0.9858426076179038, + "grad_norm": 0.9308233857154846, + "learning_rate": 5.114242745083517e-05, + "loss": 0.7079, + "step": 154310 + }, + { + "epoch": 0.9859064947676425, + "grad_norm": 1.0703706741333008, + "learning_rate": 5.1137411070011786e-05, + "loss": 0.8582, + "step": 154320 + }, + { + "epoch": 0.9859703819173812, + "grad_norm": 0.8068180680274963, + "learning_rate": 5.113239467773369e-05, + "loss": 0.666, + "step": 154330 + }, + { + "epoch": 0.9860342690671199, + "grad_norm": 1.0489506721496582, + "learning_rate": 5.1127378274051385e-05, + "loss": 0.9058, + "step": 154340 + }, + { + "epoch": 0.9860981562168586, + "grad_norm": 1.0829126834869385, + "learning_rate": 5.112236185901541e-05, + "loss": 0.9407, + "step": 154350 + }, + { + "epoch": 0.9861620433665973, + "grad_norm": 1.2354950904846191, + "learning_rate": 5.111734543267628e-05, + "loss": 1.0327, + "step": 154360 + }, + { + "epoch": 
0.9862259305163359, + "grad_norm": 0.9200438857078552, + "learning_rate": 5.111232899508451e-05, + "loss": 0.884, + "step": 154370 + }, + { + "epoch": 0.9862898176660746, + "grad_norm": 1.6493422985076904, + "learning_rate": 5.110731254629063e-05, + "loss": 1.0967, + "step": 154380 + }, + { + "epoch": 0.9863537048158133, + "grad_norm": 0.49024438858032227, + "learning_rate": 5.110229608634516e-05, + "loss": 0.891, + "step": 154390 + }, + { + "epoch": 0.986417591965552, + "grad_norm": 0.9173659682273865, + "learning_rate": 5.1097279615298596e-05, + "loss": 0.8338, + "step": 154400 + }, + { + "epoch": 0.9864814791152907, + "grad_norm": 0.718056857585907, + "learning_rate": 5.109226313320149e-05, + "loss": 0.9905, + "step": 154410 + }, + { + "epoch": 0.9865453662650294, + "grad_norm": 0.8132973313331604, + "learning_rate": 5.108724664010435e-05, + "loss": 0.9829, + "step": 154420 + }, + { + "epoch": 0.9866092534147681, + "grad_norm": 1.2084906101226807, + "learning_rate": 5.1082230136057695e-05, + "loss": 0.8259, + "step": 154430 + }, + { + "epoch": 0.9866731405645068, + "grad_norm": 1.0011422634124756, + "learning_rate": 5.1077213621112043e-05, + "loss": 0.6755, + "step": 154440 + }, + { + "epoch": 0.9867370277142455, + "grad_norm": 0.930726945400238, + "learning_rate": 5.107219709531792e-05, + "loss": 0.8441, + "step": 154450 + }, + { + "epoch": 0.9868009148639842, + "grad_norm": 0.7071484923362732, + "learning_rate": 5.1067180558725846e-05, + "loss": 0.8279, + "step": 154460 + }, + { + "epoch": 0.986864802013723, + "grad_norm": 1.1952509880065918, + "learning_rate": 5.106216401138635e-05, + "loss": 0.7515, + "step": 154470 + }, + { + "epoch": 0.9869286891634617, + "grad_norm": 0.8858817219734192, + "learning_rate": 5.105714745334993e-05, + "loss": 0.8042, + "step": 154480 + }, + { + "epoch": 0.9869925763132004, + "grad_norm": 1.0529940128326416, + "learning_rate": 5.105213088466712e-05, + "loss": 0.8393, + "step": 154490 + }, + { + "epoch": 0.9870564634629391, + 
"grad_norm": 0.8459919095039368, + "learning_rate": 5.1047114305388445e-05, + "loss": 0.7869, + "step": 154500 + }, + { + "epoch": 0.9871203506126778, + "grad_norm": 0.8265353441238403, + "learning_rate": 5.104209771556443e-05, + "loss": 0.9366, + "step": 154510 + }, + { + "epoch": 0.9871842377624165, + "grad_norm": 1.7919999361038208, + "learning_rate": 5.1037081115245576e-05, + "loss": 0.8777, + "step": 154520 + }, + { + "epoch": 0.9872481249121552, + "grad_norm": 0.8435221910476685, + "learning_rate": 5.103206450448243e-05, + "loss": 0.9533, + "step": 154530 + }, + { + "epoch": 0.9873120120618939, + "grad_norm": 1.6864407062530518, + "learning_rate": 5.10270478833255e-05, + "loss": 0.7714, + "step": 154540 + }, + { + "epoch": 0.9873758992116326, + "grad_norm": 0.9249199032783508, + "learning_rate": 5.1022031251825306e-05, + "loss": 0.9767, + "step": 154550 + }, + { + "epoch": 0.9874397863613713, + "grad_norm": 0.6928601264953613, + "learning_rate": 5.101701461003238e-05, + "loss": 0.965, + "step": 154560 + }, + { + "epoch": 0.98750367351111, + "grad_norm": 1.4231996536254883, + "learning_rate": 5.101199795799723e-05, + "loss": 1.1854, + "step": 154570 + }, + { + "epoch": 0.9875675606608487, + "grad_norm": 1.1377532482147217, + "learning_rate": 5.1006981295770376e-05, + "loss": 0.8789, + "step": 154580 + }, + { + "epoch": 0.9876314478105874, + "grad_norm": 0.6748983860015869, + "learning_rate": 5.100196462340236e-05, + "loss": 0.8104, + "step": 154590 + }, + { + "epoch": 0.9876953349603261, + "grad_norm": 1.366390585899353, + "learning_rate": 5.0996947940943695e-05, + "loss": 1.0198, + "step": 154600 + }, + { + "epoch": 0.9877592221100647, + "grad_norm": 1.1231287717819214, + "learning_rate": 5.09919312484449e-05, + "loss": 0.9681, + "step": 154610 + }, + { + "epoch": 0.9878231092598034, + "grad_norm": 1.0698585510253906, + "learning_rate": 5.09869145459565e-05, + "loss": 0.7381, + "step": 154620 + }, + { + "epoch": 0.9878869964095421, + "grad_norm": 
0.8807600736618042, + "learning_rate": 5.098189783352901e-05, + "loss": 0.8386, + "step": 154630 + }, + { + "epoch": 0.9879508835592808, + "grad_norm": 0.9870644211769104, + "learning_rate": 5.097688111121296e-05, + "loss": 0.7578, + "step": 154640 + }, + { + "epoch": 0.9880147707090196, + "grad_norm": 0.8701372742652893, + "learning_rate": 5.097186437905887e-05, + "loss": 0.9337, + "step": 154650 + }, + { + "epoch": 0.9880786578587583, + "grad_norm": 0.9452102780342102, + "learning_rate": 5.0966847637117275e-05, + "loss": 0.94, + "step": 154660 + }, + { + "epoch": 0.988142545008497, + "grad_norm": 1.2239512205123901, + "learning_rate": 5.096183088543869e-05, + "loss": 0.886, + "step": 154670 + }, + { + "epoch": 0.9882064321582357, + "grad_norm": 0.7913658022880554, + "learning_rate": 5.095681412407363e-05, + "loss": 0.9082, + "step": 154680 + }, + { + "epoch": 0.9882703193079744, + "grad_norm": 1.3978636264801025, + "learning_rate": 5.095179735307263e-05, + "loss": 0.8123, + "step": 154690 + }, + { + "epoch": 0.9883342064577131, + "grad_norm": 1.0464966297149658, + "learning_rate": 5.0946780572486194e-05, + "loss": 0.8111, + "step": 154700 + }, + { + "epoch": 0.9883980936074518, + "grad_norm": 0.7706538438796997, + "learning_rate": 5.094176378236487e-05, + "loss": 0.8334, + "step": 154710 + }, + { + "epoch": 0.9884619807571905, + "grad_norm": 0.8449127674102783, + "learning_rate": 5.0936746982759164e-05, + "loss": 0.8343, + "step": 154720 + }, + { + "epoch": 0.9885258679069292, + "grad_norm": 2.550860643386841, + "learning_rate": 5.093173017371961e-05, + "loss": 1.1222, + "step": 154730 + }, + { + "epoch": 0.9885897550566679, + "grad_norm": 0.8664345741271973, + "learning_rate": 5.0926713355296715e-05, + "loss": 0.9487, + "step": 154740 + }, + { + "epoch": 0.9886536422064066, + "grad_norm": 1.1151171922683716, + "learning_rate": 5.092169652754103e-05, + "loss": 0.8512, + "step": 154750 + }, + { + "epoch": 0.9887175293561453, + "grad_norm": 1.0279850959777832, + 
"learning_rate": 5.091667969050304e-05, + "loss": 0.9394, + "step": 154760 + }, + { + "epoch": 0.988781416505884, + "grad_norm": 0.8667672276496887, + "learning_rate": 5.091166284423332e-05, + "loss": 0.9599, + "step": 154770 + }, + { + "epoch": 0.9888453036556227, + "grad_norm": 0.6669974327087402, + "learning_rate": 5.0906645988782354e-05, + "loss": 0.7711, + "step": 154780 + }, + { + "epoch": 0.9889091908053614, + "grad_norm": 0.7633401155471802, + "learning_rate": 5.090162912420068e-05, + "loss": 0.8466, + "step": 154790 + }, + { + "epoch": 0.9889730779551001, + "grad_norm": 1.8261069059371948, + "learning_rate": 5.089661225053882e-05, + "loss": 0.7971, + "step": 154800 + }, + { + "epoch": 0.9890369651048388, + "grad_norm": 0.9991775155067444, + "learning_rate": 5.08915953678473e-05, + "loss": 1.0186, + "step": 154810 + }, + { + "epoch": 0.9891008522545776, + "grad_norm": 0.7954165935516357, + "learning_rate": 5.088657847617666e-05, + "loss": 0.9212, + "step": 154820 + }, + { + "epoch": 0.9891647394043163, + "grad_norm": 1.120200753211975, + "learning_rate": 5.0881561575577384e-05, + "loss": 0.7709, + "step": 154830 + }, + { + "epoch": 0.989228626554055, + "grad_norm": 0.8321331143379211, + "learning_rate": 5.0876544666100035e-05, + "loss": 1.0257, + "step": 154840 + }, + { + "epoch": 0.9892925137037936, + "grad_norm": 0.7614843249320984, + "learning_rate": 5.087152774779511e-05, + "loss": 1.0545, + "step": 154850 + }, + { + "epoch": 0.9893564008535323, + "grad_norm": 1.8227176666259766, + "learning_rate": 5.086651082071315e-05, + "loss": 0.8129, + "step": 154860 + }, + { + "epoch": 0.989420288003271, + "grad_norm": 0.577085018157959, + "learning_rate": 5.0861493884904686e-05, + "loss": 0.8638, + "step": 154870 + }, + { + "epoch": 0.9894841751530097, + "grad_norm": 0.9807033538818359, + "learning_rate": 5.0856476940420225e-05, + "loss": 0.9116, + "step": 154880 + }, + { + "epoch": 0.9895480623027484, + "grad_norm": 3.173016309738159, + "learning_rate": 
5.0851459987310304e-05, + "loss": 0.675, + "step": 154890 + }, + { + "epoch": 0.9896119494524871, + "grad_norm": 4.230893611907959, + "learning_rate": 5.084644302562544e-05, + "loss": 0.7969, + "step": 154900 + }, + { + "epoch": 0.9896758366022258, + "grad_norm": 0.7749008536338806, + "learning_rate": 5.0841426055416164e-05, + "loss": 0.7723, + "step": 154910 + }, + { + "epoch": 0.9897397237519645, + "grad_norm": 1.0113115310668945, + "learning_rate": 5.083640907673299e-05, + "loss": 1.0151, + "step": 154920 + }, + { + "epoch": 0.9898036109017032, + "grad_norm": 3.3041319847106934, + "learning_rate": 5.083139208962646e-05, + "loss": 0.8368, + "step": 154930 + }, + { + "epoch": 0.9898674980514419, + "grad_norm": 0.8300958871841431, + "learning_rate": 5.082637509414709e-05, + "loss": 0.6277, + "step": 154940 + }, + { + "epoch": 0.9899313852011806, + "grad_norm": 0.720481276512146, + "learning_rate": 5.0821358090345414e-05, + "loss": 0.8199, + "step": 154950 + }, + { + "epoch": 0.9899952723509193, + "grad_norm": 0.7711173295974731, + "learning_rate": 5.081634107827196e-05, + "loss": 0.9664, + "step": 154960 + }, + { + "epoch": 0.990059159500658, + "grad_norm": 1.2588512897491455, + "learning_rate": 5.081132405797724e-05, + "loss": 0.8738, + "step": 154970 + }, + { + "epoch": 0.9901230466503967, + "grad_norm": 0.7315512895584106, + "learning_rate": 5.080630702951178e-05, + "loss": 0.9661, + "step": 154980 + }, + { + "epoch": 0.9901869338001354, + "grad_norm": 0.7708696722984314, + "learning_rate": 5.0801289992926106e-05, + "loss": 0.7121, + "step": 154990 + }, + { + "epoch": 0.9902508209498742, + "grad_norm": 1.7888171672821045, + "learning_rate": 5.079627294827075e-05, + "loss": 0.9311, + "step": 155000 + }, + { + "epoch": 0.9903147080996129, + "grad_norm": 0.5545375943183899, + "learning_rate": 5.0791255895596246e-05, + "loss": 1.0415, + "step": 155010 + }, + { + "epoch": 0.9903785952493516, + "grad_norm": 0.8352196216583252, + "learning_rate": 5.07862388349531e-05, 
+ "loss": 0.9962, + "step": 155020 + }, + { + "epoch": 0.9904424823990903, + "grad_norm": 0.9001242518424988, + "learning_rate": 5.0781221766391865e-05, + "loss": 0.8372, + "step": 155030 + }, + { + "epoch": 0.990506369548829, + "grad_norm": 0.76966392993927, + "learning_rate": 5.0776204689963035e-05, + "loss": 0.8428, + "step": 155040 + }, + { + "epoch": 0.9905702566985677, + "grad_norm": 1.0977442264556885, + "learning_rate": 5.0771187605717154e-05, + "loss": 0.7625, + "step": 155050 + }, + { + "epoch": 0.9906341438483064, + "grad_norm": 0.6730074882507324, + "learning_rate": 5.076617051370476e-05, + "loss": 0.8101, + "step": 155060 + }, + { + "epoch": 0.9906980309980451, + "grad_norm": 0.7701064348220825, + "learning_rate": 5.076115341397636e-05, + "loss": 1.0062, + "step": 155070 + }, + { + "epoch": 0.9907619181477838, + "grad_norm": 0.697149932384491, + "learning_rate": 5.075613630658247e-05, + "loss": 0.9009, + "step": 155080 + }, + { + "epoch": 0.9908258052975225, + "grad_norm": 1.0090751647949219, + "learning_rate": 5.075111919157364e-05, + "loss": 0.6531, + "step": 155090 + }, + { + "epoch": 0.9908896924472611, + "grad_norm": 1.064965844154358, + "learning_rate": 5.07461020690004e-05, + "loss": 0.855, + "step": 155100 + }, + { + "epoch": 0.9909535795969998, + "grad_norm": 0.9386013150215149, + "learning_rate": 5.0741084938913265e-05, + "loss": 1.009, + "step": 155110 + }, + { + "epoch": 0.9910174667467385, + "grad_norm": 0.9192590117454529, + "learning_rate": 5.0736067801362754e-05, + "loss": 0.9065, + "step": 155120 + }, + { + "epoch": 0.9910813538964772, + "grad_norm": 0.8243198394775391, + "learning_rate": 5.073105065639942e-05, + "loss": 0.8052, + "step": 155130 + }, + { + "epoch": 0.9911452410462159, + "grad_norm": 0.8356258273124695, + "learning_rate": 5.072603350407376e-05, + "loss": 0.8429, + "step": 155140 + }, + { + "epoch": 0.9912091281959546, + "grad_norm": 0.962837815284729, + "learning_rate": 5.0721016344436314e-05, + "loss": 0.9432, + 
"step": 155150 + }, + { + "epoch": 0.9912730153456933, + "grad_norm": 0.7561596035957336, + "learning_rate": 5.071599917753761e-05, + "loss": 1.0909, + "step": 155160 + }, + { + "epoch": 0.991336902495432, + "grad_norm": 0.8569062352180481, + "learning_rate": 5.0710982003428187e-05, + "loss": 0.7786, + "step": 155170 + }, + { + "epoch": 0.9914007896451708, + "grad_norm": 1.1719166040420532, + "learning_rate": 5.0705964822158544e-05, + "loss": 0.7228, + "step": 155180 + }, + { + "epoch": 0.9914646767949095, + "grad_norm": 0.7665051221847534, + "learning_rate": 5.070094763377924e-05, + "loss": 0.8832, + "step": 155190 + }, + { + "epoch": 0.9915285639446482, + "grad_norm": 0.8756385445594788, + "learning_rate": 5.0695930438340776e-05, + "loss": 0.9353, + "step": 155200 + }, + { + "epoch": 0.9915924510943869, + "grad_norm": 1.0932066440582275, + "learning_rate": 5.069091323589369e-05, + "loss": 0.9989, + "step": 155210 + }, + { + "epoch": 0.9916563382441256, + "grad_norm": 1.00384521484375, + "learning_rate": 5.0685896026488514e-05, + "loss": 0.7546, + "step": 155220 + }, + { + "epoch": 0.9917202253938643, + "grad_norm": 1.111561894416809, + "learning_rate": 5.068087881017577e-05, + "loss": 1.0, + "step": 155230 + }, + { + "epoch": 0.991784112543603, + "grad_norm": 1.0187937021255493, + "learning_rate": 5.067586158700599e-05, + "loss": 0.8705, + "step": 155240 + }, + { + "epoch": 0.9918479996933417, + "grad_norm": 0.9533101320266724, + "learning_rate": 5.06708443570297e-05, + "loss": 1.0491, + "step": 155250 + }, + { + "epoch": 0.9919118868430804, + "grad_norm": 0.7527496218681335, + "learning_rate": 5.066582712029743e-05, + "loss": 0.852, + "step": 155260 + }, + { + "epoch": 0.9919757739928191, + "grad_norm": 0.5917440056800842, + "learning_rate": 5.0660809876859694e-05, + "loss": 0.9962, + "step": 155270 + }, + { + "epoch": 0.9920396611425578, + "grad_norm": 1.4870597124099731, + "learning_rate": 5.065579262676704e-05, + "loss": 0.8788, + "step": 155280 + }, + { + 
"epoch": 0.9921035482922965, + "grad_norm": 0.7296018004417419, + "learning_rate": 5.0650775370069966e-05, + "loss": 0.8849, + "step": 155290 + }, + { + "epoch": 0.9921674354420352, + "grad_norm": 1.0578160285949707, + "learning_rate": 5.0645758106819055e-05, + "loss": 0.7862, + "step": 155300 + }, + { + "epoch": 0.9922313225917739, + "grad_norm": 0.9313512444496155, + "learning_rate": 5.064074083706478e-05, + "loss": 0.7519, + "step": 155310 + }, + { + "epoch": 0.9922952097415126, + "grad_norm": 0.8470985293388367, + "learning_rate": 5.063572356085769e-05, + "loss": 0.9872, + "step": 155320 + }, + { + "epoch": 0.9923590968912513, + "grad_norm": 0.9447453618049622, + "learning_rate": 5.063070627824833e-05, + "loss": 1.0726, + "step": 155330 + }, + { + "epoch": 0.9924229840409899, + "grad_norm": 0.7884403467178345, + "learning_rate": 5.0625688989287204e-05, + "loss": 1.1167, + "step": 155340 + }, + { + "epoch": 0.9924868711907286, + "grad_norm": 1.2746003866195679, + "learning_rate": 5.0620671694024836e-05, + "loss": 1.0695, + "step": 155350 + }, + { + "epoch": 0.9925507583404674, + "grad_norm": 1.0743809938430786, + "learning_rate": 5.061565439251178e-05, + "loss": 0.9215, + "step": 155360 + }, + { + "epoch": 0.9926146454902061, + "grad_norm": 0.7811270952224731, + "learning_rate": 5.061063708479855e-05, + "loss": 0.8182, + "step": 155370 + }, + { + "epoch": 0.9926785326399448, + "grad_norm": 1.0729076862335205, + "learning_rate": 5.060561977093568e-05, + "loss": 0.7123, + "step": 155380 + }, + { + "epoch": 0.9927424197896835, + "grad_norm": 0.9297366142272949, + "learning_rate": 5.060060245097368e-05, + "loss": 0.9888, + "step": 155390 + }, + { + "epoch": 0.9928063069394222, + "grad_norm": 2.0715694427490234, + "learning_rate": 5.059558512496311e-05, + "loss": 0.9501, + "step": 155400 + }, + { + "epoch": 0.9928701940891609, + "grad_norm": 0.7811942100524902, + "learning_rate": 5.059056779295447e-05, + "loss": 0.8314, + "step": 155410 + }, + { + "epoch": 
0.9929340812388996, + "grad_norm": 0.4893457591533661, + "learning_rate": 5.058555045499831e-05, + "loss": 1.1216, + "step": 155420 + }, + { + "epoch": 0.9929979683886383, + "grad_norm": 0.7355089783668518, + "learning_rate": 5.058053311114515e-05, + "loss": 0.9711, + "step": 155430 + }, + { + "epoch": 0.993061855538377, + "grad_norm": 0.8681902289390564, + "learning_rate": 5.057551576144551e-05, + "loss": 0.9379, + "step": 155440 + }, + { + "epoch": 0.9931257426881157, + "grad_norm": 1.1609843969345093, + "learning_rate": 5.0570498405949926e-05, + "loss": 0.8046, + "step": 155450 + }, + { + "epoch": 0.9931896298378544, + "grad_norm": 0.9073358774185181, + "learning_rate": 5.056548104470894e-05, + "loss": 0.8837, + "step": 155460 + }, + { + "epoch": 0.9932535169875931, + "grad_norm": 0.7544703483581543, + "learning_rate": 5.056046367777306e-05, + "loss": 1.0009, + "step": 155470 + }, + { + "epoch": 0.9933174041373318, + "grad_norm": 2.039977788925171, + "learning_rate": 5.055544630519284e-05, + "loss": 0.736, + "step": 155480 + }, + { + "epoch": 0.9933812912870705, + "grad_norm": 0.5482000112533569, + "learning_rate": 5.055042892701879e-05, + "loss": 0.7449, + "step": 155490 + }, + { + "epoch": 0.9934451784368092, + "grad_norm": 1.062432050704956, + "learning_rate": 5.054541154330145e-05, + "loss": 0.9661, + "step": 155500 + }, + { + "epoch": 0.993509065586548, + "grad_norm": 0.7228785753250122, + "learning_rate": 5.054039415409133e-05, + "loss": 1.0069, + "step": 155510 + }, + { + "epoch": 0.9935729527362867, + "grad_norm": 0.7066873908042908, + "learning_rate": 5.053537675943899e-05, + "loss": 0.911, + "step": 155520 + }, + { + "epoch": 0.9936368398860254, + "grad_norm": 1.2193684577941895, + "learning_rate": 5.053035935939493e-05, + "loss": 0.8247, + "step": 155530 + }, + { + "epoch": 0.9937007270357641, + "grad_norm": 0.8418088555335999, + "learning_rate": 5.05253419540097e-05, + "loss": 1.0304, + "step": 155540 + }, + { + "epoch": 0.9937646141855028, + 
"grad_norm": 0.9440509080886841, + "learning_rate": 5.052032454333383e-05, + "loss": 0.7883, + "step": 155550 + }, + { + "epoch": 0.9938285013352415, + "grad_norm": 2.196873426437378, + "learning_rate": 5.051530712741783e-05, + "loss": 0.7575, + "step": 155560 + }, + { + "epoch": 0.9938923884849802, + "grad_norm": 0.6164715886116028, + "learning_rate": 5.051028970631224e-05, + "loss": 0.9493, + "step": 155570 + }, + { + "epoch": 0.9939562756347188, + "grad_norm": 0.6802273392677307, + "learning_rate": 5.05052722800676e-05, + "loss": 0.7849, + "step": 155580 + }, + { + "epoch": 0.9940201627844575, + "grad_norm": 0.7959185242652893, + "learning_rate": 5.0500254848734415e-05, + "loss": 0.9129, + "step": 155590 + }, + { + "epoch": 0.9940840499341962, + "grad_norm": 0.7099493741989136, + "learning_rate": 5.049523741236325e-05, + "loss": 0.9131, + "step": 155600 + }, + { + "epoch": 0.9941479370839349, + "grad_norm": 1.0658419132232666, + "learning_rate": 5.049021997100459e-05, + "loss": 0.7746, + "step": 155610 + }, + { + "epoch": 0.9942118242336736, + "grad_norm": 1.2376176118850708, + "learning_rate": 5.048520252470901e-05, + "loss": 1.0721, + "step": 155620 + }, + { + "epoch": 0.9942757113834123, + "grad_norm": 3.2693676948547363, + "learning_rate": 5.048018507352702e-05, + "loss": 0.9045, + "step": 155630 + }, + { + "epoch": 0.994339598533151, + "grad_norm": 1.1164909601211548, + "learning_rate": 5.047516761750915e-05, + "loss": 0.8043, + "step": 155640 + }, + { + "epoch": 0.9944034856828897, + "grad_norm": 2.0805160999298096, + "learning_rate": 5.0470150156705933e-05, + "loss": 0.8283, + "step": 155650 + }, + { + "epoch": 0.9944673728326284, + "grad_norm": 1.3686854839324951, + "learning_rate": 5.0465132691167894e-05, + "loss": 0.7677, + "step": 155660 + }, + { + "epoch": 0.9945312599823671, + "grad_norm": 0.9186270833015442, + "learning_rate": 5.046011522094556e-05, + "loss": 0.8535, + "step": 155670 + }, + { + "epoch": 0.9945951471321058, + "grad_norm": 
1.0402723550796509, + "learning_rate": 5.045509774608947e-05, + "loss": 0.7879, + "step": 155680 + }, + { + "epoch": 0.9946590342818445, + "grad_norm": 0.8195865154266357, + "learning_rate": 5.0450080266650165e-05, + "loss": 0.7122, + "step": 155690 + }, + { + "epoch": 0.9947229214315833, + "grad_norm": 1.0943642854690552, + "learning_rate": 5.0445062782678154e-05, + "loss": 0.9818, + "step": 155700 + }, + { + "epoch": 0.994786808581322, + "grad_norm": 0.9934229850769043, + "learning_rate": 5.044004529422397e-05, + "loss": 0.8661, + "step": 155710 + }, + { + "epoch": 0.9948506957310607, + "grad_norm": 1.000313639640808, + "learning_rate": 5.0435027801338164e-05, + "loss": 0.9347, + "step": 155720 + }, + { + "epoch": 0.9949145828807994, + "grad_norm": 0.7824245691299438, + "learning_rate": 5.043001030407124e-05, + "loss": 0.6195, + "step": 155730 + }, + { + "epoch": 0.9949784700305381, + "grad_norm": 0.848616898059845, + "learning_rate": 5.042499280247373e-05, + "loss": 0.6459, + "step": 155740 + }, + { + "epoch": 0.9950423571802768, + "grad_norm": 1.090847373008728, + "learning_rate": 5.04199752965962e-05, + "loss": 0.7625, + "step": 155750 + }, + { + "epoch": 0.9951062443300155, + "grad_norm": 1.3222191333770752, + "learning_rate": 5.041495778648914e-05, + "loss": 1.0372, + "step": 155760 + }, + { + "epoch": 0.9951701314797542, + "grad_norm": 1.4843127727508545, + "learning_rate": 5.0409940272203093e-05, + "loss": 0.7487, + "step": 155770 + }, + { + "epoch": 0.9952340186294929, + "grad_norm": 1.1687159538269043, + "learning_rate": 5.040492275378861e-05, + "loss": 0.823, + "step": 155780 + }, + { + "epoch": 0.9952979057792316, + "grad_norm": 0.8214207887649536, + "learning_rate": 5.039990523129618e-05, + "loss": 0.7828, + "step": 155790 + }, + { + "epoch": 0.9953617929289703, + "grad_norm": 1.5629782676696777, + "learning_rate": 5.0394887704776385e-05, + "loss": 0.7958, + "step": 155800 + }, + { + "epoch": 0.995425680078709, + "grad_norm": 1.2977160215377808, + 
"learning_rate": 5.038987017427971e-05, + "loss": 0.9585, + "step": 155810 + }, + { + "epoch": 0.9954895672284477, + "grad_norm": 0.7463441491127014, + "learning_rate": 5.0384852639856706e-05, + "loss": 0.7156, + "step": 155820 + }, + { + "epoch": 0.9955534543781863, + "grad_norm": 0.878885805606842, + "learning_rate": 5.037983510155791e-05, + "loss": 0.7061, + "step": 155830 + }, + { + "epoch": 0.995617341527925, + "grad_norm": 0.9723998308181763, + "learning_rate": 5.037481755943385e-05, + "loss": 1.1002, + "step": 155840 + }, + { + "epoch": 0.9956812286776637, + "grad_norm": 0.8004569411277771, + "learning_rate": 5.036980001353504e-05, + "loss": 0.7974, + "step": 155850 + }, + { + "epoch": 0.9957451158274024, + "grad_norm": 0.9547368288040161, + "learning_rate": 5.036478246391203e-05, + "loss": 1.1055, + "step": 155860 + }, + { + "epoch": 0.9958090029771411, + "grad_norm": 1.7963746786117554, + "learning_rate": 5.035976491061535e-05, + "loss": 0.9589, + "step": 155870 + }, + { + "epoch": 0.9958728901268799, + "grad_norm": 0.7605364918708801, + "learning_rate": 5.035474735369552e-05, + "loss": 1.0215, + "step": 155880 + }, + { + "epoch": 0.9959367772766186, + "grad_norm": 0.868885338306427, + "learning_rate": 5.0349729793203085e-05, + "loss": 1.1309, + "step": 155890 + }, + { + "epoch": 0.9960006644263573, + "grad_norm": 0.8125676512718201, + "learning_rate": 5.034471222918856e-05, + "loss": 0.7623, + "step": 155900 + }, + { + "epoch": 0.996064551576096, + "grad_norm": 0.9288298487663269, + "learning_rate": 5.033969466170248e-05, + "loss": 0.9507, + "step": 155910 + }, + { + "epoch": 0.9961284387258347, + "grad_norm": 1.1834713220596313, + "learning_rate": 5.033467709079539e-05, + "loss": 0.8471, + "step": 155920 + }, + { + "epoch": 0.9961923258755734, + "grad_norm": 0.5627526640892029, + "learning_rate": 5.032965951651781e-05, + "loss": 0.8455, + "step": 155930 + }, + { + "epoch": 0.9962562130253121, + "grad_norm": 1.100093126296997, + "learning_rate": 
5.032464193892028e-05, + "loss": 0.7466, + "step": 155940 + }, + { + "epoch": 0.9963201001750508, + "grad_norm": 1.0520758628845215, + "learning_rate": 5.031962435805332e-05, + "loss": 0.8332, + "step": 155950 + }, + { + "epoch": 0.9963839873247895, + "grad_norm": 1.0780564546585083, + "learning_rate": 5.0314606773967456e-05, + "loss": 1.1039, + "step": 155960 + }, + { + "epoch": 0.9964478744745282, + "grad_norm": 2.222808361053467, + "learning_rate": 5.0309589186713235e-05, + "loss": 0.8462, + "step": 155970 + }, + { + "epoch": 0.9965117616242669, + "grad_norm": 0.8251408338546753, + "learning_rate": 5.030457159634118e-05, + "loss": 0.7902, + "step": 155980 + }, + { + "epoch": 0.9965756487740056, + "grad_norm": 0.9923737645149231, + "learning_rate": 5.029955400290183e-05, + "loss": 0.9637, + "step": 155990 + }, + { + "epoch": 0.9966395359237443, + "grad_norm": 0.7754630446434021, + "learning_rate": 5.029453640644571e-05, + "loss": 1.025, + "step": 156000 + }, + { + "epoch": 0.996703423073483, + "grad_norm": 0.49166470766067505, + "learning_rate": 5.028951880702336e-05, + "loss": 0.709, + "step": 156010 + }, + { + "epoch": 0.9967673102232217, + "grad_norm": 0.7618236541748047, + "learning_rate": 5.028450120468531e-05, + "loss": 0.9728, + "step": 156020 + }, + { + "epoch": 0.9968311973729604, + "grad_norm": 2.817028760910034, + "learning_rate": 5.027948359948209e-05, + "loss": 0.9221, + "step": 156030 + }, + { + "epoch": 0.9968950845226991, + "grad_norm": 0.8261348009109497, + "learning_rate": 5.027446599146421e-05, + "loss": 0.9886, + "step": 156040 + }, + { + "epoch": 0.9969589716724379, + "grad_norm": 0.749161958694458, + "learning_rate": 5.026944838068223e-05, + "loss": 0.9215, + "step": 156050 + }, + { + "epoch": 0.9970228588221766, + "grad_norm": 1.2337532043457031, + "learning_rate": 5.026443076718666e-05, + "loss": 0.9563, + "step": 156060 + }, + { + "epoch": 0.9970867459719152, + "grad_norm": 0.6758466362953186, + "learning_rate": 5.0259413151028066e-05, + 
"loss": 0.6758, + "step": 156070 + }, + { + "epoch": 0.9971506331216539, + "grad_norm": 1.36204993724823, + "learning_rate": 5.0254395532256935e-05, + "loss": 0.948, + "step": 156080 + }, + { + "epoch": 0.9972145202713926, + "grad_norm": 0.7348865866661072, + "learning_rate": 5.0249377910923834e-05, + "loss": 0.9833, + "step": 156090 + }, + { + "epoch": 0.9972784074211313, + "grad_norm": 0.6744032502174377, + "learning_rate": 5.0244360287079287e-05, + "loss": 0.9639, + "step": 156100 + }, + { + "epoch": 0.99734229457087, + "grad_norm": 0.9378622174263, + "learning_rate": 5.0239342660773804e-05, + "loss": 0.8584, + "step": 156110 + }, + { + "epoch": 0.9974061817206087, + "grad_norm": 1.2324236631393433, + "learning_rate": 5.023432503205794e-05, + "loss": 0.9609, + "step": 156120 + }, + { + "epoch": 0.9974700688703474, + "grad_norm": 0.9098687171936035, + "learning_rate": 5.0229307400982215e-05, + "loss": 1.008, + "step": 156130 + }, + { + "epoch": 0.9975339560200861, + "grad_norm": 0.9950636029243469, + "learning_rate": 5.0224289767597164e-05, + "loss": 0.892, + "step": 156140 + }, + { + "epoch": 0.9975978431698248, + "grad_norm": 0.6629953980445862, + "learning_rate": 5.021927213195333e-05, + "loss": 0.7111, + "step": 156150 + }, + { + "epoch": 0.9976617303195635, + "grad_norm": 0.7785073518753052, + "learning_rate": 5.021425449410123e-05, + "loss": 0.7638, + "step": 156160 + }, + { + "epoch": 0.9977256174693022, + "grad_norm": 0.6204543113708496, + "learning_rate": 5.0209236854091414e-05, + "loss": 0.8849, + "step": 156170 + }, + { + "epoch": 0.9977895046190409, + "grad_norm": 0.7661494016647339, + "learning_rate": 5.020421921197439e-05, + "loss": 0.7041, + "step": 156180 + }, + { + "epoch": 0.9978533917687796, + "grad_norm": 1.0413057804107666, + "learning_rate": 5.0199201567800704e-05, + "loss": 0.8334, + "step": 156190 + }, + { + "epoch": 0.9979172789185183, + "grad_norm": 1.1320688724517822, + "learning_rate": 5.0194183921620895e-05, + "loss": 0.8678, + 
"step": 156200 + }, + { + "epoch": 0.997981166068257, + "grad_norm": 1.0221413373947144, + "learning_rate": 5.0189166273485476e-05, + "loss": 0.7552, + "step": 156210 + }, + { + "epoch": 0.9980450532179957, + "grad_norm": 0.6623107194900513, + "learning_rate": 5.018414862344499e-05, + "loss": 0.8624, + "step": 156220 + }, + { + "epoch": 0.9981089403677345, + "grad_norm": 1.305783748626709, + "learning_rate": 5.017913097154997e-05, + "loss": 0.8993, + "step": 156230 + }, + { + "epoch": 0.9981728275174732, + "grad_norm": 1.0739842653274536, + "learning_rate": 5.017411331785094e-05, + "loss": 0.8799, + "step": 156240 + }, + { + "epoch": 0.9982367146672119, + "grad_norm": 0.6890377998352051, + "learning_rate": 5.016909566239846e-05, + "loss": 0.777, + "step": 156250 + }, + { + "epoch": 0.9983006018169506, + "grad_norm": 1.3199329376220703, + "learning_rate": 5.016407800524302e-05, + "loss": 1.0892, + "step": 156260 + }, + { + "epoch": 0.9983644889666893, + "grad_norm": 1.2792729139328003, + "learning_rate": 5.015906034643517e-05, + "loss": 0.7636, + "step": 156270 + }, + { + "epoch": 0.998428376116428, + "grad_norm": 1.0154145956039429, + "learning_rate": 5.015404268602547e-05, + "loss": 1.0007, + "step": 156280 + }, + { + "epoch": 0.9984922632661667, + "grad_norm": 1.130146861076355, + "learning_rate": 5.014902502406441e-05, + "loss": 0.8497, + "step": 156290 + }, + { + "epoch": 0.9985561504159054, + "grad_norm": 0.9147141575813293, + "learning_rate": 5.014400736060252e-05, + "loss": 1.0122, + "step": 156300 + }, + { + "epoch": 0.998620037565644, + "grad_norm": 2.783782720565796, + "learning_rate": 5.013898969569038e-05, + "loss": 1.0387, + "step": 156310 + }, + { + "epoch": 0.9986839247153827, + "grad_norm": 1.1431195735931396, + "learning_rate": 5.013397202937847e-05, + "loss": 1.1868, + "step": 156320 + }, + { + "epoch": 0.9987478118651214, + "grad_norm": 0.596916139125824, + "learning_rate": 5.0128954361717365e-05, + "loss": 0.841, + "step": 156330 + }, + { + 
"epoch": 0.9988116990148601, + "grad_norm": 0.7760692238807678, + "learning_rate": 5.0123936692757566e-05, + "loss": 0.821, + "step": 156340 + }, + { + "epoch": 0.9988755861645988, + "grad_norm": 0.8499806523323059, + "learning_rate": 5.011891902254963e-05, + "loss": 1.0573, + "step": 156350 + }, + { + "epoch": 0.9989394733143375, + "grad_norm": 0.9884029626846313, + "learning_rate": 5.0113901351144065e-05, + "loss": 1.0637, + "step": 156360 + }, + { + "epoch": 0.9990033604640762, + "grad_norm": 0.7256179451942444, + "learning_rate": 5.0108883678591424e-05, + "loss": 1.0313, + "step": 156370 + }, + { + "epoch": 0.9990672476138149, + "grad_norm": 1.0645331144332886, + "learning_rate": 5.010386600494222e-05, + "loss": 1.0018, + "step": 156380 + }, + { + "epoch": 0.9991311347635536, + "grad_norm": 1.1328057050704956, + "learning_rate": 5.0098848330247006e-05, + "loss": 0.9414, + "step": 156390 + }, + { + "epoch": 0.9991950219132923, + "grad_norm": 0.8693520426750183, + "learning_rate": 5.00938306545563e-05, + "loss": 0.7726, + "step": 156400 + }, + { + "epoch": 0.999258909063031, + "grad_norm": 1.004093050956726, + "learning_rate": 5.008881297792063e-05, + "loss": 0.7736, + "step": 156410 + }, + { + "epoch": 0.9993227962127698, + "grad_norm": 0.7669504284858704, + "learning_rate": 5.008379530039055e-05, + "loss": 1.1268, + "step": 156420 + }, + { + "epoch": 0.9993866833625085, + "grad_norm": 0.9872044324874878, + "learning_rate": 5.007877762201657e-05, + "loss": 0.7277, + "step": 156430 + }, + { + "epoch": 0.9994505705122472, + "grad_norm": 0.8186001181602478, + "learning_rate": 5.007375994284923e-05, + "loss": 0.9602, + "step": 156440 + }, + { + "epoch": 0.9995144576619859, + "grad_norm": 0.840390145778656, + "learning_rate": 5.006874226293907e-05, + "loss": 0.8884, + "step": 156450 + }, + { + "epoch": 0.9995783448117246, + "grad_norm": 1.0728955268859863, + "learning_rate": 5.0063724582336614e-05, + "loss": 1.0428, + "step": 156460 + }, + { + "epoch": 
0.9996422319614633, + "grad_norm": 0.8296906352043152, + "learning_rate": 5.005870690109239e-05, + "loss": 0.7635, + "step": 156470 + }, + { + "epoch": 0.999706119111202, + "grad_norm": 0.9049399495124817, + "learning_rate": 5.0053689219256946e-05, + "loss": 0.7707, + "step": 156480 + }, + { + "epoch": 0.9997700062609407, + "grad_norm": 0.6958884000778198, + "learning_rate": 5.0048671536880797e-05, + "loss": 0.6962, + "step": 156490 + }, + { + "epoch": 0.9998338934106794, + "grad_norm": 1.1141598224639893, + "learning_rate": 5.0043653854014486e-05, + "loss": 0.6017, + "step": 156500 + }, + { + "epoch": 0.9998977805604181, + "grad_norm": 1.1344066858291626, + "learning_rate": 5.0038636170708544e-05, + "loss": 0.9523, + "step": 156510 + }, + { + "epoch": 0.9999616677101568, + "grad_norm": 0.8506420850753784, + "learning_rate": 5.003361848701351e-05, + "loss": 0.7334, + "step": 156520 + }, + { + "epoch": 1.0000255548598955, + "grad_norm": 1.0688685178756714, + "learning_rate": 5.00286008029799e-05, + "loss": 0.7696, + "step": 156530 + }, + { + "epoch": 1.0000894420096342, + "grad_norm": 1.3739877939224243, + "learning_rate": 5.002358311865826e-05, + "loss": 1.1261, + "step": 156540 + }, + { + "epoch": 1.000153329159373, + "grad_norm": 1.3839253187179565, + "learning_rate": 5.001856543409911e-05, + "loss": 0.9307, + "step": 156550 + }, + { + "epoch": 1.0002172163091116, + "grad_norm": 1.329694151878357, + "learning_rate": 5.001354774935299e-05, + "loss": 0.6314, + "step": 156560 + }, + { + "epoch": 1.0002811034588504, + "grad_norm": 0.7411394119262695, + "learning_rate": 5.000853006447044e-05, + "loss": 0.9073, + "step": 156570 + }, + { + "epoch": 1.000344990608589, + "grad_norm": 1.0584617853164673, + "learning_rate": 5.000351237950198e-05, + "loss": 0.9411, + "step": 156580 + }, + { + "epoch": 1.0004088777583278, + "grad_norm": 0.8067578673362732, + "learning_rate": 4.999849469449815e-05, + "loss": 0.8204, + "step": 156590 + }, + { + "epoch": 1.0004727649080665, + 
"grad_norm": 0.48932382464408875, + "learning_rate": 4.999347700950948e-05, + "loss": 0.6526, + "step": 156600 + }, + { + "epoch": 1.0005366520578052, + "grad_norm": 1.271622657775879, + "learning_rate": 4.99884593245865e-05, + "loss": 0.9878, + "step": 156610 + }, + { + "epoch": 1.0006005392075439, + "grad_norm": 0.7740110754966736, + "learning_rate": 4.998344163977975e-05, + "loss": 0.8846, + "step": 156620 + }, + { + "epoch": 1.0006644263572826, + "grad_norm": 1.9784252643585205, + "learning_rate": 4.997842395513975e-05, + "loss": 0.8338, + "step": 156630 + }, + { + "epoch": 1.0007283135070213, + "grad_norm": 1.1081469058990479, + "learning_rate": 4.9973406270717044e-05, + "loss": 0.8351, + "step": 156640 + }, + { + "epoch": 1.00079220065676, + "grad_norm": 0.9051830172538757, + "learning_rate": 4.996838858656215e-05, + "loss": 0.754, + "step": 156650 + }, + { + "epoch": 1.0008560878064987, + "grad_norm": 1.0151948928833008, + "learning_rate": 4.996337090272562e-05, + "loss": 0.9473, + "step": 156660 + }, + { + "epoch": 1.0009199749562374, + "grad_norm": 0.955154299736023, + "learning_rate": 4.9958353219257966e-05, + "loss": 1.0262, + "step": 156670 + }, + { + "epoch": 1.000983862105976, + "grad_norm": 0.9474563598632812, + "learning_rate": 4.995333553620974e-05, + "loss": 1.0282, + "step": 156680 + }, + { + "epoch": 1.0010477492557146, + "grad_norm": 0.6811796426773071, + "learning_rate": 4.994831785363147e-05, + "loss": 0.7229, + "step": 156690 + }, + { + "epoch": 1.0011116364054533, + "grad_norm": 1.0170528888702393, + "learning_rate": 4.994330017157368e-05, + "loss": 0.7113, + "step": 156700 + }, + { + "epoch": 1.001175523555192, + "grad_norm": 0.5645203590393066, + "learning_rate": 4.99382824900869e-05, + "loss": 0.7999, + "step": 156710 + }, + { + "epoch": 1.0012394107049307, + "grad_norm": 1.0513136386871338, + "learning_rate": 4.9933264809221674e-05, + "loss": 0.8627, + "step": 156720 + }, + { + "epoch": 1.0013032978546694, + "grad_norm": 
0.749899685382843, + "learning_rate": 4.992824712902853e-05, + "loss": 1.1349, + "step": 156730 + }, + { + "epoch": 1.0013671850044081, + "grad_norm": 1.0058104991912842, + "learning_rate": 4.9923229449558e-05, + "loss": 0.8528, + "step": 156740 + }, + { + "epoch": 1.0014310721541468, + "grad_norm": 1.5371594429016113, + "learning_rate": 4.991821177086061e-05, + "loss": 0.7446, + "step": 156750 + }, + { + "epoch": 1.0014949593038855, + "grad_norm": 5.122760772705078, + "learning_rate": 4.991319409298691e-05, + "loss": 0.7597, + "step": 156760 + }, + { + "epoch": 1.0015588464536243, + "grad_norm": 0.7622484564781189, + "learning_rate": 4.990817641598741e-05, + "loss": 0.9563, + "step": 156770 + }, + { + "epoch": 1.001622733603363, + "grad_norm": 0.8959377408027649, + "learning_rate": 4.990315873991266e-05, + "loss": 0.959, + "step": 156780 + }, + { + "epoch": 1.0016866207531017, + "grad_norm": 0.8473190069198608, + "learning_rate": 4.989814106481318e-05, + "loss": 0.8385, + "step": 156790 + }, + { + "epoch": 1.0017505079028404, + "grad_norm": 0.8622284531593323, + "learning_rate": 4.989312339073951e-05, + "loss": 0.9859, + "step": 156800 + }, + { + "epoch": 1.001814395052579, + "grad_norm": 0.7962250113487244, + "learning_rate": 4.988810571774218e-05, + "loss": 0.7629, + "step": 156810 + }, + { + "epoch": 1.0018782822023178, + "grad_norm": 0.9139593243598938, + "learning_rate": 4.988308804587172e-05, + "loss": 0.7469, + "step": 156820 + }, + { + "epoch": 1.0019421693520565, + "grad_norm": 0.7976312041282654, + "learning_rate": 4.9878070375178666e-05, + "loss": 0.8388, + "step": 156830 + }, + { + "epoch": 1.0020060565017952, + "grad_norm": 0.8629096150398254, + "learning_rate": 4.987305270571355e-05, + "loss": 1.1644, + "step": 156840 + }, + { + "epoch": 1.002069943651534, + "grad_norm": 0.8316608667373657, + "learning_rate": 4.98680350375269e-05, + "loss": 0.9476, + "step": 156850 + }, + { + "epoch": 1.0021338308012726, + "grad_norm": 1.2766749858856201, + 
"learning_rate": 4.9863017370669255e-05, + "loss": 0.6629, + "step": 156860 + }, + { + "epoch": 1.0021977179510113, + "grad_norm": 1.242931604385376, + "learning_rate": 4.985799970519113e-05, + "loss": 0.8247, + "step": 156870 + }, + { + "epoch": 1.00226160510075, + "grad_norm": 0.8744287490844727, + "learning_rate": 4.9852982041143103e-05, + "loss": 0.7827, + "step": 156880 + }, + { + "epoch": 1.0023254922504887, + "grad_norm": 0.8393293619155884, + "learning_rate": 4.984796437857566e-05, + "loss": 0.8903, + "step": 156890 + }, + { + "epoch": 1.0023893794002274, + "grad_norm": 0.9289880394935608, + "learning_rate": 4.984294671753933e-05, + "loss": 0.7764, + "step": 156900 + }, + { + "epoch": 1.0024532665499661, + "grad_norm": 0.5650531053543091, + "learning_rate": 4.983792905808468e-05, + "loss": 0.8134, + "step": 156910 + }, + { + "epoch": 1.0025171536997048, + "grad_norm": 0.8123154640197754, + "learning_rate": 4.983291140026222e-05, + "loss": 0.8212, + "step": 156920 + }, + { + "epoch": 1.0025810408494435, + "grad_norm": 0.8330434560775757, + "learning_rate": 4.982789374412248e-05, + "loss": 0.773, + "step": 156930 + }, + { + "epoch": 1.0026449279991823, + "grad_norm": 1.7509496212005615, + "learning_rate": 4.9822876089716e-05, + "loss": 0.9797, + "step": 156940 + }, + { + "epoch": 1.002708815148921, + "grad_norm": 1.1598998308181763, + "learning_rate": 4.9817858437093315e-05, + "loss": 0.7175, + "step": 156950 + }, + { + "epoch": 1.0027727022986597, + "grad_norm": 0.7346895337104797, + "learning_rate": 4.981284078630496e-05, + "loss": 0.7898, + "step": 156960 + }, + { + "epoch": 1.0028365894483984, + "grad_norm": 1.4007014036178589, + "learning_rate": 4.980782313740145e-05, + "loss": 0.9026, + "step": 156970 + }, + { + "epoch": 1.002900476598137, + "grad_norm": 0.5492518544197083, + "learning_rate": 4.980280549043333e-05, + "loss": 0.9399, + "step": 156980 + }, + { + "epoch": 1.0029643637478758, + "grad_norm": 0.8820534348487854, + "learning_rate": 
4.9797787845451114e-05, + "loss": 0.8288, + "step": 156990 + }, + { + "epoch": 1.0030282508976145, + "grad_norm": 0.8820619583129883, + "learning_rate": 4.979277020250537e-05, + "loss": 0.9111, + "step": 157000 + }, + { + "epoch": 1.0030921380473532, + "grad_norm": 0.825981616973877, + "learning_rate": 4.978775256164661e-05, + "loss": 0.8334, + "step": 157010 + }, + { + "epoch": 1.003156025197092, + "grad_norm": 1.0771020650863647, + "learning_rate": 4.978273492292535e-05, + "loss": 1.1424, + "step": 157020 + }, + { + "epoch": 1.0032199123468306, + "grad_norm": 1.030242681503296, + "learning_rate": 4.977771728639215e-05, + "loss": 0.7994, + "step": 157030 + }, + { + "epoch": 1.0032837994965693, + "grad_norm": 0.6895780563354492, + "learning_rate": 4.9772699652097526e-05, + "loss": 0.9221, + "step": 157040 + }, + { + "epoch": 1.003347686646308, + "grad_norm": 0.8492401242256165, + "learning_rate": 4.976768202009201e-05, + "loss": 1.0848, + "step": 157050 + }, + { + "epoch": 1.0034115737960467, + "grad_norm": 1.2907142639160156, + "learning_rate": 4.976266439042615e-05, + "loss": 1.0327, + "step": 157060 + }, + { + "epoch": 1.0034754609457854, + "grad_norm": 0.6771016120910645, + "learning_rate": 4.975764676315045e-05, + "loss": 0.7145, + "step": 157070 + }, + { + "epoch": 1.0035393480955241, + "grad_norm": 1.0897397994995117, + "learning_rate": 4.975262913831546e-05, + "loss": 1.1423, + "step": 157080 + }, + { + "epoch": 1.0036032352452628, + "grad_norm": 1.0723178386688232, + "learning_rate": 4.974761151597171e-05, + "loss": 0.9393, + "step": 157090 + }, + { + "epoch": 1.0036671223950016, + "grad_norm": 0.8129236698150635, + "learning_rate": 4.974259389616973e-05, + "loss": 0.98, + "step": 157100 + }, + { + "epoch": 1.0037310095447403, + "grad_norm": 0.8235514760017395, + "learning_rate": 4.973757627896005e-05, + "loss": 0.7475, + "step": 157110 + }, + { + "epoch": 1.003794896694479, + "grad_norm": 0.7380703091621399, + "learning_rate": 4.9732558664393214e-05, + 
"loss": 0.8268, + "step": 157120 + }, + { + "epoch": 1.0038587838442177, + "grad_norm": 1.4135018587112427, + "learning_rate": 4.9727541052519736e-05, + "loss": 0.9903, + "step": 157130 + }, + { + "epoch": 1.0039226709939564, + "grad_norm": 1.2729389667510986, + "learning_rate": 4.972252344339015e-05, + "loss": 0.9318, + "step": 157140 + }, + { + "epoch": 1.003986558143695, + "grad_norm": 0.9822022318840027, + "learning_rate": 4.9717505837055e-05, + "loss": 0.859, + "step": 157150 + }, + { + "epoch": 1.0040504452934336, + "grad_norm": 1.236742615699768, + "learning_rate": 4.971248823356481e-05, + "loss": 0.7326, + "step": 157160 + }, + { + "epoch": 1.0041143324431723, + "grad_norm": 1.0240849256515503, + "learning_rate": 4.970747063297011e-05, + "loss": 0.8845, + "step": 157170 + }, + { + "epoch": 1.004178219592911, + "grad_norm": 0.9231441020965576, + "learning_rate": 4.970245303532144e-05, + "loss": 0.7605, + "step": 157180 + }, + { + "epoch": 1.0042421067426497, + "grad_norm": 0.9898058772087097, + "learning_rate": 4.969743544066931e-05, + "loss": 0.8963, + "step": 157190 + }, + { + "epoch": 1.0043059938923884, + "grad_norm": 2.537976026535034, + "learning_rate": 4.969241784906428e-05, + "loss": 0.9316, + "step": 157200 + }, + { + "epoch": 1.004369881042127, + "grad_norm": 1.539099097251892, + "learning_rate": 4.968740026055686e-05, + "loss": 0.9704, + "step": 157210 + }, + { + "epoch": 1.0044337681918658, + "grad_norm": 1.2426973581314087, + "learning_rate": 4.96823826751976e-05, + "loss": 0.9339, + "step": 157220 + }, + { + "epoch": 1.0044976553416045, + "grad_norm": 0.9273771643638611, + "learning_rate": 4.967736509303701e-05, + "loss": 0.788, + "step": 157230 + }, + { + "epoch": 1.0045615424913432, + "grad_norm": 0.931636393070221, + "learning_rate": 4.9672347514125645e-05, + "loss": 0.9634, + "step": 157240 + }, + { + "epoch": 1.004625429641082, + "grad_norm": 2.1478915214538574, + "learning_rate": 4.966732993851402e-05, + "loss": 0.8644, + "step": 157250 + 
}, + { + "epoch": 1.0046893167908206, + "grad_norm": 0.8873687386512756, + "learning_rate": 4.966231236625267e-05, + "loss": 0.7035, + "step": 157260 + }, + { + "epoch": 1.0047532039405593, + "grad_norm": 0.49548423290252686, + "learning_rate": 4.965729479739212e-05, + "loss": 0.8945, + "step": 157270 + }, + { + "epoch": 1.004817091090298, + "grad_norm": 1.0232203006744385, + "learning_rate": 4.965227723198292e-05, + "loss": 0.8334, + "step": 157280 + }, + { + "epoch": 1.0048809782400367, + "grad_norm": 1.0437577962875366, + "learning_rate": 4.964725967007558e-05, + "loss": 0.7539, + "step": 157290 + }, + { + "epoch": 1.0049448653897755, + "grad_norm": 1.0047162771224976, + "learning_rate": 4.964224211172064e-05, + "loss": 1.1266, + "step": 157300 + }, + { + "epoch": 1.0050087525395142, + "grad_norm": 1.6184574365615845, + "learning_rate": 4.9637224556968634e-05, + "loss": 0.7608, + "step": 157310 + }, + { + "epoch": 1.0050726396892529, + "grad_norm": 0.6376124620437622, + "learning_rate": 4.963220700587009e-05, + "loss": 0.6971, + "step": 157320 + }, + { + "epoch": 1.0051365268389916, + "grad_norm": 0.8973642587661743, + "learning_rate": 4.9627189458475544e-05, + "loss": 1.0248, + "step": 157330 + }, + { + "epoch": 1.0052004139887303, + "grad_norm": 0.6999796032905579, + "learning_rate": 4.962217191483552e-05, + "loss": 0.9811, + "step": 157340 + }, + { + "epoch": 1.005264301138469, + "grad_norm": 1.4351537227630615, + "learning_rate": 4.961715437500055e-05, + "loss": 0.8413, + "step": 157350 + }, + { + "epoch": 1.0053281882882077, + "grad_norm": 0.7439177632331848, + "learning_rate": 4.9612136839021165e-05, + "loss": 0.9353, + "step": 157360 + }, + { + "epoch": 1.0053920754379464, + "grad_norm": 1.172404408454895, + "learning_rate": 4.9607119306947915e-05, + "loss": 0.9226, + "step": 157370 + }, + { + "epoch": 1.005455962587685, + "grad_norm": 0.95972740650177, + "learning_rate": 4.96021017788313e-05, + "loss": 0.788, + "step": 157380 + }, + { + "epoch": 
1.0055198497374238, + "grad_norm": 0.9233173727989197, + "learning_rate": 4.959708425472186e-05, + "loss": 1.1035, + "step": 157390 + }, + { + "epoch": 1.0055837368871625, + "grad_norm": 0.7995260953903198, + "learning_rate": 4.959206673467013e-05, + "loss": 0.8307, + "step": 157400 + }, + { + "epoch": 1.0056476240369012, + "grad_norm": 2.598005533218384, + "learning_rate": 4.958704921872665e-05, + "loss": 1.0584, + "step": 157410 + }, + { + "epoch": 1.00571151118664, + "grad_norm": 1.0082064867019653, + "learning_rate": 4.9582031706941936e-05, + "loss": 0.894, + "step": 157420 + }, + { + "epoch": 1.0057753983363786, + "grad_norm": 2.215607166290283, + "learning_rate": 4.957701419936652e-05, + "loss": 1.2318, + "step": 157430 + }, + { + "epoch": 1.0058392854861173, + "grad_norm": 0.9388594627380371, + "learning_rate": 4.9571996696050945e-05, + "loss": 0.9138, + "step": 157440 + }, + { + "epoch": 1.005903172635856, + "grad_norm": 0.8170478343963623, + "learning_rate": 4.9566979197045726e-05, + "loss": 0.7381, + "step": 157450 + }, + { + "epoch": 1.0059670597855948, + "grad_norm": 0.7179467082023621, + "learning_rate": 4.9561961702401405e-05, + "loss": 1.0941, + "step": 157460 + }, + { + "epoch": 1.0060309469353335, + "grad_norm": 0.8564074635505676, + "learning_rate": 4.9556944212168506e-05, + "loss": 1.0778, + "step": 157470 + }, + { + "epoch": 1.0060948340850722, + "grad_norm": 1.0831458568572998, + "learning_rate": 4.955192672639756e-05, + "loss": 1.1512, + "step": 157480 + }, + { + "epoch": 1.0061587212348109, + "grad_norm": 0.6284303665161133, + "learning_rate": 4.9546909245139103e-05, + "loss": 0.875, + "step": 157490 + }, + { + "epoch": 1.0062226083845496, + "grad_norm": 0.7781526446342468, + "learning_rate": 4.954189176844367e-05, + "loss": 0.8538, + "step": 157500 + }, + { + "epoch": 1.0062864955342883, + "grad_norm": 0.9631011486053467, + "learning_rate": 4.9536874296361763e-05, + "loss": 0.8143, + "step": 157510 + }, + { + "epoch": 1.006350382684027, + 
"grad_norm": 1.1078636646270752, + "learning_rate": 4.953185682894395e-05, + "loss": 0.9528, + "step": 157520 + }, + { + "epoch": 1.0064142698337657, + "grad_norm": 0.8949397206306458, + "learning_rate": 4.952683936624074e-05, + "loss": 0.6958, + "step": 157530 + }, + { + "epoch": 1.0064781569835044, + "grad_norm": 0.6079200506210327, + "learning_rate": 4.952182190830266e-05, + "loss": 0.8135, + "step": 157540 + }, + { + "epoch": 1.006542044133243, + "grad_norm": 1.173264503479004, + "learning_rate": 4.951680445518026e-05, + "loss": 0.783, + "step": 157550 + }, + { + "epoch": 1.0066059312829818, + "grad_norm": 0.9052695631980896, + "learning_rate": 4.951178700692404e-05, + "loss": 0.6618, + "step": 157560 + }, + { + "epoch": 1.0066698184327205, + "grad_norm": 0.8281904458999634, + "learning_rate": 4.950676956358456e-05, + "loss": 0.9381, + "step": 157570 + }, + { + "epoch": 1.0067337055824592, + "grad_norm": 0.803945004940033, + "learning_rate": 4.950175212521234e-05, + "loss": 0.8125, + "step": 157580 + }, + { + "epoch": 1.006797592732198, + "grad_norm": 0.8486142158508301, + "learning_rate": 4.94967346918579e-05, + "loss": 0.7433, + "step": 157590 + }, + { + "epoch": 1.0068614798819366, + "grad_norm": 1.5051418542861938, + "learning_rate": 4.9491717263571777e-05, + "loss": 0.8196, + "step": 157600 + }, + { + "epoch": 1.0069253670316753, + "grad_norm": 0.7606010437011719, + "learning_rate": 4.94866998404045e-05, + "loss": 0.7728, + "step": 157610 + }, + { + "epoch": 1.006989254181414, + "grad_norm": 0.7703734040260315, + "learning_rate": 4.94816824224066e-05, + "loss": 0.83, + "step": 157620 + }, + { + "epoch": 1.0070531413311528, + "grad_norm": 0.6582179665565491, + "learning_rate": 4.947666500962861e-05, + "loss": 0.8594, + "step": 157630 + }, + { + "epoch": 1.0071170284808915, + "grad_norm": 0.9291526079177856, + "learning_rate": 4.947164760212106e-05, + "loss": 0.802, + "step": 157640 + }, + { + "epoch": 1.00718091563063, + "grad_norm": 0.7917651534080505, + 
"learning_rate": 4.9466630199934464e-05, + "loss": 0.6386, + "step": 157650 + }, + { + "epoch": 1.0072448027803687, + "grad_norm": 0.8128526210784912, + "learning_rate": 4.946161280311937e-05, + "loss": 1.0251, + "step": 157660 + }, + { + "epoch": 1.0073086899301074, + "grad_norm": 1.1424459218978882, + "learning_rate": 4.94565954117263e-05, + "loss": 0.9383, + "step": 157670 + }, + { + "epoch": 1.007372577079846, + "grad_norm": 0.5946982502937317, + "learning_rate": 4.9451578025805786e-05, + "loss": 0.6585, + "step": 157680 + }, + { + "epoch": 1.0074364642295848, + "grad_norm": 0.8714378476142883, + "learning_rate": 4.9446560645408354e-05, + "loss": 0.6871, + "step": 157690 + }, + { + "epoch": 1.0075003513793235, + "grad_norm": 0.8224378824234009, + "learning_rate": 4.944154327058454e-05, + "loss": 0.8583, + "step": 157700 + }, + { + "epoch": 1.0075642385290622, + "grad_norm": 1.5147910118103027, + "learning_rate": 4.9436525901384865e-05, + "loss": 0.7587, + "step": 157710 + }, + { + "epoch": 1.007628125678801, + "grad_norm": 0.6513627767562866, + "learning_rate": 4.943150853785984e-05, + "loss": 0.691, + "step": 157720 + }, + { + "epoch": 1.0076920128285396, + "grad_norm": 1.0497170686721802, + "learning_rate": 4.942649118006004e-05, + "loss": 0.9602, + "step": 157730 + }, + { + "epoch": 1.0077558999782783, + "grad_norm": 2.1938791275024414, + "learning_rate": 4.942147382803597e-05, + "loss": 0.8407, + "step": 157740 + }, + { + "epoch": 1.007819787128017, + "grad_norm": 1.1465129852294922, + "learning_rate": 4.941645648183816e-05, + "loss": 0.9224, + "step": 157750 + }, + { + "epoch": 1.0078836742777557, + "grad_norm": 1.2474966049194336, + "learning_rate": 4.941143914151713e-05, + "loss": 0.8904, + "step": 157760 + }, + { + "epoch": 1.0079475614274944, + "grad_norm": 0.6194661855697632, + "learning_rate": 4.9406421807123424e-05, + "loss": 0.7821, + "step": 157770 + }, + { + "epoch": 1.0080114485772331, + "grad_norm": 1.8728440999984741, + "learning_rate": 
4.940140447870756e-05, + "loss": 0.8937, + "step": 157780 + }, + { + "epoch": 1.0080753357269718, + "grad_norm": 1.6072417497634888, + "learning_rate": 4.939638715632007e-05, + "loss": 0.7891, + "step": 157790 + }, + { + "epoch": 1.0081392228767105, + "grad_norm": 1.476420283317566, + "learning_rate": 4.939136984001148e-05, + "loss": 0.824, + "step": 157800 + }, + { + "epoch": 1.0082031100264492, + "grad_norm": 0.5812131762504578, + "learning_rate": 4.938635252983233e-05, + "loss": 0.7095, + "step": 157810 + }, + { + "epoch": 1.008266997176188, + "grad_norm": 0.7907794713973999, + "learning_rate": 4.9381335225833136e-05, + "loss": 1.0357, + "step": 157820 + }, + { + "epoch": 1.0083308843259267, + "grad_norm": 1.1523560285568237, + "learning_rate": 4.937631792806444e-05, + "loss": 0.7895, + "step": 157830 + }, + { + "epoch": 1.0083947714756654, + "grad_norm": 0.927148163318634, + "learning_rate": 4.937130063657675e-05, + "loss": 1.1295, + "step": 157840 + }, + { + "epoch": 1.008458658625404, + "grad_norm": 0.7727904915809631, + "learning_rate": 4.9366283351420616e-05, + "loss": 1.0136, + "step": 157850 + }, + { + "epoch": 1.0085225457751428, + "grad_norm": 0.8255678415298462, + "learning_rate": 4.936126607264656e-05, + "loss": 0.8372, + "step": 157860 + }, + { + "epoch": 1.0085864329248815, + "grad_norm": 0.6773051619529724, + "learning_rate": 4.9356248800305106e-05, + "loss": 0.7595, + "step": 157870 + }, + { + "epoch": 1.0086503200746202, + "grad_norm": 0.8545355200767517, + "learning_rate": 4.9351231534446776e-05, + "loss": 1.1426, + "step": 157880 + }, + { + "epoch": 1.008714207224359, + "grad_norm": 1.0750840902328491, + "learning_rate": 4.9346214275122105e-05, + "loss": 0.9281, + "step": 157890 + }, + { + "epoch": 1.0087780943740976, + "grad_norm": 0.7120381593704224, + "learning_rate": 4.9341197022381626e-05, + "loss": 0.8779, + "step": 157900 + }, + { + "epoch": 1.0088419815238363, + "grad_norm": 0.8368415236473083, + "learning_rate": 4.933617977627586e-05, 
+ "loss": 0.7732, + "step": 157910 + }, + { + "epoch": 1.008905868673575, + "grad_norm": 0.7155873775482178, + "learning_rate": 4.933116253685534e-05, + "loss": 0.9169, + "step": 157920 + }, + { + "epoch": 1.0089697558233137, + "grad_norm": 0.7591100931167603, + "learning_rate": 4.932614530417059e-05, + "loss": 0.9115, + "step": 157930 + }, + { + "epoch": 1.0090336429730524, + "grad_norm": 1.039440393447876, + "learning_rate": 4.932112807827215e-05, + "loss": 0.9292, + "step": 157940 + }, + { + "epoch": 1.0090975301227911, + "grad_norm": 1.248268961906433, + "learning_rate": 4.931611085921052e-05, + "loss": 0.99, + "step": 157950 + }, + { + "epoch": 1.0091614172725298, + "grad_norm": 0.9334812760353088, + "learning_rate": 4.9311093647036255e-05, + "loss": 0.9961, + "step": 157960 + }, + { + "epoch": 1.0092253044222685, + "grad_norm": 0.8057778477668762, + "learning_rate": 4.930607644179987e-05, + "loss": 0.847, + "step": 157970 + }, + { + "epoch": 1.0092891915720072, + "grad_norm": 1.522869348526001, + "learning_rate": 4.93010592435519e-05, + "loss": 1.0672, + "step": 157980 + }, + { + "epoch": 1.009353078721746, + "grad_norm": 1.1457351446151733, + "learning_rate": 4.929604205234286e-05, + "loss": 0.7168, + "step": 157990 + }, + { + "epoch": 1.0094169658714847, + "grad_norm": 0.6407791376113892, + "learning_rate": 4.92910248682233e-05, + "loss": 0.7698, + "step": 158000 + }, + { + "epoch": 1.0094808530212234, + "grad_norm": 1.0870330333709717, + "learning_rate": 4.928600769124372e-05, + "loss": 0.8819, + "step": 158010 + }, + { + "epoch": 1.009544740170962, + "grad_norm": 1.1727497577667236, + "learning_rate": 4.928099052145466e-05, + "loss": 1.0363, + "step": 158020 + }, + { + "epoch": 1.0096086273207008, + "grad_norm": 0.7357398867607117, + "learning_rate": 4.927597335890665e-05, + "loss": 1.0793, + "step": 158030 + }, + { + "epoch": 1.0096725144704395, + "grad_norm": 1.1307222843170166, + "learning_rate": 4.927095620365021e-05, + "loss": 0.8468, + "step": 
158040 + }, + { + "epoch": 1.0097364016201782, + "grad_norm": 0.747114896774292, + "learning_rate": 4.926593905573588e-05, + "loss": 0.8705, + "step": 158050 + }, + { + "epoch": 1.009800288769917, + "grad_norm": 0.9897515177726746, + "learning_rate": 4.926092191521418e-05, + "loss": 0.8029, + "step": 158060 + }, + { + "epoch": 1.0098641759196556, + "grad_norm": 1.3690990209579468, + "learning_rate": 4.925590478213563e-05, + "loss": 0.8735, + "step": 158070 + }, + { + "epoch": 1.0099280630693943, + "grad_norm": 1.2536840438842773, + "learning_rate": 4.925088765655076e-05, + "loss": 0.8238, + "step": 158080 + }, + { + "epoch": 1.009991950219133, + "grad_norm": 1.0726910829544067, + "learning_rate": 4.92458705385101e-05, + "loss": 1.0702, + "step": 158090 + }, + { + "epoch": 1.0100558373688717, + "grad_norm": 0.6436864733695984, + "learning_rate": 4.924085342806419e-05, + "loss": 0.7983, + "step": 158100 + }, + { + "epoch": 1.0101197245186104, + "grad_norm": 1.1560698747634888, + "learning_rate": 4.923583632526353e-05, + "loss": 0.943, + "step": 158110 + }, + { + "epoch": 1.0101836116683491, + "grad_norm": 2.0632247924804688, + "learning_rate": 4.923081923015866e-05, + "loss": 0.8969, + "step": 158120 + }, + { + "epoch": 1.0102474988180878, + "grad_norm": 1.1319400072097778, + "learning_rate": 4.922580214280011e-05, + "loss": 0.8463, + "step": 158130 + }, + { + "epoch": 1.0103113859678263, + "grad_norm": 0.8231350779533386, + "learning_rate": 4.92207850632384e-05, + "loss": 0.7702, + "step": 158140 + }, + { + "epoch": 1.010375273117565, + "grad_norm": 0.5904900431632996, + "learning_rate": 4.9215767991524056e-05, + "loss": 0.8007, + "step": 158150 + }, + { + "epoch": 1.0104391602673037, + "grad_norm": 0.6886080503463745, + "learning_rate": 4.921075092770761e-05, + "loss": 0.979, + "step": 158160 + }, + { + "epoch": 1.0105030474170424, + "grad_norm": 1.1137446165084839, + "learning_rate": 4.920573387183959e-05, + "loss": 0.6878, + "step": 158170 + }, + { + "epoch": 
1.0105669345667811, + "grad_norm": 1.4936422109603882, + "learning_rate": 4.920071682397051e-05, + "loss": 0.9938, + "step": 158180 + }, + { + "epoch": 1.0106308217165199, + "grad_norm": 0.7173126935958862, + "learning_rate": 4.9195699784150906e-05, + "loss": 1.0017, + "step": 158190 + }, + { + "epoch": 1.0106947088662586, + "grad_norm": 1.1607474088668823, + "learning_rate": 4.91906827524313e-05, + "loss": 0.7932, + "step": 158200 + }, + { + "epoch": 1.0107585960159973, + "grad_norm": 1.1725101470947266, + "learning_rate": 4.918566572886222e-05, + "loss": 0.949, + "step": 158210 + }, + { + "epoch": 1.010822483165736, + "grad_norm": 0.7787816524505615, + "learning_rate": 4.9180648713494184e-05, + "loss": 1.1126, + "step": 158220 + }, + { + "epoch": 1.0108863703154747, + "grad_norm": 1.0860399007797241, + "learning_rate": 4.9175631706377735e-05, + "loss": 1.1293, + "step": 158230 + }, + { + "epoch": 1.0109502574652134, + "grad_norm": 0.8303453922271729, + "learning_rate": 4.917061470756338e-05, + "loss": 0.7749, + "step": 158240 + }, + { + "epoch": 1.011014144614952, + "grad_norm": 0.8753618597984314, + "learning_rate": 4.916559771710164e-05, + "loss": 0.7246, + "step": 158250 + }, + { + "epoch": 1.0110780317646908, + "grad_norm": 0.9717980027198792, + "learning_rate": 4.916058073504307e-05, + "loss": 0.7445, + "step": 158260 + }, + { + "epoch": 1.0111419189144295, + "grad_norm": 1.009787678718567, + "learning_rate": 4.915556376143818e-05, + "loss": 0.8512, + "step": 158270 + }, + { + "epoch": 1.0112058060641682, + "grad_norm": 0.6770163774490356, + "learning_rate": 4.9150546796337486e-05, + "loss": 0.8507, + "step": 158280 + }, + { + "epoch": 1.011269693213907, + "grad_norm": 1.2202988862991333, + "learning_rate": 4.914603153505973e-05, + "loss": 0.8679, + "step": 158290 + }, + { + "epoch": 1.0113335803636456, + "grad_norm": 1.823740839958191, + "learning_rate": 4.914101458625622e-05, + "loss": 0.9841, + "step": 158300 + }, + { + "epoch": 1.0113974675133843, + 
"grad_norm": 0.8558661937713623, + "learning_rate": 4.913599764610344e-05, + "loss": 0.9539, + "step": 158310 + }, + { + "epoch": 1.011461354663123, + "grad_norm": 2.187252998352051, + "learning_rate": 4.913098071465191e-05, + "loss": 1.0052, + "step": 158320 + }, + { + "epoch": 1.0115252418128617, + "grad_norm": 0.9841252565383911, + "learning_rate": 4.912596379195216e-05, + "loss": 1.0755, + "step": 158330 + }, + { + "epoch": 1.0115891289626004, + "grad_norm": 0.6061422824859619, + "learning_rate": 4.91209468780547e-05, + "loss": 1.1245, + "step": 158340 + }, + { + "epoch": 1.0116530161123392, + "grad_norm": 0.5873895883560181, + "learning_rate": 4.911592997301007e-05, + "loss": 0.8757, + "step": 158350 + }, + { + "epoch": 1.0117169032620779, + "grad_norm": 1.3842110633850098, + "learning_rate": 4.911091307686879e-05, + "loss": 0.9066, + "step": 158360 + }, + { + "epoch": 1.0117807904118166, + "grad_norm": 0.6734603047370911, + "learning_rate": 4.910589618968138e-05, + "loss": 0.7995, + "step": 158370 + }, + { + "epoch": 1.0118446775615553, + "grad_norm": 0.9523679614067078, + "learning_rate": 4.910087931149838e-05, + "loss": 0.8374, + "step": 158380 + }, + { + "epoch": 1.011908564711294, + "grad_norm": 1.5969555377960205, + "learning_rate": 4.90958624423703e-05, + "loss": 0.7422, + "step": 158390 + }, + { + "epoch": 1.0119724518610327, + "grad_norm": 1.0750153064727783, + "learning_rate": 4.9090845582347664e-05, + "loss": 0.9901, + "step": 158400 + }, + { + "epoch": 1.0120363390107714, + "grad_norm": 0.9817994236946106, + "learning_rate": 4.9085828731481004e-05, + "loss": 0.8279, + "step": 158410 + }, + { + "epoch": 1.01210022616051, + "grad_norm": 1.0518522262573242, + "learning_rate": 4.908081188982083e-05, + "loss": 0.8748, + "step": 158420 + }, + { + "epoch": 1.0121641133102488, + "grad_norm": 1.547917366027832, + "learning_rate": 4.907579505741769e-05, + "loss": 1.0364, + "step": 158430 + }, + { + "epoch": 1.0122280004599875, + "grad_norm": 
0.9296419620513916, + "learning_rate": 4.907077823432209e-05, + "loss": 0.9092, + "step": 158440 + }, + { + "epoch": 1.0122918876097262, + "grad_norm": 0.907597541809082, + "learning_rate": 4.906576142058455e-05, + "loss": 0.7798, + "step": 158450 + }, + { + "epoch": 1.012355774759465, + "grad_norm": 1.192789077758789, + "learning_rate": 4.90607446162556e-05, + "loss": 0.7114, + "step": 158460 + }, + { + "epoch": 1.0124196619092036, + "grad_norm": 1.136411428451538, + "learning_rate": 4.905572782138577e-05, + "loss": 0.7638, + "step": 158470 + }, + { + "epoch": 1.0124835490589423, + "grad_norm": 1.407484769821167, + "learning_rate": 4.9050711036025576e-05, + "loss": 0.7775, + "step": 158480 + }, + { + "epoch": 1.012547436208681, + "grad_norm": 0.7734007835388184, + "learning_rate": 4.904569426022556e-05, + "loss": 0.7446, + "step": 158490 + }, + { + "epoch": 1.0126113233584197, + "grad_norm": 0.8089882135391235, + "learning_rate": 4.9040677494036205e-05, + "loss": 0.7903, + "step": 158500 + }, + { + "epoch": 1.0126752105081585, + "grad_norm": 0.9194614291191101, + "learning_rate": 4.903566073750806e-05, + "loss": 0.799, + "step": 158510 + }, + { + "epoch": 1.0127390976578972, + "grad_norm": 0.9951015710830688, + "learning_rate": 4.9030643990691655e-05, + "loss": 1.0582, + "step": 158520 + }, + { + "epoch": 1.0128029848076359, + "grad_norm": 0.6360517144203186, + "learning_rate": 4.90256272536375e-05, + "loss": 0.596, + "step": 158530 + }, + { + "epoch": 1.0128668719573746, + "grad_norm": 1.3233652114868164, + "learning_rate": 4.9020610526396116e-05, + "loss": 1.1017, + "step": 158540 + }, + { + "epoch": 1.0129307591071133, + "grad_norm": 0.7260464429855347, + "learning_rate": 4.9015593809018034e-05, + "loss": 0.7517, + "step": 158550 + }, + { + "epoch": 1.012994646256852, + "grad_norm": 1.1746294498443604, + "learning_rate": 4.901057710155378e-05, + "loss": 0.7294, + "step": 158560 + }, + { + "epoch": 1.0130585334065907, + "grad_norm": 0.7293243408203125, + 
"learning_rate": 4.900556040405386e-05, + "loss": 0.8202, + "step": 158570 + }, + { + "epoch": 1.0131224205563294, + "grad_norm": 1.2035466432571411, + "learning_rate": 4.900054371656882e-05, + "loss": 0.8897, + "step": 158580 + }, + { + "epoch": 1.013186307706068, + "grad_norm": 1.4569511413574219, + "learning_rate": 4.899552703914916e-05, + "loss": 0.9422, + "step": 158590 + }, + { + "epoch": 1.0132501948558068, + "grad_norm": 1.067893624305725, + "learning_rate": 4.899051037184541e-05, + "loss": 0.6845, + "step": 158600 + }, + { + "epoch": 1.0133140820055455, + "grad_norm": 1.2214561700820923, + "learning_rate": 4.89854937147081e-05, + "loss": 0.8067, + "step": 158610 + }, + { + "epoch": 1.013377969155284, + "grad_norm": 0.9461604356765747, + "learning_rate": 4.898047706778774e-05, + "loss": 0.7671, + "step": 158620 + }, + { + "epoch": 1.0134418563050227, + "grad_norm": 0.7252498865127563, + "learning_rate": 4.897546043113487e-05, + "loss": 0.9272, + "step": 158630 + }, + { + "epoch": 1.0135057434547614, + "grad_norm": 0.8559421896934509, + "learning_rate": 4.8970443804799985e-05, + "loss": 0.8639, + "step": 158640 + }, + { + "epoch": 1.0135696306045001, + "grad_norm": 0.5652778148651123, + "learning_rate": 4.8965427188833627e-05, + "loss": 0.8474, + "step": 158650 + }, + { + "epoch": 1.0136335177542388, + "grad_norm": 0.5576297640800476, + "learning_rate": 4.8960410583286315e-05, + "loss": 0.9793, + "step": 158660 + }, + { + "epoch": 1.0136974049039775, + "grad_norm": 0.7062064409255981, + "learning_rate": 4.895539398820857e-05, + "loss": 0.9898, + "step": 158670 + }, + { + "epoch": 1.0137612920537162, + "grad_norm": 1.1425914764404297, + "learning_rate": 4.89503774036509e-05, + "loss": 0.8466, + "step": 158680 + }, + { + "epoch": 1.013825179203455, + "grad_norm": 1.2256412506103516, + "learning_rate": 4.8945360829663837e-05, + "loss": 0.8359, + "step": 158690 + }, + { + "epoch": 1.0138890663531936, + "grad_norm": 1.0603712797164917, + "learning_rate": 
4.894034426629791e-05, + "loss": 0.789, + "step": 158700 + }, + { + "epoch": 1.0139529535029324, + "grad_norm": 0.9669123291969299, + "learning_rate": 4.8935327713603634e-05, + "loss": 0.809, + "step": 158710 + }, + { + "epoch": 1.014016840652671, + "grad_norm": 1.034204363822937, + "learning_rate": 4.893031117163153e-05, + "loss": 0.7027, + "step": 158720 + }, + { + "epoch": 1.0140807278024098, + "grad_norm": 0.6518446803092957, + "learning_rate": 4.892529464043212e-05, + "loss": 0.7673, + "step": 158730 + }, + { + "epoch": 1.0141446149521485, + "grad_norm": 0.6312533617019653, + "learning_rate": 4.8920278120055915e-05, + "loss": 0.8681, + "step": 158740 + }, + { + "epoch": 1.0142085021018872, + "grad_norm": 1.1980721950531006, + "learning_rate": 4.8915261610553446e-05, + "loss": 0.9774, + "step": 158750 + }, + { + "epoch": 1.0142723892516259, + "grad_norm": 1.1740984916687012, + "learning_rate": 4.8910245111975236e-05, + "loss": 0.6504, + "step": 158760 + }, + { + "epoch": 1.0143362764013646, + "grad_norm": 0.9296197295188904, + "learning_rate": 4.8905228624371794e-05, + "loss": 0.9532, + "step": 158770 + }, + { + "epoch": 1.0144001635511033, + "grad_norm": 1.52986741065979, + "learning_rate": 4.890021214779366e-05, + "loss": 0.6905, + "step": 158780 + }, + { + "epoch": 1.014464050700842, + "grad_norm": 0.9443937540054321, + "learning_rate": 4.8895195682291326e-05, + "loss": 0.8081, + "step": 158790 + }, + { + "epoch": 1.0145279378505807, + "grad_norm": 1.0230110883712769, + "learning_rate": 4.8890179227915333e-05, + "loss": 0.8733, + "step": 158800 + }, + { + "epoch": 1.0145918250003194, + "grad_norm": 1.2321919202804565, + "learning_rate": 4.888516278471619e-05, + "loss": 0.6539, + "step": 158810 + }, + { + "epoch": 1.0146557121500581, + "grad_norm": 1.0817656517028809, + "learning_rate": 4.888014635274443e-05, + "loss": 0.7763, + "step": 158820 + }, + { + "epoch": 1.0147195992997968, + "grad_norm": 0.7480055093765259, + "learning_rate": 4.887512993205056e-05, 
+ "loss": 0.7766, + "step": 158830 + }, + { + "epoch": 1.0147834864495355, + "grad_norm": 1.0258066654205322, + "learning_rate": 4.887011352268511e-05, + "loss": 0.9319, + "step": 158840 + }, + { + "epoch": 1.0148473735992742, + "grad_norm": 0.8432429432868958, + "learning_rate": 4.88650971246986e-05, + "loss": 1.2076, + "step": 158850 + }, + { + "epoch": 1.014911260749013, + "grad_norm": 1.4191237688064575, + "learning_rate": 4.886008073814153e-05, + "loss": 1.0, + "step": 158860 + }, + { + "epoch": 1.0149751478987517, + "grad_norm": 0.7389619946479797, + "learning_rate": 4.885506436306444e-05, + "loss": 0.662, + "step": 158870 + }, + { + "epoch": 1.0150390350484904, + "grad_norm": 0.9246344566345215, + "learning_rate": 4.8850047999517834e-05, + "loss": 0.8963, + "step": 158880 + }, + { + "epoch": 1.015102922198229, + "grad_norm": 0.9745082259178162, + "learning_rate": 4.884503164755224e-05, + "loss": 0.8817, + "step": 158890 + }, + { + "epoch": 1.0151668093479678, + "grad_norm": 1.0244060754776, + "learning_rate": 4.884001530721818e-05, + "loss": 0.9566, + "step": 158900 + }, + { + "epoch": 1.0152306964977065, + "grad_norm": 1.594533085823059, + "learning_rate": 4.883499897856617e-05, + "loss": 1.2052, + "step": 158910 + }, + { + "epoch": 1.0152945836474452, + "grad_norm": 1.8419270515441895, + "learning_rate": 4.882998266164673e-05, + "loss": 0.7605, + "step": 158920 + }, + { + "epoch": 1.0153584707971839, + "grad_norm": 0.7778764963150024, + "learning_rate": 4.8824966356510375e-05, + "loss": 0.9946, + "step": 158930 + }, + { + "epoch": 1.0154223579469226, + "grad_norm": 1.0129575729370117, + "learning_rate": 4.881995006320763e-05, + "loss": 1.0609, + "step": 158940 + }, + { + "epoch": 1.0154862450966613, + "grad_norm": 0.7646721601486206, + "learning_rate": 4.8814933781789004e-05, + "loss": 0.9917, + "step": 158950 + }, + { + "epoch": 1.0155501322464, + "grad_norm": 0.7843183279037476, + "learning_rate": 4.880991751230502e-05, + "loss": 0.8404, + "step": 158960 
+ }, + { + "epoch": 1.0156140193961387, + "grad_norm": 1.0840474367141724, + "learning_rate": 4.880490125480622e-05, + "loss": 0.8327, + "step": 158970 + }, + { + "epoch": 1.0156779065458774, + "grad_norm": 0.6885616779327393, + "learning_rate": 4.8799885009343084e-05, + "loss": 0.9157, + "step": 158980 + }, + { + "epoch": 1.0157417936956161, + "grad_norm": 1.2638275623321533, + "learning_rate": 4.879486877596613e-05, + "loss": 0.9309, + "step": 158990 + }, + { + "epoch": 1.0158056808453548, + "grad_norm": 0.6783813834190369, + "learning_rate": 4.87898525547259e-05, + "loss": 1.2162, + "step": 159000 + }, + { + "epoch": 1.0158695679950935, + "grad_norm": 0.8159160614013672, + "learning_rate": 4.878483634567289e-05, + "loss": 0.7415, + "step": 159010 + }, + { + "epoch": 1.0159334551448322, + "grad_norm": 0.8057973384857178, + "learning_rate": 4.8779820148857636e-05, + "loss": 1.0629, + "step": 159020 + }, + { + "epoch": 1.015997342294571, + "grad_norm": 1.3183810710906982, + "learning_rate": 4.8774803964330653e-05, + "loss": 0.6688, + "step": 159030 + }, + { + "epoch": 1.0160612294443097, + "grad_norm": 1.4091906547546387, + "learning_rate": 4.876978779214245e-05, + "loss": 1.0829, + "step": 159040 + }, + { + "epoch": 1.0161251165940484, + "grad_norm": 0.9577476978302002, + "learning_rate": 4.876477163234355e-05, + "loss": 1.0011, + "step": 159050 + }, + { + "epoch": 1.016189003743787, + "grad_norm": 0.6230905652046204, + "learning_rate": 4.8759755484984466e-05, + "loss": 0.8103, + "step": 159060 + }, + { + "epoch": 1.0162528908935258, + "grad_norm": 1.224851369857788, + "learning_rate": 4.875473935011572e-05, + "loss": 1.0953, + "step": 159070 + }, + { + "epoch": 1.0163167780432645, + "grad_norm": 0.8951313495635986, + "learning_rate": 4.874972322778782e-05, + "loss": 0.775, + "step": 159080 + }, + { + "epoch": 1.0163806651930032, + "grad_norm": 1.2181718349456787, + "learning_rate": 4.8744707118051294e-05, + "loss": 0.7839, + "step": 159090 + }, + { + "epoch": 
1.0164445523427417, + "grad_norm": 0.6959841847419739, + "learning_rate": 4.873969102095666e-05, + "loss": 0.7769, + "step": 159100 + }, + { + "epoch": 1.0165084394924804, + "grad_norm": 1.0471941232681274, + "learning_rate": 4.873467493655442e-05, + "loss": 0.7421, + "step": 159110 + }, + { + "epoch": 1.016572326642219, + "grad_norm": 0.6630411148071289, + "learning_rate": 4.87296588648951e-05, + "loss": 0.8807, + "step": 159120 + }, + { + "epoch": 1.0166362137919578, + "grad_norm": 0.845357358455658, + "learning_rate": 4.8724642806029206e-05, + "loss": 0.7257, + "step": 159130 + }, + { + "epoch": 1.0167001009416965, + "grad_norm": 0.9893336296081543, + "learning_rate": 4.8719626760007276e-05, + "loss": 0.8755, + "step": 159140 + }, + { + "epoch": 1.0167639880914352, + "grad_norm": 1.0361378192901611, + "learning_rate": 4.87146107268798e-05, + "loss": 1.0532, + "step": 159150 + }, + { + "epoch": 1.016827875241174, + "grad_norm": 0.9727418422698975, + "learning_rate": 4.870959470669732e-05, + "loss": 0.7611, + "step": 159160 + }, + { + "epoch": 1.0168917623909126, + "grad_norm": 0.8137701749801636, + "learning_rate": 4.870457869951033e-05, + "loss": 0.9455, + "step": 159170 + }, + { + "epoch": 1.0169556495406513, + "grad_norm": 0.8102377653121948, + "learning_rate": 4.869956270536935e-05, + "loss": 0.8612, + "step": 159180 + }, + { + "epoch": 1.01701953669039, + "grad_norm": 0.8785635828971863, + "learning_rate": 4.8694546724324895e-05, + "loss": 0.8923, + "step": 159190 + }, + { + "epoch": 1.0170834238401287, + "grad_norm": 1.1253139972686768, + "learning_rate": 4.868953075642749e-05, + "loss": 0.7758, + "step": 159200 + }, + { + "epoch": 1.0171473109898674, + "grad_norm": 1.2032454013824463, + "learning_rate": 4.868451480172764e-05, + "loss": 0.8554, + "step": 159210 + }, + { + "epoch": 1.0172111981396061, + "grad_norm": 0.8401626944541931, + "learning_rate": 4.867949886027586e-05, + "loss": 0.7357, + "step": 159220 + }, + { + "epoch": 1.0172750852893448, + 
"grad_norm": 1.0876498222351074, + "learning_rate": 4.8674482932122686e-05, + "loss": 0.8303, + "step": 159230 + }, + { + "epoch": 1.0173389724390836, + "grad_norm": 0.704442024230957, + "learning_rate": 4.866946701731861e-05, + "loss": 0.9889, + "step": 159240 + }, + { + "epoch": 1.0174028595888223, + "grad_norm": 0.9629689455032349, + "learning_rate": 4.866445111591414e-05, + "loss": 0.8805, + "step": 159250 + }, + { + "epoch": 1.017466746738561, + "grad_norm": 0.8684937953948975, + "learning_rate": 4.8659435227959815e-05, + "loss": 0.7646, + "step": 159260 + }, + { + "epoch": 1.0175306338882997, + "grad_norm": 0.8176425099372864, + "learning_rate": 4.865441935350613e-05, + "loss": 0.8797, + "step": 159270 + }, + { + "epoch": 1.0175945210380384, + "grad_norm": 1.4737823009490967, + "learning_rate": 4.8649403492603615e-05, + "loss": 0.8141, + "step": 159280 + }, + { + "epoch": 1.017658408187777, + "grad_norm": 0.8579307198524475, + "learning_rate": 4.864438764530276e-05, + "loss": 1.0162, + "step": 159290 + }, + { + "epoch": 1.0177222953375158, + "grad_norm": 0.9210206270217896, + "learning_rate": 4.863937181165411e-05, + "loss": 0.9758, + "step": 159300 + }, + { + "epoch": 1.0177861824872545, + "grad_norm": 0.6698283553123474, + "learning_rate": 4.863435599170816e-05, + "loss": 0.8678, + "step": 159310 + }, + { + "epoch": 1.0178500696369932, + "grad_norm": 0.9772372841835022, + "learning_rate": 4.862934018551542e-05, + "loss": 0.975, + "step": 159320 + }, + { + "epoch": 1.017913956786732, + "grad_norm": 0.7392674684524536, + "learning_rate": 4.8624324393126416e-05, + "loss": 0.8978, + "step": 159330 + }, + { + "epoch": 1.0179778439364706, + "grad_norm": 0.781814455986023, + "learning_rate": 4.861930861459165e-05, + "loss": 0.8135, + "step": 159340 + }, + { + "epoch": 1.0180417310862093, + "grad_norm": 0.9082130193710327, + "learning_rate": 4.8614292849961646e-05, + "loss": 0.6263, + "step": 159350 + }, + { + "epoch": 1.018105618235948, + "grad_norm": 
0.790849506855011, + "learning_rate": 4.860927709928691e-05, + "loss": 0.8153, + "step": 159360 + }, + { + "epoch": 1.0181695053856867, + "grad_norm": 1.3248441219329834, + "learning_rate": 4.860426136261796e-05, + "loss": 1.1221, + "step": 159370 + }, + { + "epoch": 1.0182333925354254, + "grad_norm": 0.797540545463562, + "learning_rate": 4.85992456400053e-05, + "loss": 1.0036, + "step": 159380 + }, + { + "epoch": 1.0182972796851641, + "grad_norm": 0.9565972089767456, + "learning_rate": 4.859422993149945e-05, + "loss": 0.9894, + "step": 159390 + }, + { + "epoch": 1.0183611668349029, + "grad_norm": 1.2057994604110718, + "learning_rate": 4.858921423715092e-05, + "loss": 0.7413, + "step": 159400 + }, + { + "epoch": 1.0184250539846416, + "grad_norm": 1.2084450721740723, + "learning_rate": 4.8584198557010224e-05, + "loss": 0.7765, + "step": 159410 + }, + { + "epoch": 1.0184889411343803, + "grad_norm": 0.9695774912834167, + "learning_rate": 4.8579182891127864e-05, + "loss": 0.7912, + "step": 159420 + }, + { + "epoch": 1.018552828284119, + "grad_norm": 0.6492698192596436, + "learning_rate": 4.857416723955437e-05, + "loss": 0.7832, + "step": 159430 + }, + { + "epoch": 1.0186167154338577, + "grad_norm": 0.815000593662262, + "learning_rate": 4.856915160234025e-05, + "loss": 0.9811, + "step": 159440 + }, + { + "epoch": 1.0186806025835964, + "grad_norm": 1.2782448530197144, + "learning_rate": 4.8564135979536e-05, + "loss": 0.7518, + "step": 159450 + }, + { + "epoch": 1.018744489733335, + "grad_norm": 1.1094145774841309, + "learning_rate": 4.855912037119218e-05, + "loss": 0.9107, + "step": 159460 + }, + { + "epoch": 1.0188083768830738, + "grad_norm": 0.8510834574699402, + "learning_rate": 4.855410477735923e-05, + "loss": 1.1143, + "step": 159470 + }, + { + "epoch": 1.0188722640328125, + "grad_norm": 0.5925911068916321, + "learning_rate": 4.85490891980877e-05, + "loss": 0.7271, + "step": 159480 + }, + { + "epoch": 1.0189361511825512, + "grad_norm": 1.6815381050109863, + 
"learning_rate": 4.85440736334281e-05, + "loss": 0.9389, + "step": 159490 + }, + { + "epoch": 1.01900003833229, + "grad_norm": 1.0281870365142822, + "learning_rate": 4.853905808343094e-05, + "loss": 0.7189, + "step": 159500 + }, + { + "epoch": 1.0190639254820286, + "grad_norm": 0.7680450081825256, + "learning_rate": 4.853404254814672e-05, + "loss": 0.7787, + "step": 159510 + }, + { + "epoch": 1.0191278126317673, + "grad_norm": 0.6367865800857544, + "learning_rate": 4.852902702762597e-05, + "loss": 0.7252, + "step": 159520 + }, + { + "epoch": 1.019191699781506, + "grad_norm": 0.8541706204414368, + "learning_rate": 4.852401152191918e-05, + "loss": 0.8, + "step": 159530 + }, + { + "epoch": 1.0192555869312447, + "grad_norm": 0.788281261920929, + "learning_rate": 4.851899603107687e-05, + "loss": 1.0424, + "step": 159540 + }, + { + "epoch": 1.0193194740809834, + "grad_norm": 0.8140308856964111, + "learning_rate": 4.851398055514955e-05, + "loss": 0.8846, + "step": 159550 + }, + { + "epoch": 1.0193833612307222, + "grad_norm": 1.011403203010559, + "learning_rate": 4.850896509418774e-05, + "loss": 0.7852, + "step": 159560 + }, + { + "epoch": 1.0194472483804609, + "grad_norm": 0.8204459547996521, + "learning_rate": 4.850394964824194e-05, + "loss": 0.9566, + "step": 159570 + }, + { + "epoch": 1.0195111355301996, + "grad_norm": 3.202881097793579, + "learning_rate": 4.8498934217362665e-05, + "loss": 0.8407, + "step": 159580 + }, + { + "epoch": 1.019575022679938, + "grad_norm": 1.3872183561325073, + "learning_rate": 4.849391880160041e-05, + "loss": 0.6323, + "step": 159590 + }, + { + "epoch": 1.0196389098296768, + "grad_norm": 0.9473857879638672, + "learning_rate": 4.848890340100571e-05, + "loss": 0.7185, + "step": 159600 + }, + { + "epoch": 1.0197027969794155, + "grad_norm": 1.2074732780456543, + "learning_rate": 4.848388801562905e-05, + "loss": 0.8186, + "step": 159610 + }, + { + "epoch": 1.0197666841291542, + "grad_norm": 0.5796457529067993, + "learning_rate": 
4.8478872645520954e-05, + "loss": 0.7665, + "step": 159620 + }, + { + "epoch": 1.0198305712788929, + "grad_norm": 1.2254159450531006, + "learning_rate": 4.847385729073192e-05, + "loss": 0.9178, + "step": 159630 + }, + { + "epoch": 1.0198944584286316, + "grad_norm": 1.5869561433792114, + "learning_rate": 4.846884195131247e-05, + "loss": 0.9841, + "step": 159640 + }, + { + "epoch": 1.0199583455783703, + "grad_norm": 1.9314491748809814, + "learning_rate": 4.846382662731311e-05, + "loss": 0.7713, + "step": 159650 + }, + { + "epoch": 1.020022232728109, + "grad_norm": 0.8856831789016724, + "learning_rate": 4.8458811318784334e-05, + "loss": 0.7843, + "step": 159660 + }, + { + "epoch": 1.0200861198778477, + "grad_norm": 0.6835888028144836, + "learning_rate": 4.8453796025776665e-05, + "loss": 0.591, + "step": 159670 + }, + { + "epoch": 1.0201500070275864, + "grad_norm": 0.5204030275344849, + "learning_rate": 4.844878074834061e-05, + "loss": 0.8107, + "step": 159680 + }, + { + "epoch": 1.020213894177325, + "grad_norm": 0.9245986342430115, + "learning_rate": 4.8443765486526675e-05, + "loss": 0.8568, + "step": 159690 + }, + { + "epoch": 1.0202777813270638, + "grad_norm": 2.367129325866699, + "learning_rate": 4.8438750240385366e-05, + "loss": 0.8777, + "step": 159700 + }, + { + "epoch": 1.0203416684768025, + "grad_norm": 0.9855824112892151, + "learning_rate": 4.84337350099672e-05, + "loss": 0.821, + "step": 159710 + }, + { + "epoch": 1.0204055556265412, + "grad_norm": 0.6439787745475769, + "learning_rate": 4.842871979532267e-05, + "loss": 0.8859, + "step": 159720 + }, + { + "epoch": 1.02046944277628, + "grad_norm": 0.9851899743080139, + "learning_rate": 4.84242061156708e-05, + "loss": 0.9057, + "step": 159730 + }, + { + "epoch": 1.0205333299260186, + "grad_norm": 0.8334624171257019, + "learning_rate": 4.841919093113534e-05, + "loss": 0.7313, + "step": 159740 + }, + { + "epoch": 1.0205972170757573, + "grad_norm": 0.8483121991157532, + "learning_rate": 4.841417576252e-05, + 
"loss": 1.1077, + "step": 159750 + }, + { + "epoch": 1.020661104225496, + "grad_norm": 1.2232626676559448, + "learning_rate": 4.840916060987528e-05, + "loss": 0.8178, + "step": 159760 + }, + { + "epoch": 1.0207249913752348, + "grad_norm": 1.0820945501327515, + "learning_rate": 4.840414547325168e-05, + "loss": 0.8174, + "step": 159770 + }, + { + "epoch": 1.0207888785249735, + "grad_norm": 0.7780520915985107, + "learning_rate": 4.8399130352699726e-05, + "loss": 0.6902, + "step": 159780 + }, + { + "epoch": 1.0208527656747122, + "grad_norm": 0.8979020714759827, + "learning_rate": 4.839411524826991e-05, + "loss": 0.9166, + "step": 159790 + }, + { + "epoch": 1.0209166528244509, + "grad_norm": 4.772502422332764, + "learning_rate": 4.8389100160012744e-05, + "loss": 1.0738, + "step": 159800 + }, + { + "epoch": 1.0209805399741896, + "grad_norm": 1.1232408285140991, + "learning_rate": 4.838408508797874e-05, + "loss": 0.6826, + "step": 159810 + }, + { + "epoch": 1.0210444271239283, + "grad_norm": 0.6424837112426758, + "learning_rate": 4.8379070032218386e-05, + "loss": 1.0162, + "step": 159820 + }, + { + "epoch": 1.021108314273667, + "grad_norm": 1.7317675352096558, + "learning_rate": 4.83740549927822e-05, + "loss": 0.9172, + "step": 159830 + }, + { + "epoch": 1.0211722014234057, + "grad_norm": 1.2768808603286743, + "learning_rate": 4.83690399697207e-05, + "loss": 0.8575, + "step": 159840 + }, + { + "epoch": 1.0212360885731444, + "grad_norm": 0.901062548160553, + "learning_rate": 4.836402496308437e-05, + "loss": 0.9215, + "step": 159850 + }, + { + "epoch": 1.0212999757228831, + "grad_norm": 0.7806056141853333, + "learning_rate": 4.835900997292371e-05, + "loss": 0.7751, + "step": 159860 + }, + { + "epoch": 1.0213638628726218, + "grad_norm": 0.7727100253105164, + "learning_rate": 4.835399499928925e-05, + "loss": 0.6676, + "step": 159870 + }, + { + "epoch": 1.0214277500223605, + "grad_norm": 0.8922156095504761, + "learning_rate": 4.834898004223148e-05, + "loss": 0.8759, + "step": 
159880 + }, + { + "epoch": 1.0214916371720992, + "grad_norm": 0.9628594517707825, + "learning_rate": 4.834396510180092e-05, + "loss": 0.8242, + "step": 159890 + }, + { + "epoch": 1.021555524321838, + "grad_norm": 0.6507847309112549, + "learning_rate": 4.8338950178048056e-05, + "loss": 1.1914, + "step": 159900 + }, + { + "epoch": 1.0216194114715766, + "grad_norm": 1.5707703828811646, + "learning_rate": 4.8333935271023415e-05, + "loss": 0.8153, + "step": 159910 + }, + { + "epoch": 1.0216832986213154, + "grad_norm": 0.7113951444625854, + "learning_rate": 4.8328920380777473e-05, + "loss": 0.7568, + "step": 159920 + }, + { + "epoch": 1.021747185771054, + "grad_norm": 0.9113976955413818, + "learning_rate": 4.832390550736075e-05, + "loss": 0.9178, + "step": 159930 + }, + { + "epoch": 1.0218110729207928, + "grad_norm": 0.5762841701507568, + "learning_rate": 4.831889065082375e-05, + "loss": 0.801, + "step": 159940 + }, + { + "epoch": 1.0218749600705315, + "grad_norm": 0.9322299361228943, + "learning_rate": 4.831387581121698e-05, + "loss": 1.0433, + "step": 159950 + }, + { + "epoch": 1.0219388472202702, + "grad_norm": 1.1670128107070923, + "learning_rate": 4.8308860988590935e-05, + "loss": 0.8452, + "step": 159960 + }, + { + "epoch": 1.0220027343700089, + "grad_norm": 0.7586202621459961, + "learning_rate": 4.8303846182996124e-05, + "loss": 0.8195, + "step": 159970 + }, + { + "epoch": 1.0220666215197476, + "grad_norm": 0.9201989769935608, + "learning_rate": 4.829883139448305e-05, + "loss": 0.894, + "step": 159980 + }, + { + "epoch": 1.0221305086694863, + "grad_norm": 0.981075644493103, + "learning_rate": 4.829381662310221e-05, + "loss": 1.0564, + "step": 159990 + }, + { + "epoch": 1.022194395819225, + "grad_norm": 1.116087794303894, + "learning_rate": 4.8288801868904124e-05, + "loss": 0.7544, + "step": 160000 + }, + { + "epoch": 1.0222582829689637, + "grad_norm": 0.7812409400939941, + "learning_rate": 4.828378713193927e-05, + "loss": 1.0803, + "step": 160010 + }, + { + 
"epoch": 1.0223221701187024, + "grad_norm": 0.7585064172744751, + "learning_rate": 4.827877241225817e-05, + "loss": 0.654, + "step": 160020 + }, + { + "epoch": 1.0223860572684411, + "grad_norm": 1.045121431350708, + "learning_rate": 4.8273757709911324e-05, + "loss": 1.3915, + "step": 160030 + }, + { + "epoch": 1.0224499444181798, + "grad_norm": 0.9577502012252808, + "learning_rate": 4.826874302494923e-05, + "loss": 0.8388, + "step": 160040 + }, + { + "epoch": 1.0225138315679185, + "grad_norm": 1.286743402481079, + "learning_rate": 4.826372835742239e-05, + "loss": 1.1065, + "step": 160050 + }, + { + "epoch": 1.0225777187176572, + "grad_norm": 1.2941441535949707, + "learning_rate": 4.82587137073813e-05, + "loss": 0.76, + "step": 160060 + }, + { + "epoch": 1.022641605867396, + "grad_norm": 0.7606869339942932, + "learning_rate": 4.825369907487647e-05, + "loss": 0.8793, + "step": 160070 + }, + { + "epoch": 1.0227054930171344, + "grad_norm": 1.1968684196472168, + "learning_rate": 4.824868445995841e-05, + "loss": 0.7791, + "step": 160080 + }, + { + "epoch": 1.0227693801668731, + "grad_norm": 0.8206735253334045, + "learning_rate": 4.8243669862677634e-05, + "loss": 0.8489, + "step": 160090 + }, + { + "epoch": 1.0228332673166118, + "grad_norm": 1.5179201364517212, + "learning_rate": 4.82386552830846e-05, + "loss": 0.9613, + "step": 160100 + }, + { + "epoch": 1.0228971544663505, + "grad_norm": 0.9671904444694519, + "learning_rate": 4.823364072122983e-05, + "loss": 1.0319, + "step": 160110 + }, + { + "epoch": 1.0229610416160893, + "grad_norm": 0.7965761423110962, + "learning_rate": 4.8228626177163824e-05, + "loss": 0.5945, + "step": 160120 + }, + { + "epoch": 1.023024928765828, + "grad_norm": 0.7991026043891907, + "learning_rate": 4.822361165093709e-05, + "loss": 0.8775, + "step": 160130 + }, + { + "epoch": 1.0230888159155667, + "grad_norm": 1.263779878616333, + "learning_rate": 4.821859714260012e-05, + "loss": 0.8976, + "step": 160140 + }, + { + "epoch": 1.0231527030653054, + 
"grad_norm": 0.7055035829544067, + "learning_rate": 4.821358265220342e-05, + "loss": 0.8665, + "step": 160150 + }, + { + "epoch": 1.023216590215044, + "grad_norm": 1.1190840005874634, + "learning_rate": 4.8208568179797485e-05, + "loss": 1.0261, + "step": 160160 + }, + { + "epoch": 1.0232804773647828, + "grad_norm": 0.7512614130973816, + "learning_rate": 4.820355372543282e-05, + "loss": 0.7382, + "step": 160170 + }, + { + "epoch": 1.0233443645145215, + "grad_norm": 0.8903238773345947, + "learning_rate": 4.8198539289159914e-05, + "loss": 0.9103, + "step": 160180 + }, + { + "epoch": 1.0234082516642602, + "grad_norm": 1.0027856826782227, + "learning_rate": 4.819352487102928e-05, + "loss": 0.9353, + "step": 160190 + }, + { + "epoch": 1.023472138813999, + "grad_norm": 1.451263427734375, + "learning_rate": 4.818851047109142e-05, + "loss": 0.7841, + "step": 160200 + }, + { + "epoch": 1.0235360259637376, + "grad_norm": 0.9455044269561768, + "learning_rate": 4.8183496089396826e-05, + "loss": 0.9434, + "step": 160210 + }, + { + "epoch": 1.0235999131134763, + "grad_norm": 0.7364357709884644, + "learning_rate": 4.8178481725996e-05, + "loss": 0.9864, + "step": 160220 + }, + { + "epoch": 1.023663800263215, + "grad_norm": 0.8373795747756958, + "learning_rate": 4.817346738093943e-05, + "loss": 0.8139, + "step": 160230 + }, + { + "epoch": 1.0237276874129537, + "grad_norm": 0.6274827122688293, + "learning_rate": 4.816845305427764e-05, + "loss": 0.8578, + "step": 160240 + }, + { + "epoch": 1.0237915745626924, + "grad_norm": 0.9364883303642273, + "learning_rate": 4.8163438746061095e-05, + "loss": 0.9182, + "step": 160250 + }, + { + "epoch": 1.0238554617124311, + "grad_norm": 0.8702690601348877, + "learning_rate": 4.815842445634032e-05, + "loss": 0.943, + "step": 160260 + }, + { + "epoch": 1.0239193488621698, + "grad_norm": 0.7030714750289917, + "learning_rate": 4.81534101851658e-05, + "loss": 0.8738, + "step": 160270 + }, + { + "epoch": 1.0239832360119085, + "grad_norm": 
0.9644308090209961, + "learning_rate": 4.814839593258804e-05, + "loss": 1.1133, + "step": 160280 + }, + { + "epoch": 1.0240471231616473, + "grad_norm": 0.642647922039032, + "learning_rate": 4.814338169865753e-05, + "loss": 0.9129, + "step": 160290 + }, + { + "epoch": 1.024111010311386, + "grad_norm": 1.3615950345993042, + "learning_rate": 4.813836748342477e-05, + "loss": 0.8771, + "step": 160300 + }, + { + "epoch": 1.0241748974611247, + "grad_norm": 0.9638871550559998, + "learning_rate": 4.813335328694027e-05, + "loss": 0.9895, + "step": 160310 + }, + { + "epoch": 1.0242387846108634, + "grad_norm": 0.9455498456954956, + "learning_rate": 4.8128339109254516e-05, + "loss": 1.2231, + "step": 160320 + }, + { + "epoch": 1.024302671760602, + "grad_norm": 0.8217796683311462, + "learning_rate": 4.8123324950418e-05, + "loss": 0.8139, + "step": 160330 + }, + { + "epoch": 1.0243665589103408, + "grad_norm": 0.6381251811981201, + "learning_rate": 4.811831081048123e-05, + "loss": 0.684, + "step": 160340 + }, + { + "epoch": 1.0244304460600795, + "grad_norm": 1.0423862934112549, + "learning_rate": 4.8113296689494693e-05, + "loss": 0.7997, + "step": 160350 + }, + { + "epoch": 1.0244943332098182, + "grad_norm": 1.1656889915466309, + "learning_rate": 4.8108282587508895e-05, + "loss": 1.1299, + "step": 160360 + }, + { + "epoch": 1.024558220359557, + "grad_norm": 1.2188206911087036, + "learning_rate": 4.8103268504574325e-05, + "loss": 0.5699, + "step": 160370 + }, + { + "epoch": 1.0246221075092956, + "grad_norm": 0.7609322667121887, + "learning_rate": 4.8098254440741486e-05, + "loss": 1.2015, + "step": 160380 + }, + { + "epoch": 1.0246859946590343, + "grad_norm": 1.1754517555236816, + "learning_rate": 4.8093240396060854e-05, + "loss": 1.0247, + "step": 160390 + }, + { + "epoch": 1.024749881808773, + "grad_norm": 0.74627286195755, + "learning_rate": 4.808822637058296e-05, + "loss": 0.7625, + "step": 160400 + }, + { + "epoch": 1.0248137689585117, + "grad_norm": 0.7061801552772522, + 
"learning_rate": 4.8083212364358277e-05, + "loss": 0.9833, + "step": 160410 + }, + { + "epoch": 1.0248776561082504, + "grad_norm": 1.1060301065444946, + "learning_rate": 4.80781983774373e-05, + "loss": 0.9781, + "step": 160420 + }, + { + "epoch": 1.0249415432579891, + "grad_norm": 1.5939122438430786, + "learning_rate": 4.807318440987053e-05, + "loss": 0.915, + "step": 160430 + }, + { + "epoch": 1.0250054304077278, + "grad_norm": 0.8850380778312683, + "learning_rate": 4.8068170461708464e-05, + "loss": 1.1094, + "step": 160440 + }, + { + "epoch": 1.0250693175574666, + "grad_norm": 0.8643857836723328, + "learning_rate": 4.8063156533001585e-05, + "loss": 1.2635, + "step": 160450 + }, + { + "epoch": 1.0251332047072053, + "grad_norm": 1.5707765817642212, + "learning_rate": 4.8058142623800404e-05, + "loss": 1.073, + "step": 160460 + }, + { + "epoch": 1.025197091856944, + "grad_norm": 0.7831910252571106, + "learning_rate": 4.80531287341554e-05, + "loss": 0.9, + "step": 160470 + }, + { + "epoch": 1.0252609790066827, + "grad_norm": 0.6048902869224548, + "learning_rate": 4.804811486411707e-05, + "loss": 0.846, + "step": 160480 + }, + { + "epoch": 1.0253248661564214, + "grad_norm": 0.6415519118309021, + "learning_rate": 4.804310101373592e-05, + "loss": 0.9206, + "step": 160490 + }, + { + "epoch": 1.02538875330616, + "grad_norm": 0.9880675077438354, + "learning_rate": 4.803808718306243e-05, + "loss": 0.9464, + "step": 160500 + }, + { + "epoch": 1.0254526404558988, + "grad_norm": 0.9564589262008667, + "learning_rate": 4.80330733721471e-05, + "loss": 0.6466, + "step": 160510 + }, + { + "epoch": 1.0255165276056375, + "grad_norm": 0.9664949774742126, + "learning_rate": 4.802805958104043e-05, + "loss": 0.8345, + "step": 160520 + }, + { + "epoch": 1.0255804147553762, + "grad_norm": 0.4916277229785919, + "learning_rate": 4.80230458097929e-05, + "loss": 0.7715, + "step": 160530 + }, + { + "epoch": 1.025644301905115, + "grad_norm": 0.894357442855835, + "learning_rate": 
4.801803205845501e-05, + "loss": 0.8438, + "step": 160540 + }, + { + "epoch": 1.0257081890548536, + "grad_norm": 1.06783926486969, + "learning_rate": 4.801301832707725e-05, + "loss": 0.811, + "step": 160550 + }, + { + "epoch": 1.0257720762045923, + "grad_norm": 0.6926699876785278, + "learning_rate": 4.800800461571012e-05, + "loss": 0.8775, + "step": 160560 + }, + { + "epoch": 1.0258359633543308, + "grad_norm": 0.8281245231628418, + "learning_rate": 4.800299092440411e-05, + "loss": 0.8978, + "step": 160570 + }, + { + "epoch": 1.0258998505040695, + "grad_norm": 2.3448851108551025, + "learning_rate": 4.7997977253209706e-05, + "loss": 0.7675, + "step": 160580 + }, + { + "epoch": 1.0259637376538082, + "grad_norm": 1.0262439250946045, + "learning_rate": 4.79929636021774e-05, + "loss": 0.9032, + "step": 160590 + }, + { + "epoch": 1.026027624803547, + "grad_norm": 0.8757922053337097, + "learning_rate": 4.798794997135769e-05, + "loss": 0.8369, + "step": 160600 + }, + { + "epoch": 1.0260915119532856, + "grad_norm": 0.669083833694458, + "learning_rate": 4.798293636080106e-05, + "loss": 0.8433, + "step": 160610 + }, + { + "epoch": 1.0261553991030243, + "grad_norm": 1.0054523944854736, + "learning_rate": 4.7977922770558e-05, + "loss": 0.8063, + "step": 160620 + }, + { + "epoch": 1.026219286252763, + "grad_norm": 0.6672931909561157, + "learning_rate": 4.797290920067902e-05, + "loss": 0.9353, + "step": 160630 + }, + { + "epoch": 1.0262831734025017, + "grad_norm": 1.1048996448516846, + "learning_rate": 4.796789565121459e-05, + "loss": 0.8202, + "step": 160640 + }, + { + "epoch": 1.0263470605522405, + "grad_norm": 0.6301406621932983, + "learning_rate": 4.796288212221521e-05, + "loss": 0.7345, + "step": 160650 + }, + { + "epoch": 1.0264109477019792, + "grad_norm": 0.7342906594276428, + "learning_rate": 4.795786861373137e-05, + "loss": 0.9213, + "step": 160660 + }, + { + "epoch": 1.0264748348517179, + "grad_norm": 0.8929392695426941, + "learning_rate": 4.795285512581356e-05, + 
"loss": 0.8837, + "step": 160670 + }, + { + "epoch": 1.0265387220014566, + "grad_norm": 0.8775836229324341, + "learning_rate": 4.7947841658512274e-05, + "loss": 0.7707, + "step": 160680 + }, + { + "epoch": 1.0266026091511953, + "grad_norm": 0.9756439328193665, + "learning_rate": 4.794282821187799e-05, + "loss": 1.0365, + "step": 160690 + }, + { + "epoch": 1.026666496300934, + "grad_norm": 0.5219516158103943, + "learning_rate": 4.793781478596121e-05, + "loss": 1.0974, + "step": 160700 + }, + { + "epoch": 1.0267303834506727, + "grad_norm": 0.6594055891036987, + "learning_rate": 4.793280138081241e-05, + "loss": 0.726, + "step": 160710 + }, + { + "epoch": 1.0267942706004114, + "grad_norm": 0.5748750567436218, + "learning_rate": 4.7927787996482095e-05, + "loss": 0.7363, + "step": 160720 + }, + { + "epoch": 1.02685815775015, + "grad_norm": 1.0073412656784058, + "learning_rate": 4.792277463302075e-05, + "loss": 0.9049, + "step": 160730 + }, + { + "epoch": 1.0269220448998888, + "grad_norm": 2.4105050563812256, + "learning_rate": 4.791776129047886e-05, + "loss": 0.9481, + "step": 160740 + }, + { + "epoch": 1.0269859320496275, + "grad_norm": 0.9212214946746826, + "learning_rate": 4.7912747968906916e-05, + "loss": 0.919, + "step": 160750 + }, + { + "epoch": 1.0270498191993662, + "grad_norm": 0.8846902847290039, + "learning_rate": 4.79077346683554e-05, + "loss": 0.6759, + "step": 160760 + }, + { + "epoch": 1.027113706349105, + "grad_norm": 2.740834951400757, + "learning_rate": 4.7902721388874814e-05, + "loss": 1.0255, + "step": 160770 + }, + { + "epoch": 1.0271775934988436, + "grad_norm": 1.0306124687194824, + "learning_rate": 4.7897708130515637e-05, + "loss": 0.8978, + "step": 160780 + }, + { + "epoch": 1.0272414806485823, + "grad_norm": 0.7168565988540649, + "learning_rate": 4.789269489332836e-05, + "loss": 0.9045, + "step": 160790 + }, + { + "epoch": 1.027305367798321, + "grad_norm": 1.0961785316467285, + "learning_rate": 4.788768167736346e-05, + "loss": 1.0505, + "step": 
160800 + }, + { + "epoch": 1.0273692549480598, + "grad_norm": 1.3778088092803955, + "learning_rate": 4.788266848267144e-05, + "loss": 0.7855, + "step": 160810 + }, + { + "epoch": 1.0274331420977985, + "grad_norm": 0.6504400372505188, + "learning_rate": 4.7877655309302776e-05, + "loss": 0.7996, + "step": 160820 + }, + { + "epoch": 1.0274970292475372, + "grad_norm": 1.6229898929595947, + "learning_rate": 4.7872642157307965e-05, + "loss": 0.8625, + "step": 160830 + }, + { + "epoch": 1.0275609163972759, + "grad_norm": 0.8949072957038879, + "learning_rate": 4.7867629026737484e-05, + "loss": 0.9184, + "step": 160840 + }, + { + "epoch": 1.0276248035470146, + "grad_norm": 1.153201699256897, + "learning_rate": 4.786261591764182e-05, + "loss": 0.7552, + "step": 160850 + }, + { + "epoch": 1.0276886906967533, + "grad_norm": 1.047953486442566, + "learning_rate": 4.785760283007147e-05, + "loss": 0.7731, + "step": 160860 + }, + { + "epoch": 1.027752577846492, + "grad_norm": 0.7609559893608093, + "learning_rate": 4.785258976407691e-05, + "loss": 0.8437, + "step": 160870 + }, + { + "epoch": 1.0278164649962307, + "grad_norm": 0.9429197907447815, + "learning_rate": 4.784757671970863e-05, + "loss": 1.2661, + "step": 160880 + }, + { + "epoch": 1.0278803521459694, + "grad_norm": 4.659643173217773, + "learning_rate": 4.784256369701711e-05, + "loss": 0.9362, + "step": 160890 + }, + { + "epoch": 1.027944239295708, + "grad_norm": 0.6563370823860168, + "learning_rate": 4.783755069605284e-05, + "loss": 0.64, + "step": 160900 + }, + { + "epoch": 1.0280081264454468, + "grad_norm": 0.8179851174354553, + "learning_rate": 4.78325377168663e-05, + "loss": 0.9857, + "step": 160910 + }, + { + "epoch": 1.0280720135951855, + "grad_norm": 1.0126748085021973, + "learning_rate": 4.782752475950799e-05, + "loss": 0.7072, + "step": 160920 + }, + { + "epoch": 1.0281359007449242, + "grad_norm": 1.2026677131652832, + "learning_rate": 4.782251182402838e-05, + "loss": 0.9921, + "step": 160930 + }, + { + "epoch": 
1.028199787894663, + "grad_norm": 0.667464554309845, + "learning_rate": 4.781749891047797e-05, + "loss": 0.8723, + "step": 160940 + }, + { + "epoch": 1.0282636750444016, + "grad_norm": 0.7997406125068665, + "learning_rate": 4.7812486018907224e-05, + "loss": 0.8121, + "step": 160950 + }, + { + "epoch": 1.0283275621941403, + "grad_norm": 0.905473530292511, + "learning_rate": 4.7807473149366636e-05, + "loss": 0.8253, + "step": 160960 + }, + { + "epoch": 1.028391449343879, + "grad_norm": 0.9246165752410889, + "learning_rate": 4.780246030190669e-05, + "loss": 0.8049, + "step": 160970 + }, + { + "epoch": 1.0284553364936178, + "grad_norm": 1.9537968635559082, + "learning_rate": 4.7797447476577876e-05, + "loss": 0.8806, + "step": 160980 + }, + { + "epoch": 1.0285192236433565, + "grad_norm": 1.1061798334121704, + "learning_rate": 4.779243467343068e-05, + "loss": 0.8522, + "step": 160990 + }, + { + "epoch": 1.0285831107930952, + "grad_norm": 0.8190047740936279, + "learning_rate": 4.778742189251556e-05, + "loss": 0.8833, + "step": 161000 + }, + { + "epoch": 1.0286469979428339, + "grad_norm": 0.997075617313385, + "learning_rate": 4.778240913388302e-05, + "loss": 1.2601, + "step": 161010 + }, + { + "epoch": 1.0287108850925726, + "grad_norm": 0.8031179308891296, + "learning_rate": 4.777739639758354e-05, + "loss": 0.7952, + "step": 161020 + }, + { + "epoch": 1.0287747722423113, + "grad_norm": 0.8493223786354065, + "learning_rate": 4.77723836836676e-05, + "loss": 0.7857, + "step": 161030 + }, + { + "epoch": 1.02883865939205, + "grad_norm": 0.6901337504386902, + "learning_rate": 4.776737099218568e-05, + "loss": 0.8275, + "step": 161040 + }, + { + "epoch": 1.0289025465417885, + "grad_norm": 0.6938019394874573, + "learning_rate": 4.7762358323188274e-05, + "loss": 1.0794, + "step": 161050 + }, + { + "epoch": 1.0289664336915272, + "grad_norm": 0.6040987372398376, + "learning_rate": 4.775734567672586e-05, + "loss": 0.748, + "step": 161060 + }, + { + "epoch": 1.0290303208412659, + 
"grad_norm": 0.8941619396209717, + "learning_rate": 4.775233305284891e-05, + "loss": 0.7298, + "step": 161070 + }, + { + "epoch": 1.0290942079910046, + "grad_norm": 1.5386172533035278, + "learning_rate": 4.7747320451607905e-05, + "loss": 0.8937, + "step": 161080 + }, + { + "epoch": 1.0291580951407433, + "grad_norm": 0.6366963982582092, + "learning_rate": 4.7742307873053336e-05, + "loss": 0.9512, + "step": 161090 + }, + { + "epoch": 1.029221982290482, + "grad_norm": 0.6901091933250427, + "learning_rate": 4.773729531723568e-05, + "loss": 1.0791, + "step": 161100 + }, + { + "epoch": 1.0292858694402207, + "grad_norm": 1.6313228607177734, + "learning_rate": 4.773228278420542e-05, + "loss": 0.8286, + "step": 161110 + }, + { + "epoch": 1.0293497565899594, + "grad_norm": 0.7802162170410156, + "learning_rate": 4.772727027401303e-05, + "loss": 0.9759, + "step": 161120 + }, + { + "epoch": 1.0294136437396981, + "grad_norm": 0.9796809554100037, + "learning_rate": 4.772225778670899e-05, + "loss": 0.9091, + "step": 161130 + }, + { + "epoch": 1.0294775308894368, + "grad_norm": 0.9127634167671204, + "learning_rate": 4.771724532234379e-05, + "loss": 0.8796, + "step": 161140 + }, + { + "epoch": 1.0295414180391755, + "grad_norm": 1.400619387626648, + "learning_rate": 4.771223288096791e-05, + "loss": 0.7692, + "step": 161150 + }, + { + "epoch": 1.0296053051889142, + "grad_norm": 0.8520326614379883, + "learning_rate": 4.7707220462631816e-05, + "loss": 0.729, + "step": 161160 + }, + { + "epoch": 1.029669192338653, + "grad_norm": 0.7154648900032043, + "learning_rate": 4.7702208067386e-05, + "loss": 0.7087, + "step": 161170 + }, + { + "epoch": 1.0297330794883917, + "grad_norm": 0.9384026527404785, + "learning_rate": 4.7697195695280924e-05, + "loss": 0.7901, + "step": 161180 + }, + { + "epoch": 1.0297969666381304, + "grad_norm": 0.7101640701293945, + "learning_rate": 4.769218334636709e-05, + "loss": 0.9741, + "step": 161190 + }, + { + "epoch": 1.029860853787869, + "grad_norm": 
0.6075759530067444, + "learning_rate": 4.7687171020694974e-05, + "loss": 1.0605, + "step": 161200 + }, + { + "epoch": 1.0299247409376078, + "grad_norm": 0.9361954927444458, + "learning_rate": 4.7682158718315036e-05, + "loss": 0.7721, + "step": 161210 + }, + { + "epoch": 1.0299886280873465, + "grad_norm": 0.8033797144889832, + "learning_rate": 4.767714643927776e-05, + "loss": 1.1211, + "step": 161220 + }, + { + "epoch": 1.0300525152370852, + "grad_norm": 0.9793620109558105, + "learning_rate": 4.767213418363363e-05, + "loss": 0.8834, + "step": 161230 + }, + { + "epoch": 1.030116402386824, + "grad_norm": 1.3734352588653564, + "learning_rate": 4.766712195143313e-05, + "loss": 0.8944, + "step": 161240 + }, + { + "epoch": 1.0301802895365626, + "grad_norm": 1.7260838747024536, + "learning_rate": 4.766210974272673e-05, + "loss": 1.051, + "step": 161250 + }, + { + "epoch": 1.0302441766863013, + "grad_norm": 1.3381998538970947, + "learning_rate": 4.7657097557564906e-05, + "loss": 1.0072, + "step": 161260 + }, + { + "epoch": 1.03030806383604, + "grad_norm": 2.562208890914917, + "learning_rate": 4.765208539599814e-05, + "loss": 1.1557, + "step": 161270 + }, + { + "epoch": 1.0303719509857787, + "grad_norm": 0.8522435426712036, + "learning_rate": 4.7647073258076905e-05, + "loss": 0.8596, + "step": 161280 + }, + { + "epoch": 1.0304358381355174, + "grad_norm": 0.8830184936523438, + "learning_rate": 4.764206114385167e-05, + "loss": 0.7747, + "step": 161290 + }, + { + "epoch": 1.0304997252852561, + "grad_norm": 0.8866000175476074, + "learning_rate": 4.763704905337292e-05, + "loss": 0.8545, + "step": 161300 + }, + { + "epoch": 1.0305636124349948, + "grad_norm": 0.7133574485778809, + "learning_rate": 4.7632036986691136e-05, + "loss": 1.0307, + "step": 161310 + }, + { + "epoch": 1.0306274995847335, + "grad_norm": 0.8143234252929688, + "learning_rate": 4.762702494385679e-05, + "loss": 0.8395, + "step": 161320 + }, + { + "epoch": 1.0306913867344722, + "grad_norm": 0.9006192088127136, + 
"learning_rate": 4.762201292492035e-05, + "loss": 0.9308, + "step": 161330 + }, + { + "epoch": 1.030755273884211, + "grad_norm": 0.6531468033790588, + "learning_rate": 4.76170009299323e-05, + "loss": 1.047, + "step": 161340 + }, + { + "epoch": 1.0308191610339497, + "grad_norm": 1.5346288681030273, + "learning_rate": 4.7611988958943114e-05, + "loss": 0.6627, + "step": 161350 + }, + { + "epoch": 1.0308830481836884, + "grad_norm": 0.6416158080101013, + "learning_rate": 4.760697701200326e-05, + "loss": 1.0079, + "step": 161360 + }, + { + "epoch": 1.030946935333427, + "grad_norm": 1.5637730360031128, + "learning_rate": 4.760196508916323e-05, + "loss": 0.8233, + "step": 161370 + }, + { + "epoch": 1.0310108224831658, + "grad_norm": 0.7167168855667114, + "learning_rate": 4.759695319047347e-05, + "loss": 0.8423, + "step": 161380 + }, + { + "epoch": 1.0310747096329045, + "grad_norm": 0.7242134213447571, + "learning_rate": 4.7591941315984475e-05, + "loss": 0.8862, + "step": 161390 + }, + { + "epoch": 1.0311385967826432, + "grad_norm": 0.7080520391464233, + "learning_rate": 4.758692946574672e-05, + "loss": 0.6676, + "step": 161400 + }, + { + "epoch": 1.031202483932382, + "grad_norm": 0.8707887530326843, + "learning_rate": 4.7581917639810666e-05, + "loss": 0.7589, + "step": 161410 + }, + { + "epoch": 1.0312663710821206, + "grad_norm": 0.955916702747345, + "learning_rate": 4.7576905838226795e-05, + "loss": 0.7692, + "step": 161420 + }, + { + "epoch": 1.0313302582318593, + "grad_norm": 1.247024416923523, + "learning_rate": 4.7571894061045584e-05, + "loss": 0.8207, + "step": 161430 + }, + { + "epoch": 1.031394145381598, + "grad_norm": 0.6420993208885193, + "learning_rate": 4.756688230831748e-05, + "loss": 0.8466, + "step": 161440 + }, + { + "epoch": 1.0314580325313367, + "grad_norm": 1.2500863075256348, + "learning_rate": 4.7561870580093e-05, + "loss": 0.9945, + "step": 161450 + }, + { + "epoch": 1.0315219196810754, + "grad_norm": 0.887197732925415, + "learning_rate": 
4.755685887642258e-05, + "loss": 0.649, + "step": 161460 + }, + { + "epoch": 1.0315858068308141, + "grad_norm": 0.8526789546012878, + "learning_rate": 4.7551847197356715e-05, + "loss": 0.8747, + "step": 161470 + }, + { + "epoch": 1.0316496939805528, + "grad_norm": 2.258584499359131, + "learning_rate": 4.754683554294586e-05, + "loss": 1.0596, + "step": 161480 + }, + { + "epoch": 1.0317135811302915, + "grad_norm": 1.2613749504089355, + "learning_rate": 4.7541823913240494e-05, + "loss": 0.8289, + "step": 161490 + }, + { + "epoch": 1.0317774682800303, + "grad_norm": 0.9289235472679138, + "learning_rate": 4.753681230829109e-05, + "loss": 0.7338, + "step": 161500 + }, + { + "epoch": 1.031841355429769, + "grad_norm": 1.704034447669983, + "learning_rate": 4.753180072814812e-05, + "loss": 0.7966, + "step": 161510 + }, + { + "epoch": 1.0319052425795077, + "grad_norm": 0.9457545876502991, + "learning_rate": 4.752678917286205e-05, + "loss": 0.8958, + "step": 161520 + }, + { + "epoch": 1.0319691297292461, + "grad_norm": 0.8356863856315613, + "learning_rate": 4.752177764248335e-05, + "loss": 0.8386, + "step": 161530 + }, + { + "epoch": 1.0320330168789849, + "grad_norm": 0.8406546711921692, + "learning_rate": 4.751676613706249e-05, + "loss": 0.9743, + "step": 161540 + }, + { + "epoch": 1.0320969040287236, + "grad_norm": 0.7396709322929382, + "learning_rate": 4.751175465664996e-05, + "loss": 0.7308, + "step": 161550 + }, + { + "epoch": 1.0321607911784623, + "grad_norm": 0.9812073707580566, + "learning_rate": 4.7506743201296195e-05, + "loss": 0.8928, + "step": 161560 + }, + { + "epoch": 1.032224678328201, + "grad_norm": 1.6784000396728516, + "learning_rate": 4.750173177105169e-05, + "loss": 0.9561, + "step": 161570 + }, + { + "epoch": 1.0322885654779397, + "grad_norm": 0.9261510372161865, + "learning_rate": 4.7496720365966904e-05, + "loss": 0.6682, + "step": 161580 + }, + { + "epoch": 1.0323524526276784, + "grad_norm": 0.7271578311920166, + "learning_rate": 4.749170898609231e-05, + 
"loss": 0.9699, + "step": 161590 + }, + { + "epoch": 1.032416339777417, + "grad_norm": 1.4668047428131104, + "learning_rate": 4.7486697631478376e-05, + "loss": 0.8962, + "step": 161600 + }, + { + "epoch": 1.0324802269271558, + "grad_norm": 1.041753888130188, + "learning_rate": 4.748168630217557e-05, + "loss": 0.7792, + "step": 161610 + }, + { + "epoch": 1.0325441140768945, + "grad_norm": 0.7902843952178955, + "learning_rate": 4.7476674998234374e-05, + "loss": 0.6976, + "step": 161620 + }, + { + "epoch": 1.0326080012266332, + "grad_norm": 0.85538250207901, + "learning_rate": 4.747166371970523e-05, + "loss": 1.0145, + "step": 161630 + }, + { + "epoch": 1.032671888376372, + "grad_norm": 0.8777297735214233, + "learning_rate": 4.746665246663862e-05, + "loss": 0.7155, + "step": 161640 + }, + { + "epoch": 1.0327357755261106, + "grad_norm": 1.4333163499832153, + "learning_rate": 4.746164123908502e-05, + "loss": 0.7761, + "step": 161650 + }, + { + "epoch": 1.0327996626758493, + "grad_norm": 1.410051941871643, + "learning_rate": 4.7456630037094884e-05, + "loss": 1.1004, + "step": 161660 + }, + { + "epoch": 1.032863549825588, + "grad_norm": 0.6612430214881897, + "learning_rate": 4.745161886071868e-05, + "loss": 0.8089, + "step": 161670 + }, + { + "epoch": 1.0329274369753267, + "grad_norm": 0.8079758882522583, + "learning_rate": 4.744660771000688e-05, + "loss": 0.8166, + "step": 161680 + }, + { + "epoch": 1.0329913241250654, + "grad_norm": 1.0633221864700317, + "learning_rate": 4.744159658500996e-05, + "loss": 0.8759, + "step": 161690 + }, + { + "epoch": 1.0330552112748042, + "grad_norm": 0.620724618434906, + "learning_rate": 4.743658548577836e-05, + "loss": 0.8089, + "step": 161700 + }, + { + "epoch": 1.0331190984245429, + "grad_norm": 0.9149288535118103, + "learning_rate": 4.743157441236257e-05, + "loss": 0.7886, + "step": 161710 + }, + { + "epoch": 1.0331829855742816, + "grad_norm": 1.0109933614730835, + "learning_rate": 4.742656336481305e-05, + "loss": 0.9348, + "step": 
161720 + }, + { + "epoch": 1.0332468727240203, + "grad_norm": 0.9548466205596924, + "learning_rate": 4.742155234318026e-05, + "loss": 0.8496, + "step": 161730 + }, + { + "epoch": 1.033310759873759, + "grad_norm": 0.7352958917617798, + "learning_rate": 4.741654134751467e-05, + "loss": 0.8121, + "step": 161740 + }, + { + "epoch": 1.0333746470234977, + "grad_norm": 0.6567855477333069, + "learning_rate": 4.7411530377866745e-05, + "loss": 1.0109, + "step": 161750 + }, + { + "epoch": 1.0334385341732364, + "grad_norm": 1.1146572828292847, + "learning_rate": 4.740651943428693e-05, + "loss": 1.097, + "step": 161760 + }, + { + "epoch": 1.033502421322975, + "grad_norm": 1.0022268295288086, + "learning_rate": 4.740150851682572e-05, + "loss": 0.8517, + "step": 161770 + }, + { + "epoch": 1.0335663084727138, + "grad_norm": 1.1390724182128906, + "learning_rate": 4.739649762553358e-05, + "loss": 0.9655, + "step": 161780 + }, + { + "epoch": 1.0336301956224525, + "grad_norm": 0.6415576934814453, + "learning_rate": 4.7391486760460946e-05, + "loss": 0.9868, + "step": 161790 + }, + { + "epoch": 1.0336940827721912, + "grad_norm": 0.741622269153595, + "learning_rate": 4.73864759216583e-05, + "loss": 1.0249, + "step": 161800 + }, + { + "epoch": 1.03375796992193, + "grad_norm": 0.6062465310096741, + "learning_rate": 4.738146510917611e-05, + "loss": 0.9439, + "step": 161810 + }, + { + "epoch": 1.0338218570716686, + "grad_norm": 1.0667154788970947, + "learning_rate": 4.737645432306483e-05, + "loss": 0.9751, + "step": 161820 + }, + { + "epoch": 1.0338857442214073, + "grad_norm": 0.8074294328689575, + "learning_rate": 4.737144356337492e-05, + "loss": 0.779, + "step": 161830 + }, + { + "epoch": 1.033949631371146, + "grad_norm": 0.8012452125549316, + "learning_rate": 4.736643283015685e-05, + "loss": 0.7509, + "step": 161840 + }, + { + "epoch": 1.0340135185208847, + "grad_norm": 0.5732237696647644, + "learning_rate": 4.736142212346108e-05, + "loss": 0.8607, + "step": 161850 + }, + { + "epoch": 
1.0340774056706235, + "grad_norm": 6.360764503479004, + "learning_rate": 4.7356411443338064e-05, + "loss": 0.7453, + "step": 161860 + }, + { + "epoch": 1.0341412928203622, + "grad_norm": 0.8119571208953857, + "learning_rate": 4.735140078983828e-05, + "loss": 0.9051, + "step": 161870 + }, + { + "epoch": 1.0342051799701009, + "grad_norm": 0.9205659031867981, + "learning_rate": 4.7346390163012186e-05, + "loss": 0.89, + "step": 161880 + }, + { + "epoch": 1.0342690671198396, + "grad_norm": 0.8405207395553589, + "learning_rate": 4.734137956291023e-05, + "loss": 0.9451, + "step": 161890 + }, + { + "epoch": 1.0343329542695783, + "grad_norm": 0.6272563934326172, + "learning_rate": 4.733636898958289e-05, + "loss": 0.9643, + "step": 161900 + }, + { + "epoch": 1.034396841419317, + "grad_norm": 1.0878784656524658, + "learning_rate": 4.733135844308061e-05, + "loss": 0.9135, + "step": 161910 + }, + { + "epoch": 1.0344607285690557, + "grad_norm": 1.5157370567321777, + "learning_rate": 4.732634792345386e-05, + "loss": 0.62, + "step": 161920 + }, + { + "epoch": 1.0345246157187944, + "grad_norm": 0.5053197145462036, + "learning_rate": 4.73213374307531e-05, + "loss": 0.711, + "step": 161930 + }, + { + "epoch": 1.034588502868533, + "grad_norm": 0.8659653663635254, + "learning_rate": 4.7316326965028795e-05, + "loss": 0.9249, + "step": 161940 + }, + { + "epoch": 1.0346523900182718, + "grad_norm": 1.141728162765503, + "learning_rate": 4.731131652633139e-05, + "loss": 0.9143, + "step": 161950 + }, + { + "epoch": 1.0347162771680105, + "grad_norm": 1.3741239309310913, + "learning_rate": 4.730630611471137e-05, + "loss": 0.6867, + "step": 161960 + }, + { + "epoch": 1.0347801643177492, + "grad_norm": 0.9329667091369629, + "learning_rate": 4.7301295730219156e-05, + "loss": 0.8827, + "step": 161970 + }, + { + "epoch": 1.034844051467488, + "grad_norm": 0.9854671359062195, + "learning_rate": 4.7296285372905234e-05, + "loss": 0.8048, + "step": 161980 + }, + { + "epoch": 1.0349079386172266, + 
"grad_norm": 0.6096177101135254, + "learning_rate": 4.729127504282007e-05, + "loss": 0.8919, + "step": 161990 + }, + { + "epoch": 1.0349718257669653, + "grad_norm": 0.7038993239402771, + "learning_rate": 4.7286264740014105e-05, + "loss": 0.8944, + "step": 162000 + }, + { + "epoch": 1.035035712916704, + "grad_norm": 0.6950554847717285, + "learning_rate": 4.7281254464537796e-05, + "loss": 0.9064, + "step": 162010 + }, + { + "epoch": 1.0350996000664425, + "grad_norm": 1.203827977180481, + "learning_rate": 4.7276244216441604e-05, + "loss": 0.8495, + "step": 162020 + }, + { + "epoch": 1.0351634872161812, + "grad_norm": 1.0846861600875854, + "learning_rate": 4.7271233995776e-05, + "loss": 0.8045, + "step": 162030 + }, + { + "epoch": 1.03522737436592, + "grad_norm": 1.1663875579833984, + "learning_rate": 4.726622380259145e-05, + "loss": 1.0716, + "step": 162040 + }, + { + "epoch": 1.0352912615156586, + "grad_norm": 1.1435000896453857, + "learning_rate": 4.726121363693837e-05, + "loss": 0.9198, + "step": 162050 + }, + { + "epoch": 1.0353551486653974, + "grad_norm": 1.8560707569122314, + "learning_rate": 4.725620349886723e-05, + "loss": 1.0904, + "step": 162060 + }, + { + "epoch": 1.035419035815136, + "grad_norm": 0.8612160086631775, + "learning_rate": 4.72511933884285e-05, + "loss": 1.0926, + "step": 162070 + }, + { + "epoch": 1.0354829229648748, + "grad_norm": 0.7381712794303894, + "learning_rate": 4.724618330567262e-05, + "loss": 0.9483, + "step": 162080 + }, + { + "epoch": 1.0355468101146135, + "grad_norm": 0.8830359578132629, + "learning_rate": 4.724117325065007e-05, + "loss": 0.8329, + "step": 162090 + }, + { + "epoch": 1.0356106972643522, + "grad_norm": 0.5636923313140869, + "learning_rate": 4.723616322341128e-05, + "loss": 0.9412, + "step": 162100 + }, + { + "epoch": 1.0356745844140909, + "grad_norm": 1.070669174194336, + "learning_rate": 4.723115322400673e-05, + "loss": 0.8123, + "step": 162110 + }, + { + "epoch": 1.0357384715638296, + "grad_norm": 
0.564687967300415, + "learning_rate": 4.7226143252486857e-05, + "loss": 0.8417, + "step": 162120 + }, + { + "epoch": 1.0358023587135683, + "grad_norm": 0.7282418012619019, + "learning_rate": 4.7221133308902126e-05, + "loss": 1.1135, + "step": 162130 + }, + { + "epoch": 1.035866245863307, + "grad_norm": 1.1398917436599731, + "learning_rate": 4.7216123393302984e-05, + "loss": 1.1587, + "step": 162140 + }, + { + "epoch": 1.0359301330130457, + "grad_norm": 0.6934093832969666, + "learning_rate": 4.721111350573989e-05, + "loss": 0.8892, + "step": 162150 + }, + { + "epoch": 1.0359940201627844, + "grad_norm": 1.1826916933059692, + "learning_rate": 4.7206103646263286e-05, + "loss": 0.8076, + "step": 162160 + }, + { + "epoch": 1.0360579073125231, + "grad_norm": 1.2234805822372437, + "learning_rate": 4.720109381492365e-05, + "loss": 0.8936, + "step": 162170 + }, + { + "epoch": 1.0361217944622618, + "grad_norm": 0.7950318455696106, + "learning_rate": 4.719608401177141e-05, + "loss": 0.9691, + "step": 162180 + }, + { + "epoch": 1.0361856816120005, + "grad_norm": 0.8671907186508179, + "learning_rate": 4.7191074236857034e-05, + "loss": 0.7834, + "step": 162190 + }, + { + "epoch": 1.0362495687617392, + "grad_norm": 1.9488028287887573, + "learning_rate": 4.718606449023097e-05, + "loss": 0.9613, + "step": 162200 + }, + { + "epoch": 1.036313455911478, + "grad_norm": 0.8187459111213684, + "learning_rate": 4.7181054771943675e-05, + "loss": 0.8139, + "step": 162210 + }, + { + "epoch": 1.0363773430612166, + "grad_norm": 0.9009087085723877, + "learning_rate": 4.717604508204559e-05, + "loss": 0.9519, + "step": 162220 + }, + { + "epoch": 1.0364412302109554, + "grad_norm": 0.9934369325637817, + "learning_rate": 4.717103542058717e-05, + "loss": 0.8313, + "step": 162230 + }, + { + "epoch": 1.036505117360694, + "grad_norm": 1.1523507833480835, + "learning_rate": 4.716602578761888e-05, + "loss": 0.7647, + "step": 162240 + }, + { + "epoch": 1.0365690045104328, + "grad_norm": 0.9399502277374268, + 
"learning_rate": 4.7161016183191165e-05, + "loss": 0.6769, + "step": 162250 + }, + { + "epoch": 1.0366328916601715, + "grad_norm": 1.1942522525787354, + "learning_rate": 4.715600660735446e-05, + "loss": 0.9192, + "step": 162260 + }, + { + "epoch": 1.0366967788099102, + "grad_norm": 0.9985781908035278, + "learning_rate": 4.715099706015924e-05, + "loss": 0.8788, + "step": 162270 + }, + { + "epoch": 1.0367606659596489, + "grad_norm": 0.9958245754241943, + "learning_rate": 4.7145987541655937e-05, + "loss": 0.8654, + "step": 162280 + }, + { + "epoch": 1.0368245531093876, + "grad_norm": 0.8271211981773376, + "learning_rate": 4.7140978051895006e-05, + "loss": 0.7453, + "step": 162290 + }, + { + "epoch": 1.0368884402591263, + "grad_norm": 0.8055025339126587, + "learning_rate": 4.713596859092691e-05, + "loss": 0.8883, + "step": 162300 + }, + { + "epoch": 1.036952327408865, + "grad_norm": 0.7597174644470215, + "learning_rate": 4.713095915880208e-05, + "loss": 0.7735, + "step": 162310 + }, + { + "epoch": 1.0370162145586037, + "grad_norm": 1.102772831916809, + "learning_rate": 4.7125949755570976e-05, + "loss": 0.9174, + "step": 162320 + }, + { + "epoch": 1.0370801017083424, + "grad_norm": 0.8158230781555176, + "learning_rate": 4.712094038128405e-05, + "loss": 1.0066, + "step": 162330 + }, + { + "epoch": 1.0371439888580811, + "grad_norm": 1.1537375450134277, + "learning_rate": 4.7115931035991737e-05, + "loss": 0.9317, + "step": 162340 + }, + { + "epoch": 1.0372078760078198, + "grad_norm": 1.2150251865386963, + "learning_rate": 4.7110921719744496e-05, + "loss": 0.8097, + "step": 162350 + }, + { + "epoch": 1.0372717631575585, + "grad_norm": 1.1560499668121338, + "learning_rate": 4.7105912432592776e-05, + "loss": 1.0273, + "step": 162360 + }, + { + "epoch": 1.0373356503072972, + "grad_norm": 0.8007164001464844, + "learning_rate": 4.710090317458702e-05, + "loss": 0.8984, + "step": 162370 + }, + { + "epoch": 1.037399537457036, + "grad_norm": 1.2273977994918823, + "learning_rate": 
4.7095893945777675e-05, + "loss": 0.6576, + "step": 162380 + }, + { + "epoch": 1.0374634246067747, + "grad_norm": 0.8803560733795166, + "learning_rate": 4.709088474621519e-05, + "loss": 0.6075, + "step": 162390 + }, + { + "epoch": 1.0375273117565134, + "grad_norm": 1.3692848682403564, + "learning_rate": 4.7085875575950014e-05, + "loss": 1.1301, + "step": 162400 + }, + { + "epoch": 1.037591198906252, + "grad_norm": 0.5952280163764954, + "learning_rate": 4.70808664350326e-05, + "loss": 0.958, + "step": 162410 + }, + { + "epoch": 1.0376550860559908, + "grad_norm": 0.9508340954780579, + "learning_rate": 4.7075857323513375e-05, + "loss": 0.881, + "step": 162420 + }, + { + "epoch": 1.0377189732057295, + "grad_norm": 1.1710052490234375, + "learning_rate": 4.70708482414428e-05, + "loss": 0.8735, + "step": 162430 + }, + { + "epoch": 1.0377828603554682, + "grad_norm": 0.9404975175857544, + "learning_rate": 4.706583918887131e-05, + "loss": 1.0104, + "step": 162440 + }, + { + "epoch": 1.037846747505207, + "grad_norm": 2.7931301593780518, + "learning_rate": 4.706083016584937e-05, + "loss": 0.9396, + "step": 162450 + }, + { + "epoch": 1.0379106346549456, + "grad_norm": 0.9627763628959656, + "learning_rate": 4.705582117242741e-05, + "loss": 1.0136, + "step": 162460 + }, + { + "epoch": 1.0379745218046843, + "grad_norm": 0.7976264953613281, + "learning_rate": 4.705081220865587e-05, + "loss": 0.8678, + "step": 162470 + }, + { + "epoch": 1.038038408954423, + "grad_norm": 1.0382015705108643, + "learning_rate": 4.7045803274585216e-05, + "loss": 0.9829, + "step": 162480 + }, + { + "epoch": 1.0381022961041617, + "grad_norm": 0.73431795835495, + "learning_rate": 4.704079437026586e-05, + "loss": 0.7218, + "step": 162490 + }, + { + "epoch": 1.0381661832539004, + "grad_norm": 0.4776628613471985, + "learning_rate": 4.703578549574827e-05, + "loss": 0.9868, + "step": 162500 + }, + { + "epoch": 1.038230070403639, + "grad_norm": 0.8821565508842468, + "learning_rate": 4.703077665108289e-05, + 
"loss": 0.785, + "step": 162510 + }, + { + "epoch": 1.0382939575533776, + "grad_norm": 0.6736978888511658, + "learning_rate": 4.7025767836320185e-05, + "loss": 0.7856, + "step": 162520 + }, + { + "epoch": 1.0383578447031163, + "grad_norm": 0.9216102361679077, + "learning_rate": 4.7020759051510543e-05, + "loss": 0.8021, + "step": 162530 + }, + { + "epoch": 1.038421731852855, + "grad_norm": 0.7360609173774719, + "learning_rate": 4.7015750296704435e-05, + "loss": 0.9183, + "step": 162540 + }, + { + "epoch": 1.0384856190025937, + "grad_norm": 4.63002347946167, + "learning_rate": 4.70107415719523e-05, + "loss": 0.8354, + "step": 162550 + }, + { + "epoch": 1.0385495061523324, + "grad_norm": 0.9079681038856506, + "learning_rate": 4.7005732877304594e-05, + "loss": 0.7523, + "step": 162560 + }, + { + "epoch": 1.0386133933020711, + "grad_norm": 1.2179920673370361, + "learning_rate": 4.700072421281174e-05, + "loss": 0.7573, + "step": 162570 + }, + { + "epoch": 1.0386772804518098, + "grad_norm": 0.7866430282592773, + "learning_rate": 4.699571557852419e-05, + "loss": 0.7788, + "step": 162580 + }, + { + "epoch": 1.0387411676015486, + "grad_norm": 0.9179165959358215, + "learning_rate": 4.6990706974492385e-05, + "loss": 1.2034, + "step": 162590 + }, + { + "epoch": 1.0388050547512873, + "grad_norm": 0.9241698384284973, + "learning_rate": 4.698569840076676e-05, + "loss": 1.0294, + "step": 162600 + }, + { + "epoch": 1.038868941901026, + "grad_norm": 0.9719100594520569, + "learning_rate": 4.698068985739775e-05, + "loss": 0.7556, + "step": 162610 + }, + { + "epoch": 1.0389328290507647, + "grad_norm": 1.2796732187271118, + "learning_rate": 4.6975681344435824e-05, + "loss": 0.9342, + "step": 162620 + }, + { + "epoch": 1.0389967162005034, + "grad_norm": 1.0012308359146118, + "learning_rate": 4.69706728619314e-05, + "loss": 1.0237, + "step": 162630 + }, + { + "epoch": 1.039060603350242, + "grad_norm": 2.034093141555786, + "learning_rate": 4.6965664409934915e-05, + "loss": 0.7937, + "step": 
162640 + }, + { + "epoch": 1.0391244904999808, + "grad_norm": 0.9703361988067627, + "learning_rate": 4.696065598849682e-05, + "loss": 1.0253, + "step": 162650 + }, + { + "epoch": 1.0391883776497195, + "grad_norm": 0.8073797821998596, + "learning_rate": 4.695564759766754e-05, + "loss": 0.9453, + "step": 162660 + }, + { + "epoch": 1.0392522647994582, + "grad_norm": 1.053033471107483, + "learning_rate": 4.6950639237497526e-05, + "loss": 1.2163, + "step": 162670 + }, + { + "epoch": 1.039316151949197, + "grad_norm": 1.111161470413208, + "learning_rate": 4.694563090803722e-05, + "loss": 0.759, + "step": 162680 + }, + { + "epoch": 1.0393800390989356, + "grad_norm": 0.8086190819740295, + "learning_rate": 4.6940622609337046e-05, + "loss": 1.0759, + "step": 162690 + }, + { + "epoch": 1.0394439262486743, + "grad_norm": 1.1267038583755493, + "learning_rate": 4.6935614341447455e-05, + "loss": 1.0972, + "step": 162700 + }, + { + "epoch": 1.039507813398413, + "grad_norm": 0.6237444877624512, + "learning_rate": 4.693060610441887e-05, + "loss": 0.8735, + "step": 162710 + }, + { + "epoch": 1.0395717005481517, + "grad_norm": 1.0405194759368896, + "learning_rate": 4.6925597898301746e-05, + "loss": 0.9158, + "step": 162720 + }, + { + "epoch": 1.0396355876978904, + "grad_norm": 0.8615335822105408, + "learning_rate": 4.692058972314651e-05, + "loss": 0.9571, + "step": 162730 + }, + { + "epoch": 1.0396994748476291, + "grad_norm": 1.0155763626098633, + "learning_rate": 4.691558157900359e-05, + "loss": 0.9306, + "step": 162740 + }, + { + "epoch": 1.0397633619973679, + "grad_norm": 1.2488973140716553, + "learning_rate": 4.691057346592344e-05, + "loss": 1.2076, + "step": 162750 + }, + { + "epoch": 1.0398272491471066, + "grad_norm": 0.745689332485199, + "learning_rate": 4.690556538395648e-05, + "loss": 0.7862, + "step": 162760 + }, + { + "epoch": 1.0398911362968453, + "grad_norm": 0.8243439793586731, + "learning_rate": 4.690055733315317e-05, + "loss": 0.6247, + "step": 162770 + }, + { + 
"epoch": 1.039955023446584, + "grad_norm": 1.0167168378829956, + "learning_rate": 4.689554931356391e-05, + "loss": 0.8157, + "step": 162780 + }, + { + "epoch": 1.0400189105963227, + "grad_norm": 0.9779410362243652, + "learning_rate": 4.689054132523917e-05, + "loss": 0.8555, + "step": 162790 + }, + { + "epoch": 1.0400827977460614, + "grad_norm": 0.9805686473846436, + "learning_rate": 4.688553336822936e-05, + "loss": 0.7161, + "step": 162800 + }, + { + "epoch": 1.0401466848958, + "grad_norm": 0.9556584358215332, + "learning_rate": 4.688052544258493e-05, + "loss": 0.7346, + "step": 162810 + }, + { + "epoch": 1.0402105720455388, + "grad_norm": 0.589160680770874, + "learning_rate": 4.687551754835629e-05, + "loss": 0.8524, + "step": 162820 + }, + { + "epoch": 1.0402744591952775, + "grad_norm": 0.9270586371421814, + "learning_rate": 4.6870509685593905e-05, + "loss": 0.8096, + "step": 162830 + }, + { + "epoch": 1.0403383463450162, + "grad_norm": 0.8865234851837158, + "learning_rate": 4.686550185434819e-05, + "loss": 1.0351, + "step": 162840 + }, + { + "epoch": 1.040402233494755, + "grad_norm": 0.7455191612243652, + "learning_rate": 4.6860494054669593e-05, + "loss": 0.9099, + "step": 162850 + }, + { + "epoch": 1.0404661206444936, + "grad_norm": 1.0902010202407837, + "learning_rate": 4.6855486286608526e-05, + "loss": 0.9958, + "step": 162860 + }, + { + "epoch": 1.0405300077942323, + "grad_norm": 0.9043089747428894, + "learning_rate": 4.685047855021544e-05, + "loss": 0.7603, + "step": 162870 + }, + { + "epoch": 1.040593894943971, + "grad_norm": 1.1999502182006836, + "learning_rate": 4.6845470845540755e-05, + "loss": 0.6993, + "step": 162880 + }, + { + "epoch": 1.0406577820937097, + "grad_norm": 0.9678430557250977, + "learning_rate": 4.6840463172634915e-05, + "loss": 0.8959, + "step": 162890 + }, + { + "epoch": 1.0407216692434484, + "grad_norm": 1.4664934873580933, + "learning_rate": 4.6835455531548336e-05, + "loss": 0.7311, + "step": 162900 + }, + { + "epoch": 
1.0407855563931872, + "grad_norm": 1.0061843395233154, + "learning_rate": 4.683044792233146e-05, + "loss": 0.7976, + "step": 162910 + }, + { + "epoch": 1.0408494435429259, + "grad_norm": 1.5385732650756836, + "learning_rate": 4.6825440345034714e-05, + "loss": 0.7237, + "step": 162920 + }, + { + "epoch": 1.0409133306926646, + "grad_norm": 1.0533753633499146, + "learning_rate": 4.6820432799708536e-05, + "loss": 1.3697, + "step": 162930 + }, + { + "epoch": 1.0409772178424033, + "grad_norm": 0.6327069401741028, + "learning_rate": 4.681542528640335e-05, + "loss": 0.9806, + "step": 162940 + }, + { + "epoch": 1.041041104992142, + "grad_norm": 0.9591493010520935, + "learning_rate": 4.681041780516958e-05, + "loss": 0.9627, + "step": 162950 + }, + { + "epoch": 1.0411049921418807, + "grad_norm": 0.9499560594558716, + "learning_rate": 4.680541035605766e-05, + "loss": 0.986, + "step": 162960 + }, + { + "epoch": 1.0411688792916194, + "grad_norm": 1.0875968933105469, + "learning_rate": 4.680040293911803e-05, + "loss": 1.0011, + "step": 162970 + }, + { + "epoch": 1.0412327664413579, + "grad_norm": 0.7869101166725159, + "learning_rate": 4.679539555440111e-05, + "loss": 1.1579, + "step": 162980 + }, + { + "epoch": 1.0412966535910968, + "grad_norm": 0.9354702234268188, + "learning_rate": 4.6790388201957326e-05, + "loss": 0.7806, + "step": 162990 + }, + { + "epoch": 1.0413605407408353, + "grad_norm": 1.113673448562622, + "learning_rate": 4.678538088183711e-05, + "loss": 0.6678, + "step": 163000 + }, + { + "epoch": 1.041424427890574, + "grad_norm": 1.2135668992996216, + "learning_rate": 4.67803735940909e-05, + "loss": 1.1268, + "step": 163010 + }, + { + "epoch": 1.0414883150403127, + "grad_norm": 1.2587641477584839, + "learning_rate": 4.6775366338769106e-05, + "loss": 0.8575, + "step": 163020 + }, + { + "epoch": 1.0415522021900514, + "grad_norm": 0.9545971155166626, + "learning_rate": 4.677035911592216e-05, + "loss": 0.682, + "step": 163030 + }, + { + "epoch": 1.04161608933979, + 
"grad_norm": 0.8569166660308838, + "learning_rate": 4.676535192560049e-05, + "loss": 1.0957, + "step": 163040 + }, + { + "epoch": 1.0416799764895288, + "grad_norm": 1.1652165651321411, + "learning_rate": 4.6760344767854524e-05, + "loss": 0.9504, + "step": 163050 + }, + { + "epoch": 1.0417438636392675, + "grad_norm": 1.0948913097381592, + "learning_rate": 4.675533764273469e-05, + "loss": 1.0243, + "step": 163060 + }, + { + "epoch": 1.0418077507890062, + "grad_norm": 1.0545660257339478, + "learning_rate": 4.6750330550291414e-05, + "loss": 1.0981, + "step": 163070 + }, + { + "epoch": 1.041871637938745, + "grad_norm": 0.7978355884552002, + "learning_rate": 4.674532349057512e-05, + "loss": 0.7644, + "step": 163080 + }, + { + "epoch": 1.0419355250884836, + "grad_norm": 1.381232738494873, + "learning_rate": 4.674031646363624e-05, + "loss": 0.8781, + "step": 163090 + }, + { + "epoch": 1.0419994122382223, + "grad_norm": 0.7706129550933838, + "learning_rate": 4.673530946952518e-05, + "loss": 0.8316, + "step": 163100 + }, + { + "epoch": 1.042063299387961, + "grad_norm": 1.1203925609588623, + "learning_rate": 4.673030250829239e-05, + "loss": 1.0133, + "step": 163110 + }, + { + "epoch": 1.0421271865376998, + "grad_norm": 1.0205503702163696, + "learning_rate": 4.672529557998828e-05, + "loss": 0.858, + "step": 163120 + }, + { + "epoch": 1.0421910736874385, + "grad_norm": 0.9623286128044128, + "learning_rate": 4.672028868466327e-05, + "loss": 0.8325, + "step": 163130 + }, + { + "epoch": 1.0422549608371772, + "grad_norm": 1.5411490201950073, + "learning_rate": 4.6715281822367786e-05, + "loss": 0.8526, + "step": 163140 + }, + { + "epoch": 1.0423188479869159, + "grad_norm": 1.3137603998184204, + "learning_rate": 4.6710274993152264e-05, + "loss": 0.7788, + "step": 163150 + }, + { + "epoch": 1.0423827351366546, + "grad_norm": 0.8402478098869324, + "learning_rate": 4.670526819706712e-05, + "loss": 0.8678, + "step": 163160 + }, + { + "epoch": 1.0424466222863933, + "grad_norm": 
1.447865605354309, + "learning_rate": 4.6700261434162774e-05, + "loss": 1.0133, + "step": 163170 + }, + { + "epoch": 1.042510509436132, + "grad_norm": 0.958998441696167, + "learning_rate": 4.669525470448965e-05, + "loss": 1.0255, + "step": 163180 + }, + { + "epoch": 1.0425743965858707, + "grad_norm": 0.8727611303329468, + "learning_rate": 4.669024800809817e-05, + "loss": 0.7536, + "step": 163190 + }, + { + "epoch": 1.0426382837356094, + "grad_norm": 1.4122403860092163, + "learning_rate": 4.668524134503875e-05, + "loss": 0.8819, + "step": 163200 + }, + { + "epoch": 1.0427021708853481, + "grad_norm": 1.0412826538085938, + "learning_rate": 4.668023471536183e-05, + "loss": 0.7679, + "step": 163210 + }, + { + "epoch": 1.0427660580350868, + "grad_norm": 0.7801997065544128, + "learning_rate": 4.6675228119117816e-05, + "loss": 0.8101, + "step": 163220 + }, + { + "epoch": 1.0428299451848255, + "grad_norm": 0.6483613848686218, + "learning_rate": 4.667022155635713e-05, + "loss": 1.0608, + "step": 163230 + }, + { + "epoch": 1.0428938323345642, + "grad_norm": 0.8273327946662903, + "learning_rate": 4.666521502713019e-05, + "loss": 1.1278, + "step": 163240 + }, + { + "epoch": 1.042957719484303, + "grad_norm": 0.8206183910369873, + "learning_rate": 4.666020853148742e-05, + "loss": 0.8313, + "step": 163250 + }, + { + "epoch": 1.0430216066340416, + "grad_norm": 1.0390762090682983, + "learning_rate": 4.6655202069479245e-05, + "loss": 0.8014, + "step": 163260 + }, + { + "epoch": 1.0430854937837803, + "grad_norm": 1.1970301866531372, + "learning_rate": 4.665019564115607e-05, + "loss": 0.7897, + "step": 163270 + }, + { + "epoch": 1.043149380933519, + "grad_norm": 0.8434377908706665, + "learning_rate": 4.6645189246568325e-05, + "loss": 0.892, + "step": 163280 + }, + { + "epoch": 1.0432132680832578, + "grad_norm": 0.9261834025382996, + "learning_rate": 4.6640182885766434e-05, + "loss": 0.858, + "step": 163290 + }, + { + "epoch": 1.0432771552329965, + "grad_norm": 0.863802433013916, + 
"learning_rate": 4.66351765588008e-05, + "loss": 0.9973, + "step": 163300 + }, + { + "epoch": 1.0433410423827352, + "grad_norm": 1.4032562971115112, + "learning_rate": 4.663017026572185e-05, + "loss": 0.8758, + "step": 163310 + }, + { + "epoch": 1.0434049295324739, + "grad_norm": 0.7954492568969727, + "learning_rate": 4.6625164006580006e-05, + "loss": 0.7323, + "step": 163320 + }, + { + "epoch": 1.0434688166822126, + "grad_norm": 0.5905239582061768, + "learning_rate": 4.662015778142568e-05, + "loss": 0.854, + "step": 163330 + }, + { + "epoch": 1.0435327038319513, + "grad_norm": 0.6909790635108948, + "learning_rate": 4.661515159030927e-05, + "loss": 0.8778, + "step": 163340 + }, + { + "epoch": 1.04359659098169, + "grad_norm": 2.076235055923462, + "learning_rate": 4.6610145433281225e-05, + "loss": 1.1878, + "step": 163350 + }, + { + "epoch": 1.0436604781314287, + "grad_norm": 0.9670212268829346, + "learning_rate": 4.6605139310391956e-05, + "loss": 1.1208, + "step": 163360 + }, + { + "epoch": 1.0437243652811674, + "grad_norm": 0.9897720813751221, + "learning_rate": 4.6600133221691865e-05, + "loss": 0.92, + "step": 163370 + }, + { + "epoch": 1.0437882524309061, + "grad_norm": 0.831823468208313, + "learning_rate": 4.659512716723138e-05, + "loss": 1.0368, + "step": 163380 + }, + { + "epoch": 1.0438521395806448, + "grad_norm": 1.1693984270095825, + "learning_rate": 4.65901211470609e-05, + "loss": 0.9192, + "step": 163390 + }, + { + "epoch": 1.0439160267303835, + "grad_norm": 1.2230675220489502, + "learning_rate": 4.6585115161230855e-05, + "loss": 0.8708, + "step": 163400 + }, + { + "epoch": 1.0439799138801222, + "grad_norm": 0.6010975241661072, + "learning_rate": 4.658010920979165e-05, + "loss": 0.7601, + "step": 163410 + }, + { + "epoch": 1.044043801029861, + "grad_norm": 0.9563870429992676, + "learning_rate": 4.657510329279371e-05, + "loss": 0.8896, + "step": 163420 + }, + { + "epoch": 1.0441076881795996, + "grad_norm": 0.8813214302062988, + "learning_rate": 
4.657009741028745e-05, + "loss": 1.0249, + "step": 163430 + }, + { + "epoch": 1.0441715753293384, + "grad_norm": 1.9995386600494385, + "learning_rate": 4.656509156232326e-05, + "loss": 0.9217, + "step": 163440 + }, + { + "epoch": 1.044235462479077, + "grad_norm": 1.1445976495742798, + "learning_rate": 4.6560085748951574e-05, + "loss": 0.8641, + "step": 163450 + }, + { + "epoch": 1.0442993496288158, + "grad_norm": 0.9022873640060425, + "learning_rate": 4.655507997022281e-05, + "loss": 0.6487, + "step": 163460 + }, + { + "epoch": 1.0443632367785542, + "grad_norm": 1.0987581014633179, + "learning_rate": 4.6550074226187364e-05, + "loss": 1.0106, + "step": 163470 + }, + { + "epoch": 1.0444271239282932, + "grad_norm": 0.8926182389259338, + "learning_rate": 4.654506851689566e-05, + "loss": 0.9113, + "step": 163480 + }, + { + "epoch": 1.0444910110780317, + "grad_norm": 0.9439060688018799, + "learning_rate": 4.6540062842398106e-05, + "loss": 0.7761, + "step": 163490 + }, + { + "epoch": 1.0445548982277704, + "grad_norm": 1.0596351623535156, + "learning_rate": 4.6535057202745105e-05, + "loss": 0.7039, + "step": 163500 + }, + { + "epoch": 1.044618785377509, + "grad_norm": 1.1492658853530884, + "learning_rate": 4.6530051597987076e-05, + "loss": 0.7879, + "step": 163510 + }, + { + "epoch": 1.0446826725272478, + "grad_norm": 0.6139842867851257, + "learning_rate": 4.6525046028174435e-05, + "loss": 0.7387, + "step": 163520 + }, + { + "epoch": 1.0447465596769865, + "grad_norm": 0.8731631636619568, + "learning_rate": 4.6520040493357584e-05, + "loss": 0.7721, + "step": 163530 + }, + { + "epoch": 1.0448104468267252, + "grad_norm": 1.1998591423034668, + "learning_rate": 4.651503499358694e-05, + "loss": 0.7002, + "step": 163540 + }, + { + "epoch": 1.044874333976464, + "grad_norm": 0.9016870856285095, + "learning_rate": 4.65100295289129e-05, + "loss": 0.707, + "step": 163550 + }, + { + "epoch": 1.0449382211262026, + "grad_norm": 0.9702405333518982, + "learning_rate": 
4.650502409938589e-05, + "loss": 0.9536, + "step": 163560 + }, + { + "epoch": 1.0450021082759413, + "grad_norm": 1.6170375347137451, + "learning_rate": 4.6500018705056295e-05, + "loss": 0.5949, + "step": 163570 + }, + { + "epoch": 1.04506599542568, + "grad_norm": 1.246148943901062, + "learning_rate": 4.6495013345974555e-05, + "loss": 1.0669, + "step": 163580 + }, + { + "epoch": 1.0451298825754187, + "grad_norm": 0.6922361254692078, + "learning_rate": 4.6490008022191056e-05, + "loss": 0.7937, + "step": 163590 + }, + { + "epoch": 1.0451937697251574, + "grad_norm": 0.8997225761413574, + "learning_rate": 4.6485002733756214e-05, + "loss": 0.9853, + "step": 163600 + }, + { + "epoch": 1.0452576568748961, + "grad_norm": 0.808290421962738, + "learning_rate": 4.647999748072044e-05, + "loss": 0.9732, + "step": 163610 + }, + { + "epoch": 1.0453215440246348, + "grad_norm": 1.9581161737442017, + "learning_rate": 4.647499226313413e-05, + "loss": 0.9051, + "step": 163620 + }, + { + "epoch": 1.0453854311743735, + "grad_norm": 0.791298508644104, + "learning_rate": 4.646998708104771e-05, + "loss": 0.9813, + "step": 163630 + }, + { + "epoch": 1.0454493183241123, + "grad_norm": 0.7668251395225525, + "learning_rate": 4.646498193451156e-05, + "loss": 0.9142, + "step": 163640 + }, + { + "epoch": 1.045513205473851, + "grad_norm": 0.5910570621490479, + "learning_rate": 4.6459976823576105e-05, + "loss": 1.1384, + "step": 163650 + }, + { + "epoch": 1.0455770926235897, + "grad_norm": 0.6951436400413513, + "learning_rate": 4.645497174829173e-05, + "loss": 0.8321, + "step": 163660 + }, + { + "epoch": 1.0456409797733284, + "grad_norm": 1.029000997543335, + "learning_rate": 4.644996670870887e-05, + "loss": 0.8471, + "step": 163670 + }, + { + "epoch": 1.045704866923067, + "grad_norm": 1.0844112634658813, + "learning_rate": 4.644496170487792e-05, + "loss": 0.8395, + "step": 163680 + }, + { + "epoch": 1.0457687540728058, + "grad_norm": 1.0361007452011108, + "learning_rate": 4.6439956736849284e-05, + 
"loss": 0.9918, + "step": 163690 + }, + { + "epoch": 1.0458326412225445, + "grad_norm": 0.8928366899490356, + "learning_rate": 4.643495180467336e-05, + "loss": 0.9797, + "step": 163700 + }, + { + "epoch": 1.0458965283722832, + "grad_norm": 1.0317213535308838, + "learning_rate": 4.642994690840055e-05, + "loss": 0.8224, + "step": 163710 + }, + { + "epoch": 1.045960415522022, + "grad_norm": 0.8236901760101318, + "learning_rate": 4.6424942048081275e-05, + "loss": 0.773, + "step": 163720 + }, + { + "epoch": 1.0460243026717606, + "grad_norm": 1.1160997152328491, + "learning_rate": 4.641993722376591e-05, + "loss": 0.9447, + "step": 163730 + }, + { + "epoch": 1.0460881898214993, + "grad_norm": 1.206079125404358, + "learning_rate": 4.6414932435504886e-05, + "loss": 0.8785, + "step": 163740 + }, + { + "epoch": 1.046152076971238, + "grad_norm": 0.8513423204421997, + "learning_rate": 4.64099276833486e-05, + "loss": 0.8516, + "step": 163750 + }, + { + "epoch": 1.0462159641209767, + "grad_norm": 0.8137949705123901, + "learning_rate": 4.640492296734744e-05, + "loss": 0.7112, + "step": 163760 + }, + { + "epoch": 1.0462798512707154, + "grad_norm": 2.2618188858032227, + "learning_rate": 4.6399918287551814e-05, + "loss": 0.8609, + "step": 163770 + }, + { + "epoch": 1.0463437384204541, + "grad_norm": 0.6762095093727112, + "learning_rate": 4.639491364401212e-05, + "loss": 0.8314, + "step": 163780 + }, + { + "epoch": 1.0464076255701928, + "grad_norm": 0.492422491312027, + "learning_rate": 4.638990903677878e-05, + "loss": 0.726, + "step": 163790 + }, + { + "epoch": 1.0464715127199316, + "grad_norm": 2.151423215866089, + "learning_rate": 4.638540492135234e-05, + "loss": 1.0316, + "step": 163800 + }, + { + "epoch": 1.0465353998696703, + "grad_norm": 0.8146224021911621, + "learning_rate": 4.638040038323989e-05, + "loss": 1.0298, + "step": 163810 + }, + { + "epoch": 1.046599287019409, + "grad_norm": 0.9958155751228333, + "learning_rate": 4.637539588157993e-05, + "loss": 0.9971, + "step": 
163820 + }, + { + "epoch": 1.0466631741691477, + "grad_norm": 1.1163171529769897, + "learning_rate": 4.637039141642288e-05, + "loss": 0.9504, + "step": 163830 + }, + { + "epoch": 1.0467270613188864, + "grad_norm": 0.8440408110618591, + "learning_rate": 4.6365386987819124e-05, + "loss": 0.8803, + "step": 163840 + }, + { + "epoch": 1.046790948468625, + "grad_norm": 1.4019627571105957, + "learning_rate": 4.636038259581907e-05, + "loss": 0.87, + "step": 163850 + }, + { + "epoch": 1.0468548356183638, + "grad_norm": 2.8956496715545654, + "learning_rate": 4.635537824047311e-05, + "loss": 0.7409, + "step": 163860 + }, + { + "epoch": 1.0469187227681025, + "grad_norm": 0.9600827693939209, + "learning_rate": 4.6350373921831644e-05, + "loss": 0.6301, + "step": 163870 + }, + { + "epoch": 1.0469826099178412, + "grad_norm": 0.9884980320930481, + "learning_rate": 4.634536963994506e-05, + "loss": 0.8136, + "step": 163880 + }, + { + "epoch": 1.04704649706758, + "grad_norm": 0.8919425010681152, + "learning_rate": 4.634036539486378e-05, + "loss": 0.5576, + "step": 163890 + }, + { + "epoch": 1.0471103842173186, + "grad_norm": 1.3119444847106934, + "learning_rate": 4.6335361186638184e-05, + "loss": 1.0449, + "step": 163900 + }, + { + "epoch": 1.0471742713670573, + "grad_norm": 0.822734534740448, + "learning_rate": 4.633035701531867e-05, + "loss": 0.8107, + "step": 163910 + }, + { + "epoch": 1.047238158516796, + "grad_norm": 1.3529523611068726, + "learning_rate": 4.632535288095563e-05, + "loss": 0.8808, + "step": 163920 + }, + { + "epoch": 1.0473020456665347, + "grad_norm": 1.056562900543213, + "learning_rate": 4.6320348783599465e-05, + "loss": 0.8139, + "step": 163930 + }, + { + "epoch": 1.0473659328162734, + "grad_norm": 0.8713958859443665, + "learning_rate": 4.631534472330058e-05, + "loss": 1.2265, + "step": 163940 + }, + { + "epoch": 1.0474298199660121, + "grad_norm": 1.1219075918197632, + "learning_rate": 4.6310340700109355e-05, + "loss": 0.798, + "step": 163950 + }, + { + "epoch": 
1.0474937071157506, + "grad_norm": 0.8713673949241638, + "learning_rate": 4.6305336714076195e-05, + "loss": 1.0113, + "step": 163960 + }, + { + "epoch": 1.0475575942654893, + "grad_norm": 0.8879077434539795, + "learning_rate": 4.6300332765251485e-05, + "loss": 0.8673, + "step": 163970 + }, + { + "epoch": 1.047621481415228, + "grad_norm": 0.7119232416152954, + "learning_rate": 4.6295328853685626e-05, + "loss": 0.7477, + "step": 163980 + }, + { + "epoch": 1.0476853685649667, + "grad_norm": 0.6839287281036377, + "learning_rate": 4.629032497942901e-05, + "loss": 0.7913, + "step": 163990 + }, + { + "epoch": 1.0477492557147055, + "grad_norm": 0.6696777939796448, + "learning_rate": 4.628532114253203e-05, + "loss": 0.652, + "step": 164000 + }, + { + "epoch": 1.0478131428644442, + "grad_norm": 0.7793158292770386, + "learning_rate": 4.628031734304508e-05, + "loss": 0.5978, + "step": 164010 + }, + { + "epoch": 1.0478770300141829, + "grad_norm": 0.8177552819252014, + "learning_rate": 4.627531358101855e-05, + "loss": 0.9784, + "step": 164020 + }, + { + "epoch": 1.0479409171639216, + "grad_norm": 1.0433216094970703, + "learning_rate": 4.6270309856502844e-05, + "loss": 0.8135, + "step": 164030 + }, + { + "epoch": 1.0480048043136603, + "grad_norm": 1.0402250289916992, + "learning_rate": 4.6265306169548344e-05, + "loss": 0.9751, + "step": 164040 + }, + { + "epoch": 1.048068691463399, + "grad_norm": 0.9214003086090088, + "learning_rate": 4.6260302520205434e-05, + "loss": 0.9328, + "step": 164050 + }, + { + "epoch": 1.0481325786131377, + "grad_norm": 0.8937646746635437, + "learning_rate": 4.625529890852452e-05, + "loss": 0.8227, + "step": 164060 + }, + { + "epoch": 1.0481964657628764, + "grad_norm": 0.989142656326294, + "learning_rate": 4.6250295334555984e-05, + "loss": 0.6317, + "step": 164070 + }, + { + "epoch": 1.048260352912615, + "grad_norm": 0.8708199858665466, + "learning_rate": 4.6245291798350214e-05, + "loss": 0.8059, + "step": 164080 + }, + { + "epoch": 1.0483242400623538, 
+ "grad_norm": 0.7595809102058411, + "learning_rate": 4.624028829995761e-05, + "loss": 0.8904, + "step": 164090 + }, + { + "epoch": 1.0483881272120925, + "grad_norm": 0.7372194528579712, + "learning_rate": 4.623528483942855e-05, + "loss": 0.7613, + "step": 164100 + }, + { + "epoch": 1.0484520143618312, + "grad_norm": 1.089552402496338, + "learning_rate": 4.623028141681343e-05, + "loss": 1.0416, + "step": 164110 + }, + { + "epoch": 1.04851590151157, + "grad_norm": 1.4906351566314697, + "learning_rate": 4.6225278032162647e-05, + "loss": 0.7298, + "step": 164120 + }, + { + "epoch": 1.0485797886613086, + "grad_norm": 0.742764413356781, + "learning_rate": 4.622027468552658e-05, + "loss": 0.9095, + "step": 164130 + }, + { + "epoch": 1.0486436758110473, + "grad_norm": 1.2194721698760986, + "learning_rate": 4.6215271376955606e-05, + "loss": 1.0118, + "step": 164140 + }, + { + "epoch": 1.048707562960786, + "grad_norm": 0.6740885376930237, + "learning_rate": 4.621026810650012e-05, + "loss": 1.0442, + "step": 164150 + }, + { + "epoch": 1.0487714501105247, + "grad_norm": 0.9453091621398926, + "learning_rate": 4.620526487421052e-05, + "loss": 0.811, + "step": 164160 + }, + { + "epoch": 1.0488353372602635, + "grad_norm": 0.7730445265769958, + "learning_rate": 4.620026168013718e-05, + "loss": 0.8522, + "step": 164170 + }, + { + "epoch": 1.0488992244100022, + "grad_norm": 0.7625861167907715, + "learning_rate": 4.61952585243305e-05, + "loss": 1.0819, + "step": 164180 + }, + { + "epoch": 1.0489631115597409, + "grad_norm": 0.8981362581253052, + "learning_rate": 4.6190255406840855e-05, + "loss": 0.9319, + "step": 164190 + }, + { + "epoch": 1.0490269987094796, + "grad_norm": 0.9963375926017761, + "learning_rate": 4.618525232771863e-05, + "loss": 0.9983, + "step": 164200 + }, + { + "epoch": 1.0490908858592183, + "grad_norm": 0.873155951499939, + "learning_rate": 4.618024928701422e-05, + "loss": 0.85, + "step": 164210 + }, + { + "epoch": 1.049154773008957, + "grad_norm": 
1.5402940511703491, + "learning_rate": 4.6175246284778e-05, + "loss": 0.7995, + "step": 164220 + }, + { + "epoch": 1.0492186601586957, + "grad_norm": 0.8315989375114441, + "learning_rate": 4.6170243321060356e-05, + "loss": 0.9546, + "step": 164230 + }, + { + "epoch": 1.0492825473084344, + "grad_norm": 0.9614139795303345, + "learning_rate": 4.616524039591168e-05, + "loss": 0.8737, + "step": 164240 + }, + { + "epoch": 1.049346434458173, + "grad_norm": 0.5436043739318848, + "learning_rate": 4.616023750938235e-05, + "loss": 0.8369, + "step": 164250 + }, + { + "epoch": 1.0494103216079118, + "grad_norm": 1.519921898841858, + "learning_rate": 4.615523466152275e-05, + "loss": 0.9794, + "step": 164260 + }, + { + "epoch": 1.0494742087576505, + "grad_norm": 1.4589173793792725, + "learning_rate": 4.6150231852383264e-05, + "loss": 0.8924, + "step": 164270 + }, + { + "epoch": 1.0495380959073892, + "grad_norm": 0.6913012266159058, + "learning_rate": 4.6145229082014276e-05, + "loss": 0.8225, + "step": 164280 + }, + { + "epoch": 1.049601983057128, + "grad_norm": 1.0679293870925903, + "learning_rate": 4.614022635046616e-05, + "loss": 0.6774, + "step": 164290 + }, + { + "epoch": 1.0496658702068666, + "grad_norm": 1.1619223356246948, + "learning_rate": 4.613522365778931e-05, + "loss": 0.822, + "step": 164300 + }, + { + "epoch": 1.0497297573566053, + "grad_norm": 1.5839335918426514, + "learning_rate": 4.6130221004034084e-05, + "loss": 0.8866, + "step": 164310 + }, + { + "epoch": 1.049793644506344, + "grad_norm": 0.945656418800354, + "learning_rate": 4.6125218389250894e-05, + "loss": 0.988, + "step": 164320 + }, + { + "epoch": 1.0498575316560828, + "grad_norm": 1.125430703163147, + "learning_rate": 4.612021581349011e-05, + "loss": 0.8968, + "step": 164330 + }, + { + "epoch": 1.0499214188058215, + "grad_norm": 0.88238126039505, + "learning_rate": 4.6115213276802104e-05, + "loss": 0.7057, + "step": 164340 + }, + { + "epoch": 1.0499853059555602, + "grad_norm": 0.8494474291801453, + 
"learning_rate": 4.611021077923727e-05, + "loss": 0.7707, + "step": 164350 + }, + { + "epoch": 1.0500491931052989, + "grad_norm": 1.4997683763504028, + "learning_rate": 4.6105208320845966e-05, + "loss": 1.0624, + "step": 164360 + }, + { + "epoch": 1.0501130802550376, + "grad_norm": 1.758766531944275, + "learning_rate": 4.61002059016786e-05, + "loss": 0.8057, + "step": 164370 + }, + { + "epoch": 1.0501769674047763, + "grad_norm": 1.3619985580444336, + "learning_rate": 4.6095203521785516e-05, + "loss": 0.7932, + "step": 164380 + }, + { + "epoch": 1.050240854554515, + "grad_norm": 0.7864593863487244, + "learning_rate": 4.609020118121712e-05, + "loss": 0.9398, + "step": 164390 + }, + { + "epoch": 1.0503047417042537, + "grad_norm": 0.868182897567749, + "learning_rate": 4.6085198880023774e-05, + "loss": 1.1271, + "step": 164400 + }, + { + "epoch": 1.0503686288539924, + "grad_norm": 1.1137135028839111, + "learning_rate": 4.608019661825587e-05, + "loss": 1.1727, + "step": 164410 + }, + { + "epoch": 1.050432516003731, + "grad_norm": 0.7334727048873901, + "learning_rate": 4.607519439596378e-05, + "loss": 1.04, + "step": 164420 + }, + { + "epoch": 1.0504964031534698, + "grad_norm": 0.8515543341636658, + "learning_rate": 4.607019221319787e-05, + "loss": 0.7822, + "step": 164430 + }, + { + "epoch": 1.0505602903032085, + "grad_norm": 0.8414304256439209, + "learning_rate": 4.606519007000853e-05, + "loss": 0.8864, + "step": 164440 + }, + { + "epoch": 1.050624177452947, + "grad_norm": 0.7646651268005371, + "learning_rate": 4.606018796644612e-05, + "loss": 1.0714, + "step": 164450 + }, + { + "epoch": 1.0506880646026857, + "grad_norm": 0.847108006477356, + "learning_rate": 4.605518590256104e-05, + "loss": 0.8618, + "step": 164460 + }, + { + "epoch": 1.0507519517524244, + "grad_norm": 1.3605737686157227, + "learning_rate": 4.605018387840364e-05, + "loss": 0.8288, + "step": 164470 + }, + { + "epoch": 1.0508158389021631, + "grad_norm": 1.32858407497406, + "learning_rate": 
4.604518189402431e-05, + "loss": 1.0004, + "step": 164480 + }, + { + "epoch": 1.0508797260519018, + "grad_norm": 0.8185241222381592, + "learning_rate": 4.604017994947342e-05, + "loss": 0.8581, + "step": 164490 + }, + { + "epoch": 1.0509436132016405, + "grad_norm": 0.9678569436073303, + "learning_rate": 4.6035178044801344e-05, + "loss": 0.8152, + "step": 164500 + }, + { + "epoch": 1.0510075003513792, + "grad_norm": 1.0296353101730347, + "learning_rate": 4.603017618005845e-05, + "loss": 0.895, + "step": 164510 + }, + { + "epoch": 1.051071387501118, + "grad_norm": 0.9702723622322083, + "learning_rate": 4.602517435529511e-05, + "loss": 0.8082, + "step": 164520 + }, + { + "epoch": 1.0511352746508567, + "grad_norm": 0.5624803304672241, + "learning_rate": 4.602017257056171e-05, + "loss": 0.915, + "step": 164530 + }, + { + "epoch": 1.0511991618005954, + "grad_norm": 1.2291347980499268, + "learning_rate": 4.6015170825908614e-05, + "loss": 0.7313, + "step": 164540 + }, + { + "epoch": 1.051263048950334, + "grad_norm": 0.8868616223335266, + "learning_rate": 4.60101691213862e-05, + "loss": 0.8421, + "step": 164550 + }, + { + "epoch": 1.0513269361000728, + "grad_norm": 0.8309329152107239, + "learning_rate": 4.600516745704484e-05, + "loss": 0.9126, + "step": 164560 + }, + { + "epoch": 1.0513908232498115, + "grad_norm": 0.7704881429672241, + "learning_rate": 4.600016583293489e-05, + "loss": 0.7842, + "step": 164570 + }, + { + "epoch": 1.0514547103995502, + "grad_norm": 0.8722350001335144, + "learning_rate": 4.599516424910673e-05, + "loss": 0.8703, + "step": 164580 + }, + { + "epoch": 1.051518597549289, + "grad_norm": 1.562312126159668, + "learning_rate": 4.599016270561074e-05, + "loss": 0.7106, + "step": 164590 + }, + { + "epoch": 1.0515824846990276, + "grad_norm": 1.1941059827804565, + "learning_rate": 4.5985161202497275e-05, + "loss": 0.8637, + "step": 164600 + }, + { + "epoch": 1.0516463718487663, + "grad_norm": 1.0434215068817139, + "learning_rate": 4.598015973981673e-05, + 
"loss": 0.8606, + "step": 164610 + }, + { + "epoch": 1.051710258998505, + "grad_norm": 0.5463379621505737, + "learning_rate": 4.597515831761943e-05, + "loss": 0.9113, + "step": 164620 + }, + { + "epoch": 1.0517741461482437, + "grad_norm": 0.5214067697525024, + "learning_rate": 4.597015693595577e-05, + "loss": 0.7315, + "step": 164630 + }, + { + "epoch": 1.0518380332979824, + "grad_norm": 0.9531989097595215, + "learning_rate": 4.596515559487611e-05, + "loss": 0.8608, + "step": 164640 + }, + { + "epoch": 1.0519019204477211, + "grad_norm": 0.7098289728164673, + "learning_rate": 4.5960154294430836e-05, + "loss": 0.898, + "step": 164650 + }, + { + "epoch": 1.0519658075974598, + "grad_norm": 0.8016313314437866, + "learning_rate": 4.595515303467029e-05, + "loss": 0.8499, + "step": 164660 + }, + { + "epoch": 1.0520296947471985, + "grad_norm": 1.1667627096176147, + "learning_rate": 4.5950151815644866e-05, + "loss": 0.8077, + "step": 164670 + }, + { + "epoch": 1.0520935818969372, + "grad_norm": 0.8660734295845032, + "learning_rate": 4.594515063740491e-05, + "loss": 1.0402, + "step": 164680 + }, + { + "epoch": 1.052157469046676, + "grad_norm": 0.9622820615768433, + "learning_rate": 4.59401495000008e-05, + "loss": 0.7553, + "step": 164690 + }, + { + "epoch": 1.0522213561964147, + "grad_norm": 1.0822231769561768, + "learning_rate": 4.593514840348289e-05, + "loss": 0.8874, + "step": 164700 + }, + { + "epoch": 1.0522852433461534, + "grad_norm": 0.6997173428535461, + "learning_rate": 4.5930147347901556e-05, + "loss": 0.9951, + "step": 164710 + }, + { + "epoch": 1.052349130495892, + "grad_norm": 0.49910104274749756, + "learning_rate": 4.5925146333307164e-05, + "loss": 0.8204, + "step": 164720 + }, + { + "epoch": 1.0524130176456308, + "grad_norm": 2.026914358139038, + "learning_rate": 4.592014535975007e-05, + "loss": 1.1756, + "step": 164730 + }, + { + "epoch": 1.0524769047953695, + "grad_norm": 0.9706935286521912, + "learning_rate": 4.591514442728064e-05, + "loss": 0.9323, + 
"step": 164740 + }, + { + "epoch": 1.0525407919451082, + "grad_norm": 1.4648338556289673, + "learning_rate": 4.591014353594923e-05, + "loss": 0.7388, + "step": 164750 + }, + { + "epoch": 1.052604679094847, + "grad_norm": 1.224882960319519, + "learning_rate": 4.5905142685806226e-05, + "loss": 0.7921, + "step": 164760 + }, + { + "epoch": 1.0526685662445856, + "grad_norm": 0.6611737012863159, + "learning_rate": 4.590014187690198e-05, + "loss": 0.7512, + "step": 164770 + }, + { + "epoch": 1.0527324533943243, + "grad_norm": 1.2943297624588013, + "learning_rate": 4.5895141109286846e-05, + "loss": 0.9351, + "step": 164780 + }, + { + "epoch": 1.052796340544063, + "grad_norm": 0.680542528629303, + "learning_rate": 4.5890140383011194e-05, + "loss": 1.1433, + "step": 164790 + }, + { + "epoch": 1.0528602276938017, + "grad_norm": 1.3205195665359497, + "learning_rate": 4.588513969812538e-05, + "loss": 0.7543, + "step": 164800 + }, + { + "epoch": 1.0529241148435404, + "grad_norm": 1.1977238655090332, + "learning_rate": 4.588013905467977e-05, + "loss": 0.934, + "step": 164810 + }, + { + "epoch": 1.0529880019932791, + "grad_norm": 0.6131165623664856, + "learning_rate": 4.587513845272473e-05, + "loss": 0.9284, + "step": 164820 + }, + { + "epoch": 1.0530518891430178, + "grad_norm": 0.8425104022026062, + "learning_rate": 4.5870137892310607e-05, + "loss": 1.1248, + "step": 164830 + }, + { + "epoch": 1.0531157762927565, + "grad_norm": 0.9462717771530151, + "learning_rate": 4.586513737348776e-05, + "loss": 0.8297, + "step": 164840 + }, + { + "epoch": 1.0531796634424953, + "grad_norm": 1.0752277374267578, + "learning_rate": 4.586013689630657e-05, + "loss": 1.0739, + "step": 164850 + }, + { + "epoch": 1.053243550592234, + "grad_norm": 0.9188938140869141, + "learning_rate": 4.5855136460817385e-05, + "loss": 0.9023, + "step": 164860 + }, + { + "epoch": 1.0533074377419727, + "grad_norm": 0.6802998781204224, + "learning_rate": 4.585013606707055e-05, + "loss": 0.6506, + "step": 164870 + }, + { 
+ "epoch": 1.0533713248917114, + "grad_norm": 0.7449166774749756, + "learning_rate": 4.5845135715116444e-05, + "loss": 1.0255, + "step": 164880 + }, + { + "epoch": 1.05343521204145, + "grad_norm": 1.3200147151947021, + "learning_rate": 4.584013540500542e-05, + "loss": 0.8287, + "step": 164890 + }, + { + "epoch": 1.0534990991911888, + "grad_norm": 0.9720762372016907, + "learning_rate": 4.583513513678782e-05, + "loss": 0.956, + "step": 164900 + }, + { + "epoch": 1.0535629863409275, + "grad_norm": 0.6958931684494019, + "learning_rate": 4.583013491051402e-05, + "loss": 0.8158, + "step": 164910 + }, + { + "epoch": 1.0536268734906662, + "grad_norm": 1.0599778890609741, + "learning_rate": 4.582513472623436e-05, + "loss": 0.8731, + "step": 164920 + }, + { + "epoch": 1.053690760640405, + "grad_norm": 0.5339720249176025, + "learning_rate": 4.582013458399922e-05, + "loss": 0.9776, + "step": 164930 + }, + { + "epoch": 1.0537546477901434, + "grad_norm": 0.8194118738174438, + "learning_rate": 4.581513448385893e-05, + "loss": 0.7849, + "step": 164940 + }, + { + "epoch": 1.053818534939882, + "grad_norm": 0.7215436697006226, + "learning_rate": 4.581013442586386e-05, + "loss": 0.7049, + "step": 164950 + }, + { + "epoch": 1.0538824220896208, + "grad_norm": 0.7318156361579895, + "learning_rate": 4.580513441006436e-05, + "loss": 0.8978, + "step": 164960 + }, + { + "epoch": 1.0539463092393595, + "grad_norm": 1.3139148950576782, + "learning_rate": 4.580013443651079e-05, + "loss": 0.8481, + "step": 164970 + }, + { + "epoch": 1.0540101963890982, + "grad_norm": 1.0408555269241333, + "learning_rate": 4.579513450525349e-05, + "loss": 0.655, + "step": 164980 + }, + { + "epoch": 1.054074083538837, + "grad_norm": 0.7458349466323853, + "learning_rate": 4.579013461634283e-05, + "loss": 0.7734, + "step": 164990 + }, + { + "epoch": 1.0541379706885756, + "grad_norm": 0.6548849940299988, + "learning_rate": 4.5785134769829156e-05, + "loss": 1.0121, + "step": 165000 + }, + { + "epoch": 
1.0542018578383143, + "grad_norm": 1.47128164768219, + "learning_rate": 4.578013496576282e-05, + "loss": 0.9101, + "step": 165010 + }, + { + "epoch": 1.054265744988053, + "grad_norm": 2.107875347137451, + "learning_rate": 4.5775135204194176e-05, + "loss": 0.9387, + "step": 165020 + }, + { + "epoch": 1.0543296321377917, + "grad_norm": 1.0853489637374878, + "learning_rate": 4.5770135485173574e-05, + "loss": 0.8148, + "step": 165030 + }, + { + "epoch": 1.0543935192875304, + "grad_norm": 0.6279266476631165, + "learning_rate": 4.5765135808751357e-05, + "loss": 0.9949, + "step": 165040 + }, + { + "epoch": 1.0544574064372692, + "grad_norm": 0.9035767912864685, + "learning_rate": 4.57601361749779e-05, + "loss": 0.8647, + "step": 165050 + }, + { + "epoch": 1.0545212935870079, + "grad_norm": 0.9950432181358337, + "learning_rate": 4.5755136583903535e-05, + "loss": 1.0575, + "step": 165060 + }, + { + "epoch": 1.0545851807367466, + "grad_norm": 0.6531002521514893, + "learning_rate": 4.5750137035578625e-05, + "loss": 0.9391, + "step": 165070 + }, + { + "epoch": 1.0546490678864853, + "grad_norm": 0.6531468629837036, + "learning_rate": 4.57451375300535e-05, + "loss": 0.9551, + "step": 165080 + }, + { + "epoch": 1.054712955036224, + "grad_norm": 1.396674394607544, + "learning_rate": 4.574013806737853e-05, + "loss": 0.9346, + "step": 165090 + }, + { + "epoch": 1.0547768421859627, + "grad_norm": 1.2501516342163086, + "learning_rate": 4.573513864760407e-05, + "loss": 0.7332, + "step": 165100 + }, + { + "epoch": 1.0548407293357014, + "grad_norm": 1.078444004058838, + "learning_rate": 4.573013927078044e-05, + "loss": 1.02, + "step": 165110 + }, + { + "epoch": 1.05490461648544, + "grad_norm": 0.7868189811706543, + "learning_rate": 4.5725139936958e-05, + "loss": 0.7525, + "step": 165120 + }, + { + "epoch": 1.0549685036351788, + "grad_norm": 0.5394753813743591, + "learning_rate": 4.5720140646187096e-05, + "loss": 0.8605, + "step": 165130 + }, + { + "epoch": 1.0550323907849175, + 
"grad_norm": 1.1440558433532715, + "learning_rate": 4.5715141398518076e-05, + "loss": 0.9491, + "step": 165140 + }, + { + "epoch": 1.0550962779346562, + "grad_norm": 0.6817034482955933, + "learning_rate": 4.5710142194001285e-05, + "loss": 0.8546, + "step": 165150 + }, + { + "epoch": 1.055160165084395, + "grad_norm": 0.8437715172767639, + "learning_rate": 4.570514303268707e-05, + "loss": 0.7823, + "step": 165160 + }, + { + "epoch": 1.0552240522341336, + "grad_norm": 0.793656587600708, + "learning_rate": 4.5700143914625794e-05, + "loss": 0.9057, + "step": 165170 + }, + { + "epoch": 1.0552879393838723, + "grad_norm": 0.8038541078567505, + "learning_rate": 4.569514483986778e-05, + "loss": 0.9739, + "step": 165180 + }, + { + "epoch": 1.055351826533611, + "grad_norm": 1.1511846780776978, + "learning_rate": 4.569014580846339e-05, + "loss": 1.0002, + "step": 165190 + }, + { + "epoch": 1.0554157136833497, + "grad_norm": 0.7070904970169067, + "learning_rate": 4.568514682046295e-05, + "loss": 0.9256, + "step": 165200 + }, + { + "epoch": 1.0554796008330884, + "grad_norm": 1.058930516242981, + "learning_rate": 4.568014787591683e-05, + "loss": 0.9628, + "step": 165210 + }, + { + "epoch": 1.0555434879828272, + "grad_norm": 1.03263521194458, + "learning_rate": 4.567514897487535e-05, + "loss": 0.9915, + "step": 165220 + }, + { + "epoch": 1.0556073751325659, + "grad_norm": 1.2190163135528564, + "learning_rate": 4.567015011738885e-05, + "loss": 0.965, + "step": 165230 + }, + { + "epoch": 1.0556712622823046, + "grad_norm": 0.6620869040489197, + "learning_rate": 4.5665151303507704e-05, + "loss": 0.696, + "step": 165240 + }, + { + "epoch": 1.0557351494320433, + "grad_norm": 0.9180126786231995, + "learning_rate": 4.566015253328222e-05, + "loss": 0.8117, + "step": 165250 + }, + { + "epoch": 1.055799036581782, + "grad_norm": 1.2093178033828735, + "learning_rate": 4.5655153806762766e-05, + "loss": 1.1724, + "step": 165260 + }, + { + "epoch": 1.0558629237315207, + "grad_norm": 
0.9234322309494019, + "learning_rate": 4.565015512399966e-05, + "loss": 0.8175, + "step": 165270 + }, + { + "epoch": 1.0559268108812594, + "grad_norm": 1.0635215044021606, + "learning_rate": 4.564515648504326e-05, + "loss": 0.9139, + "step": 165280 + }, + { + "epoch": 1.055990698030998, + "grad_norm": 0.6104520559310913, + "learning_rate": 4.56401578899439e-05, + "loss": 0.6578, + "step": 165290 + }, + { + "epoch": 1.0560545851807368, + "grad_norm": 1.0595885515213013, + "learning_rate": 4.563515933875193e-05, + "loss": 0.9892, + "step": 165300 + }, + { + "epoch": 1.0561184723304755, + "grad_norm": 0.8852406144142151, + "learning_rate": 4.5630160831517675e-05, + "loss": 1.0354, + "step": 165310 + }, + { + "epoch": 1.0561823594802142, + "grad_norm": 2.066617965698242, + "learning_rate": 4.562516236829148e-05, + "loss": 1.1012, + "step": 165320 + }, + { + "epoch": 1.056246246629953, + "grad_norm": 0.6148334741592407, + "learning_rate": 4.5620163949123687e-05, + "loss": 0.9188, + "step": 165330 + }, + { + "epoch": 1.0563101337796916, + "grad_norm": 0.8803834915161133, + "learning_rate": 4.5615165574064634e-05, + "loss": 0.9608, + "step": 165340 + }, + { + "epoch": 1.0563740209294303, + "grad_norm": 0.8797494769096375, + "learning_rate": 4.5610167243164655e-05, + "loss": 0.9332, + "step": 165350 + }, + { + "epoch": 1.056437908079169, + "grad_norm": 0.9440881013870239, + "learning_rate": 4.560516895647408e-05, + "loss": 0.7448, + "step": 165360 + }, + { + "epoch": 1.0565017952289077, + "grad_norm": 0.8351962566375732, + "learning_rate": 4.560017071404326e-05, + "loss": 0.8806, + "step": 165370 + }, + { + "epoch": 1.0565656823786465, + "grad_norm": 1.0572452545166016, + "learning_rate": 4.559517251592253e-05, + "loss": 0.8308, + "step": 165380 + }, + { + "epoch": 1.0566295695283852, + "grad_norm": 1.751478672027588, + "learning_rate": 4.559017436216223e-05, + "loss": 0.8669, + "step": 165390 + }, + { + "epoch": 1.0566934566781239, + "grad_norm": 0.6728960871696472, + 
"learning_rate": 4.558517625281268e-05, + "loss": 0.7418, + "step": 165400 + }, + { + "epoch": 1.0567573438278623, + "grad_norm": 0.8979251384735107, + "learning_rate": 4.5580178187924235e-05, + "loss": 1.1276, + "step": 165410 + }, + { + "epoch": 1.0568212309776013, + "grad_norm": 0.9827498197555542, + "learning_rate": 4.557518016754721e-05, + "loss": 0.8415, + "step": 165420 + }, + { + "epoch": 1.0568851181273398, + "grad_norm": 1.329075574874878, + "learning_rate": 4.5570182191731956e-05, + "loss": 0.9694, + "step": 165430 + }, + { + "epoch": 1.0569490052770785, + "grad_norm": 0.7447249293327332, + "learning_rate": 4.556518426052879e-05, + "loss": 0.777, + "step": 165440 + }, + { + "epoch": 1.0570128924268172, + "grad_norm": 0.7475079894065857, + "learning_rate": 4.5560186373988065e-05, + "loss": 0.7823, + "step": 165450 + }, + { + "epoch": 1.0570767795765559, + "grad_norm": 1.0540409088134766, + "learning_rate": 4.55551885321601e-05, + "loss": 0.7704, + "step": 165460 + }, + { + "epoch": 1.0571406667262946, + "grad_norm": 0.9531934857368469, + "learning_rate": 4.5550190735095235e-05, + "loss": 1.0657, + "step": 165470 + }, + { + "epoch": 1.0572045538760333, + "grad_norm": 1.0872973203659058, + "learning_rate": 4.5545192982843795e-05, + "loss": 1.0177, + "step": 165480 + }, + { + "epoch": 1.057268441025772, + "grad_norm": 0.9160931706428528, + "learning_rate": 4.554019527545612e-05, + "loss": 0.877, + "step": 165490 + }, + { + "epoch": 1.0573323281755107, + "grad_norm": 0.6422150135040283, + "learning_rate": 4.553519761298253e-05, + "loss": 1.1888, + "step": 165500 + }, + { + "epoch": 1.0573962153252494, + "grad_norm": 1.0594791173934937, + "learning_rate": 4.553019999547337e-05, + "loss": 0.8751, + "step": 165510 + }, + { + "epoch": 1.0574601024749881, + "grad_norm": 0.8945194482803345, + "learning_rate": 4.5525202422978955e-05, + "loss": 1.0125, + "step": 165520 + }, + { + "epoch": 1.0575239896247268, + "grad_norm": 0.8445724248886108, + "learning_rate": 
4.552020489554963e-05, + "loss": 1.0213, + "step": 165530 + }, + { + "epoch": 1.0575878767744655, + "grad_norm": 0.7421507239341736, + "learning_rate": 4.551520741323571e-05, + "loss": 0.7153, + "step": 165540 + }, + { + "epoch": 1.0576517639242042, + "grad_norm": 1.3028641939163208, + "learning_rate": 4.551020997608754e-05, + "loss": 0.7857, + "step": 165550 + }, + { + "epoch": 1.057715651073943, + "grad_norm": 0.816175639629364, + "learning_rate": 4.550521258415543e-05, + "loss": 0.8056, + "step": 165560 + }, + { + "epoch": 1.0577795382236816, + "grad_norm": 0.7016112804412842, + "learning_rate": 4.550021523748971e-05, + "loss": 0.7017, + "step": 165570 + }, + { + "epoch": 1.0578434253734204, + "grad_norm": 0.6792407631874084, + "learning_rate": 4.549521793614076e-05, + "loss": 0.8311, + "step": 165580 + }, + { + "epoch": 1.057907312523159, + "grad_norm": 1.004659652709961, + "learning_rate": 4.5490220680158825e-05, + "loss": 0.8236, + "step": 165590 + }, + { + "epoch": 1.0579711996728978, + "grad_norm": 1.2447409629821777, + "learning_rate": 4.548522346959427e-05, + "loss": 0.727, + "step": 165600 + }, + { + "epoch": 1.0580350868226365, + "grad_norm": 0.8811039328575134, + "learning_rate": 4.548022630449743e-05, + "loss": 0.7418, + "step": 165610 + }, + { + "epoch": 1.0580989739723752, + "grad_norm": 0.8841410875320435, + "learning_rate": 4.547522918491862e-05, + "loss": 0.7467, + "step": 165620 + }, + { + "epoch": 1.0581628611221139, + "grad_norm": 0.8277806639671326, + "learning_rate": 4.547023211090816e-05, + "loss": 0.8483, + "step": 165630 + }, + { + "epoch": 1.0582267482718526, + "grad_norm": 0.8341606855392456, + "learning_rate": 4.5465235082516387e-05, + "loss": 1.0341, + "step": 165640 + }, + { + "epoch": 1.0582906354215913, + "grad_norm": 0.7356009483337402, + "learning_rate": 4.546023809979362e-05, + "loss": 0.8042, + "step": 165650 + }, + { + "epoch": 1.05835452257133, + "grad_norm": 1.4884812831878662, + "learning_rate": 4.545524116279018e-05, + 
"loss": 0.8323, + "step": 165660 + }, + { + "epoch": 1.0584184097210687, + "grad_norm": 0.9706101417541504, + "learning_rate": 4.54502442715564e-05, + "loss": 1.1248, + "step": 165670 + }, + { + "epoch": 1.0584822968708074, + "grad_norm": 0.9353876709938049, + "learning_rate": 4.5445247426142586e-05, + "loss": 0.7526, + "step": 165680 + }, + { + "epoch": 1.0585461840205461, + "grad_norm": 1.7931292057037354, + "learning_rate": 4.544025062659906e-05, + "loss": 0.8956, + "step": 165690 + }, + { + "epoch": 1.0586100711702848, + "grad_norm": 1.6735409498214722, + "learning_rate": 4.543525387297618e-05, + "loss": 0.9402, + "step": 165700 + }, + { + "epoch": 1.0586739583200235, + "grad_norm": 0.7010751962661743, + "learning_rate": 4.543025716532422e-05, + "loss": 0.8364, + "step": 165710 + }, + { + "epoch": 1.0587378454697622, + "grad_norm": 0.9951981902122498, + "learning_rate": 4.542526050369355e-05, + "loss": 1.0676, + "step": 165720 + }, + { + "epoch": 1.058801732619501, + "grad_norm": 2.725141763687134, + "learning_rate": 4.542026388813444e-05, + "loss": 1.0406, + "step": 165730 + }, + { + "epoch": 1.0588656197692397, + "grad_norm": 0.6263905763626099, + "learning_rate": 4.541526731869725e-05, + "loss": 1.0343, + "step": 165740 + }, + { + "epoch": 1.0589295069189784, + "grad_norm": 1.4677350521087646, + "learning_rate": 4.541027079543228e-05, + "loss": 0.9855, + "step": 165750 + }, + { + "epoch": 1.058993394068717, + "grad_norm": 0.8590619564056396, + "learning_rate": 4.540527431838986e-05, + "loss": 0.8498, + "step": 165760 + }, + { + "epoch": 1.0590572812184558, + "grad_norm": 2.01846981048584, + "learning_rate": 4.540027788762029e-05, + "loss": 0.9391, + "step": 165770 + }, + { + "epoch": 1.0591211683681945, + "grad_norm": 0.6292256116867065, + "learning_rate": 4.539528150317391e-05, + "loss": 0.8408, + "step": 165780 + }, + { + "epoch": 1.0591850555179332, + "grad_norm": 0.7966293096542358, + "learning_rate": 4.539028516510102e-05, + "loss": 0.8568, + "step": 
165790 + }, + { + "epoch": 1.0592489426676719, + "grad_norm": 0.9135885834693909, + "learning_rate": 4.538528887345196e-05, + "loss": 0.8816, + "step": 165800 + }, + { + "epoch": 1.0593128298174106, + "grad_norm": 0.8115611672401428, + "learning_rate": 4.538029262827702e-05, + "loss": 0.8993, + "step": 165810 + }, + { + "epoch": 1.0593767169671493, + "grad_norm": 0.9862786531448364, + "learning_rate": 4.537529642962654e-05, + "loss": 0.8257, + "step": 165820 + }, + { + "epoch": 1.059440604116888, + "grad_norm": 0.9464985728263855, + "learning_rate": 4.537030027755082e-05, + "loss": 0.9438, + "step": 165830 + }, + { + "epoch": 1.0595044912666267, + "grad_norm": 0.6717821359634399, + "learning_rate": 4.536530417210019e-05, + "loss": 0.7328, + "step": 165840 + }, + { + "epoch": 1.0595683784163654, + "grad_norm": 1.0255380868911743, + "learning_rate": 4.5360308113324947e-05, + "loss": 0.8817, + "step": 165850 + }, + { + "epoch": 1.0596322655661041, + "grad_norm": 0.5127224922180176, + "learning_rate": 4.5355811700376274e-05, + "loss": 1.0763, + "step": 165860 + }, + { + "epoch": 1.0596961527158428, + "grad_norm": 0.7932383418083191, + "learning_rate": 4.5350815730422905e-05, + "loss": 0.7545, + "step": 165870 + }, + { + "epoch": 1.0597600398655815, + "grad_norm": 1.7010321617126465, + "learning_rate": 4.5345819807290847e-05, + "loss": 0.8557, + "step": 165880 + }, + { + "epoch": 1.0598239270153202, + "grad_norm": 0.7503814697265625, + "learning_rate": 4.53408239310304e-05, + "loss": 0.9328, + "step": 165890 + }, + { + "epoch": 1.0598878141650587, + "grad_norm": 1.0629501342773438, + "learning_rate": 4.53358281016919e-05, + "loss": 0.8286, + "step": 165900 + }, + { + "epoch": 1.0599517013147974, + "grad_norm": 0.9964657425880432, + "learning_rate": 4.533083231932563e-05, + "loss": 0.8564, + "step": 165910 + }, + { + "epoch": 1.0600155884645361, + "grad_norm": 0.8007980585098267, + "learning_rate": 4.532583658398193e-05, + "loss": 0.8858, + "step": 165920 + }, + { + 
"epoch": 1.0600794756142748, + "grad_norm": 0.8962225914001465, + "learning_rate": 4.5320840895711095e-05, + "loss": 0.8047, + "step": 165930 + }, + { + "epoch": 1.0601433627640136, + "grad_norm": 1.0580693483352661, + "learning_rate": 4.531584525456344e-05, + "loss": 0.7851, + "step": 165940 + }, + { + "epoch": 1.0602072499137523, + "grad_norm": 0.8029645681381226, + "learning_rate": 4.531084966058928e-05, + "loss": 0.8152, + "step": 165950 + }, + { + "epoch": 1.060271137063491, + "grad_norm": 0.8548164367675781, + "learning_rate": 4.5305854113838914e-05, + "loss": 1.0714, + "step": 165960 + }, + { + "epoch": 1.0603350242132297, + "grad_norm": 0.8717803359031677, + "learning_rate": 4.530085861436266e-05, + "loss": 0.8307, + "step": 165970 + }, + { + "epoch": 1.0603989113629684, + "grad_norm": 0.7216166853904724, + "learning_rate": 4.529586316221083e-05, + "loss": 0.8735, + "step": 165980 + }, + { + "epoch": 1.060462798512707, + "grad_norm": 1.3361812829971313, + "learning_rate": 4.529086775743372e-05, + "loss": 0.8581, + "step": 165990 + }, + { + "epoch": 1.0605266856624458, + "grad_norm": 1.0749822854995728, + "learning_rate": 4.528587240008165e-05, + "loss": 1.1413, + "step": 166000 + }, + { + "epoch": 1.0605905728121845, + "grad_norm": 1.623487114906311, + "learning_rate": 4.5280877090204915e-05, + "loss": 1.1168, + "step": 166010 + }, + { + "epoch": 1.0606544599619232, + "grad_norm": 0.959581196308136, + "learning_rate": 4.527588182785384e-05, + "loss": 0.8113, + "step": 166020 + }, + { + "epoch": 1.060718347111662, + "grad_norm": 1.2680141925811768, + "learning_rate": 4.5270886613078716e-05, + "loss": 0.9222, + "step": 166030 + }, + { + "epoch": 1.0607822342614006, + "grad_norm": 1.1884212493896484, + "learning_rate": 4.526589144592986e-05, + "loss": 0.7005, + "step": 166040 + }, + { + "epoch": 1.0608461214111393, + "grad_norm": 1.0362354516983032, + "learning_rate": 4.526089632645757e-05, + "loss": 0.765, + "step": 166050 + }, + { + "epoch": 
1.060910008560878, + "grad_norm": 1.0656408071517944, + "learning_rate": 4.5255901254712156e-05, + "loss": 1.0426, + "step": 166060 + }, + { + "epoch": 1.0609738957106167, + "grad_norm": 0.8089607357978821, + "learning_rate": 4.5250906230743925e-05, + "loss": 1.4514, + "step": 166070 + }, + { + "epoch": 1.0610377828603554, + "grad_norm": 0.8284891247749329, + "learning_rate": 4.5245911254603166e-05, + "loss": 0.7829, + "step": 166080 + }, + { + "epoch": 1.0611016700100941, + "grad_norm": 0.869490385055542, + "learning_rate": 4.5240916326340205e-05, + "loss": 0.8162, + "step": 166090 + }, + { + "epoch": 1.0611655571598329, + "grad_norm": 0.9870272874832153, + "learning_rate": 4.523592144600532e-05, + "loss": 0.8931, + "step": 166100 + }, + { + "epoch": 1.0612294443095716, + "grad_norm": 1.2657639980316162, + "learning_rate": 4.523092661364885e-05, + "loss": 0.8136, + "step": 166110 + }, + { + "epoch": 1.0612933314593103, + "grad_norm": 0.9630696773529053, + "learning_rate": 4.5225931829321056e-05, + "loss": 0.9948, + "step": 166120 + }, + { + "epoch": 1.061357218609049, + "grad_norm": 3.2127113342285156, + "learning_rate": 4.5220937093072265e-05, + "loss": 0.7691, + "step": 166130 + }, + { + "epoch": 1.0614211057587877, + "grad_norm": 0.7906152606010437, + "learning_rate": 4.521594240495277e-05, + "loss": 0.9219, + "step": 166140 + }, + { + "epoch": 1.0614849929085264, + "grad_norm": 0.8807733058929443, + "learning_rate": 4.5210947765012876e-05, + "loss": 1.0482, + "step": 166150 + }, + { + "epoch": 1.061548880058265, + "grad_norm": 0.849648654460907, + "learning_rate": 4.520595317330287e-05, + "loss": 0.7637, + "step": 166160 + }, + { + "epoch": 1.0616127672080038, + "grad_norm": 0.848292350769043, + "learning_rate": 4.5200958629873074e-05, + "loss": 0.8503, + "step": 166170 + }, + { + "epoch": 1.0616766543577425, + "grad_norm": 0.877740740776062, + "learning_rate": 4.519596413477378e-05, + "loss": 0.8768, + "step": 166180 + }, + { + "epoch": 1.0617405415074812, + 
"grad_norm": 0.7507287859916687, + "learning_rate": 4.5190969688055275e-05, + "loss": 0.8739, + "step": 166190 + }, + { + "epoch": 1.06180442865722, + "grad_norm": 1.0137852430343628, + "learning_rate": 4.5185975289767866e-05, + "loss": 0.9776, + "step": 166200 + }, + { + "epoch": 1.0618683158069586, + "grad_norm": 0.5660897493362427, + "learning_rate": 4.518098093996187e-05, + "loss": 0.8763, + "step": 166210 + }, + { + "epoch": 1.0619322029566973, + "grad_norm": 1.1215473413467407, + "learning_rate": 4.5175986638687546e-05, + "loss": 0.8374, + "step": 166220 + }, + { + "epoch": 1.061996090106436, + "grad_norm": 0.8180109262466431, + "learning_rate": 4.5170992385995214e-05, + "loss": 0.8279, + "step": 166230 + }, + { + "epoch": 1.0620599772561747, + "grad_norm": 1.204626202583313, + "learning_rate": 4.5165998181935164e-05, + "loss": 1.0606, + "step": 166240 + }, + { + "epoch": 1.0621238644059134, + "grad_norm": 0.8536267280578613, + "learning_rate": 4.5161004026557696e-05, + "loss": 0.8713, + "step": 166250 + }, + { + "epoch": 1.0621877515556521, + "grad_norm": 0.7017921209335327, + "learning_rate": 4.51560099199131e-05, + "loss": 0.7666, + "step": 166260 + }, + { + "epoch": 1.0622516387053909, + "grad_norm": 0.7818034887313843, + "learning_rate": 4.515101586205168e-05, + "loss": 1.0096, + "step": 166270 + }, + { + "epoch": 1.0623155258551296, + "grad_norm": 1.2084392309188843, + "learning_rate": 4.5146021853023715e-05, + "loss": 1.1418, + "step": 166280 + }, + { + "epoch": 1.0623794130048683, + "grad_norm": 1.1068148612976074, + "learning_rate": 4.514102789287952e-05, + "loss": 0.9818, + "step": 166290 + }, + { + "epoch": 1.062443300154607, + "grad_norm": 0.9860485196113586, + "learning_rate": 4.5136033981669376e-05, + "loss": 0.9613, + "step": 166300 + }, + { + "epoch": 1.0625071873043457, + "grad_norm": 0.842819333076477, + "learning_rate": 4.513104011944357e-05, + "loss": 0.9115, + "step": 166310 + }, + { + "epoch": 1.0625710744540844, + "grad_norm": 
0.5500154495239258, + "learning_rate": 4.512604630625241e-05, + "loss": 0.9704, + "step": 166320 + }, + { + "epoch": 1.062634961603823, + "grad_norm": 0.7272401452064514, + "learning_rate": 4.512105254214617e-05, + "loss": 1.0708, + "step": 166330 + }, + { + "epoch": 1.0626988487535618, + "grad_norm": 0.5617895126342773, + "learning_rate": 4.511605882717516e-05, + "loss": 0.7842, + "step": 166340 + }, + { + "epoch": 1.0627627359033005, + "grad_norm": 0.9489856958389282, + "learning_rate": 4.5111065161389667e-05, + "loss": 0.8623, + "step": 166350 + }, + { + "epoch": 1.0628266230530392, + "grad_norm": 0.7376025915145874, + "learning_rate": 4.510607154483997e-05, + "loss": 0.8115, + "step": 166360 + }, + { + "epoch": 1.062890510202778, + "grad_norm": 2.6469199657440186, + "learning_rate": 4.5101077977576376e-05, + "loss": 0.8634, + "step": 166370 + }, + { + "epoch": 1.0629543973525166, + "grad_norm": 0.6932651996612549, + "learning_rate": 4.509608445964916e-05, + "loss": 0.6602, + "step": 166380 + }, + { + "epoch": 1.063018284502255, + "grad_norm": 1.501181960105896, + "learning_rate": 4.509109099110861e-05, + "loss": 1.0629, + "step": 166390 + }, + { + "epoch": 1.063082171651994, + "grad_norm": 1.0305095911026, + "learning_rate": 4.508609757200503e-05, + "loss": 0.9877, + "step": 166400 + }, + { + "epoch": 1.0631460588017325, + "grad_norm": 0.8262702226638794, + "learning_rate": 4.508110420238869e-05, + "loss": 0.8587, + "step": 166410 + }, + { + "epoch": 1.0632099459514712, + "grad_norm": 0.8938581347465515, + "learning_rate": 4.507611088230989e-05, + "loss": 0.8692, + "step": 166420 + }, + { + "epoch": 1.06327383310121, + "grad_norm": 1.1800354719161987, + "learning_rate": 4.5071117611818914e-05, + "loss": 0.9498, + "step": 166430 + }, + { + "epoch": 1.0633377202509486, + "grad_norm": 0.742709755897522, + "learning_rate": 4.5066124390966045e-05, + "loss": 0.7247, + "step": 166440 + }, + { + "epoch": 1.0634016074006873, + "grad_norm": 0.7936336398124695, + 
"learning_rate": 4.506113121980158e-05, + "loss": 0.8067, + "step": 166450 + }, + { + "epoch": 1.063465494550426, + "grad_norm": 0.7889736890792847, + "learning_rate": 4.505613809837579e-05, + "loss": 1.0327, + "step": 166460 + }, + { + "epoch": 1.0635293817001648, + "grad_norm": 1.5141351222991943, + "learning_rate": 4.505114502673896e-05, + "loss": 1.1342, + "step": 166470 + }, + { + "epoch": 1.0635932688499035, + "grad_norm": 0.8879356980323792, + "learning_rate": 4.504615200494139e-05, + "loss": 1.1157, + "step": 166480 + }, + { + "epoch": 1.0636571559996422, + "grad_norm": 0.4957992732524872, + "learning_rate": 4.5041159033033356e-05, + "loss": 0.8696, + "step": 166490 + }, + { + "epoch": 1.0637210431493809, + "grad_norm": 1.099480390548706, + "learning_rate": 4.5036166111065136e-05, + "loss": 0.88, + "step": 166500 + }, + { + "epoch": 1.0637849302991196, + "grad_norm": 1.1279675960540771, + "learning_rate": 4.503117323908702e-05, + "loss": 0.839, + "step": 166510 + }, + { + "epoch": 1.0638488174488583, + "grad_norm": 1.0145577192306519, + "learning_rate": 4.5026180417149284e-05, + "loss": 1.0136, + "step": 166520 + }, + { + "epoch": 1.063912704598597, + "grad_norm": 0.5523449778556824, + "learning_rate": 4.502118764530222e-05, + "loss": 0.748, + "step": 166530 + }, + { + "epoch": 1.0639765917483357, + "grad_norm": 1.2338347434997559, + "learning_rate": 4.501619492359609e-05, + "loss": 0.7316, + "step": 166540 + }, + { + "epoch": 1.0640404788980744, + "grad_norm": 0.9280697703361511, + "learning_rate": 4.50112022520812e-05, + "loss": 0.849, + "step": 166550 + }, + { + "epoch": 1.064104366047813, + "grad_norm": 1.0032998323440552, + "learning_rate": 4.500620963080782e-05, + "loss": 0.9341, + "step": 166560 + }, + { + "epoch": 1.0641682531975518, + "grad_norm": 1.1202173233032227, + "learning_rate": 4.500121705982622e-05, + "loss": 0.9334, + "step": 166570 + }, + { + "epoch": 1.0642321403472905, + "grad_norm": 1.0374928712844849, + "learning_rate": 
4.49962245391867e-05, + "loss": 0.7728, + "step": 166580 + }, + { + "epoch": 1.0642960274970292, + "grad_norm": 0.8324866890907288, + "learning_rate": 4.499123206893953e-05, + "loss": 0.8554, + "step": 166590 + }, + { + "epoch": 1.064359914646768, + "grad_norm": 0.8850876688957214, + "learning_rate": 4.4986239649134975e-05, + "loss": 0.7597, + "step": 166600 + }, + { + "epoch": 1.0644238017965066, + "grad_norm": 0.7691602110862732, + "learning_rate": 4.498124727982333e-05, + "loss": 0.7727, + "step": 166610 + }, + { + "epoch": 1.0644876889462453, + "grad_norm": 0.7466904520988464, + "learning_rate": 4.497625496105487e-05, + "loss": 0.8507, + "step": 166620 + }, + { + "epoch": 1.064551576095984, + "grad_norm": 1.0000793933868408, + "learning_rate": 4.497126269287986e-05, + "loss": 0.7191, + "step": 166630 + }, + { + "epoch": 1.0646154632457228, + "grad_norm": 0.7658026218414307, + "learning_rate": 4.4966270475348596e-05, + "loss": 0.6359, + "step": 166640 + }, + { + "epoch": 1.0646793503954615, + "grad_norm": 0.7452802658081055, + "learning_rate": 4.496127830851133e-05, + "loss": 0.8817, + "step": 166650 + }, + { + "epoch": 1.0647432375452002, + "grad_norm": 0.8648143410682678, + "learning_rate": 4.4956286192418364e-05, + "loss": 0.7892, + "step": 166660 + }, + { + "epoch": 1.0648071246949389, + "grad_norm": 0.7528172731399536, + "learning_rate": 4.4951294127119955e-05, + "loss": 0.7276, + "step": 166670 + }, + { + "epoch": 1.0648710118446776, + "grad_norm": 0.9941525459289551, + "learning_rate": 4.4946302112666386e-05, + "loss": 0.6766, + "step": 166680 + }, + { + "epoch": 1.0649348989944163, + "grad_norm": 0.9956261515617371, + "learning_rate": 4.4941310149107916e-05, + "loss": 0.7099, + "step": 166690 + }, + { + "epoch": 1.064998786144155, + "grad_norm": 0.6918345093727112, + "learning_rate": 4.4936318236494846e-05, + "loss": 0.6829, + "step": 166700 + }, + { + "epoch": 1.0650626732938937, + "grad_norm": 1.1491965055465698, + "learning_rate": 
4.493132637487742e-05, + "loss": 0.7505, + "step": 166710 + }, + { + "epoch": 1.0651265604436324, + "grad_norm": 0.7228277921676636, + "learning_rate": 4.492633456430592e-05, + "loss": 0.6065, + "step": 166720 + }, + { + "epoch": 1.0651904475933711, + "grad_norm": 1.5048450231552124, + "learning_rate": 4.492134280483063e-05, + "loss": 0.8662, + "step": 166730 + }, + { + "epoch": 1.0652543347431098, + "grad_norm": 0.8829289078712463, + "learning_rate": 4.49163510965018e-05, + "loss": 1.1516, + "step": 166740 + }, + { + "epoch": 1.0653182218928485, + "grad_norm": 3.7568225860595703, + "learning_rate": 4.491135943936972e-05, + "loss": 0.9261, + "step": 166750 + }, + { + "epoch": 1.0653821090425872, + "grad_norm": 0.618725061416626, + "learning_rate": 4.490636783348465e-05, + "loss": 0.7957, + "step": 166760 + }, + { + "epoch": 1.065445996192326, + "grad_norm": 0.86220383644104, + "learning_rate": 4.4901376278896865e-05, + "loss": 1.0595, + "step": 166770 + }, + { + "epoch": 1.0655098833420646, + "grad_norm": 1.004080891609192, + "learning_rate": 4.489638477565663e-05, + "loss": 0.9258, + "step": 166780 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.8573571443557739, + "learning_rate": 4.4891393323814214e-05, + "loss": 0.7236, + "step": 166790 + }, + { + "epoch": 1.065637657641542, + "grad_norm": 0.7342075109481812, + "learning_rate": 4.488640192341988e-05, + "loss": 0.9191, + "step": 166800 + }, + { + "epoch": 1.0657015447912808, + "grad_norm": 1.0184838771820068, + "learning_rate": 4.4881410574523916e-05, + "loss": 1.002, + "step": 166810 + }, + { + "epoch": 1.0657654319410195, + "grad_norm": 0.7228994965553284, + "learning_rate": 4.487641927717657e-05, + "loss": 0.9738, + "step": 166820 + }, + { + "epoch": 1.0658293190907582, + "grad_norm": 1.0932427644729614, + "learning_rate": 4.4871428031428116e-05, + "loss": 0.9745, + "step": 166830 + }, + { + "epoch": 1.0658932062404969, + "grad_norm": 1.0328798294067383, + "learning_rate": 4.4866436837328816e-05, + 
"loss": 0.7275, + "step": 166840 + }, + { + "epoch": 1.0659570933902356, + "grad_norm": 0.9208298921585083, + "learning_rate": 4.486144569492894e-05, + "loss": 0.7779, + "step": 166850 + }, + { + "epoch": 1.066020980539974, + "grad_norm": 1.0833368301391602, + "learning_rate": 4.485645460427874e-05, + "loss": 0.8716, + "step": 166860 + }, + { + "epoch": 1.066084867689713, + "grad_norm": 1.336681842803955, + "learning_rate": 4.4851463565428504e-05, + "loss": 0.9037, + "step": 166870 + }, + { + "epoch": 1.0661487548394515, + "grad_norm": 0.800895631313324, + "learning_rate": 4.484647257842848e-05, + "loss": 0.7627, + "step": 166880 + }, + { + "epoch": 1.0662126419891902, + "grad_norm": 0.9473403096199036, + "learning_rate": 4.484148164332894e-05, + "loss": 1.1128, + "step": 166890 + }, + { + "epoch": 1.066276529138929, + "grad_norm": 1.6257963180541992, + "learning_rate": 4.4836490760180136e-05, + "loss": 0.7619, + "step": 166900 + }, + { + "epoch": 1.0663404162886676, + "grad_norm": 0.9070813655853271, + "learning_rate": 4.4831499929032353e-05, + "loss": 0.9571, + "step": 166910 + }, + { + "epoch": 1.0664043034384063, + "grad_norm": 3.044250249862671, + "learning_rate": 4.482650914993582e-05, + "loss": 0.7744, + "step": 166920 + }, + { + "epoch": 1.066468190588145, + "grad_norm": 1.1026273965835571, + "learning_rate": 4.4821518422940824e-05, + "loss": 0.828, + "step": 166930 + }, + { + "epoch": 1.0665320777378837, + "grad_norm": 1.101633906364441, + "learning_rate": 4.4816527748097616e-05, + "loss": 0.9907, + "step": 166940 + }, + { + "epoch": 1.0665959648876224, + "grad_norm": 0.8642861247062683, + "learning_rate": 4.481153712545645e-05, + "loss": 0.7035, + "step": 166950 + }, + { + "epoch": 1.0666598520373611, + "grad_norm": 1.0184928178787231, + "learning_rate": 4.480654655506761e-05, + "loss": 0.8259, + "step": 166960 + }, + { + "epoch": 1.0667237391870998, + "grad_norm": 1.317468285560608, + "learning_rate": 4.4801556036981324e-05, + "loss": 0.8339, + "step": 
166970 + }, + { + "epoch": 1.0667876263368385, + "grad_norm": 1.0101289749145508, + "learning_rate": 4.479656557124787e-05, + "loss": 0.9275, + "step": 166980 + }, + { + "epoch": 1.0668515134865773, + "grad_norm": 1.7089424133300781, + "learning_rate": 4.479157515791751e-05, + "loss": 0.7327, + "step": 166990 + }, + { + "epoch": 1.066915400636316, + "grad_norm": 0.7492755651473999, + "learning_rate": 4.4786584797040485e-05, + "loss": 0.6942, + "step": 167000 + }, + { + "epoch": 1.0669792877860547, + "grad_norm": 1.4743152856826782, + "learning_rate": 4.4781594488667065e-05, + "loss": 0.6916, + "step": 167010 + }, + { + "epoch": 1.0670431749357934, + "grad_norm": 1.0931754112243652, + "learning_rate": 4.47766042328475e-05, + "loss": 0.8617, + "step": 167020 + }, + { + "epoch": 1.067107062085532, + "grad_norm": 2.1639623641967773, + "learning_rate": 4.477161402963206e-05, + "loss": 0.7629, + "step": 167030 + }, + { + "epoch": 1.0671709492352708, + "grad_norm": 0.7249797582626343, + "learning_rate": 4.476662387907098e-05, + "loss": 0.9629, + "step": 167040 + }, + { + "epoch": 1.0672348363850095, + "grad_norm": 1.3922828435897827, + "learning_rate": 4.476163378121452e-05, + "loss": 1.0208, + "step": 167050 + }, + { + "epoch": 1.0672987235347482, + "grad_norm": 0.8654850721359253, + "learning_rate": 4.475664373611294e-05, + "loss": 1.0369, + "step": 167060 + }, + { + "epoch": 1.067362610684487, + "grad_norm": 1.1859705448150635, + "learning_rate": 4.47516537438165e-05, + "loss": 0.7634, + "step": 167070 + }, + { + "epoch": 1.0674264978342256, + "grad_norm": 0.7614762187004089, + "learning_rate": 4.474666380437545e-05, + "loss": 0.79, + "step": 167080 + }, + { + "epoch": 1.0674903849839643, + "grad_norm": 4.074455261230469, + "learning_rate": 4.4741673917840035e-05, + "loss": 0.9808, + "step": 167090 + }, + { + "epoch": 1.067554272133703, + "grad_norm": 0.8677727580070496, + "learning_rate": 4.473668408426052e-05, + "loss": 0.9667, + "step": 167100 + }, + { + "epoch": 
1.0676181592834417, + "grad_norm": 0.8779893517494202, + "learning_rate": 4.4731694303687144e-05, + "loss": 0.8795, + "step": 167110 + }, + { + "epoch": 1.0676820464331804, + "grad_norm": 0.8248956203460693, + "learning_rate": 4.4726704576170165e-05, + "loss": 0.8779, + "step": 167120 + }, + { + "epoch": 1.0677459335829191, + "grad_norm": 0.7094441056251526, + "learning_rate": 4.472171490175983e-05, + "loss": 1.049, + "step": 167130 + }, + { + "epoch": 1.0678098207326578, + "grad_norm": 0.6770461201667786, + "learning_rate": 4.47167252805064e-05, + "loss": 0.9599, + "step": 167140 + }, + { + "epoch": 1.0678737078823966, + "grad_norm": 1.041219711303711, + "learning_rate": 4.471173571246011e-05, + "loss": 1.0222, + "step": 167150 + }, + { + "epoch": 1.0679375950321353, + "grad_norm": 0.9974572062492371, + "learning_rate": 4.470674619767122e-05, + "loss": 0.9056, + "step": 167160 + }, + { + "epoch": 1.068001482181874, + "grad_norm": 0.9295303821563721, + "learning_rate": 4.470175673618998e-05, + "loss": 0.9401, + "step": 167170 + }, + { + "epoch": 1.0680653693316127, + "grad_norm": 1.0348182916641235, + "learning_rate": 4.4696767328066626e-05, + "loss": 1.0809, + "step": 167180 + }, + { + "epoch": 1.0681292564813514, + "grad_norm": 0.7640472650527954, + "learning_rate": 4.4691777973351426e-05, + "loss": 0.9645, + "step": 167190 + }, + { + "epoch": 1.06819314363109, + "grad_norm": 0.6735183000564575, + "learning_rate": 4.46867886720946e-05, + "loss": 0.8436, + "step": 167200 + }, + { + "epoch": 1.0682570307808288, + "grad_norm": 0.9889833331108093, + "learning_rate": 4.468179942434641e-05, + "loss": 0.8263, + "step": 167210 + }, + { + "epoch": 1.0683209179305675, + "grad_norm": 0.9082995057106018, + "learning_rate": 4.4676810230157107e-05, + "loss": 0.7079, + "step": 167220 + }, + { + "epoch": 1.0683848050803062, + "grad_norm": 0.7941724061965942, + "learning_rate": 4.467182108957692e-05, + "loss": 0.6653, + "step": 167230 + }, + { + "epoch": 1.068448692230045, + 
"grad_norm": 0.6307287216186523, + "learning_rate": 4.466683200265611e-05, + "loss": 0.8493, + "step": 167240 + }, + { + "epoch": 1.0685125793797836, + "grad_norm": 0.5019100904464722, + "learning_rate": 4.466184296944492e-05, + "loss": 0.9378, + "step": 167250 + }, + { + "epoch": 1.0685764665295223, + "grad_norm": 2.5777366161346436, + "learning_rate": 4.465685398999358e-05, + "loss": 0.9536, + "step": 167260 + }, + { + "epoch": 1.068640353679261, + "grad_norm": 0.7746297121047974, + "learning_rate": 4.465186506435235e-05, + "loss": 1.051, + "step": 167270 + }, + { + "epoch": 1.0687042408289997, + "grad_norm": 0.7943103313446045, + "learning_rate": 4.4646876192571465e-05, + "loss": 0.7168, + "step": 167280 + }, + { + "epoch": 1.0687681279787384, + "grad_norm": 2.4050159454345703, + "learning_rate": 4.464188737470117e-05, + "loss": 0.7538, + "step": 167290 + }, + { + "epoch": 1.0688320151284771, + "grad_norm": 1.3626781702041626, + "learning_rate": 4.463689861079169e-05, + "loss": 0.8716, + "step": 167300 + }, + { + "epoch": 1.0688959022782158, + "grad_norm": 2.211430788040161, + "learning_rate": 4.463190990089329e-05, + "loss": 1.2561, + "step": 167310 + }, + { + "epoch": 1.0689597894279546, + "grad_norm": 0.8661367893218994, + "learning_rate": 4.462692124505621e-05, + "loss": 0.8686, + "step": 167320 + }, + { + "epoch": 1.0690236765776933, + "grad_norm": 0.7823354005813599, + "learning_rate": 4.462193264333067e-05, + "loss": 0.8867, + "step": 167330 + }, + { + "epoch": 1.069087563727432, + "grad_norm": 1.1050783395767212, + "learning_rate": 4.4616944095766924e-05, + "loss": 0.7956, + "step": 167340 + }, + { + "epoch": 1.0691514508771705, + "grad_norm": 2.726477861404419, + "learning_rate": 4.4611955602415215e-05, + "loss": 0.9001, + "step": 167350 + }, + { + "epoch": 1.0692153380269094, + "grad_norm": 0.719448447227478, + "learning_rate": 4.4606967163325765e-05, + "loss": 0.9276, + "step": 167360 + }, + { + "epoch": 1.0692792251766479, + "grad_norm": 
1.078935146331787, + "learning_rate": 4.460197877854882e-05, + "loss": 0.8703, + "step": 167370 + }, + { + "epoch": 1.0693431123263866, + "grad_norm": 0.5898913145065308, + "learning_rate": 4.4596990448134625e-05, + "loss": 0.895, + "step": 167380 + }, + { + "epoch": 1.0694069994761253, + "grad_norm": 0.8286154270172119, + "learning_rate": 4.459200217213339e-05, + "loss": 0.9249, + "step": 167390 + }, + { + "epoch": 1.069470886625864, + "grad_norm": 2.352092742919922, + "learning_rate": 4.45870139505954e-05, + "loss": 1.0513, + "step": 167400 + }, + { + "epoch": 1.0695347737756027, + "grad_norm": 1.0438710451126099, + "learning_rate": 4.458202578357085e-05, + "loss": 0.6509, + "step": 167410 + }, + { + "epoch": 1.0695986609253414, + "grad_norm": 1.3391057252883911, + "learning_rate": 4.457703767110999e-05, + "loss": 0.8631, + "step": 167420 + }, + { + "epoch": 1.06966254807508, + "grad_norm": 1.0806117057800293, + "learning_rate": 4.4572049613263055e-05, + "loss": 0.8065, + "step": 167430 + }, + { + "epoch": 1.0697264352248188, + "grad_norm": 0.7838671803474426, + "learning_rate": 4.456706161008027e-05, + "loss": 0.8195, + "step": 167440 + }, + { + "epoch": 1.0697903223745575, + "grad_norm": 0.7801133394241333, + "learning_rate": 4.456207366161188e-05, + "loss": 0.665, + "step": 167450 + }, + { + "epoch": 1.0698542095242962, + "grad_norm": 1.2091721296310425, + "learning_rate": 4.4557085767908116e-05, + "loss": 1.0544, + "step": 167460 + }, + { + "epoch": 1.069918096674035, + "grad_norm": 0.7456448674201965, + "learning_rate": 4.45520979290192e-05, + "loss": 0.8551, + "step": 167470 + }, + { + "epoch": 1.0699819838237736, + "grad_norm": 0.9205488562583923, + "learning_rate": 4.454711014499538e-05, + "loss": 0.8446, + "step": 167480 + }, + { + "epoch": 1.0700458709735123, + "grad_norm": 0.8250446319580078, + "learning_rate": 4.4542122415886863e-05, + "loss": 0.9594, + "step": 167490 + }, + { + "epoch": 1.070109758123251, + "grad_norm": 1.068128228187561, + 
"learning_rate": 4.45371347417439e-05, + "loss": 0.8108, + "step": 167500 + }, + { + "epoch": 1.0701736452729897, + "grad_norm": 0.781631350517273, + "learning_rate": 4.453214712261672e-05, + "loss": 0.7391, + "step": 167510 + }, + { + "epoch": 1.0702375324227285, + "grad_norm": 0.6052827835083008, + "learning_rate": 4.452715955855555e-05, + "loss": 0.8435, + "step": 167520 + }, + { + "epoch": 1.0703014195724672, + "grad_norm": 1.0644205808639526, + "learning_rate": 4.4522172049610613e-05, + "loss": 0.9365, + "step": 167530 + }, + { + "epoch": 1.0703653067222059, + "grad_norm": 0.9585718512535095, + "learning_rate": 4.451718459583215e-05, + "loss": 1.0394, + "step": 167540 + }, + { + "epoch": 1.0704291938719446, + "grad_norm": 0.5789369344711304, + "learning_rate": 4.451219719727038e-05, + "loss": 0.8574, + "step": 167550 + }, + { + "epoch": 1.0704930810216833, + "grad_norm": 0.7152701020240784, + "learning_rate": 4.450720985397553e-05, + "loss": 1.0427, + "step": 167560 + }, + { + "epoch": 1.070556968171422, + "grad_norm": 0.8098849058151245, + "learning_rate": 4.4502222565997826e-05, + "loss": 1.0818, + "step": 167570 + }, + { + "epoch": 1.0706208553211607, + "grad_norm": 0.870178759098053, + "learning_rate": 4.44972353333875e-05, + "loss": 1.1725, + "step": 167580 + }, + { + "epoch": 1.0706847424708994, + "grad_norm": 0.6339501142501831, + "learning_rate": 4.449224815619476e-05, + "loss": 0.8222, + "step": 167590 + }, + { + "epoch": 1.070748629620638, + "grad_norm": 0.4935135543346405, + "learning_rate": 4.448775974414488e-05, + "loss": 0.9549, + "step": 167600 + }, + { + "epoch": 1.0708125167703768, + "grad_norm": 1.5905778408050537, + "learning_rate": 4.448277267238396e-05, + "loss": 1.0304, + "step": 167610 + }, + { + "epoch": 1.0708764039201155, + "grad_norm": 0.7705449461936951, + "learning_rate": 4.44777856561863e-05, + "loss": 0.858, + "step": 167620 + }, + { + "epoch": 1.0709402910698542, + "grad_norm": 0.9918437600135803, + "learning_rate": 
4.447279869560211e-05, + "loss": 0.8912, + "step": 167630 + }, + { + "epoch": 1.071004178219593, + "grad_norm": 1.0865930318832397, + "learning_rate": 4.4467811790681626e-05, + "loss": 0.8048, + "step": 167640 + }, + { + "epoch": 1.0710680653693316, + "grad_norm": 0.8210586905479431, + "learning_rate": 4.446282494147506e-05, + "loss": 0.8904, + "step": 167650 + }, + { + "epoch": 1.0711319525190703, + "grad_norm": 0.9417276978492737, + "learning_rate": 4.445783814803263e-05, + "loss": 0.9236, + "step": 167660 + }, + { + "epoch": 1.071195839668809, + "grad_norm": 1.0197242498397827, + "learning_rate": 4.4452851410404575e-05, + "loss": 0.7601, + "step": 167670 + }, + { + "epoch": 1.0712597268185478, + "grad_norm": 1.1198569536209106, + "learning_rate": 4.4447864728641106e-05, + "loss": 0.7624, + "step": 167680 + }, + { + "epoch": 1.0713236139682865, + "grad_norm": 0.7067335844039917, + "learning_rate": 4.4442878102792436e-05, + "loss": 0.8205, + "step": 167690 + }, + { + "epoch": 1.0713875011180252, + "grad_norm": 0.8206075429916382, + "learning_rate": 4.443789153290879e-05, + "loss": 0.8195, + "step": 167700 + }, + { + "epoch": 1.0714513882677639, + "grad_norm": 0.9611579775810242, + "learning_rate": 4.4432905019040386e-05, + "loss": 0.6071, + "step": 167710 + }, + { + "epoch": 1.0715152754175026, + "grad_norm": 1.1848875284194946, + "learning_rate": 4.442791856123744e-05, + "loss": 0.7083, + "step": 167720 + }, + { + "epoch": 1.0715791625672413, + "grad_norm": 0.7153217196464539, + "learning_rate": 4.442293215955019e-05, + "loss": 1.1518, + "step": 167730 + }, + { + "epoch": 1.07164304971698, + "grad_norm": 1.418792963027954, + "learning_rate": 4.4417945814028834e-05, + "loss": 0.9059, + "step": 167740 + }, + { + "epoch": 1.0717069368667187, + "grad_norm": 1.3425724506378174, + "learning_rate": 4.4412959524723586e-05, + "loss": 0.6204, + "step": 167750 + }, + { + "epoch": 1.0717708240164574, + "grad_norm": 0.91298907995224, + "learning_rate": 4.440797329168467e-05, 
+ "loss": 0.8499, + "step": 167760 + }, + { + "epoch": 1.071834711166196, + "grad_norm": 0.994581937789917, + "learning_rate": 4.440298711496231e-05, + "loss": 1.0169, + "step": 167770 + }, + { + "epoch": 1.0718985983159348, + "grad_norm": 0.8729711174964905, + "learning_rate": 4.439800099460671e-05, + "loss": 0.6801, + "step": 167780 + }, + { + "epoch": 1.0719624854656735, + "grad_norm": 0.9728224873542786, + "learning_rate": 4.4393014930668084e-05, + "loss": 0.6659, + "step": 167790 + }, + { + "epoch": 1.0720263726154122, + "grad_norm": 1.115736722946167, + "learning_rate": 4.4388028923196645e-05, + "loss": 1.0955, + "step": 167800 + }, + { + "epoch": 1.072090259765151, + "grad_norm": 0.9253178834915161, + "learning_rate": 4.438304297224261e-05, + "loss": 0.7719, + "step": 167810 + }, + { + "epoch": 1.0721541469148896, + "grad_norm": 0.7113856673240662, + "learning_rate": 4.4378057077856216e-05, + "loss": 0.991, + "step": 167820 + }, + { + "epoch": 1.0722180340646283, + "grad_norm": 1.2730010747909546, + "learning_rate": 4.4373071240087624e-05, + "loss": 1.0908, + "step": 167830 + }, + { + "epoch": 1.0722819212143668, + "grad_norm": 0.6992239356040955, + "learning_rate": 4.4368085458987075e-05, + "loss": 0.8704, + "step": 167840 + }, + { + "epoch": 1.0723458083641058, + "grad_norm": 0.8266318440437317, + "learning_rate": 4.436309973460478e-05, + "loss": 0.8084, + "step": 167850 + }, + { + "epoch": 1.0724096955138442, + "grad_norm": 0.6330587863922119, + "learning_rate": 4.4358114066990944e-05, + "loss": 0.8448, + "step": 167860 + }, + { + "epoch": 1.072473582663583, + "grad_norm": 2.0142178535461426, + "learning_rate": 4.435312845619579e-05, + "loss": 0.7857, + "step": 167870 + }, + { + "epoch": 1.0725374698133217, + "grad_norm": 1.0071065425872803, + "learning_rate": 4.434814290226951e-05, + "loss": 1.2855, + "step": 167880 + }, + { + "epoch": 1.0726013569630604, + "grad_norm": 0.9862484335899353, + "learning_rate": 4.434315740526232e-05, + "loss": 1.083, + 
"step": 167890 + }, + { + "epoch": 1.072665244112799, + "grad_norm": 0.7634795904159546, + "learning_rate": 4.433817196522443e-05, + "loss": 0.9183, + "step": 167900 + }, + { + "epoch": 1.0727291312625378, + "grad_norm": 1.3216174840927124, + "learning_rate": 4.433318658220605e-05, + "loss": 1.0064, + "step": 167910 + }, + { + "epoch": 1.0727930184122765, + "grad_norm": 1.0118058919906616, + "learning_rate": 4.432820125625738e-05, + "loss": 0.8958, + "step": 167920 + }, + { + "epoch": 1.0728569055620152, + "grad_norm": 1.1793543100357056, + "learning_rate": 4.432321598742863e-05, + "loss": 1.077, + "step": 167930 + }, + { + "epoch": 1.072920792711754, + "grad_norm": 0.829182505607605, + "learning_rate": 4.4318230775770006e-05, + "loss": 0.6589, + "step": 167940 + }, + { + "epoch": 1.0729846798614926, + "grad_norm": 0.9053888916969299, + "learning_rate": 4.431324562133172e-05, + "loss": 0.7827, + "step": 167950 + }, + { + "epoch": 1.0730485670112313, + "grad_norm": 1.006364345550537, + "learning_rate": 4.430826052416396e-05, + "loss": 1.0471, + "step": 167960 + }, + { + "epoch": 1.07311245416097, + "grad_norm": 3.569615125656128, + "learning_rate": 4.430327548431695e-05, + "loss": 1.0903, + "step": 167970 + }, + { + "epoch": 1.0731763413107087, + "grad_norm": 1.1795485019683838, + "learning_rate": 4.429829050184088e-05, + "loss": 0.8895, + "step": 167980 + }, + { + "epoch": 1.0732402284604474, + "grad_norm": 1.0050119161605835, + "learning_rate": 4.429330557678595e-05, + "loss": 0.8886, + "step": 167990 + }, + { + "epoch": 1.0733041156101861, + "grad_norm": 0.7378489971160889, + "learning_rate": 4.428832070920238e-05, + "loss": 0.7695, + "step": 168000 + }, + { + "epoch": 1.0733680027599248, + "grad_norm": 1.542384147644043, + "learning_rate": 4.428333589914036e-05, + "loss": 0.6757, + "step": 168010 + }, + { + "epoch": 1.0734318899096635, + "grad_norm": 0.9338440299034119, + "learning_rate": 4.4278351146650086e-05, + "loss": 0.8971, + "step": 168020 + }, + { + 
"epoch": 1.0734957770594022, + "grad_norm": 0.7650867700576782, + "learning_rate": 4.427336645178177e-05, + "loss": 0.7302, + "step": 168030 + }, + { + "epoch": 1.073559664209141, + "grad_norm": 0.7335389852523804, + "learning_rate": 4.42683818145856e-05, + "loss": 1.0885, + "step": 168040 + }, + { + "epoch": 1.0736235513588797, + "grad_norm": 1.6143547296524048, + "learning_rate": 4.4263397235111795e-05, + "loss": 0.9723, + "step": 168050 + }, + { + "epoch": 1.0736874385086184, + "grad_norm": 0.8721071481704712, + "learning_rate": 4.425841271341055e-05, + "loss": 0.9947, + "step": 168060 + }, + { + "epoch": 1.073751325658357, + "grad_norm": 0.7515314221382141, + "learning_rate": 4.425342824953204e-05, + "loss": 1.0497, + "step": 168070 + }, + { + "epoch": 1.0738152128080958, + "grad_norm": 1.048852801322937, + "learning_rate": 4.424844384352649e-05, + "loss": 1.0662, + "step": 168080 + }, + { + "epoch": 1.0738790999578345, + "grad_norm": 0.9293131232261658, + "learning_rate": 4.424345949544408e-05, + "loss": 0.96, + "step": 168090 + }, + { + "epoch": 1.0739429871075732, + "grad_norm": 0.540847659111023, + "learning_rate": 4.4238475205335015e-05, + "loss": 1.0502, + "step": 168100 + }, + { + "epoch": 1.074006874257312, + "grad_norm": 0.7694253325462341, + "learning_rate": 4.423349097324949e-05, + "loss": 0.8383, + "step": 168110 + }, + { + "epoch": 1.0740707614070506, + "grad_norm": 0.9716971516609192, + "learning_rate": 4.42285067992377e-05, + "loss": 1.1018, + "step": 168120 + }, + { + "epoch": 1.0741346485567893, + "grad_norm": 1.4167827367782593, + "learning_rate": 4.4223522683349835e-05, + "loss": 0.9259, + "step": 168130 + }, + { + "epoch": 1.074198535706528, + "grad_norm": 0.784041702747345, + "learning_rate": 4.42185386256361e-05, + "loss": 0.7515, + "step": 168140 + }, + { + "epoch": 1.0742624228562667, + "grad_norm": 0.80231112241745, + "learning_rate": 4.421355462614668e-05, + "loss": 0.9655, + "step": 168150 + }, + { + "epoch": 1.0743263100060054, + 
"grad_norm": 0.7538440823554993, + "learning_rate": 4.420857068493178e-05, + "loss": 0.7496, + "step": 168160 + }, + { + "epoch": 1.0743901971557441, + "grad_norm": 2.33453106880188, + "learning_rate": 4.4203586802041566e-05, + "loss": 0.7772, + "step": 168170 + }, + { + "epoch": 1.0744540843054828, + "grad_norm": 1.0160599946975708, + "learning_rate": 4.419860297752626e-05, + "loss": 0.6744, + "step": 168180 + }, + { + "epoch": 1.0745179714552215, + "grad_norm": 0.8873048424720764, + "learning_rate": 4.419361921143604e-05, + "loss": 0.9338, + "step": 168190 + }, + { + "epoch": 1.0745818586049602, + "grad_norm": 0.8100428581237793, + "learning_rate": 4.4188635503821094e-05, + "loss": 0.6733, + "step": 168200 + }, + { + "epoch": 1.074645745754699, + "grad_norm": 0.87029629945755, + "learning_rate": 4.418365185473162e-05, + "loss": 0.8515, + "step": 168210 + }, + { + "epoch": 1.0747096329044377, + "grad_norm": 1.0414305925369263, + "learning_rate": 4.4178668264217796e-05, + "loss": 1.0144, + "step": 168220 + }, + { + "epoch": 1.0747735200541764, + "grad_norm": 1.6467934846878052, + "learning_rate": 4.417368473232982e-05, + "loss": 0.8989, + "step": 168230 + }, + { + "epoch": 1.074837407203915, + "grad_norm": 1.692230224609375, + "learning_rate": 4.416870125911788e-05, + "loss": 0.9082, + "step": 168240 + }, + { + "epoch": 1.0749012943536538, + "grad_norm": 1.0755226612091064, + "learning_rate": 4.416371784463216e-05, + "loss": 0.8357, + "step": 168250 + }, + { + "epoch": 1.0749651815033925, + "grad_norm": 0.6711715459823608, + "learning_rate": 4.415873448892286e-05, + "loss": 1.0267, + "step": 168260 + }, + { + "epoch": 1.0750290686531312, + "grad_norm": 0.965099036693573, + "learning_rate": 4.4153751192040153e-05, + "loss": 1.1305, + "step": 168270 + }, + { + "epoch": 1.07509295580287, + "grad_norm": 0.9926470518112183, + "learning_rate": 4.414876795403423e-05, + "loss": 0.7589, + "step": 168280 + }, + { + "epoch": 1.0751568429526086, + "grad_norm": 
0.9975265264511108, + "learning_rate": 4.4143784774955274e-05, + "loss": 0.7783, + "step": 168290 + }, + { + "epoch": 1.0752207301023473, + "grad_norm": 0.831061601638794, + "learning_rate": 4.41388016548535e-05, + "loss": 0.941, + "step": 168300 + }, + { + "epoch": 1.075284617252086, + "grad_norm": 0.7290365099906921, + "learning_rate": 4.413381859377904e-05, + "loss": 0.7004, + "step": 168310 + }, + { + "epoch": 1.0753485044018247, + "grad_norm": 1.004852294921875, + "learning_rate": 4.412883559178209e-05, + "loss": 1.2023, + "step": 168320 + }, + { + "epoch": 1.0754123915515632, + "grad_norm": 1.3445194959640503, + "learning_rate": 4.412385264891286e-05, + "loss": 0.8475, + "step": 168330 + }, + { + "epoch": 1.0754762787013021, + "grad_norm": 1.051191806793213, + "learning_rate": 4.411886976522151e-05, + "loss": 0.8842, + "step": 168340 + }, + { + "epoch": 1.0755401658510406, + "grad_norm": 1.453165888786316, + "learning_rate": 4.411388694075822e-05, + "loss": 0.763, + "step": 168350 + }, + { + "epoch": 1.0756040530007793, + "grad_norm": 1.1620047092437744, + "learning_rate": 4.410890417557319e-05, + "loss": 0.769, + "step": 168360 + }, + { + "epoch": 1.075667940150518, + "grad_norm": 0.6074997782707214, + "learning_rate": 4.410392146971659e-05, + "loss": 0.808, + "step": 168370 + }, + { + "epoch": 1.0757318273002567, + "grad_norm": 0.8900678157806396, + "learning_rate": 4.409893882323861e-05, + "loss": 0.9424, + "step": 168380 + }, + { + "epoch": 1.0757957144499954, + "grad_norm": 0.6844345331192017, + "learning_rate": 4.409395623618941e-05, + "loss": 0.5761, + "step": 168390 + }, + { + "epoch": 1.0758596015997341, + "grad_norm": 1.0848898887634277, + "learning_rate": 4.408897370861919e-05, + "loss": 0.7388, + "step": 168400 + }, + { + "epoch": 1.0759234887494729, + "grad_norm": 1.1647626161575317, + "learning_rate": 4.4083991240578115e-05, + "loss": 0.9816, + "step": 168410 + }, + { + "epoch": 1.0759873758992116, + "grad_norm": 1.0366533994674683, + 
"learning_rate": 4.407900883211636e-05, + "loss": 0.8788, + "step": 168420 + }, + { + "epoch": 1.0760512630489503, + "grad_norm": 1.0390725135803223, + "learning_rate": 4.407402648328412e-05, + "loss": 0.7624, + "step": 168430 + }, + { + "epoch": 1.076115150198689, + "grad_norm": 0.7953748106956482, + "learning_rate": 4.406904419413155e-05, + "loss": 0.8238, + "step": 168440 + }, + { + "epoch": 1.0761790373484277, + "grad_norm": 0.825472354888916, + "learning_rate": 4.406406196470884e-05, + "loss": 1.1502, + "step": 168450 + }, + { + "epoch": 1.0762429244981664, + "grad_norm": 0.642017126083374, + "learning_rate": 4.4059079795066164e-05, + "loss": 0.8664, + "step": 168460 + }, + { + "epoch": 1.076306811647905, + "grad_norm": 1.6154723167419434, + "learning_rate": 4.4054097685253694e-05, + "loss": 0.7728, + "step": 168470 + }, + { + "epoch": 1.0763706987976438, + "grad_norm": 1.057303786277771, + "learning_rate": 4.4049115635321595e-05, + "loss": 0.7316, + "step": 168480 + }, + { + "epoch": 1.0764345859473825, + "grad_norm": 1.460866928100586, + "learning_rate": 4.404413364532006e-05, + "loss": 0.9117, + "step": 168490 + }, + { + "epoch": 1.0764984730971212, + "grad_norm": 0.7985673546791077, + "learning_rate": 4.403915171529925e-05, + "loss": 0.857, + "step": 168500 + }, + { + "epoch": 1.07656236024686, + "grad_norm": 0.9446043372154236, + "learning_rate": 4.403416984530934e-05, + "loss": 1.1648, + "step": 168510 + }, + { + "epoch": 1.0766262473965986, + "grad_norm": 1.0071111917495728, + "learning_rate": 4.402918803540049e-05, + "loss": 0.7617, + "step": 168520 + }, + { + "epoch": 1.0766901345463373, + "grad_norm": 1.2365188598632812, + "learning_rate": 4.402420628562289e-05, + "loss": 0.8075, + "step": 168530 + }, + { + "epoch": 1.076754021696076, + "grad_norm": 0.8958815932273865, + "learning_rate": 4.4019224596026706e-05, + "loss": 0.819, + "step": 168540 + }, + { + "epoch": 1.0768179088458147, + "grad_norm": 0.5266879796981812, + "learning_rate": 
4.40142429666621e-05, + "loss": 0.796, + "step": 168550 + }, + { + "epoch": 1.0768817959955534, + "grad_norm": 1.222324013710022, + "learning_rate": 4.400926139757924e-05, + "loss": 0.6938, + "step": 168560 + }, + { + "epoch": 1.0769456831452922, + "grad_norm": 1.7840367555618286, + "learning_rate": 4.40042798888283e-05, + "loss": 0.8517, + "step": 168570 + }, + { + "epoch": 1.0770095702950309, + "grad_norm": 0.7238796353340149, + "learning_rate": 4.3999298440459455e-05, + "loss": 0.834, + "step": 168580 + }, + { + "epoch": 1.0770734574447696, + "grad_norm": 0.7589558362960815, + "learning_rate": 4.399431705252287e-05, + "loss": 0.6977, + "step": 168590 + }, + { + "epoch": 1.0771373445945083, + "grad_norm": 0.9841206669807434, + "learning_rate": 4.398933572506871e-05, + "loss": 0.7392, + "step": 168600 + }, + { + "epoch": 1.077201231744247, + "grad_norm": 1.0497477054595947, + "learning_rate": 4.398435445814713e-05, + "loss": 0.7858, + "step": 168610 + }, + { + "epoch": 1.0772651188939857, + "grad_norm": 0.9010785818099976, + "learning_rate": 4.3979373251808307e-05, + "loss": 1.0822, + "step": 168620 + }, + { + "epoch": 1.0773290060437244, + "grad_norm": 0.7049776315689087, + "learning_rate": 4.3974392106102405e-05, + "loss": 0.7629, + "step": 168630 + }, + { + "epoch": 1.077392893193463, + "grad_norm": 0.7971298098564148, + "learning_rate": 4.396941102107959e-05, + "loss": 0.712, + "step": 168640 + }, + { + "epoch": 1.0774567803432018, + "grad_norm": 0.9001279473304749, + "learning_rate": 4.396442999679003e-05, + "loss": 0.92, + "step": 168650 + }, + { + "epoch": 1.0775206674929405, + "grad_norm": 0.7700995802879333, + "learning_rate": 4.395944903328387e-05, + "loss": 0.6326, + "step": 168660 + }, + { + "epoch": 1.0775845546426792, + "grad_norm": 1.122528314590454, + "learning_rate": 4.395446813061128e-05, + "loss": 0.7356, + "step": 168670 + }, + { + "epoch": 1.077648441792418, + "grad_norm": 1.0956536531448364, + "learning_rate": 4.3949487288822434e-05, + 
"loss": 0.9384, + "step": 168680 + }, + { + "epoch": 1.0777123289421566, + "grad_norm": 0.9272425174713135, + "learning_rate": 4.3944506507967484e-05, + "loss": 0.9868, + "step": 168690 + }, + { + "epoch": 1.0777762160918953, + "grad_norm": 0.9327694177627563, + "learning_rate": 4.3939525788096595e-05, + "loss": 1.1208, + "step": 168700 + }, + { + "epoch": 1.077840103241634, + "grad_norm": 0.92158043384552, + "learning_rate": 4.3934545129259925e-05, + "loss": 1.16, + "step": 168710 + }, + { + "epoch": 1.0779039903913727, + "grad_norm": 0.7717714905738831, + "learning_rate": 4.3929564531507627e-05, + "loss": 0.9948, + "step": 168720 + }, + { + "epoch": 1.0779678775411115, + "grad_norm": 1.565289855003357, + "learning_rate": 4.392458399488987e-05, + "loss": 0.6847, + "step": 168730 + }, + { + "epoch": 1.0780317646908502, + "grad_norm": 0.6341485977172852, + "learning_rate": 4.3919603519456806e-05, + "loss": 0.8132, + "step": 168740 + }, + { + "epoch": 1.0780956518405889, + "grad_norm": 1.279700517654419, + "learning_rate": 4.391462310525859e-05, + "loss": 0.9369, + "step": 168750 + }, + { + "epoch": 1.0781595389903276, + "grad_norm": 0.7400231957435608, + "learning_rate": 4.390964275234538e-05, + "loss": 1.0159, + "step": 168760 + }, + { + "epoch": 1.0782234261400663, + "grad_norm": 1.025972843170166, + "learning_rate": 4.3904662460767346e-05, + "loss": 0.9119, + "step": 168770 + }, + { + "epoch": 1.078287313289805, + "grad_norm": 0.7673271298408508, + "learning_rate": 4.389968223057464e-05, + "loss": 0.8859, + "step": 168780 + }, + { + "epoch": 1.0783512004395437, + "grad_norm": 0.8631071448326111, + "learning_rate": 4.389470206181743e-05, + "loss": 0.7731, + "step": 168790 + }, + { + "epoch": 1.0784150875892824, + "grad_norm": 0.7734096646308899, + "learning_rate": 4.388972195454583e-05, + "loss": 0.8901, + "step": 168800 + }, + { + "epoch": 1.078478974739021, + "grad_norm": 1.3677899837493896, + "learning_rate": 4.388474190881e-05, + "loss": 1.0029, + "step": 
168810 + }, + { + "epoch": 1.0785428618887596, + "grad_norm": 1.3638877868652344, + "learning_rate": 4.3879761924660135e-05, + "loss": 0.8329, + "step": 168820 + }, + { + "epoch": 1.0786067490384985, + "grad_norm": 0.7826031446456909, + "learning_rate": 4.387478200214635e-05, + "loss": 0.8171, + "step": 168830 + }, + { + "epoch": 1.078670636188237, + "grad_norm": 1.2444953918457031, + "learning_rate": 4.3869802141318804e-05, + "loss": 0.9164, + "step": 168840 + }, + { + "epoch": 1.0787345233379757, + "grad_norm": 0.8797706961631775, + "learning_rate": 4.3864822342227664e-05, + "loss": 0.9033, + "step": 168850 + }, + { + "epoch": 1.0787984104877144, + "grad_norm": 0.9462030529975891, + "learning_rate": 4.3859842604923065e-05, + "loss": 0.8665, + "step": 168860 + }, + { + "epoch": 1.0788622976374531, + "grad_norm": 0.860593318939209, + "learning_rate": 4.3854862929455164e-05, + "loss": 0.8267, + "step": 168870 + }, + { + "epoch": 1.0789261847871918, + "grad_norm": 3.0111243724823, + "learning_rate": 4.38498833158741e-05, + "loss": 0.8713, + "step": 168880 + }, + { + "epoch": 1.0789900719369305, + "grad_norm": 0.8164293169975281, + "learning_rate": 4.384490376423004e-05, + "loss": 0.981, + "step": 168890 + }, + { + "epoch": 1.0790539590866692, + "grad_norm": 1.224902868270874, + "learning_rate": 4.383992427457312e-05, + "loss": 0.8621, + "step": 168900 + }, + { + "epoch": 1.079117846236408, + "grad_norm": 1.4257479906082153, + "learning_rate": 4.38349448469535e-05, + "loss": 0.6887, + "step": 168910 + }, + { + "epoch": 1.0791817333861466, + "grad_norm": 1.2082140445709229, + "learning_rate": 4.382996548142132e-05, + "loss": 0.6849, + "step": 168920 + }, + { + "epoch": 1.0792456205358854, + "grad_norm": 0.7488890886306763, + "learning_rate": 4.3824986178026725e-05, + "loss": 1.042, + "step": 168930 + }, + { + "epoch": 1.079309507685624, + "grad_norm": 1.0096524953842163, + "learning_rate": 4.3820006936819856e-05, + "loss": 0.8779, + "step": 168940 + }, + { + "epoch": 
1.0793733948353628, + "grad_norm": 1.3145537376403809, + "learning_rate": 4.381502775785086e-05, + "loss": 0.8794, + "step": 168950 + }, + { + "epoch": 1.0794372819851015, + "grad_norm": 0.878990888595581, + "learning_rate": 4.38100486411699e-05, + "loss": 0.8852, + "step": 168960 + }, + { + "epoch": 1.0795011691348402, + "grad_norm": 1.4810287952423096, + "learning_rate": 4.380506958682709e-05, + "loss": 0.7115, + "step": 168970 + }, + { + "epoch": 1.0795650562845789, + "grad_norm": 2.4938902854919434, + "learning_rate": 4.3800090594872594e-05, + "loss": 0.9315, + "step": 168980 + }, + { + "epoch": 1.0796289434343176, + "grad_norm": 0.8952536582946777, + "learning_rate": 4.379511166535655e-05, + "loss": 1.0401, + "step": 168990 + }, + { + "epoch": 1.0796928305840563, + "grad_norm": 0.9901078939437866, + "learning_rate": 4.37901327983291e-05, + "loss": 0.7888, + "step": 169000 + }, + { + "epoch": 1.079756717733795, + "grad_norm": 1.0167852640151978, + "learning_rate": 4.3785153993840386e-05, + "loss": 0.8806, + "step": 169010 + }, + { + "epoch": 1.0798206048835337, + "grad_norm": 0.898930013179779, + "learning_rate": 4.378017525194055e-05, + "loss": 0.7056, + "step": 169020 + }, + { + "epoch": 1.0798844920332724, + "grad_norm": 0.9970487356185913, + "learning_rate": 4.3775196572679724e-05, + "loss": 0.8798, + "step": 169030 + }, + { + "epoch": 1.0799483791830111, + "grad_norm": 0.7721714377403259, + "learning_rate": 4.377021795610805e-05, + "loss": 0.5651, + "step": 169040 + }, + { + "epoch": 1.0800122663327498, + "grad_norm": 0.7181320786476135, + "learning_rate": 4.3765239402275685e-05, + "loss": 0.9484, + "step": 169050 + }, + { + "epoch": 1.0800761534824885, + "grad_norm": 0.7529858946800232, + "learning_rate": 4.3760260911232745e-05, + "loss": 0.9099, + "step": 169060 + }, + { + "epoch": 1.0801400406322272, + "grad_norm": 1.4071221351623535, + "learning_rate": 4.3755282483029376e-05, + "loss": 0.8341, + "step": 169070 + }, + { + "epoch": 1.080203927781966, + 
"grad_norm": 1.0312004089355469, + "learning_rate": 4.3750304117715704e-05, + "loss": 1.3534, + "step": 169080 + }, + { + "epoch": 1.0802678149317047, + "grad_norm": 0.9659938812255859, + "learning_rate": 4.3745325815341885e-05, + "loss": 0.7913, + "step": 169090 + }, + { + "epoch": 1.0803317020814434, + "grad_norm": 0.763280987739563, + "learning_rate": 4.374034757595805e-05, + "loss": 0.9172, + "step": 169100 + }, + { + "epoch": 1.080395589231182, + "grad_norm": 4.690800189971924, + "learning_rate": 4.373536939961433e-05, + "loss": 0.8613, + "step": 169110 + }, + { + "epoch": 1.0804594763809208, + "grad_norm": 1.3843324184417725, + "learning_rate": 4.373039128636085e-05, + "loss": 0.7185, + "step": 169120 + }, + { + "epoch": 1.0805233635306595, + "grad_norm": 1.8589551448822021, + "learning_rate": 4.372541323624777e-05, + "loss": 1.0096, + "step": 169130 + }, + { + "epoch": 1.0805872506803982, + "grad_norm": 1.5525774955749512, + "learning_rate": 4.3720435249325196e-05, + "loss": 1.0313, + "step": 169140 + }, + { + "epoch": 1.0806511378301369, + "grad_norm": 0.9464132785797119, + "learning_rate": 4.3715457325643274e-05, + "loss": 0.8473, + "step": 169150 + }, + { + "epoch": 1.0807150249798756, + "grad_norm": 0.9257469773292542, + "learning_rate": 4.3710479465252135e-05, + "loss": 0.6759, + "step": 169160 + }, + { + "epoch": 1.0807789121296143, + "grad_norm": 3.840132474899292, + "learning_rate": 4.370550166820191e-05, + "loss": 0.8859, + "step": 169170 + }, + { + "epoch": 1.080842799279353, + "grad_norm": 0.8638167977333069, + "learning_rate": 4.370052393454272e-05, + "loss": 0.9481, + "step": 169180 + }, + { + "epoch": 1.0809066864290917, + "grad_norm": 0.8762006759643555, + "learning_rate": 4.3695546264324716e-05, + "loss": 1.11, + "step": 169190 + }, + { + "epoch": 1.0809705735788304, + "grad_norm": 0.763314425945282, + "learning_rate": 4.369056865759801e-05, + "loss": 0.8666, + "step": 169200 + }, + { + "epoch": 1.0810344607285691, + "grad_norm": 
0.9983099102973938, + "learning_rate": 4.368559111441274e-05, + "loss": 0.9914, + "step": 169210 + }, + { + "epoch": 1.0810983478783078, + "grad_norm": 0.7259060144424438, + "learning_rate": 4.3680613634819026e-05, + "loss": 1.0586, + "step": 169220 + }, + { + "epoch": 1.0811622350280465, + "grad_norm": 1.0532219409942627, + "learning_rate": 4.3675636218867e-05, + "loss": 0.865, + "step": 169230 + }, + { + "epoch": 1.0812261221777852, + "grad_norm": 2.120307207107544, + "learning_rate": 4.367065886660678e-05, + "loss": 0.8903, + "step": 169240 + }, + { + "epoch": 1.081290009327524, + "grad_norm": 0.9013161659240723, + "learning_rate": 4.366568157808851e-05, + "loss": 1.0287, + "step": 169250 + }, + { + "epoch": 1.0813538964772627, + "grad_norm": 1.2690575122833252, + "learning_rate": 4.3660704353362316e-05, + "loss": 1.1216, + "step": 169260 + }, + { + "epoch": 1.0814177836270014, + "grad_norm": 0.8027593493461609, + "learning_rate": 4.36557271924783e-05, + "loss": 0.8628, + "step": 169270 + }, + { + "epoch": 1.08148167077674, + "grad_norm": 0.7389708757400513, + "learning_rate": 4.3650750095486616e-05, + "loss": 0.8239, + "step": 169280 + }, + { + "epoch": 1.0815455579264786, + "grad_norm": 1.0916862487792969, + "learning_rate": 4.3645773062437354e-05, + "loss": 0.6986, + "step": 169290 + }, + { + "epoch": 1.0816094450762175, + "grad_norm": 0.9672138690948486, + "learning_rate": 4.3640796093380666e-05, + "loss": 0.6625, + "step": 169300 + }, + { + "epoch": 1.081673332225956, + "grad_norm": 0.8081925511360168, + "learning_rate": 4.3635819188366655e-05, + "loss": 0.7711, + "step": 169310 + }, + { + "epoch": 1.0817372193756947, + "grad_norm": 0.949112594127655, + "learning_rate": 4.363084234744545e-05, + "loss": 1.0129, + "step": 169320 + }, + { + "epoch": 1.0818011065254334, + "grad_norm": 0.9872422218322754, + "learning_rate": 4.3625865570667174e-05, + "loss": 0.9821, + "step": 169330 + }, + { + "epoch": 1.081864993675172, + "grad_norm": 0.7962661385536194, + 
"learning_rate": 4.3620888858081945e-05, + "loss": 1.1761, + "step": 169340 + }, + { + "epoch": 1.0819288808249108, + "grad_norm": 0.5037175416946411, + "learning_rate": 4.361591220973988e-05, + "loss": 1.0152, + "step": 169350 + }, + { + "epoch": 1.0819927679746495, + "grad_norm": 1.953133225440979, + "learning_rate": 4.361093562569111e-05, + "loss": 0.7962, + "step": 169360 + }, + { + "epoch": 1.0820566551243882, + "grad_norm": 0.6017458438873291, + "learning_rate": 4.3605959105985746e-05, + "loss": 0.8448, + "step": 169370 + }, + { + "epoch": 1.082120542274127, + "grad_norm": 0.8771808743476868, + "learning_rate": 4.360098265067391e-05, + "loss": 0.8213, + "step": 169380 + }, + { + "epoch": 1.0821844294238656, + "grad_norm": 1.3374742269515991, + "learning_rate": 4.35960062598057e-05, + "loss": 0.7434, + "step": 169390 + }, + { + "epoch": 1.0822483165736043, + "grad_norm": 0.7994939088821411, + "learning_rate": 4.359102993343125e-05, + "loss": 0.8355, + "step": 169400 + }, + { + "epoch": 1.082312203723343, + "grad_norm": 0.8211526274681091, + "learning_rate": 4.358605367160067e-05, + "loss": 0.9097, + "step": 169410 + }, + { + "epoch": 1.0823760908730817, + "grad_norm": 0.9132190346717834, + "learning_rate": 4.3581077474364084e-05, + "loss": 0.8039, + "step": 169420 + }, + { + "epoch": 1.0824399780228204, + "grad_norm": 0.6380227208137512, + "learning_rate": 4.35761013417716e-05, + "loss": 0.8588, + "step": 169430 + }, + { + "epoch": 1.0825038651725591, + "grad_norm": 1.1924580335617065, + "learning_rate": 4.357112527387333e-05, + "loss": 1.1468, + "step": 169440 + }, + { + "epoch": 1.0825677523222978, + "grad_norm": 0.803373396396637, + "learning_rate": 4.3566149270719404e-05, + "loss": 0.7185, + "step": 169450 + }, + { + "epoch": 1.0826316394720366, + "grad_norm": 1.080508828163147, + "learning_rate": 4.356117333235992e-05, + "loss": 0.7215, + "step": 169460 + }, + { + "epoch": 1.0826955266217753, + "grad_norm": 0.8322065472602844, + "learning_rate": 
4.355619745884498e-05, + "loss": 0.8798, + "step": 169470 + }, + { + "epoch": 1.082759413771514, + "grad_norm": 1.1867259740829468, + "learning_rate": 4.355122165022471e-05, + "loss": 0.8808, + "step": 169480 + }, + { + "epoch": 1.0828233009212527, + "grad_norm": 1.0579557418823242, + "learning_rate": 4.354624590654922e-05, + "loss": 1.1633, + "step": 169490 + }, + { + "epoch": 1.0828871880709914, + "grad_norm": 0.992499828338623, + "learning_rate": 4.354127022786861e-05, + "loss": 0.8273, + "step": 169500 + }, + { + "epoch": 1.08295107522073, + "grad_norm": 1.0000821352005005, + "learning_rate": 4.3536294614233e-05, + "loss": 1.0181, + "step": 169510 + }, + { + "epoch": 1.0830149623704688, + "grad_norm": 1.0470143556594849, + "learning_rate": 4.3531319065692494e-05, + "loss": 1.181, + "step": 169520 + }, + { + "epoch": 1.0830788495202075, + "grad_norm": 0.9132643938064575, + "learning_rate": 4.3526343582297205e-05, + "loss": 0.8465, + "step": 169530 + }, + { + "epoch": 1.0831427366699462, + "grad_norm": 1.1474629640579224, + "learning_rate": 4.352136816409723e-05, + "loss": 0.9753, + "step": 169540 + }, + { + "epoch": 1.083206623819685, + "grad_norm": 1.3476228713989258, + "learning_rate": 4.351639281114269e-05, + "loss": 0.8403, + "step": 169550 + }, + { + "epoch": 1.0832705109694236, + "grad_norm": 0.955660879611969, + "learning_rate": 4.351141752348368e-05, + "loss": 0.7374, + "step": 169560 + }, + { + "epoch": 1.0833343981191623, + "grad_norm": 0.7566315531730652, + "learning_rate": 4.3506442301170305e-05, + "loss": 1.0007, + "step": 169570 + }, + { + "epoch": 1.083398285268901, + "grad_norm": 1.4220582246780396, + "learning_rate": 4.3501467144252686e-05, + "loss": 1.0204, + "step": 169580 + }, + { + "epoch": 1.0834621724186397, + "grad_norm": 1.1132736206054688, + "learning_rate": 4.3496492052780904e-05, + "loss": 0.8878, + "step": 169590 + }, + { + "epoch": 1.0835260595683784, + "grad_norm": 0.5826320052146912, + "learning_rate": 4.349151702680507e-05, + 
"loss": 1.0499, + "step": 169600 + }, + { + "epoch": 1.0835899467181171, + "grad_norm": 0.7217887043952942, + "learning_rate": 4.348654206637529e-05, + "loss": 0.784, + "step": 169610 + }, + { + "epoch": 1.0836538338678559, + "grad_norm": 1.1123169660568237, + "learning_rate": 4.348156717154167e-05, + "loss": 0.9233, + "step": 169620 + }, + { + "epoch": 1.0837177210175946, + "grad_norm": 0.9296445250511169, + "learning_rate": 4.347659234235431e-05, + "loss": 0.9011, + "step": 169630 + }, + { + "epoch": 1.0837816081673333, + "grad_norm": 0.9957021474838257, + "learning_rate": 4.347161757886331e-05, + "loss": 0.8949, + "step": 169640 + }, + { + "epoch": 1.083845495317072, + "grad_norm": 0.7610149383544922, + "learning_rate": 4.346664288111877e-05, + "loss": 0.9835, + "step": 169650 + }, + { + "epoch": 1.0839093824668107, + "grad_norm": 0.9562907218933105, + "learning_rate": 4.346166824917079e-05, + "loss": 0.7712, + "step": 169660 + }, + { + "epoch": 1.0839732696165494, + "grad_norm": 1.062423586845398, + "learning_rate": 4.345669368306946e-05, + "loss": 1.179, + "step": 169670 + }, + { + "epoch": 1.084037156766288, + "grad_norm": 1.3824454545974731, + "learning_rate": 4.3451719182864894e-05, + "loss": 1.1116, + "step": 169680 + }, + { + "epoch": 1.0841010439160268, + "grad_norm": 0.847102701663971, + "learning_rate": 4.344674474860717e-05, + "loss": 1.0358, + "step": 169690 + }, + { + "epoch": 1.0841649310657655, + "grad_norm": 2.5781049728393555, + "learning_rate": 4.34417703803464e-05, + "loss": 0.9208, + "step": 169700 + }, + { + "epoch": 1.0842288182155042, + "grad_norm": 0.65672367811203, + "learning_rate": 4.343679607813268e-05, + "loss": 0.748, + "step": 169710 + }, + { + "epoch": 1.084292705365243, + "grad_norm": 1.4862734079360962, + "learning_rate": 4.3431821842016104e-05, + "loss": 0.862, + "step": 169720 + }, + { + "epoch": 1.0843565925149816, + "grad_norm": 1.015182375907898, + "learning_rate": 4.342684767204675e-05, + "loss": 1.0431, + "step": 169730 + 
}, + { + "epoch": 1.0844204796647203, + "grad_norm": 0.8225411772727966, + "learning_rate": 4.342187356827474e-05, + "loss": 0.8876, + "step": 169740 + }, + { + "epoch": 1.084484366814459, + "grad_norm": 0.8683805465698242, + "learning_rate": 4.341689953075015e-05, + "loss": 0.8132, + "step": 169750 + }, + { + "epoch": 1.0845482539641977, + "grad_norm": 1.6863847970962524, + "learning_rate": 4.3411925559523096e-05, + "loss": 0.9326, + "step": 169760 + }, + { + "epoch": 1.0846121411139364, + "grad_norm": 1.3636550903320312, + "learning_rate": 4.340695165464362e-05, + "loss": 0.8445, + "step": 169770 + }, + { + "epoch": 1.084676028263675, + "grad_norm": 1.0071210861206055, + "learning_rate": 4.340197781616186e-05, + "loss": 0.8123, + "step": 169780 + }, + { + "epoch": 1.0847399154134139, + "grad_norm": 0.7153182625770569, + "learning_rate": 4.339700404412789e-05, + "loss": 0.9171, + "step": 169790 + }, + { + "epoch": 1.0848038025631523, + "grad_norm": 1.7383946180343628, + "learning_rate": 4.33920303385918e-05, + "loss": 0.9486, + "step": 169800 + }, + { + "epoch": 1.084867689712891, + "grad_norm": 0.558637797832489, + "learning_rate": 4.338705669960368e-05, + "loss": 0.7979, + "step": 169810 + }, + { + "epoch": 1.0849315768626298, + "grad_norm": 1.0765694379806519, + "learning_rate": 4.338208312721362e-05, + "loss": 0.8457, + "step": 169820 + }, + { + "epoch": 1.0849954640123685, + "grad_norm": 0.681546151638031, + "learning_rate": 4.337710962147171e-05, + "loss": 0.6883, + "step": 169830 + }, + { + "epoch": 1.0850593511621072, + "grad_norm": 1.1017876863479614, + "learning_rate": 4.3372136182428037e-05, + "loss": 0.9216, + "step": 169840 + }, + { + "epoch": 1.0851232383118459, + "grad_norm": 1.7058666944503784, + "learning_rate": 4.3367162810132685e-05, + "loss": 1.0559, + "step": 169850 + }, + { + "epoch": 1.0851871254615846, + "grad_norm": 1.3724285364151, + "learning_rate": 4.336218950463574e-05, + "loss": 0.6846, + "step": 169860 + }, + { + "epoch": 
1.0852510126113233, + "grad_norm": 0.880271315574646, + "learning_rate": 4.335721626598729e-05, + "loss": 0.9166, + "step": 169870 + }, + { + "epoch": 1.085314899761062, + "grad_norm": 1.2698674201965332, + "learning_rate": 4.335224309423742e-05, + "loss": 0.8341, + "step": 169880 + }, + { + "epoch": 1.0853787869108007, + "grad_norm": 1.0333696603775024, + "learning_rate": 4.3347269989436214e-05, + "loss": 1.3271, + "step": 169890 + }, + { + "epoch": 1.0854426740605394, + "grad_norm": 1.2140263319015503, + "learning_rate": 4.334229695163375e-05, + "loss": 0.8656, + "step": 169900 + }, + { + "epoch": 1.085506561210278, + "grad_norm": 0.5510571599006653, + "learning_rate": 4.3337323980880123e-05, + "loss": 1.0138, + "step": 169910 + }, + { + "epoch": 1.0855704483600168, + "grad_norm": 0.4985210597515106, + "learning_rate": 4.33323510772254e-05, + "loss": 0.7532, + "step": 169920 + }, + { + "epoch": 1.0856343355097555, + "grad_norm": 0.7603302597999573, + "learning_rate": 4.332737824071966e-05, + "loss": 0.8652, + "step": 169930 + }, + { + "epoch": 1.0856982226594942, + "grad_norm": 0.925507664680481, + "learning_rate": 4.3322405471413006e-05, + "loss": 0.817, + "step": 169940 + }, + { + "epoch": 1.085762109809233, + "grad_norm": 1.286191701889038, + "learning_rate": 4.3317432769355514e-05, + "loss": 0.905, + "step": 169950 + }, + { + "epoch": 1.0858259969589716, + "grad_norm": 0.7807539105415344, + "learning_rate": 4.331246013459724e-05, + "loss": 0.9113, + "step": 169960 + }, + { + "epoch": 1.0858898841087103, + "grad_norm": 0.8085950016975403, + "learning_rate": 4.3307487567188294e-05, + "loss": 0.7799, + "step": 169970 + }, + { + "epoch": 1.085953771258449, + "grad_norm": 0.7874223589897156, + "learning_rate": 4.330251506717873e-05, + "loss": 0.9971, + "step": 169980 + }, + { + "epoch": 1.0860176584081878, + "grad_norm": 0.9304120540618896, + "learning_rate": 4.329754263461863e-05, + "loss": 0.7827, + "step": 169990 + }, + { + "epoch": 1.0860815455579265, + 
"grad_norm": 0.9361356496810913, + "learning_rate": 4.329257026955808e-05, + "loss": 1.0771, + "step": 170000 + }, + { + "epoch": 1.0861454327076652, + "grad_norm": 1.004862666130066, + "learning_rate": 4.3287597972047144e-05, + "loss": 0.9004, + "step": 170010 + }, + { + "epoch": 1.0862093198574039, + "grad_norm": 1.2250120639801025, + "learning_rate": 4.328262574213591e-05, + "loss": 0.7129, + "step": 170020 + }, + { + "epoch": 1.0862732070071426, + "grad_norm": 0.5635613799095154, + "learning_rate": 4.3277653579874445e-05, + "loss": 0.8634, + "step": 170030 + }, + { + "epoch": 1.0863370941568813, + "grad_norm": 0.9488706588745117, + "learning_rate": 4.3272681485312824e-05, + "loss": 1.0111, + "step": 170040 + }, + { + "epoch": 1.08640098130662, + "grad_norm": 0.8792842626571655, + "learning_rate": 4.326770945850111e-05, + "loss": 0.6489, + "step": 170050 + }, + { + "epoch": 1.0864648684563587, + "grad_norm": 0.5924078226089478, + "learning_rate": 4.32627374994894e-05, + "loss": 0.7982, + "step": 170060 + }, + { + "epoch": 1.0865287556060974, + "grad_norm": 1.1501904726028442, + "learning_rate": 4.325776560832775e-05, + "loss": 0.8234, + "step": 170070 + }, + { + "epoch": 1.0865926427558361, + "grad_norm": 0.9579604268074036, + "learning_rate": 4.3252793785066234e-05, + "loss": 0.8457, + "step": 170080 + }, + { + "epoch": 1.0866565299055748, + "grad_norm": 0.599661648273468, + "learning_rate": 4.3247822029754915e-05, + "loss": 0.9024, + "step": 170090 + }, + { + "epoch": 1.0867204170553135, + "grad_norm": 0.861957848072052, + "learning_rate": 4.324285034244387e-05, + "loss": 0.8308, + "step": 170100 + }, + { + "epoch": 1.0867843042050522, + "grad_norm": 1.1481958627700806, + "learning_rate": 4.323787872318317e-05, + "loss": 0.8787, + "step": 170110 + }, + { + "epoch": 1.086848191354791, + "grad_norm": 0.8969278931617737, + "learning_rate": 4.323290717202289e-05, + "loss": 1.1709, + "step": 170120 + }, + { + "epoch": 1.0869120785045296, + "grad_norm": 
0.944645345211029, + "learning_rate": 4.322793568901308e-05, + "loss": 0.8074, + "step": 170130 + }, + { + "epoch": 1.0869759656542684, + "grad_norm": 0.8422662019729614, + "learning_rate": 4.32229642742038e-05, + "loss": 0.9204, + "step": 170140 + }, + { + "epoch": 1.087039852804007, + "grad_norm": 1.7301738262176514, + "learning_rate": 4.321799292764515e-05, + "loss": 0.7814, + "step": 170150 + }, + { + "epoch": 1.0871037399537458, + "grad_norm": 0.9981438517570496, + "learning_rate": 4.321302164938717e-05, + "loss": 0.8506, + "step": 170160 + }, + { + "epoch": 1.0871676271034845, + "grad_norm": 0.6587639451026917, + "learning_rate": 4.3208050439479955e-05, + "loss": 0.7711, + "step": 170170 + }, + { + "epoch": 1.0872315142532232, + "grad_norm": 0.6386655569076538, + "learning_rate": 4.320307929797353e-05, + "loss": 0.9447, + "step": 170180 + }, + { + "epoch": 1.0872954014029619, + "grad_norm": 0.953752875328064, + "learning_rate": 4.319810822491798e-05, + "loss": 0.5579, + "step": 170190 + }, + { + "epoch": 1.0873592885527006, + "grad_norm": 1.0442869663238525, + "learning_rate": 4.3193137220363366e-05, + "loss": 0.767, + "step": 170200 + }, + { + "epoch": 1.0874231757024393, + "grad_norm": 1.0589914321899414, + "learning_rate": 4.318816628435975e-05, + "loss": 0.632, + "step": 170210 + }, + { + "epoch": 1.087487062852178, + "grad_norm": 0.8957975506782532, + "learning_rate": 4.318319541695719e-05, + "loss": 1.2458, + "step": 170220 + }, + { + "epoch": 1.0875509500019167, + "grad_norm": 0.7432546615600586, + "learning_rate": 4.3178224618205755e-05, + "loss": 0.6911, + "step": 170230 + }, + { + "epoch": 1.0876148371516554, + "grad_norm": 0.9806839227676392, + "learning_rate": 4.3173253888155496e-05, + "loss": 0.9478, + "step": 170240 + }, + { + "epoch": 1.0876787243013941, + "grad_norm": 1.5259449481964111, + "learning_rate": 4.3168283226856496e-05, + "loss": 0.7437, + "step": 170250 + }, + { + "epoch": 1.0877426114511328, + "grad_norm": 0.9942728281021118, + 
"learning_rate": 4.316331263435876e-05, + "loss": 0.9433, + "step": 170260 + }, + { + "epoch": 1.0878064986008713, + "grad_norm": 1.1904678344726562, + "learning_rate": 4.31583421107124e-05, + "loss": 0.8075, + "step": 170270 + }, + { + "epoch": 1.0878703857506102, + "grad_norm": 1.0735795497894287, + "learning_rate": 4.315337165596745e-05, + "loss": 0.9411, + "step": 170280 + }, + { + "epoch": 1.0879342729003487, + "grad_norm": 1.5544825792312622, + "learning_rate": 4.3148401270173963e-05, + "loss": 0.8414, + "step": 170290 + }, + { + "epoch": 1.0879981600500874, + "grad_norm": 1.0982941389083862, + "learning_rate": 4.314343095338201e-05, + "loss": 1.1805, + "step": 170300 + }, + { + "epoch": 1.0880620471998261, + "grad_norm": 0.6447432041168213, + "learning_rate": 4.3138460705641645e-05, + "loss": 0.8576, + "step": 170310 + }, + { + "epoch": 1.0881259343495648, + "grad_norm": 0.844802737236023, + "learning_rate": 4.313349052700291e-05, + "loss": 0.8521, + "step": 170320 + }, + { + "epoch": 1.0881898214993035, + "grad_norm": 1.66594660282135, + "learning_rate": 4.312852041751586e-05, + "loss": 1.0375, + "step": 170330 + }, + { + "epoch": 1.0882537086490423, + "grad_norm": 2.11464786529541, + "learning_rate": 4.312355037723056e-05, + "loss": 0.9433, + "step": 170340 + }, + { + "epoch": 1.088317595798781, + "grad_norm": 0.6536024808883667, + "learning_rate": 4.311858040619706e-05, + "loss": 0.8515, + "step": 170350 + }, + { + "epoch": 1.0883814829485197, + "grad_norm": 0.7490361332893372, + "learning_rate": 4.31136105044654e-05, + "loss": 0.9418, + "step": 170360 + }, + { + "epoch": 1.0884453700982584, + "grad_norm": 0.8330373167991638, + "learning_rate": 4.310864067208564e-05, + "loss": 0.7771, + "step": 170370 + }, + { + "epoch": 1.088509257247997, + "grad_norm": 0.6857531666755676, + "learning_rate": 4.310367090910784e-05, + "loss": 0.9539, + "step": 170380 + }, + { + "epoch": 1.0885731443977358, + "grad_norm": 0.7235297560691833, + "learning_rate": 
4.309870121558204e-05, + "loss": 0.5731, + "step": 170390 + }, + { + "epoch": 1.0886370315474745, + "grad_norm": 0.9833201766014099, + "learning_rate": 4.3093731591558285e-05, + "loss": 0.8506, + "step": 170400 + }, + { + "epoch": 1.0887009186972132, + "grad_norm": 0.8931936025619507, + "learning_rate": 4.308876203708662e-05, + "loss": 0.7127, + "step": 170410 + }, + { + "epoch": 1.088764805846952, + "grad_norm": 0.9253937602043152, + "learning_rate": 4.308379255221711e-05, + "loss": 0.7261, + "step": 170420 + }, + { + "epoch": 1.0888286929966906, + "grad_norm": 1.6246715784072876, + "learning_rate": 4.307882313699979e-05, + "loss": 0.8979, + "step": 170430 + }, + { + "epoch": 1.0888925801464293, + "grad_norm": 0.6399512887001038, + "learning_rate": 4.307385379148471e-05, + "loss": 0.7505, + "step": 170440 + }, + { + "epoch": 1.088956467296168, + "grad_norm": 1.1002649068832397, + "learning_rate": 4.3068884515721905e-05, + "loss": 0.7275, + "step": 170450 + }, + { + "epoch": 1.0890203544459067, + "grad_norm": 0.920734167098999, + "learning_rate": 4.306391530976143e-05, + "loss": 1.1204, + "step": 170460 + }, + { + "epoch": 1.0890842415956454, + "grad_norm": 0.6019224524497986, + "learning_rate": 4.3058946173653336e-05, + "loss": 0.8799, + "step": 170470 + }, + { + "epoch": 1.0891481287453841, + "grad_norm": 0.9627069234848022, + "learning_rate": 4.3053977107447656e-05, + "loss": 0.8298, + "step": 170480 + }, + { + "epoch": 1.0892120158951228, + "grad_norm": 0.9707046747207642, + "learning_rate": 4.3049008111194436e-05, + "loss": 0.9176, + "step": 170490 + }, + { + "epoch": 1.0892759030448615, + "grad_norm": 1.699053406715393, + "learning_rate": 4.3044039184943725e-05, + "loss": 0.9308, + "step": 170500 + }, + { + "epoch": 1.0893397901946003, + "grad_norm": 2.0104095935821533, + "learning_rate": 4.303907032874555e-05, + "loss": 0.8551, + "step": 170510 + }, + { + "epoch": 1.089403677344339, + "grad_norm": 1.3910030126571655, + "learning_rate": 4.303410154264996e-05, 
+ "loss": 0.7657, + "step": 170520 + }, + { + "epoch": 1.0894675644940777, + "grad_norm": 1.23843252658844, + "learning_rate": 4.3029132826706994e-05, + "loss": 1.1489, + "step": 170530 + }, + { + "epoch": 1.0895314516438164, + "grad_norm": 1.2719831466674805, + "learning_rate": 4.30241641809667e-05, + "loss": 0.8545, + "step": 170540 + }, + { + "epoch": 1.089595338793555, + "grad_norm": 0.8064283728599548, + "learning_rate": 4.301919560547909e-05, + "loss": 1.2119, + "step": 170550 + }, + { + "epoch": 1.0896592259432938, + "grad_norm": 0.8944299221038818, + "learning_rate": 4.301422710029423e-05, + "loss": 0.8799, + "step": 170560 + }, + { + "epoch": 1.0897231130930325, + "grad_norm": 1.0481112003326416, + "learning_rate": 4.300925866546215e-05, + "loss": 0.8182, + "step": 170570 + }, + { + "epoch": 1.0897870002427712, + "grad_norm": 0.682194709777832, + "learning_rate": 4.300429030103288e-05, + "loss": 0.834, + "step": 170580 + }, + { + "epoch": 1.08985088739251, + "grad_norm": 0.7931283712387085, + "learning_rate": 4.2999322007056456e-05, + "loss": 0.9273, + "step": 170590 + }, + { + "epoch": 1.0899147745422486, + "grad_norm": 0.880792498588562, + "learning_rate": 4.2994353783582916e-05, + "loss": 0.7912, + "step": 170600 + }, + { + "epoch": 1.0899786616919873, + "grad_norm": 0.9642816185951233, + "learning_rate": 4.2989385630662295e-05, + "loss": 0.8244, + "step": 170610 + }, + { + "epoch": 1.090042548841726, + "grad_norm": 1.038432240486145, + "learning_rate": 4.298441754834462e-05, + "loss": 0.7977, + "step": 170620 + }, + { + "epoch": 1.0901064359914647, + "grad_norm": 0.7804505228996277, + "learning_rate": 4.297944953667994e-05, + "loss": 0.821, + "step": 170630 + }, + { + "epoch": 1.0901703231412034, + "grad_norm": 0.8476502299308777, + "learning_rate": 4.297448159571827e-05, + "loss": 0.9512, + "step": 170640 + }, + { + "epoch": 1.0902342102909421, + "grad_norm": 0.8416975140571594, + "learning_rate": 4.296951372550965e-05, + "loss": 0.9175, + "step": 
170650 + }, + { + "epoch": 1.0902980974406808, + "grad_norm": 1.0005344152450562, + "learning_rate": 4.296454592610412e-05, + "loss": 0.9392, + "step": 170660 + }, + { + "epoch": 1.0903619845904196, + "grad_norm": 0.8752073049545288, + "learning_rate": 4.295957819755167e-05, + "loss": 0.952, + "step": 170670 + }, + { + "epoch": 1.0904258717401583, + "grad_norm": 1.113079309463501, + "learning_rate": 4.2954610539902376e-05, + "loss": 0.8772, + "step": 170680 + }, + { + "epoch": 1.090489758889897, + "grad_norm": 0.7961655259132385, + "learning_rate": 4.294964295320625e-05, + "loss": 0.8065, + "step": 170690 + }, + { + "epoch": 1.0905536460396357, + "grad_norm": 0.9878672957420349, + "learning_rate": 4.294467543751332e-05, + "loss": 0.6863, + "step": 170700 + }, + { + "epoch": 1.0906175331893744, + "grad_norm": 1.0683012008666992, + "learning_rate": 4.2939707992873614e-05, + "loss": 0.8474, + "step": 170710 + }, + { + "epoch": 1.090681420339113, + "grad_norm": 1.4252039194107056, + "learning_rate": 4.293474061933715e-05, + "loss": 0.9683, + "step": 170720 + }, + { + "epoch": 1.0907453074888518, + "grad_norm": 0.6621407866477966, + "learning_rate": 4.2929773316953986e-05, + "loss": 0.9312, + "step": 170730 + }, + { + "epoch": 1.0908091946385905, + "grad_norm": 0.9435315132141113, + "learning_rate": 4.29248060857741e-05, + "loss": 0.6707, + "step": 170740 + }, + { + "epoch": 1.0908730817883292, + "grad_norm": 0.7621443867683411, + "learning_rate": 4.291983892584754e-05, + "loss": 0.7015, + "step": 170750 + }, + { + "epoch": 1.0909369689380677, + "grad_norm": 0.7548527717590332, + "learning_rate": 4.2914871837224325e-05, + "loss": 0.8002, + "step": 170760 + }, + { + "epoch": 1.0910008560878066, + "grad_norm": 1.0962146520614624, + "learning_rate": 4.2909904819954474e-05, + "loss": 0.9549, + "step": 170770 + }, + { + "epoch": 1.091064743237545, + "grad_norm": 1.036853313446045, + "learning_rate": 4.290493787408801e-05, + "loss": 0.804, + "step": 170780 + }, + { + "epoch": 
1.0911286303872838, + "grad_norm": 0.9214669466018677, + "learning_rate": 4.289997099967497e-05, + "loss": 0.9074, + "step": 170790 + }, + { + "epoch": 1.0911925175370225, + "grad_norm": 1.0445566177368164, + "learning_rate": 4.289500419676537e-05, + "loss": 1.0604, + "step": 170800 + }, + { + "epoch": 1.0912564046867612, + "grad_norm": 1.1054303646087646, + "learning_rate": 4.289053413532349e-05, + "loss": 0.9242, + "step": 170810 + }, + { + "epoch": 1.0913202918365, + "grad_norm": 0.9858868718147278, + "learning_rate": 4.288556746840821e-05, + "loss": 0.9317, + "step": 170820 + }, + { + "epoch": 1.0913841789862386, + "grad_norm": 1.280604362487793, + "learning_rate": 4.288060087314143e-05, + "loss": 0.8943, + "step": 170830 + }, + { + "epoch": 1.0914480661359773, + "grad_norm": 0.986193060874939, + "learning_rate": 4.287563434957315e-05, + "loss": 0.9631, + "step": 170840 + }, + { + "epoch": 1.091511953285716, + "grad_norm": 0.850288987159729, + "learning_rate": 4.28706678977534e-05, + "loss": 0.8876, + "step": 170850 + }, + { + "epoch": 1.0915758404354547, + "grad_norm": 0.8429303169250488, + "learning_rate": 4.2865701517732194e-05, + "loss": 0.9627, + "step": 170860 + }, + { + "epoch": 1.0916397275851935, + "grad_norm": 1.297688364982605, + "learning_rate": 4.286073520955954e-05, + "loss": 1.0169, + "step": 170870 + }, + { + "epoch": 1.0917036147349322, + "grad_norm": 1.3055537939071655, + "learning_rate": 4.285576897328548e-05, + "loss": 0.9055, + "step": 170880 + }, + { + "epoch": 1.0917675018846709, + "grad_norm": 0.5656216740608215, + "learning_rate": 4.285080280895999e-05, + "loss": 0.8723, + "step": 170890 + }, + { + "epoch": 1.0918313890344096, + "grad_norm": 1.0163016319274902, + "learning_rate": 4.284583671663309e-05, + "loss": 0.7025, + "step": 170900 + }, + { + "epoch": 1.0918952761841483, + "grad_norm": 1.388039469718933, + "learning_rate": 4.2840870696354815e-05, + "loss": 0.9179, + "step": 170910 + }, + { + "epoch": 1.091959163333887, + 
"grad_norm": 0.740880012512207, + "learning_rate": 4.283590474817516e-05, + "loss": 0.8821, + "step": 170920 + }, + { + "epoch": 1.0920230504836257, + "grad_norm": 1.1587016582489014, + "learning_rate": 4.283093887214414e-05, + "loss": 0.8497, + "step": 170930 + }, + { + "epoch": 1.0920869376333644, + "grad_norm": 0.5431670546531677, + "learning_rate": 4.282597306831178e-05, + "loss": 0.7304, + "step": 170940 + }, + { + "epoch": 1.092150824783103, + "grad_norm": 0.808057427406311, + "learning_rate": 4.282100733672807e-05, + "loss": 0.751, + "step": 170950 + }, + { + "epoch": 1.0922147119328418, + "grad_norm": 1.3623803853988647, + "learning_rate": 4.281604167744303e-05, + "loss": 0.9272, + "step": 170960 + }, + { + "epoch": 1.0922785990825805, + "grad_norm": 0.8062224388122559, + "learning_rate": 4.2811076090506665e-05, + "loss": 1.1279, + "step": 170970 + }, + { + "epoch": 1.0923424862323192, + "grad_norm": 0.6659354567527771, + "learning_rate": 4.280611057596898e-05, + "loss": 0.8028, + "step": 170980 + }, + { + "epoch": 1.092406373382058, + "grad_norm": 0.9792410731315613, + "learning_rate": 4.2801145133879984e-05, + "loss": 0.8559, + "step": 170990 + }, + { + "epoch": 1.0924702605317966, + "grad_norm": 1.0196290016174316, + "learning_rate": 4.2796179764289685e-05, + "loss": 0.7903, + "step": 171000 + }, + { + "epoch": 1.0925341476815353, + "grad_norm": 0.648735761642456, + "learning_rate": 4.279121446724809e-05, + "loss": 0.6385, + "step": 171010 + }, + { + "epoch": 1.092598034831274, + "grad_norm": 0.98899906873703, + "learning_rate": 4.27862492428052e-05, + "loss": 0.9158, + "step": 171020 + }, + { + "epoch": 1.0926619219810128, + "grad_norm": 1.0701076984405518, + "learning_rate": 4.278128409101102e-05, + "loss": 0.8305, + "step": 171030 + }, + { + "epoch": 1.0927258091307515, + "grad_norm": 0.6789625287055969, + "learning_rate": 4.277631901191556e-05, + "loss": 0.794, + "step": 171040 + }, + { + "epoch": 1.0927896962804902, + "grad_norm": 
1.3493213653564453, + "learning_rate": 4.277135400556881e-05, + "loss": 0.9299, + "step": 171050 + }, + { + "epoch": 1.0928535834302289, + "grad_norm": 0.4776621460914612, + "learning_rate": 4.2766389072020787e-05, + "loss": 0.7761, + "step": 171060 + }, + { + "epoch": 1.0929174705799676, + "grad_norm": 1.0845409631729126, + "learning_rate": 4.276142421132148e-05, + "loss": 0.8007, + "step": 171070 + }, + { + "epoch": 1.0929813577297063, + "grad_norm": 1.085318684577942, + "learning_rate": 4.275645942352089e-05, + "loss": 0.8909, + "step": 171080 + }, + { + "epoch": 1.093045244879445, + "grad_norm": 0.7613808512687683, + "learning_rate": 4.275149470866902e-05, + "loss": 0.9719, + "step": 171090 + }, + { + "epoch": 1.0931091320291837, + "grad_norm": 1.1465986967086792, + "learning_rate": 4.274653006681587e-05, + "loss": 0.9457, + "step": 171100 + }, + { + "epoch": 1.0931730191789224, + "grad_norm": 1.1371395587921143, + "learning_rate": 4.274156549801143e-05, + "loss": 0.8983, + "step": 171110 + }, + { + "epoch": 1.093236906328661, + "grad_norm": 0.8858291506767273, + "learning_rate": 4.2736601002305715e-05, + "loss": 0.6213, + "step": 171120 + }, + { + "epoch": 1.0933007934783998, + "grad_norm": 1.0316749811172485, + "learning_rate": 4.2731636579748714e-05, + "loss": 0.835, + "step": 171130 + }, + { + "epoch": 1.0933646806281385, + "grad_norm": 1.0872924327850342, + "learning_rate": 4.2726672230390416e-05, + "loss": 0.9478, + "step": 171140 + }, + { + "epoch": 1.0934285677778772, + "grad_norm": 1.0157170295715332, + "learning_rate": 4.2721707954280824e-05, + "loss": 0.9286, + "step": 171150 + }, + { + "epoch": 1.093492454927616, + "grad_norm": 1.3078545331954956, + "learning_rate": 4.2716743751469926e-05, + "loss": 0.8732, + "step": 171160 + }, + { + "epoch": 1.0935563420773546, + "grad_norm": 0.5241159796714783, + "learning_rate": 4.271177962200772e-05, + "loss": 0.8715, + "step": 171170 + }, + { + "epoch": 1.0936202292270933, + "grad_norm": 1.0080080032348633, + 
"learning_rate": 4.27068155659442e-05, + "loss": 0.9691, + "step": 171180 + }, + { + "epoch": 1.093684116376832, + "grad_norm": 0.7973745465278625, + "learning_rate": 4.2701851583329356e-05, + "loss": 0.7358, + "step": 171190 + }, + { + "epoch": 1.0937480035265708, + "grad_norm": 0.9834219813346863, + "learning_rate": 4.269688767421318e-05, + "loss": 0.8833, + "step": 171200 + }, + { + "epoch": 1.0938118906763095, + "grad_norm": 1.2644120454788208, + "learning_rate": 4.269192383864567e-05, + "loss": 1.0025, + "step": 171210 + }, + { + "epoch": 1.0938757778260482, + "grad_norm": 4.959319591522217, + "learning_rate": 4.2686960076676794e-05, + "loss": 0.8961, + "step": 171220 + }, + { + "epoch": 1.0939396649757869, + "grad_norm": 0.6554162502288818, + "learning_rate": 4.268199638835657e-05, + "loss": 0.8272, + "step": 171230 + }, + { + "epoch": 1.0940035521255256, + "grad_norm": 1.006784439086914, + "learning_rate": 4.267703277373497e-05, + "loss": 1.1133, + "step": 171240 + }, + { + "epoch": 1.094067439275264, + "grad_norm": 1.1141198873519897, + "learning_rate": 4.2672069232861984e-05, + "loss": 1.0126, + "step": 171250 + }, + { + "epoch": 1.094131326425003, + "grad_norm": 1.0400590896606445, + "learning_rate": 4.2667105765787604e-05, + "loss": 0.772, + "step": 171260 + }, + { + "epoch": 1.0941952135747415, + "grad_norm": 1.063521146774292, + "learning_rate": 4.266214237256181e-05, + "loss": 0.9628, + "step": 171270 + }, + { + "epoch": 1.0942591007244802, + "grad_norm": 0.8146291971206665, + "learning_rate": 4.265717905323459e-05, + "loss": 0.687, + "step": 171280 + }, + { + "epoch": 1.0943229878742189, + "grad_norm": 0.9968299269676208, + "learning_rate": 4.2652215807855924e-05, + "loss": 0.846, + "step": 171290 + }, + { + "epoch": 1.0943868750239576, + "grad_norm": 1.3008157014846802, + "learning_rate": 4.264725263647581e-05, + "loss": 0.7657, + "step": 171300 + }, + { + "epoch": 1.0944507621736963, + "grad_norm": 1.059985876083374, + "learning_rate": 
4.264228953914421e-05, + "loss": 1.1265, + "step": 171310 + }, + { + "epoch": 1.094514649323435, + "grad_norm": 2.1042439937591553, + "learning_rate": 4.263732651591113e-05, + "loss": 0.7885, + "step": 171320 + }, + { + "epoch": 1.0945785364731737, + "grad_norm": 1.5627658367156982, + "learning_rate": 4.263236356682654e-05, + "loss": 0.9555, + "step": 171330 + }, + { + "epoch": 1.0946424236229124, + "grad_norm": 1.1831436157226562, + "learning_rate": 4.262740069194042e-05, + "loss": 0.7085, + "step": 171340 + }, + { + "epoch": 1.0947063107726511, + "grad_norm": 0.998292863368988, + "learning_rate": 4.262243789130276e-05, + "loss": 0.8089, + "step": 171350 + }, + { + "epoch": 1.0947701979223898, + "grad_norm": 0.8351050615310669, + "learning_rate": 4.261747516496353e-05, + "loss": 0.7589, + "step": 171360 + }, + { + "epoch": 1.0948340850721285, + "grad_norm": 0.9817091226577759, + "learning_rate": 4.261251251297273e-05, + "loss": 1.1004, + "step": 171370 + }, + { + "epoch": 1.0948979722218672, + "grad_norm": 0.8968245387077332, + "learning_rate": 4.260754993538031e-05, + "loss": 0.7585, + "step": 171380 + }, + { + "epoch": 1.094961859371606, + "grad_norm": 0.994646430015564, + "learning_rate": 4.260258743223625e-05, + "loss": 1.0548, + "step": 171390 + }, + { + "epoch": 1.0950257465213447, + "grad_norm": 0.6734488010406494, + "learning_rate": 4.2597625003590534e-05, + "loss": 1.1092, + "step": 171400 + }, + { + "epoch": 1.0950896336710834, + "grad_norm": 1.889375925064087, + "learning_rate": 4.259266264949314e-05, + "loss": 0.7207, + "step": 171410 + }, + { + "epoch": 1.095153520820822, + "grad_norm": 1.042815923690796, + "learning_rate": 4.258770036999404e-05, + "loss": 0.7736, + "step": 171420 + }, + { + "epoch": 1.0952174079705608, + "grad_norm": 0.9770397543907166, + "learning_rate": 4.25827381651432e-05, + "loss": 0.841, + "step": 171430 + }, + { + "epoch": 1.0952812951202995, + "grad_norm": 0.7401404976844788, + "learning_rate": 4.2577776034990604e-05, + 
"loss": 0.642, + "step": 171440 + }, + { + "epoch": 1.0953451822700382, + "grad_norm": 0.7844344973564148, + "learning_rate": 4.257281397958624e-05, + "loss": 0.8013, + "step": 171450 + }, + { + "epoch": 1.095409069419777, + "grad_norm": 0.5132720470428467, + "learning_rate": 4.2567851998980055e-05, + "loss": 0.9082, + "step": 171460 + }, + { + "epoch": 1.0954729565695156, + "grad_norm": 0.7843263745307922, + "learning_rate": 4.256289009322204e-05, + "loss": 0.5077, + "step": 171470 + }, + { + "epoch": 1.0955368437192543, + "grad_norm": 1.0947779417037964, + "learning_rate": 4.2557928262362145e-05, + "loss": 0.9435, + "step": 171480 + }, + { + "epoch": 1.095600730868993, + "grad_norm": 0.9761757850646973, + "learning_rate": 4.2552966506450355e-05, + "loss": 0.6885, + "step": 171490 + }, + { + "epoch": 1.0956646180187317, + "grad_norm": 1.1741174459457397, + "learning_rate": 4.254800482553664e-05, + "loss": 0.7751, + "step": 171500 + }, + { + "epoch": 1.0957285051684704, + "grad_norm": 0.848971962928772, + "learning_rate": 4.2543043219670954e-05, + "loss": 0.8029, + "step": 171510 + }, + { + "epoch": 1.0957923923182091, + "grad_norm": 1.3921595811843872, + "learning_rate": 4.253808168890327e-05, + "loss": 0.9036, + "step": 171520 + }, + { + "epoch": 1.0958562794679478, + "grad_norm": 1.0362133979797363, + "learning_rate": 4.2533120233283576e-05, + "loss": 1.1471, + "step": 171530 + }, + { + "epoch": 1.0959201666176865, + "grad_norm": 0.9512225985527039, + "learning_rate": 4.252815885286181e-05, + "loss": 0.7721, + "step": 171540 + }, + { + "epoch": 1.0959840537674252, + "grad_norm": 1.5971604585647583, + "learning_rate": 4.252319754768795e-05, + "loss": 0.7549, + "step": 171550 + }, + { + "epoch": 1.096047940917164, + "grad_norm": 1.265650749206543, + "learning_rate": 4.251823631781195e-05, + "loss": 0.8241, + "step": 171560 + }, + { + "epoch": 1.0961118280669027, + "grad_norm": 1.4117658138275146, + "learning_rate": 4.25132751632838e-05, + "loss": 1.1055, + "step": 
171570 + }, + { + "epoch": 1.0961757152166414, + "grad_norm": 0.8779417276382446, + "learning_rate": 4.2508314084153434e-05, + "loss": 1.0223, + "step": 171580 + }, + { + "epoch": 1.09623960236638, + "grad_norm": 1.1261694431304932, + "learning_rate": 4.250335308047083e-05, + "loss": 1.1164, + "step": 171590 + }, + { + "epoch": 1.0963034895161188, + "grad_norm": 0.9011442065238953, + "learning_rate": 4.249839215228595e-05, + "loss": 0.7676, + "step": 171600 + }, + { + "epoch": 1.0963673766658575, + "grad_norm": 0.8143396377563477, + "learning_rate": 4.249343129964875e-05, + "loss": 0.9507, + "step": 171610 + }, + { + "epoch": 1.0964312638155962, + "grad_norm": 1.3709062337875366, + "learning_rate": 4.248847052260919e-05, + "loss": 0.9756, + "step": 171620 + }, + { + "epoch": 1.096495150965335, + "grad_norm": 1.1121571063995361, + "learning_rate": 4.248350982121722e-05, + "loss": 0.942, + "step": 171630 + }, + { + "epoch": 1.0965590381150736, + "grad_norm": 0.9342554807662964, + "learning_rate": 4.247854919552281e-05, + "loss": 0.8853, + "step": 171640 + }, + { + "epoch": 1.0966229252648123, + "grad_norm": 0.7645041942596436, + "learning_rate": 4.2473588645575925e-05, + "loss": 0.9555, + "step": 171650 + }, + { + "epoch": 1.096686812414551, + "grad_norm": 0.7210363149642944, + "learning_rate": 4.246862817142651e-05, + "loss": 0.9715, + "step": 171660 + }, + { + "epoch": 1.0967506995642897, + "grad_norm": 0.8885617256164551, + "learning_rate": 4.2463667773124526e-05, + "loss": 0.8339, + "step": 171670 + }, + { + "epoch": 1.0968145867140284, + "grad_norm": 1.1784389019012451, + "learning_rate": 4.245870745071993e-05, + "loss": 1.0082, + "step": 171680 + }, + { + "epoch": 1.0968784738637671, + "grad_norm": 0.9615663290023804, + "learning_rate": 4.245374720426267e-05, + "loss": 1.1509, + "step": 171690 + }, + { + "epoch": 1.0969423610135058, + "grad_norm": 1.2281625270843506, + "learning_rate": 4.244878703380271e-05, + "loss": 0.9602, + "step": 171700 + }, + { + 
"epoch": 1.0970062481632445, + "grad_norm": 0.9366394281387329, + "learning_rate": 4.244382693939e-05, + "loss": 0.9398, + "step": 171710 + }, + { + "epoch": 1.097070135312983, + "grad_norm": 0.8655837774276733, + "learning_rate": 4.243886692107448e-05, + "loss": 0.7933, + "step": 171720 + }, + { + "epoch": 1.097134022462722, + "grad_norm": 0.7708938717842102, + "learning_rate": 4.2433906978906114e-05, + "loss": 0.9384, + "step": 171730 + }, + { + "epoch": 1.0971979096124604, + "grad_norm": 1.2257397174835205, + "learning_rate": 4.2428947112934854e-05, + "loss": 1.0811, + "step": 171740 + }, + { + "epoch": 1.0972617967621991, + "grad_norm": 1.1159895658493042, + "learning_rate": 4.2423987323210646e-05, + "loss": 0.8552, + "step": 171750 + }, + { + "epoch": 1.0973256839119379, + "grad_norm": 0.725980818271637, + "learning_rate": 4.241902760978343e-05, + "loss": 0.9862, + "step": 171760 + }, + { + "epoch": 1.0973895710616766, + "grad_norm": 1.0240581035614014, + "learning_rate": 4.241406797270318e-05, + "loss": 0.7955, + "step": 171770 + }, + { + "epoch": 1.0974534582114153, + "grad_norm": 0.8556148409843445, + "learning_rate": 4.240910841201982e-05, + "loss": 1.0128, + "step": 171780 + }, + { + "epoch": 1.097517345361154, + "grad_norm": 0.7460873126983643, + "learning_rate": 4.24041489277833e-05, + "loss": 0.8961, + "step": 171790 + }, + { + "epoch": 1.0975812325108927, + "grad_norm": 0.9523458480834961, + "learning_rate": 4.239918952004358e-05, + "loss": 0.9272, + "step": 171800 + }, + { + "epoch": 1.0976451196606314, + "grad_norm": 0.7124657034873962, + "learning_rate": 4.23942301888506e-05, + "loss": 0.7402, + "step": 171810 + }, + { + "epoch": 1.09770900681037, + "grad_norm": 0.6065206527709961, + "learning_rate": 4.23892709342543e-05, + "loss": 0.8897, + "step": 171820 + }, + { + "epoch": 1.0977728939601088, + "grad_norm": 1.2093256711959839, + "learning_rate": 4.2384311756304616e-05, + "loss": 0.6684, + "step": 171830 + }, + { + "epoch": 1.0978367811098475, + 
"grad_norm": 0.9949159026145935, + "learning_rate": 4.23793526550515e-05, + "loss": 0.8771, + "step": 171840 + }, + { + "epoch": 1.0979006682595862, + "grad_norm": 0.8249183893203735, + "learning_rate": 4.2374393630544925e-05, + "loss": 1.0064, + "step": 171850 + }, + { + "epoch": 1.097964555409325, + "grad_norm": 1.299425482749939, + "learning_rate": 4.236943468283478e-05, + "loss": 0.9749, + "step": 171860 + }, + { + "epoch": 1.0980284425590636, + "grad_norm": 0.5668662190437317, + "learning_rate": 4.236447581197103e-05, + "loss": 0.7784, + "step": 171870 + }, + { + "epoch": 1.0980923297088023, + "grad_norm": 0.7093889117240906, + "learning_rate": 4.235951701800362e-05, + "loss": 1.1923, + "step": 171880 + }, + { + "epoch": 1.098156216858541, + "grad_norm": 0.6421130299568176, + "learning_rate": 4.235455830098248e-05, + "loss": 0.5921, + "step": 171890 + }, + { + "epoch": 1.0982201040082797, + "grad_norm": 1.3373634815216064, + "learning_rate": 4.2349599660957545e-05, + "loss": 1.1682, + "step": 171900 + }, + { + "epoch": 1.0982839911580184, + "grad_norm": 0.7919142246246338, + "learning_rate": 4.234464109797877e-05, + "loss": 0.7321, + "step": 171910 + }, + { + "epoch": 1.0983478783077572, + "grad_norm": 1.1525673866271973, + "learning_rate": 4.2339682612096075e-05, + "loss": 0.9742, + "step": 171920 + }, + { + "epoch": 1.0984117654574959, + "grad_norm": 0.77737957239151, + "learning_rate": 4.23347242033594e-05, + "loss": 0.8141, + "step": 171930 + }, + { + "epoch": 1.0984756526072346, + "grad_norm": 2.1227810382843018, + "learning_rate": 4.23297658718187e-05, + "loss": 0.8464, + "step": 171940 + }, + { + "epoch": 1.0985395397569733, + "grad_norm": 0.8301504850387573, + "learning_rate": 4.2324807617523865e-05, + "loss": 0.7757, + "step": 171950 + }, + { + "epoch": 1.098603426906712, + "grad_norm": 0.7667549252510071, + "learning_rate": 4.2319849440524877e-05, + "loss": 0.8712, + "step": 171960 + }, + { + "epoch": 1.0986673140564507, + "grad_norm": 
1.3217402696609497, + "learning_rate": 4.231489134087165e-05, + "loss": 0.9119, + "step": 171970 + }, + { + "epoch": 1.0987312012061894, + "grad_norm": 0.9311858415603638, + "learning_rate": 4.2309933318614116e-05, + "loss": 0.7721, + "step": 171980 + }, + { + "epoch": 1.098795088355928, + "grad_norm": 0.8711034059524536, + "learning_rate": 4.23049753738022e-05, + "loss": 0.7246, + "step": 171990 + }, + { + "epoch": 1.0988589755056668, + "grad_norm": 1.1722370386123657, + "learning_rate": 4.230001750648584e-05, + "loss": 1.0147, + "step": 172000 + }, + { + "epoch": 1.0989228626554055, + "grad_norm": 0.976512610912323, + "learning_rate": 4.2295059716714965e-05, + "loss": 0.9354, + "step": 172010 + }, + { + "epoch": 1.0989867498051442, + "grad_norm": 1.198662281036377, + "learning_rate": 4.229010200453951e-05, + "loss": 0.8933, + "step": 172020 + }, + { + "epoch": 1.099050636954883, + "grad_norm": 1.7006891965866089, + "learning_rate": 4.22851443700094e-05, + "loss": 1.0453, + "step": 172030 + }, + { + "epoch": 1.0991145241046216, + "grad_norm": 0.9124136567115784, + "learning_rate": 4.228018681317456e-05, + "loss": 0.6503, + "step": 172040 + }, + { + "epoch": 1.0991784112543603, + "grad_norm": 0.9206166863441467, + "learning_rate": 4.227522933408491e-05, + "loss": 0.8653, + "step": 172050 + }, + { + "epoch": 1.099242298404099, + "grad_norm": 0.5899572968482971, + "learning_rate": 4.2270271932790386e-05, + "loss": 0.794, + "step": 172060 + }, + { + "epoch": 1.0993061855538377, + "grad_norm": 1.1387134790420532, + "learning_rate": 4.2265314609340915e-05, + "loss": 0.7775, + "step": 172070 + }, + { + "epoch": 1.0993700727035765, + "grad_norm": 1.3362581729888916, + "learning_rate": 4.226035736378641e-05, + "loss": 0.7042, + "step": 172080 + }, + { + "epoch": 1.0994339598533152, + "grad_norm": 0.7027170658111572, + "learning_rate": 4.225540019617681e-05, + "loss": 0.7286, + "step": 172090 + }, + { + "epoch": 1.0994978470030539, + "grad_norm": 0.8913945555686951, + 
"learning_rate": 4.225044310656202e-05, + "loss": 1.0044, + "step": 172100 + }, + { + "epoch": 1.0995617341527926, + "grad_norm": 1.0845966339111328, + "learning_rate": 4.224548609499198e-05, + "loss": 0.8733, + "step": 172110 + }, + { + "epoch": 1.0996256213025313, + "grad_norm": 0.90619957447052, + "learning_rate": 4.22405291615166e-05, + "loss": 0.7557, + "step": 172120 + }, + { + "epoch": 1.09968950845227, + "grad_norm": 1.1762818098068237, + "learning_rate": 4.2235572306185805e-05, + "loss": 1.0823, + "step": 172130 + }, + { + "epoch": 1.0997533956020087, + "grad_norm": 0.9015527367591858, + "learning_rate": 4.223061552904952e-05, + "loss": 0.8001, + "step": 172140 + }, + { + "epoch": 1.0998172827517474, + "grad_norm": 1.2849770784378052, + "learning_rate": 4.222565883015765e-05, + "loss": 0.9836, + "step": 172150 + }, + { + "epoch": 1.099881169901486, + "grad_norm": 0.5528566837310791, + "learning_rate": 4.222070220956012e-05, + "loss": 0.7608, + "step": 172160 + }, + { + "epoch": 1.0999450570512248, + "grad_norm": 1.3537547588348389, + "learning_rate": 4.2215745667306846e-05, + "loss": 0.956, + "step": 172170 + }, + { + "epoch": 1.1000089442009635, + "grad_norm": 0.8691318035125732, + "learning_rate": 4.2210789203447755e-05, + "loss": 0.8627, + "step": 172180 + }, + { + "epoch": 1.1000728313507022, + "grad_norm": 1.3839830160140991, + "learning_rate": 4.220583281803275e-05, + "loss": 0.8302, + "step": 172190 + }, + { + "epoch": 1.100136718500441, + "grad_norm": 0.724374532699585, + "learning_rate": 4.220087651111176e-05, + "loss": 0.6618, + "step": 172200 + }, + { + "epoch": 1.1002006056501794, + "grad_norm": 0.858903706073761, + "learning_rate": 4.2195920282734694e-05, + "loss": 0.7735, + "step": 172210 + }, + { + "epoch": 1.1002644927999183, + "grad_norm": 1.1366229057312012, + "learning_rate": 4.219096413295145e-05, + "loss": 0.9598, + "step": 172220 + }, + { + "epoch": 1.1003283799496568, + "grad_norm": 1.0029979944229126, + "learning_rate": 
4.218600806181196e-05, + "loss": 0.8373, + "step": 172230 + }, + { + "epoch": 1.1003922670993955, + "grad_norm": 0.6481642127037048, + "learning_rate": 4.218105206936613e-05, + "loss": 1.1082, + "step": 172240 + }, + { + "epoch": 1.1004561542491342, + "grad_norm": 0.9797278046607971, + "learning_rate": 4.2176096155663866e-05, + "loss": 0.6281, + "step": 172250 + }, + { + "epoch": 1.100520041398873, + "grad_norm": 0.7582735419273376, + "learning_rate": 4.217114032075508e-05, + "loss": 0.9182, + "step": 172260 + }, + { + "epoch": 1.1005839285486116, + "grad_norm": 0.9007226824760437, + "learning_rate": 4.216618456468969e-05, + "loss": 0.7436, + "step": 172270 + }, + { + "epoch": 1.1006478156983504, + "grad_norm": 1.785007357597351, + "learning_rate": 4.2161228887517594e-05, + "loss": 0.9624, + "step": 172280 + }, + { + "epoch": 1.100711702848089, + "grad_norm": 0.7608382105827332, + "learning_rate": 4.215627328928871e-05, + "loss": 0.7186, + "step": 172290 + }, + { + "epoch": 1.1007755899978278, + "grad_norm": 1.9284123182296753, + "learning_rate": 4.215131777005294e-05, + "loss": 0.8328, + "step": 172300 + }, + { + "epoch": 1.1008394771475665, + "grad_norm": 0.9450188279151917, + "learning_rate": 4.2146362329860186e-05, + "loss": 0.9594, + "step": 172310 + }, + { + "epoch": 1.1009033642973052, + "grad_norm": 0.9953072667121887, + "learning_rate": 4.2141406968760356e-05, + "loss": 0.9788, + "step": 172320 + }, + { + "epoch": 1.1009672514470439, + "grad_norm": 1.1051782369613647, + "learning_rate": 4.2136451686803355e-05, + "loss": 0.8349, + "step": 172330 + }, + { + "epoch": 1.1010311385967826, + "grad_norm": 0.7446302771568298, + "learning_rate": 4.213149648403911e-05, + "loss": 1.0095, + "step": 172340 + }, + { + "epoch": 1.1010950257465213, + "grad_norm": 0.9063861966133118, + "learning_rate": 4.212654136051748e-05, + "loss": 1.2343, + "step": 172350 + }, + { + "epoch": 1.10115891289626, + "grad_norm": 1.1501379013061523, + "learning_rate": 4.21215863162884e-05, + 
"loss": 0.8306, + "step": 172360 + }, + { + "epoch": 1.1012228000459987, + "grad_norm": 1.0388190746307373, + "learning_rate": 4.2116631351401756e-05, + "loss": 0.9503, + "step": 172370 + }, + { + "epoch": 1.1012866871957374, + "grad_norm": 2.0269041061401367, + "learning_rate": 4.211167646590746e-05, + "loss": 0.8782, + "step": 172380 + }, + { + "epoch": 1.1013505743454761, + "grad_norm": 0.9322276711463928, + "learning_rate": 4.2106721659855395e-05, + "loss": 0.9964, + "step": 172390 + }, + { + "epoch": 1.1014144614952148, + "grad_norm": 0.815617024898529, + "learning_rate": 4.210176693329548e-05, + "loss": 1.1831, + "step": 172400 + }, + { + "epoch": 1.1014783486449535, + "grad_norm": 0.8040037155151367, + "learning_rate": 4.20968122862776e-05, + "loss": 0.829, + "step": 172410 + }, + { + "epoch": 1.1015422357946922, + "grad_norm": 0.7182419896125793, + "learning_rate": 4.209185771885166e-05, + "loss": 0.6466, + "step": 172420 + }, + { + "epoch": 1.101606122944431, + "grad_norm": 0.6640510559082031, + "learning_rate": 4.208690323106755e-05, + "loss": 1.0263, + "step": 172430 + }, + { + "epoch": 1.1016700100941696, + "grad_norm": 1.316969394683838, + "learning_rate": 4.2081948822975184e-05, + "loss": 0.8435, + "step": 172440 + }, + { + "epoch": 1.1017338972439084, + "grad_norm": 1.1966426372528076, + "learning_rate": 4.2076994494624436e-05, + "loss": 0.8266, + "step": 172450 + }, + { + "epoch": 1.101797784393647, + "grad_norm": 0.8291551470756531, + "learning_rate": 4.20720402460652e-05, + "loss": 0.947, + "step": 172460 + }, + { + "epoch": 1.1018616715433858, + "grad_norm": 0.6528846025466919, + "learning_rate": 4.206708607734739e-05, + "loss": 1.0032, + "step": 172470 + }, + { + "epoch": 1.1019255586931245, + "grad_norm": 0.9250202178955078, + "learning_rate": 4.2062131988520866e-05, + "loss": 0.9916, + "step": 172480 + }, + { + "epoch": 1.1019894458428632, + "grad_norm": 0.8550183773040771, + "learning_rate": 4.2057177979635554e-05, + "loss": 0.6554, + "step": 
172490 + }, + { + "epoch": 1.1020533329926019, + "grad_norm": 1.0084480047225952, + "learning_rate": 4.205222405074133e-05, + "loss": 1.3241, + "step": 172500 + }, + { + "epoch": 1.1021172201423406, + "grad_norm": 0.6729216575622559, + "learning_rate": 4.204727020188809e-05, + "loss": 0.6531, + "step": 172510 + }, + { + "epoch": 1.1021811072920793, + "grad_norm": 1.1924982070922852, + "learning_rate": 4.204231643312571e-05, + "loss": 0.8125, + "step": 172520 + }, + { + "epoch": 1.102244994441818, + "grad_norm": 1.1781163215637207, + "learning_rate": 4.2037362744504096e-05, + "loss": 0.8448, + "step": 172530 + }, + { + "epoch": 1.1023088815915567, + "grad_norm": 1.3087345361709595, + "learning_rate": 4.2032409136073125e-05, + "loss": 0.8734, + "step": 172540 + }, + { + "epoch": 1.1023727687412954, + "grad_norm": 0.7616584300994873, + "learning_rate": 4.202745560788269e-05, + "loss": 1.1044, + "step": 172550 + }, + { + "epoch": 1.1024366558910341, + "grad_norm": 1.276070237159729, + "learning_rate": 4.202250215998267e-05, + "loss": 0.9824, + "step": 172560 + }, + { + "epoch": 1.1025005430407728, + "grad_norm": 0.9662360548973083, + "learning_rate": 4.201754879242296e-05, + "loss": 0.849, + "step": 172570 + }, + { + "epoch": 1.1025644301905115, + "grad_norm": 1.022019863128662, + "learning_rate": 4.201259550525343e-05, + "loss": 0.8815, + "step": 172580 + }, + { + "epoch": 1.1026283173402502, + "grad_norm": 0.5287925601005554, + "learning_rate": 4.200764229852398e-05, + "loss": 0.8442, + "step": 172590 + }, + { + "epoch": 1.102692204489989, + "grad_norm": 1.201217532157898, + "learning_rate": 4.200268917228449e-05, + "loss": 0.8891, + "step": 172600 + }, + { + "epoch": 1.1027560916397277, + "grad_norm": 0.9720419645309448, + "learning_rate": 4.199773612658483e-05, + "loss": 0.7276, + "step": 172610 + }, + { + "epoch": 1.1028199787894664, + "grad_norm": 0.7095947861671448, + "learning_rate": 4.1992783161474894e-05, + "loss": 0.838, + "step": 172620 + }, + { + "epoch": 
1.102883865939205, + "grad_norm": 0.8162873983383179, + "learning_rate": 4.198783027700456e-05, + "loss": 0.8075, + "step": 172630 + }, + { + "epoch": 1.1029477530889438, + "grad_norm": 0.8493092656135559, + "learning_rate": 4.1982877473223706e-05, + "loss": 1.0478, + "step": 172640 + }, + { + "epoch": 1.1030116402386825, + "grad_norm": 1.2920939922332764, + "learning_rate": 4.197792475018221e-05, + "loss": 0.9082, + "step": 172650 + }, + { + "epoch": 1.1030755273884212, + "grad_norm": 1.1556893587112427, + "learning_rate": 4.197297210792996e-05, + "loss": 0.9246, + "step": 172660 + }, + { + "epoch": 1.10313941453816, + "grad_norm": 0.756718635559082, + "learning_rate": 4.196801954651682e-05, + "loss": 1.0195, + "step": 172670 + }, + { + "epoch": 1.1032033016878986, + "grad_norm": 1.058808445930481, + "learning_rate": 4.196306706599267e-05, + "loss": 0.8777, + "step": 172680 + }, + { + "epoch": 1.1032671888376373, + "grad_norm": 0.9461612105369568, + "learning_rate": 4.195811466640738e-05, + "loss": 0.7936, + "step": 172690 + }, + { + "epoch": 1.1033310759873758, + "grad_norm": 0.7724422812461853, + "learning_rate": 4.195316234781084e-05, + "loss": 0.7225, + "step": 172700 + }, + { + "epoch": 1.1033949631371147, + "grad_norm": 0.7925411462783813, + "learning_rate": 4.194821011025291e-05, + "loss": 1.0131, + "step": 172710 + }, + { + "epoch": 1.1034588502868532, + "grad_norm": 1.2644563913345337, + "learning_rate": 4.194325795378348e-05, + "loss": 0.7148, + "step": 172720 + }, + { + "epoch": 1.103522737436592, + "grad_norm": 0.794563889503479, + "learning_rate": 4.193830587845241e-05, + "loss": 0.8343, + "step": 172730 + }, + { + "epoch": 1.1035866245863306, + "grad_norm": 0.9380400776863098, + "learning_rate": 4.193335388430957e-05, + "loss": 0.81, + "step": 172740 + }, + { + "epoch": 1.1036505117360693, + "grad_norm": 0.8075729608535767, + "learning_rate": 4.192840197140484e-05, + "loss": 0.9544, + "step": 172750 + }, + { + "epoch": 1.103714398885808, + 
"grad_norm": 0.8002738356590271, + "learning_rate": 4.192345013978809e-05, + "loss": 0.9816, + "step": 172760 + }, + { + "epoch": 1.1037782860355467, + "grad_norm": 1.1612515449523926, + "learning_rate": 4.1918498389509175e-05, + "loss": 0.9024, + "step": 172770 + }, + { + "epoch": 1.1038421731852854, + "grad_norm": 0.9753612875938416, + "learning_rate": 4.191354672061798e-05, + "loss": 0.8175, + "step": 172780 + }, + { + "epoch": 1.1039060603350241, + "grad_norm": 1.0018128156661987, + "learning_rate": 4.190859513316436e-05, + "loss": 0.944, + "step": 172790 + }, + { + "epoch": 1.1039699474847628, + "grad_norm": 0.9988775849342346, + "learning_rate": 4.1903643627198184e-05, + "loss": 0.855, + "step": 172800 + }, + { + "epoch": 1.1040338346345016, + "grad_norm": 0.9023102521896362, + "learning_rate": 4.189869220276933e-05, + "loss": 0.8005, + "step": 172810 + }, + { + "epoch": 1.1040977217842403, + "grad_norm": 0.9544625878334045, + "learning_rate": 4.189374085992766e-05, + "loss": 0.6235, + "step": 172820 + }, + { + "epoch": 1.104161608933979, + "grad_norm": 0.8562326431274414, + "learning_rate": 4.1888789598723024e-05, + "loss": 0.7493, + "step": 172830 + }, + { + "epoch": 1.1042254960837177, + "grad_norm": 0.857276976108551, + "learning_rate": 4.188383841920529e-05, + "loss": 0.8467, + "step": 172840 + }, + { + "epoch": 1.1042893832334564, + "grad_norm": 0.8983993530273438, + "learning_rate": 4.1878887321424325e-05, + "loss": 0.8552, + "step": 172850 + }, + { + "epoch": 1.104353270383195, + "grad_norm": 2.2336559295654297, + "learning_rate": 4.1873936305429995e-05, + "loss": 1.1215, + "step": 172860 + }, + { + "epoch": 1.1044171575329338, + "grad_norm": 0.9345127940177917, + "learning_rate": 4.186898537127215e-05, + "loss": 0.8787, + "step": 172870 + }, + { + "epoch": 1.1044810446826725, + "grad_norm": 1.2393178939819336, + "learning_rate": 4.186403451900066e-05, + "loss": 1.0144, + "step": 172880 + }, + { + "epoch": 1.1045449318324112, + "grad_norm": 
1.1470755338668823, + "learning_rate": 4.1859083748665385e-05, + "loss": 0.7146, + "step": 172890 + }, + { + "epoch": 1.10460881898215, + "grad_norm": 0.8653610944747925, + "learning_rate": 4.185413306031617e-05, + "loss": 1.0216, + "step": 172900 + }, + { + "epoch": 1.1046727061318886, + "grad_norm": 0.784228503704071, + "learning_rate": 4.184918245400289e-05, + "loss": 0.8824, + "step": 172910 + }, + { + "epoch": 1.1047365932816273, + "grad_norm": 0.7353566884994507, + "learning_rate": 4.1844231929775394e-05, + "loss": 0.7958, + "step": 172920 + }, + { + "epoch": 1.104800480431366, + "grad_norm": 0.8124344944953918, + "learning_rate": 4.1839281487683535e-05, + "loss": 0.815, + "step": 172930 + }, + { + "epoch": 1.1048643675811047, + "grad_norm": 1.401458501815796, + "learning_rate": 4.183433112777717e-05, + "loss": 0.7634, + "step": 172940 + }, + { + "epoch": 1.1049282547308434, + "grad_norm": 1.014285922050476, + "learning_rate": 4.182938085010616e-05, + "loss": 0.9408, + "step": 172950 + }, + { + "epoch": 1.1049921418805821, + "grad_norm": 1.2659311294555664, + "learning_rate": 4.182443065472035e-05, + "loss": 0.897, + "step": 172960 + }, + { + "epoch": 1.1050560290303209, + "grad_norm": 0.4442936182022095, + "learning_rate": 4.18194805416696e-05, + "loss": 0.7383, + "step": 172970 + }, + { + "epoch": 1.1051199161800596, + "grad_norm": 1.057563304901123, + "learning_rate": 4.1814530511003755e-05, + "loss": 0.8815, + "step": 172980 + }, + { + "epoch": 1.1051838033297983, + "grad_norm": 0.4912680387496948, + "learning_rate": 4.1809580562772674e-05, + "loss": 0.8234, + "step": 172990 + }, + { + "epoch": 1.105247690479537, + "grad_norm": 3.1906774044036865, + "learning_rate": 4.1804630697026196e-05, + "loss": 0.9966, + "step": 173000 + }, + { + "epoch": 1.1053115776292757, + "grad_norm": 1.059396743774414, + "learning_rate": 4.179968091381417e-05, + "loss": 0.8505, + "step": 173010 + }, + { + "epoch": 1.1053754647790144, + "grad_norm": 0.7790278792381287, + 
"learning_rate": 4.1794731213186456e-05, + "loss": 0.8695, + "step": 173020 + }, + { + "epoch": 1.105439351928753, + "grad_norm": 0.581296980381012, + "learning_rate": 4.17897815951929e-05, + "loss": 0.7661, + "step": 173030 + }, + { + "epoch": 1.1055032390784918, + "grad_norm": 1.0370402336120605, + "learning_rate": 4.1784832059883347e-05, + "loss": 0.689, + "step": 173040 + }, + { + "epoch": 1.1055671262282305, + "grad_norm": 1.0399209260940552, + "learning_rate": 4.177988260730765e-05, + "loss": 1.1618, + "step": 173050 + }, + { + "epoch": 1.1056310133779692, + "grad_norm": 2.4747142791748047, + "learning_rate": 4.177493323751564e-05, + "loss": 0.9111, + "step": 173060 + }, + { + "epoch": 1.105694900527708, + "grad_norm": 0.7076548337936401, + "learning_rate": 4.176998395055716e-05, + "loss": 0.8245, + "step": 173070 + }, + { + "epoch": 1.1057587876774466, + "grad_norm": 0.9863085746765137, + "learning_rate": 4.1765034746482076e-05, + "loss": 0.9501, + "step": 173080 + }, + { + "epoch": 1.1058226748271853, + "grad_norm": 0.8387857675552368, + "learning_rate": 4.1760085625340206e-05, + "loss": 0.7514, + "step": 173090 + }, + { + "epoch": 1.105886561976924, + "grad_norm": 0.7635478973388672, + "learning_rate": 4.175513658718141e-05, + "loss": 0.9059, + "step": 173100 + }, + { + "epoch": 1.1059504491266627, + "grad_norm": 0.7215414643287659, + "learning_rate": 4.1750187632055514e-05, + "loss": 0.9656, + "step": 173110 + }, + { + "epoch": 1.1060143362764014, + "grad_norm": 1.6596556901931763, + "learning_rate": 4.1745238760012366e-05, + "loss": 0.8594, + "step": 173120 + }, + { + "epoch": 1.1060782234261402, + "grad_norm": 1.1065661907196045, + "learning_rate": 4.174028997110181e-05, + "loss": 0.9072, + "step": 173130 + }, + { + "epoch": 1.1061421105758789, + "grad_norm": 0.6142051219940186, + "learning_rate": 4.173534126537368e-05, + "loss": 0.6434, + "step": 173140 + }, + { + "epoch": 1.1062059977256176, + "grad_norm": 6.143871307373047, + "learning_rate": 
4.173039264287781e-05, + "loss": 1.0416, + "step": 173150 + }, + { + "epoch": 1.1062698848753563, + "grad_norm": 1.4489151239395142, + "learning_rate": 4.172544410366404e-05, + "loss": 0.8264, + "step": 173160 + }, + { + "epoch": 1.106333772025095, + "grad_norm": 1.5547164678573608, + "learning_rate": 4.172049564778221e-05, + "loss": 1.1625, + "step": 173170 + }, + { + "epoch": 1.1063976591748337, + "grad_norm": 1.0034717321395874, + "learning_rate": 4.171554727528215e-05, + "loss": 1.0096, + "step": 173180 + }, + { + "epoch": 1.1064615463245722, + "grad_norm": 0.7144385576248169, + "learning_rate": 4.1710598986213696e-05, + "loss": 0.8016, + "step": 173190 + }, + { + "epoch": 1.106525433474311, + "grad_norm": 1.2244892120361328, + "learning_rate": 4.170565078062668e-05, + "loss": 0.8115, + "step": 173200 + }, + { + "epoch": 1.1065893206240496, + "grad_norm": 0.885942280292511, + "learning_rate": 4.170070265857092e-05, + "loss": 0.9264, + "step": 173210 + }, + { + "epoch": 1.1066532077737883, + "grad_norm": 0.5482710003852844, + "learning_rate": 4.169575462009628e-05, + "loss": 1.0419, + "step": 173220 + }, + { + "epoch": 1.106717094923527, + "grad_norm": 0.9318670630455017, + "learning_rate": 4.169080666525258e-05, + "loss": 0.6748, + "step": 173230 + }, + { + "epoch": 1.1067809820732657, + "grad_norm": 0.8965703845024109, + "learning_rate": 4.1685858794089646e-05, + "loss": 0.8237, + "step": 173240 + }, + { + "epoch": 1.1068448692230044, + "grad_norm": 2.319718599319458, + "learning_rate": 4.1680911006657306e-05, + "loss": 1.3752, + "step": 173250 + }, + { + "epoch": 1.106908756372743, + "grad_norm": 1.0713149309158325, + "learning_rate": 4.167596330300538e-05, + "loss": 0.8723, + "step": 173260 + }, + { + "epoch": 1.1069726435224818, + "grad_norm": 0.9905885457992554, + "learning_rate": 4.167101568318371e-05, + "loss": 0.9566, + "step": 173270 + }, + { + "epoch": 1.1070365306722205, + "grad_norm": 0.9312426447868347, + "learning_rate": 4.166606814724212e-05, + 
"loss": 0.7414, + "step": 173280 + }, + { + "epoch": 1.1071004178219592, + "grad_norm": 0.8389294147491455, + "learning_rate": 4.1661120695230435e-05, + "loss": 1.1426, + "step": 173290 + }, + { + "epoch": 1.107164304971698, + "grad_norm": 0.496663898229599, + "learning_rate": 4.165617332719847e-05, + "loss": 0.7483, + "step": 173300 + }, + { + "epoch": 1.1072281921214366, + "grad_norm": 1.2054636478424072, + "learning_rate": 4.165122604319609e-05, + "loss": 0.701, + "step": 173310 + }, + { + "epoch": 1.1072920792711753, + "grad_norm": 1.9126182794570923, + "learning_rate": 4.164627884327306e-05, + "loss": 0.8618, + "step": 173320 + }, + { + "epoch": 1.107355966420914, + "grad_norm": 1.3022087812423706, + "learning_rate": 4.1641331727479216e-05, + "loss": 0.9053, + "step": 173330 + }, + { + "epoch": 1.1074198535706528, + "grad_norm": 1.2964845895767212, + "learning_rate": 4.16363846958644e-05, + "loss": 0.8173, + "step": 173340 + }, + { + "epoch": 1.1074837407203915, + "grad_norm": 0.7168669104576111, + "learning_rate": 4.163143774847844e-05, + "loss": 0.9235, + "step": 173350 + }, + { + "epoch": 1.1075476278701302, + "grad_norm": 0.8237465023994446, + "learning_rate": 4.1626490885371134e-05, + "loss": 0.8374, + "step": 173360 + }, + { + "epoch": 1.1076115150198689, + "grad_norm": 0.9421668648719788, + "learning_rate": 4.162154410659231e-05, + "loss": 0.804, + "step": 173370 + }, + { + "epoch": 1.1076754021696076, + "grad_norm": 0.8502464890480042, + "learning_rate": 4.161659741219178e-05, + "loss": 0.7407, + "step": 173380 + }, + { + "epoch": 1.1077392893193463, + "grad_norm": 0.7700791954994202, + "learning_rate": 4.161165080221937e-05, + "loss": 1.105, + "step": 173390 + }, + { + "epoch": 1.107803176469085, + "grad_norm": 0.670958936214447, + "learning_rate": 4.160670427672489e-05, + "loss": 1.1197, + "step": 173400 + }, + { + "epoch": 1.1078670636188237, + "grad_norm": 1.0116523504257202, + "learning_rate": 4.160175783575817e-05, + "loss": 0.779, + "step": 
173410 + }, + { + "epoch": 1.1079309507685624, + "grad_norm": 1.674210548400879, + "learning_rate": 4.1596811479369004e-05, + "loss": 0.9408, + "step": 173420 + }, + { + "epoch": 1.1079948379183011, + "grad_norm": 0.8221597075462341, + "learning_rate": 4.1591865207607215e-05, + "loss": 1.0986, + "step": 173430 + }, + { + "epoch": 1.1080587250680398, + "grad_norm": 1.2803705930709839, + "learning_rate": 4.1586919020522624e-05, + "loss": 0.9425, + "step": 173440 + }, + { + "epoch": 1.1081226122177785, + "grad_norm": 0.7996966242790222, + "learning_rate": 4.158197291816503e-05, + "loss": 0.8781, + "step": 173450 + }, + { + "epoch": 1.1081864993675172, + "grad_norm": 1.3771693706512451, + "learning_rate": 4.157702690058426e-05, + "loss": 0.9906, + "step": 173460 + }, + { + "epoch": 1.108250386517256, + "grad_norm": 1.1042697429656982, + "learning_rate": 4.157208096783011e-05, + "loss": 0.795, + "step": 173470 + }, + { + "epoch": 1.1083142736669946, + "grad_norm": 1.042994499206543, + "learning_rate": 4.156713511995241e-05, + "loss": 0.741, + "step": 173480 + }, + { + "epoch": 1.1083781608167333, + "grad_norm": 0.7262585163116455, + "learning_rate": 4.156218935700094e-05, + "loss": 0.7013, + "step": 173490 + }, + { + "epoch": 1.108442047966472, + "grad_norm": 0.6869814395904541, + "learning_rate": 4.155724367902552e-05, + "loss": 0.7797, + "step": 173500 + }, + { + "epoch": 1.1085059351162108, + "grad_norm": 0.9764127731323242, + "learning_rate": 4.155229808607596e-05, + "loss": 0.9213, + "step": 173510 + }, + { + "epoch": 1.1085698222659495, + "grad_norm": 0.9626973271369934, + "learning_rate": 4.1547352578202074e-05, + "loss": 1.0226, + "step": 173520 + }, + { + "epoch": 1.1086337094156882, + "grad_norm": 1.117993712425232, + "learning_rate": 4.154240715545366e-05, + "loss": 0.7672, + "step": 173530 + }, + { + "epoch": 1.1086975965654269, + "grad_norm": 1.0907342433929443, + "learning_rate": 4.153746181788051e-05, + "loss": 0.8026, + "step": 173540 + }, + { + "epoch": 
1.1087614837151656, + "grad_norm": 1.1601186990737915, + "learning_rate": 4.153251656553246e-05, + "loss": 0.8438, + "step": 173550 + }, + { + "epoch": 1.1088253708649043, + "grad_norm": 0.5594744086265564, + "learning_rate": 4.152757139845928e-05, + "loss": 1.0255, + "step": 173560 + }, + { + "epoch": 1.108889258014643, + "grad_norm": 0.759391725063324, + "learning_rate": 4.152262631671079e-05, + "loss": 0.8922, + "step": 173570 + }, + { + "epoch": 1.1089531451643817, + "grad_norm": 1.0028988122940063, + "learning_rate": 4.151768132033679e-05, + "loss": 0.6874, + "step": 173580 + }, + { + "epoch": 1.1090170323141204, + "grad_norm": 1.068331003189087, + "learning_rate": 4.1512736409387075e-05, + "loss": 0.8646, + "step": 173590 + }, + { + "epoch": 1.1090809194638591, + "grad_norm": 0.9323468208312988, + "learning_rate": 4.150779158391145e-05, + "loss": 0.6483, + "step": 173600 + }, + { + "epoch": 1.1091448066135978, + "grad_norm": 1.2181113958358765, + "learning_rate": 4.1502846843959706e-05, + "loss": 0.9429, + "step": 173610 + }, + { + "epoch": 1.1092086937633365, + "grad_norm": 1.4081296920776367, + "learning_rate": 4.149790218958165e-05, + "loss": 0.9354, + "step": 173620 + }, + { + "epoch": 1.1092725809130752, + "grad_norm": 0.8330966234207153, + "learning_rate": 4.1492957620827066e-05, + "loss": 0.814, + "step": 173630 + }, + { + "epoch": 1.109336468062814, + "grad_norm": 0.9767903089523315, + "learning_rate": 4.148801313774576e-05, + "loss": 0.8311, + "step": 173640 + }, + { + "epoch": 1.1094003552125526, + "grad_norm": 1.104324460029602, + "learning_rate": 4.148306874038753e-05, + "loss": 1.0736, + "step": 173650 + }, + { + "epoch": 1.1094642423622911, + "grad_norm": 1.6907215118408203, + "learning_rate": 4.147812442880217e-05, + "loss": 0.8524, + "step": 173660 + }, + { + "epoch": 1.10952812951203, + "grad_norm": 1.7395610809326172, + "learning_rate": 4.147318020303946e-05, + "loss": 1.1916, + "step": 173670 + }, + { + "epoch": 1.1095920166617685, + 
"grad_norm": 0.7708711624145508, + "learning_rate": 4.1468236063149216e-05, + "loss": 0.9255, + "step": 173680 + }, + { + "epoch": 1.1096559038115075, + "grad_norm": 1.1579172611236572, + "learning_rate": 4.14632920091812e-05, + "loss": 0.9762, + "step": 173690 + }, + { + "epoch": 1.109719790961246, + "grad_norm": 1.0927681922912598, + "learning_rate": 4.145834804118522e-05, + "loss": 1.127, + "step": 173700 + }, + { + "epoch": 1.1097836781109847, + "grad_norm": 1.0229488611221313, + "learning_rate": 4.1453404159211074e-05, + "loss": 1.0284, + "step": 173710 + }, + { + "epoch": 1.1098475652607234, + "grad_norm": 1.7790634632110596, + "learning_rate": 4.144846036330854e-05, + "loss": 0.6832, + "step": 173720 + }, + { + "epoch": 1.109911452410462, + "grad_norm": 0.9170469641685486, + "learning_rate": 4.144351665352741e-05, + "loss": 0.7688, + "step": 173730 + }, + { + "epoch": 1.1099753395602008, + "grad_norm": 0.9874303936958313, + "learning_rate": 4.1438573029917454e-05, + "loss": 0.7773, + "step": 173740 + }, + { + "epoch": 1.1100392267099395, + "grad_norm": 0.5128178000450134, + "learning_rate": 4.1433629492528485e-05, + "loss": 0.8497, + "step": 173750 + }, + { + "epoch": 1.1101031138596782, + "grad_norm": 0.9220598340034485, + "learning_rate": 4.142868604141028e-05, + "loss": 0.9081, + "step": 173760 + }, + { + "epoch": 1.110167001009417, + "grad_norm": 0.7820585370063782, + "learning_rate": 4.142374267661262e-05, + "loss": 0.8426, + "step": 173770 + }, + { + "epoch": 1.1102308881591556, + "grad_norm": 0.7808403372764587, + "learning_rate": 4.141879939818529e-05, + "loss": 0.9227, + "step": 173780 + }, + { + "epoch": 1.1102947753088943, + "grad_norm": 0.9865910410881042, + "learning_rate": 4.141385620617808e-05, + "loss": 0.7624, + "step": 173790 + }, + { + "epoch": 1.110358662458633, + "grad_norm": 1.2182927131652832, + "learning_rate": 4.140891310064079e-05, + "loss": 0.8516, + "step": 173800 + }, + { + "epoch": 1.1104225496083717, + "grad_norm": 
0.6205449104309082, + "learning_rate": 4.140397008162315e-05, + "loss": 0.8439, + "step": 173810 + }, + { + "epoch": 1.1104864367581104, + "grad_norm": 0.9484419822692871, + "learning_rate": 4.1399027149174965e-05, + "loss": 0.7673, + "step": 173820 + }, + { + "epoch": 1.1105503239078491, + "grad_norm": 0.9140626192092896, + "learning_rate": 4.139408430334601e-05, + "loss": 1.0648, + "step": 173830 + }, + { + "epoch": 1.1106142110575878, + "grad_norm": 0.9677501916885376, + "learning_rate": 4.138914154418609e-05, + "loss": 0.9232, + "step": 173840 + }, + { + "epoch": 1.1106780982073265, + "grad_norm": 0.8527592420578003, + "learning_rate": 4.138419887174495e-05, + "loss": 0.8107, + "step": 173850 + }, + { + "epoch": 1.1107419853570653, + "grad_norm": 0.6409138441085815, + "learning_rate": 4.137925628607238e-05, + "loss": 0.7725, + "step": 173860 + }, + { + "epoch": 1.110805872506804, + "grad_norm": 0.6374943852424622, + "learning_rate": 4.137431378721816e-05, + "loss": 0.8226, + "step": 173870 + }, + { + "epoch": 1.1108697596565427, + "grad_norm": 1.587991714477539, + "learning_rate": 4.136937137523207e-05, + "loss": 0.7838, + "step": 173880 + }, + { + "epoch": 1.1109336468062814, + "grad_norm": 1.2672752141952515, + "learning_rate": 4.136442905016387e-05, + "loss": 0.918, + "step": 173890 + }, + { + "epoch": 1.11099753395602, + "grad_norm": 0.7447906732559204, + "learning_rate": 4.135948681206334e-05, + "loss": 0.9646, + "step": 173900 + }, + { + "epoch": 1.1110614211057588, + "grad_norm": 1.1759493350982666, + "learning_rate": 4.135454466098026e-05, + "loss": 1.2743, + "step": 173910 + }, + { + "epoch": 1.1111253082554975, + "grad_norm": 0.9131328463554382, + "learning_rate": 4.1349602596964386e-05, + "loss": 0.8198, + "step": 173920 + }, + { + "epoch": 1.1111891954052362, + "grad_norm": 0.665424108505249, + "learning_rate": 4.13446606200655e-05, + "loss": 0.7456, + "step": 173930 + }, + { + "epoch": 1.111253082554975, + "grad_norm": 0.6461379528045654, + 
"learning_rate": 4.133971873033338e-05, + "loss": 0.8839, + "step": 173940 + }, + { + "epoch": 1.1113169697047136, + "grad_norm": 0.6741138100624084, + "learning_rate": 4.1334776927817776e-05, + "loss": 0.8698, + "step": 173950 + }, + { + "epoch": 1.1113808568544523, + "grad_norm": 1.0578811168670654, + "learning_rate": 4.132983521256846e-05, + "loss": 0.822, + "step": 173960 + }, + { + "epoch": 1.111444744004191, + "grad_norm": 1.079478144645691, + "learning_rate": 4.1324893584635214e-05, + "loss": 0.7061, + "step": 173970 + }, + { + "epoch": 1.1115086311539297, + "grad_norm": 1.0450830459594727, + "learning_rate": 4.131995204406779e-05, + "loss": 0.6351, + "step": 173980 + }, + { + "epoch": 1.1115725183036684, + "grad_norm": 0.7627521753311157, + "learning_rate": 4.131501059091596e-05, + "loss": 0.7369, + "step": 173990 + }, + { + "epoch": 1.1116364054534071, + "grad_norm": 1.1520981788635254, + "learning_rate": 4.131006922522948e-05, + "loss": 0.8487, + "step": 174000 + }, + { + "epoch": 1.1117002926031458, + "grad_norm": 0.9615451097488403, + "learning_rate": 4.130512794705813e-05, + "loss": 0.9746, + "step": 174010 + }, + { + "epoch": 1.1117641797528846, + "grad_norm": 1.2906620502471924, + "learning_rate": 4.130018675645166e-05, + "loss": 1.0011, + "step": 174020 + }, + { + "epoch": 1.1118280669026233, + "grad_norm": 1.2265547513961792, + "learning_rate": 4.129524565345984e-05, + "loss": 0.8273, + "step": 174030 + }, + { + "epoch": 1.111891954052362, + "grad_norm": 0.739889919757843, + "learning_rate": 4.1290304638132414e-05, + "loss": 0.8474, + "step": 174040 + }, + { + "epoch": 1.1119558412021007, + "grad_norm": 0.7667902708053589, + "learning_rate": 4.128536371051916e-05, + "loss": 0.7887, + "step": 174050 + }, + { + "epoch": 1.1120197283518394, + "grad_norm": 0.8716595768928528, + "learning_rate": 4.1280422870669834e-05, + "loss": 0.7622, + "step": 174060 + }, + { + "epoch": 1.112083615501578, + "grad_norm": 1.0395444631576538, + "learning_rate": 
4.127548211863419e-05, + "loss": 0.856, + "step": 174070 + }, + { + "epoch": 1.1121475026513168, + "grad_norm": 0.9293608069419861, + "learning_rate": 4.1270541454462e-05, + "loss": 1.0687, + "step": 174080 + }, + { + "epoch": 1.1122113898010555, + "grad_norm": 0.7219346165657043, + "learning_rate": 4.1265600878203e-05, + "loss": 0.8219, + "step": 174090 + }, + { + "epoch": 1.1122752769507942, + "grad_norm": 0.8477901220321655, + "learning_rate": 4.126066038990696e-05, + "loss": 0.7718, + "step": 174100 + }, + { + "epoch": 1.112339164100533, + "grad_norm": 0.8595911860466003, + "learning_rate": 4.125571998962363e-05, + "loss": 0.8496, + "step": 174110 + }, + { + "epoch": 1.1124030512502716, + "grad_norm": 0.6253750920295715, + "learning_rate": 4.125077967740276e-05, + "loss": 0.808, + "step": 174120 + }, + { + "epoch": 1.1124669384000103, + "grad_norm": 1.5641717910766602, + "learning_rate": 4.124583945329412e-05, + "loss": 0.8895, + "step": 174130 + }, + { + "epoch": 1.112530825549749, + "grad_norm": 1.0902210474014282, + "learning_rate": 4.124089931734744e-05, + "loss": 0.787, + "step": 174140 + }, + { + "epoch": 1.1125947126994875, + "grad_norm": 0.6443929076194763, + "learning_rate": 4.123595926961248e-05, + "loss": 0.7237, + "step": 174150 + }, + { + "epoch": 1.1126585998492264, + "grad_norm": 0.9357183575630188, + "learning_rate": 4.1231019310139e-05, + "loss": 0.8374, + "step": 174160 + }, + { + "epoch": 1.112722486998965, + "grad_norm": 1.554679036140442, + "learning_rate": 4.122607943897674e-05, + "loss": 0.8504, + "step": 174170 + }, + { + "epoch": 1.1127863741487036, + "grad_norm": 0.9971223473548889, + "learning_rate": 4.122113965617544e-05, + "loss": 0.9429, + "step": 174180 + }, + { + "epoch": 1.1128502612984423, + "grad_norm": 0.8027485013008118, + "learning_rate": 4.1216199961784876e-05, + "loss": 0.8253, + "step": 174190 + }, + { + "epoch": 1.112914148448181, + "grad_norm": 0.7266373038291931, + "learning_rate": 4.1211260355854764e-05, + "loss": 
0.813, + "step": 174200 + }, + { + "epoch": 1.1129780355979197, + "grad_norm": 1.004325270652771, + "learning_rate": 4.120632083843487e-05, + "loss": 0.9, + "step": 174210 + }, + { + "epoch": 1.1130419227476585, + "grad_norm": 2.024078130722046, + "learning_rate": 4.120138140957493e-05, + "loss": 0.8187, + "step": 174220 + }, + { + "epoch": 1.1131058098973972, + "grad_norm": 1.0069423913955688, + "learning_rate": 4.119644206932469e-05, + "loss": 0.6752, + "step": 174230 + }, + { + "epoch": 1.1131696970471359, + "grad_norm": 0.9048756957054138, + "learning_rate": 4.1191502817733894e-05, + "loss": 0.8194, + "step": 174240 + }, + { + "epoch": 1.1132335841968746, + "grad_norm": 0.8194555044174194, + "learning_rate": 4.1186563654852286e-05, + "loss": 0.8253, + "step": 174250 + }, + { + "epoch": 1.1132974713466133, + "grad_norm": 6.5662665367126465, + "learning_rate": 4.118162458072961e-05, + "loss": 1.0693, + "step": 174260 + }, + { + "epoch": 1.113361358496352, + "grad_norm": 1.075524926185608, + "learning_rate": 4.117668559541559e-05, + "loss": 0.8809, + "step": 174270 + }, + { + "epoch": 1.1134252456460907, + "grad_norm": 0.49496862292289734, + "learning_rate": 4.117174669896001e-05, + "loss": 0.6506, + "step": 174280 + }, + { + "epoch": 1.1134891327958294, + "grad_norm": 1.0560492277145386, + "learning_rate": 4.116680789141256e-05, + "loss": 0.7415, + "step": 174290 + }, + { + "epoch": 1.113553019945568, + "grad_norm": 0.9809279441833496, + "learning_rate": 4.1161869172823e-05, + "loss": 0.8428, + "step": 174300 + }, + { + "epoch": 1.1136169070953068, + "grad_norm": 0.8138776421546936, + "learning_rate": 4.115693054324106e-05, + "loss": 1.0035, + "step": 174310 + }, + { + "epoch": 1.1136807942450455, + "grad_norm": 1.1911145448684692, + "learning_rate": 4.1151992002716475e-05, + "loss": 0.8993, + "step": 174320 + }, + { + "epoch": 1.1137446813947842, + "grad_norm": 0.5131652355194092, + "learning_rate": 4.114705355129899e-05, + "loss": 0.7444, + "step": 174330 + }, 
+ { + "epoch": 1.113808568544523, + "grad_norm": 1.192893624305725, + "learning_rate": 4.1142115189038334e-05, + "loss": 0.7713, + "step": 174340 + }, + { + "epoch": 1.1138724556942616, + "grad_norm": 1.0480468273162842, + "learning_rate": 4.1137176915984246e-05, + "loss": 1.0559, + "step": 174350 + }, + { + "epoch": 1.1139363428440003, + "grad_norm": 1.0468538999557495, + "learning_rate": 4.113223873218644e-05, + "loss": 0.7441, + "step": 174360 + }, + { + "epoch": 1.114000229993739, + "grad_norm": 0.9152908325195312, + "learning_rate": 4.112730063769468e-05, + "loss": 0.8123, + "step": 174370 + }, + { + "epoch": 1.1140641171434778, + "grad_norm": 1.3621870279312134, + "learning_rate": 4.112236263255866e-05, + "loss": 0.8506, + "step": 174380 + }, + { + "epoch": 1.1141280042932165, + "grad_norm": 0.8971595168113708, + "learning_rate": 4.1117424716828126e-05, + "loss": 0.7592, + "step": 174390 + }, + { + "epoch": 1.1141918914429552, + "grad_norm": 3.1121487617492676, + "learning_rate": 4.111248689055283e-05, + "loss": 0.7638, + "step": 174400 + }, + { + "epoch": 1.1142557785926939, + "grad_norm": 0.7368488311767578, + "learning_rate": 4.1107549153782463e-05, + "loss": 0.8275, + "step": 174410 + }, + { + "epoch": 1.1143196657424326, + "grad_norm": 2.520725965499878, + "learning_rate": 4.110261150656678e-05, + "loss": 0.9942, + "step": 174420 + }, + { + "epoch": 1.1143835528921713, + "grad_norm": 0.9537742733955383, + "learning_rate": 4.10976739489555e-05, + "loss": 0.8468, + "step": 174430 + }, + { + "epoch": 1.11444744004191, + "grad_norm": 1.5037137269973755, + "learning_rate": 4.1093230223758204e-05, + "loss": 0.9547, + "step": 174440 + }, + { + "epoch": 1.1145113271916487, + "grad_norm": 0.5854917764663696, + "learning_rate": 4.108829283653227e-05, + "loss": 1.0519, + "step": 174450 + }, + { + "epoch": 1.1145752143413874, + "grad_norm": 0.5292088985443115, + "learning_rate": 4.1083355539054936e-05, + "loss": 0.7183, + "step": 174460 + }, + { + "epoch": 
1.114639101491126, + "grad_norm": 1.0242040157318115, + "learning_rate": 4.1078418331375924e-05, + "loss": 0.8832, + "step": 174470 + }, + { + "epoch": 1.1147029886408648, + "grad_norm": 1.240865707397461, + "learning_rate": 4.107348121354496e-05, + "loss": 0.8545, + "step": 174480 + }, + { + "epoch": 1.1147668757906035, + "grad_norm": 3.079923391342163, + "learning_rate": 4.106854418561176e-05, + "loss": 0.7876, + "step": 174490 + }, + { + "epoch": 1.1148307629403422, + "grad_norm": 1.0026297569274902, + "learning_rate": 4.106360724762604e-05, + "loss": 0.7126, + "step": 174500 + }, + { + "epoch": 1.114894650090081, + "grad_norm": 1.4164525270462036, + "learning_rate": 4.1058670399637536e-05, + "loss": 0.7584, + "step": 174510 + }, + { + "epoch": 1.1149585372398196, + "grad_norm": 0.6686666011810303, + "learning_rate": 4.105373364169596e-05, + "loss": 0.9105, + "step": 174520 + }, + { + "epoch": 1.1150224243895583, + "grad_norm": 0.8393816947937012, + "learning_rate": 4.104879697385102e-05, + "loss": 1.1391, + "step": 174530 + }, + { + "epoch": 1.115086311539297, + "grad_norm": 0.927527129650116, + "learning_rate": 4.1043860396152436e-05, + "loss": 0.7634, + "step": 174540 + }, + { + "epoch": 1.1151501986890358, + "grad_norm": 1.2626625299453735, + "learning_rate": 4.1038923908649926e-05, + "loss": 0.8502, + "step": 174550 + }, + { + "epoch": 1.1152140858387745, + "grad_norm": 1.2957005500793457, + "learning_rate": 4.103398751139321e-05, + "loss": 1.0296, + "step": 174560 + }, + { + "epoch": 1.1152779729885132, + "grad_norm": 0.6817342042922974, + "learning_rate": 4.1029051204432e-05, + "loss": 0.7997, + "step": 174570 + }, + { + "epoch": 1.1153418601382519, + "grad_norm": 1.2032334804534912, + "learning_rate": 4.1024114987816e-05, + "loss": 0.905, + "step": 174580 + }, + { + "epoch": 1.1154057472879906, + "grad_norm": 1.033623218536377, + "learning_rate": 4.101917886159492e-05, + "loss": 0.8813, + "step": 174590 + }, + { + "epoch": 1.1154696344377293, + 
"grad_norm": 0.7177959084510803, + "learning_rate": 4.101424282581849e-05, + "loss": 0.7331, + "step": 174600 + }, + { + "epoch": 1.115533521587468, + "grad_norm": 1.2658313512802124, + "learning_rate": 4.100930688053641e-05, + "loss": 0.7928, + "step": 174610 + }, + { + "epoch": 1.1155974087372067, + "grad_norm": 1.4184327125549316, + "learning_rate": 4.100437102579838e-05, + "loss": 0.878, + "step": 174620 + }, + { + "epoch": 1.1156612958869454, + "grad_norm": 0.9174264669418335, + "learning_rate": 4.099943526165412e-05, + "loss": 0.8099, + "step": 174630 + }, + { + "epoch": 1.1157251830366839, + "grad_norm": 1.151061773300171, + "learning_rate": 4.099449958815333e-05, + "loss": 0.9451, + "step": 174640 + }, + { + "epoch": 1.1157890701864228, + "grad_norm": 0.7230824828147888, + "learning_rate": 4.098956400534572e-05, + "loss": 0.912, + "step": 174650 + }, + { + "epoch": 1.1158529573361613, + "grad_norm": 0.8960369229316711, + "learning_rate": 4.0984628513281e-05, + "loss": 1.0227, + "step": 174660 + }, + { + "epoch": 1.1159168444859, + "grad_norm": 0.6633699536323547, + "learning_rate": 4.097969311200886e-05, + "loss": 0.9389, + "step": 174670 + }, + { + "epoch": 1.1159807316356387, + "grad_norm": 1.3627066612243652, + "learning_rate": 4.097475780157903e-05, + "loss": 1.0452, + "step": 174680 + }, + { + "epoch": 1.1160446187853774, + "grad_norm": 1.2317612171173096, + "learning_rate": 4.0969822582041186e-05, + "loss": 0.817, + "step": 174690 + }, + { + "epoch": 1.1161085059351161, + "grad_norm": 1.1265528202056885, + "learning_rate": 4.0964887453445044e-05, + "loss": 0.8126, + "step": 174700 + }, + { + "epoch": 1.1161723930848548, + "grad_norm": 1.0905567407608032, + "learning_rate": 4.095995241584029e-05, + "loss": 0.9245, + "step": 174710 + }, + { + "epoch": 1.1162362802345935, + "grad_norm": 1.2128629684448242, + "learning_rate": 4.0955017469276646e-05, + "loss": 0.9359, + "step": 174720 + }, + { + "epoch": 1.1163001673843322, + "grad_norm": 
0.9822705388069153, + "learning_rate": 4.0950082613803804e-05, + "loss": 1.0899, + "step": 174730 + }, + { + "epoch": 1.116364054534071, + "grad_norm": 0.9940265417098999, + "learning_rate": 4.094514784947146e-05, + "loss": 0.8863, + "step": 174740 + }, + { + "epoch": 1.1164279416838097, + "grad_norm": 2.056150436401367, + "learning_rate": 4.094021317632931e-05, + "loss": 0.7975, + "step": 174750 + }, + { + "epoch": 1.1164918288335484, + "grad_norm": 1.5072576999664307, + "learning_rate": 4.093527859442705e-05, + "loss": 0.9716, + "step": 174760 + }, + { + "epoch": 1.116555715983287, + "grad_norm": 0.8445620536804199, + "learning_rate": 4.0930344103814374e-05, + "loss": 1.021, + "step": 174770 + }, + { + "epoch": 1.1166196031330258, + "grad_norm": 1.3397151231765747, + "learning_rate": 4.0925409704540976e-05, + "loss": 0.8359, + "step": 174780 + }, + { + "epoch": 1.1166834902827645, + "grad_norm": 1.194464921951294, + "learning_rate": 4.092047539665656e-05, + "loss": 0.9008, + "step": 174790 + }, + { + "epoch": 1.1167473774325032, + "grad_norm": 1.0016111135482788, + "learning_rate": 4.091554118021082e-05, + "loss": 0.7577, + "step": 174800 + }, + { + "epoch": 1.116811264582242, + "grad_norm": 0.9145414233207703, + "learning_rate": 4.0910607055253416e-05, + "loss": 1.0052, + "step": 174810 + }, + { + "epoch": 1.1168751517319806, + "grad_norm": 0.8176177144050598, + "learning_rate": 4.090567302183408e-05, + "loss": 0.6904, + "step": 174820 + }, + { + "epoch": 1.1169390388817193, + "grad_norm": 0.9763671159744263, + "learning_rate": 4.090073908000248e-05, + "loss": 0.7303, + "step": 174830 + }, + { + "epoch": 1.117002926031458, + "grad_norm": 0.9583315849304199, + "learning_rate": 4.089580522980831e-05, + "loss": 0.8352, + "step": 174840 + }, + { + "epoch": 1.1170668131811967, + "grad_norm": 1.0675028562545776, + "learning_rate": 4.089087147130126e-05, + "loss": 0.8125, + "step": 174850 + }, + { + "epoch": 1.1171307003309354, + "grad_norm": 0.8443053364753723, + 
"learning_rate": 4.088593780453101e-05, + "loss": 0.8599, + "step": 174860 + }, + { + "epoch": 1.1171945874806741, + "grad_norm": 0.7804033160209656, + "learning_rate": 4.088100422954725e-05, + "loss": 0.8323, + "step": 174870 + }, + { + "epoch": 1.1172584746304128, + "grad_norm": 1.0660284757614136, + "learning_rate": 4.087607074639968e-05, + "loss": 0.8467, + "step": 174880 + }, + { + "epoch": 1.1173223617801515, + "grad_norm": 0.8337397575378418, + "learning_rate": 4.0871137355137954e-05, + "loss": 0.971, + "step": 174890 + }, + { + "epoch": 1.1173862489298902, + "grad_norm": 1.1970064640045166, + "learning_rate": 4.086620405581178e-05, + "loss": 0.7649, + "step": 174900 + }, + { + "epoch": 1.117450136079629, + "grad_norm": 0.8727056384086609, + "learning_rate": 4.086127084847084e-05, + "loss": 0.8041, + "step": 174910 + }, + { + "epoch": 1.1175140232293677, + "grad_norm": 0.9415563941001892, + "learning_rate": 4.085633773316481e-05, + "loss": 0.851, + "step": 174920 + }, + { + "epoch": 1.1175779103791064, + "grad_norm": 1.1210389137268066, + "learning_rate": 4.085140470994335e-05, + "loss": 0.9958, + "step": 174930 + }, + { + "epoch": 1.117641797528845, + "grad_norm": 2.0388622283935547, + "learning_rate": 4.084647177885617e-05, + "loss": 0.7574, + "step": 174940 + }, + { + "epoch": 1.1177056846785838, + "grad_norm": 1.0090299844741821, + "learning_rate": 4.084153893995294e-05, + "loss": 0.7524, + "step": 174950 + }, + { + "epoch": 1.1177695718283225, + "grad_norm": 1.3186482191085815, + "learning_rate": 4.0836606193283335e-05, + "loss": 1.0176, + "step": 174960 + }, + { + "epoch": 1.1178334589780612, + "grad_norm": 1.435909390449524, + "learning_rate": 4.083167353889703e-05, + "loss": 1.243, + "step": 174970 + }, + { + "epoch": 1.1178973461278, + "grad_norm": 1.2158678770065308, + "learning_rate": 4.082674097684371e-05, + "loss": 0.7206, + "step": 174980 + }, + { + "epoch": 1.1179612332775386, + "grad_norm": 1.0424692630767822, + "learning_rate": 
4.082180850717304e-05, + "loss": 0.9083, + "step": 174990 + }, + { + "epoch": 1.1180251204272773, + "grad_norm": 0.7622018456459045, + "learning_rate": 4.081687612993469e-05, + "loss": 1.0366, + "step": 175000 + }, + { + "epoch": 1.118089007577016, + "grad_norm": 1.2907741069793701, + "learning_rate": 4.081194384517836e-05, + "loss": 0.8439, + "step": 175010 + }, + { + "epoch": 1.1181528947267547, + "grad_norm": 0.8142081499099731, + "learning_rate": 4.080701165295369e-05, + "loss": 0.9369, + "step": 175020 + }, + { + "epoch": 1.1182167818764934, + "grad_norm": 0.7973591685295105, + "learning_rate": 4.0802079553310364e-05, + "loss": 0.8274, + "step": 175030 + }, + { + "epoch": 1.1182806690262321, + "grad_norm": 1.1246249675750732, + "learning_rate": 4.079714754629806e-05, + "loss": 0.7711, + "step": 175040 + }, + { + "epoch": 1.1183445561759708, + "grad_norm": 1.2581672668457031, + "learning_rate": 4.0792215631966444e-05, + "loss": 1.0281, + "step": 175050 + }, + { + "epoch": 1.1184084433257095, + "grad_norm": 0.6276068687438965, + "learning_rate": 4.078728381036518e-05, + "loss": 0.9746, + "step": 175060 + }, + { + "epoch": 1.1184723304754483, + "grad_norm": 0.6815016865730286, + "learning_rate": 4.078235208154394e-05, + "loss": 0.9694, + "step": 175070 + }, + { + "epoch": 1.118536217625187, + "grad_norm": 0.8250168561935425, + "learning_rate": 4.077742044555238e-05, + "loss": 1.0307, + "step": 175080 + }, + { + "epoch": 1.1186001047749257, + "grad_norm": 0.7040315866470337, + "learning_rate": 4.077248890244019e-05, + "loss": 0.7931, + "step": 175090 + }, + { + "epoch": 1.1186639919246644, + "grad_norm": 0.6185616850852966, + "learning_rate": 4.076755745225701e-05, + "loss": 0.8425, + "step": 175100 + }, + { + "epoch": 1.118727879074403, + "grad_norm": 0.7982765436172485, + "learning_rate": 4.076262609505252e-05, + "loss": 0.987, + "step": 175110 + }, + { + "epoch": 1.1187917662241418, + "grad_norm": 1.0827354192733765, + "learning_rate": 4.075769483087637e-05, + 
"loss": 0.8767, + "step": 175120 + }, + { + "epoch": 1.1188556533738803, + "grad_norm": 0.7010329961776733, + "learning_rate": 4.0752763659778234e-05, + "loss": 0.9916, + "step": 175130 + }, + { + "epoch": 1.1189195405236192, + "grad_norm": 1.2009533643722534, + "learning_rate": 4.0747832581807765e-05, + "loss": 1.1279, + "step": 175140 + }, + { + "epoch": 1.1189834276733577, + "grad_norm": 0.8541277050971985, + "learning_rate": 4.074290159701463e-05, + "loss": 0.9996, + "step": 175150 + }, + { + "epoch": 1.1190473148230964, + "grad_norm": 0.8169267177581787, + "learning_rate": 4.073797070544848e-05, + "loss": 1.2023, + "step": 175160 + }, + { + "epoch": 1.119111201972835, + "grad_norm": 0.9204958081245422, + "learning_rate": 4.0733039907158976e-05, + "loss": 0.9974, + "step": 175170 + }, + { + "epoch": 1.1191750891225738, + "grad_norm": 0.9314907789230347, + "learning_rate": 4.072810920219578e-05, + "loss": 0.7827, + "step": 175180 + }, + { + "epoch": 1.1192389762723125, + "grad_norm": 0.9987381100654602, + "learning_rate": 4.0723178590608545e-05, + "loss": 0.6706, + "step": 175190 + }, + { + "epoch": 1.1193028634220512, + "grad_norm": 1.2689613103866577, + "learning_rate": 4.071824807244693e-05, + "loss": 1.0809, + "step": 175200 + }, + { + "epoch": 1.11936675057179, + "grad_norm": 0.9223312139511108, + "learning_rate": 4.071331764776059e-05, + "loss": 0.7422, + "step": 175210 + }, + { + "epoch": 1.1194306377215286, + "grad_norm": 0.9146457314491272, + "learning_rate": 4.0708387316599166e-05, + "loss": 0.7824, + "step": 175220 + }, + { + "epoch": 1.1194945248712673, + "grad_norm": 0.8546509146690369, + "learning_rate": 4.070345707901233e-05, + "loss": 1.1368, + "step": 175230 + }, + { + "epoch": 1.119558412021006, + "grad_norm": 1.138922095298767, + "learning_rate": 4.06985269350497e-05, + "loss": 1.0918, + "step": 175240 + }, + { + "epoch": 1.1196222991707447, + "grad_norm": 0.8561988472938538, + "learning_rate": 4.0693596884760976e-05, + "loss": 0.8103, + 
"step": 175250 + }, + { + "epoch": 1.1196861863204834, + "grad_norm": 2.1585047245025635, + "learning_rate": 4.0688666928195776e-05, + "loss": 0.8861, + "step": 175260 + }, + { + "epoch": 1.1197500734702222, + "grad_norm": 0.9140286445617676, + "learning_rate": 4.068373706540376e-05, + "loss": 0.9865, + "step": 175270 + }, + { + "epoch": 1.1198139606199609, + "grad_norm": 1.0838178396224976, + "learning_rate": 4.067880729643456e-05, + "loss": 0.6893, + "step": 175280 + }, + { + "epoch": 1.1198778477696996, + "grad_norm": 0.9828647971153259, + "learning_rate": 4.067387762133784e-05, + "loss": 0.7995, + "step": 175290 + }, + { + "epoch": 1.1199417349194383, + "grad_norm": 1.4637941122055054, + "learning_rate": 4.0668948040163244e-05, + "loss": 0.8851, + "step": 175300 + }, + { + "epoch": 1.120005622069177, + "grad_norm": 0.7903958559036255, + "learning_rate": 4.0664018552960406e-05, + "loss": 0.7814, + "step": 175310 + }, + { + "epoch": 1.1200695092189157, + "grad_norm": 0.8866510987281799, + "learning_rate": 4.0659089159778984e-05, + "loss": 1.0234, + "step": 175320 + }, + { + "epoch": 1.1201333963686544, + "grad_norm": 0.8908731937408447, + "learning_rate": 4.0654159860668615e-05, + "loss": 0.6416, + "step": 175330 + }, + { + "epoch": 1.120197283518393, + "grad_norm": 1.0165271759033203, + "learning_rate": 4.064923065567894e-05, + "loss": 0.6595, + "step": 175340 + }, + { + "epoch": 1.1202611706681318, + "grad_norm": 0.7236544489860535, + "learning_rate": 4.064430154485961e-05, + "loss": 0.8758, + "step": 175350 + }, + { + "epoch": 1.1203250578178705, + "grad_norm": 1.364446997642517, + "learning_rate": 4.063937252826024e-05, + "loss": 0.9554, + "step": 175360 + }, + { + "epoch": 1.1203889449676092, + "grad_norm": 1.0828992128372192, + "learning_rate": 4.0634443605930504e-05, + "loss": 0.7889, + "step": 175370 + }, + { + "epoch": 1.120452832117348, + "grad_norm": 0.9547019600868225, + "learning_rate": 4.062951477792002e-05, + "loss": 0.6543, + "step": 175380 + }, + 
{ + "epoch": 1.1205167192670866, + "grad_norm": 0.7458175420761108, + "learning_rate": 4.062458604427842e-05, + "loss": 0.9494, + "step": 175390 + }, + { + "epoch": 1.1205806064168253, + "grad_norm": 0.7044423818588257, + "learning_rate": 4.0619657405055366e-05, + "loss": 0.7833, + "step": 175400 + }, + { + "epoch": 1.120644493566564, + "grad_norm": 0.512610912322998, + "learning_rate": 4.0614728860300464e-05, + "loss": 0.7563, + "step": 175410 + }, + { + "epoch": 1.1207083807163027, + "grad_norm": 0.7722542881965637, + "learning_rate": 4.0609800410063366e-05, + "loss": 0.8186, + "step": 175420 + }, + { + "epoch": 1.1207722678660414, + "grad_norm": 0.9468956589698792, + "learning_rate": 4.06048720543937e-05, + "loss": 1.1304, + "step": 175430 + }, + { + "epoch": 1.1208361550157802, + "grad_norm": 0.7915295362472534, + "learning_rate": 4.05999437933411e-05, + "loss": 0.8595, + "step": 175440 + }, + { + "epoch": 1.1209000421655189, + "grad_norm": 0.7079314589500427, + "learning_rate": 4.0595015626955195e-05, + "loss": 0.7979, + "step": 175450 + }, + { + "epoch": 1.1209639293152576, + "grad_norm": 1.3249633312225342, + "learning_rate": 4.059008755528562e-05, + "loss": 0.8842, + "step": 175460 + }, + { + "epoch": 1.1210278164649963, + "grad_norm": 1.0822068452835083, + "learning_rate": 4.058515957838201e-05, + "loss": 0.836, + "step": 175470 + }, + { + "epoch": 1.121091703614735, + "grad_norm": 1.2300411462783813, + "learning_rate": 4.058023169629398e-05, + "loss": 0.6829, + "step": 175480 + }, + { + "epoch": 1.1211555907644737, + "grad_norm": 0.7818104028701782, + "learning_rate": 4.057530390907117e-05, + "loss": 0.9587, + "step": 175490 + }, + { + "epoch": 1.1212194779142124, + "grad_norm": 0.9931803345680237, + "learning_rate": 4.057037621676321e-05, + "loss": 0.935, + "step": 175500 + }, + { + "epoch": 1.121283365063951, + "grad_norm": 0.8156737685203552, + "learning_rate": 4.056544861941971e-05, + "loss": 1.1708, + "step": 175510 + }, + { + "epoch": 
1.1213472522136898, + "grad_norm": 0.7829884886741638, + "learning_rate": 4.056052111709031e-05, + "loss": 0.8434, + "step": 175520 + }, + { + "epoch": 1.1214111393634285, + "grad_norm": 0.9422668218612671, + "learning_rate": 4.055559370982462e-05, + "loss": 0.9067, + "step": 175530 + }, + { + "epoch": 1.1214750265131672, + "grad_norm": 1.0434776544570923, + "learning_rate": 4.055066639767228e-05, + "loss": 0.9479, + "step": 175540 + }, + { + "epoch": 1.121538913662906, + "grad_norm": 1.8336058855056763, + "learning_rate": 4.0545739180682896e-05, + "loss": 1.2378, + "step": 175550 + }, + { + "epoch": 1.1216028008126446, + "grad_norm": 1.4612547159194946, + "learning_rate": 4.0540812058906096e-05, + "loss": 0.822, + "step": 175560 + }, + { + "epoch": 1.1216666879623833, + "grad_norm": 0.8069930672645569, + "learning_rate": 4.053588503239151e-05, + "loss": 0.7872, + "step": 175570 + }, + { + "epoch": 1.121730575112122, + "grad_norm": 0.6849136352539062, + "learning_rate": 4.0530958101188745e-05, + "loss": 0.6795, + "step": 175580 + }, + { + "epoch": 1.1217944622618607, + "grad_norm": 0.5357284545898438, + "learning_rate": 4.052603126534743e-05, + "loss": 0.7397, + "step": 175590 + }, + { + "epoch": 1.1218583494115995, + "grad_norm": 1.2236217260360718, + "learning_rate": 4.052110452491717e-05, + "loss": 0.7363, + "step": 175600 + }, + { + "epoch": 1.1219222365613382, + "grad_norm": 1.0828033685684204, + "learning_rate": 4.051617787994759e-05, + "loss": 0.9291, + "step": 175610 + }, + { + "epoch": 1.1219861237110766, + "grad_norm": 1.3939331769943237, + "learning_rate": 4.051125133048831e-05, + "loss": 1.0402, + "step": 175620 + }, + { + "epoch": 1.1220500108608156, + "grad_norm": 1.7905160188674927, + "learning_rate": 4.050632487658893e-05, + "loss": 0.8458, + "step": 175630 + }, + { + "epoch": 1.122113898010554, + "grad_norm": 1.2075114250183105, + "learning_rate": 4.0501398518299074e-05, + "loss": 0.8363, + "step": 175640 + }, + { + "epoch": 1.1221777851602928, + 
"grad_norm": 0.7596954703330994, + "learning_rate": 4.049647225566835e-05, + "loss": 1.0533, + "step": 175650 + }, + { + "epoch": 1.1222416723100315, + "grad_norm": 0.8684565424919128, + "learning_rate": 4.049154608874638e-05, + "loss": 0.6895, + "step": 175660 + }, + { + "epoch": 1.1223055594597702, + "grad_norm": 0.6642537117004395, + "learning_rate": 4.048662001758276e-05, + "loss": 0.9142, + "step": 175670 + }, + { + "epoch": 1.1223694466095089, + "grad_norm": 0.7803688645362854, + "learning_rate": 4.0481694042227106e-05, + "loss": 0.8982, + "step": 175680 + }, + { + "epoch": 1.1224333337592476, + "grad_norm": 1.0256503820419312, + "learning_rate": 4.0476768162729026e-05, + "loss": 0.5992, + "step": 175690 + }, + { + "epoch": 1.1224972209089863, + "grad_norm": 0.797667920589447, + "learning_rate": 4.0471842379138137e-05, + "loss": 1.0248, + "step": 175700 + }, + { + "epoch": 1.122561108058725, + "grad_norm": 1.0702816247940063, + "learning_rate": 4.046691669150404e-05, + "loss": 0.9816, + "step": 175710 + }, + { + "epoch": 1.1226249952084637, + "grad_norm": 1.9437769651412964, + "learning_rate": 4.0461991099876327e-05, + "loss": 1.401, + "step": 175720 + }, + { + "epoch": 1.1226888823582024, + "grad_norm": 0.9341719150543213, + "learning_rate": 4.0457065604304625e-05, + "loss": 0.8363, + "step": 175730 + }, + { + "epoch": 1.1227527695079411, + "grad_norm": 1.0677189826965332, + "learning_rate": 4.045214020483852e-05, + "loss": 0.8378, + "step": 175740 + }, + { + "epoch": 1.1228166566576798, + "grad_norm": 0.9307116866111755, + "learning_rate": 4.044721490152764e-05, + "loss": 0.7788, + "step": 175750 + }, + { + "epoch": 1.1228805438074185, + "grad_norm": 0.7231166958808899, + "learning_rate": 4.0442289694421545e-05, + "loss": 0.9322, + "step": 175760 + }, + { + "epoch": 1.1229444309571572, + "grad_norm": 0.9440060257911682, + "learning_rate": 4.043736458356987e-05, + "loss": 0.7776, + "step": 175770 + }, + { + "epoch": 1.123008318106896, + "grad_norm": 
0.6914847493171692, + "learning_rate": 4.0432439569022215e-05, + "loss": 0.8959, + "step": 175780 + }, + { + "epoch": 1.1230722052566346, + "grad_norm": 0.8245557546615601, + "learning_rate": 4.0427514650828164e-05, + "loss": 0.9195, + "step": 175790 + }, + { + "epoch": 1.1231360924063734, + "grad_norm": 1.162238359451294, + "learning_rate": 4.042258982903733e-05, + "loss": 0.7828, + "step": 175800 + }, + { + "epoch": 1.123199979556112, + "grad_norm": 0.9539982080459595, + "learning_rate": 4.041766510369929e-05, + "loss": 0.9306, + "step": 175810 + }, + { + "epoch": 1.1232638667058508, + "grad_norm": 2.9992008209228516, + "learning_rate": 4.041274047486366e-05, + "loss": 0.7617, + "step": 175820 + }, + { + "epoch": 1.1233277538555895, + "grad_norm": 0.7411594986915588, + "learning_rate": 4.040781594258003e-05, + "loss": 0.8345, + "step": 175830 + }, + { + "epoch": 1.1233916410053282, + "grad_norm": 3.4469544887542725, + "learning_rate": 4.040289150689799e-05, + "loss": 0.9663, + "step": 175840 + }, + { + "epoch": 1.1234555281550669, + "grad_norm": 0.5798484683036804, + "learning_rate": 4.0397967167867136e-05, + "loss": 0.8329, + "step": 175850 + }, + { + "epoch": 1.1235194153048056, + "grad_norm": 0.6647492051124573, + "learning_rate": 4.039304292553706e-05, + "loss": 0.8047, + "step": 175860 + }, + { + "epoch": 1.1235833024545443, + "grad_norm": 0.9796327948570251, + "learning_rate": 4.0388118779957346e-05, + "loss": 1.3402, + "step": 175870 + }, + { + "epoch": 1.123647189604283, + "grad_norm": 0.9039154648780823, + "learning_rate": 4.038319473117759e-05, + "loss": 0.7715, + "step": 175880 + }, + { + "epoch": 1.1237110767540217, + "grad_norm": 1.2593525648117065, + "learning_rate": 4.0378270779247405e-05, + "loss": 0.7266, + "step": 175890 + }, + { + "epoch": 1.1237749639037604, + "grad_norm": 0.7207663655281067, + "learning_rate": 4.037334692421634e-05, + "loss": 0.6984, + "step": 175900 + }, + { + "epoch": 1.1238388510534991, + "grad_norm": 1.0210093259811401, + 
"learning_rate": 4.0368423166134e-05, + "loss": 0.6854, + "step": 175910 + }, + { + "epoch": 1.1239027382032378, + "grad_norm": 1.0901857614517212, + "learning_rate": 4.036349950504997e-05, + "loss": 0.7311, + "step": 175920 + }, + { + "epoch": 1.1239666253529765, + "grad_norm": 0.6734563112258911, + "learning_rate": 4.035857594101384e-05, + "loss": 0.8838, + "step": 175930 + }, + { + "epoch": 1.1240305125027152, + "grad_norm": 1.24734365940094, + "learning_rate": 4.035365247407519e-05, + "loss": 0.6681, + "step": 175940 + }, + { + "epoch": 1.124094399652454, + "grad_norm": 1.328225016593933, + "learning_rate": 4.034872910428361e-05, + "loss": 0.6492, + "step": 175950 + }, + { + "epoch": 1.1241582868021927, + "grad_norm": 0.9848608374595642, + "learning_rate": 4.0343805831688666e-05, + "loss": 0.8751, + "step": 175960 + }, + { + "epoch": 1.1242221739519314, + "grad_norm": 3.334869861602783, + "learning_rate": 4.033888265633996e-05, + "loss": 0.878, + "step": 175970 + }, + { + "epoch": 1.12428606110167, + "grad_norm": 0.7569032311439514, + "learning_rate": 4.0333959578287064e-05, + "loss": 1.1384, + "step": 175980 + }, + { + "epoch": 1.1243499482514088, + "grad_norm": 0.623479425907135, + "learning_rate": 4.0329036597579554e-05, + "loss": 0.8313, + "step": 175990 + }, + { + "epoch": 1.1244138354011475, + "grad_norm": 0.9155558347702026, + "learning_rate": 4.032411371426701e-05, + "loss": 0.6329, + "step": 176000 + }, + { + "epoch": 1.1244777225508862, + "grad_norm": 0.5530851483345032, + "learning_rate": 4.031919092839901e-05, + "loss": 0.9966, + "step": 176010 + }, + { + "epoch": 1.124541609700625, + "grad_norm": 1.3361395597457886, + "learning_rate": 4.0314268240025136e-05, + "loss": 0.7737, + "step": 176020 + }, + { + "epoch": 1.1246054968503636, + "grad_norm": 0.7976097464561462, + "learning_rate": 4.0309345649194965e-05, + "loss": 0.9081, + "step": 176030 + }, + { + "epoch": 1.1246693840001023, + "grad_norm": 1.1982431411743164, + "learning_rate": 
4.030442315595806e-05, + "loss": 0.8035, + "step": 176040 + }, + { + "epoch": 1.124733271149841, + "grad_norm": 1.0385111570358276, + "learning_rate": 4.0299500760364003e-05, + "loss": 0.9468, + "step": 176050 + }, + { + "epoch": 1.1247971582995797, + "grad_norm": 0.796828031539917, + "learning_rate": 4.029457846246237e-05, + "loss": 0.9623, + "step": 176060 + }, + { + "epoch": 1.1248610454493184, + "grad_norm": 0.6143119931221008, + "learning_rate": 4.028965626230272e-05, + "loss": 0.7577, + "step": 176070 + }, + { + "epoch": 1.1249249325990571, + "grad_norm": 0.8207269906997681, + "learning_rate": 4.028473415993464e-05, + "loss": 1.015, + "step": 176080 + }, + { + "epoch": 1.1249888197487956, + "grad_norm": 1.3695751428604126, + "learning_rate": 4.027981215540768e-05, + "loss": 0.8617, + "step": 176090 + }, + { + "epoch": 1.1250527068985345, + "grad_norm": 0.821844220161438, + "learning_rate": 4.027489024877143e-05, + "loss": 0.8861, + "step": 176100 + }, + { + "epoch": 1.125116594048273, + "grad_norm": 0.7588791847229004, + "learning_rate": 4.0269968440075444e-05, + "loss": 0.6437, + "step": 176110 + }, + { + "epoch": 1.125180481198012, + "grad_norm": 1.1480047702789307, + "learning_rate": 4.0265046729369304e-05, + "loss": 1.0675, + "step": 176120 + }, + { + "epoch": 1.1252443683477504, + "grad_norm": 0.909865140914917, + "learning_rate": 4.026012511670256e-05, + "loss": 1.0902, + "step": 176130 + }, + { + "epoch": 1.1253082554974891, + "grad_norm": 0.5706981420516968, + "learning_rate": 4.025520360212478e-05, + "loss": 0.886, + "step": 176140 + }, + { + "epoch": 1.1253721426472278, + "grad_norm": 0.8094174265861511, + "learning_rate": 4.0250282185685527e-05, + "loss": 0.8935, + "step": 176150 + }, + { + "epoch": 1.1254360297969666, + "grad_norm": 0.8232213258743286, + "learning_rate": 4.0245360867434376e-05, + "loss": 1.084, + "step": 176160 + }, + { + "epoch": 1.1254999169467053, + "grad_norm": 0.9300898313522339, + "learning_rate": 4.0240439647420873e-05, + 
"loss": 0.9735, + "step": 176170 + }, + { + "epoch": 1.125563804096444, + "grad_norm": 1.0743767023086548, + "learning_rate": 4.0235518525694594e-05, + "loss": 0.7813, + "step": 176180 + }, + { + "epoch": 1.1256276912461827, + "grad_norm": 0.9083045721054077, + "learning_rate": 4.0230597502305085e-05, + "loss": 0.7308, + "step": 176190 + }, + { + "epoch": 1.1256915783959214, + "grad_norm": 0.641772985458374, + "learning_rate": 4.022567657730191e-05, + "loss": 0.9756, + "step": 176200 + }, + { + "epoch": 1.12575546554566, + "grad_norm": 1.0198135375976562, + "learning_rate": 4.022075575073463e-05, + "loss": 1.1072, + "step": 176210 + }, + { + "epoch": 1.1258193526953988, + "grad_norm": 0.8631082773208618, + "learning_rate": 4.0215835022652796e-05, + "loss": 0.8153, + "step": 176220 + }, + { + "epoch": 1.1258832398451375, + "grad_norm": 1.0434458255767822, + "learning_rate": 4.0210914393105975e-05, + "loss": 1.0284, + "step": 176230 + }, + { + "epoch": 1.1259471269948762, + "grad_norm": 1.107435703277588, + "learning_rate": 4.020599386214371e-05, + "loss": 0.7435, + "step": 176240 + }, + { + "epoch": 1.126011014144615, + "grad_norm": 1.2904818058013916, + "learning_rate": 4.020107342981556e-05, + "loss": 0.8234, + "step": 176250 + }, + { + "epoch": 1.1260749012943536, + "grad_norm": 0.8774335980415344, + "learning_rate": 4.019615309617108e-05, + "loss": 0.9961, + "step": 176260 + }, + { + "epoch": 1.1261387884440923, + "grad_norm": 0.841364860534668, + "learning_rate": 4.019123286125982e-05, + "loss": 0.7548, + "step": 176270 + }, + { + "epoch": 1.126202675593831, + "grad_norm": 1.181922435760498, + "learning_rate": 4.0186312725131324e-05, + "loss": 0.8136, + "step": 176280 + }, + { + "epoch": 1.1262665627435697, + "grad_norm": 1.0423723459243774, + "learning_rate": 4.0181392687835144e-05, + "loss": 0.9189, + "step": 176290 + }, + { + "epoch": 1.1263304498933084, + "grad_norm": 1.5770231485366821, + "learning_rate": 4.0176472749420844e-05, + "loss": 0.7019, + "step": 
176300 + }, + { + "epoch": 1.1263943370430471, + "grad_norm": 0.641741931438446, + "learning_rate": 4.0171552909937966e-05, + "loss": 0.7976, + "step": 176310 + }, + { + "epoch": 1.1264582241927859, + "grad_norm": 1.0063591003417969, + "learning_rate": 4.0166633169436045e-05, + "loss": 0.7588, + "step": 176320 + }, + { + "epoch": 1.1265221113425246, + "grad_norm": 0.9964459538459778, + "learning_rate": 4.016171352796464e-05, + "loss": 0.7928, + "step": 176330 + }, + { + "epoch": 1.1265859984922633, + "grad_norm": 0.7890756726264954, + "learning_rate": 4.015679398557329e-05, + "loss": 0.8646, + "step": 176340 + }, + { + "epoch": 1.126649885642002, + "grad_norm": 0.8053073883056641, + "learning_rate": 4.015187454231154e-05, + "loss": 0.732, + "step": 176350 + }, + { + "epoch": 1.1267137727917407, + "grad_norm": 0.5603983402252197, + "learning_rate": 4.0146955198228936e-05, + "loss": 0.7962, + "step": 176360 + }, + { + "epoch": 1.1267776599414794, + "grad_norm": 0.9688702821731567, + "learning_rate": 4.014203595337503e-05, + "loss": 0.8995, + "step": 176370 + }, + { + "epoch": 1.126841547091218, + "grad_norm": 0.8661218285560608, + "learning_rate": 4.013711680779934e-05, + "loss": 0.7945, + "step": 176380 + }, + { + "epoch": 1.1269054342409568, + "grad_norm": 0.7414273619651794, + "learning_rate": 4.013219776155141e-05, + "loss": 0.8952, + "step": 176390 + }, + { + "epoch": 1.1269693213906955, + "grad_norm": 0.9069517254829407, + "learning_rate": 4.012727881468079e-05, + "loss": 0.8427, + "step": 176400 + }, + { + "epoch": 1.1270332085404342, + "grad_norm": 1.473358392715454, + "learning_rate": 4.0122359967237e-05, + "loss": 1.0556, + "step": 176410 + }, + { + "epoch": 1.127097095690173, + "grad_norm": 0.7204602360725403, + "learning_rate": 4.0117441219269605e-05, + "loss": 0.8077, + "step": 176420 + }, + { + "epoch": 1.1271609828399116, + "grad_norm": 0.7743983864784241, + "learning_rate": 4.011252257082812e-05, + "loss": 0.8541, + "step": 176430 + }, + { + "epoch": 
1.1272248699896503, + "grad_norm": 0.9365828037261963, + "learning_rate": 4.010760402196209e-05, + "loss": 1.0954, + "step": 176440 + }, + { + "epoch": 1.127288757139389, + "grad_norm": 0.8942638039588928, + "learning_rate": 4.0102685572721046e-05, + "loss": 0.8843, + "step": 176450 + }, + { + "epoch": 1.1273526442891277, + "grad_norm": 0.8413757085800171, + "learning_rate": 4.0097767223154513e-05, + "loss": 0.9711, + "step": 176460 + }, + { + "epoch": 1.1274165314388664, + "grad_norm": 0.9489029049873352, + "learning_rate": 4.009284897331204e-05, + "loss": 0.9959, + "step": 176470 + }, + { + "epoch": 1.1274804185886051, + "grad_norm": 1.105385661125183, + "learning_rate": 4.008793082324315e-05, + "loss": 0.9225, + "step": 176480 + }, + { + "epoch": 1.1275443057383439, + "grad_norm": 1.2501875162124634, + "learning_rate": 4.0083012772997363e-05, + "loss": 0.87, + "step": 176490 + }, + { + "epoch": 1.1276081928880826, + "grad_norm": 2.948787212371826, + "learning_rate": 4.0078094822624224e-05, + "loss": 0.9624, + "step": 176500 + }, + { + "epoch": 1.1276720800378213, + "grad_norm": 0.6365622282028198, + "learning_rate": 4.007317697217325e-05, + "loss": 0.8106, + "step": 176510 + }, + { + "epoch": 1.12773596718756, + "grad_norm": 0.8782965540885925, + "learning_rate": 4.006825922169397e-05, + "loss": 0.9458, + "step": 176520 + }, + { + "epoch": 1.1277998543372987, + "grad_norm": 3.0257232189178467, + "learning_rate": 4.006334157123592e-05, + "loss": 0.6978, + "step": 176530 + }, + { + "epoch": 1.1278637414870374, + "grad_norm": 0.8372243046760559, + "learning_rate": 4.005842402084861e-05, + "loss": 1.2384, + "step": 176540 + }, + { + "epoch": 1.127927628636776, + "grad_norm": 1.0190279483795166, + "learning_rate": 4.0053506570581575e-05, + "loss": 0.9559, + "step": 176550 + }, + { + "epoch": 1.1279915157865146, + "grad_norm": 0.8211161494255066, + "learning_rate": 4.004858922048433e-05, + "loss": 0.8927, + "step": 176560 + }, + { + "epoch": 1.1280554029362535, + 
"grad_norm": 0.9208956956863403, + "learning_rate": 4.00436719706064e-05, + "loss": 1.0404, + "step": 176570 + }, + { + "epoch": 1.128119290085992, + "grad_norm": 0.8615883588790894, + "learning_rate": 4.003875482099731e-05, + "loss": 0.9103, + "step": 176580 + }, + { + "epoch": 1.128183177235731, + "grad_norm": 1.2518213987350464, + "learning_rate": 4.0033837771706576e-05, + "loss": 0.8344, + "step": 176590 + }, + { + "epoch": 1.1282470643854694, + "grad_norm": 2.0138180255889893, + "learning_rate": 4.0028920822783716e-05, + "loss": 0.7134, + "step": 176600 + }, + { + "epoch": 1.1283109515352083, + "grad_norm": 1.0518046617507935, + "learning_rate": 4.002400397427824e-05, + "loss": 0.8504, + "step": 176610 + }, + { + "epoch": 1.1283748386849468, + "grad_norm": 1.0148245096206665, + "learning_rate": 4.0019087226239684e-05, + "loss": 0.8871, + "step": 176620 + }, + { + "epoch": 1.1284387258346855, + "grad_norm": 0.7396321296691895, + "learning_rate": 4.001417057871756e-05, + "loss": 0.7334, + "step": 176630 + }, + { + "epoch": 1.1285026129844242, + "grad_norm": 0.7813929915428162, + "learning_rate": 4.000925403176137e-05, + "loss": 0.8969, + "step": 176640 + }, + { + "epoch": 1.128566500134163, + "grad_norm": 0.9283764958381653, + "learning_rate": 4.0004337585420635e-05, + "loss": 0.6189, + "step": 176650 + }, + { + "epoch": 1.1286303872839016, + "grad_norm": 0.765900194644928, + "learning_rate": 3.999942123974487e-05, + "loss": 0.9674, + "step": 176660 + }, + { + "epoch": 1.1286942744336403, + "grad_norm": 2.070537805557251, + "learning_rate": 3.999450499478359e-05, + "loss": 1.0555, + "step": 176670 + }, + { + "epoch": 1.128758161583379, + "grad_norm": 1.2113842964172363, + "learning_rate": 3.99895888505863e-05, + "loss": 1.0827, + "step": 176680 + }, + { + "epoch": 1.1288220487331178, + "grad_norm": 0.9803450703620911, + "learning_rate": 3.99846728072025e-05, + "loss": 1.0341, + "step": 176690 + }, + { + "epoch": 1.1288859358828565, + "grad_norm": 
2.890547513961792, + "learning_rate": 3.997975686468172e-05, + "loss": 0.8549, + "step": 176700 + }, + { + "epoch": 1.1289498230325952, + "grad_norm": 0.7865378260612488, + "learning_rate": 3.997484102307345e-05, + "loss": 0.7177, + "step": 176710 + }, + { + "epoch": 1.1290137101823339, + "grad_norm": 0.8870431184768677, + "learning_rate": 3.9969925282427205e-05, + "loss": 1.0428, + "step": 176720 + }, + { + "epoch": 1.1290775973320726, + "grad_norm": 0.9200987219810486, + "learning_rate": 3.996500964279249e-05, + "loss": 0.7899, + "step": 176730 + }, + { + "epoch": 1.1291414844818113, + "grad_norm": 0.9197986721992493, + "learning_rate": 3.996009410421881e-05, + "loss": 0.8757, + "step": 176740 + }, + { + "epoch": 1.12920537163155, + "grad_norm": 1.4534109830856323, + "learning_rate": 3.995517866675568e-05, + "loss": 0.8529, + "step": 176750 + }, + { + "epoch": 1.1292692587812887, + "grad_norm": 1.1534450054168701, + "learning_rate": 3.995026333045257e-05, + "loss": 0.7974, + "step": 176760 + }, + { + "epoch": 1.1293331459310274, + "grad_norm": 0.5616369843482971, + "learning_rate": 3.994534809535901e-05, + "loss": 0.9461, + "step": 176770 + }, + { + "epoch": 1.129397033080766, + "grad_norm": 0.7991965413093567, + "learning_rate": 3.99404329615245e-05, + "loss": 0.7453, + "step": 176780 + }, + { + "epoch": 1.1294609202305048, + "grad_norm": 0.9583500027656555, + "learning_rate": 3.9935517928998534e-05, + "loss": 0.9574, + "step": 176790 + }, + { + "epoch": 1.1295248073802435, + "grad_norm": 0.9706417322158813, + "learning_rate": 3.99306029978306e-05, + "loss": 0.9761, + "step": 176800 + }, + { + "epoch": 1.1295886945299822, + "grad_norm": 1.1579259634017944, + "learning_rate": 3.9925688168070205e-05, + "loss": 0.8768, + "step": 176810 + }, + { + "epoch": 1.129652581679721, + "grad_norm": 0.9929895401000977, + "learning_rate": 3.992077343976685e-05, + "loss": 1.0702, + "step": 176820 + }, + { + "epoch": 1.1297164688294596, + "grad_norm": 0.6023370027542114, + 
"learning_rate": 3.991585881297002e-05, + "loss": 0.921, + "step": 176830 + }, + { + "epoch": 1.1297803559791983, + "grad_norm": 1.0571560859680176, + "learning_rate": 3.991094428772922e-05, + "loss": 0.8577, + "step": 176840 + }, + { + "epoch": 1.129844243128937, + "grad_norm": 2.328549861907959, + "learning_rate": 3.990602986409394e-05, + "loss": 0.7989, + "step": 176850 + }, + { + "epoch": 1.1299081302786758, + "grad_norm": 2.132355213165283, + "learning_rate": 3.99011155421137e-05, + "loss": 1.0796, + "step": 176860 + }, + { + "epoch": 1.1299720174284145, + "grad_norm": 0.7115640640258789, + "learning_rate": 3.9896201321837936e-05, + "loss": 0.91, + "step": 176870 + }, + { + "epoch": 1.1300359045781532, + "grad_norm": 0.7301086783409119, + "learning_rate": 3.9891287203316164e-05, + "loss": 0.8216, + "step": 176880 + }, + { + "epoch": 1.1300997917278919, + "grad_norm": 0.8626973032951355, + "learning_rate": 3.988637318659788e-05, + "loss": 0.7884, + "step": 176890 + }, + { + "epoch": 1.1301636788776306, + "grad_norm": 1.646178126335144, + "learning_rate": 3.988145927173256e-05, + "loss": 1.0285, + "step": 176900 + }, + { + "epoch": 1.1302275660273693, + "grad_norm": 0.8210130333900452, + "learning_rate": 3.987654545876971e-05, + "loss": 0.6995, + "step": 176910 + }, + { + "epoch": 1.130291453177108, + "grad_norm": 1.1210635900497437, + "learning_rate": 3.98716317477588e-05, + "loss": 0.9967, + "step": 176920 + }, + { + "epoch": 1.1303553403268467, + "grad_norm": 0.7595154643058777, + "learning_rate": 3.98667181387493e-05, + "loss": 0.9252, + "step": 176930 + }, + { + "epoch": 1.1304192274765854, + "grad_norm": 0.8323392868041992, + "learning_rate": 3.986180463179074e-05, + "loss": 0.9651, + "step": 176940 + }, + { + "epoch": 1.1304831146263241, + "grad_norm": 1.6904256343841553, + "learning_rate": 3.985689122693257e-05, + "loss": 0.8427, + "step": 176950 + }, + { + "epoch": 1.1305470017760628, + "grad_norm": 0.7488362789154053, + "learning_rate": 
3.985197792422428e-05, + "loss": 0.9075, + "step": 176960 + }, + { + "epoch": 1.1306108889258015, + "grad_norm": 0.6929590702056885, + "learning_rate": 3.984706472371535e-05, + "loss": 0.9052, + "step": 176970 + }, + { + "epoch": 1.1306747760755402, + "grad_norm": 1.0957715511322021, + "learning_rate": 3.984215162545527e-05, + "loss": 0.8558, + "step": 176980 + }, + { + "epoch": 1.130738663225279, + "grad_norm": 1.0007911920547485, + "learning_rate": 3.983723862949351e-05, + "loss": 0.9489, + "step": 176990 + }, + { + "epoch": 1.1308025503750176, + "grad_norm": 2.169706344604492, + "learning_rate": 3.983232573587955e-05, + "loss": 1.0108, + "step": 177000 + }, + { + "epoch": 1.1308664375247564, + "grad_norm": 0.8061663508415222, + "learning_rate": 3.9827412944662856e-05, + "loss": 0.7875, + "step": 177010 + }, + { + "epoch": 1.130930324674495, + "grad_norm": 0.8821120262145996, + "learning_rate": 3.982250025589292e-05, + "loss": 0.9757, + "step": 177020 + }, + { + "epoch": 1.1309942118242338, + "grad_norm": 1.5471516847610474, + "learning_rate": 3.9817587669619214e-05, + "loss": 1.2905, + "step": 177030 + }, + { + "epoch": 1.1310580989739725, + "grad_norm": 0.76194828748703, + "learning_rate": 3.981267518589121e-05, + "loss": 0.7426, + "step": 177040 + }, + { + "epoch": 1.131121986123711, + "grad_norm": 1.5704962015151978, + "learning_rate": 3.980776280475838e-05, + "loss": 1.1536, + "step": 177050 + }, + { + "epoch": 1.1311858732734499, + "grad_norm": 0.8537958860397339, + "learning_rate": 3.9802850526270184e-05, + "loss": 1.0459, + "step": 177060 + }, + { + "epoch": 1.1312497604231884, + "grad_norm": 0.8242719173431396, + "learning_rate": 3.9797938350476116e-05, + "loss": 0.6936, + "step": 177070 + }, + { + "epoch": 1.1313136475729273, + "grad_norm": 0.935931384563446, + "learning_rate": 3.979302627742564e-05, + "loss": 0.7973, + "step": 177080 + }, + { + "epoch": 1.1313775347226658, + "grad_norm": 0.7640109062194824, + "learning_rate": 3.978811430716821e-05, + 
"loss": 0.7046, + "step": 177090 + }, + { + "epoch": 1.1314414218724047, + "grad_norm": 1.0798890590667725, + "learning_rate": 3.9783202439753303e-05, + "loss": 1.0424, + "step": 177100 + }, + { + "epoch": 1.1315053090221432, + "grad_norm": 0.8156636953353882, + "learning_rate": 3.977829067523039e-05, + "loss": 0.6593, + "step": 177110 + }, + { + "epoch": 1.131569196171882, + "grad_norm": 0.8849560022354126, + "learning_rate": 3.977337901364893e-05, + "loss": 1.1453, + "step": 177120 + }, + { + "epoch": 1.1316330833216206, + "grad_norm": 0.9597598910331726, + "learning_rate": 3.9768467455058395e-05, + "loss": 0.7509, + "step": 177130 + }, + { + "epoch": 1.1316969704713593, + "grad_norm": 0.8824740648269653, + "learning_rate": 3.9763555999508226e-05, + "loss": 0.6749, + "step": 177140 + }, + { + "epoch": 1.131760857621098, + "grad_norm": 0.6336933970451355, + "learning_rate": 3.975864464704793e-05, + "loss": 0.7753, + "step": 177150 + }, + { + "epoch": 1.1318247447708367, + "grad_norm": 1.0209406614303589, + "learning_rate": 3.9753733397726925e-05, + "loss": 0.8533, + "step": 177160 + }, + { + "epoch": 1.1318886319205754, + "grad_norm": 1.1303939819335938, + "learning_rate": 3.97488222515947e-05, + "loss": 0.8434, + "step": 177170 + }, + { + "epoch": 1.1319525190703141, + "grad_norm": 0.7389097213745117, + "learning_rate": 3.97439112087007e-05, + "loss": 0.9985, + "step": 177180 + }, + { + "epoch": 1.1320164062200528, + "grad_norm": 0.8295130729675293, + "learning_rate": 3.9739000269094385e-05, + "loss": 1.0372, + "step": 177190 + }, + { + "epoch": 1.1320802933697915, + "grad_norm": 0.9382843375205994, + "learning_rate": 3.9734089432825216e-05, + "loss": 0.8728, + "step": 177200 + }, + { + "epoch": 1.1321441805195303, + "grad_norm": 1.4782503843307495, + "learning_rate": 3.9729178699942646e-05, + "loss": 0.8717, + "step": 177210 + }, + { + "epoch": 1.132208067669269, + "grad_norm": 0.9647573828697205, + "learning_rate": 3.972426807049614e-05, + "loss": 0.9883, + 
"step": 177220 + }, + { + "epoch": 1.1322719548190077, + "grad_norm": 0.8529700636863708, + "learning_rate": 3.9719357544535134e-05, + "loss": 0.9342, + "step": 177230 + }, + { + "epoch": 1.1323358419687464, + "grad_norm": 0.7517978549003601, + "learning_rate": 3.97144471221091e-05, + "loss": 0.9383, + "step": 177240 + }, + { + "epoch": 1.132399729118485, + "grad_norm": 0.7577384114265442, + "learning_rate": 3.9709536803267475e-05, + "loss": 0.7, + "step": 177250 + }, + { + "epoch": 1.1324636162682238, + "grad_norm": 0.7614976763725281, + "learning_rate": 3.9704626588059715e-05, + "loss": 0.9475, + "step": 177260 + }, + { + "epoch": 1.1325275034179625, + "grad_norm": 0.9894033670425415, + "learning_rate": 3.969971647653528e-05, + "loss": 0.7899, + "step": 177270 + }, + { + "epoch": 1.1325913905677012, + "grad_norm": 1.0283654928207397, + "learning_rate": 3.969480646874361e-05, + "loss": 0.8041, + "step": 177280 + }, + { + "epoch": 1.13265527771744, + "grad_norm": 0.8264014720916748, + "learning_rate": 3.968989656473415e-05, + "loss": 0.8589, + "step": 177290 + }, + { + "epoch": 1.1327191648671786, + "grad_norm": 0.9640235900878906, + "learning_rate": 3.968498676455635e-05, + "loss": 0.7731, + "step": 177300 + }, + { + "epoch": 1.1327830520169173, + "grad_norm": 1.018362283706665, + "learning_rate": 3.968007706825966e-05, + "loss": 0.9082, + "step": 177310 + }, + { + "epoch": 1.132846939166656, + "grad_norm": 0.6184595227241516, + "learning_rate": 3.967516747589352e-05, + "loss": 1.0333, + "step": 177320 + }, + { + "epoch": 1.1329108263163947, + "grad_norm": 0.9920672178268433, + "learning_rate": 3.967025798750738e-05, + "loss": 0.8722, + "step": 177330 + }, + { + "epoch": 1.1329747134661334, + "grad_norm": 1.114719271659851, + "learning_rate": 3.966534860315069e-05, + "loss": 0.7463, + "step": 177340 + }, + { + "epoch": 1.1330386006158721, + "grad_norm": 0.7361113429069519, + "learning_rate": 3.966043932287286e-05, + "loss": 0.8014, + "step": 177350 + }, + { + 
"epoch": 1.1331024877656108, + "grad_norm": 0.8126183748245239, + "learning_rate": 3.965553014672336e-05, + "loss": 1.0529, + "step": 177360 + }, + { + "epoch": 1.1331663749153496, + "grad_norm": 0.9899440407752991, + "learning_rate": 3.965062107475161e-05, + "loss": 0.6825, + "step": 177370 + }, + { + "epoch": 1.1332302620650883, + "grad_norm": 1.2756019830703735, + "learning_rate": 3.964571210700708e-05, + "loss": 0.7693, + "step": 177380 + }, + { + "epoch": 1.133294149214827, + "grad_norm": 0.8842296600341797, + "learning_rate": 3.9640803243539174e-05, + "loss": 0.7919, + "step": 177390 + }, + { + "epoch": 1.1333580363645657, + "grad_norm": 1.163802981376648, + "learning_rate": 3.963589448439734e-05, + "loss": 1.0671, + "step": 177400 + }, + { + "epoch": 1.1334219235143044, + "grad_norm": 1.0410068035125732, + "learning_rate": 3.9630985829631014e-05, + "loss": 0.9584, + "step": 177410 + }, + { + "epoch": 1.133485810664043, + "grad_norm": 0.9730801582336426, + "learning_rate": 3.962607727928963e-05, + "loss": 1.0008, + "step": 177420 + }, + { + "epoch": 1.1335496978137818, + "grad_norm": 0.867691159248352, + "learning_rate": 3.962116883342263e-05, + "loss": 0.7916, + "step": 177430 + }, + { + "epoch": 1.1336135849635205, + "grad_norm": 0.998274028301239, + "learning_rate": 3.961626049207943e-05, + "loss": 0.7744, + "step": 177440 + }, + { + "epoch": 1.1336774721132592, + "grad_norm": 1.0011632442474365, + "learning_rate": 3.961135225530947e-05, + "loss": 1.0658, + "step": 177450 + }, + { + "epoch": 1.133741359262998, + "grad_norm": 0.8723202347755432, + "learning_rate": 3.9606444123162176e-05, + "loss": 0.8927, + "step": 177460 + }, + { + "epoch": 1.1338052464127366, + "grad_norm": 1.3066819906234741, + "learning_rate": 3.9601536095687e-05, + "loss": 1.3038, + "step": 177470 + }, + { + "epoch": 1.1338691335624753, + "grad_norm": 0.928521454334259, + "learning_rate": 3.959662817293334e-05, + "loss": 0.6839, + "step": 177480 + }, + { + "epoch": 1.133933020712214, + 
"grad_norm": 0.982753574848175, + "learning_rate": 3.959172035495064e-05, + "loss": 0.7935, + "step": 177490 + }, + { + "epoch": 1.1339969078619527, + "grad_norm": 0.7935505509376526, + "learning_rate": 3.958681264178831e-05, + "loss": 0.9875, + "step": 177500 + }, + { + "epoch": 1.1340607950116914, + "grad_norm": 0.9873987436294556, + "learning_rate": 3.958190503349579e-05, + "loss": 0.9934, + "step": 177510 + }, + { + "epoch": 1.1341246821614301, + "grad_norm": 0.8302233219146729, + "learning_rate": 3.9576997530122505e-05, + "loss": 1.0712, + "step": 177520 + }, + { + "epoch": 1.1341885693111688, + "grad_norm": 0.8561579585075378, + "learning_rate": 3.9572090131717865e-05, + "loss": 1.0051, + "step": 177530 + }, + { + "epoch": 1.1342524564609073, + "grad_norm": 0.733059287071228, + "learning_rate": 3.956718283833131e-05, + "loss": 0.8051, + "step": 177540 + }, + { + "epoch": 1.1343163436106463, + "grad_norm": 0.8287229537963867, + "learning_rate": 3.9562275650012234e-05, + "loss": 1.0124, + "step": 177550 + }, + { + "epoch": 1.1343802307603847, + "grad_norm": 3.3876659870147705, + "learning_rate": 3.955736856681008e-05, + "loss": 0.8488, + "step": 177560 + }, + { + "epoch": 1.1344441179101237, + "grad_norm": 1.316805124282837, + "learning_rate": 3.955246158877426e-05, + "loss": 0.9535, + "step": 177570 + }, + { + "epoch": 1.1345080050598622, + "grad_norm": 0.6540564894676208, + "learning_rate": 3.954755471595419e-05, + "loss": 0.6977, + "step": 177580 + }, + { + "epoch": 1.134571892209601, + "grad_norm": 0.8143235445022583, + "learning_rate": 3.954264794839929e-05, + "loss": 0.8273, + "step": 177590 + }, + { + "epoch": 1.1346357793593396, + "grad_norm": 2.201139211654663, + "learning_rate": 3.9537741286158966e-05, + "loss": 0.8558, + "step": 177600 + }, + { + "epoch": 1.1346996665090783, + "grad_norm": 1.5337715148925781, + "learning_rate": 3.953283472928264e-05, + "loss": 0.8518, + "step": 177610 + }, + { + "epoch": 1.134763553658817, + "grad_norm": 
0.6330432295799255, + "learning_rate": 3.952792827781972e-05, + "loss": 0.9595, + "step": 177620 + }, + { + "epoch": 1.1348274408085557, + "grad_norm": 0.9442708492279053, + "learning_rate": 3.952302193181963e-05, + "loss": 0.7393, + "step": 177630 + }, + { + "epoch": 1.1348913279582944, + "grad_norm": 0.9224326014518738, + "learning_rate": 3.951811569133176e-05, + "loss": 1.1248, + "step": 177640 + }, + { + "epoch": 1.134955215108033, + "grad_norm": 0.8346001505851746, + "learning_rate": 3.9513209556405546e-05, + "loss": 0.8125, + "step": 177650 + }, + { + "epoch": 1.1350191022577718, + "grad_norm": 0.6194186210632324, + "learning_rate": 3.950830352709038e-05, + "loss": 0.6402, + "step": 177660 + }, + { + "epoch": 1.1350829894075105, + "grad_norm": 0.8308387994766235, + "learning_rate": 3.950339760343566e-05, + "loss": 0.667, + "step": 177670 + }, + { + "epoch": 1.1351468765572492, + "grad_norm": 3.698460817337036, + "learning_rate": 3.949849178549082e-05, + "loss": 0.9149, + "step": 177680 + }, + { + "epoch": 1.135210763706988, + "grad_norm": 0.8807551860809326, + "learning_rate": 3.949358607330525e-05, + "loss": 0.9277, + "step": 177690 + }, + { + "epoch": 1.1352746508567266, + "grad_norm": 0.4799681305885315, + "learning_rate": 3.948868046692837e-05, + "loss": 0.8688, + "step": 177700 + }, + { + "epoch": 1.1353385380064653, + "grad_norm": 1.0262006521224976, + "learning_rate": 3.948377496640956e-05, + "loss": 0.8444, + "step": 177710 + }, + { + "epoch": 1.135402425156204, + "grad_norm": 1.2053029537200928, + "learning_rate": 3.947886957179824e-05, + "loss": 0.8337, + "step": 177720 + }, + { + "epoch": 1.1354663123059427, + "grad_norm": 0.8215839266777039, + "learning_rate": 3.94739642831438e-05, + "loss": 1.1505, + "step": 177730 + }, + { + "epoch": 1.1355301994556815, + "grad_norm": 0.8201315402984619, + "learning_rate": 3.946905910049564e-05, + "loss": 0.5706, + "step": 177740 + }, + { + "epoch": 1.1355940866054202, + "grad_norm": 1.062843918800354, + 
"learning_rate": 3.9464154023903176e-05, + "loss": 0.728, + "step": 177750 + }, + { + "epoch": 1.1356579737551589, + "grad_norm": 0.9273681640625, + "learning_rate": 3.94592490534158e-05, + "loss": 0.8214, + "step": 177760 + }, + { + "epoch": 1.1357218609048976, + "grad_norm": 0.9091299176216125, + "learning_rate": 3.9454344189082893e-05, + "loss": 0.7926, + "step": 177770 + }, + { + "epoch": 1.1357857480546363, + "grad_norm": 0.6764705181121826, + "learning_rate": 3.944943943095387e-05, + "loss": 0.7392, + "step": 177780 + }, + { + "epoch": 1.135849635204375, + "grad_norm": 1.0174916982650757, + "learning_rate": 3.944453477907812e-05, + "loss": 1.0032, + "step": 177790 + }, + { + "epoch": 1.1359135223541137, + "grad_norm": 1.1083827018737793, + "learning_rate": 3.943963023350503e-05, + "loss": 0.7908, + "step": 177800 + }, + { + "epoch": 1.1359774095038524, + "grad_norm": 1.1159414052963257, + "learning_rate": 3.9434725794284e-05, + "loss": 0.8939, + "step": 177810 + }, + { + "epoch": 1.136041296653591, + "grad_norm": 0.6911755800247192, + "learning_rate": 3.9429821461464435e-05, + "loss": 1.0674, + "step": 177820 + }, + { + "epoch": 1.1361051838033298, + "grad_norm": 0.6208989024162292, + "learning_rate": 3.942491723509571e-05, + "loss": 0.9803, + "step": 177830 + }, + { + "epoch": 1.1361690709530685, + "grad_norm": 1.1319308280944824, + "learning_rate": 3.942001311522721e-05, + "loss": 0.7391, + "step": 177840 + }, + { + "epoch": 1.1362329581028072, + "grad_norm": 0.943028450012207, + "learning_rate": 3.941510910190833e-05, + "loss": 0.8243, + "step": 177850 + }, + { + "epoch": 1.136296845252546, + "grad_norm": 0.8729805946350098, + "learning_rate": 3.941020519518846e-05, + "loss": 0.8554, + "step": 177860 + }, + { + "epoch": 1.1363607324022846, + "grad_norm": 2.452819585800171, + "learning_rate": 3.940530139511699e-05, + "loss": 1.1151, + "step": 177870 + }, + { + "epoch": 1.1364246195520233, + "grad_norm": 0.8181469440460205, + "learning_rate": 
3.940039770174329e-05, + "loss": 0.7646, + "step": 177880 + }, + { + "epoch": 1.136488506701762, + "grad_norm": 0.7301978468894958, + "learning_rate": 3.939549411511676e-05, + "loss": 0.8143, + "step": 177890 + }, + { + "epoch": 1.1365523938515008, + "grad_norm": 1.052147388458252, + "learning_rate": 3.939059063528678e-05, + "loss": 0.9841, + "step": 177900 + }, + { + "epoch": 1.1366162810012395, + "grad_norm": 1.142288327217102, + "learning_rate": 3.938568726230273e-05, + "loss": 0.9373, + "step": 177910 + }, + { + "epoch": 1.1366801681509782, + "grad_norm": 3.7667288780212402, + "learning_rate": 3.9380783996214e-05, + "loss": 1.0897, + "step": 177920 + }, + { + "epoch": 1.1367440553007169, + "grad_norm": 0.8383669853210449, + "learning_rate": 3.9375880837069945e-05, + "loss": 0.9187, + "step": 177930 + }, + { + "epoch": 1.1368079424504556, + "grad_norm": 0.7958370447158813, + "learning_rate": 3.937097778491997e-05, + "loss": 0.6835, + "step": 177940 + }, + { + "epoch": 1.1368718296001943, + "grad_norm": 0.8150979280471802, + "learning_rate": 3.9366074839813446e-05, + "loss": 1.0067, + "step": 177950 + }, + { + "epoch": 1.136935716749933, + "grad_norm": 0.7327728867530823, + "learning_rate": 3.9361172001799744e-05, + "loss": 0.7047, + "step": 177960 + }, + { + "epoch": 1.1369996038996717, + "grad_norm": 0.9451047778129578, + "learning_rate": 3.935626927092825e-05, + "loss": 0.9296, + "step": 177970 + }, + { + "epoch": 1.1370634910494104, + "grad_norm": 1.3871461153030396, + "learning_rate": 3.9351366647248325e-05, + "loss": 0.9513, + "step": 177980 + }, + { + "epoch": 1.137127378199149, + "grad_norm": 1.4654449224472046, + "learning_rate": 3.934646413080934e-05, + "loss": 1.026, + "step": 177990 + }, + { + "epoch": 1.1371912653488878, + "grad_norm": 1.2127928733825684, + "learning_rate": 3.934156172166069e-05, + "loss": 0.8672, + "step": 178000 + }, + { + "epoch": 1.1372551524986265, + "grad_norm": 1.219888687133789, + "learning_rate": 3.933665941985174e-05, + 
"loss": 0.7495, + "step": 178010 + }, + { + "epoch": 1.1373190396483652, + "grad_norm": 0.9994648098945618, + "learning_rate": 3.933175722543185e-05, + "loss": 1.1691, + "step": 178020 + }, + { + "epoch": 1.1373829267981037, + "grad_norm": 1.248658537864685, + "learning_rate": 3.9326855138450396e-05, + "loss": 0.8557, + "step": 178030 + }, + { + "epoch": 1.1374468139478426, + "grad_norm": 0.9416319727897644, + "learning_rate": 3.932195315895674e-05, + "loss": 0.7892, + "step": 178040 + }, + { + "epoch": 1.1375107010975811, + "grad_norm": 1.1696947813034058, + "learning_rate": 3.9317051287000264e-05, + "loss": 0.8266, + "step": 178050 + }, + { + "epoch": 1.13757458824732, + "grad_norm": 1.167878270149231, + "learning_rate": 3.931214952263031e-05, + "loss": 1.0848, + "step": 178060 + }, + { + "epoch": 1.1376384753970585, + "grad_norm": 1.0277177095413208, + "learning_rate": 3.930724786589626e-05, + "loss": 0.9707, + "step": 178070 + }, + { + "epoch": 1.1377023625467972, + "grad_norm": 1.651376724243164, + "learning_rate": 3.9302346316847484e-05, + "loss": 1.1477, + "step": 178080 + }, + { + "epoch": 1.137766249696536, + "grad_norm": 1.2855274677276611, + "learning_rate": 3.9297444875533324e-05, + "loss": 0.7331, + "step": 178090 + }, + { + "epoch": 1.1378301368462747, + "grad_norm": 1.2985841035842896, + "learning_rate": 3.929254354200316e-05, + "loss": 0.8664, + "step": 178100 + }, + { + "epoch": 1.1378940239960134, + "grad_norm": 0.9524058103561401, + "learning_rate": 3.928764231630634e-05, + "loss": 0.7921, + "step": 178110 + }, + { + "epoch": 1.137957911145752, + "grad_norm": 0.7414030432701111, + "learning_rate": 3.928274119849223e-05, + "loss": 0.8939, + "step": 178120 + }, + { + "epoch": 1.1380217982954908, + "grad_norm": 0.6210494637489319, + "learning_rate": 3.9277840188610197e-05, + "loss": 0.9458, + "step": 178130 + }, + { + "epoch": 1.1380856854452295, + "grad_norm": 0.8775937557220459, + "learning_rate": 3.927293928670958e-05, + "loss": 0.9972, + "step": 
178140 + }, + { + "epoch": 1.1381495725949682, + "grad_norm": 0.8039231896400452, + "learning_rate": 3.926803849283975e-05, + "loss": 0.7796, + "step": 178150 + }, + { + "epoch": 1.138213459744707, + "grad_norm": 1.4437992572784424, + "learning_rate": 3.926313780705005e-05, + "loss": 0.8891, + "step": 178160 + }, + { + "epoch": 1.1382773468944456, + "grad_norm": 1.6523823738098145, + "learning_rate": 3.925823722938985e-05, + "loss": 1.0061, + "step": 178170 + }, + { + "epoch": 1.1383412340441843, + "grad_norm": 0.7780923843383789, + "learning_rate": 3.925333675990849e-05, + "loss": 0.9331, + "step": 178180 + }, + { + "epoch": 1.138405121193923, + "grad_norm": 1.6156859397888184, + "learning_rate": 3.924843639865531e-05, + "loss": 0.6508, + "step": 178190 + }, + { + "epoch": 1.1384690083436617, + "grad_norm": 3.4931466579437256, + "learning_rate": 3.92435361456797e-05, + "loss": 0.9637, + "step": 178200 + }, + { + "epoch": 1.1385328954934004, + "grad_norm": 1.5247029066085815, + "learning_rate": 3.9238636001030974e-05, + "loss": 0.9032, + "step": 178210 + }, + { + "epoch": 1.1385967826431391, + "grad_norm": 1.0243054628372192, + "learning_rate": 3.92337359647585e-05, + "loss": 0.928, + "step": 178220 + }, + { + "epoch": 1.1386606697928778, + "grad_norm": 0.9085797071456909, + "learning_rate": 3.922883603691162e-05, + "loss": 0.9142, + "step": 178230 + }, + { + "epoch": 1.1387245569426165, + "grad_norm": 0.8062956929206848, + "learning_rate": 3.922393621753968e-05, + "loss": 0.7967, + "step": 178240 + }, + { + "epoch": 1.1387884440923552, + "grad_norm": 0.6442756056785583, + "learning_rate": 3.921903650669202e-05, + "loss": 1.1538, + "step": 178250 + }, + { + "epoch": 1.138852331242094, + "grad_norm": 0.8777545094490051, + "learning_rate": 3.9214136904417986e-05, + "loss": 0.8731, + "step": 178260 + }, + { + "epoch": 1.1389162183918327, + "grad_norm": 1.089011311531067, + "learning_rate": 3.920923741076693e-05, + "loss": 0.6851, + "step": 178270 + }, + { + "epoch": 
1.1389801055415714, + "grad_norm": 0.9241474866867065, + "learning_rate": 3.920433802578819e-05, + "loss": 0.9525, + "step": 178280 + }, + { + "epoch": 1.13904399269131, + "grad_norm": 1.746314287185669, + "learning_rate": 3.91994387495311e-05, + "loss": 1.2368, + "step": 178290 + }, + { + "epoch": 1.1391078798410488, + "grad_norm": 0.8153021931648254, + "learning_rate": 3.919453958204502e-05, + "loss": 0.7242, + "step": 178300 + }, + { + "epoch": 1.1391717669907875, + "grad_norm": 0.8815270662307739, + "learning_rate": 3.918964052337927e-05, + "loss": 0.7458, + "step": 178310 + }, + { + "epoch": 1.1392356541405262, + "grad_norm": 0.9482192993164062, + "learning_rate": 3.918474157358318e-05, + "loss": 0.8227, + "step": 178320 + }, + { + "epoch": 1.139299541290265, + "grad_norm": 0.9908492565155029, + "learning_rate": 3.9179842732706114e-05, + "loss": 0.9984, + "step": 178330 + }, + { + "epoch": 1.1393634284400036, + "grad_norm": 0.5842317342758179, + "learning_rate": 3.917494400079738e-05, + "loss": 0.7083, + "step": 178340 + }, + { + "epoch": 1.1394273155897423, + "grad_norm": 0.8298249840736389, + "learning_rate": 3.917004537790633e-05, + "loss": 0.816, + "step": 178350 + }, + { + "epoch": 1.139491202739481, + "grad_norm": 1.1871623992919922, + "learning_rate": 3.916514686408229e-05, + "loss": 0.7508, + "step": 178360 + }, + { + "epoch": 1.1395550898892197, + "grad_norm": 0.6309062242507935, + "learning_rate": 3.916024845937459e-05, + "loss": 0.7599, + "step": 178370 + }, + { + "epoch": 1.1396189770389584, + "grad_norm": 0.6612233519554138, + "learning_rate": 3.9155350163832575e-05, + "loss": 0.8779, + "step": 178380 + }, + { + "epoch": 1.1396828641886971, + "grad_norm": 1.1284621953964233, + "learning_rate": 3.915045197750556e-05, + "loss": 1.2675, + "step": 178390 + }, + { + "epoch": 1.1397467513384358, + "grad_norm": 1.0002460479736328, + "learning_rate": 3.9145553900442886e-05, + "loss": 0.8455, + "step": 178400 + }, + { + "epoch": 1.1398106384881745, + 
"grad_norm": 0.8611626625061035, + "learning_rate": 3.914065593269387e-05, + "loss": 0.8965, + "step": 178410 + }, + { + "epoch": 1.1398745256379132, + "grad_norm": 1.169734001159668, + "learning_rate": 3.9135758074307846e-05, + "loss": 0.8153, + "step": 178420 + }, + { + "epoch": 1.139938412787652, + "grad_norm": 0.885779857635498, + "learning_rate": 3.913086032533413e-05, + "loss": 0.8775, + "step": 178430 + }, + { + "epoch": 1.1400022999373907, + "grad_norm": 0.981346607208252, + "learning_rate": 3.912596268582206e-05, + "loss": 0.8037, + "step": 178440 + }, + { + "epoch": 1.1400661870871294, + "grad_norm": 1.3203126192092896, + "learning_rate": 3.9121554903891656e-05, + "loss": 0.8425, + "step": 178450 + }, + { + "epoch": 1.140130074236868, + "grad_norm": 0.7057384848594666, + "learning_rate": 3.911665747249259e-05, + "loss": 0.7483, + "step": 178460 + }, + { + "epoch": 1.1401939613866068, + "grad_norm": 0.6951310634613037, + "learning_rate": 3.91117601506982e-05, + "loss": 1.031, + "step": 178470 + }, + { + "epoch": 1.1402578485363455, + "grad_norm": 0.8375203013420105, + "learning_rate": 3.91068629385578e-05, + "loss": 0.7288, + "step": 178480 + }, + { + "epoch": 1.1403217356860842, + "grad_norm": 0.8579121232032776, + "learning_rate": 3.910196583612071e-05, + "loss": 0.919, + "step": 178490 + }, + { + "epoch": 1.140385622835823, + "grad_norm": 1.0421398878097534, + "learning_rate": 3.909706884343625e-05, + "loss": 1.0143, + "step": 178500 + }, + { + "epoch": 1.1404495099855616, + "grad_norm": 0.5818277597427368, + "learning_rate": 3.9092171960553745e-05, + "loss": 1.0266, + "step": 178510 + }, + { + "epoch": 1.1405133971353, + "grad_norm": 1.5515040159225464, + "learning_rate": 3.908727518752251e-05, + "loss": 0.9996, + "step": 178520 + }, + { + "epoch": 1.140577284285039, + "grad_norm": 0.9544771909713745, + "learning_rate": 3.908237852439185e-05, + "loss": 0.8596, + "step": 178530 + }, + { + "epoch": 1.1406411714347775, + "grad_norm": 1.0081593990325928, + 
"learning_rate": 3.9077481971211075e-05, + "loss": 0.6763, + "step": 178540 + }, + { + "epoch": 1.1407050585845164, + "grad_norm": 0.8507857918739319, + "learning_rate": 3.907258552802951e-05, + "loss": 1.3678, + "step": 178550 + }, + { + "epoch": 1.140768945734255, + "grad_norm": 0.8438236713409424, + "learning_rate": 3.9067689194896476e-05, + "loss": 0.8784, + "step": 178560 + }, + { + "epoch": 1.1408328328839936, + "grad_norm": 0.8789088129997253, + "learning_rate": 3.906279297186125e-05, + "loss": 0.7002, + "step": 178570 + }, + { + "epoch": 1.1408967200337323, + "grad_norm": 0.8186583518981934, + "learning_rate": 3.905789685897318e-05, + "loss": 0.866, + "step": 178580 + }, + { + "epoch": 1.140960607183471, + "grad_norm": 0.7446259260177612, + "learning_rate": 3.905300085628154e-05, + "loss": 0.8398, + "step": 178590 + }, + { + "epoch": 1.1410244943332097, + "grad_norm": 1.0921506881713867, + "learning_rate": 3.9048104963835654e-05, + "loss": 1.1518, + "step": 178600 + }, + { + "epoch": 1.1410883814829484, + "grad_norm": 1.0900102853775024, + "learning_rate": 3.904320918168483e-05, + "loss": 0.8006, + "step": 178610 + }, + { + "epoch": 1.1411522686326872, + "grad_norm": 0.7984510660171509, + "learning_rate": 3.9038313509878365e-05, + "loss": 0.8165, + "step": 178620 + }, + { + "epoch": 1.1412161557824259, + "grad_norm": 0.9651395678520203, + "learning_rate": 3.9033417948465554e-05, + "loss": 1.1725, + "step": 178630 + }, + { + "epoch": 1.1412800429321646, + "grad_norm": 0.9538650512695312, + "learning_rate": 3.902852249749573e-05, + "loss": 0.7891, + "step": 178640 + }, + { + "epoch": 1.1413439300819033, + "grad_norm": 0.9638974070549011, + "learning_rate": 3.9023627157018174e-05, + "loss": 0.7188, + "step": 178650 + }, + { + "epoch": 1.141407817231642, + "grad_norm": 0.8554356694221497, + "learning_rate": 3.901873192708219e-05, + "loss": 1.0519, + "step": 178660 + }, + { + "epoch": 1.1414717043813807, + "grad_norm": 0.9228730797767639, + "learning_rate": 
3.9013836807737064e-05, + "loss": 1.0421, + "step": 178670 + }, + { + "epoch": 1.1415355915311194, + "grad_norm": 0.788002610206604, + "learning_rate": 3.9008941799032116e-05, + "loss": 0.8416, + "step": 178680 + }, + { + "epoch": 1.141599478680858, + "grad_norm": 0.4219997823238373, + "learning_rate": 3.9004046901016634e-05, + "loss": 0.843, + "step": 178690 + }, + { + "epoch": 1.1416633658305968, + "grad_norm": 0.9031209945678711, + "learning_rate": 3.899915211373991e-05, + "loss": 0.9158, + "step": 178700 + }, + { + "epoch": 1.1417272529803355, + "grad_norm": 0.7229623794555664, + "learning_rate": 3.899425743725124e-05, + "loss": 0.9196, + "step": 178710 + }, + { + "epoch": 1.1417911401300742, + "grad_norm": 0.8259024620056152, + "learning_rate": 3.898936287159992e-05, + "loss": 0.9001, + "step": 178720 + }, + { + "epoch": 1.141855027279813, + "grad_norm": 1.3349802494049072, + "learning_rate": 3.898446841683523e-05, + "loss": 0.8477, + "step": 178730 + }, + { + "epoch": 1.1419189144295516, + "grad_norm": 0.7066223621368408, + "learning_rate": 3.8979574073006485e-05, + "loss": 0.8269, + "step": 178740 + }, + { + "epoch": 1.1419828015792903, + "grad_norm": 1.2465709447860718, + "learning_rate": 3.897467984016296e-05, + "loss": 1.0771, + "step": 178750 + }, + { + "epoch": 1.142046688729029, + "grad_norm": 0.7932277321815491, + "learning_rate": 3.896978571835395e-05, + "loss": 0.7609, + "step": 178760 + }, + { + "epoch": 1.1421105758787677, + "grad_norm": 1.180996298789978, + "learning_rate": 3.8964891707628745e-05, + "loss": 0.7902, + "step": 178770 + }, + { + "epoch": 1.1421744630285064, + "grad_norm": 1.8655349016189575, + "learning_rate": 3.895999780803662e-05, + "loss": 0.6203, + "step": 178780 + }, + { + "epoch": 1.1422383501782452, + "grad_norm": 0.9006361365318298, + "learning_rate": 3.8955104019626865e-05, + "loss": 0.9835, + "step": 178790 + }, + { + "epoch": 1.1423022373279839, + "grad_norm": 1.117305040359497, + "learning_rate": 3.895021034244878e-05, + 
"loss": 0.8953, + "step": 178800 + }, + { + "epoch": 1.1423661244777226, + "grad_norm": 1.0132900476455688, + "learning_rate": 3.894531677655162e-05, + "loss": 0.9854, + "step": 178810 + }, + { + "epoch": 1.1424300116274613, + "grad_norm": 1.055531620979309, + "learning_rate": 3.89404233219847e-05, + "loss": 0.551, + "step": 178820 + }, + { + "epoch": 1.1424938987772, + "grad_norm": 1.010424256324768, + "learning_rate": 3.893552997879727e-05, + "loss": 1.0174, + "step": 178830 + }, + { + "epoch": 1.1425577859269387, + "grad_norm": 0.947930097579956, + "learning_rate": 3.893063674703862e-05, + "loss": 0.9078, + "step": 178840 + }, + { + "epoch": 1.1426216730766774, + "grad_norm": 1.0680005550384521, + "learning_rate": 3.892574362675805e-05, + "loss": 0.9613, + "step": 178850 + }, + { + "epoch": 1.142685560226416, + "grad_norm": 1.8149521350860596, + "learning_rate": 3.8920850618004825e-05, + "loss": 0.7425, + "step": 178860 + }, + { + "epoch": 1.1427494473761548, + "grad_norm": 1.3183022737503052, + "learning_rate": 3.891595772082821e-05, + "loss": 1.0288, + "step": 178870 + }, + { + "epoch": 1.1428133345258935, + "grad_norm": 0.9927735924720764, + "learning_rate": 3.891106493527749e-05, + "loss": 0.6818, + "step": 178880 + }, + { + "epoch": 1.1428772216756322, + "grad_norm": 0.6876515746116638, + "learning_rate": 3.8906172261401944e-05, + "loss": 1.0061, + "step": 178890 + }, + { + "epoch": 1.142941108825371, + "grad_norm": 0.6178897619247437, + "learning_rate": 3.8901279699250833e-05, + "loss": 0.7599, + "step": 178900 + }, + { + "epoch": 1.1430049959751096, + "grad_norm": 0.7962726950645447, + "learning_rate": 3.889638724887345e-05, + "loss": 0.9557, + "step": 178910 + }, + { + "epoch": 1.1430688831248483, + "grad_norm": 1.6612660884857178, + "learning_rate": 3.889149491031905e-05, + "loss": 0.9919, + "step": 178920 + }, + { + "epoch": 1.143132770274587, + "grad_norm": 2.1932833194732666, + "learning_rate": 3.88866026836369e-05, + "loss": 0.9959, + "step": 178930 
+ }, + { + "epoch": 1.1431966574243257, + "grad_norm": 1.0773873329162598, + "learning_rate": 3.888171056887628e-05, + "loss": 0.9211, + "step": 178940 + }, + { + "epoch": 1.1432605445740645, + "grad_norm": 0.9403598308563232, + "learning_rate": 3.887681856608646e-05, + "loss": 0.7464, + "step": 178950 + }, + { + "epoch": 1.1433244317238032, + "grad_norm": 0.6172276139259338, + "learning_rate": 3.8871926675316696e-05, + "loss": 0.8862, + "step": 178960 + }, + { + "epoch": 1.1433883188735419, + "grad_norm": 0.8478199243545532, + "learning_rate": 3.886703489661625e-05, + "loss": 1.1241, + "step": 178970 + }, + { + "epoch": 1.1434522060232806, + "grad_norm": 1.059793472290039, + "learning_rate": 3.8862143230034395e-05, + "loss": 0.8413, + "step": 178980 + }, + { + "epoch": 1.143516093173019, + "grad_norm": 1.063503384590149, + "learning_rate": 3.88572516756204e-05, + "loss": 0.6765, + "step": 178990 + }, + { + "epoch": 1.143579980322758, + "grad_norm": 1.4878591299057007, + "learning_rate": 3.8852360233423515e-05, + "loss": 1.0485, + "step": 179000 + }, + { + "epoch": 1.1436438674724965, + "grad_norm": 0.5734277367591858, + "learning_rate": 3.884746890349301e-05, + "loss": 0.9841, + "step": 179010 + }, + { + "epoch": 1.1437077546222354, + "grad_norm": 1.1828947067260742, + "learning_rate": 3.8842577685878136e-05, + "loss": 1.053, + "step": 179020 + }, + { + "epoch": 1.1437716417719739, + "grad_norm": 0.7959204316139221, + "learning_rate": 3.883768658062816e-05, + "loss": 0.8519, + "step": 179030 + }, + { + "epoch": 1.1438355289217128, + "grad_norm": 1.2506985664367676, + "learning_rate": 3.883279558779234e-05, + "loss": 0.8219, + "step": 179040 + }, + { + "epoch": 1.1438994160714513, + "grad_norm": 0.72293621301651, + "learning_rate": 3.882790470741993e-05, + "loss": 1.0947, + "step": 179050 + }, + { + "epoch": 1.14396330322119, + "grad_norm": 1.0498420000076294, + "learning_rate": 3.8823013939560177e-05, + "loss": 0.8723, + "step": 179060 + }, + { + "epoch": 
1.1440271903709287, + "grad_norm": 0.7280313372612, + "learning_rate": 3.881812328426234e-05, + "loss": 0.8269, + "step": 179070 + }, + { + "epoch": 1.1440910775206674, + "grad_norm": 0.8474599719047546, + "learning_rate": 3.881323274157569e-05, + "loss": 1.0128, + "step": 179080 + }, + { + "epoch": 1.1441549646704061, + "grad_norm": 0.9039817452430725, + "learning_rate": 3.880834231154946e-05, + "loss": 0.8717, + "step": 179090 + }, + { + "epoch": 1.1442188518201448, + "grad_norm": 2.04731822013855, + "learning_rate": 3.8803451994232896e-05, + "loss": 0.754, + "step": 179100 + }, + { + "epoch": 1.1442827389698835, + "grad_norm": 0.7595462203025818, + "learning_rate": 3.879856178967526e-05, + "loss": 0.7114, + "step": 179110 + }, + { + "epoch": 1.1443466261196222, + "grad_norm": 1.808081030845642, + "learning_rate": 3.87936716979258e-05, + "loss": 0.7817, + "step": 179120 + }, + { + "epoch": 1.144410513269361, + "grad_norm": 0.9245400428771973, + "learning_rate": 3.8788781719033765e-05, + "loss": 0.7758, + "step": 179130 + }, + { + "epoch": 1.1444744004190996, + "grad_norm": 0.8909382224082947, + "learning_rate": 3.8783891853048394e-05, + "loss": 1.1851, + "step": 179140 + }, + { + "epoch": 1.1445382875688384, + "grad_norm": 0.8799207210540771, + "learning_rate": 3.877900210001893e-05, + "loss": 0.8381, + "step": 179150 + }, + { + "epoch": 1.144602174718577, + "grad_norm": 2.0935261249542236, + "learning_rate": 3.877411245999462e-05, + "loss": 0.5351, + "step": 179160 + }, + { + "epoch": 1.1446660618683158, + "grad_norm": 0.7175489664077759, + "learning_rate": 3.8769222933024716e-05, + "loss": 0.8042, + "step": 179170 + }, + { + "epoch": 1.1447299490180545, + "grad_norm": 0.9167541861534119, + "learning_rate": 3.8764333519158455e-05, + "loss": 0.919, + "step": 179180 + }, + { + "epoch": 1.1447938361677932, + "grad_norm": 0.929903507232666, + "learning_rate": 3.8759444218445075e-05, + "loss": 1.0032, + "step": 179190 + }, + { + "epoch": 1.1448577233175319, + 
"grad_norm": 0.9965102076530457, + "learning_rate": 3.875455503093381e-05, + "loss": 0.9498, + "step": 179200 + }, + { + "epoch": 1.1449216104672706, + "grad_norm": 0.677137553691864, + "learning_rate": 3.874966595667391e-05, + "loss": 0.8057, + "step": 179210 + }, + { + "epoch": 1.1449854976170093, + "grad_norm": 1.0251044034957886, + "learning_rate": 3.874477699571461e-05, + "loss": 0.7497, + "step": 179220 + }, + { + "epoch": 1.145049384766748, + "grad_norm": 0.7787938117980957, + "learning_rate": 3.873988814810514e-05, + "loss": 0.8835, + "step": 179230 + }, + { + "epoch": 1.1451132719164867, + "grad_norm": 0.9648923277854919, + "learning_rate": 3.873499941389474e-05, + "loss": 0.874, + "step": 179240 + }, + { + "epoch": 1.1451771590662254, + "grad_norm": 0.9109137058258057, + "learning_rate": 3.8730110793132634e-05, + "loss": 1.028, + "step": 179250 + }, + { + "epoch": 1.1452410462159641, + "grad_norm": 1.3188549280166626, + "learning_rate": 3.872522228586807e-05, + "loss": 0.9197, + "step": 179260 + }, + { + "epoch": 1.1453049333657028, + "grad_norm": 0.8881723284721375, + "learning_rate": 3.8720333892150265e-05, + "loss": 0.7404, + "step": 179270 + }, + { + "epoch": 1.1453688205154415, + "grad_norm": 0.8248503804206848, + "learning_rate": 3.871544561202846e-05, + "loss": 0.8751, + "step": 179280 + }, + { + "epoch": 1.1454327076651802, + "grad_norm": 1.1585686206817627, + "learning_rate": 3.8710557445551884e-05, + "loss": 0.8279, + "step": 179290 + }, + { + "epoch": 1.145496594814919, + "grad_norm": 0.8697220087051392, + "learning_rate": 3.870566939276975e-05, + "loss": 0.9178, + "step": 179300 + }, + { + "epoch": 1.1455604819646577, + "grad_norm": 0.8634340167045593, + "learning_rate": 3.870078145373131e-05, + "loss": 0.8718, + "step": 179310 + }, + { + "epoch": 1.1456243691143964, + "grad_norm": 0.6686280965805054, + "learning_rate": 3.8695893628485766e-05, + "loss": 0.7065, + "step": 179320 + }, + { + "epoch": 1.145688256264135, + "grad_norm": 
0.8824241161346436, + "learning_rate": 3.8691005917082355e-05, + "loss": 0.835, + "step": 179330 + }, + { + "epoch": 1.1457521434138738, + "grad_norm": 1.1661261320114136, + "learning_rate": 3.86861183195703e-05, + "loss": 0.8727, + "step": 179340 + }, + { + "epoch": 1.1458160305636125, + "grad_norm": 2.5761728286743164, + "learning_rate": 3.868123083599882e-05, + "loss": 0.7874, + "step": 179350 + }, + { + "epoch": 1.1458799177133512, + "grad_norm": 1.544651985168457, + "learning_rate": 3.867634346641713e-05, + "loss": 0.8582, + "step": 179360 + }, + { + "epoch": 1.1459438048630899, + "grad_norm": 0.9821982979774475, + "learning_rate": 3.867145621087447e-05, + "loss": 0.9806, + "step": 179370 + }, + { + "epoch": 1.1460076920128286, + "grad_norm": 1.0123443603515625, + "learning_rate": 3.866656906942004e-05, + "loss": 1.0326, + "step": 179380 + }, + { + "epoch": 1.1460715791625673, + "grad_norm": 0.7034146785736084, + "learning_rate": 3.866168204210307e-05, + "loss": 0.9303, + "step": 179390 + }, + { + "epoch": 1.146135466312306, + "grad_norm": 0.9089710712432861, + "learning_rate": 3.865679512897276e-05, + "loss": 0.9674, + "step": 179400 + }, + { + "epoch": 1.1461993534620447, + "grad_norm": 0.6943764686584473, + "learning_rate": 3.865190833007835e-05, + "loss": 0.6687, + "step": 179410 + }, + { + "epoch": 1.1462632406117834, + "grad_norm": 0.9293021559715271, + "learning_rate": 3.8647021645469025e-05, + "loss": 0.9211, + "step": 179420 + }, + { + "epoch": 1.1463271277615221, + "grad_norm": 1.3763326406478882, + "learning_rate": 3.8642135075194045e-05, + "loss": 0.9414, + "step": 179430 + }, + { + "epoch": 1.1463910149112608, + "grad_norm": 0.9421727657318115, + "learning_rate": 3.863724861930258e-05, + "loss": 0.9874, + "step": 179440 + }, + { + "epoch": 1.1464549020609995, + "grad_norm": 1.2408033609390259, + "learning_rate": 3.863236227784383e-05, + "loss": 0.9334, + "step": 179450 + }, + { + "epoch": 1.1465187892107382, + "grad_norm": 0.9744552373886108, + 
"learning_rate": 3.862747605086704e-05, + "loss": 0.8964, + "step": 179460 + }, + { + "epoch": 1.146582676360477, + "grad_norm": 0.6820455193519592, + "learning_rate": 3.86225899384214e-05, + "loss": 0.6913, + "step": 179470 + }, + { + "epoch": 1.1466465635102154, + "grad_norm": 0.6710923910140991, + "learning_rate": 3.8617703940556114e-05, + "loss": 0.9485, + "step": 179480 + }, + { + "epoch": 1.1467104506599544, + "grad_norm": 0.6139732003211975, + "learning_rate": 3.861281805732041e-05, + "loss": 0.8372, + "step": 179490 + }, + { + "epoch": 1.1467743378096928, + "grad_norm": 0.9873855113983154, + "learning_rate": 3.8607932288763473e-05, + "loss": 0.98, + "step": 179500 + }, + { + "epoch": 1.1468382249594318, + "grad_norm": 0.5885666608810425, + "learning_rate": 3.860353519515323e-05, + "loss": 1.0115, + "step": 179510 + }, + { + "epoch": 1.1469021121091703, + "grad_norm": 1.1420519351959229, + "learning_rate": 3.859864964462152e-05, + "loss": 0.8411, + "step": 179520 + }, + { + "epoch": 1.1469659992589092, + "grad_norm": 1.1546262502670288, + "learning_rate": 3.859376420891128e-05, + "loss": 0.8978, + "step": 179530 + }, + { + "epoch": 1.1470298864086477, + "grad_norm": 1.3471343517303467, + "learning_rate": 3.858887888807169e-05, + "loss": 0.9926, + "step": 179540 + }, + { + "epoch": 1.1470937735583864, + "grad_norm": 0.7748019695281982, + "learning_rate": 3.858399368215197e-05, + "loss": 1.0838, + "step": 179550 + }, + { + "epoch": 1.147157660708125, + "grad_norm": 0.711402416229248, + "learning_rate": 3.857910859120131e-05, + "loss": 0.804, + "step": 179560 + }, + { + "epoch": 1.1472215478578638, + "grad_norm": 0.8038139939308167, + "learning_rate": 3.85742236152689e-05, + "loss": 0.733, + "step": 179570 + }, + { + "epoch": 1.1472854350076025, + "grad_norm": 2.674290895462036, + "learning_rate": 3.8569338754403966e-05, + "loss": 1.0378, + "step": 179580 + }, + { + "epoch": 1.1473493221573412, + "grad_norm": 1.2918055057525635, + "learning_rate": 
3.856445400865566e-05, + "loss": 1.0691, + "step": 179590 + }, + { + "epoch": 1.14741320930708, + "grad_norm": 0.8749229311943054, + "learning_rate": 3.855956937807319e-05, + "loss": 0.8179, + "step": 179600 + }, + { + "epoch": 1.1474770964568186, + "grad_norm": 2.0050618648529053, + "learning_rate": 3.8554684862705755e-05, + "loss": 1.077, + "step": 179610 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.6908048391342163, + "learning_rate": 3.8549800462602546e-05, + "loss": 0.985, + "step": 179620 + }, + { + "epoch": 1.147604870756296, + "grad_norm": 1.2195981740951538, + "learning_rate": 3.8544916177812756e-05, + "loss": 0.8393, + "step": 179630 + }, + { + "epoch": 1.1476687579060347, + "grad_norm": 0.7860914468765259, + "learning_rate": 3.854003200838557e-05, + "loss": 0.855, + "step": 179640 + }, + { + "epoch": 1.1477326450557734, + "grad_norm": 0.6682232618331909, + "learning_rate": 3.8535147954370174e-05, + "loss": 0.9404, + "step": 179650 + }, + { + "epoch": 1.1477965322055121, + "grad_norm": 1.3988233804702759, + "learning_rate": 3.853026401581576e-05, + "loss": 0.9323, + "step": 179660 + }, + { + "epoch": 1.1478604193552508, + "grad_norm": 0.9596664309501648, + "learning_rate": 3.852538019277151e-05, + "loss": 0.9515, + "step": 179670 + }, + { + "epoch": 1.1479243065049896, + "grad_norm": 1.8466547727584839, + "learning_rate": 3.85204964852866e-05, + "loss": 1.0884, + "step": 179680 + }, + { + "epoch": 1.1479881936547283, + "grad_norm": 0.7646600604057312, + "learning_rate": 3.851561289341023e-05, + "loss": 1.0537, + "step": 179690 + }, + { + "epoch": 1.148052080804467, + "grad_norm": 0.6700149178504944, + "learning_rate": 3.851072941719157e-05, + "loss": 0.8711, + "step": 179700 + }, + { + "epoch": 1.1481159679542057, + "grad_norm": 1.687111258506775, + "learning_rate": 3.8505846056679805e-05, + "loss": 0.6716, + "step": 179710 + }, + { + "epoch": 1.1481798551039444, + "grad_norm": 0.8387539982795715, + "learning_rate": 3.850096281192412e-05, + 
"loss": 1.1118, + "step": 179720 + }, + { + "epoch": 1.148243742253683, + "grad_norm": 1.3596510887145996, + "learning_rate": 3.8496079682973685e-05, + "loss": 0.8337, + "step": 179730 + }, + { + "epoch": 1.1483076294034218, + "grad_norm": 2.704559803009033, + "learning_rate": 3.849119666987767e-05, + "loss": 1.0053, + "step": 179740 + }, + { + "epoch": 1.1483715165531605, + "grad_norm": 0.7268739938735962, + "learning_rate": 3.8486313772685274e-05, + "loss": 1.0139, + "step": 179750 + }, + { + "epoch": 1.1484354037028992, + "grad_norm": 1.4763686656951904, + "learning_rate": 3.848143099144566e-05, + "loss": 0.8677, + "step": 179760 + }, + { + "epoch": 1.148499290852638, + "grad_norm": 1.2906302213668823, + "learning_rate": 3.847654832620798e-05, + "loss": 0.9295, + "step": 179770 + }, + { + "epoch": 1.1485631780023766, + "grad_norm": 0.7881545424461365, + "learning_rate": 3.847166577702145e-05, + "loss": 0.967, + "step": 179780 + }, + { + "epoch": 1.1486270651521153, + "grad_norm": 0.7793990969657898, + "learning_rate": 3.846678334393521e-05, + "loss": 0.8536, + "step": 179790 + }, + { + "epoch": 1.148690952301854, + "grad_norm": 0.7015102505683899, + "learning_rate": 3.8461901026998424e-05, + "loss": 0.766, + "step": 179800 + }, + { + "epoch": 1.1487548394515927, + "grad_norm": 0.8594871163368225, + "learning_rate": 3.84570188262603e-05, + "loss": 0.9288, + "step": 179810 + }, + { + "epoch": 1.1488187266013314, + "grad_norm": 0.8671073317527771, + "learning_rate": 3.845213674176997e-05, + "loss": 0.8234, + "step": 179820 + }, + { + "epoch": 1.1488826137510701, + "grad_norm": 1.8361603021621704, + "learning_rate": 3.8447254773576625e-05, + "loss": 0.8157, + "step": 179830 + }, + { + "epoch": 1.1489465009008089, + "grad_norm": 0.8427587151527405, + "learning_rate": 3.844237292172941e-05, + "loss": 0.8628, + "step": 179840 + }, + { + "epoch": 1.1490103880505476, + "grad_norm": 0.4868806004524231, + "learning_rate": 3.84374911862775e-05, + "loss": 0.8923, + "step": 
179850 + }, + { + "epoch": 1.1490742752002863, + "grad_norm": 0.9738015532493591, + "learning_rate": 3.843260956727006e-05, + "loss": 0.8577, + "step": 179860 + }, + { + "epoch": 1.149138162350025, + "grad_norm": 0.9108662009239197, + "learning_rate": 3.8427728064756246e-05, + "loss": 0.697, + "step": 179870 + }, + { + "epoch": 1.1492020494997637, + "grad_norm": 0.8939322233200073, + "learning_rate": 3.842284667878522e-05, + "loss": 0.8281, + "step": 179880 + }, + { + "epoch": 1.1492659366495024, + "grad_norm": 2.0734434127807617, + "learning_rate": 3.8417965409406146e-05, + "loss": 0.8891, + "step": 179890 + }, + { + "epoch": 1.149329823799241, + "grad_norm": 1.4640614986419678, + "learning_rate": 3.8413084256668184e-05, + "loss": 0.9592, + "step": 179900 + }, + { + "epoch": 1.1493937109489798, + "grad_norm": 1.422544002532959, + "learning_rate": 3.840820322062048e-05, + "loss": 0.7184, + "step": 179910 + }, + { + "epoch": 1.1494575980987185, + "grad_norm": 0.8664306402206421, + "learning_rate": 3.8403322301312204e-05, + "loss": 0.9412, + "step": 179920 + }, + { + "epoch": 1.1495214852484572, + "grad_norm": 0.8996186852455139, + "learning_rate": 3.83984414987925e-05, + "loss": 0.7447, + "step": 179930 + }, + { + "epoch": 1.149585372398196, + "grad_norm": 0.9673845171928406, + "learning_rate": 3.839356081311053e-05, + "loss": 0.8512, + "step": 179940 + }, + { + "epoch": 1.1496492595479346, + "grad_norm": 2.0374224185943604, + "learning_rate": 3.8388680244315445e-05, + "loss": 0.9246, + "step": 179950 + }, + { + "epoch": 1.1497131466976733, + "grad_norm": 0.8047225475311279, + "learning_rate": 3.83837997924564e-05, + "loss": 0.8539, + "step": 179960 + }, + { + "epoch": 1.1497770338474118, + "grad_norm": 0.9088835120201111, + "learning_rate": 3.837891945758253e-05, + "loss": 1.0117, + "step": 179970 + }, + { + "epoch": 1.1498409209971507, + "grad_norm": 0.5110180974006653, + "learning_rate": 3.8374039239743e-05, + "loss": 1.0974, + "step": 179980 + }, + { + "epoch": 
1.1499048081468892, + "grad_norm": 0.6761854887008667, + "learning_rate": 3.8369159138986946e-05, + "loss": 0.9451, + "step": 179990 + }, + { + "epoch": 1.1499686952966282, + "grad_norm": 0.8277345895767212, + "learning_rate": 3.836427915536353e-05, + "loss": 0.9262, + "step": 180000 + }, + { + "epoch": 1.1500325824463666, + "grad_norm": 0.847381055355072, + "learning_rate": 3.8359399288921876e-05, + "loss": 0.8348, + "step": 180010 + }, + { + "epoch": 1.1500964695961056, + "grad_norm": 0.8218114972114563, + "learning_rate": 3.835451953971115e-05, + "loss": 0.7931, + "step": 180020 + }, + { + "epoch": 1.150160356745844, + "grad_norm": 1.1525636911392212, + "learning_rate": 3.834963990778049e-05, + "loss": 0.9558, + "step": 180030 + }, + { + "epoch": 1.1502242438955828, + "grad_norm": 3.6466426849365234, + "learning_rate": 3.8344760393179036e-05, + "loss": 0.9922, + "step": 180040 + }, + { + "epoch": 1.1502881310453215, + "grad_norm": 0.820673406124115, + "learning_rate": 3.833988099595593e-05, + "loss": 0.9471, + "step": 180050 + }, + { + "epoch": 1.1503520181950602, + "grad_norm": 3.4960498809814453, + "learning_rate": 3.83350017161603e-05, + "loss": 0.7328, + "step": 180060 + }, + { + "epoch": 1.1504159053447989, + "grad_norm": 1.1338176727294922, + "learning_rate": 3.833012255384132e-05, + "loss": 1.0169, + "step": 180070 + }, + { + "epoch": 1.1504797924945376, + "grad_norm": 0.9621381163597107, + "learning_rate": 3.8325243509048087e-05, + "loss": 0.8626, + "step": 180080 + }, + { + "epoch": 1.1505436796442763, + "grad_norm": 0.5299105644226074, + "learning_rate": 3.832036458182975e-05, + "loss": 0.9519, + "step": 180090 + }, + { + "epoch": 1.150607566794015, + "grad_norm": 1.7963881492614746, + "learning_rate": 3.831548577223544e-05, + "loss": 1.0677, + "step": 180100 + }, + { + "epoch": 1.1506714539437537, + "grad_norm": 1.3347585201263428, + "learning_rate": 3.83106070803143e-05, + "loss": 0.8912, + "step": 180110 + }, + { + "epoch": 1.1507353410934924, + 
"grad_norm": 1.0304107666015625, + "learning_rate": 3.8305728506115466e-05, + "loss": 0.7527, + "step": 180120 + }, + { + "epoch": 1.150799228243231, + "grad_norm": 0.5389941334724426, + "learning_rate": 3.8300850049688045e-05, + "loss": 0.7198, + "step": 180130 + }, + { + "epoch": 1.1508631153929698, + "grad_norm": 0.7622451186180115, + "learning_rate": 3.829597171108119e-05, + "loss": 0.8252, + "step": 180140 + }, + { + "epoch": 1.1509270025427085, + "grad_norm": 0.707021176815033, + "learning_rate": 3.829109349034403e-05, + "loss": 0.879, + "step": 180150 + }, + { + "epoch": 1.1509908896924472, + "grad_norm": 1.0913118124008179, + "learning_rate": 3.828621538752569e-05, + "loss": 0.9587, + "step": 180160 + }, + { + "epoch": 1.151054776842186, + "grad_norm": 0.8637599349021912, + "learning_rate": 3.82813374026753e-05, + "loss": 0.8042, + "step": 180170 + }, + { + "epoch": 1.1511186639919246, + "grad_norm": 1.4673819541931152, + "learning_rate": 3.827645953584197e-05, + "loss": 1.0232, + "step": 180180 + }, + { + "epoch": 1.1511825511416633, + "grad_norm": 0.8439252972602844, + "learning_rate": 3.827158178707484e-05, + "loss": 0.8032, + "step": 180190 + }, + { + "epoch": 1.151246438291402, + "grad_norm": 0.743141233921051, + "learning_rate": 3.826670415642303e-05, + "loss": 0.8605, + "step": 180200 + }, + { + "epoch": 1.1513103254411408, + "grad_norm": 1.2738887071609497, + "learning_rate": 3.826182664393566e-05, + "loss": 0.9834, + "step": 180210 + }, + { + "epoch": 1.1513742125908795, + "grad_norm": 1.370021104812622, + "learning_rate": 3.825694924966185e-05, + "loss": 0.9888, + "step": 180220 + }, + { + "epoch": 1.1514380997406182, + "grad_norm": 0.7736698389053345, + "learning_rate": 3.825207197365072e-05, + "loss": 1.0524, + "step": 180230 + }, + { + "epoch": 1.1515019868903569, + "grad_norm": 0.7036052346229553, + "learning_rate": 3.8247194815951384e-05, + "loss": 0.8803, + "step": 180240 + }, + { + "epoch": 1.1515658740400956, + "grad_norm": 
0.7617993950843811, + "learning_rate": 3.824231777661297e-05, + "loss": 0.9111, + "step": 180250 + }, + { + "epoch": 1.1516297611898343, + "grad_norm": 0.7389393448829651, + "learning_rate": 3.8237440855684586e-05, + "loss": 0.8772, + "step": 180260 + }, + { + "epoch": 1.151693648339573, + "grad_norm": 0.9305931925773621, + "learning_rate": 3.8232564053215345e-05, + "loss": 0.8958, + "step": 180270 + }, + { + "epoch": 1.1517575354893117, + "grad_norm": 0.979669988155365, + "learning_rate": 3.8227687369254375e-05, + "loss": 0.7242, + "step": 180280 + }, + { + "epoch": 1.1518214226390504, + "grad_norm": 0.8320657014846802, + "learning_rate": 3.8222810803850764e-05, + "loss": 0.8644, + "step": 180290 + }, + { + "epoch": 1.1518853097887891, + "grad_norm": 0.5598975419998169, + "learning_rate": 3.821793435705364e-05, + "loss": 0.6644, + "step": 180300 + }, + { + "epoch": 1.1519491969385278, + "grad_norm": 1.0118290185928345, + "learning_rate": 3.821305802891212e-05, + "loss": 0.8266, + "step": 180310 + }, + { + "epoch": 1.1520130840882665, + "grad_norm": 0.8727541565895081, + "learning_rate": 3.820818181947529e-05, + "loss": 0.9033, + "step": 180320 + }, + { + "epoch": 1.1520769712380052, + "grad_norm": 1.116168737411499, + "learning_rate": 3.8203305728792265e-05, + "loss": 0.8494, + "step": 180330 + }, + { + "epoch": 1.152140858387744, + "grad_norm": 1.2078253030776978, + "learning_rate": 3.819842975691217e-05, + "loss": 0.8909, + "step": 180340 + }, + { + "epoch": 1.1522047455374826, + "grad_norm": 1.025717854499817, + "learning_rate": 3.8193553903884096e-05, + "loss": 0.8589, + "step": 180350 + }, + { + "epoch": 1.1522686326872214, + "grad_norm": 0.9668052792549133, + "learning_rate": 3.818867816975715e-05, + "loss": 0.6721, + "step": 180360 + }, + { + "epoch": 1.15233251983696, + "grad_norm": 0.5999698638916016, + "learning_rate": 3.818380255458043e-05, + "loss": 0.9891, + "step": 180370 + }, + { + "epoch": 1.1523964069866988, + "grad_norm": 0.9243110418319702, + 
"learning_rate": 3.817892705840304e-05, + "loss": 0.9535, + "step": 180380 + }, + { + "epoch": 1.1524602941364375, + "grad_norm": 0.7655467391014099, + "learning_rate": 3.817405168127408e-05, + "loss": 1.1042, + "step": 180390 + }, + { + "epoch": 1.1525241812861762, + "grad_norm": 1.5217201709747314, + "learning_rate": 3.816917642324265e-05, + "loss": 0.8361, + "step": 180400 + }, + { + "epoch": 1.1525880684359149, + "grad_norm": 0.9932133555412292, + "learning_rate": 3.816430128435786e-05, + "loss": 0.9629, + "step": 180410 + }, + { + "epoch": 1.1526519555856536, + "grad_norm": 0.9760572910308838, + "learning_rate": 3.8159426264668784e-05, + "loss": 0.8349, + "step": 180420 + }, + { + "epoch": 1.1527158427353923, + "grad_norm": 1.3286035060882568, + "learning_rate": 3.815455136422453e-05, + "loss": 0.7393, + "step": 180430 + }, + { + "epoch": 1.152779729885131, + "grad_norm": 1.8230654001235962, + "learning_rate": 3.814967658307419e-05, + "loss": 0.721, + "step": 180440 + }, + { + "epoch": 1.1528436170348697, + "grad_norm": 2.0647432804107666, + "learning_rate": 3.8144801921266864e-05, + "loss": 0.7575, + "step": 180450 + }, + { + "epoch": 1.1529075041846082, + "grad_norm": 1.0009099245071411, + "learning_rate": 3.813992737885164e-05, + "loss": 0.998, + "step": 180460 + }, + { + "epoch": 1.1529713913343471, + "grad_norm": 0.806215763092041, + "learning_rate": 3.8135052955877605e-05, + "loss": 0.9999, + "step": 180470 + }, + { + "epoch": 1.1530352784840856, + "grad_norm": 0.6237285733222961, + "learning_rate": 3.813017865239385e-05, + "loss": 0.689, + "step": 180480 + }, + { + "epoch": 1.1530991656338245, + "grad_norm": 0.7346072793006897, + "learning_rate": 3.812530446844946e-05, + "loss": 0.9834, + "step": 180490 + }, + { + "epoch": 1.153163052783563, + "grad_norm": 1.3192369937896729, + "learning_rate": 3.812043040409354e-05, + "loss": 0.8756, + "step": 180500 + }, + { + "epoch": 1.1532269399333017, + "grad_norm": 1.162412166595459, + "learning_rate": 
3.811555645937516e-05, + "loss": 0.9242, + "step": 180510 + }, + { + "epoch": 1.1532908270830404, + "grad_norm": 0.7788326740264893, + "learning_rate": 3.81106826343434e-05, + "loss": 0.9619, + "step": 180520 + }, + { + "epoch": 1.1533547142327791, + "grad_norm": 1.02617347240448, + "learning_rate": 3.810580892904735e-05, + "loss": 0.7565, + "step": 180530 + }, + { + "epoch": 1.1534186013825178, + "grad_norm": 0.9374564290046692, + "learning_rate": 3.8100935343536094e-05, + "loss": 0.8091, + "step": 180540 + }, + { + "epoch": 1.1534824885322565, + "grad_norm": 1.0165518522262573, + "learning_rate": 3.809606187785874e-05, + "loss": 1.1619, + "step": 180550 + }, + { + "epoch": 1.1535463756819953, + "grad_norm": 0.9394980072975159, + "learning_rate": 3.809118853206432e-05, + "loss": 0.7875, + "step": 180560 + }, + { + "epoch": 1.153610262831734, + "grad_norm": 0.8749181628227234, + "learning_rate": 3.808631530620194e-05, + "loss": 0.8106, + "step": 180570 + }, + { + "epoch": 1.1536741499814727, + "grad_norm": 0.6072586178779602, + "learning_rate": 3.808144220032066e-05, + "loss": 1.0034, + "step": 180580 + }, + { + "epoch": 1.1537380371312114, + "grad_norm": 1.6008857488632202, + "learning_rate": 3.807656921446957e-05, + "loss": 0.8974, + "step": 180590 + }, + { + "epoch": 1.15380192428095, + "grad_norm": 1.2123829126358032, + "learning_rate": 3.807169634869775e-05, + "loss": 0.7672, + "step": 180600 + }, + { + "epoch": 1.1538658114306888, + "grad_norm": 0.917658269405365, + "learning_rate": 3.806682360305427e-05, + "loss": 0.8276, + "step": 180610 + }, + { + "epoch": 1.1539296985804275, + "grad_norm": 0.9251102209091187, + "learning_rate": 3.806195097758819e-05, + "loss": 1.0129, + "step": 180620 + }, + { + "epoch": 1.1539935857301662, + "grad_norm": 1.3319838047027588, + "learning_rate": 3.805707847234859e-05, + "loss": 0.7021, + "step": 180630 + }, + { + "epoch": 1.154057472879905, + "grad_norm": 0.8594711422920227, + "learning_rate": 3.805220608738456e-05, + 
"loss": 0.9549, + "step": 180640 + }, + { + "epoch": 1.1541213600296436, + "grad_norm": 1.0314610004425049, + "learning_rate": 3.8047333822745126e-05, + "loss": 0.8582, + "step": 180650 + }, + { + "epoch": 1.1541852471793823, + "grad_norm": 1.0455981492996216, + "learning_rate": 3.804246167847939e-05, + "loss": 1.2074, + "step": 180660 + }, + { + "epoch": 1.154249134329121, + "grad_norm": 1.1809886693954468, + "learning_rate": 3.803758965463641e-05, + "loss": 0.6409, + "step": 180670 + }, + { + "epoch": 1.1543130214788597, + "grad_norm": 0.782289981842041, + "learning_rate": 3.803271775126525e-05, + "loss": 0.817, + "step": 180680 + }, + { + "epoch": 1.1543769086285984, + "grad_norm": 0.8476322293281555, + "learning_rate": 3.802784596841499e-05, + "loss": 0.6469, + "step": 180690 + }, + { + "epoch": 1.1544407957783371, + "grad_norm": 1.193834662437439, + "learning_rate": 3.802297430613467e-05, + "loss": 1.1158, + "step": 180700 + }, + { + "epoch": 1.1545046829280758, + "grad_norm": 0.7722360491752625, + "learning_rate": 3.801810276447336e-05, + "loss": 0.84, + "step": 180710 + }, + { + "epoch": 1.1545685700778145, + "grad_norm": 1.5033289194107056, + "learning_rate": 3.8013231343480116e-05, + "loss": 1.0485, + "step": 180720 + }, + { + "epoch": 1.1546324572275533, + "grad_norm": 1.234089732170105, + "learning_rate": 3.8008360043204004e-05, + "loss": 0.9439, + "step": 180730 + }, + { + "epoch": 1.154696344377292, + "grad_norm": 1.0235389471054077, + "learning_rate": 3.800348886369409e-05, + "loss": 0.7474, + "step": 180740 + }, + { + "epoch": 1.1547602315270307, + "grad_norm": 1.125136137008667, + "learning_rate": 3.799861780499941e-05, + "loss": 1.1013, + "step": 180750 + }, + { + "epoch": 1.1548241186767694, + "grad_norm": 0.9494902491569519, + "learning_rate": 3.7993746867169036e-05, + "loss": 0.694, + "step": 180760 + }, + { + "epoch": 1.154888005826508, + "grad_norm": 1.7362793684005737, + "learning_rate": 3.798887605025202e-05, + "loss": 1.2714, + "step": 
180770 + }, + { + "epoch": 1.1549518929762468, + "grad_norm": 1.3733989000320435, + "learning_rate": 3.798400535429741e-05, + "loss": 0.659, + "step": 180780 + }, + { + "epoch": 1.1550157801259855, + "grad_norm": 1.4269659519195557, + "learning_rate": 3.797913477935426e-05, + "loss": 0.9194, + "step": 180790 + }, + { + "epoch": 1.1550796672757242, + "grad_norm": 0.7260159850120544, + "learning_rate": 3.7974264325471625e-05, + "loss": 0.9957, + "step": 180800 + }, + { + "epoch": 1.155143554425463, + "grad_norm": 0.8534484505653381, + "learning_rate": 3.7969393992698555e-05, + "loss": 0.8095, + "step": 180810 + }, + { + "epoch": 1.1552074415752016, + "grad_norm": 1.1256579160690308, + "learning_rate": 3.796452378108409e-05, + "loss": 1.053, + "step": 180820 + }, + { + "epoch": 1.1552713287249403, + "grad_norm": 0.9373098015785217, + "learning_rate": 3.7959653690677285e-05, + "loss": 0.766, + "step": 180830 + }, + { + "epoch": 1.155335215874679, + "grad_norm": 1.5208162069320679, + "learning_rate": 3.795478372152718e-05, + "loss": 1.1105, + "step": 180840 + }, + { + "epoch": 1.1553991030244177, + "grad_norm": 1.0059376955032349, + "learning_rate": 3.794991387368283e-05, + "loss": 0.891, + "step": 180850 + }, + { + "epoch": 1.1554629901741564, + "grad_norm": 0.7993272542953491, + "learning_rate": 3.794504414719326e-05, + "loss": 0.7323, + "step": 180860 + }, + { + "epoch": 1.1555268773238951, + "grad_norm": 0.8731907606124878, + "learning_rate": 3.794017454210753e-05, + "loss": 0.8365, + "step": 180870 + }, + { + "epoch": 1.1555907644736338, + "grad_norm": 0.9847076535224915, + "learning_rate": 3.793530505847468e-05, + "loss": 0.7823, + "step": 180880 + }, + { + "epoch": 1.1556546516233726, + "grad_norm": 0.7352263331413269, + "learning_rate": 3.793043569634375e-05, + "loss": 0.9791, + "step": 180890 + }, + { + "epoch": 1.1557185387731113, + "grad_norm": 0.743043065071106, + "learning_rate": 3.792556645576376e-05, + "loss": 0.7904, + "step": 180900 + }, + { + "epoch": 
1.15578242592285, + "grad_norm": 0.854838490486145, + "learning_rate": 3.792069733678377e-05, + "loss": 0.8579, + "step": 180910 + }, + { + "epoch": 1.1558463130725887, + "grad_norm": 1.538976788520813, + "learning_rate": 3.7915828339452814e-05, + "loss": 0.9096, + "step": 180920 + }, + { + "epoch": 1.1559102002223274, + "grad_norm": 1.347536563873291, + "learning_rate": 3.791095946381992e-05, + "loss": 0.8312, + "step": 180930 + }, + { + "epoch": 1.155974087372066, + "grad_norm": 0.7410019040107727, + "learning_rate": 3.790609070993412e-05, + "loss": 0.7537, + "step": 180940 + }, + { + "epoch": 1.1560379745218046, + "grad_norm": 1.0292385816574097, + "learning_rate": 3.790122207784444e-05, + "loss": 0.8605, + "step": 180950 + }, + { + "epoch": 1.1561018616715435, + "grad_norm": 0.9388356804847717, + "learning_rate": 3.789635356759993e-05, + "loss": 0.9448, + "step": 180960 + }, + { + "epoch": 1.156165748821282, + "grad_norm": 1.113138198852539, + "learning_rate": 3.789148517924961e-05, + "loss": 0.8119, + "step": 180970 + }, + { + "epoch": 1.156229635971021, + "grad_norm": 1.052147388458252, + "learning_rate": 3.78866169128425e-05, + "loss": 0.739, + "step": 180980 + }, + { + "epoch": 1.1562935231207594, + "grad_norm": 0.8988513946533203, + "learning_rate": 3.788174876842765e-05, + "loss": 0.9475, + "step": 180990 + }, + { + "epoch": 1.156357410270498, + "grad_norm": 0.6247332096099854, + "learning_rate": 3.787688074605407e-05, + "loss": 0.9559, + "step": 181000 + }, + { + "epoch": 1.1564212974202368, + "grad_norm": 0.7940369844436646, + "learning_rate": 3.7872012845770786e-05, + "loss": 0.865, + "step": 181010 + }, + { + "epoch": 1.1564851845699755, + "grad_norm": 0.5319830179214478, + "learning_rate": 3.786714506762683e-05, + "loss": 0.8773, + "step": 181020 + }, + { + "epoch": 1.1565490717197142, + "grad_norm": 1.5722520351409912, + "learning_rate": 3.7862277411671215e-05, + "loss": 0.772, + "step": 181030 + }, + { + "epoch": 1.156612958869453, + "grad_norm": 
0.8292412161827087, + "learning_rate": 3.785740987795298e-05, + "loss": 0.9318, + "step": 181040 + }, + { + "epoch": 1.1566768460191916, + "grad_norm": 0.785133957862854, + "learning_rate": 3.785254246652112e-05, + "loss": 0.9598, + "step": 181050 + }, + { + "epoch": 1.1567407331689303, + "grad_norm": 0.6454100012779236, + "learning_rate": 3.784767517742467e-05, + "loss": 0.9134, + "step": 181060 + }, + { + "epoch": 1.156804620318669, + "grad_norm": 0.7190561890602112, + "learning_rate": 3.784280801071264e-05, + "loss": 1.0787, + "step": 181070 + }, + { + "epoch": 1.1568685074684077, + "grad_norm": 1.0026381015777588, + "learning_rate": 3.7837940966434054e-05, + "loss": 0.8038, + "step": 181080 + }, + { + "epoch": 1.1569323946181465, + "grad_norm": 0.8632673025131226, + "learning_rate": 3.783307404463792e-05, + "loss": 0.8546, + "step": 181090 + }, + { + "epoch": 1.1569962817678852, + "grad_norm": 0.8693109154701233, + "learning_rate": 3.782820724537326e-05, + "loss": 0.7959, + "step": 181100 + }, + { + "epoch": 1.1570601689176239, + "grad_norm": 1.686172366142273, + "learning_rate": 3.782334056868908e-05, + "loss": 0.8705, + "step": 181110 + }, + { + "epoch": 1.1571240560673626, + "grad_norm": 1.0223690271377563, + "learning_rate": 3.78184740146344e-05, + "loss": 0.7805, + "step": 181120 + }, + { + "epoch": 1.1571879432171013, + "grad_norm": 0.9894540309906006, + "learning_rate": 3.781360758325822e-05, + "loss": 0.7278, + "step": 181130 + }, + { + "epoch": 1.15725183036684, + "grad_norm": 0.6399027109146118, + "learning_rate": 3.780874127460956e-05, + "loss": 1.0004, + "step": 181140 + }, + { + "epoch": 1.1573157175165787, + "grad_norm": 0.7714298963546753, + "learning_rate": 3.780387508873742e-05, + "loss": 0.8707, + "step": 181150 + }, + { + "epoch": 1.1573796046663174, + "grad_norm": 1.1210808753967285, + "learning_rate": 3.779900902569081e-05, + "loss": 0.9824, + "step": 181160 + }, + { + "epoch": 1.157443491816056, + "grad_norm": 0.8586247563362122, + 
"learning_rate": 3.779414308551873e-05, + "loss": 0.8191, + "step": 181170 + }, + { + "epoch": 1.1575073789657948, + "grad_norm": 1.4679404497146606, + "learning_rate": 3.778927726827018e-05, + "loss": 0.9684, + "step": 181180 + }, + { + "epoch": 1.1575712661155335, + "grad_norm": 1.372011661529541, + "learning_rate": 3.778441157399418e-05, + "loss": 0.7683, + "step": 181190 + }, + { + "epoch": 1.1576351532652722, + "grad_norm": 0.8723155856132507, + "learning_rate": 3.7779546002739724e-05, + "loss": 0.9515, + "step": 181200 + }, + { + "epoch": 1.157699040415011, + "grad_norm": 1.1017118692398071, + "learning_rate": 3.777468055455582e-05, + "loss": 0.7784, + "step": 181210 + }, + { + "epoch": 1.1577629275647496, + "grad_norm": 0.6969674825668335, + "learning_rate": 3.7769815229491454e-05, + "loss": 0.8209, + "step": 181220 + }, + { + "epoch": 1.1578268147144883, + "grad_norm": 0.9235311150550842, + "learning_rate": 3.776495002759563e-05, + "loss": 0.9272, + "step": 181230 + }, + { + "epoch": 1.157890701864227, + "grad_norm": 1.3109992742538452, + "learning_rate": 3.776008494891734e-05, + "loss": 0.8468, + "step": 181240 + }, + { + "epoch": 1.1579545890139658, + "grad_norm": 0.8040546774864197, + "learning_rate": 3.7755219993505584e-05, + "loss": 1.0643, + "step": 181250 + }, + { + "epoch": 1.1580184761637045, + "grad_norm": 0.8779431581497192, + "learning_rate": 3.775035516140936e-05, + "loss": 0.8391, + "step": 181260 + }, + { + "epoch": 1.1580823633134432, + "grad_norm": 1.0938910245895386, + "learning_rate": 3.774549045267765e-05, + "loss": 0.8048, + "step": 181270 + }, + { + "epoch": 1.1581462504631819, + "grad_norm": 0.9017459750175476, + "learning_rate": 3.774062586735946e-05, + "loss": 0.8209, + "step": 181280 + }, + { + "epoch": 1.1582101376129206, + "grad_norm": 0.5983861684799194, + "learning_rate": 3.773576140550377e-05, + "loss": 0.6446, + "step": 181290 + }, + { + "epoch": 1.1582740247626593, + "grad_norm": 0.9771537184715271, + "learning_rate": 
3.7730897067159575e-05, + "loss": 1.0019, + "step": 181300 + }, + { + "epoch": 1.158337911912398, + "grad_norm": 0.7589857578277588, + "learning_rate": 3.772603285237586e-05, + "loss": 0.6826, + "step": 181310 + }, + { + "epoch": 1.1584017990621367, + "grad_norm": 0.9540377259254456, + "learning_rate": 3.772116876120161e-05, + "loss": 1.033, + "step": 181320 + }, + { + "epoch": 1.1584656862118754, + "grad_norm": 0.9302192330360413, + "learning_rate": 3.771630479368582e-05, + "loss": 0.868, + "step": 181330 + }, + { + "epoch": 1.158529573361614, + "grad_norm": 0.9737901091575623, + "learning_rate": 3.771144094987746e-05, + "loss": 0.7623, + "step": 181340 + }, + { + "epoch": 1.1585934605113528, + "grad_norm": 0.8602831959724426, + "learning_rate": 3.7706577229825524e-05, + "loss": 0.9287, + "step": 181350 + }, + { + "epoch": 1.1586573476610915, + "grad_norm": 0.8971428275108337, + "learning_rate": 3.770171363357899e-05, + "loss": 1.0126, + "step": 181360 + }, + { + "epoch": 1.1587212348108302, + "grad_norm": 1.3346503973007202, + "learning_rate": 3.769685016118684e-05, + "loss": 0.7815, + "step": 181370 + }, + { + "epoch": 1.158785121960569, + "grad_norm": 1.5257915258407593, + "learning_rate": 3.769198681269805e-05, + "loss": 1.0097, + "step": 181380 + }, + { + "epoch": 1.1588490091103076, + "grad_norm": 1.0773704051971436, + "learning_rate": 3.7687123588161596e-05, + "loss": 0.8822, + "step": 181390 + }, + { + "epoch": 1.1589128962600463, + "grad_norm": 0.8371546268463135, + "learning_rate": 3.768226048762647e-05, + "loss": 0.7492, + "step": 181400 + }, + { + "epoch": 1.158976783409785, + "grad_norm": 1.3601465225219727, + "learning_rate": 3.767739751114163e-05, + "loss": 0.8812, + "step": 181410 + }, + { + "epoch": 1.1590406705595235, + "grad_norm": 0.9463026523590088, + "learning_rate": 3.7672534658756065e-05, + "loss": 0.8712, + "step": 181420 + }, + { + "epoch": 1.1591045577092625, + "grad_norm": 0.891038179397583, + "learning_rate": 3.7667671930518736e-05, + 
"loss": 0.8663, + "step": 181430 + }, + { + "epoch": 1.159168444859001, + "grad_norm": 1.142344355583191, + "learning_rate": 3.766280932647862e-05, + "loss": 1.0367, + "step": 181440 + }, + { + "epoch": 1.1592323320087399, + "grad_norm": 0.9246888756752014, + "learning_rate": 3.765794684668469e-05, + "loss": 0.9055, + "step": 181450 + }, + { + "epoch": 1.1592962191584784, + "grad_norm": 1.2193877696990967, + "learning_rate": 3.7653084491185905e-05, + "loss": 0.8285, + "step": 181460 + }, + { + "epoch": 1.1593601063082173, + "grad_norm": 0.9855943918228149, + "learning_rate": 3.764822226003125e-05, + "loss": 0.7588, + "step": 181470 + }, + { + "epoch": 1.1594239934579558, + "grad_norm": 0.9939285516738892, + "learning_rate": 3.764336015326968e-05, + "loss": 0.6418, + "step": 181480 + }, + { + "epoch": 1.1594878806076945, + "grad_norm": 1.2632123231887817, + "learning_rate": 3.7638498170950165e-05, + "loss": 0.9491, + "step": 181490 + }, + { + "epoch": 1.1595517677574332, + "grad_norm": 0.8524492979049683, + "learning_rate": 3.7633636313121663e-05, + "loss": 0.955, + "step": 181500 + }, + { + "epoch": 1.1596156549071719, + "grad_norm": 1.167758584022522, + "learning_rate": 3.762877457983314e-05, + "loss": 0.8589, + "step": 181510 + }, + { + "epoch": 1.1596795420569106, + "grad_norm": 0.8116161823272705, + "learning_rate": 3.762391297113358e-05, + "loss": 0.891, + "step": 181520 + }, + { + "epoch": 1.1597434292066493, + "grad_norm": 0.7200629115104675, + "learning_rate": 3.76190514870719e-05, + "loss": 0.7444, + "step": 181530 + }, + { + "epoch": 1.159807316356388, + "grad_norm": 1.024326205253601, + "learning_rate": 3.76141901276971e-05, + "loss": 0.8589, + "step": 181540 + }, + { + "epoch": 1.1598712035061267, + "grad_norm": 5.042026042938232, + "learning_rate": 3.7609328893058104e-05, + "loss": 1.1791, + "step": 181550 + }, + { + "epoch": 1.1599350906558654, + "grad_norm": 0.9187368750572205, + "learning_rate": 3.760446778320389e-05, + "loss": 0.7202, + "step": 
181560 + }, + { + "epoch": 1.1599989778056041, + "grad_norm": 0.8078513741493225, + "learning_rate": 3.7599606798183413e-05, + "loss": 0.8145, + "step": 181570 + }, + { + "epoch": 1.1600628649553428, + "grad_norm": 0.6355553865432739, + "learning_rate": 3.7594745938045625e-05, + "loss": 0.7847, + "step": 181580 + }, + { + "epoch": 1.1601267521050815, + "grad_norm": 0.6867857575416565, + "learning_rate": 3.758988520283947e-05, + "loss": 0.9891, + "step": 181590 + }, + { + "epoch": 1.1601906392548202, + "grad_norm": 1.211389422416687, + "learning_rate": 3.758502459261391e-05, + "loss": 0.9464, + "step": 181600 + }, + { + "epoch": 1.160254526404559, + "grad_norm": 0.8838968276977539, + "learning_rate": 3.758016410741789e-05, + "loss": 0.718, + "step": 181610 + }, + { + "epoch": 1.1603184135542977, + "grad_norm": 1.0951565504074097, + "learning_rate": 3.7575303747300375e-05, + "loss": 1.0658, + "step": 181620 + }, + { + "epoch": 1.1603823007040364, + "grad_norm": 0.9639030694961548, + "learning_rate": 3.7570443512310287e-05, + "loss": 1.1464, + "step": 181630 + }, + { + "epoch": 1.160446187853775, + "grad_norm": 0.8996525406837463, + "learning_rate": 3.756558340249659e-05, + "loss": 1.005, + "step": 181640 + }, + { + "epoch": 1.1605100750035138, + "grad_norm": 0.6685171723365784, + "learning_rate": 3.7560723417908225e-05, + "loss": 0.8409, + "step": 181650 + }, + { + "epoch": 1.1605739621532525, + "grad_norm": 0.6640573143959045, + "learning_rate": 3.755586355859414e-05, + "loss": 0.8569, + "step": 181660 + }, + { + "epoch": 1.1606378493029912, + "grad_norm": 0.5627730488777161, + "learning_rate": 3.755100382460327e-05, + "loss": 0.9019, + "step": 181670 + }, + { + "epoch": 1.16070173645273, + "grad_norm": 0.6873822808265686, + "learning_rate": 3.754614421598456e-05, + "loss": 0.8357, + "step": 181680 + }, + { + "epoch": 1.1607656236024686, + "grad_norm": 1.4170078039169312, + "learning_rate": 3.7541284732786955e-05, + "loss": 0.8245, + "step": 181690 + }, + { + 
"epoch": 1.1608295107522073, + "grad_norm": 0.745650053024292, + "learning_rate": 3.753642537505939e-05, + "loss": 0.9163, + "step": 181700 + }, + { + "epoch": 1.160893397901946, + "grad_norm": 1.3696508407592773, + "learning_rate": 3.75315661428508e-05, + "loss": 0.932, + "step": 181710 + }, + { + "epoch": 1.1609572850516847, + "grad_norm": 1.0974339246749878, + "learning_rate": 3.752670703621013e-05, + "loss": 0.881, + "step": 181720 + }, + { + "epoch": 1.1610211722014234, + "grad_norm": 0.8808803558349609, + "learning_rate": 3.752184805518631e-05, + "loss": 0.7972, + "step": 181730 + }, + { + "epoch": 1.1610850593511621, + "grad_norm": 1.2625608444213867, + "learning_rate": 3.7516989199828276e-05, + "loss": 0.7407, + "step": 181740 + }, + { + "epoch": 1.1611489465009008, + "grad_norm": 0.8603794574737549, + "learning_rate": 3.7512130470184965e-05, + "loss": 0.7863, + "step": 181750 + }, + { + "epoch": 1.1612128336506395, + "grad_norm": 1.2565032243728638, + "learning_rate": 3.75072718663053e-05, + "loss": 0.8844, + "step": 181760 + }, + { + "epoch": 1.1612767208003782, + "grad_norm": 1.3598438501358032, + "learning_rate": 3.750241338823821e-05, + "loss": 0.8526, + "step": 181770 + }, + { + "epoch": 1.161340607950117, + "grad_norm": 2.179485321044922, + "learning_rate": 3.749755503603264e-05, + "loss": 0.9197, + "step": 181780 + }, + { + "epoch": 1.1614044950998557, + "grad_norm": 0.8402548432350159, + "learning_rate": 3.74926968097375e-05, + "loss": 0.9501, + "step": 181790 + }, + { + "epoch": 1.1614683822495944, + "grad_norm": 0.6361011862754822, + "learning_rate": 3.748783870940172e-05, + "loss": 0.9137, + "step": 181800 + }, + { + "epoch": 1.161532269399333, + "grad_norm": 1.048244833946228, + "learning_rate": 3.748298073507424e-05, + "loss": 0.9456, + "step": 181810 + }, + { + "epoch": 1.1615961565490718, + "grad_norm": 1.0340771675109863, + "learning_rate": 3.747812288680396e-05, + "loss": 0.721, + "step": 181820 + }, + { + "epoch": 1.1616600436988105, + 
"grad_norm": 0.6859948635101318, + "learning_rate": 3.7473265164639824e-05, + "loss": 0.9426, + "step": 181830 + }, + { + "epoch": 1.1617239308485492, + "grad_norm": 0.9537613987922668, + "learning_rate": 3.746840756863074e-05, + "loss": 0.9536, + "step": 181840 + }, + { + "epoch": 1.161787817998288, + "grad_norm": 0.8040690422058105, + "learning_rate": 3.746355009882564e-05, + "loss": 0.9398, + "step": 181850 + }, + { + "epoch": 1.1618517051480266, + "grad_norm": 1.4307293891906738, + "learning_rate": 3.745869275527343e-05, + "loss": 0.9606, + "step": 181860 + }, + { + "epoch": 1.1619155922977653, + "grad_norm": 0.8316680788993835, + "learning_rate": 3.7453835538023035e-05, + "loss": 0.9238, + "step": 181870 + }, + { + "epoch": 1.161979479447504, + "grad_norm": 1.3545746803283691, + "learning_rate": 3.744897844712337e-05, + "loss": 0.764, + "step": 181880 + }, + { + "epoch": 1.1620433665972427, + "grad_norm": 1.1133147478103638, + "learning_rate": 3.744412148262335e-05, + "loss": 0.9603, + "step": 181890 + }, + { + "epoch": 1.1621072537469814, + "grad_norm": 0.8634859919548035, + "learning_rate": 3.7439264644571894e-05, + "loss": 0.7834, + "step": 181900 + }, + { + "epoch": 1.16217114089672, + "grad_norm": 0.7529647350311279, + "learning_rate": 3.743440793301789e-05, + "loss": 0.7559, + "step": 181910 + }, + { + "epoch": 1.1622350280464588, + "grad_norm": 1.1919103860855103, + "learning_rate": 3.742955134801028e-05, + "loss": 1.2041, + "step": 181920 + }, + { + "epoch": 1.1622989151961973, + "grad_norm": 0.8481646180152893, + "learning_rate": 3.742469488959798e-05, + "loss": 0.8569, + "step": 181930 + }, + { + "epoch": 1.1623628023459363, + "grad_norm": 0.7014936804771423, + "learning_rate": 3.741983855782986e-05, + "loss": 0.8347, + "step": 181940 + }, + { + "epoch": 1.1624266894956747, + "grad_norm": 0.9098723530769348, + "learning_rate": 3.741498235275486e-05, + "loss": 1.1005, + "step": 181950 + }, + { + "epoch": 1.1624905766454137, + "grad_norm": 
0.6613329648971558, + "learning_rate": 3.741012627442188e-05, + "loss": 0.7433, + "step": 181960 + }, + { + "epoch": 1.1625544637951521, + "grad_norm": 0.695220410823822, + "learning_rate": 3.74052703228798e-05, + "loss": 0.7844, + "step": 181970 + }, + { + "epoch": 1.1626183509448909, + "grad_norm": 1.442959189414978, + "learning_rate": 3.740041449817756e-05, + "loss": 0.7808, + "step": 181980 + }, + { + "epoch": 1.1626822380946296, + "grad_norm": 0.615817129611969, + "learning_rate": 3.739555880036405e-05, + "loss": 0.7967, + "step": 181990 + }, + { + "epoch": 1.1627461252443683, + "grad_norm": 0.6628965139389038, + "learning_rate": 3.739070322948816e-05, + "loss": 0.818, + "step": 182000 + }, + { + "epoch": 1.162810012394107, + "grad_norm": 1.3861855268478394, + "learning_rate": 3.738584778559881e-05, + "loss": 0.8205, + "step": 182010 + }, + { + "epoch": 1.1628738995438457, + "grad_norm": 0.8403356075286865, + "learning_rate": 3.738099246874488e-05, + "loss": 0.7642, + "step": 182020 + }, + { + "epoch": 1.1629377866935844, + "grad_norm": 2.395003080368042, + "learning_rate": 3.737613727897525e-05, + "loss": 0.9817, + "step": 182030 + }, + { + "epoch": 1.163001673843323, + "grad_norm": 1.15186607837677, + "learning_rate": 3.7371282216338866e-05, + "loss": 1.169, + "step": 182040 + }, + { + "epoch": 1.1630655609930618, + "grad_norm": 0.9080342650413513, + "learning_rate": 3.736642728088458e-05, + "loss": 0.8127, + "step": 182050 + }, + { + "epoch": 1.1631294481428005, + "grad_norm": 0.6692883372306824, + "learning_rate": 3.736157247266131e-05, + "loss": 0.8561, + "step": 182060 + }, + { + "epoch": 1.1631933352925392, + "grad_norm": 0.8358718156814575, + "learning_rate": 3.735671779171793e-05, + "loss": 1.0591, + "step": 182070 + }, + { + "epoch": 1.163257222442278, + "grad_norm": 1.2735133171081543, + "learning_rate": 3.7352348687733605e-05, + "loss": 1.0301, + "step": 182080 + }, + { + "epoch": 1.1633211095920166, + "grad_norm": 0.8889757990837097, + 
"learning_rate": 3.734749424875673e-05, + "loss": 0.7145, + "step": 182090 + }, + { + "epoch": 1.1633849967417553, + "grad_norm": 0.9000428915023804, + "learning_rate": 3.734263993720153e-05, + "loss": 0.9598, + "step": 182100 + }, + { + "epoch": 1.163448883891494, + "grad_norm": 0.8241376876831055, + "learning_rate": 3.733778575311691e-05, + "loss": 0.8692, + "step": 182110 + }, + { + "epoch": 1.1635127710412327, + "grad_norm": 1.059684157371521, + "learning_rate": 3.7332931696551724e-05, + "loss": 0.7761, + "step": 182120 + }, + { + "epoch": 1.1635766581909714, + "grad_norm": 0.8571027517318726, + "learning_rate": 3.732807776755489e-05, + "loss": 0.8801, + "step": 182130 + }, + { + "epoch": 1.1636405453407102, + "grad_norm": 0.9349920749664307, + "learning_rate": 3.7323223966175265e-05, + "loss": 0.7988, + "step": 182140 + }, + { + "epoch": 1.1637044324904489, + "grad_norm": 1.583733320236206, + "learning_rate": 3.731837029246174e-05, + "loss": 1.1716, + "step": 182150 + }, + { + "epoch": 1.1637683196401876, + "grad_norm": 0.6162463426589966, + "learning_rate": 3.731351674646322e-05, + "loss": 0.9512, + "step": 182160 + }, + { + "epoch": 1.1638322067899263, + "grad_norm": 1.2393447160720825, + "learning_rate": 3.730866332822854e-05, + "loss": 0.7938, + "step": 182170 + }, + { + "epoch": 1.163896093939665, + "grad_norm": 0.8612910509109497, + "learning_rate": 3.7303810037806605e-05, + "loss": 0.7164, + "step": 182180 + }, + { + "epoch": 1.1639599810894037, + "grad_norm": 0.8981246948242188, + "learning_rate": 3.729895687524629e-05, + "loss": 1.391, + "step": 182190 + }, + { + "epoch": 1.1640238682391424, + "grad_norm": 0.7715309858322144, + "learning_rate": 3.729410384059646e-05, + "loss": 0.9247, + "step": 182200 + }, + { + "epoch": 1.164087755388881, + "grad_norm": 1.1804099082946777, + "learning_rate": 3.7289250933906e-05, + "loss": 0.7421, + "step": 182210 + }, + { + "epoch": 1.1641516425386198, + "grad_norm": 1.579128623008728, + "learning_rate": 
3.7284398155223774e-05, + "loss": 0.7398, + "step": 182220 + }, + { + "epoch": 1.1642155296883585, + "grad_norm": 2.8554675579071045, + "learning_rate": 3.7279545504598666e-05, + "loss": 0.9659, + "step": 182230 + }, + { + "epoch": 1.1642794168380972, + "grad_norm": 0.661597490310669, + "learning_rate": 3.727469298207954e-05, + "loss": 0.9344, + "step": 182240 + }, + { + "epoch": 1.164343303987836, + "grad_norm": 0.8794891238212585, + "learning_rate": 3.7269840587715264e-05, + "loss": 0.8297, + "step": 182250 + }, + { + "epoch": 1.1644071911375746, + "grad_norm": 0.934339165687561, + "learning_rate": 3.72649883215547e-05, + "loss": 0.8278, + "step": 182260 + }, + { + "epoch": 1.1644710782873133, + "grad_norm": 0.8373285531997681, + "learning_rate": 3.726013618364673e-05, + "loss": 0.8641, + "step": 182270 + }, + { + "epoch": 1.164534965437052, + "grad_norm": 1.0624688863754272, + "learning_rate": 3.7255284174040204e-05, + "loss": 1.1135, + "step": 182280 + }, + { + "epoch": 1.1645988525867907, + "grad_norm": 1.0716806650161743, + "learning_rate": 3.7250432292784e-05, + "loss": 0.803, + "step": 182290 + }, + { + "epoch": 1.1646627397365295, + "grad_norm": 0.7583722472190857, + "learning_rate": 3.724558053992696e-05, + "loss": 0.8769, + "step": 182300 + }, + { + "epoch": 1.1647266268862682, + "grad_norm": 1.5714266300201416, + "learning_rate": 3.724072891551797e-05, + "loss": 1.0371, + "step": 182310 + }, + { + "epoch": 1.1647905140360069, + "grad_norm": 2.1298489570617676, + "learning_rate": 3.723587741960587e-05, + "loss": 0.9073, + "step": 182320 + }, + { + "epoch": 1.1648544011857456, + "grad_norm": 1.1624418497085571, + "learning_rate": 3.7231026052239525e-05, + "loss": 0.9129, + "step": 182330 + }, + { + "epoch": 1.1649182883354843, + "grad_norm": 0.9117680191993713, + "learning_rate": 3.7226174813467805e-05, + "loss": 0.836, + "step": 182340 + }, + { + "epoch": 1.164982175485223, + "grad_norm": 1.1857670545578003, + "learning_rate": 3.722132370333954e-05, + 
"loss": 0.6342, + "step": 182350 + }, + { + "epoch": 1.1650460626349617, + "grad_norm": 1.1982570886611938, + "learning_rate": 3.7216472721903604e-05, + "loss": 0.8073, + "step": 182360 + }, + { + "epoch": 1.1651099497847004, + "grad_norm": 0.8804404139518738, + "learning_rate": 3.721162186920886e-05, + "loss": 0.7809, + "step": 182370 + }, + { + "epoch": 1.165173836934439, + "grad_norm": 1.8570173978805542, + "learning_rate": 3.7206771145304136e-05, + "loss": 0.6439, + "step": 182380 + }, + { + "epoch": 1.1652377240841778, + "grad_norm": 1.0087871551513672, + "learning_rate": 3.72019205502383e-05, + "loss": 1.0128, + "step": 182390 + }, + { + "epoch": 1.1653016112339163, + "grad_norm": 0.9948041439056396, + "learning_rate": 3.7197070084060195e-05, + "loss": 0.8732, + "step": 182400 + }, + { + "epoch": 1.1653654983836552, + "grad_norm": 1.0306204557418823, + "learning_rate": 3.719221974681867e-05, + "loss": 1.0214, + "step": 182410 + }, + { + "epoch": 1.1654293855333937, + "grad_norm": 0.7993441820144653, + "learning_rate": 3.718736953856258e-05, + "loss": 0.9127, + "step": 182420 + }, + { + "epoch": 1.1654932726831326, + "grad_norm": 0.7198377251625061, + "learning_rate": 3.718251945934075e-05, + "loss": 0.785, + "step": 182430 + }, + { + "epoch": 1.1655571598328711, + "grad_norm": 0.7696216106414795, + "learning_rate": 3.717766950920204e-05, + "loss": 0.8648, + "step": 182440 + }, + { + "epoch": 1.1656210469826098, + "grad_norm": 2.3993189334869385, + "learning_rate": 3.71728196881953e-05, + "loss": 0.9621, + "step": 182450 + }, + { + "epoch": 1.1656849341323485, + "grad_norm": 1.0224963426589966, + "learning_rate": 3.716796999636936e-05, + "loss": 1.0035, + "step": 182460 + }, + { + "epoch": 1.1657488212820872, + "grad_norm": 1.0298407077789307, + "learning_rate": 3.716312043377306e-05, + "loss": 1.0118, + "step": 182470 + }, + { + "epoch": 1.165812708431826, + "grad_norm": 0.9027889966964722, + "learning_rate": 3.7158271000455236e-05, + "loss": 0.9612, + 
"step": 182480 + }, + { + "epoch": 1.1658765955815646, + "grad_norm": 0.8727611303329468, + "learning_rate": 3.715342169646474e-05, + "loss": 0.9595, + "step": 182490 + }, + { + "epoch": 1.1659404827313034, + "grad_norm": 1.0932813882827759, + "learning_rate": 3.714857252185041e-05, + "loss": 0.7695, + "step": 182500 + }, + { + "epoch": 1.166004369881042, + "grad_norm": 1.0781934261322021, + "learning_rate": 3.714372347666106e-05, + "loss": 0.8286, + "step": 182510 + }, + { + "epoch": 1.1660682570307808, + "grad_norm": 1.1852980852127075, + "learning_rate": 3.713887456094554e-05, + "loss": 1.361, + "step": 182520 + }, + { + "epoch": 1.1661321441805195, + "grad_norm": 1.407976746559143, + "learning_rate": 3.713402577475268e-05, + "loss": 0.9317, + "step": 182530 + }, + { + "epoch": 1.1661960313302582, + "grad_norm": 1.2148253917694092, + "learning_rate": 3.7129177118131315e-05, + "loss": 0.9379, + "step": 182540 + }, + { + "epoch": 1.1662599184799969, + "grad_norm": 0.6507695317268372, + "learning_rate": 3.712432859113026e-05, + "loss": 0.8821, + "step": 182550 + }, + { + "epoch": 1.1663238056297356, + "grad_norm": 1.3190046548843384, + "learning_rate": 3.711948019379836e-05, + "loss": 0.9343, + "step": 182560 + }, + { + "epoch": 1.1663876927794743, + "grad_norm": 1.001220464706421, + "learning_rate": 3.711463192618444e-05, + "loss": 0.9505, + "step": 182570 + }, + { + "epoch": 1.166451579929213, + "grad_norm": 0.7457411289215088, + "learning_rate": 3.710978378833733e-05, + "loss": 0.7789, + "step": 182580 + }, + { + "epoch": 1.1665154670789517, + "grad_norm": 0.7126203179359436, + "learning_rate": 3.710493578030584e-05, + "loss": 0.7784, + "step": 182590 + }, + { + "epoch": 1.1665793542286904, + "grad_norm": 1.4537057876586914, + "learning_rate": 3.71000879021388e-05, + "loss": 0.8228, + "step": 182600 + }, + { + "epoch": 1.1666432413784291, + "grad_norm": 1.0841671228408813, + "learning_rate": 3.709524015388505e-05, + "loss": 0.822, + "step": 182610 + }, + { + 
"epoch": 1.1667071285281678, + "grad_norm": 0.9426870346069336, + "learning_rate": 3.709039253559339e-05, + "loss": 1.1529, + "step": 182620 + }, + { + "epoch": 1.1667710156779065, + "grad_norm": 1.0441092252731323, + "learning_rate": 3.708554504731264e-05, + "loss": 0.922, + "step": 182630 + }, + { + "epoch": 1.1668349028276452, + "grad_norm": 1.3361924886703491, + "learning_rate": 3.708069768909165e-05, + "loss": 0.7883, + "step": 182640 + }, + { + "epoch": 1.166898789977384, + "grad_norm": 0.7366297245025635, + "learning_rate": 3.707585046097918e-05, + "loss": 0.8257, + "step": 182650 + }, + { + "epoch": 1.1669626771271226, + "grad_norm": 0.9412416219711304, + "learning_rate": 3.707100336302409e-05, + "loss": 1.0877, + "step": 182660 + }, + { + "epoch": 1.1670265642768614, + "grad_norm": 1.1593157052993774, + "learning_rate": 3.706615639527516e-05, + "loss": 0.8254, + "step": 182670 + }, + { + "epoch": 1.1670904514266, + "grad_norm": 0.8784003257751465, + "learning_rate": 3.706130955778124e-05, + "loss": 0.741, + "step": 182680 + }, + { + "epoch": 1.1671543385763388, + "grad_norm": 0.787391722202301, + "learning_rate": 3.705646285059113e-05, + "loss": 0.9372, + "step": 182690 + }, + { + "epoch": 1.1672182257260775, + "grad_norm": 0.7201464176177979, + "learning_rate": 3.705161627375363e-05, + "loss": 0.8021, + "step": 182700 + }, + { + "epoch": 1.1672821128758162, + "grad_norm": 2.2414963245391846, + "learning_rate": 3.7046769827317565e-05, + "loss": 0.776, + "step": 182710 + }, + { + "epoch": 1.1673460000255549, + "grad_norm": 0.7492812275886536, + "learning_rate": 3.7041923511331725e-05, + "loss": 0.7326, + "step": 182720 + }, + { + "epoch": 1.1674098871752936, + "grad_norm": 0.9507798552513123, + "learning_rate": 3.7037077325844923e-05, + "loss": 0.8424, + "step": 182730 + }, + { + "epoch": 1.1674737743250323, + "grad_norm": 1.3532758951187134, + "learning_rate": 3.7032231270905984e-05, + "loss": 0.8017, + "step": 182740 + }, + { + "epoch": 1.167537661474771, 
+ "grad_norm": 1.3244647979736328, + "learning_rate": 3.702738534656368e-05, + "loss": 0.9646, + "step": 182750 + }, + { + "epoch": 1.1676015486245097, + "grad_norm": 0.6468150019645691, + "learning_rate": 3.702253955286683e-05, + "loss": 0.8012, + "step": 182760 + }, + { + "epoch": 1.1676654357742484, + "grad_norm": 0.9107818603515625, + "learning_rate": 3.7017693889864236e-05, + "loss": 0.795, + "step": 182770 + }, + { + "epoch": 1.1677293229239871, + "grad_norm": 0.9616878032684326, + "learning_rate": 3.7012848357604703e-05, + "loss": 0.8296, + "step": 182780 + }, + { + "epoch": 1.1677932100737258, + "grad_norm": 1.004823088645935, + "learning_rate": 3.700800295613701e-05, + "loss": 0.7486, + "step": 182790 + }, + { + "epoch": 1.1678570972234645, + "grad_norm": 0.8042672276496887, + "learning_rate": 3.700315768550997e-05, + "loss": 0.787, + "step": 182800 + }, + { + "epoch": 1.1679209843732032, + "grad_norm": 1.9837151765823364, + "learning_rate": 3.6998312545772385e-05, + "loss": 0.9188, + "step": 182810 + }, + { + "epoch": 1.167984871522942, + "grad_norm": 0.9530550837516785, + "learning_rate": 3.6993467536973034e-05, + "loss": 0.7108, + "step": 182820 + }, + { + "epoch": 1.1680487586726807, + "grad_norm": 0.8177058100700378, + "learning_rate": 3.698862265916071e-05, + "loss": 0.8103, + "step": 182830 + }, + { + "epoch": 1.1681126458224194, + "grad_norm": 1.3955106735229492, + "learning_rate": 3.698377791238422e-05, + "loss": 1.0225, + "step": 182840 + }, + { + "epoch": 1.168176532972158, + "grad_norm": 0.8228069543838501, + "learning_rate": 3.6978933296692354e-05, + "loss": 0.8187, + "step": 182850 + }, + { + "epoch": 1.1682404201218968, + "grad_norm": 0.820376992225647, + "learning_rate": 3.6974088812133885e-05, + "loss": 1.0945, + "step": 182860 + }, + { + "epoch": 1.1683043072716355, + "grad_norm": 1.0790095329284668, + "learning_rate": 3.696924445875761e-05, + "loss": 0.8802, + "step": 182870 + }, + { + "epoch": 1.1683681944213742, + "grad_norm": 
0.557696521282196, + "learning_rate": 3.6964400236612306e-05, + "loss": 0.9731, + "step": 182880 + }, + { + "epoch": 1.1684320815711127, + "grad_norm": 1.1943516731262207, + "learning_rate": 3.695955614574679e-05, + "loss": 1.0052, + "step": 182890 + }, + { + "epoch": 1.1684959687208516, + "grad_norm": 0.8318352699279785, + "learning_rate": 3.695471218620981e-05, + "loss": 0.7788, + "step": 182900 + }, + { + "epoch": 1.16855985587059, + "grad_norm": 0.8753002882003784, + "learning_rate": 3.6949868358050174e-05, + "loss": 0.7583, + "step": 182910 + }, + { + "epoch": 1.168623743020329, + "grad_norm": 1.1179732084274292, + "learning_rate": 3.694502466131665e-05, + "loss": 1.0665, + "step": 182920 + }, + { + "epoch": 1.1686876301700675, + "grad_norm": 1.8066426515579224, + "learning_rate": 3.6940181096058026e-05, + "loss": 1.0061, + "step": 182930 + }, + { + "epoch": 1.1687515173198062, + "grad_norm": 1.7594103813171387, + "learning_rate": 3.6935337662323074e-05, + "loss": 0.5617, + "step": 182940 + }, + { + "epoch": 1.168815404469545, + "grad_norm": 0.6220073103904724, + "learning_rate": 3.693049436016057e-05, + "loss": 0.7875, + "step": 182950 + }, + { + "epoch": 1.1688792916192836, + "grad_norm": 0.6708908081054688, + "learning_rate": 3.692565118961931e-05, + "loss": 1.0373, + "step": 182960 + }, + { + "epoch": 1.1689431787690223, + "grad_norm": 0.5550050139427185, + "learning_rate": 3.6920808150748035e-05, + "loss": 0.881, + "step": 182970 + }, + { + "epoch": 1.169007065918761, + "grad_norm": 2.151418685913086, + "learning_rate": 3.6915965243595543e-05, + "loss": 0.9676, + "step": 182980 + }, + { + "epoch": 1.1690709530684997, + "grad_norm": 0.657231867313385, + "learning_rate": 3.69111224682106e-05, + "loss": 0.9571, + "step": 182990 + }, + { + "epoch": 1.1691348402182384, + "grad_norm": 0.6494762897491455, + "learning_rate": 3.6906279824641975e-05, + "loss": 0.9356, + "step": 183000 + }, + { + "epoch": 1.1691987273679771, + "grad_norm": 0.6837002038955688, + 
"learning_rate": 3.690143731293845e-05, + "loss": 0.8295, + "step": 183010 + }, + { + "epoch": 1.1692626145177158, + "grad_norm": 0.991506040096283, + "learning_rate": 3.689659493314877e-05, + "loss": 0.933, + "step": 183020 + }, + { + "epoch": 1.1693265016674546, + "grad_norm": 1.013289213180542, + "learning_rate": 3.689175268532172e-05, + "loss": 0.7162, + "step": 183030 + }, + { + "epoch": 1.1693903888171933, + "grad_norm": 0.630157470703125, + "learning_rate": 3.688691056950606e-05, + "loss": 0.6827, + "step": 183040 + }, + { + "epoch": 1.169454275966932, + "grad_norm": 1.3331143856048584, + "learning_rate": 3.688206858575056e-05, + "loss": 0.8385, + "step": 183050 + }, + { + "epoch": 1.1695181631166707, + "grad_norm": 1.0507668256759644, + "learning_rate": 3.687722673410398e-05, + "loss": 0.9496, + "step": 183060 + }, + { + "epoch": 1.1695820502664094, + "grad_norm": 0.6389208436012268, + "learning_rate": 3.6872385014615074e-05, + "loss": 0.9461, + "step": 183070 + }, + { + "epoch": 1.169645937416148, + "grad_norm": 1.128082036972046, + "learning_rate": 3.68675434273326e-05, + "loss": 0.85, + "step": 183080 + }, + { + "epoch": 1.1697098245658868, + "grad_norm": 1.2686810493469238, + "learning_rate": 3.686270197230533e-05, + "loss": 0.7047, + "step": 183090 + }, + { + "epoch": 1.1697737117156255, + "grad_norm": 0.8824436664581299, + "learning_rate": 3.685786064958202e-05, + "loss": 0.9775, + "step": 183100 + }, + { + "epoch": 1.1698375988653642, + "grad_norm": 1.6692858934402466, + "learning_rate": 3.6853019459211424e-05, + "loss": 0.8535, + "step": 183110 + }, + { + "epoch": 1.169901486015103, + "grad_norm": 1.2711756229400635, + "learning_rate": 3.6848178401242296e-05, + "loss": 0.9764, + "step": 183120 + }, + { + "epoch": 1.1699653731648416, + "grad_norm": 0.6435794234275818, + "learning_rate": 3.6843337475723405e-05, + "loss": 0.9709, + "step": 183130 + }, + { + "epoch": 1.1700292603145803, + "grad_norm": 1.0502516031265259, + "learning_rate": 
3.683849668270347e-05, + "loss": 0.7854, + "step": 183140 + }, + { + "epoch": 1.170093147464319, + "grad_norm": 0.8473712205886841, + "learning_rate": 3.6833656022231266e-05, + "loss": 0.6378, + "step": 183150 + }, + { + "epoch": 1.1701570346140577, + "grad_norm": 1.302493691444397, + "learning_rate": 3.682881549435553e-05, + "loss": 0.8931, + "step": 183160 + }, + { + "epoch": 1.1702209217637964, + "grad_norm": 0.6629976630210876, + "learning_rate": 3.682397509912502e-05, + "loss": 0.9276, + "step": 183170 + }, + { + "epoch": 1.1702848089135351, + "grad_norm": 1.529744029045105, + "learning_rate": 3.6819134836588476e-05, + "loss": 0.9018, + "step": 183180 + }, + { + "epoch": 1.1703486960632739, + "grad_norm": 0.7747004628181458, + "learning_rate": 3.681429470679465e-05, + "loss": 0.9624, + "step": 183190 + }, + { + "epoch": 1.1704125832130126, + "grad_norm": 1.035202980041504, + "learning_rate": 3.6809454709792266e-05, + "loss": 0.9256, + "step": 183200 + }, + { + "epoch": 1.1704764703627513, + "grad_norm": 1.0493172407150269, + "learning_rate": 3.6804614845630106e-05, + "loss": 0.8489, + "step": 183210 + }, + { + "epoch": 1.17054035751249, + "grad_norm": 0.969811737537384, + "learning_rate": 3.679977511435688e-05, + "loss": 1.0871, + "step": 183220 + }, + { + "epoch": 1.1706042446622287, + "grad_norm": 1.3164643049240112, + "learning_rate": 3.6794935516021346e-05, + "loss": 0.9199, + "step": 183230 + }, + { + "epoch": 1.1706681318119674, + "grad_norm": 2.7494232654571533, + "learning_rate": 3.679009605067223e-05, + "loss": 0.9381, + "step": 183240 + }, + { + "epoch": 1.170732018961706, + "grad_norm": 1.1078988313674927, + "learning_rate": 3.6785256718358276e-05, + "loss": 0.8591, + "step": 183250 + }, + { + "epoch": 1.1707959061114448, + "grad_norm": 0.8365240693092346, + "learning_rate": 3.678041751912822e-05, + "loss": 0.8841, + "step": 183260 + }, + { + "epoch": 1.1708597932611835, + "grad_norm": 1.2854210138320923, + "learning_rate": 3.67755784530308e-05, + 
"loss": 1.0024, + "step": 183270 + }, + { + "epoch": 1.1709236804109222, + "grad_norm": 0.6243909001350403, + "learning_rate": 3.677073952011474e-05, + "loss": 0.9515, + "step": 183280 + }, + { + "epoch": 1.170987567560661, + "grad_norm": 0.8205077648162842, + "learning_rate": 3.676590072042878e-05, + "loss": 0.7729, + "step": 183290 + }, + { + "epoch": 1.1710514547103996, + "grad_norm": 0.6808330416679382, + "learning_rate": 3.676106205402165e-05, + "loss": 0.7247, + "step": 183300 + }, + { + "epoch": 1.1711153418601383, + "grad_norm": 1.3510143756866455, + "learning_rate": 3.6756223520942076e-05, + "loss": 0.8529, + "step": 183310 + }, + { + "epoch": 1.171179229009877, + "grad_norm": 1.1353574991226196, + "learning_rate": 3.6751385121238795e-05, + "loss": 0.7977, + "step": 183320 + }, + { + "epoch": 1.1712431161596157, + "grad_norm": 1.2666032314300537, + "learning_rate": 3.674654685496052e-05, + "loss": 1.217, + "step": 183330 + }, + { + "epoch": 1.1713070033093544, + "grad_norm": 0.8943315148353577, + "learning_rate": 3.674170872215599e-05, + "loss": 1.0473, + "step": 183340 + }, + { + "epoch": 1.1713708904590932, + "grad_norm": 2.6050353050231934, + "learning_rate": 3.673687072287392e-05, + "loss": 1.0005, + "step": 183350 + }, + { + "epoch": 1.1714347776088319, + "grad_norm": 0.4434867799282074, + "learning_rate": 3.6732032857163035e-05, + "loss": 0.7291, + "step": 183360 + }, + { + "epoch": 1.1714986647585706, + "grad_norm": 1.2048684358596802, + "learning_rate": 3.672719512507206e-05, + "loss": 0.6699, + "step": 183370 + }, + { + "epoch": 1.171562551908309, + "grad_norm": 0.9108380079269409, + "learning_rate": 3.672235752664971e-05, + "loss": 0.8089, + "step": 183380 + }, + { + "epoch": 1.171626439058048, + "grad_norm": 0.8623518943786621, + "learning_rate": 3.671752006194471e-05, + "loss": 0.7186, + "step": 183390 + }, + { + "epoch": 1.1716903262077865, + "grad_norm": 0.6611031293869019, + "learning_rate": 3.6712682731005774e-05, + "loss": 0.9761, + 
"step": 183400 + }, + { + "epoch": 1.1717542133575254, + "grad_norm": 0.9480814337730408, + "learning_rate": 3.6707845533881605e-05, + "loss": 0.7531, + "step": 183410 + }, + { + "epoch": 1.1718181005072639, + "grad_norm": 0.6748718023300171, + "learning_rate": 3.670300847062094e-05, + "loss": 0.9506, + "step": 183420 + }, + { + "epoch": 1.1718819876570026, + "grad_norm": 1.0344165563583374, + "learning_rate": 3.6698171541272486e-05, + "loss": 0.7988, + "step": 183430 + }, + { + "epoch": 1.1719458748067413, + "grad_norm": 3.343217134475708, + "learning_rate": 3.669333474588496e-05, + "loss": 1.0061, + "step": 183440 + }, + { + "epoch": 1.17200976195648, + "grad_norm": 0.6782224178314209, + "learning_rate": 3.668849808450705e-05, + "loss": 0.9939, + "step": 183450 + }, + { + "epoch": 1.1720736491062187, + "grad_norm": 2.6960456371307373, + "learning_rate": 3.668366155718749e-05, + "loss": 0.8139, + "step": 183460 + }, + { + "epoch": 1.1721375362559574, + "grad_norm": 0.6687380075454712, + "learning_rate": 3.6678825163974974e-05, + "loss": 0.8677, + "step": 183470 + }, + { + "epoch": 1.172201423405696, + "grad_norm": 0.8012658953666687, + "learning_rate": 3.667398890491821e-05, + "loss": 0.862, + "step": 183480 + }, + { + "epoch": 1.1722653105554348, + "grad_norm": 0.8338684439659119, + "learning_rate": 3.6669152780065906e-05, + "loss": 0.641, + "step": 183490 + }, + { + "epoch": 1.1723291977051735, + "grad_norm": 1.8749988079071045, + "learning_rate": 3.6664316789466777e-05, + "loss": 1.1108, + "step": 183500 + }, + { + "epoch": 1.1723930848549122, + "grad_norm": 1.0634840726852417, + "learning_rate": 3.665948093316951e-05, + "loss": 1.0941, + "step": 183510 + }, + { + "epoch": 1.172456972004651, + "grad_norm": 0.8182593584060669, + "learning_rate": 3.6654645211222806e-05, + "loss": 0.8858, + "step": 183520 + }, + { + "epoch": 1.1725208591543896, + "grad_norm": 0.7956867814064026, + "learning_rate": 3.664980962367538e-05, + "loss": 0.8432, + "step": 183530 + }, + { 
+ "epoch": 1.1725847463041283, + "grad_norm": 1.3294975757598877, + "learning_rate": 3.664497417057591e-05, + "loss": 1.1343, + "step": 183540 + }, + { + "epoch": 1.172648633453867, + "grad_norm": 1.4603157043457031, + "learning_rate": 3.6640138851973113e-05, + "loss": 1.0972, + "step": 183550 + }, + { + "epoch": 1.1727125206036058, + "grad_norm": 0.9470663070678711, + "learning_rate": 3.663530366791567e-05, + "loss": 0.9428, + "step": 183560 + }, + { + "epoch": 1.1727764077533445, + "grad_norm": 0.9879772067070007, + "learning_rate": 3.6630468618452284e-05, + "loss": 0.7917, + "step": 183570 + }, + { + "epoch": 1.1728402949030832, + "grad_norm": 0.8244022727012634, + "learning_rate": 3.662563370363164e-05, + "loss": 0.9972, + "step": 183580 + }, + { + "epoch": 1.1729041820528219, + "grad_norm": 0.7531734108924866, + "learning_rate": 3.662079892350244e-05, + "loss": 0.8495, + "step": 183590 + }, + { + "epoch": 1.1729680692025606, + "grad_norm": 0.9825856685638428, + "learning_rate": 3.6615964278113366e-05, + "loss": 1.0305, + "step": 183600 + }, + { + "epoch": 1.1730319563522993, + "grad_norm": 0.920281171798706, + "learning_rate": 3.6611129767513134e-05, + "loss": 0.8422, + "step": 183610 + }, + { + "epoch": 1.173095843502038, + "grad_norm": 0.9334577322006226, + "learning_rate": 3.6606295391750375e-05, + "loss": 0.8659, + "step": 183620 + }, + { + "epoch": 1.1731597306517767, + "grad_norm": 0.7163191437721252, + "learning_rate": 3.6601461150873825e-05, + "loss": 0.9231, + "step": 183630 + }, + { + "epoch": 1.1732236178015154, + "grad_norm": 0.898003876209259, + "learning_rate": 3.659662704493215e-05, + "loss": 0.8257, + "step": 183640 + }, + { + "epoch": 1.1732875049512541, + "grad_norm": 1.5005582571029663, + "learning_rate": 3.6591793073974035e-05, + "loss": 0.8273, + "step": 183650 + }, + { + "epoch": 1.1733513921009928, + "grad_norm": 1.2959163188934326, + "learning_rate": 3.658695923804816e-05, + "loss": 1.1756, + "step": 183660 + }, + { + "epoch": 
1.1734152792507315, + "grad_norm": 1.2547670602798462, + "learning_rate": 3.6582125537203215e-05, + "loss": 0.7682, + "step": 183670 + }, + { + "epoch": 1.1734791664004702, + "grad_norm": 2.360750913619995, + "learning_rate": 3.657729197148787e-05, + "loss": 0.7794, + "step": 183680 + }, + { + "epoch": 1.173543053550209, + "grad_norm": 0.991039514541626, + "learning_rate": 3.657245854095081e-05, + "loss": 0.8745, + "step": 183690 + }, + { + "epoch": 1.1736069406999476, + "grad_norm": 1.098197340965271, + "learning_rate": 3.656762524564071e-05, + "loss": 0.926, + "step": 183700 + }, + { + "epoch": 1.1736708278496863, + "grad_norm": 0.8927314281463623, + "learning_rate": 3.656279208560624e-05, + "loss": 0.7737, + "step": 183710 + }, + { + "epoch": 1.173734714999425, + "grad_norm": 1.753037691116333, + "learning_rate": 3.655795906089608e-05, + "loss": 0.8657, + "step": 183720 + }, + { + "epoch": 1.1737986021491638, + "grad_norm": 1.0958718061447144, + "learning_rate": 3.655312617155889e-05, + "loss": 0.8419, + "step": 183730 + }, + { + "epoch": 1.1738624892989025, + "grad_norm": 0.8453962206840515, + "learning_rate": 3.654829341764336e-05, + "loss": 0.6998, + "step": 183740 + }, + { + "epoch": 1.1739263764486412, + "grad_norm": 0.9455485939979553, + "learning_rate": 3.654346079919816e-05, + "loss": 0.7942, + "step": 183750 + }, + { + "epoch": 1.1739902635983799, + "grad_norm": 1.053681492805481, + "learning_rate": 3.653862831627195e-05, + "loss": 1.0206, + "step": 183760 + }, + { + "epoch": 1.1740541507481186, + "grad_norm": 1.9257044792175293, + "learning_rate": 3.65337959689134e-05, + "loss": 0.8839, + "step": 183770 + }, + { + "epoch": 1.1741180378978573, + "grad_norm": 0.8294272422790527, + "learning_rate": 3.6528963757171175e-05, + "loss": 0.8785, + "step": 183780 + }, + { + "epoch": 1.174181925047596, + "grad_norm": 0.706585168838501, + "learning_rate": 3.652413168109393e-05, + "loss": 0.6012, + "step": 183790 + }, + { + "epoch": 1.1742458121973347, + 
"grad_norm": 0.9635806083679199, + "learning_rate": 3.6519299740730345e-05, + "loss": 1.0587, + "step": 183800 + }, + { + "epoch": 1.1743096993470734, + "grad_norm": 1.0544793605804443, + "learning_rate": 3.651446793612907e-05, + "loss": 0.8475, + "step": 183810 + }, + { + "epoch": 1.1743735864968121, + "grad_norm": 0.8509196639060974, + "learning_rate": 3.6509636267338776e-05, + "loss": 0.9506, + "step": 183820 + }, + { + "epoch": 1.1744374736465508, + "grad_norm": 0.7948233485221863, + "learning_rate": 3.650480473440811e-05, + "loss": 0.6293, + "step": 183830 + }, + { + "epoch": 1.1745013607962895, + "grad_norm": 0.9166902899742126, + "learning_rate": 3.649997333738574e-05, + "loss": 0.8128, + "step": 183840 + }, + { + "epoch": 1.174565247946028, + "grad_norm": 1.2377371788024902, + "learning_rate": 3.649514207632031e-05, + "loss": 0.887, + "step": 183850 + }, + { + "epoch": 1.174629135095767, + "grad_norm": 1.338071346282959, + "learning_rate": 3.6490310951260486e-05, + "loss": 1.0392, + "step": 183860 + }, + { + "epoch": 1.1746930222455054, + "grad_norm": 1.4467830657958984, + "learning_rate": 3.648547996225492e-05, + "loss": 1.032, + "step": 183870 + }, + { + "epoch": 1.1747569093952444, + "grad_norm": 0.8257285952568054, + "learning_rate": 3.6480649109352264e-05, + "loss": 0.833, + "step": 183880 + }, + { + "epoch": 1.1748207965449828, + "grad_norm": 1.4109437465667725, + "learning_rate": 3.6475818392601163e-05, + "loss": 0.847, + "step": 183890 + }, + { + "epoch": 1.1748846836947218, + "grad_norm": 0.6960985660552979, + "learning_rate": 3.647098781205027e-05, + "loss": 1.0269, + "step": 183900 + }, + { + "epoch": 1.1749485708444602, + "grad_norm": 0.9446884989738464, + "learning_rate": 3.646615736774824e-05, + "loss": 0.8225, + "step": 183910 + }, + { + "epoch": 1.175012457994199, + "grad_norm": 1.0841772556304932, + "learning_rate": 3.646132705974371e-05, + "loss": 0.9256, + "step": 183920 + }, + { + "epoch": 1.1750763451439377, + "grad_norm": 
0.8433025479316711, + "learning_rate": 3.645649688808532e-05, + "loss": 0.7972, + "step": 183930 + }, + { + "epoch": 1.1751402322936764, + "grad_norm": 0.9971235990524292, + "learning_rate": 3.645166685282173e-05, + "loss": 0.8408, + "step": 183940 + }, + { + "epoch": 1.175204119443415, + "grad_norm": 0.9026851058006287, + "learning_rate": 3.6446836954001584e-05, + "loss": 1.3246, + "step": 183950 + }, + { + "epoch": 1.1752680065931538, + "grad_norm": 1.0484285354614258, + "learning_rate": 3.6442007191673514e-05, + "loss": 0.8291, + "step": 183960 + }, + { + "epoch": 1.1753318937428925, + "grad_norm": 0.8531259894371033, + "learning_rate": 3.643717756588615e-05, + "loss": 0.7325, + "step": 183970 + }, + { + "epoch": 1.1753957808926312, + "grad_norm": 0.6556183099746704, + "learning_rate": 3.643234807668815e-05, + "loss": 0.8424, + "step": 183980 + }, + { + "epoch": 1.17545966804237, + "grad_norm": 1.1358847618103027, + "learning_rate": 3.642751872412814e-05, + "loss": 0.7372, + "step": 183990 + }, + { + "epoch": 1.1755235551921086, + "grad_norm": 0.8008434176445007, + "learning_rate": 3.642268950825476e-05, + "loss": 0.7901, + "step": 184000 + }, + { + "epoch": 1.1755874423418473, + "grad_norm": 1.2002679109573364, + "learning_rate": 3.6417860429116635e-05, + "loss": 1.1483, + "step": 184010 + }, + { + "epoch": 1.175651329491586, + "grad_norm": 0.8582051992416382, + "learning_rate": 3.6413031486762415e-05, + "loss": 0.7933, + "step": 184020 + }, + { + "epoch": 1.1757152166413247, + "grad_norm": 0.6789759993553162, + "learning_rate": 3.640820268124072e-05, + "loss": 0.8829, + "step": 184030 + }, + { + "epoch": 1.1757791037910634, + "grad_norm": 0.9038664698600769, + "learning_rate": 3.6403374012600184e-05, + "loss": 0.7159, + "step": 184040 + }, + { + "epoch": 1.1758429909408021, + "grad_norm": 0.4721551239490509, + "learning_rate": 3.6398545480889434e-05, + "loss": 0.8584, + "step": 184050 + }, + { + "epoch": 1.1759068780905408, + "grad_norm": 1.1332751512527466, + 
"learning_rate": 3.63937170861571e-05, + "loss": 0.709, + "step": 184060 + }, + { + "epoch": 1.1759707652402795, + "grad_norm": 1.0257160663604736, + "learning_rate": 3.6388888828451796e-05, + "loss": 1.1302, + "step": 184070 + }, + { + "epoch": 1.1760346523900183, + "grad_norm": 0.7956812381744385, + "learning_rate": 3.638406070782217e-05, + "loss": 1.0162, + "step": 184080 + }, + { + "epoch": 1.176098539539757, + "grad_norm": 1.498618245124817, + "learning_rate": 3.637923272431682e-05, + "loss": 0.7363, + "step": 184090 + }, + { + "epoch": 1.1761624266894957, + "grad_norm": 0.9680002331733704, + "learning_rate": 3.63744048779844e-05, + "loss": 1.1146, + "step": 184100 + }, + { + "epoch": 1.1762263138392344, + "grad_norm": 0.9070209860801697, + "learning_rate": 3.636957716887349e-05, + "loss": 0.5749, + "step": 184110 + }, + { + "epoch": 1.176290200988973, + "grad_norm": 0.7833470702171326, + "learning_rate": 3.636474959703274e-05, + "loss": 1.0808, + "step": 184120 + }, + { + "epoch": 1.1763540881387118, + "grad_norm": 0.7628843188285828, + "learning_rate": 3.635992216251075e-05, + "loss": 0.7796, + "step": 184130 + }, + { + "epoch": 1.1764179752884505, + "grad_norm": 1.3704489469528198, + "learning_rate": 3.635509486535615e-05, + "loss": 0.9249, + "step": 184140 + }, + { + "epoch": 1.1764818624381892, + "grad_norm": 1.166788101196289, + "learning_rate": 3.6350267705617544e-05, + "loss": 0.7454, + "step": 184150 + }, + { + "epoch": 1.176545749587928, + "grad_norm": 1.0495282411575317, + "learning_rate": 3.6345440683343555e-05, + "loss": 1.0876, + "step": 184160 + }, + { + "epoch": 1.1766096367376666, + "grad_norm": 1.9336224794387817, + "learning_rate": 3.6340613798582796e-05, + "loss": 0.7674, + "step": 184170 + }, + { + "epoch": 1.1766735238874053, + "grad_norm": 1.1545357704162598, + "learning_rate": 3.633578705138386e-05, + "loss": 1.0439, + "step": 184180 + }, + { + "epoch": 1.176737411037144, + "grad_norm": 0.904642641544342, + "learning_rate": 
3.633096044179538e-05, + "loss": 0.959, + "step": 184190 + }, + { + "epoch": 1.1768012981868827, + "grad_norm": 1.4052455425262451, + "learning_rate": 3.632613396986595e-05, + "loss": 1.0384, + "step": 184200 + }, + { + "epoch": 1.1768651853366214, + "grad_norm": 0.9831133484840393, + "learning_rate": 3.6321307635644186e-05, + "loss": 0.9644, + "step": 184210 + }, + { + "epoch": 1.1769290724863601, + "grad_norm": 0.7129910588264465, + "learning_rate": 3.631648143917869e-05, + "loss": 0.7711, + "step": 184220 + }, + { + "epoch": 1.1769929596360988, + "grad_norm": 1.238707423210144, + "learning_rate": 3.631165538051805e-05, + "loss": 0.8986, + "step": 184230 + }, + { + "epoch": 1.1770568467858376, + "grad_norm": 3.398481607437134, + "learning_rate": 3.630682945971089e-05, + "loss": 0.7843, + "step": 184240 + }, + { + "epoch": 1.1771207339355763, + "grad_norm": 0.990300714969635, + "learning_rate": 3.630200367680581e-05, + "loss": 0.9905, + "step": 184250 + }, + { + "epoch": 1.177184621085315, + "grad_norm": 0.805274248123169, + "learning_rate": 3.6297178031851375e-05, + "loss": 0.852, + "step": 184260 + }, + { + "epoch": 1.1772485082350537, + "grad_norm": 1.4010913372039795, + "learning_rate": 3.629235252489624e-05, + "loss": 0.8368, + "step": 184270 + }, + { + "epoch": 1.1773123953847924, + "grad_norm": 0.9194250106811523, + "learning_rate": 3.6287527155988966e-05, + "loss": 0.792, + "step": 184280 + }, + { + "epoch": 1.177376282534531, + "grad_norm": 1.0088683366775513, + "learning_rate": 3.628270192517816e-05, + "loss": 1.0042, + "step": 184290 + }, + { + "epoch": 1.1774401696842698, + "grad_norm": 0.8841618895530701, + "learning_rate": 3.6277876832512405e-05, + "loss": 0.9673, + "step": 184300 + }, + { + "epoch": 1.1775040568340085, + "grad_norm": 0.8211191892623901, + "learning_rate": 3.627305187804031e-05, + "loss": 1.0045, + "step": 184310 + }, + { + "epoch": 1.1775679439837472, + "grad_norm": 1.2937853336334229, + "learning_rate": 3.6268227061810454e-05, + 
"loss": 0.8511, + "step": 184320 + }, + { + "epoch": 1.177631831133486, + "grad_norm": 1.1388708353042603, + "learning_rate": 3.626340238387144e-05, + "loss": 0.9739, + "step": 184330 + }, + { + "epoch": 1.1776957182832244, + "grad_norm": 0.8496128916740417, + "learning_rate": 3.625857784427183e-05, + "loss": 1.0829, + "step": 184340 + }, + { + "epoch": 1.1777596054329633, + "grad_norm": 0.6666727066040039, + "learning_rate": 3.625375344306025e-05, + "loss": 1.1618, + "step": 184350 + }, + { + "epoch": 1.1778234925827018, + "grad_norm": 0.7936133146286011, + "learning_rate": 3.6248929180285254e-05, + "loss": 0.7061, + "step": 184360 + }, + { + "epoch": 1.1778873797324407, + "grad_norm": 0.8399503827095032, + "learning_rate": 3.624410505599544e-05, + "loss": 0.7688, + "step": 184370 + }, + { + "epoch": 1.1779512668821792, + "grad_norm": 0.7703272104263306, + "learning_rate": 3.623928107023938e-05, + "loss": 1.1013, + "step": 184380 + }, + { + "epoch": 1.1780151540319181, + "grad_norm": 1.9948519468307495, + "learning_rate": 3.623445722306567e-05, + "loss": 0.8829, + "step": 184390 + }, + { + "epoch": 1.1780790411816566, + "grad_norm": 0.795413076877594, + "learning_rate": 3.6229633514522886e-05, + "loss": 0.9189, + "step": 184400 + }, + { + "epoch": 1.1781429283313953, + "grad_norm": 1.251691460609436, + "learning_rate": 3.6224809944659604e-05, + "loss": 0.9461, + "step": 184410 + }, + { + "epoch": 1.178206815481134, + "grad_norm": 0.6770833134651184, + "learning_rate": 3.621998651352441e-05, + "loss": 0.9295, + "step": 184420 + }, + { + "epoch": 1.1782707026308727, + "grad_norm": 1.9652451276779175, + "learning_rate": 3.621516322116586e-05, + "loss": 0.8064, + "step": 184430 + }, + { + "epoch": 1.1783345897806115, + "grad_norm": 0.9045741558074951, + "learning_rate": 3.6210340067632556e-05, + "loss": 0.9916, + "step": 184440 + }, + { + "epoch": 1.1783984769303502, + "grad_norm": 0.6965897083282471, + "learning_rate": 3.6205517052973045e-05, + "loss": 0.901, + 
"step": 184450 + }, + { + "epoch": 1.1784623640800889, + "grad_norm": 0.9617449641227722, + "learning_rate": 3.62006941772359e-05, + "loss": 0.8419, + "step": 184460 + }, + { + "epoch": 1.1785262512298276, + "grad_norm": 0.9782170057296753, + "learning_rate": 3.6195871440469734e-05, + "loss": 1.013, + "step": 184470 + }, + { + "epoch": 1.1785901383795663, + "grad_norm": 1.2114006280899048, + "learning_rate": 3.6191048842723065e-05, + "loss": 1.045, + "step": 184480 + }, + { + "epoch": 1.178654025529305, + "grad_norm": 0.7640635371208191, + "learning_rate": 3.618622638404449e-05, + "loss": 0.7731, + "step": 184490 + }, + { + "epoch": 1.1787179126790437, + "grad_norm": 0.8108700513839722, + "learning_rate": 3.618140406448256e-05, + "loss": 1.1081, + "step": 184500 + }, + { + "epoch": 1.1787817998287824, + "grad_norm": 1.1722303628921509, + "learning_rate": 3.6176581884085844e-05, + "loss": 0.6224, + "step": 184510 + }, + { + "epoch": 1.178845686978521, + "grad_norm": 0.7348328828811646, + "learning_rate": 3.6171759842902916e-05, + "loss": 1.0012, + "step": 184520 + }, + { + "epoch": 1.1789095741282598, + "grad_norm": 1.2766432762145996, + "learning_rate": 3.616693794098233e-05, + "loss": 1.2396, + "step": 184530 + }, + { + "epoch": 1.1789734612779985, + "grad_norm": 2.398635149002075, + "learning_rate": 3.616211617837264e-05, + "loss": 0.5746, + "step": 184540 + }, + { + "epoch": 1.1790373484277372, + "grad_norm": 0.6826220154762268, + "learning_rate": 3.6157294555122406e-05, + "loss": 0.8408, + "step": 184550 + }, + { + "epoch": 1.179101235577476, + "grad_norm": 1.0390657186508179, + "learning_rate": 3.6152473071280204e-05, + "loss": 0.6888, + "step": 184560 + }, + { + "epoch": 1.1791651227272146, + "grad_norm": 2.681307792663574, + "learning_rate": 3.614765172689456e-05, + "loss": 0.7737, + "step": 184570 + }, + { + "epoch": 1.1792290098769533, + "grad_norm": 0.9121163487434387, + "learning_rate": 3.614283052201408e-05, + "loss": 0.6505, + "step": 184580 + }, + { + 
"epoch": 1.179292897026692, + "grad_norm": 0.9245350956916809, + "learning_rate": 3.613800945668726e-05, + "loss": 0.8225, + "step": 184590 + }, + { + "epoch": 1.1793567841764308, + "grad_norm": 0.9262069463729858, + "learning_rate": 3.613318853096268e-05, + "loss": 0.9348, + "step": 184600 + }, + { + "epoch": 1.1794206713261695, + "grad_norm": 0.5917856693267822, + "learning_rate": 3.612836774488889e-05, + "loss": 1.1775, + "step": 184610 + }, + { + "epoch": 1.1794845584759082, + "grad_norm": 0.7416051030158997, + "learning_rate": 3.612354709851444e-05, + "loss": 0.8081, + "step": 184620 + }, + { + "epoch": 1.1795484456256469, + "grad_norm": 1.2204169034957886, + "learning_rate": 3.611872659188787e-05, + "loss": 0.9868, + "step": 184630 + }, + { + "epoch": 1.1796123327753856, + "grad_norm": 1.0252538919448853, + "learning_rate": 3.6113906225057735e-05, + "loss": 1.1028, + "step": 184640 + }, + { + "epoch": 1.1796762199251243, + "grad_norm": 0.9787299633026123, + "learning_rate": 3.610908599807258e-05, + "loss": 0.9507, + "step": 184650 + }, + { + "epoch": 1.179740107074863, + "grad_norm": 1.029250979423523, + "learning_rate": 3.6104265910980936e-05, + "loss": 0.8878, + "step": 184660 + }, + { + "epoch": 1.1798039942246017, + "grad_norm": 2.0720760822296143, + "learning_rate": 3.609944596383137e-05, + "loss": 0.8151, + "step": 184670 + }, + { + "epoch": 1.1798678813743404, + "grad_norm": 1.1255959272384644, + "learning_rate": 3.6094626156672394e-05, + "loss": 0.7925, + "step": 184680 + }, + { + "epoch": 1.179931768524079, + "grad_norm": 0.9386995434761047, + "learning_rate": 3.6089806489552576e-05, + "loss": 1.0739, + "step": 184690 + }, + { + "epoch": 1.1799956556738178, + "grad_norm": 1.8800048828125, + "learning_rate": 3.6084986962520434e-05, + "loss": 0.9886, + "step": 184700 + }, + { + "epoch": 1.1800595428235565, + "grad_norm": 0.7651383280754089, + "learning_rate": 3.608016757562451e-05, + "loss": 0.859, + "step": 184710 + }, + { + "epoch": 
1.1801234299732952, + "grad_norm": 0.9939292073249817, + "learning_rate": 3.6075348328913344e-05, + "loss": 0.9842, + "step": 184720 + }, + { + "epoch": 1.180187317123034, + "grad_norm": 0.6939488649368286, + "learning_rate": 3.6070529222435466e-05, + "loss": 1.0664, + "step": 184730 + }, + { + "epoch": 1.1802512042727726, + "grad_norm": 0.9182581305503845, + "learning_rate": 3.606571025623941e-05, + "loss": 0.9928, + "step": 184740 + }, + { + "epoch": 1.1803150914225113, + "grad_norm": 1.1776940822601318, + "learning_rate": 3.606089143037371e-05, + "loss": 0.8528, + "step": 184750 + }, + { + "epoch": 1.18037897857225, + "grad_norm": 0.787538468837738, + "learning_rate": 3.6056072744886885e-05, + "loss": 0.8125, + "step": 184760 + }, + { + "epoch": 1.1804428657219888, + "grad_norm": 1.208063006401062, + "learning_rate": 3.605125419982748e-05, + "loss": 0.7899, + "step": 184770 + }, + { + "epoch": 1.1805067528717275, + "grad_norm": 0.7524814605712891, + "learning_rate": 3.6046435795243994e-05, + "loss": 0.7772, + "step": 184780 + }, + { + "epoch": 1.1805706400214662, + "grad_norm": 0.9039172530174255, + "learning_rate": 3.604161753118498e-05, + "loss": 0.8546, + "step": 184790 + }, + { + "epoch": 1.1806345271712049, + "grad_norm": 0.9902855753898621, + "learning_rate": 3.6036799407698964e-05, + "loss": 0.7876, + "step": 184800 + }, + { + "epoch": 1.1806984143209436, + "grad_norm": 0.6805626749992371, + "learning_rate": 3.603198142483445e-05, + "loss": 0.6933, + "step": 184810 + }, + { + "epoch": 1.1807623014706823, + "grad_norm": 1.1463366746902466, + "learning_rate": 3.6027163582639966e-05, + "loss": 0.7168, + "step": 184820 + }, + { + "epoch": 1.1808261886204208, + "grad_norm": 1.0004279613494873, + "learning_rate": 3.602234588116403e-05, + "loss": 0.7084, + "step": 184830 + }, + { + "epoch": 1.1808900757701597, + "grad_norm": 0.9166737198829651, + "learning_rate": 3.601752832045517e-05, + "loss": 0.6373, + "step": 184840 + }, + { + "epoch": 1.1809539629198982, + 
"grad_norm": 1.7508405447006226, + "learning_rate": 3.6012710900561895e-05, + "loss": 0.673, + "step": 184850 + }, + { + "epoch": 1.181017850069637, + "grad_norm": 2.30367374420166, + "learning_rate": 3.6007893621532725e-05, + "loss": 0.8872, + "step": 184860 + }, + { + "epoch": 1.1810817372193756, + "grad_norm": 0.9799571633338928, + "learning_rate": 3.6003076483416166e-05, + "loss": 1.0567, + "step": 184870 + }, + { + "epoch": 1.1811456243691143, + "grad_norm": 1.3801004886627197, + "learning_rate": 3.5998259486260736e-05, + "loss": 0.9645, + "step": 184880 + }, + { + "epoch": 1.181209511518853, + "grad_norm": 3.1836953163146973, + "learning_rate": 3.5993442630114946e-05, + "loss": 0.7476, + "step": 184890 + }, + { + "epoch": 1.1812733986685917, + "grad_norm": 1.1182852983474731, + "learning_rate": 3.5988625915027316e-05, + "loss": 0.7176, + "step": 184900 + }, + { + "epoch": 1.1813372858183304, + "grad_norm": 1.3729959726333618, + "learning_rate": 3.5983809341046334e-05, + "loss": 0.769, + "step": 184910 + }, + { + "epoch": 1.1814011729680691, + "grad_norm": 1.1916794776916504, + "learning_rate": 3.597899290822052e-05, + "loss": 0.6671, + "step": 184920 + }, + { + "epoch": 1.1814650601178078, + "grad_norm": 0.9152897000312805, + "learning_rate": 3.597417661659838e-05, + "loss": 0.7897, + "step": 184930 + }, + { + "epoch": 1.1815289472675465, + "grad_norm": 0.7210372686386108, + "learning_rate": 3.596936046622841e-05, + "loss": 0.7957, + "step": 184940 + }, + { + "epoch": 1.1815928344172852, + "grad_norm": 1.240045189857483, + "learning_rate": 3.596454445715912e-05, + "loss": 0.872, + "step": 184950 + }, + { + "epoch": 1.181656721567024, + "grad_norm": 0.7053720355033875, + "learning_rate": 3.595972858943901e-05, + "loss": 0.8205, + "step": 184960 + }, + { + "epoch": 1.1817206087167627, + "grad_norm": 0.6720781326293945, + "learning_rate": 3.595491286311659e-05, + "loss": 1.0007, + "step": 184970 + }, + { + "epoch": 1.1817844958665014, + "grad_norm": 
0.8549431562423706, + "learning_rate": 3.595009727824033e-05, + "loss": 0.848, + "step": 184980 + }, + { + "epoch": 1.18184838301624, + "grad_norm": 1.6900193691253662, + "learning_rate": 3.5945281834858744e-05, + "loss": 1.2726, + "step": 184990 + }, + { + "epoch": 1.1819122701659788, + "grad_norm": 0.8604527115821838, + "learning_rate": 3.5940466533020344e-05, + "loss": 0.8544, + "step": 185000 + }, + { + "epoch": 1.1819761573157175, + "grad_norm": 1.0963983535766602, + "learning_rate": 3.5935651372773604e-05, + "loss": 0.7911, + "step": 185010 + }, + { + "epoch": 1.1820400444654562, + "grad_norm": 1.3671964406967163, + "learning_rate": 3.5930836354167017e-05, + "loss": 0.7819, + "step": 185020 + }, + { + "epoch": 1.182103931615195, + "grad_norm": 0.5053865909576416, + "learning_rate": 3.592602147724909e-05, + "loss": 0.839, + "step": 185030 + }, + { + "epoch": 1.1821678187649336, + "grad_norm": 0.7368679046630859, + "learning_rate": 3.59212067420683e-05, + "loss": 0.8822, + "step": 185040 + }, + { + "epoch": 1.1822317059146723, + "grad_norm": 1.082725167274475, + "learning_rate": 3.591639214867313e-05, + "loss": 0.813, + "step": 185050 + }, + { + "epoch": 1.182295593064411, + "grad_norm": 1.574652075767517, + "learning_rate": 3.591157769711209e-05, + "loss": 0.8479, + "step": 185060 + }, + { + "epoch": 1.1823594802141497, + "grad_norm": 0.8046184778213501, + "learning_rate": 3.5906763387433655e-05, + "loss": 0.8053, + "step": 185070 + }, + { + "epoch": 1.1824233673638884, + "grad_norm": 1.8920392990112305, + "learning_rate": 3.590194921968629e-05, + "loss": 0.847, + "step": 185080 + }, + { + "epoch": 1.1824872545136271, + "grad_norm": 2.120652675628662, + "learning_rate": 3.58971351939185e-05, + "loss": 0.938, + "step": 185090 + }, + { + "epoch": 1.1825511416633658, + "grad_norm": 0.6371691226959229, + "learning_rate": 3.589232131017875e-05, + "loss": 0.849, + "step": 185100 + }, + { + "epoch": 1.1826150288131045, + "grad_norm": 0.5732691287994385, + 
"learning_rate": 3.588750756851552e-05, + "loss": 0.9148, + "step": 185110 + }, + { + "epoch": 1.1826789159628432, + "grad_norm": 0.6342383623123169, + "learning_rate": 3.5882693968977315e-05, + "loss": 0.7332, + "step": 185120 + }, + { + "epoch": 1.182742803112582, + "grad_norm": 0.8432683348655701, + "learning_rate": 3.587788051161259e-05, + "loss": 0.7871, + "step": 185130 + }, + { + "epoch": 1.1828066902623207, + "grad_norm": 0.6797798871994019, + "learning_rate": 3.5873067196469824e-05, + "loss": 0.7023, + "step": 185140 + }, + { + "epoch": 1.1828705774120594, + "grad_norm": 0.8828827142715454, + "learning_rate": 3.5868254023597495e-05, + "loss": 0.9116, + "step": 185150 + }, + { + "epoch": 1.182934464561798, + "grad_norm": 0.8826804161071777, + "learning_rate": 3.5863440993044076e-05, + "loss": 1.0683, + "step": 185160 + }, + { + "epoch": 1.1829983517115368, + "grad_norm": 0.9741883277893066, + "learning_rate": 3.5858628104858036e-05, + "loss": 0.9385, + "step": 185170 + }, + { + "epoch": 1.1830622388612755, + "grad_norm": 0.9632142782211304, + "learning_rate": 3.585381535908784e-05, + "loss": 0.721, + "step": 185180 + }, + { + "epoch": 1.1831261260110142, + "grad_norm": 1.2238630056381226, + "learning_rate": 3.5849002755781967e-05, + "loss": 0.8154, + "step": 185190 + }, + { + "epoch": 1.183190013160753, + "grad_norm": 0.9553030133247375, + "learning_rate": 3.584419029498888e-05, + "loss": 0.819, + "step": 185200 + }, + { + "epoch": 1.1832539003104916, + "grad_norm": 1.1240880489349365, + "learning_rate": 3.583937797675704e-05, + "loss": 0.8524, + "step": 185210 + }, + { + "epoch": 1.1833177874602303, + "grad_norm": 1.025412678718567, + "learning_rate": 3.583456580113491e-05, + "loss": 1.0, + "step": 185220 + }, + { + "epoch": 1.183381674609969, + "grad_norm": 2.941777229309082, + "learning_rate": 3.582975376817096e-05, + "loss": 0.9068, + "step": 185230 + }, + { + "epoch": 1.1834455617597077, + "grad_norm": 0.9563724398612976, + "learning_rate": 
3.5824941877913656e-05, + "loss": 0.9853, + "step": 185240 + }, + { + "epoch": 1.1835094489094464, + "grad_norm": 0.7037461996078491, + "learning_rate": 3.582013013041144e-05, + "loss": 0.8759, + "step": 185250 + }, + { + "epoch": 1.1835733360591851, + "grad_norm": 0.9281525015830994, + "learning_rate": 3.581531852571278e-05, + "loss": 0.629, + "step": 185260 + }, + { + "epoch": 1.1836372232089238, + "grad_norm": 1.0594696998596191, + "learning_rate": 3.5810507063866147e-05, + "loss": 0.7937, + "step": 185270 + }, + { + "epoch": 1.1837011103586625, + "grad_norm": 1.3226358890533447, + "learning_rate": 3.5805695744919976e-05, + "loss": 0.8282, + "step": 185280 + }, + { + "epoch": 1.1837649975084013, + "grad_norm": 0.8504054546356201, + "learning_rate": 3.5800884568922724e-05, + "loss": 0.8351, + "step": 185290 + }, + { + "epoch": 1.18382888465814, + "grad_norm": 1.078147292137146, + "learning_rate": 3.5796073535922856e-05, + "loss": 1.1989, + "step": 185300 + }, + { + "epoch": 1.1838927718078787, + "grad_norm": 1.0209671258926392, + "learning_rate": 3.57912626459688e-05, + "loss": 0.7663, + "step": 185310 + }, + { + "epoch": 1.1839566589576171, + "grad_norm": 1.1877695322036743, + "learning_rate": 3.578645189910903e-05, + "loss": 0.8764, + "step": 185320 + }, + { + "epoch": 1.184020546107356, + "grad_norm": 1.095737099647522, + "learning_rate": 3.5781641295391995e-05, + "loss": 0.9265, + "step": 185330 + }, + { + "epoch": 1.1840844332570946, + "grad_norm": 0.9853129982948303, + "learning_rate": 3.577683083486613e-05, + "loss": 1.2863, + "step": 185340 + }, + { + "epoch": 1.1841483204068335, + "grad_norm": 0.9213379621505737, + "learning_rate": 3.577202051757987e-05, + "loss": 0.9187, + "step": 185350 + }, + { + "epoch": 1.184212207556572, + "grad_norm": 0.8555194735527039, + "learning_rate": 3.576721034358169e-05, + "loss": 0.7985, + "step": 185360 + }, + { + "epoch": 1.1842760947063107, + "grad_norm": 1.0572147369384766, + "learning_rate": 3.576240031292001e-05, + 
"loss": 0.9271, + "step": 185370 + }, + { + "epoch": 1.1843399818560494, + "grad_norm": 0.991568922996521, + "learning_rate": 3.5757590425643276e-05, + "loss": 0.894, + "step": 185380 + }, + { + "epoch": 1.184403869005788, + "grad_norm": 0.6871430277824402, + "learning_rate": 3.575278068179992e-05, + "loss": 0.9197, + "step": 185390 + }, + { + "epoch": 1.1844677561555268, + "grad_norm": 0.7673249244689941, + "learning_rate": 3.5747971081438394e-05, + "loss": 0.7462, + "step": 185400 + }, + { + "epoch": 1.1845316433052655, + "grad_norm": 1.055765986442566, + "learning_rate": 3.574316162460713e-05, + "loss": 0.785, + "step": 185410 + }, + { + "epoch": 1.1845955304550042, + "grad_norm": 1.6008297204971313, + "learning_rate": 3.5738352311354565e-05, + "loss": 0.8627, + "step": 185420 + }, + { + "epoch": 1.184659417604743, + "grad_norm": 1.0407480001449585, + "learning_rate": 3.573354314172912e-05, + "loss": 0.8712, + "step": 185430 + }, + { + "epoch": 1.1847233047544816, + "grad_norm": 0.9765558242797852, + "learning_rate": 3.572873411577925e-05, + "loss": 0.9753, + "step": 185440 + }, + { + "epoch": 1.1847871919042203, + "grad_norm": 0.7692517042160034, + "learning_rate": 3.572392523355337e-05, + "loss": 0.8015, + "step": 185450 + }, + { + "epoch": 1.184851079053959, + "grad_norm": 1.113476276397705, + "learning_rate": 3.571911649509991e-05, + "loss": 0.8312, + "step": 185460 + }, + { + "epoch": 1.1849149662036977, + "grad_norm": 0.9064470529556274, + "learning_rate": 3.5714307900467306e-05, + "loss": 0.8722, + "step": 185470 + }, + { + "epoch": 1.1849788533534364, + "grad_norm": 0.764999508857727, + "learning_rate": 3.570949944970397e-05, + "loss": 0.7575, + "step": 185480 + }, + { + "epoch": 1.1850427405031752, + "grad_norm": 0.7299469709396362, + "learning_rate": 3.570469114285835e-05, + "loss": 1.0981, + "step": 185490 + }, + { + "epoch": 1.1851066276529139, + "grad_norm": 0.8814761638641357, + "learning_rate": 3.569988297997885e-05, + "loss": 1.0155, + "step": 
185500 + }, + { + "epoch": 1.1851705148026526, + "grad_norm": 0.8254292607307434, + "learning_rate": 3.5695074961113905e-05, + "loss": 0.9247, + "step": 185510 + }, + { + "epoch": 1.1852344019523913, + "grad_norm": 1.3654801845550537, + "learning_rate": 3.5690267086311915e-05, + "loss": 0.7811, + "step": 185520 + }, + { + "epoch": 1.18529828910213, + "grad_norm": 0.8215779066085815, + "learning_rate": 3.568545935562133e-05, + "loss": 0.9981, + "step": 185530 + }, + { + "epoch": 1.1853621762518687, + "grad_norm": 0.9064726233482361, + "learning_rate": 3.568065176909055e-05, + "loss": 0.9709, + "step": 185540 + }, + { + "epoch": 1.1854260634016074, + "grad_norm": 0.8759022355079651, + "learning_rate": 3.567584432676799e-05, + "loss": 0.9627, + "step": 185550 + }, + { + "epoch": 1.185489950551346, + "grad_norm": 0.8267679810523987, + "learning_rate": 3.5671037028702103e-05, + "loss": 0.6302, + "step": 185560 + }, + { + "epoch": 1.1855538377010848, + "grad_norm": 1.9346404075622559, + "learning_rate": 3.566622987494124e-05, + "loss": 1.0868, + "step": 185570 + }, + { + "epoch": 1.1856177248508235, + "grad_norm": 0.7702456712722778, + "learning_rate": 3.5661422865533845e-05, + "loss": 0.9252, + "step": 185580 + }, + { + "epoch": 1.1856816120005622, + "grad_norm": 1.2653367519378662, + "learning_rate": 3.565661600052833e-05, + "loss": 0.7676, + "step": 185590 + }, + { + "epoch": 1.185745499150301, + "grad_norm": 1.1973912715911865, + "learning_rate": 3.5651809279973094e-05, + "loss": 1.0397, + "step": 185600 + }, + { + "epoch": 1.1858093863000396, + "grad_norm": 0.9906696081161499, + "learning_rate": 3.564700270391655e-05, + "loss": 0.6995, + "step": 185610 + }, + { + "epoch": 1.1858732734497783, + "grad_norm": 0.7793400287628174, + "learning_rate": 3.5642196272407115e-05, + "loss": 0.9983, + "step": 185620 + }, + { + "epoch": 1.185937160599517, + "grad_norm": 0.97843337059021, + "learning_rate": 3.5637389985493176e-05, + "loss": 0.9133, + "step": 185630 + }, + { + 
"epoch": 1.1860010477492557, + "grad_norm": 0.5301520824432373, + "learning_rate": 3.5632583843223144e-05, + "loss": 0.993, + "step": 185640 + }, + { + "epoch": 1.1860649348989944, + "grad_norm": 0.8656491637229919, + "learning_rate": 3.562777784564543e-05, + "loss": 0.8884, + "step": 185650 + }, + { + "epoch": 1.1861288220487332, + "grad_norm": 0.8169345855712891, + "learning_rate": 3.5622971992808424e-05, + "loss": 0.8378, + "step": 185660 + }, + { + "epoch": 1.1861927091984719, + "grad_norm": 0.7268931865692139, + "learning_rate": 3.561816628476053e-05, + "loss": 0.9106, + "step": 185670 + }, + { + "epoch": 1.1862565963482106, + "grad_norm": 0.6114735007286072, + "learning_rate": 3.5613360721550136e-05, + "loss": 0.6921, + "step": 185680 + }, + { + "epoch": 1.1863204834979493, + "grad_norm": 1.0138870477676392, + "learning_rate": 3.560855530322565e-05, + "loss": 1.2098, + "step": 185690 + }, + { + "epoch": 1.186384370647688, + "grad_norm": 2.26399564743042, + "learning_rate": 3.560375002983547e-05, + "loss": 0.9302, + "step": 185700 + }, + { + "epoch": 1.1864482577974267, + "grad_norm": 1.1107441186904907, + "learning_rate": 3.5598944901427976e-05, + "loss": 1.0999, + "step": 185710 + }, + { + "epoch": 1.1865121449471654, + "grad_norm": 0.8785502910614014, + "learning_rate": 3.559413991805157e-05, + "loss": 0.927, + "step": 185720 + }, + { + "epoch": 1.186576032096904, + "grad_norm": 1.5931460857391357, + "learning_rate": 3.558933507975463e-05, + "loss": 0.8109, + "step": 185730 + }, + { + "epoch": 1.1866399192466428, + "grad_norm": 0.8328737020492554, + "learning_rate": 3.558453038658556e-05, + "loss": 0.9513, + "step": 185740 + }, + { + "epoch": 1.1867038063963815, + "grad_norm": 0.948833167552948, + "learning_rate": 3.5579725838592734e-05, + "loss": 0.804, + "step": 185750 + }, + { + "epoch": 1.1867676935461202, + "grad_norm": 1.2792750597000122, + "learning_rate": 3.557492143582454e-05, + "loss": 0.9218, + "step": 185760 + }, + { + "epoch": 
1.186831580695859, + "grad_norm": 0.9221693873405457, + "learning_rate": 3.557011717832938e-05, + "loss": 0.8005, + "step": 185770 + }, + { + "epoch": 1.1868954678455976, + "grad_norm": 1.035339117050171, + "learning_rate": 3.5565313066155616e-05, + "loss": 0.9736, + "step": 185780 + }, + { + "epoch": 1.1869593549953361, + "grad_norm": 1.0393927097320557, + "learning_rate": 3.5560509099351636e-05, + "loss": 0.7689, + "step": 185790 + }, + { + "epoch": 1.187023242145075, + "grad_norm": 0.6707600355148315, + "learning_rate": 3.5555705277965825e-05, + "loss": 0.6814, + "step": 185800 + }, + { + "epoch": 1.1870871292948135, + "grad_norm": 0.6006231904029846, + "learning_rate": 3.555090160204655e-05, + "loss": 1.014, + "step": 185810 + }, + { + "epoch": 1.1871510164445525, + "grad_norm": 1.551177978515625, + "learning_rate": 3.5546098071642205e-05, + "loss": 0.8504, + "step": 185820 + }, + { + "epoch": 1.187214903594291, + "grad_norm": 1.0770034790039062, + "learning_rate": 3.554129468680115e-05, + "loss": 0.9141, + "step": 185830 + }, + { + "epoch": 1.1872787907440299, + "grad_norm": 0.6229815483093262, + "learning_rate": 3.5536491447571765e-05, + "loss": 0.7852, + "step": 185840 + }, + { + "epoch": 1.1873426778937684, + "grad_norm": 0.5835037231445312, + "learning_rate": 3.553168835400243e-05, + "loss": 0.7914, + "step": 185850 + }, + { + "epoch": 1.187406565043507, + "grad_norm": 1.2109571695327759, + "learning_rate": 3.552688540614151e-05, + "loss": 0.8617, + "step": 185860 + }, + { + "epoch": 1.1874704521932458, + "grad_norm": 0.7813235521316528, + "learning_rate": 3.5522082604037376e-05, + "loss": 0.9206, + "step": 185870 + }, + { + "epoch": 1.1875343393429845, + "grad_norm": 1.2372418642044067, + "learning_rate": 3.551727994773839e-05, + "loss": 0.9416, + "step": 185880 + }, + { + "epoch": 1.1875982264927232, + "grad_norm": 1.2593668699264526, + "learning_rate": 3.551247743729293e-05, + "loss": 0.9913, + "step": 185890 + }, + { + "epoch": 1.1876621136424619, + 
"grad_norm": 1.3940966129302979, + "learning_rate": 3.550767507274935e-05, + "loss": 0.9156, + "step": 185900 + }, + { + "epoch": 1.1877260007922006, + "grad_norm": 1.2378836870193481, + "learning_rate": 3.550287285415602e-05, + "loss": 0.8502, + "step": 185910 + }, + { + "epoch": 1.1877898879419393, + "grad_norm": 1.2201237678527832, + "learning_rate": 3.549807078156131e-05, + "loss": 0.9283, + "step": 185920 + }, + { + "epoch": 1.187853775091678, + "grad_norm": 1.0473310947418213, + "learning_rate": 3.549326885501357e-05, + "loss": 0.9494, + "step": 185930 + }, + { + "epoch": 1.1879176622414167, + "grad_norm": 1.258222222328186, + "learning_rate": 3.548846707456116e-05, + "loss": 0.7309, + "step": 185940 + }, + { + "epoch": 1.1879815493911554, + "grad_norm": 0.8088693022727966, + "learning_rate": 3.548366544025245e-05, + "loss": 0.8427, + "step": 185950 + }, + { + "epoch": 1.1880454365408941, + "grad_norm": 0.7002352476119995, + "learning_rate": 3.547886395213577e-05, + "loss": 0.8715, + "step": 185960 + }, + { + "epoch": 1.1881093236906328, + "grad_norm": 1.0667836666107178, + "learning_rate": 3.5474062610259506e-05, + "loss": 0.7263, + "step": 185970 + }, + { + "epoch": 1.1881732108403715, + "grad_norm": 1.051770567893982, + "learning_rate": 3.5469261414671996e-05, + "loss": 0.9918, + "step": 185980 + }, + { + "epoch": 1.1882370979901102, + "grad_norm": 1.470700979232788, + "learning_rate": 3.5464460365421584e-05, + "loss": 0.8464, + "step": 185990 + }, + { + "epoch": 1.188300985139849, + "grad_norm": 0.7622199058532715, + "learning_rate": 3.5459659462556646e-05, + "loss": 0.7845, + "step": 186000 + }, + { + "epoch": 1.1883648722895876, + "grad_norm": 1.0996206998825073, + "learning_rate": 3.54548587061255e-05, + "loss": 0.9458, + "step": 186010 + }, + { + "epoch": 1.1884287594393264, + "grad_norm": 0.9898949265480042, + "learning_rate": 3.545005809617653e-05, + "loss": 0.8094, + "step": 186020 + }, + { + "epoch": 1.188492646589065, + "grad_norm": 
0.7668837308883667, + "learning_rate": 3.5445257632758054e-05, + "loss": 0.8887, + "step": 186030 + }, + { + "epoch": 1.1885565337388038, + "grad_norm": 1.3512365818023682, + "learning_rate": 3.544045731591843e-05, + "loss": 0.7852, + "step": 186040 + }, + { + "epoch": 1.1886204208885425, + "grad_norm": 0.7477654814720154, + "learning_rate": 3.543565714570599e-05, + "loss": 0.9058, + "step": 186050 + }, + { + "epoch": 1.1886843080382812, + "grad_norm": 1.4681755304336548, + "learning_rate": 3.543085712216909e-05, + "loss": 0.9091, + "step": 186060 + }, + { + "epoch": 1.1887481951880199, + "grad_norm": 1.0334805250167847, + "learning_rate": 3.5426057245356055e-05, + "loss": 0.8175, + "step": 186070 + }, + { + "epoch": 1.1888120823377586, + "grad_norm": 1.928889274597168, + "learning_rate": 3.542125751531523e-05, + "loss": 0.7668, + "step": 186080 + }, + { + "epoch": 1.1888759694874973, + "grad_norm": 1.266452431678772, + "learning_rate": 3.541645793209496e-05, + "loss": 0.9653, + "step": 186090 + }, + { + "epoch": 1.188939856637236, + "grad_norm": 0.9438464641571045, + "learning_rate": 3.541165849574357e-05, + "loss": 0.9455, + "step": 186100 + }, + { + "epoch": 1.1890037437869747, + "grad_norm": 0.8136021494865417, + "learning_rate": 3.5406859206309405e-05, + "loss": 0.7879, + "step": 186110 + }, + { + "epoch": 1.1890676309367134, + "grad_norm": 1.0168657302856445, + "learning_rate": 3.540206006384079e-05, + "loss": 0.9168, + "step": 186120 + }, + { + "epoch": 1.1891315180864521, + "grad_norm": 1.042479395866394, + "learning_rate": 3.539726106838606e-05, + "loss": 0.7625, + "step": 186130 + }, + { + "epoch": 1.1891954052361908, + "grad_norm": 0.821614682674408, + "learning_rate": 3.539246221999354e-05, + "loss": 0.829, + "step": 186140 + }, + { + "epoch": 1.1892592923859295, + "grad_norm": 0.6949642896652222, + "learning_rate": 3.5387663518711564e-05, + "loss": 0.8162, + "step": 186150 + }, + { + "epoch": 1.1893231795356682, + "grad_norm": 1.2734757661819458, + 
"learning_rate": 3.5382864964588447e-05, + "loss": 0.8209, + "step": 186160 + }, + { + "epoch": 1.189387066685407, + "grad_norm": 2.130889654159546, + "learning_rate": 3.537806655767254e-05, + "loss": 0.9647, + "step": 186170 + }, + { + "epoch": 1.1894509538351457, + "grad_norm": 1.1269334554672241, + "learning_rate": 3.537326829801215e-05, + "loss": 1.0093, + "step": 186180 + }, + { + "epoch": 1.1895148409848844, + "grad_norm": 0.8635377883911133, + "learning_rate": 3.5368470185655605e-05, + "loss": 0.9047, + "step": 186190 + }, + { + "epoch": 1.189578728134623, + "grad_norm": 1.0611902475357056, + "learning_rate": 3.536367222065122e-05, + "loss": 0.9056, + "step": 186200 + }, + { + "epoch": 1.1896426152843618, + "grad_norm": 2.2567262649536133, + "learning_rate": 3.535887440304732e-05, + "loss": 0.9235, + "step": 186210 + }, + { + "epoch": 1.1897065024341005, + "grad_norm": 0.820324718952179, + "learning_rate": 3.535407673289222e-05, + "loss": 0.8314, + "step": 186220 + }, + { + "epoch": 1.1897703895838392, + "grad_norm": 1.0150549411773682, + "learning_rate": 3.53497589558613e-05, + "loss": 0.9562, + "step": 186230 + }, + { + "epoch": 1.189834276733578, + "grad_norm": 1.3573201894760132, + "learning_rate": 3.534496156599203e-05, + "loss": 1.0057, + "step": 186240 + }, + { + "epoch": 1.1898981638833166, + "grad_norm": 1.0177377462387085, + "learning_rate": 3.534016432371168e-05, + "loss": 0.9452, + "step": 186250 + }, + { + "epoch": 1.1899620510330553, + "grad_norm": 1.215646505355835, + "learning_rate": 3.533536722906856e-05, + "loss": 0.8697, + "step": 186260 + }, + { + "epoch": 1.190025938182794, + "grad_norm": 0.8700612187385559, + "learning_rate": 3.533057028211097e-05, + "loss": 1.0512, + "step": 186270 + }, + { + "epoch": 1.1900898253325325, + "grad_norm": 1.3516863584518433, + "learning_rate": 3.532577348288723e-05, + "loss": 0.7342, + "step": 186280 + }, + { + "epoch": 1.1901537124822714, + "grad_norm": 0.7948817610740662, + "learning_rate": 
3.532097683144565e-05, + "loss": 0.9811, + "step": 186290 + }, + { + "epoch": 1.19021759963201, + "grad_norm": 1.0154379606246948, + "learning_rate": 3.5316180327834525e-05, + "loss": 0.7686, + "step": 186300 + }, + { + "epoch": 1.1902814867817488, + "grad_norm": 0.7849642634391785, + "learning_rate": 3.5311383972102175e-05, + "loss": 0.7886, + "step": 186310 + }, + { + "epoch": 1.1903453739314873, + "grad_norm": 0.8592081665992737, + "learning_rate": 3.530658776429689e-05, + "loss": 0.7697, + "step": 186320 + }, + { + "epoch": 1.1904092610812262, + "grad_norm": 0.3784794807434082, + "learning_rate": 3.530179170446699e-05, + "loss": 0.7808, + "step": 186330 + }, + { + "epoch": 1.1904731482309647, + "grad_norm": 1.2555115222930908, + "learning_rate": 3.529699579266076e-05, + "loss": 0.6747, + "step": 186340 + }, + { + "epoch": 1.1905370353807034, + "grad_norm": 0.9044711589813232, + "learning_rate": 3.5292200028926494e-05, + "loss": 0.8883, + "step": 186350 + }, + { + "epoch": 1.1906009225304421, + "grad_norm": 0.9068841934204102, + "learning_rate": 3.5287404413312506e-05, + "loss": 0.9085, + "step": 186360 + }, + { + "epoch": 1.1906648096801808, + "grad_norm": 1.7138638496398926, + "learning_rate": 3.528260894586708e-05, + "loss": 1.3497, + "step": 186370 + }, + { + "epoch": 1.1907286968299196, + "grad_norm": 1.0467746257781982, + "learning_rate": 3.527781362663851e-05, + "loss": 1.687, + "step": 186380 + }, + { + "epoch": 1.1907925839796583, + "grad_norm": 0.8251565098762512, + "learning_rate": 3.52730184556751e-05, + "loss": 0.7175, + "step": 186390 + }, + { + "epoch": 1.190856471129397, + "grad_norm": 0.8762297630310059, + "learning_rate": 3.5268223433025135e-05, + "loss": 0.9619, + "step": 186400 + }, + { + "epoch": 1.1909203582791357, + "grad_norm": 0.6899451613426208, + "learning_rate": 3.5263428558736896e-05, + "loss": 0.7858, + "step": 186410 + }, + { + "epoch": 1.1909842454288744, + "grad_norm": 1.063529133796692, + "learning_rate": 3.5258633832858696e-05, 
+ "loss": 0.9464, + "step": 186420 + }, + { + "epoch": 1.191048132578613, + "grad_norm": 0.955309271812439, + "learning_rate": 3.525383925543879e-05, + "loss": 0.7051, + "step": 186430 + }, + { + "epoch": 1.1911120197283518, + "grad_norm": 0.8504273891448975, + "learning_rate": 3.524904482652549e-05, + "loss": 0.7116, + "step": 186440 + }, + { + "epoch": 1.1911759068780905, + "grad_norm": 1.1198617219924927, + "learning_rate": 3.524425054616707e-05, + "loss": 0.8511, + "step": 186450 + }, + { + "epoch": 1.1912397940278292, + "grad_norm": 0.7477229833602905, + "learning_rate": 3.523945641441181e-05, + "loss": 0.5929, + "step": 186460 + }, + { + "epoch": 1.191303681177568, + "grad_norm": 0.9625139832496643, + "learning_rate": 3.5234662431308e-05, + "loss": 0.8808, + "step": 186470 + }, + { + "epoch": 1.1913675683273066, + "grad_norm": 0.9500938653945923, + "learning_rate": 3.522986859690389e-05, + "loss": 0.7244, + "step": 186480 + }, + { + "epoch": 1.1914314554770453, + "grad_norm": 1.6534185409545898, + "learning_rate": 3.52250749112478e-05, + "loss": 0.9015, + "step": 186490 + }, + { + "epoch": 1.191495342626784, + "grad_norm": 1.031301736831665, + "learning_rate": 3.522028137438799e-05, + "loss": 0.5996, + "step": 186500 + }, + { + "epoch": 1.1915592297765227, + "grad_norm": 0.9952258467674255, + "learning_rate": 3.521548798637272e-05, + "loss": 1.109, + "step": 186510 + }, + { + "epoch": 1.1916231169262614, + "grad_norm": 1.1881864070892334, + "learning_rate": 3.5210694747250286e-05, + "loss": 0.968, + "step": 186520 + }, + { + "epoch": 1.1916870040760001, + "grad_norm": 1.4568711519241333, + "learning_rate": 3.5205901657068953e-05, + "loss": 0.8803, + "step": 186530 + }, + { + "epoch": 1.1917508912257389, + "grad_norm": 0.7728120684623718, + "learning_rate": 3.520110871587698e-05, + "loss": 0.9651, + "step": 186540 + }, + { + "epoch": 1.1918147783754776, + "grad_norm": 0.7609994411468506, + "learning_rate": 3.5196315923722655e-05, + "loss": 1.0502, + "step": 
186550 + }, + { + "epoch": 1.1918786655252163, + "grad_norm": 1.627550721168518, + "learning_rate": 3.519152328065423e-05, + "loss": 0.9135, + "step": 186560 + }, + { + "epoch": 1.191942552674955, + "grad_norm": 0.8452563881874084, + "learning_rate": 3.518673078671998e-05, + "loss": 0.944, + "step": 186570 + }, + { + "epoch": 1.1920064398246937, + "grad_norm": 0.8744309544563293, + "learning_rate": 3.518193844196816e-05, + "loss": 0.7588, + "step": 186580 + }, + { + "epoch": 1.1920703269744324, + "grad_norm": 0.9594758152961731, + "learning_rate": 3.5177146246447046e-05, + "loss": 0.8855, + "step": 186590 + }, + { + "epoch": 1.192134214124171, + "grad_norm": 0.9259825348854065, + "learning_rate": 3.517235420020489e-05, + "loss": 0.911, + "step": 186600 + }, + { + "epoch": 1.1921981012739098, + "grad_norm": 2.5260446071624756, + "learning_rate": 3.516756230328995e-05, + "loss": 0.7676, + "step": 186610 + }, + { + "epoch": 1.1922619884236485, + "grad_norm": 0.7416990399360657, + "learning_rate": 3.51627705557505e-05, + "loss": 0.8542, + "step": 186620 + }, + { + "epoch": 1.1923258755733872, + "grad_norm": 0.6459031105041504, + "learning_rate": 3.515797895763478e-05, + "loss": 0.8635, + "step": 186630 + }, + { + "epoch": 1.192389762723126, + "grad_norm": 1.0951290130615234, + "learning_rate": 3.5153187508991055e-05, + "loss": 0.9024, + "step": 186640 + }, + { + "epoch": 1.1924536498728646, + "grad_norm": 0.676724374294281, + "learning_rate": 3.514839620986757e-05, + "loss": 0.8166, + "step": 186650 + }, + { + "epoch": 1.1925175370226033, + "grad_norm": 1.0528342723846436, + "learning_rate": 3.514360506031259e-05, + "loss": 0.6472, + "step": 186660 + }, + { + "epoch": 1.192581424172342, + "grad_norm": 0.7471928596496582, + "learning_rate": 3.513881406037437e-05, + "loss": 0.6945, + "step": 186670 + }, + { + "epoch": 1.1926453113220807, + "grad_norm": 1.1996952295303345, + "learning_rate": 3.513402321010114e-05, + "loss": 1.0772, + "step": 186680 + }, + { + "epoch": 
1.1927091984718194, + "grad_norm": 0.7529562711715698, + "learning_rate": 3.512923250954115e-05, + "loss": 1.1773, + "step": 186690 + }, + { + "epoch": 1.1927730856215581, + "grad_norm": 0.6808781623840332, + "learning_rate": 3.512444195874266e-05, + "loss": 0.895, + "step": 186700 + }, + { + "epoch": 1.1928369727712969, + "grad_norm": 0.4089086651802063, + "learning_rate": 3.511965155775391e-05, + "loss": 0.8306, + "step": 186710 + }, + { + "epoch": 1.1929008599210356, + "grad_norm": 0.682090163230896, + "learning_rate": 3.511486130662313e-05, + "loss": 0.5964, + "step": 186720 + }, + { + "epoch": 1.1929647470707743, + "grad_norm": 1.0363860130310059, + "learning_rate": 3.5110071205398587e-05, + "loss": 0.976, + "step": 186730 + }, + { + "epoch": 1.193028634220513, + "grad_norm": 0.8405871391296387, + "learning_rate": 3.5105281254128504e-05, + "loss": 0.9637, + "step": 186740 + }, + { + "epoch": 1.1930925213702517, + "grad_norm": 0.6451852917671204, + "learning_rate": 3.510049145286112e-05, + "loss": 0.7558, + "step": 186750 + }, + { + "epoch": 1.1931564085199904, + "grad_norm": 0.9564416408538818, + "learning_rate": 3.5095701801644686e-05, + "loss": 0.6832, + "step": 186760 + }, + { + "epoch": 1.1932202956697289, + "grad_norm": 0.837658703327179, + "learning_rate": 3.5090912300527424e-05, + "loss": 0.8651, + "step": 186770 + }, + { + "epoch": 1.1932841828194678, + "grad_norm": 1.0022040605545044, + "learning_rate": 3.5086122949557574e-05, + "loss": 0.7418, + "step": 186780 + }, + { + "epoch": 1.1933480699692063, + "grad_norm": 0.6673353910446167, + "learning_rate": 3.508133374878337e-05, + "loss": 0.9588, + "step": 186790 + }, + { + "epoch": 1.1934119571189452, + "grad_norm": 1.4611053466796875, + "learning_rate": 3.507654469825303e-05, + "loss": 1.0155, + "step": 186800 + }, + { + "epoch": 1.1934758442686837, + "grad_norm": 1.2848858833312988, + "learning_rate": 3.507175579801481e-05, + "loss": 0.7561, + "step": 186810 + }, + { + "epoch": 1.1935397314184226, + 
"grad_norm": 0.8244970440864563, + "learning_rate": 3.506696704811691e-05, + "loss": 0.9435, + "step": 186820 + }, + { + "epoch": 1.193603618568161, + "grad_norm": 1.652975082397461, + "learning_rate": 3.5062178448607586e-05, + "loss": 0.8877, + "step": 186830 + }, + { + "epoch": 1.1936675057178998, + "grad_norm": 0.8719720840454102, + "learning_rate": 3.505738999953504e-05, + "loss": 0.7601, + "step": 186840 + }, + { + "epoch": 1.1937313928676385, + "grad_norm": 0.8868855237960815, + "learning_rate": 3.5052601700947506e-05, + "loss": 0.6681, + "step": 186850 + }, + { + "epoch": 1.1937952800173772, + "grad_norm": 0.731994092464447, + "learning_rate": 3.50478135528932e-05, + "loss": 0.6869, + "step": 186860 + }, + { + "epoch": 1.193859167167116, + "grad_norm": 1.1442980766296387, + "learning_rate": 3.504302555542035e-05, + "loss": 0.9518, + "step": 186870 + }, + { + "epoch": 1.1939230543168546, + "grad_norm": 0.6606703400611877, + "learning_rate": 3.503823770857718e-05, + "loss": 0.9076, + "step": 186880 + }, + { + "epoch": 1.1939869414665933, + "grad_norm": 0.9944251775741577, + "learning_rate": 3.503345001241189e-05, + "loss": 0.6579, + "step": 186890 + }, + { + "epoch": 1.194050828616332, + "grad_norm": 0.8377325534820557, + "learning_rate": 3.5028662466972715e-05, + "loss": 0.832, + "step": 186900 + }, + { + "epoch": 1.1941147157660708, + "grad_norm": 1.2907224893569946, + "learning_rate": 3.5023875072307855e-05, + "loss": 0.6895, + "step": 186910 + }, + { + "epoch": 1.1941786029158095, + "grad_norm": 2.1751160621643066, + "learning_rate": 3.501908782846553e-05, + "loss": 0.8754, + "step": 186920 + }, + { + "epoch": 1.1942424900655482, + "grad_norm": 1.0982850790023804, + "learning_rate": 3.501430073549394e-05, + "loss": 0.849, + "step": 186930 + }, + { + "epoch": 1.1943063772152869, + "grad_norm": 2.043400526046753, + "learning_rate": 3.500951379344132e-05, + "loss": 0.8272, + "step": 186940 + }, + { + "epoch": 1.1943702643650256, + "grad_norm": 
1.3341394662857056, + "learning_rate": 3.5004727002355864e-05, + "loss": 0.901, + "step": 186950 + }, + { + "epoch": 1.1944341515147643, + "grad_norm": 1.1250461339950562, + "learning_rate": 3.4999940362285776e-05, + "loss": 1.1599, + "step": 186960 + }, + { + "epoch": 1.194498038664503, + "grad_norm": 1.1271651983261108, + "learning_rate": 3.499515387327927e-05, + "loss": 0.8843, + "step": 186970 + }, + { + "epoch": 1.1945619258142417, + "grad_norm": 0.9995719194412231, + "learning_rate": 3.499036753538454e-05, + "loss": 0.8182, + "step": 186980 + }, + { + "epoch": 1.1946258129639804, + "grad_norm": 0.8369683027267456, + "learning_rate": 3.49855813486498e-05, + "loss": 0.8878, + "step": 186990 + }, + { + "epoch": 1.194689700113719, + "grad_norm": 1.3011962175369263, + "learning_rate": 3.498079531312324e-05, + "loss": 0.8289, + "step": 187000 + }, + { + "epoch": 1.1947535872634578, + "grad_norm": 2.0897181034088135, + "learning_rate": 3.4976009428853055e-05, + "loss": 0.7438, + "step": 187010 + }, + { + "epoch": 1.1948174744131965, + "grad_norm": 0.9380792379379272, + "learning_rate": 3.4971223695887464e-05, + "loss": 0.9273, + "step": 187020 + }, + { + "epoch": 1.1948813615629352, + "grad_norm": 0.6861240863800049, + "learning_rate": 3.496643811427466e-05, + "loss": 0.6861, + "step": 187030 + }, + { + "epoch": 1.194945248712674, + "grad_norm": 1.5726001262664795, + "learning_rate": 3.4961652684062815e-05, + "loss": 0.8486, + "step": 187040 + }, + { + "epoch": 1.1950091358624126, + "grad_norm": 0.8573664426803589, + "learning_rate": 3.495686740530015e-05, + "loss": 0.8446, + "step": 187050 + }, + { + "epoch": 1.1950730230121513, + "grad_norm": 0.8347463607788086, + "learning_rate": 3.4952082278034836e-05, + "loss": 0.9351, + "step": 187060 + }, + { + "epoch": 1.19513691016189, + "grad_norm": 0.9354316592216492, + "learning_rate": 3.494729730231507e-05, + "loss": 0.8548, + "step": 187070 + }, + { + "epoch": 1.1952007973116288, + "grad_norm": 0.9122055172920227, + 
"learning_rate": 3.494251247818905e-05, + "loss": 0.8454, + "step": 187080 + }, + { + "epoch": 1.1952646844613675, + "grad_norm": 1.073345422744751, + "learning_rate": 3.493772780570496e-05, + "loss": 0.6588, + "step": 187090 + }, + { + "epoch": 1.1953285716111062, + "grad_norm": 0.8936179876327515, + "learning_rate": 3.4932943284910967e-05, + "loss": 0.878, + "step": 187100 + }, + { + "epoch": 1.1953924587608449, + "grad_norm": 1.066080927848816, + "learning_rate": 3.492815891585528e-05, + "loss": 0.8864, + "step": 187110 + }, + { + "epoch": 1.1954563459105836, + "grad_norm": 1.137028455734253, + "learning_rate": 3.4923374698586074e-05, + "loss": 0.8987, + "step": 187120 + }, + { + "epoch": 1.1955202330603223, + "grad_norm": 1.0188874006271362, + "learning_rate": 3.491859063315152e-05, + "loss": 0.7188, + "step": 187130 + }, + { + "epoch": 1.195584120210061, + "grad_norm": 0.8354491591453552, + "learning_rate": 3.491380671959981e-05, + "loss": 1.045, + "step": 187140 + }, + { + "epoch": 1.1956480073597997, + "grad_norm": 1.3807976245880127, + "learning_rate": 3.490902295797912e-05, + "loss": 0.8854, + "step": 187150 + }, + { + "epoch": 1.1957118945095384, + "grad_norm": 1.2778666019439697, + "learning_rate": 3.490423934833763e-05, + "loss": 0.8915, + "step": 187160 + }, + { + "epoch": 1.1957757816592771, + "grad_norm": 0.9576749205589294, + "learning_rate": 3.48994558907235e-05, + "loss": 0.8141, + "step": 187170 + }, + { + "epoch": 1.1958396688090158, + "grad_norm": 1.2838776111602783, + "learning_rate": 3.4894672585184916e-05, + "loss": 0.8692, + "step": 187180 + }, + { + "epoch": 1.1959035559587545, + "grad_norm": 1.1865975856781006, + "learning_rate": 3.488988943177005e-05, + "loss": 0.8883, + "step": 187190 + }, + { + "epoch": 1.1959674431084932, + "grad_norm": 0.44902050495147705, + "learning_rate": 3.488510643052706e-05, + "loss": 0.5436, + "step": 187200 + }, + { + "epoch": 1.196031330258232, + "grad_norm": 1.1116329431533813, + "learning_rate": 
3.4880323581504126e-05, + "loss": 0.8478, + "step": 187210 + }, + { + "epoch": 1.1960952174079706, + "grad_norm": 1.1291851997375488, + "learning_rate": 3.487554088474942e-05, + "loss": 0.8885, + "step": 187220 + }, + { + "epoch": 1.1961591045577094, + "grad_norm": 0.8840822577476501, + "learning_rate": 3.48707583403111e-05, + "loss": 1.0356, + "step": 187230 + }, + { + "epoch": 1.196222991707448, + "grad_norm": 0.7359145283699036, + "learning_rate": 3.486597594823733e-05, + "loss": 0.7619, + "step": 187240 + }, + { + "epoch": 1.1962868788571868, + "grad_norm": 0.3909396529197693, + "learning_rate": 3.4861193708576276e-05, + "loss": 0.6675, + "step": 187250 + }, + { + "epoch": 1.1963507660069252, + "grad_norm": 0.7241166234016418, + "learning_rate": 3.4856411621376096e-05, + "loss": 1.0381, + "step": 187260 + }, + { + "epoch": 1.1964146531566642, + "grad_norm": 0.7049499154090881, + "learning_rate": 3.485162968668496e-05, + "loss": 0.8549, + "step": 187270 + }, + { + "epoch": 1.1964785403064027, + "grad_norm": 0.6992123126983643, + "learning_rate": 3.4846847904551006e-05, + "loss": 0.8191, + "step": 187280 + }, + { + "epoch": 1.1965424274561416, + "grad_norm": 0.6694308519363403, + "learning_rate": 3.484206627502241e-05, + "loss": 0.9966, + "step": 187290 + }, + { + "epoch": 1.19660631460588, + "grad_norm": 2.5797371864318848, + "learning_rate": 3.483728479814732e-05, + "loss": 0.8075, + "step": 187300 + }, + { + "epoch": 1.1966702017556188, + "grad_norm": 1.4046313762664795, + "learning_rate": 3.4832503473973885e-05, + "loss": 0.8559, + "step": 187310 + }, + { + "epoch": 1.1967340889053575, + "grad_norm": 0.6860223412513733, + "learning_rate": 3.482772230255027e-05, + "loss": 1.0585, + "step": 187320 + }, + { + "epoch": 1.1967979760550962, + "grad_norm": 1.6226650476455688, + "learning_rate": 3.4822941283924604e-05, + "loss": 0.9188, + "step": 187330 + }, + { + "epoch": 1.196861863204835, + "grad_norm": 0.9351048469543457, + "learning_rate": 3.481816041814506e-05, 
+ "loss": 1.1464, + "step": 187340 + }, + { + "epoch": 1.1969257503545736, + "grad_norm": 0.7233309745788574, + "learning_rate": 3.4813379705259775e-05, + "loss": 0.7825, + "step": 187350 + }, + { + "epoch": 1.1969896375043123, + "grad_norm": 0.7308865189552307, + "learning_rate": 3.480859914531689e-05, + "loss": 1.0493, + "step": 187360 + }, + { + "epoch": 1.197053524654051, + "grad_norm": 0.6127648949623108, + "learning_rate": 3.480381873836456e-05, + "loss": 0.6914, + "step": 187370 + }, + { + "epoch": 1.1971174118037897, + "grad_norm": 0.5740570425987244, + "learning_rate": 3.479903848445092e-05, + "loss": 0.828, + "step": 187380 + }, + { + "epoch": 1.1971812989535284, + "grad_norm": 0.9117375016212463, + "learning_rate": 3.4794258383624114e-05, + "loss": 0.9847, + "step": 187390 + }, + { + "epoch": 1.1972451861032671, + "grad_norm": 1.000638484954834, + "learning_rate": 3.478947843593228e-05, + "loss": 0.7421, + "step": 187400 + }, + { + "epoch": 1.1973090732530058, + "grad_norm": 1.0521003007888794, + "learning_rate": 3.4784698641423553e-05, + "loss": 0.8257, + "step": 187410 + }, + { + "epoch": 1.1973729604027445, + "grad_norm": 0.79627525806427, + "learning_rate": 3.477991900014608e-05, + "loss": 0.8698, + "step": 187420 + }, + { + "epoch": 1.1974368475524833, + "grad_norm": 1.0833988189697266, + "learning_rate": 3.477513951214798e-05, + "loss": 0.8366, + "step": 187430 + }, + { + "epoch": 1.197500734702222, + "grad_norm": 0.6187765002250671, + "learning_rate": 3.47703601774774e-05, + "loss": 0.7401, + "step": 187440 + }, + { + "epoch": 1.1975646218519607, + "grad_norm": 2.6944620609283447, + "learning_rate": 3.4765580996182476e-05, + "loss": 0.826, + "step": 187450 + }, + { + "epoch": 1.1976285090016994, + "grad_norm": 1.1974648237228394, + "learning_rate": 3.476080196831133e-05, + "loss": 1.0353, + "step": 187460 + }, + { + "epoch": 1.197692396151438, + "grad_norm": 1.277348518371582, + "learning_rate": 3.475602309391208e-05, + "loss": 0.9897, + "step": 
187470 + }, + { + "epoch": 1.1977562833011768, + "grad_norm": 0.7508608102798462, + "learning_rate": 3.475124437303288e-05, + "loss": 0.6981, + "step": 187480 + }, + { + "epoch": 1.1978201704509155, + "grad_norm": 0.5950019359588623, + "learning_rate": 3.474646580572184e-05, + "loss": 0.8133, + "step": 187490 + }, + { + "epoch": 1.1978840576006542, + "grad_norm": 0.7577312588691711, + "learning_rate": 3.474168739202708e-05, + "loss": 1.0592, + "step": 187500 + }, + { + "epoch": 1.197947944750393, + "grad_norm": 0.863229513168335, + "learning_rate": 3.473690913199672e-05, + "loss": 0.5926, + "step": 187510 + }, + { + "epoch": 1.1980118319001316, + "grad_norm": 1.0712931156158447, + "learning_rate": 3.4732131025678905e-05, + "loss": 0.9704, + "step": 187520 + }, + { + "epoch": 1.1980757190498703, + "grad_norm": 0.918846845626831, + "learning_rate": 3.472735307312173e-05, + "loss": 0.8673, + "step": 187530 + }, + { + "epoch": 1.198139606199609, + "grad_norm": 1.0764986276626587, + "learning_rate": 3.4722575274373315e-05, + "loss": 1.0774, + "step": 187540 + }, + { + "epoch": 1.1982034933493477, + "grad_norm": 1.566809058189392, + "learning_rate": 3.47177976294818e-05, + "loss": 0.7657, + "step": 187550 + }, + { + "epoch": 1.1982673804990864, + "grad_norm": 1.0461945533752441, + "learning_rate": 3.471302013849527e-05, + "loss": 0.8546, + "step": 187560 + }, + { + "epoch": 1.1983312676488251, + "grad_norm": 1.0666048526763916, + "learning_rate": 3.4708242801461866e-05, + "loss": 0.7968, + "step": 187570 + }, + { + "epoch": 1.1983951547985638, + "grad_norm": 1.0975289344787598, + "learning_rate": 3.470346561842967e-05, + "loss": 0.7919, + "step": 187580 + }, + { + "epoch": 1.1984590419483026, + "grad_norm": 1.0460991859436035, + "learning_rate": 3.469868858944683e-05, + "loss": 0.7248, + "step": 187590 + }, + { + "epoch": 1.1985229290980413, + "grad_norm": 0.9325404167175293, + "learning_rate": 3.469391171456142e-05, + "loss": 0.8127, + "step": 187600 + }, + { + "epoch": 
1.19858681624778, + "grad_norm": 1.0985088348388672, + "learning_rate": 3.468913499382156e-05, + "loss": 0.7552, + "step": 187610 + }, + { + "epoch": 1.1986507033975187, + "grad_norm": 1.1224279403686523, + "learning_rate": 3.468435842727536e-05, + "loss": 1.0798, + "step": 187620 + }, + { + "epoch": 1.1987145905472574, + "grad_norm": 0.7109350562095642, + "learning_rate": 3.4679582014970924e-05, + "loss": 0.8925, + "step": 187630 + }, + { + "epoch": 1.198778477696996, + "grad_norm": 1.715866208076477, + "learning_rate": 3.467480575695635e-05, + "loss": 0.8057, + "step": 187640 + }, + { + "epoch": 1.1988423648467348, + "grad_norm": 0.8582981824874878, + "learning_rate": 3.467002965327974e-05, + "loss": 0.8066, + "step": 187650 + }, + { + "epoch": 1.1989062519964735, + "grad_norm": 1.1021723747253418, + "learning_rate": 3.46652537039892e-05, + "loss": 0.971, + "step": 187660 + }, + { + "epoch": 1.1989701391462122, + "grad_norm": 1.0566543340682983, + "learning_rate": 3.4660477909132804e-05, + "loss": 0.87, + "step": 187670 + }, + { + "epoch": 1.199034026295951, + "grad_norm": 1.064985752105713, + "learning_rate": 3.4655702268758675e-05, + "loss": 1.1492, + "step": 187680 + }, + { + "epoch": 1.1990979134456896, + "grad_norm": 0.6998661756515503, + "learning_rate": 3.4650926782914906e-05, + "loss": 0.6774, + "step": 187690 + }, + { + "epoch": 1.1991618005954283, + "grad_norm": 0.8879082798957825, + "learning_rate": 3.4646151451649575e-05, + "loss": 1.0302, + "step": 187700 + }, + { + "epoch": 1.199225687745167, + "grad_norm": 0.7385240197181702, + "learning_rate": 3.4641376275010786e-05, + "loss": 0.6694, + "step": 187710 + }, + { + "epoch": 1.1992895748949057, + "grad_norm": 0.822397768497467, + "learning_rate": 3.4636601253046616e-05, + "loss": 0.7778, + "step": 187720 + }, + { + "epoch": 1.1993534620446444, + "grad_norm": 1.2824455499649048, + "learning_rate": 3.4631826385805165e-05, + "loss": 1.0233, + "step": 187730 + }, + { + "epoch": 1.1994173491943831, + 
"grad_norm": 1.1572147607803345, + "learning_rate": 3.462705167333452e-05, + "loss": 0.8526, + "step": 187740 + }, + { + "epoch": 1.1994812363441216, + "grad_norm": 0.6877502799034119, + "learning_rate": 3.4622277115682765e-05, + "loss": 0.9332, + "step": 187750 + }, + { + "epoch": 1.1995451234938606, + "grad_norm": 1.0212002992630005, + "learning_rate": 3.4617502712897986e-05, + "loss": 1.216, + "step": 187760 + }, + { + "epoch": 1.199609010643599, + "grad_norm": 0.8066684603691101, + "learning_rate": 3.461272846502826e-05, + "loss": 0.897, + "step": 187770 + }, + { + "epoch": 1.199672897793338, + "grad_norm": 1.236677885055542, + "learning_rate": 3.460795437212167e-05, + "loss": 0.9163, + "step": 187780 + }, + { + "epoch": 1.1997367849430765, + "grad_norm": 1.0097334384918213, + "learning_rate": 3.46031804342263e-05, + "loss": 0.7069, + "step": 187790 + }, + { + "epoch": 1.1998006720928152, + "grad_norm": 0.9955973625183105, + "learning_rate": 3.459840665139021e-05, + "loss": 0.8812, + "step": 187800 + }, + { + "epoch": 1.1998645592425539, + "grad_norm": 1.2567247152328491, + "learning_rate": 3.4593633023661503e-05, + "loss": 0.9321, + "step": 187810 + }, + { + "epoch": 1.1999284463922926, + "grad_norm": 1.278200387954712, + "learning_rate": 3.458885955108824e-05, + "loss": 1.1415, + "step": 187820 + }, + { + "epoch": 1.1999923335420313, + "grad_norm": 0.7412505745887756, + "learning_rate": 3.458408623371848e-05, + "loss": 0.805, + "step": 187830 + }, + { + "epoch": 1.20005622069177, + "grad_norm": 0.7039709687232971, + "learning_rate": 3.457931307160032e-05, + "loss": 1.0012, + "step": 187840 + }, + { + "epoch": 1.2001201078415087, + "grad_norm": 0.9330798387527466, + "learning_rate": 3.4574540064781814e-05, + "loss": 0.8153, + "step": 187850 + }, + { + "epoch": 1.2001839949912474, + "grad_norm": 0.8370354771614075, + "learning_rate": 3.456976721331102e-05, + "loss": 0.633, + "step": 187860 + }, + { + "epoch": 1.200247882140986, + "grad_norm": 
1.2622960805892944, + "learning_rate": 3.4564994517236036e-05, + "loss": 0.8454, + "step": 187870 + }, + { + "epoch": 1.2003117692907248, + "grad_norm": 0.9235817790031433, + "learning_rate": 3.456022197660491e-05, + "loss": 1.0776, + "step": 187880 + }, + { + "epoch": 1.2003756564404635, + "grad_norm": 0.7652870416641235, + "learning_rate": 3.4555449591465704e-05, + "loss": 0.9432, + "step": 187890 + }, + { + "epoch": 1.2004395435902022, + "grad_norm": 0.9901490211486816, + "learning_rate": 3.455067736186649e-05, + "loss": 0.8574, + "step": 187900 + }, + { + "epoch": 1.200503430739941, + "grad_norm": 1.7025309801101685, + "learning_rate": 3.454590528785531e-05, + "loss": 0.7714, + "step": 187910 + }, + { + "epoch": 1.2005673178896796, + "grad_norm": 0.802376389503479, + "learning_rate": 3.454113336948024e-05, + "loss": 1.3025, + "step": 187920 + }, + { + "epoch": 1.2006312050394183, + "grad_norm": 1.1113643646240234, + "learning_rate": 3.453636160678933e-05, + "loss": 0.9288, + "step": 187930 + }, + { + "epoch": 1.200695092189157, + "grad_norm": 0.5715786814689636, + "learning_rate": 3.4531589999830626e-05, + "loss": 0.8644, + "step": 187940 + }, + { + "epoch": 1.2007589793388957, + "grad_norm": 1.303452968597412, + "learning_rate": 3.452681854865221e-05, + "loss": 0.7462, + "step": 187950 + }, + { + "epoch": 1.2008228664886345, + "grad_norm": 0.7303462028503418, + "learning_rate": 3.45220472533021e-05, + "loss": 0.6541, + "step": 187960 + }, + { + "epoch": 1.2008867536383732, + "grad_norm": 1.4614161252975464, + "learning_rate": 3.451727611382838e-05, + "loss": 0.8383, + "step": 187970 + }, + { + "epoch": 1.2009506407881119, + "grad_norm": 0.9907222390174866, + "learning_rate": 3.4512505130279074e-05, + "loss": 0.6691, + "step": 187980 + }, + { + "epoch": 1.2010145279378506, + "grad_norm": 0.7968649864196777, + "learning_rate": 3.450773430270224e-05, + "loss": 0.7508, + "step": 187990 + }, + { + "epoch": 1.2010784150875893, + "grad_norm": 1.011256217956543, + 
"learning_rate": 3.450296363114593e-05, + "loss": 0.8145, + "step": 188000 + }, + { + "epoch": 1.201142302237328, + "grad_norm": 2.9437148571014404, + "learning_rate": 3.4498193115658184e-05, + "loss": 0.838, + "step": 188010 + }, + { + "epoch": 1.2012061893870667, + "grad_norm": 1.3569848537445068, + "learning_rate": 3.449342275628704e-05, + "loss": 1.2062, + "step": 188020 + }, + { + "epoch": 1.2012700765368054, + "grad_norm": 0.9692744612693787, + "learning_rate": 3.448865255308054e-05, + "loss": 0.9286, + "step": 188030 + }, + { + "epoch": 1.201333963686544, + "grad_norm": 1.1860573291778564, + "learning_rate": 3.448388250608674e-05, + "loss": 0.8968, + "step": 188040 + }, + { + "epoch": 1.2013978508362828, + "grad_norm": 1.0160366296768188, + "learning_rate": 3.4479112615353654e-05, + "loss": 0.8594, + "step": 188050 + }, + { + "epoch": 1.2014617379860215, + "grad_norm": 0.9599398970603943, + "learning_rate": 3.447434288092932e-05, + "loss": 0.8734, + "step": 188060 + }, + { + "epoch": 1.2015256251357602, + "grad_norm": 0.8406432271003723, + "learning_rate": 3.4469573302861806e-05, + "loss": 0.8191, + "step": 188070 + }, + { + "epoch": 1.201589512285499, + "grad_norm": 1.5487374067306519, + "learning_rate": 3.446480388119912e-05, + "loss": 0.9018, + "step": 188080 + }, + { + "epoch": 1.2016533994352376, + "grad_norm": 1.019119381904602, + "learning_rate": 3.44600346159893e-05, + "loss": 0.9307, + "step": 188090 + }, + { + "epoch": 1.2017172865849763, + "grad_norm": 0.5902031064033508, + "learning_rate": 3.4455265507280374e-05, + "loss": 0.8977, + "step": 188100 + }, + { + "epoch": 1.201781173734715, + "grad_norm": 0.7112128734588623, + "learning_rate": 3.445049655512037e-05, + "loss": 0.8325, + "step": 188110 + }, + { + "epoch": 1.2018450608844538, + "grad_norm": 0.95228111743927, + "learning_rate": 3.444572775955732e-05, + "loss": 0.8221, + "step": 188120 + }, + { + "epoch": 1.2019089480341925, + "grad_norm": 0.8991641402244568, + "learning_rate": 
3.444095912063927e-05, + "loss": 1.1107, + "step": 188130 + }, + { + "epoch": 1.2019728351839312, + "grad_norm": 0.8079290390014648, + "learning_rate": 3.44361906384142e-05, + "loss": 1.0447, + "step": 188140 + }, + { + "epoch": 1.2020367223336699, + "grad_norm": 2.5006215572357178, + "learning_rate": 3.443142231293016e-05, + "loss": 1.1094, + "step": 188150 + }, + { + "epoch": 1.2021006094834086, + "grad_norm": 1.0621665716171265, + "learning_rate": 3.4426654144235157e-05, + "loss": 0.7572, + "step": 188160 + }, + { + "epoch": 1.2021644966331473, + "grad_norm": 1.2878823280334473, + "learning_rate": 3.442188613237723e-05, + "loss": 0.6545, + "step": 188170 + }, + { + "epoch": 1.202228383782886, + "grad_norm": 1.7365878820419312, + "learning_rate": 3.441711827740437e-05, + "loss": 1.0443, + "step": 188180 + }, + { + "epoch": 1.2022922709326247, + "grad_norm": 1.051690697669983, + "learning_rate": 3.441235057936462e-05, + "loss": 0.9973, + "step": 188190 + }, + { + "epoch": 1.2023561580823634, + "grad_norm": 0.9624873995780945, + "learning_rate": 3.440758303830599e-05, + "loss": 0.8223, + "step": 188200 + }, + { + "epoch": 1.202420045232102, + "grad_norm": 1.7145956754684448, + "learning_rate": 3.4402815654276475e-05, + "loss": 0.8796, + "step": 188210 + }, + { + "epoch": 1.2024839323818406, + "grad_norm": 1.1179742813110352, + "learning_rate": 3.439804842732411e-05, + "loss": 0.7697, + "step": 188220 + }, + { + "epoch": 1.2025478195315795, + "grad_norm": 0.8818740844726562, + "learning_rate": 3.43932813574969e-05, + "loss": 0.8465, + "step": 188230 + }, + { + "epoch": 1.202611706681318, + "grad_norm": 0.8461005687713623, + "learning_rate": 3.4388514444842835e-05, + "loss": 0.8352, + "step": 188240 + }, + { + "epoch": 1.202675593831057, + "grad_norm": 1.1533595323562622, + "learning_rate": 3.4383747689409944e-05, + "loss": 0.7536, + "step": 188250 + }, + { + "epoch": 1.2027394809807954, + "grad_norm": 2.3494560718536377, + "learning_rate": 3.437898109124622e-05, + 
"loss": 0.843, + "step": 188260 + }, + { + "epoch": 1.2028033681305343, + "grad_norm": 1.2678680419921875, + "learning_rate": 3.4374214650399675e-05, + "loss": 0.7982, + "step": 188270 + }, + { + "epoch": 1.2028672552802728, + "grad_norm": 0.7134614586830139, + "learning_rate": 3.43694483669183e-05, + "loss": 1.0344, + "step": 188280 + }, + { + "epoch": 1.2029311424300115, + "grad_norm": 2.609257698059082, + "learning_rate": 3.436468224085011e-05, + "loss": 0.8643, + "step": 188290 + }, + { + "epoch": 1.2029950295797502, + "grad_norm": 0.8375333547592163, + "learning_rate": 3.43599162722431e-05, + "loss": 0.7731, + "step": 188300 + }, + { + "epoch": 1.203058916729489, + "grad_norm": 0.7802673578262329, + "learning_rate": 3.435562703516575e-05, + "loss": 0.7635, + "step": 188310 + }, + { + "epoch": 1.2031228038792277, + "grad_norm": 0.7882365584373474, + "learning_rate": 3.4350861365867205e-05, + "loss": 0.7978, + "step": 188320 + }, + { + "epoch": 1.2031866910289664, + "grad_norm": 0.7824183106422424, + "learning_rate": 3.4346095854169016e-05, + "loss": 0.6797, + "step": 188330 + }, + { + "epoch": 1.203250578178705, + "grad_norm": 0.7336857318878174, + "learning_rate": 3.434133050011919e-05, + "loss": 0.7331, + "step": 188340 + }, + { + "epoch": 1.2033144653284438, + "grad_norm": 1.238508701324463, + "learning_rate": 3.433656530376571e-05, + "loss": 0.6601, + "step": 188350 + }, + { + "epoch": 1.2033783524781825, + "grad_norm": 1.044569730758667, + "learning_rate": 3.4331800265156565e-05, + "loss": 0.9067, + "step": 188360 + }, + { + "epoch": 1.2034422396279212, + "grad_norm": 0.8683834075927734, + "learning_rate": 3.432703538433976e-05, + "loss": 0.7587, + "step": 188370 + }, + { + "epoch": 1.20350612677766, + "grad_norm": 1.4416515827178955, + "learning_rate": 3.432227066136326e-05, + "loss": 1.0052, + "step": 188380 + }, + { + "epoch": 1.2035700139273986, + "grad_norm": 0.8313100337982178, + "learning_rate": 3.431750609627504e-05, + "loss": 0.9759, + "step": 
188390 + }, + { + "epoch": 1.2036339010771373, + "grad_norm": 1.365005373954773, + "learning_rate": 3.4312741689123115e-05, + "loss": 0.7126, + "step": 188400 + }, + { + "epoch": 1.203697788226876, + "grad_norm": 0.9269607663154602, + "learning_rate": 3.430797743995546e-05, + "loss": 0.8952, + "step": 188410 + }, + { + "epoch": 1.2037616753766147, + "grad_norm": 0.9355165362358093, + "learning_rate": 3.430321334882004e-05, + "loss": 1.0636, + "step": 188420 + }, + { + "epoch": 1.2038255625263534, + "grad_norm": 0.822027862071991, + "learning_rate": 3.4298449415764846e-05, + "loss": 0.7767, + "step": 188430 + }, + { + "epoch": 1.2038894496760921, + "grad_norm": 0.9582952857017517, + "learning_rate": 3.429368564083784e-05, + "loss": 0.783, + "step": 188440 + }, + { + "epoch": 1.2039533368258308, + "grad_norm": 0.9899607300758362, + "learning_rate": 3.428892202408702e-05, + "loss": 0.6833, + "step": 188450 + }, + { + "epoch": 1.2040172239755695, + "grad_norm": 0.8969405889511108, + "learning_rate": 3.428415856556034e-05, + "loss": 0.9458, + "step": 188460 + }, + { + "epoch": 1.2040811111253082, + "grad_norm": 0.8101646304130554, + "learning_rate": 3.427939526530578e-05, + "loss": 0.8489, + "step": 188470 + }, + { + "epoch": 1.204144998275047, + "grad_norm": 0.9676410555839539, + "learning_rate": 3.427463212337131e-05, + "loss": 1.2028, + "step": 188480 + }, + { + "epoch": 1.2042088854247857, + "grad_norm": 1.022590160369873, + "learning_rate": 3.42698691398049e-05, + "loss": 0.6856, + "step": 188490 + }, + { + "epoch": 1.2042727725745244, + "grad_norm": 1.153812050819397, + "learning_rate": 3.4265106314654506e-05, + "loss": 0.7006, + "step": 188500 + }, + { + "epoch": 1.204336659724263, + "grad_norm": 1.513657808303833, + "learning_rate": 3.426034364796811e-05, + "loss": 0.9024, + "step": 188510 + }, + { + "epoch": 1.2044005468740018, + "grad_norm": 2.0316057205200195, + "learning_rate": 3.425558113979367e-05, + "loss": 0.9124, + "step": 188520 + }, + { + "epoch": 
1.2044644340237405, + "grad_norm": 0.7945497035980225, + "learning_rate": 3.425081879017916e-05, + "loss": 0.7946, + "step": 188530 + }, + { + "epoch": 1.2045283211734792, + "grad_norm": 0.8211964964866638, + "learning_rate": 3.4246056599172516e-05, + "loss": 0.895, + "step": 188540 + }, + { + "epoch": 1.204592208323218, + "grad_norm": 0.8268355131149292, + "learning_rate": 3.424129456682172e-05, + "loss": 0.9794, + "step": 188550 + }, + { + "epoch": 1.2046560954729566, + "grad_norm": 1.3753883838653564, + "learning_rate": 3.4236532693174716e-05, + "loss": 0.8483, + "step": 188560 + }, + { + "epoch": 1.2047199826226953, + "grad_norm": 1.092286467552185, + "learning_rate": 3.4231770978279474e-05, + "loss": 0.7269, + "step": 188570 + }, + { + "epoch": 1.204783869772434, + "grad_norm": 0.8295246362686157, + "learning_rate": 3.422700942218393e-05, + "loss": 1.006, + "step": 188580 + }, + { + "epoch": 1.2048477569221727, + "grad_norm": 1.040263056755066, + "learning_rate": 3.422224802493605e-05, + "loss": 0.6638, + "step": 188590 + }, + { + "epoch": 1.2049116440719114, + "grad_norm": 0.6874042749404907, + "learning_rate": 3.421748678658378e-05, + "loss": 0.8416, + "step": 188600 + }, + { + "epoch": 1.2049755312216501, + "grad_norm": 0.7841446995735168, + "learning_rate": 3.4212725707175075e-05, + "loss": 0.9625, + "step": 188610 + }, + { + "epoch": 1.2050394183713888, + "grad_norm": 1.3204227685928345, + "learning_rate": 3.4207964786757876e-05, + "loss": 0.9017, + "step": 188620 + }, + { + "epoch": 1.2051033055211275, + "grad_norm": 1.4805320501327515, + "learning_rate": 3.4203204025380145e-05, + "loss": 0.8957, + "step": 188630 + }, + { + "epoch": 1.2051671926708662, + "grad_norm": 1.7014871835708618, + "learning_rate": 3.41984434230898e-05, + "loss": 1.0828, + "step": 188640 + }, + { + "epoch": 1.205231079820605, + "grad_norm": 1.158589243888855, + "learning_rate": 3.419368297993481e-05, + "loss": 0.8051, + "step": 188650 + }, + { + "epoch": 1.2052949669703437, + 
"grad_norm": 1.0002838373184204, + "learning_rate": 3.41889226959631e-05, + "loss": 0.6792, + "step": 188660 + }, + { + "epoch": 1.2053588541200824, + "grad_norm": 0.8275355696678162, + "learning_rate": 3.418416257122262e-05, + "loss": 0.9482, + "step": 188670 + }, + { + "epoch": 1.205422741269821, + "grad_norm": 0.9177698493003845, + "learning_rate": 3.417940260576131e-05, + "loss": 1.1724, + "step": 188680 + }, + { + "epoch": 1.2054866284195598, + "grad_norm": 0.7963119745254517, + "learning_rate": 3.41746427996271e-05, + "loss": 0.779, + "step": 188690 + }, + { + "epoch": 1.2055505155692985, + "grad_norm": 1.003836750984192, + "learning_rate": 3.4169883152867925e-05, + "loss": 0.8695, + "step": 188700 + }, + { + "epoch": 1.205614402719037, + "grad_norm": 1.1356700658798218, + "learning_rate": 3.4165123665531715e-05, + "loss": 0.7465, + "step": 188710 + }, + { + "epoch": 1.205678289868776, + "grad_norm": 0.8815405368804932, + "learning_rate": 3.416036433766642e-05, + "loss": 1.0118, + "step": 188720 + }, + { + "epoch": 1.2057421770185144, + "grad_norm": 0.5799207091331482, + "learning_rate": 3.415560516931996e-05, + "loss": 0.7291, + "step": 188730 + }, + { + "epoch": 1.2058060641682533, + "grad_norm": 1.3417963981628418, + "learning_rate": 3.415084616054025e-05, + "loss": 0.6866, + "step": 188740 + }, + { + "epoch": 1.2058699513179918, + "grad_norm": 2.6382036209106445, + "learning_rate": 3.414608731137525e-05, + "loss": 0.8929, + "step": 188750 + }, + { + "epoch": 1.2059338384677307, + "grad_norm": 0.8235493302345276, + "learning_rate": 3.414132862187287e-05, + "loss": 0.7851, + "step": 188760 + }, + { + "epoch": 1.2059977256174692, + "grad_norm": 0.9888988137245178, + "learning_rate": 3.413657009208102e-05, + "loss": 0.8698, + "step": 188770 + }, + { + "epoch": 1.206061612767208, + "grad_norm": 2.198369026184082, + "learning_rate": 3.413181172204763e-05, + "loss": 0.8233, + "step": 188780 + }, + { + "epoch": 1.2061254999169466, + "grad_norm": 
0.8852475881576538, + "learning_rate": 3.4127053511820626e-05, + "loss": 1.0084, + "step": 188790 + }, + { + "epoch": 1.2061893870666853, + "grad_norm": 1.3241829872131348, + "learning_rate": 3.412229546144792e-05, + "loss": 1.0275, + "step": 188800 + }, + { + "epoch": 1.206253274216424, + "grad_norm": 1.0022987127304077, + "learning_rate": 3.4117537570977443e-05, + "loss": 1.2873, + "step": 188810 + }, + { + "epoch": 1.2063171613661627, + "grad_norm": 0.8337856531143188, + "learning_rate": 3.41127798404571e-05, + "loss": 0.8505, + "step": 188820 + }, + { + "epoch": 1.2063810485159014, + "grad_norm": 1.0508147478103638, + "learning_rate": 3.410802226993479e-05, + "loss": 0.7073, + "step": 188830 + }, + { + "epoch": 1.2064449356656402, + "grad_norm": 0.8216559290885925, + "learning_rate": 3.4103264859458464e-05, + "loss": 0.8745, + "step": 188840 + }, + { + "epoch": 1.2065088228153789, + "grad_norm": 0.7753427624702454, + "learning_rate": 3.409850760907601e-05, + "loss": 0.7886, + "step": 188850 + }, + { + "epoch": 1.2065727099651176, + "grad_norm": 0.857014000415802, + "learning_rate": 3.4093750518835346e-05, + "loss": 0.8163, + "step": 188860 + }, + { + "epoch": 1.2066365971148563, + "grad_norm": 1.3117613792419434, + "learning_rate": 3.408899358878437e-05, + "loss": 0.6619, + "step": 188870 + }, + { + "epoch": 1.206700484264595, + "grad_norm": 0.9748005270957947, + "learning_rate": 3.4084236818970996e-05, + "loss": 0.9795, + "step": 188880 + }, + { + "epoch": 1.2067643714143337, + "grad_norm": 0.5203637480735779, + "learning_rate": 3.407948020944312e-05, + "loss": 0.8075, + "step": 188890 + }, + { + "epoch": 1.2068282585640724, + "grad_norm": 0.7732601761817932, + "learning_rate": 3.407472376024866e-05, + "loss": 0.8769, + "step": 188900 + }, + { + "epoch": 1.206892145713811, + "grad_norm": 0.9090991020202637, + "learning_rate": 3.4069967471435506e-05, + "loss": 0.6339, + "step": 188910 + }, + { + "epoch": 1.2069560328635498, + "grad_norm": 0.8993181586265564, + 
"learning_rate": 3.406521134305156e-05, + "loss": 1.1261, + "step": 188920 + }, + { + "epoch": 1.2070199200132885, + "grad_norm": 0.8620532751083374, + "learning_rate": 3.406045537514472e-05, + "loss": 0.7915, + "step": 188930 + }, + { + "epoch": 1.2070838071630272, + "grad_norm": 1.1491893529891968, + "learning_rate": 3.405569956776289e-05, + "loss": 0.8156, + "step": 188940 + }, + { + "epoch": 1.207147694312766, + "grad_norm": 0.9209848046302795, + "learning_rate": 3.405094392095395e-05, + "loss": 0.7993, + "step": 188950 + }, + { + "epoch": 1.2072115814625046, + "grad_norm": 0.7707030177116394, + "learning_rate": 3.404618843476581e-05, + "loss": 1.0284, + "step": 188960 + }, + { + "epoch": 1.2072754686122433, + "grad_norm": 0.7875609397888184, + "learning_rate": 3.404143310924635e-05, + "loss": 0.8219, + "step": 188970 + }, + { + "epoch": 1.207339355761982, + "grad_norm": 0.9179214835166931, + "learning_rate": 3.403667794444347e-05, + "loss": 0.7936, + "step": 188980 + }, + { + "epoch": 1.2074032429117207, + "grad_norm": 0.7444946765899658, + "learning_rate": 3.403192294040505e-05, + "loss": 0.7553, + "step": 188990 + }, + { + "epoch": 1.2074671300614594, + "grad_norm": 1.558156132698059, + "learning_rate": 3.4027168097178976e-05, + "loss": 0.8449, + "step": 189000 + }, + { + "epoch": 1.2075310172111982, + "grad_norm": 2.1795334815979004, + "learning_rate": 3.402241341481314e-05, + "loss": 1.1889, + "step": 189010 + }, + { + "epoch": 1.2075949043609369, + "grad_norm": 0.6319822072982788, + "learning_rate": 3.4017658893355434e-05, + "loss": 1.2551, + "step": 189020 + }, + { + "epoch": 1.2076587915106756, + "grad_norm": 0.8189280033111572, + "learning_rate": 3.401290453285371e-05, + "loss": 0.8926, + "step": 189030 + }, + { + "epoch": 1.2077226786604143, + "grad_norm": 0.9746350049972534, + "learning_rate": 3.4008150333355875e-05, + "loss": 0.7228, + "step": 189040 + }, + { + "epoch": 1.207786565810153, + "grad_norm": 0.8227341175079346, + "learning_rate": 
3.4003396294909804e-05, + "loss": 0.7056, + "step": 189050 + }, + { + "epoch": 1.2078504529598917, + "grad_norm": 1.0430861711502075, + "learning_rate": 3.3998642417563375e-05, + "loss": 0.6838, + "step": 189060 + }, + { + "epoch": 1.2079143401096304, + "grad_norm": 0.838440477848053, + "learning_rate": 3.3993888701364457e-05, + "loss": 0.9032, + "step": 189070 + }, + { + "epoch": 1.207978227259369, + "grad_norm": 0.6549668312072754, + "learning_rate": 3.398913514636093e-05, + "loss": 0.8481, + "step": 189080 + }, + { + "epoch": 1.2080421144091078, + "grad_norm": 0.7392321825027466, + "learning_rate": 3.398438175260066e-05, + "loss": 0.8972, + "step": 189090 + }, + { + "epoch": 1.2081060015588465, + "grad_norm": 0.9022709131240845, + "learning_rate": 3.3979628520131524e-05, + "loss": 0.9335, + "step": 189100 + }, + { + "epoch": 1.2081698887085852, + "grad_norm": 1.0506974458694458, + "learning_rate": 3.397487544900139e-05, + "loss": 1.1411, + "step": 189110 + }, + { + "epoch": 1.208233775858324, + "grad_norm": 0.7597492933273315, + "learning_rate": 3.3970122539258114e-05, + "loss": 0.8804, + "step": 189120 + }, + { + "epoch": 1.2082976630080626, + "grad_norm": 0.8924365043640137, + "learning_rate": 3.396536979094958e-05, + "loss": 0.7818, + "step": 189130 + }, + { + "epoch": 1.2083615501578013, + "grad_norm": 0.6869126558303833, + "learning_rate": 3.3960617204123646e-05, + "loss": 1.0775, + "step": 189140 + }, + { + "epoch": 1.20842543730754, + "grad_norm": 0.6726001501083374, + "learning_rate": 3.3955864778828167e-05, + "loss": 0.9098, + "step": 189150 + }, + { + "epoch": 1.2084893244572787, + "grad_norm": 0.5952368378639221, + "learning_rate": 3.395111251511101e-05, + "loss": 0.7089, + "step": 189160 + }, + { + "epoch": 1.2085532116070175, + "grad_norm": 1.8950144052505493, + "learning_rate": 3.394636041302004e-05, + "loss": 0.974, + "step": 189170 + }, + { + "epoch": 1.2086170987567562, + "grad_norm": 1.1229348182678223, + "learning_rate": 
3.3941608472603106e-05, + "loss": 1.1186, + "step": 189180 + }, + { + "epoch": 1.2086809859064949, + "grad_norm": 0.742145836353302, + "learning_rate": 3.393685669390806e-05, + "loss": 0.7087, + "step": 189190 + }, + { + "epoch": 1.2087448730562333, + "grad_norm": 0.7637613415718079, + "learning_rate": 3.393210507698278e-05, + "loss": 0.9503, + "step": 189200 + }, + { + "epoch": 1.2088087602059723, + "grad_norm": 0.7326201796531677, + "learning_rate": 3.392735362187509e-05, + "loss": 1.0978, + "step": 189210 + }, + { + "epoch": 1.2088726473557108, + "grad_norm": 0.764445960521698, + "learning_rate": 3.392260232863286e-05, + "loss": 0.8275, + "step": 189220 + }, + { + "epoch": 1.2089365345054497, + "grad_norm": 1.7644976377487183, + "learning_rate": 3.3917851197303926e-05, + "loss": 0.8628, + "step": 189230 + }, + { + "epoch": 1.2090004216551882, + "grad_norm": 0.5463082790374756, + "learning_rate": 3.391310022793613e-05, + "loss": 0.9615, + "step": 189240 + }, + { + "epoch": 1.209064308804927, + "grad_norm": 1.1664972305297852, + "learning_rate": 3.390834942057738e-05, + "loss": 0.7439, + "step": 189250 + }, + { + "epoch": 1.2091281959546656, + "grad_norm": 0.6254557371139526, + "learning_rate": 3.390359877527544e-05, + "loss": 0.6675, + "step": 189260 + }, + { + "epoch": 1.2091920831044043, + "grad_norm": 0.7890540361404419, + "learning_rate": 3.389884829207819e-05, + "loss": 0.7559, + "step": 189270 + }, + { + "epoch": 1.209255970254143, + "grad_norm": 1.1657845973968506, + "learning_rate": 3.3894097971033465e-05, + "loss": 0.7273, + "step": 189280 + }, + { + "epoch": 1.2093198574038817, + "grad_norm": 0.8113539218902588, + "learning_rate": 3.388934781218911e-05, + "loss": 0.9024, + "step": 189290 + }, + { + "epoch": 1.2093837445536204, + "grad_norm": 0.9730884432792664, + "learning_rate": 3.388459781559296e-05, + "loss": 0.6144, + "step": 189300 + }, + { + "epoch": 1.2094476317033591, + "grad_norm": 1.0396467447280884, + "learning_rate": 3.387984798129284e-05, + 
"loss": 0.6144, + "step": 189310 + }, + { + "epoch": 1.2095115188530978, + "grad_norm": 1.3935688734054565, + "learning_rate": 3.387509830933661e-05, + "loss": 0.8328, + "step": 189320 + }, + { + "epoch": 1.2095754060028365, + "grad_norm": 2.3891825675964355, + "learning_rate": 3.387034879977209e-05, + "loss": 0.8898, + "step": 189330 + }, + { + "epoch": 1.2096392931525752, + "grad_norm": 1.1916571855545044, + "learning_rate": 3.38655994526471e-05, + "loss": 0.911, + "step": 189340 + }, + { + "epoch": 1.209703180302314, + "grad_norm": 1.0677696466445923, + "learning_rate": 3.386085026800948e-05, + "loss": 0.7844, + "step": 189350 + }, + { + "epoch": 1.2097670674520526, + "grad_norm": 0.8219583034515381, + "learning_rate": 3.385610124590707e-05, + "loss": 0.7822, + "step": 189360 + }, + { + "epoch": 1.2098309546017914, + "grad_norm": 0.892532229423523, + "learning_rate": 3.3851352386387694e-05, + "loss": 0.9249, + "step": 189370 + }, + { + "epoch": 1.20989484175153, + "grad_norm": 1.4263323545455933, + "learning_rate": 3.3846603689499155e-05, + "loss": 0.9323, + "step": 189380 + }, + { + "epoch": 1.2099587289012688, + "grad_norm": 0.8026622533798218, + "learning_rate": 3.3841855155289304e-05, + "loss": 0.7288, + "step": 189390 + }, + { + "epoch": 1.2100226160510075, + "grad_norm": 0.7562341690063477, + "learning_rate": 3.383710678380595e-05, + "loss": 0.8231, + "step": 189400 + }, + { + "epoch": 1.2100865032007462, + "grad_norm": 1.2895803451538086, + "learning_rate": 3.38323585750969e-05, + "loss": 0.9344, + "step": 189410 + }, + { + "epoch": 1.2101503903504849, + "grad_norm": 1.0178850889205933, + "learning_rate": 3.382761052921e-05, + "loss": 0.9551, + "step": 189420 + }, + { + "epoch": 1.2102142775002236, + "grad_norm": 1.0130857229232788, + "learning_rate": 3.382286264619304e-05, + "loss": 1.0137, + "step": 189430 + }, + { + "epoch": 1.2102781646499623, + "grad_norm": 1.47183096408844, + "learning_rate": 3.381811492609386e-05, + "loss": 0.9249, + "step": 189440 
+ }, + { + "epoch": 1.210342051799701, + "grad_norm": 1.0401761531829834, + "learning_rate": 3.381336736896026e-05, + "loss": 0.8981, + "step": 189450 + }, + { + "epoch": 1.2104059389494397, + "grad_norm": 1.007563829421997, + "learning_rate": 3.3808619974840053e-05, + "loss": 0.5923, + "step": 189460 + }, + { + "epoch": 1.2104698260991784, + "grad_norm": 0.8079440593719482, + "learning_rate": 3.3803872743781054e-05, + "loss": 1.1246, + "step": 189470 + }, + { + "epoch": 1.2105337132489171, + "grad_norm": 1.758446455001831, + "learning_rate": 3.379912567583106e-05, + "loss": 0.8868, + "step": 189480 + }, + { + "epoch": 1.2105976003986558, + "grad_norm": 0.5423232316970825, + "learning_rate": 3.379437877103789e-05, + "loss": 0.7521, + "step": 189490 + }, + { + "epoch": 1.2106614875483945, + "grad_norm": 0.869480311870575, + "learning_rate": 3.378963202944935e-05, + "loss": 0.9616, + "step": 189500 + }, + { + "epoch": 1.2107253746981332, + "grad_norm": 0.9608466029167175, + "learning_rate": 3.3784885451113235e-05, + "loss": 0.766, + "step": 189510 + }, + { + "epoch": 1.210789261847872, + "grad_norm": 1.1704931259155273, + "learning_rate": 3.3780139036077355e-05, + "loss": 0.8673, + "step": 189520 + }, + { + "epoch": 1.2108531489976107, + "grad_norm": 0.8335651159286499, + "learning_rate": 3.37753927843895e-05, + "loss": 1.0876, + "step": 189530 + }, + { + "epoch": 1.2109170361473494, + "grad_norm": 0.5794630646705627, + "learning_rate": 3.377064669609748e-05, + "loss": 0.7555, + "step": 189540 + }, + { + "epoch": 1.210980923297088, + "grad_norm": 0.7695865035057068, + "learning_rate": 3.3765900771249094e-05, + "loss": 0.7292, + "step": 189550 + }, + { + "epoch": 1.2110448104468268, + "grad_norm": 1.706060767173767, + "learning_rate": 3.3761155009892106e-05, + "loss": 0.9732, + "step": 189560 + }, + { + "epoch": 1.2111086975965655, + "grad_norm": 1.0700571537017822, + "learning_rate": 3.3756409412074365e-05, + "loss": 0.7203, + "step": 189570 + }, + { + "epoch": 
1.2111725847463042, + "grad_norm": 1.068686842918396, + "learning_rate": 3.3751663977843616e-05, + "loss": 0.8256, + "step": 189580 + }, + { + "epoch": 1.2112364718960429, + "grad_norm": 0.7472430467605591, + "learning_rate": 3.374691870724768e-05, + "loss": 1.0348, + "step": 189590 + }, + { + "epoch": 1.2113003590457816, + "grad_norm": 1.1850546598434448, + "learning_rate": 3.374217360033433e-05, + "loss": 0.794, + "step": 189600 + }, + { + "epoch": 1.2113642461955203, + "grad_norm": 0.6383751034736633, + "learning_rate": 3.373742865715136e-05, + "loss": 0.7884, + "step": 189610 + }, + { + "epoch": 1.211428133345259, + "grad_norm": 0.866807222366333, + "learning_rate": 3.373268387774655e-05, + "loss": 1.3421, + "step": 189620 + }, + { + "epoch": 1.2114920204949977, + "grad_norm": 1.0041619539260864, + "learning_rate": 3.372793926216769e-05, + "loss": 0.7278, + "step": 189630 + }, + { + "epoch": 1.2115559076447364, + "grad_norm": 2.272212266921997, + "learning_rate": 3.372319481046254e-05, + "loss": 1.0229, + "step": 189640 + }, + { + "epoch": 1.2116197947944751, + "grad_norm": 1.066819667816162, + "learning_rate": 3.371845052267892e-05, + "loss": 0.9702, + "step": 189650 + }, + { + "epoch": 1.2116836819442138, + "grad_norm": 0.9112004637718201, + "learning_rate": 3.371370639886459e-05, + "loss": 0.8027, + "step": 189660 + }, + { + "epoch": 1.2117475690939525, + "grad_norm": 0.8083162307739258, + "learning_rate": 3.3708962439067316e-05, + "loss": 0.6973, + "step": 189670 + }, + { + "epoch": 1.2118114562436912, + "grad_norm": 1.1185535192489624, + "learning_rate": 3.3704218643334884e-05, + "loss": 0.7762, + "step": 189680 + }, + { + "epoch": 1.2118753433934297, + "grad_norm": 0.5453072786331177, + "learning_rate": 3.369947501171507e-05, + "loss": 0.7053, + "step": 189690 + }, + { + "epoch": 1.2119392305431687, + "grad_norm": 0.6224724054336548, + "learning_rate": 3.3694731544255646e-05, + "loss": 0.8089, + "step": 189700 + }, + { + "epoch": 1.2120031176929071, + 
"grad_norm": 0.9544569253921509, + "learning_rate": 3.3689988241004385e-05, + "loss": 0.8551, + "step": 189710 + }, + { + "epoch": 1.212067004842646, + "grad_norm": 0.9484859704971313, + "learning_rate": 3.368524510200904e-05, + "loss": 0.7186, + "step": 189720 + }, + { + "epoch": 1.2121308919923846, + "grad_norm": 1.351324200630188, + "learning_rate": 3.36805021273174e-05, + "loss": 0.8923, + "step": 189730 + }, + { + "epoch": 1.2121947791421233, + "grad_norm": 1.0908766984939575, + "learning_rate": 3.367575931697724e-05, + "loss": 0.9379, + "step": 189740 + }, + { + "epoch": 1.212258666291862, + "grad_norm": 1.441122055053711, + "learning_rate": 3.3671016671036286e-05, + "loss": 0.917, + "step": 189750 + }, + { + "epoch": 1.2123225534416007, + "grad_norm": 0.6979114413261414, + "learning_rate": 3.3666274189542327e-05, + "loss": 0.6953, + "step": 189760 + }, + { + "epoch": 1.2123864405913394, + "grad_norm": 0.9482870697975159, + "learning_rate": 3.3661531872543114e-05, + "loss": 0.9533, + "step": 189770 + }, + { + "epoch": 1.212450327741078, + "grad_norm": 0.9386551976203918, + "learning_rate": 3.365678972008641e-05, + "loss": 1.1059, + "step": 189780 + }, + { + "epoch": 1.2125142148908168, + "grad_norm": 0.9457994103431702, + "learning_rate": 3.365204773221997e-05, + "loss": 0.919, + "step": 189790 + }, + { + "epoch": 1.2125781020405555, + "grad_norm": 1.090871810913086, + "learning_rate": 3.364730590899156e-05, + "loss": 0.9031, + "step": 189800 + }, + { + "epoch": 1.2126419891902942, + "grad_norm": 1.0352469682693481, + "learning_rate": 3.3642564250448917e-05, + "loss": 0.7198, + "step": 189810 + }, + { + "epoch": 1.212705876340033, + "grad_norm": 0.983015775680542, + "learning_rate": 3.3637822756639806e-05, + "loss": 0.9113, + "step": 189820 + }, + { + "epoch": 1.2127697634897716, + "grad_norm": 3.4296910762786865, + "learning_rate": 3.3633081427611976e-05, + "loss": 0.8825, + "step": 189830 + }, + { + "epoch": 1.2128336506395103, + "grad_norm": 
0.7872678637504578, + "learning_rate": 3.362834026341317e-05, + "loss": 1.1606, + "step": 189840 + }, + { + "epoch": 1.212897537789249, + "grad_norm": 0.6566711664199829, + "learning_rate": 3.362359926409115e-05, + "loss": 0.7937, + "step": 189850 + }, + { + "epoch": 1.2129614249389877, + "grad_norm": 0.7389879822731018, + "learning_rate": 3.361885842969365e-05, + "loss": 0.7068, + "step": 189860 + }, + { + "epoch": 1.2130253120887264, + "grad_norm": 1.2746154069900513, + "learning_rate": 3.3614117760268415e-05, + "loss": 0.9289, + "step": 189870 + }, + { + "epoch": 1.2130891992384651, + "grad_norm": 0.8612440228462219, + "learning_rate": 3.360937725586318e-05, + "loss": 1.0348, + "step": 189880 + }, + { + "epoch": 1.2131530863882038, + "grad_norm": 0.6161855459213257, + "learning_rate": 3.360463691652571e-05, + "loss": 0.7618, + "step": 189890 + }, + { + "epoch": 1.2132169735379426, + "grad_norm": 1.0843278169631958, + "learning_rate": 3.3599896742303726e-05, + "loss": 0.7903, + "step": 189900 + }, + { + "epoch": 1.2132808606876813, + "grad_norm": 0.8129070401191711, + "learning_rate": 3.359515673324497e-05, + "loss": 1.0778, + "step": 189910 + }, + { + "epoch": 1.21334474783742, + "grad_norm": 1.6477254629135132, + "learning_rate": 3.359041688939718e-05, + "loss": 0.6379, + "step": 189920 + }, + { + "epoch": 1.2134086349871587, + "grad_norm": 1.2734267711639404, + "learning_rate": 3.358567721080809e-05, + "loss": 1.2492, + "step": 189930 + }, + { + "epoch": 1.2134725221368974, + "grad_norm": 0.704581081867218, + "learning_rate": 3.358093769752543e-05, + "loss": 1.0449, + "step": 189940 + }, + { + "epoch": 1.213536409286636, + "grad_norm": 0.846922755241394, + "learning_rate": 3.357619834959693e-05, + "loss": 1.0302, + "step": 189950 + }, + { + "epoch": 1.2136002964363748, + "grad_norm": 0.7383447289466858, + "learning_rate": 3.357145916707033e-05, + "loss": 1.0175, + "step": 189960 + }, + { + "epoch": 1.2136641835861135, + "grad_norm": 0.7878567576408386, + 
"learning_rate": 3.356672014999333e-05, + "loss": 0.7262, + "step": 189970 + }, + { + "epoch": 1.2137280707358522, + "grad_norm": 1.0822579860687256, + "learning_rate": 3.3561981298413695e-05, + "loss": 0.9257, + "step": 189980 + }, + { + "epoch": 1.213791957885591, + "grad_norm": 0.9476717114448547, + "learning_rate": 3.355724261237911e-05, + "loss": 1.0248, + "step": 189990 + }, + { + "epoch": 1.2138558450353296, + "grad_norm": 1.3661209344863892, + "learning_rate": 3.3552504091937334e-05, + "loss": 0.7913, + "step": 190000 + }, + { + "epoch": 1.2139197321850683, + "grad_norm": 2.8625452518463135, + "learning_rate": 3.354776573713606e-05, + "loss": 0.7928, + "step": 190010 + }, + { + "epoch": 1.213983619334807, + "grad_norm": 0.8434425592422485, + "learning_rate": 3.354302754802303e-05, + "loss": 0.674, + "step": 190020 + }, + { + "epoch": 1.2140475064845457, + "grad_norm": 0.6829739809036255, + "learning_rate": 3.353828952464594e-05, + "loss": 0.7713, + "step": 190030 + }, + { + "epoch": 1.2141113936342844, + "grad_norm": 1.4027974605560303, + "learning_rate": 3.353355166705251e-05, + "loss": 0.7881, + "step": 190040 + }, + { + "epoch": 1.2141752807840231, + "grad_norm": 1.39128839969635, + "learning_rate": 3.352881397529047e-05, + "loss": 1.0224, + "step": 190050 + }, + { + "epoch": 1.2142391679337619, + "grad_norm": 0.9742370843887329, + "learning_rate": 3.352407644940753e-05, + "loss": 0.8518, + "step": 190060 + }, + { + "epoch": 1.2143030550835006, + "grad_norm": 1.9331669807434082, + "learning_rate": 3.351933908945138e-05, + "loss": 0.766, + "step": 190070 + }, + { + "epoch": 1.2143669422332393, + "grad_norm": 1.6505056619644165, + "learning_rate": 3.3514601895469736e-05, + "loss": 0.9179, + "step": 190080 + }, + { + "epoch": 1.214430829382978, + "grad_norm": 1.2252271175384521, + "learning_rate": 3.3509864867510325e-05, + "loss": 1.046, + "step": 190090 + }, + { + "epoch": 1.2144947165327167, + "grad_norm": 0.7833622694015503, + "learning_rate": 
3.3505128005620845e-05, + "loss": 0.7841, + "step": 190100 + }, + { + "epoch": 1.2145586036824554, + "grad_norm": 0.8546239137649536, + "learning_rate": 3.3500391309848986e-05, + "loss": 0.8422, + "step": 190110 + }, + { + "epoch": 1.214622490832194, + "grad_norm": 1.0654160976409912, + "learning_rate": 3.349565478024247e-05, + "loss": 0.874, + "step": 190120 + }, + { + "epoch": 1.2146863779819328, + "grad_norm": 1.2579134702682495, + "learning_rate": 3.349091841684898e-05, + "loss": 0.8686, + "step": 190130 + }, + { + "epoch": 1.2147502651316715, + "grad_norm": 0.8867964744567871, + "learning_rate": 3.3486182219716235e-05, + "loss": 1.1406, + "step": 190140 + }, + { + "epoch": 1.2148141522814102, + "grad_norm": 0.8828266859054565, + "learning_rate": 3.348144618889191e-05, + "loss": 0.7873, + "step": 190150 + }, + { + "epoch": 1.214878039431149, + "grad_norm": 0.7557622194290161, + "learning_rate": 3.347671032442372e-05, + "loss": 0.9467, + "step": 190160 + }, + { + "epoch": 1.2149419265808876, + "grad_norm": 1.5542864799499512, + "learning_rate": 3.3471974626359346e-05, + "loss": 0.9072, + "step": 190170 + }, + { + "epoch": 1.215005813730626, + "grad_norm": 1.122002363204956, + "learning_rate": 3.3467239094746494e-05, + "loss": 1.0869, + "step": 190180 + }, + { + "epoch": 1.215069700880365, + "grad_norm": 1.4732335805892944, + "learning_rate": 3.346250372963284e-05, + "loss": 0.6884, + "step": 190190 + }, + { + "epoch": 1.2151335880301035, + "grad_norm": 0.712639570236206, + "learning_rate": 3.345776853106609e-05, + "loss": 0.691, + "step": 190200 + }, + { + "epoch": 1.2151974751798424, + "grad_norm": 0.7023781538009644, + "learning_rate": 3.345303349909391e-05, + "loss": 0.7389, + "step": 190210 + }, + { + "epoch": 1.215261362329581, + "grad_norm": 1.098771333694458, + "learning_rate": 3.344829863376402e-05, + "loss": 0.7947, + "step": 190220 + }, + { + "epoch": 1.2153252494793196, + "grad_norm": 0.7597054243087769, + "learning_rate": 3.344356393512407e-05, + 
"loss": 0.6477, + "step": 190230 + }, + { + "epoch": 1.2153891366290583, + "grad_norm": 1.0728919506072998, + "learning_rate": 3.343882940322174e-05, + "loss": 0.7738, + "step": 190240 + }, + { + "epoch": 1.215453023778797, + "grad_norm": 1.2330894470214844, + "learning_rate": 3.343409503810474e-05, + "loss": 1.2956, + "step": 190250 + }, + { + "epoch": 1.2155169109285358, + "grad_norm": 1.1694852113723755, + "learning_rate": 3.3429360839820725e-05, + "loss": 0.7961, + "step": 190260 + }, + { + "epoch": 1.2155807980782745, + "grad_norm": 0.6705962419509888, + "learning_rate": 3.342462680841739e-05, + "loss": 1.0564, + "step": 190270 + }, + { + "epoch": 1.2156446852280132, + "grad_norm": 1.190106749534607, + "learning_rate": 3.34198929439424e-05, + "loss": 0.7572, + "step": 190280 + }, + { + "epoch": 1.2157085723777519, + "grad_norm": 0.7782067656517029, + "learning_rate": 3.341515924644343e-05, + "loss": 1.093, + "step": 190290 + }, + { + "epoch": 1.2157724595274906, + "grad_norm": 0.8707125186920166, + "learning_rate": 3.341042571596816e-05, + "loss": 0.6307, + "step": 190300 + }, + { + "epoch": 1.2158363466772293, + "grad_norm": 1.6781306266784668, + "learning_rate": 3.3405692352564246e-05, + "loss": 0.8267, + "step": 190310 + }, + { + "epoch": 1.215900233826968, + "grad_norm": 1.4789141416549683, + "learning_rate": 3.3400959156279377e-05, + "loss": 0.6929, + "step": 190320 + }, + { + "epoch": 1.2159641209767067, + "grad_norm": 1.2151391506195068, + "learning_rate": 3.33962261271612e-05, + "loss": 0.8728, + "step": 190330 + }, + { + "epoch": 1.2160280081264454, + "grad_norm": 1.0864027738571167, + "learning_rate": 3.339149326525739e-05, + "loss": 1.0483, + "step": 190340 + }, + { + "epoch": 1.216091895276184, + "grad_norm": 0.9961766600608826, + "learning_rate": 3.338676057061562e-05, + "loss": 0.872, + "step": 190350 + }, + { + "epoch": 1.2161557824259228, + "grad_norm": 1.3357287645339966, + "learning_rate": 3.3382028043283536e-05, + "loss": 1.0093, + "step": 
190360 + }, + { + "epoch": 1.2162196695756615, + "grad_norm": 0.7204846143722534, + "learning_rate": 3.3377295683308806e-05, + "loss": 0.7794, + "step": 190370 + }, + { + "epoch": 1.2162835567254002, + "grad_norm": 0.8506677150726318, + "learning_rate": 3.3372563490739094e-05, + "loss": 0.9173, + "step": 190380 + }, + { + "epoch": 1.216347443875139, + "grad_norm": 1.3914761543273926, + "learning_rate": 3.336783146562205e-05, + "loss": 0.9274, + "step": 190390 + }, + { + "epoch": 1.2164113310248776, + "grad_norm": 0.8650677800178528, + "learning_rate": 3.3363099608005335e-05, + "loss": 0.7552, + "step": 190400 + }, + { + "epoch": 1.2164752181746163, + "grad_norm": 1.133594036102295, + "learning_rate": 3.335836791793658e-05, + "loss": 0.6892, + "step": 190410 + }, + { + "epoch": 1.216539105324355, + "grad_norm": 0.9037141799926758, + "learning_rate": 3.3353636395463485e-05, + "loss": 1.0421, + "step": 190420 + }, + { + "epoch": 1.2166029924740938, + "grad_norm": 1.0905022621154785, + "learning_rate": 3.334890504063366e-05, + "loss": 0.8529, + "step": 190430 + }, + { + "epoch": 1.2166668796238325, + "grad_norm": 1.3914424180984497, + "learning_rate": 3.334417385349476e-05, + "loss": 0.9783, + "step": 190440 + }, + { + "epoch": 1.2167307667735712, + "grad_norm": 0.8010818958282471, + "learning_rate": 3.3339442834094454e-05, + "loss": 1.014, + "step": 190450 + }, + { + "epoch": 1.2167946539233099, + "grad_norm": 0.6773678064346313, + "learning_rate": 3.333471198248036e-05, + "loss": 0.8047, + "step": 190460 + }, + { + "epoch": 1.2168585410730486, + "grad_norm": 0.8083564043045044, + "learning_rate": 3.332998129870014e-05, + "loss": 0.7306, + "step": 190470 + }, + { + "epoch": 1.2169224282227873, + "grad_norm": 0.7467992901802063, + "learning_rate": 3.332525078280143e-05, + "loss": 0.7576, + "step": 190480 + }, + { + "epoch": 1.216986315372526, + "grad_norm": 1.5822430849075317, + "learning_rate": 3.332052043483187e-05, + "loss": 0.7701, + "step": 190490 + }, + { + 
"epoch": 1.2170502025222647, + "grad_norm": 0.7331676483154297, + "learning_rate": 3.3315790254839095e-05, + "loss": 0.8167, + "step": 190500 + }, + { + "epoch": 1.2171140896720034, + "grad_norm": 0.6782907247543335, + "learning_rate": 3.331106024287075e-05, + "loss": 1.0039, + "step": 190510 + }, + { + "epoch": 1.2171779768217421, + "grad_norm": 0.7742295861244202, + "learning_rate": 3.330633039897447e-05, + "loss": 0.8161, + "step": 190520 + }, + { + "epoch": 1.2172418639714808, + "grad_norm": 0.5957803726196289, + "learning_rate": 3.330160072319788e-05, + "loss": 0.8088, + "step": 190530 + }, + { + "epoch": 1.2173057511212195, + "grad_norm": 0.9488677978515625, + "learning_rate": 3.329687121558862e-05, + "loss": 0.8291, + "step": 190540 + }, + { + "epoch": 1.2173696382709582, + "grad_norm": 0.7537437081336975, + "learning_rate": 3.329214187619432e-05, + "loss": 1.0181, + "step": 190550 + }, + { + "epoch": 1.217433525420697, + "grad_norm": 1.4307500123977661, + "learning_rate": 3.328741270506259e-05, + "loss": 0.7942, + "step": 190560 + }, + { + "epoch": 1.2174974125704356, + "grad_norm": 0.7630217671394348, + "learning_rate": 3.328268370224109e-05, + "loss": 0.8375, + "step": 190570 + }, + { + "epoch": 1.2175612997201744, + "grad_norm": 1.0125505924224854, + "learning_rate": 3.327795486777742e-05, + "loss": 0.778, + "step": 190580 + }, + { + "epoch": 1.217625186869913, + "grad_norm": 0.6416699886322021, + "learning_rate": 3.327322620171921e-05, + "loss": 0.8773, + "step": 190590 + }, + { + "epoch": 1.2176890740196518, + "grad_norm": 1.3993701934814453, + "learning_rate": 3.326849770411408e-05, + "loss": 0.8246, + "step": 190600 + }, + { + "epoch": 1.2177529611693905, + "grad_norm": 0.6875872015953064, + "learning_rate": 3.326376937500965e-05, + "loss": 1.1023, + "step": 190610 + }, + { + "epoch": 1.2178168483191292, + "grad_norm": 0.9343377351760864, + "learning_rate": 3.325904121445354e-05, + "loss": 0.7328, + "step": 190620 + }, + { + "epoch": 
1.2178807354688679, + "grad_norm": 0.8546002507209778, + "learning_rate": 3.325431322249338e-05, + "loss": 0.7355, + "step": 190630 + }, + { + "epoch": 1.2179446226186066, + "grad_norm": 0.593250572681427, + "learning_rate": 3.324958539917677e-05, + "loss": 0.6562, + "step": 190640 + }, + { + "epoch": 1.218008509768345, + "grad_norm": 0.856907844543457, + "learning_rate": 3.3244857744551325e-05, + "loss": 0.9296, + "step": 190650 + }, + { + "epoch": 1.218072396918084, + "grad_norm": 0.9372380375862122, + "learning_rate": 3.324013025866467e-05, + "loss": 0.8646, + "step": 190660 + }, + { + "epoch": 1.2181362840678225, + "grad_norm": 0.7435608506202698, + "learning_rate": 3.323540294156439e-05, + "loss": 0.9015, + "step": 190670 + }, + { + "epoch": 1.2182001712175614, + "grad_norm": 0.686058521270752, + "learning_rate": 3.3230675793298104e-05, + "loss": 1.0199, + "step": 190680 + }, + { + "epoch": 1.2182640583673, + "grad_norm": 0.8744069933891296, + "learning_rate": 3.322594881391342e-05, + "loss": 0.8603, + "step": 190690 + }, + { + "epoch": 1.2183279455170388, + "grad_norm": 0.6687055826187134, + "learning_rate": 3.322169467690033e-05, + "loss": 0.8983, + "step": 190700 + }, + { + "epoch": 1.2183918326667773, + "grad_norm": 0.745978832244873, + "learning_rate": 3.321696801852185e-05, + "loss": 0.7131, + "step": 190710 + }, + { + "epoch": 1.218455719816516, + "grad_norm": 0.979073703289032, + "learning_rate": 3.3212241529163016e-05, + "loss": 0.7781, + "step": 190720 + }, + { + "epoch": 1.2185196069662547, + "grad_norm": 1.0259783267974854, + "learning_rate": 3.320751520887142e-05, + "loss": 0.7987, + "step": 190730 + }, + { + "epoch": 1.2185834941159934, + "grad_norm": 1.424546241760254, + "learning_rate": 3.3202789057694683e-05, + "loss": 0.8471, + "step": 190740 + }, + { + "epoch": 1.2186473812657321, + "grad_norm": 0.8425334095954895, + "learning_rate": 3.31980630756804e-05, + "loss": 0.7256, + "step": 190750 + }, + { + "epoch": 1.2187112684154708, + 
"grad_norm": 1.0759365558624268, + "learning_rate": 3.319333726287615e-05, + "loss": 0.9268, + "step": 190760 + }, + { + "epoch": 1.2187751555652095, + "grad_norm": 0.826580286026001, + "learning_rate": 3.318861161932954e-05, + "loss": 0.9029, + "step": 190770 + }, + { + "epoch": 1.2188390427149483, + "grad_norm": 0.7088884711265564, + "learning_rate": 3.3183886145088146e-05, + "loss": 0.9784, + "step": 190780 + }, + { + "epoch": 1.218902929864687, + "grad_norm": 2.455657720565796, + "learning_rate": 3.3179160840199566e-05, + "loss": 0.8813, + "step": 190790 + }, + { + "epoch": 1.2189668170144257, + "grad_norm": 1.166312575340271, + "learning_rate": 3.3174435704711396e-05, + "loss": 0.7366, + "step": 190800 + }, + { + "epoch": 1.2190307041641644, + "grad_norm": 0.9697980880737305, + "learning_rate": 3.316971073867121e-05, + "loss": 1.0195, + "step": 190810 + }, + { + "epoch": 1.219094591313903, + "grad_norm": 0.8241257667541504, + "learning_rate": 3.31649859421266e-05, + "loss": 0.9047, + "step": 190820 + }, + { + "epoch": 1.2191584784636418, + "grad_norm": 1.1289937496185303, + "learning_rate": 3.3160261315125145e-05, + "loss": 0.9061, + "step": 190830 + }, + { + "epoch": 1.2192223656133805, + "grad_norm": 1.415996789932251, + "learning_rate": 3.315553685771443e-05, + "loss": 0.892, + "step": 190840 + }, + { + "epoch": 1.2192862527631192, + "grad_norm": 1.2349244356155396, + "learning_rate": 3.315081256994204e-05, + "loss": 0.8346, + "step": 190850 + }, + { + "epoch": 1.219350139912858, + "grad_norm": 0.7951316833496094, + "learning_rate": 3.314608845185553e-05, + "loss": 0.8926, + "step": 190860 + }, + { + "epoch": 1.2194140270625966, + "grad_norm": 0.918441891670227, + "learning_rate": 3.3141364503502495e-05, + "loss": 0.7915, + "step": 190870 + }, + { + "epoch": 1.2194779142123353, + "grad_norm": 1.6058706045150757, + "learning_rate": 3.31366407249305e-05, + "loss": 0.8828, + "step": 190880 + }, + { + "epoch": 1.219541801362074, + "grad_norm": 
0.8341480493545532, + "learning_rate": 3.313191711618712e-05, + "loss": 0.6081, + "step": 190890 + }, + { + "epoch": 1.2196056885118127, + "grad_norm": 1.021825909614563, + "learning_rate": 3.312719367731993e-05, + "loss": 0.9466, + "step": 190900 + }, + { + "epoch": 1.2196695756615514, + "grad_norm": 1.2775073051452637, + "learning_rate": 3.31224704083765e-05, + "loss": 0.7967, + "step": 190910 + }, + { + "epoch": 1.2197334628112901, + "grad_norm": 1.687404751777649, + "learning_rate": 3.3117747309404397e-05, + "loss": 0.9625, + "step": 190920 + }, + { + "epoch": 1.2197973499610288, + "grad_norm": 0.7202122211456299, + "learning_rate": 3.3113024380451176e-05, + "loss": 0.7475, + "step": 190930 + }, + { + "epoch": 1.2198612371107675, + "grad_norm": 1.1875548362731934, + "learning_rate": 3.310830162156441e-05, + "loss": 0.8918, + "step": 190940 + }, + { + "epoch": 1.2199251242605063, + "grad_norm": 1.0419461727142334, + "learning_rate": 3.310357903279166e-05, + "loss": 0.623, + "step": 190950 + }, + { + "epoch": 1.219989011410245, + "grad_norm": 1.0206866264343262, + "learning_rate": 3.3098856614180495e-05, + "loss": 1.1513, + "step": 190960 + }, + { + "epoch": 1.2200528985599837, + "grad_norm": 0.7848687767982483, + "learning_rate": 3.309413436577846e-05, + "loss": 0.8548, + "step": 190970 + }, + { + "epoch": 1.2201167857097224, + "grad_norm": 1.786482334136963, + "learning_rate": 3.308941228763311e-05, + "loss": 1.0621, + "step": 190980 + }, + { + "epoch": 1.220180672859461, + "grad_norm": 1.0571478605270386, + "learning_rate": 3.3084690379792014e-05, + "loss": 1.0426, + "step": 190990 + }, + { + "epoch": 1.2202445600091998, + "grad_norm": 0.8210548758506775, + "learning_rate": 3.307996864230273e-05, + "loss": 1.103, + "step": 191000 + }, + { + "epoch": 1.2203084471589385, + "grad_norm": 1.0246258974075317, + "learning_rate": 3.3075247075212785e-05, + "loss": 0.7603, + "step": 191010 + }, + { + "epoch": 1.2203723343086772, + "grad_norm": 0.6933344602584839, + 
"learning_rate": 3.3070525678569745e-05, + "loss": 0.7222, + "step": 191020 + }, + { + "epoch": 1.220436221458416, + "grad_norm": 0.7241206169128418, + "learning_rate": 3.306580445242117e-05, + "loss": 0.6726, + "step": 191030 + }, + { + "epoch": 1.2205001086081546, + "grad_norm": 0.6975740194320679, + "learning_rate": 3.306108339681458e-05, + "loss": 0.8145, + "step": 191040 + }, + { + "epoch": 1.2205639957578933, + "grad_norm": 0.8648557662963867, + "learning_rate": 3.305636251179753e-05, + "loss": 0.8079, + "step": 191050 + }, + { + "epoch": 1.220627882907632, + "grad_norm": 0.6180557012557983, + "learning_rate": 3.305164179741758e-05, + "loss": 0.9549, + "step": 191060 + }, + { + "epoch": 1.2206917700573707, + "grad_norm": 0.8685292601585388, + "learning_rate": 3.304692125372225e-05, + "loss": 0.9466, + "step": 191070 + }, + { + "epoch": 1.2207556572071094, + "grad_norm": 0.7630671262741089, + "learning_rate": 3.30422008807591e-05, + "loss": 0.8865, + "step": 191080 + }, + { + "epoch": 1.2208195443568481, + "grad_norm": 1.0005232095718384, + "learning_rate": 3.303748067857565e-05, + "loss": 1.246, + "step": 191090 + }, + { + "epoch": 1.2208834315065868, + "grad_norm": 5.073404788970947, + "learning_rate": 3.303276064721945e-05, + "loss": 0.786, + "step": 191100 + }, + { + "epoch": 1.2209473186563256, + "grad_norm": 1.2058204412460327, + "learning_rate": 3.302804078673804e-05, + "loss": 1.02, + "step": 191110 + }, + { + "epoch": 1.2210112058060643, + "grad_norm": 1.3755779266357422, + "learning_rate": 3.302332109717892e-05, + "loss": 0.751, + "step": 191120 + }, + { + "epoch": 1.221075092955803, + "grad_norm": 1.1300941705703735, + "learning_rate": 3.301860157858966e-05, + "loss": 0.8189, + "step": 191130 + }, + { + "epoch": 1.2211389801055414, + "grad_norm": 1.0134592056274414, + "learning_rate": 3.3013882231017764e-05, + "loss": 0.7888, + "step": 191140 + }, + { + "epoch": 1.2212028672552804, + "grad_norm": 0.8436269164085388, + "learning_rate": 
3.300916305451077e-05, + "loss": 0.7243, + "step": 191150 + }, + { + "epoch": 1.2212667544050189, + "grad_norm": 1.3353471755981445, + "learning_rate": 3.30044440491162e-05, + "loss": 0.8878, + "step": 191160 + }, + { + "epoch": 1.2213306415547578, + "grad_norm": 1.0653821229934692, + "learning_rate": 3.2999725214881597e-05, + "loss": 0.8101, + "step": 191170 + }, + { + "epoch": 1.2213945287044963, + "grad_norm": 1.0878440141677856, + "learning_rate": 3.299500655185445e-05, + "loss": 1.0963, + "step": 191180 + }, + { + "epoch": 1.2214584158542352, + "grad_norm": 0.7621514201164246, + "learning_rate": 3.2990288060082306e-05, + "loss": 0.9118, + "step": 191190 + }, + { + "epoch": 1.2215223030039737, + "grad_norm": 1.2937744855880737, + "learning_rate": 3.2985569739612676e-05, + "loss": 0.7773, + "step": 191200 + }, + { + "epoch": 1.2215861901537124, + "grad_norm": 0.9347992539405823, + "learning_rate": 3.298085159049308e-05, + "loss": 0.8938, + "step": 191210 + }, + { + "epoch": 1.221650077303451, + "grad_norm": 1.0627859830856323, + "learning_rate": 3.297613361277103e-05, + "loss": 1.0111, + "step": 191220 + }, + { + "epoch": 1.2217139644531898, + "grad_norm": 0.9506715536117554, + "learning_rate": 3.297141580649405e-05, + "loss": 0.7416, + "step": 191230 + }, + { + "epoch": 1.2217778516029285, + "grad_norm": 1.2876331806182861, + "learning_rate": 3.296669817170964e-05, + "loss": 0.7476, + "step": 191240 + }, + { + "epoch": 1.2218417387526672, + "grad_norm": 1.331278681755066, + "learning_rate": 3.2961980708465315e-05, + "loss": 0.8709, + "step": 191250 + }, + { + "epoch": 1.221905625902406, + "grad_norm": 1.1935945749282837, + "learning_rate": 3.295726341680857e-05, + "loss": 0.923, + "step": 191260 + }, + { + "epoch": 1.2219695130521446, + "grad_norm": 0.8223473429679871, + "learning_rate": 3.2952546296786934e-05, + "loss": 0.9257, + "step": 191270 + }, + { + "epoch": 1.2220334002018833, + "grad_norm": 0.7247016429901123, + "learning_rate": 3.294782934844791e-05, 
+ "loss": 0.8564, + "step": 191280 + }, + { + "epoch": 1.222097287351622, + "grad_norm": 0.6738024950027466, + "learning_rate": 3.2943112571838996e-05, + "loss": 0.8172, + "step": 191290 + }, + { + "epoch": 1.2221611745013607, + "grad_norm": 1.0389232635498047, + "learning_rate": 3.29383959670077e-05, + "loss": 0.9731, + "step": 191300 + }, + { + "epoch": 1.2222250616510995, + "grad_norm": 0.9643621444702148, + "learning_rate": 3.2933679534001515e-05, + "loss": 0.9725, + "step": 191310 + }, + { + "epoch": 1.2222889488008382, + "grad_norm": 2.0298118591308594, + "learning_rate": 3.292896327286794e-05, + "loss": 0.6888, + "step": 191320 + }, + { + "epoch": 1.2223528359505769, + "grad_norm": 0.9155459403991699, + "learning_rate": 3.2924247183654464e-05, + "loss": 0.9241, + "step": 191330 + }, + { + "epoch": 1.2224167231003156, + "grad_norm": 0.6312406659126282, + "learning_rate": 3.291953126640863e-05, + "loss": 0.9328, + "step": 191340 + }, + { + "epoch": 1.2224806102500543, + "grad_norm": 0.8391063213348389, + "learning_rate": 3.291481552117786e-05, + "loss": 1.0933, + "step": 191350 + }, + { + "epoch": 1.222544497399793, + "grad_norm": 1.127882719039917, + "learning_rate": 3.291009994800968e-05, + "loss": 0.9218, + "step": 191360 + }, + { + "epoch": 1.2226083845495317, + "grad_norm": 0.8426359295845032, + "learning_rate": 3.290538454695157e-05, + "loss": 0.8109, + "step": 191370 + }, + { + "epoch": 1.2226722716992704, + "grad_norm": 0.7490180730819702, + "learning_rate": 3.2900669318051036e-05, + "loss": 0.8676, + "step": 191380 + }, + { + "epoch": 1.222736158849009, + "grad_norm": 0.675996720790863, + "learning_rate": 3.289595426135555e-05, + "loss": 0.9908, + "step": 191390 + }, + { + "epoch": 1.2228000459987478, + "grad_norm": 1.0120384693145752, + "learning_rate": 3.2891239376912614e-05, + "loss": 0.6029, + "step": 191400 + }, + { + "epoch": 1.2228639331484865, + "grad_norm": 1.0978142023086548, + "learning_rate": 3.288652466476969e-05, + "loss": 1.0245, + 
"step": 191410 + }, + { + "epoch": 1.2229278202982252, + "grad_norm": 0.7779439091682434, + "learning_rate": 3.288181012497427e-05, + "loss": 0.7234, + "step": 191420 + }, + { + "epoch": 1.222991707447964, + "grad_norm": 1.177938461303711, + "learning_rate": 3.287709575757383e-05, + "loss": 0.8939, + "step": 191430 + }, + { + "epoch": 1.2230555945977026, + "grad_norm": 1.2629878520965576, + "learning_rate": 3.2872381562615853e-05, + "loss": 0.7932, + "step": 191440 + }, + { + "epoch": 1.2231194817474413, + "grad_norm": 0.6756806969642639, + "learning_rate": 3.286766754014781e-05, + "loss": 0.8743, + "step": 191450 + }, + { + "epoch": 1.22318336889718, + "grad_norm": 0.9224807620048523, + "learning_rate": 3.2862953690217176e-05, + "loss": 0.9364, + "step": 191460 + }, + { + "epoch": 1.2232472560469188, + "grad_norm": 1.0424355268478394, + "learning_rate": 3.2858240012871424e-05, + "loss": 0.8107, + "step": 191470 + }, + { + "epoch": 1.2233111431966575, + "grad_norm": 1.0432953834533691, + "learning_rate": 3.2853526508158014e-05, + "loss": 0.8718, + "step": 191480 + }, + { + "epoch": 1.2233750303463962, + "grad_norm": 1.1413217782974243, + "learning_rate": 3.284881317612444e-05, + "loss": 0.9998, + "step": 191490 + }, + { + "epoch": 1.2234389174961349, + "grad_norm": 2.2246835231781006, + "learning_rate": 3.284410001681815e-05, + "loss": 0.6342, + "step": 191500 + }, + { + "epoch": 1.2235028046458736, + "grad_norm": 0.6261698007583618, + "learning_rate": 3.283938703028662e-05, + "loss": 0.7577, + "step": 191510 + }, + { + "epoch": 1.2235666917956123, + "grad_norm": 1.1829447746276855, + "learning_rate": 3.28346742165773e-05, + "loss": 0.7946, + "step": 191520 + }, + { + "epoch": 1.223630578945351, + "grad_norm": 0.9962440133094788, + "learning_rate": 3.282996157573767e-05, + "loss": 0.9398, + "step": 191530 + }, + { + "epoch": 1.2236944660950897, + "grad_norm": 0.9726690053939819, + "learning_rate": 3.282524910781517e-05, + "loss": 0.8636, + "step": 191540 + }, + { + 
"epoch": 1.2237583532448284, + "grad_norm": 1.3285902738571167, + "learning_rate": 3.282053681285728e-05, + "loss": 1.0809, + "step": 191550 + }, + { + "epoch": 1.223822240394567, + "grad_norm": 0.8424548506736755, + "learning_rate": 3.2815824690911444e-05, + "loss": 0.8103, + "step": 191560 + }, + { + "epoch": 1.2238861275443058, + "grad_norm": 0.6262918710708618, + "learning_rate": 3.2811112742025115e-05, + "loss": 0.8269, + "step": 191570 + }, + { + "epoch": 1.2239500146940445, + "grad_norm": 0.8682219982147217, + "learning_rate": 3.2806400966245745e-05, + "loss": 0.9268, + "step": 191580 + }, + { + "epoch": 1.2240139018437832, + "grad_norm": 1.0514038801193237, + "learning_rate": 3.28016893636208e-05, + "loss": 0.7626, + "step": 191590 + }, + { + "epoch": 1.224077788993522, + "grad_norm": 1.6504403352737427, + "learning_rate": 3.279697793419773e-05, + "loss": 1.0023, + "step": 191600 + }, + { + "epoch": 1.2241416761432606, + "grad_norm": 0.846878170967102, + "learning_rate": 3.279226667802396e-05, + "loss": 0.7239, + "step": 191610 + }, + { + "epoch": 1.2242055632929993, + "grad_norm": 0.8492310643196106, + "learning_rate": 3.278755559514696e-05, + "loss": 1.0004, + "step": 191620 + }, + { + "epoch": 1.2242694504427378, + "grad_norm": 0.77544105052948, + "learning_rate": 3.2782844685614164e-05, + "loss": 0.7413, + "step": 191630 + }, + { + "epoch": 1.2243333375924768, + "grad_norm": 0.8446138501167297, + "learning_rate": 3.2778133949473025e-05, + "loss": 0.9818, + "step": 191640 + }, + { + "epoch": 1.2243972247422152, + "grad_norm": 0.6860386729240417, + "learning_rate": 3.277342338677096e-05, + "loss": 1.087, + "step": 191650 + }, + { + "epoch": 1.2244611118919542, + "grad_norm": 0.47785627841949463, + "learning_rate": 3.276871299755544e-05, + "loss": 0.801, + "step": 191660 + }, + { + "epoch": 1.2245249990416927, + "grad_norm": 0.8878980278968811, + "learning_rate": 3.276400278187388e-05, + "loss": 0.9389, + "step": 191670 + }, + { + "epoch": 
1.2245888861914314, + "grad_norm": 1.2902075052261353, + "learning_rate": 3.275929273977373e-05, + "loss": 0.8234, + "step": 191680 + }, + { + "epoch": 1.22465277334117, + "grad_norm": 1.8300102949142456, + "learning_rate": 3.275458287130241e-05, + "loss": 0.6632, + "step": 191690 + }, + { + "epoch": 1.2247166604909088, + "grad_norm": 1.1442055702209473, + "learning_rate": 3.2749873176507364e-05, + "loss": 0.9729, + "step": 191700 + }, + { + "epoch": 1.2247805476406475, + "grad_norm": 3.596463918685913, + "learning_rate": 3.274516365543601e-05, + "loss": 0.6886, + "step": 191710 + }, + { + "epoch": 1.2248444347903862, + "grad_norm": 1.2126635313034058, + "learning_rate": 3.274045430813579e-05, + "loss": 0.9968, + "step": 191720 + }, + { + "epoch": 1.224908321940125, + "grad_norm": 1.014853835105896, + "learning_rate": 3.273574513465413e-05, + "loss": 0.7696, + "step": 191730 + }, + { + "epoch": 1.2249722090898636, + "grad_norm": 1.4263054132461548, + "learning_rate": 3.273103613503846e-05, + "loss": 1.0814, + "step": 191740 + }, + { + "epoch": 1.2250360962396023, + "grad_norm": 1.2331584692001343, + "learning_rate": 3.272632730933618e-05, + "loss": 0.9616, + "step": 191750 + }, + { + "epoch": 1.225099983389341, + "grad_norm": 1.4743117094039917, + "learning_rate": 3.272161865759474e-05, + "loss": 1.1666, + "step": 191760 + }, + { + "epoch": 1.2251638705390797, + "grad_norm": 0.7259349822998047, + "learning_rate": 3.2716910179861537e-05, + "loss": 0.8348, + "step": 191770 + }, + { + "epoch": 1.2252277576888184, + "grad_norm": 1.300727128982544, + "learning_rate": 3.2712201876184004e-05, + "loss": 0.705, + "step": 191780 + }, + { + "epoch": 1.2252916448385571, + "grad_norm": 1.0736730098724365, + "learning_rate": 3.2707493746609554e-05, + "loss": 0.714, + "step": 191790 + }, + { + "epoch": 1.2253555319882958, + "grad_norm": 0.7128208875656128, + "learning_rate": 3.2702785791185606e-05, + "loss": 0.8655, + "step": 191800 + }, + { + "epoch": 1.2254194191380345, + 
"grad_norm": 1.3977357149124146, + "learning_rate": 3.269807800995957e-05, + "loss": 1.2464, + "step": 191810 + }, + { + "epoch": 1.2254833062877732, + "grad_norm": 1.369526982307434, + "learning_rate": 3.269337040297885e-05, + "loss": 0.8314, + "step": 191820 + }, + { + "epoch": 1.225547193437512, + "grad_norm": 1.0817862749099731, + "learning_rate": 3.2688662970290885e-05, + "loss": 0.9184, + "step": 191830 + }, + { + "epoch": 1.2256110805872507, + "grad_norm": 1.0620317459106445, + "learning_rate": 3.268395571194305e-05, + "loss": 1.0192, + "step": 191840 + }, + { + "epoch": 1.2256749677369894, + "grad_norm": 0.844619870185852, + "learning_rate": 3.267924862798275e-05, + "loss": 0.8088, + "step": 191850 + }, + { + "epoch": 1.225738854886728, + "grad_norm": 0.6405353546142578, + "learning_rate": 3.267454171845741e-05, + "loss": 0.8423, + "step": 191860 + }, + { + "epoch": 1.2258027420364668, + "grad_norm": 1.2829723358154297, + "learning_rate": 3.2669834983414416e-05, + "loss": 1.0363, + "step": 191870 + }, + { + "epoch": 1.2258666291862055, + "grad_norm": 1.374717116355896, + "learning_rate": 3.266512842290118e-05, + "loss": 0.9436, + "step": 191880 + }, + { + "epoch": 1.2259305163359442, + "grad_norm": 1.1757957935333252, + "learning_rate": 3.266042203696509e-05, + "loss": 0.8262, + "step": 191890 + }, + { + "epoch": 1.225994403485683, + "grad_norm": 1.0372143983840942, + "learning_rate": 3.265571582565355e-05, + "loss": 0.7161, + "step": 191900 + }, + { + "epoch": 1.2260582906354216, + "grad_norm": 0.8718850016593933, + "learning_rate": 3.265100978901396e-05, + "loss": 1.1899, + "step": 191910 + }, + { + "epoch": 1.2261221777851603, + "grad_norm": 1.32882821559906, + "learning_rate": 3.2646303927093716e-05, + "loss": 0.9109, + "step": 191920 + }, + { + "epoch": 1.226186064934899, + "grad_norm": 1.2186968326568604, + "learning_rate": 3.2641598239940206e-05, + "loss": 0.8187, + "step": 191930 + }, + { + "epoch": 1.2262499520846377, + "grad_norm": 
1.1600444316864014, + "learning_rate": 3.263689272760081e-05, + "loss": 1.1301, + "step": 191940 + }, + { + "epoch": 1.2263138392343764, + "grad_norm": 1.9314838647842407, + "learning_rate": 3.263218739012294e-05, + "loss": 0.9535, + "step": 191950 + }, + { + "epoch": 1.2263777263841151, + "grad_norm": 0.8182092308998108, + "learning_rate": 3.2627482227553954e-05, + "loss": 0.6432, + "step": 191960 + }, + { + "epoch": 1.2264416135338538, + "grad_norm": 0.852057695388794, + "learning_rate": 3.262277723994126e-05, + "loss": 0.9246, + "step": 191970 + }, + { + "epoch": 1.2265055006835925, + "grad_norm": 0.7511641979217529, + "learning_rate": 3.2618072427332224e-05, + "loss": 0.7807, + "step": 191980 + }, + { + "epoch": 1.2265693878333312, + "grad_norm": 0.7494274377822876, + "learning_rate": 3.261336778977424e-05, + "loss": 0.7198, + "step": 191990 + }, + { + "epoch": 1.22663327498307, + "grad_norm": 1.205119013786316, + "learning_rate": 3.260866332731469e-05, + "loss": 0.7046, + "step": 192000 + }, + { + "epoch": 1.2266971621328087, + "grad_norm": 0.45299506187438965, + "learning_rate": 3.2603959040000944e-05, + "loss": 0.6128, + "step": 192010 + }, + { + "epoch": 1.2267610492825474, + "grad_norm": 1.5304423570632935, + "learning_rate": 3.259925492788037e-05, + "loss": 1.0436, + "step": 192020 + }, + { + "epoch": 1.226824936432286, + "grad_norm": 0.8956804275512695, + "learning_rate": 3.2594550991000364e-05, + "loss": 0.7963, + "step": 192030 + }, + { + "epoch": 1.2268888235820248, + "grad_norm": 1.0764594078063965, + "learning_rate": 3.258984722940829e-05, + "loss": 0.855, + "step": 192040 + }, + { + "epoch": 1.2269527107317635, + "grad_norm": 1.0527817010879517, + "learning_rate": 3.2585143643151505e-05, + "loss": 0.7834, + "step": 192050 + }, + { + "epoch": 1.2270165978815022, + "grad_norm": 1.3264557123184204, + "learning_rate": 3.25804402322774e-05, + "loss": 0.7283, + "step": 192060 + }, + { + "epoch": 1.227080485031241, + "grad_norm": 0.8222655653953552, + 
"learning_rate": 3.2575736996833325e-05, + "loss": 0.8905, + "step": 192070 + }, + { + "epoch": 1.2271443721809796, + "grad_norm": 2.544546604156494, + "learning_rate": 3.2571033936866653e-05, + "loss": 1.1683, + "step": 192080 + }, + { + "epoch": 1.2272082593307183, + "grad_norm": 1.200516939163208, + "learning_rate": 3.2566331052424744e-05, + "loss": 1.0274, + "step": 192090 + }, + { + "epoch": 1.227272146480457, + "grad_norm": 0.9582846760749817, + "learning_rate": 3.256162834355497e-05, + "loss": 0.7701, + "step": 192100 + }, + { + "epoch": 1.2273360336301957, + "grad_norm": 0.7705967426300049, + "learning_rate": 3.255692581030467e-05, + "loss": 0.9304, + "step": 192110 + }, + { + "epoch": 1.2273999207799342, + "grad_norm": 0.9376071691513062, + "learning_rate": 3.2552223452721234e-05, + "loss": 0.897, + "step": 192120 + }, + { + "epoch": 1.2274638079296731, + "grad_norm": 1.4855351448059082, + "learning_rate": 3.2547521270852e-05, + "loss": 0.7594, + "step": 192130 + }, + { + "epoch": 1.2275276950794116, + "grad_norm": 0.7116539478302002, + "learning_rate": 3.254281926474432e-05, + "loss": 0.802, + "step": 192140 + }, + { + "epoch": 1.2275915822291505, + "grad_norm": 1.2037607431411743, + "learning_rate": 3.2538117434445556e-05, + "loss": 0.9778, + "step": 192150 + }, + { + "epoch": 1.227655469378889, + "grad_norm": 0.8576934337615967, + "learning_rate": 3.253341578000306e-05, + "loss": 0.7583, + "step": 192160 + }, + { + "epoch": 1.2277193565286277, + "grad_norm": 0.8133600354194641, + "learning_rate": 3.252871430146417e-05, + "loss": 0.948, + "step": 192170 + }, + { + "epoch": 1.2277832436783664, + "grad_norm": 0.8566866517066956, + "learning_rate": 3.252401299887625e-05, + "loss": 0.8403, + "step": 192180 + }, + { + "epoch": 1.2278471308281051, + "grad_norm": 0.8782423138618469, + "learning_rate": 3.251931187228664e-05, + "loss": 0.7959, + "step": 192190 + }, + { + "epoch": 1.2279110179778439, + "grad_norm": 0.6727086305618286, + "learning_rate": 
3.251461092174267e-05, + "loss": 0.8979, + "step": 192200 + }, + { + "epoch": 1.2279749051275826, + "grad_norm": 1.1492390632629395, + "learning_rate": 3.2509910147291704e-05, + "loss": 0.6958, + "step": 192210 + }, + { + "epoch": 1.2280387922773213, + "grad_norm": 1.4259767532348633, + "learning_rate": 3.2505209548981074e-05, + "loss": 0.831, + "step": 192220 + }, + { + "epoch": 1.22810267942706, + "grad_norm": 0.921388566493988, + "learning_rate": 3.2500509126858115e-05, + "loss": 0.8291, + "step": 192230 + }, + { + "epoch": 1.2281665665767987, + "grad_norm": 0.6656931638717651, + "learning_rate": 3.2495808880970166e-05, + "loss": 0.7202, + "step": 192240 + }, + { + "epoch": 1.2282304537265374, + "grad_norm": 0.8429076075553894, + "learning_rate": 3.249110881136458e-05, + "loss": 1.0291, + "step": 192250 + }, + { + "epoch": 1.228294340876276, + "grad_norm": 1.1925345659255981, + "learning_rate": 3.248640891808866e-05, + "loss": 1.0243, + "step": 192260 + }, + { + "epoch": 1.2283582280260148, + "grad_norm": 0.6658920049667358, + "learning_rate": 3.248170920118976e-05, + "loss": 0.9112, + "step": 192270 + }, + { + "epoch": 1.2284221151757535, + "grad_norm": 1.026580572128296, + "learning_rate": 3.2477009660715195e-05, + "loss": 0.6001, + "step": 192280 + }, + { + "epoch": 1.2284860023254922, + "grad_norm": 0.8293872475624084, + "learning_rate": 3.247231029671232e-05, + "loss": 0.8562, + "step": 192290 + }, + { + "epoch": 1.228549889475231, + "grad_norm": 0.8070954084396362, + "learning_rate": 3.2467611109228426e-05, + "loss": 0.8052, + "step": 192300 + }, + { + "epoch": 1.2286137766249696, + "grad_norm": 0.7816912531852722, + "learning_rate": 3.2462912098310876e-05, + "loss": 0.7136, + "step": 192310 + }, + { + "epoch": 1.2286776637747083, + "grad_norm": 0.7612060308456421, + "learning_rate": 3.245821326400696e-05, + "loss": 0.9215, + "step": 192320 + }, + { + "epoch": 1.228741550924447, + "grad_norm": 0.707601010799408, + "learning_rate": 3.245351460636401e-05, + 
"loss": 0.9532, + "step": 192330 + }, + { + "epoch": 1.2288054380741857, + "grad_norm": 1.278326153755188, + "learning_rate": 3.244881612542935e-05, + "loss": 0.8568, + "step": 192340 + }, + { + "epoch": 1.2288693252239244, + "grad_norm": 0.7208459377288818, + "learning_rate": 3.24441178212503e-05, + "loss": 1.0636, + "step": 192350 + }, + { + "epoch": 1.2289332123736632, + "grad_norm": 0.7168135643005371, + "learning_rate": 3.243941969387416e-05, + "loss": 0.7759, + "step": 192360 + }, + { + "epoch": 1.2289970995234019, + "grad_norm": 0.7322360873222351, + "learning_rate": 3.243472174334827e-05, + "loss": 0.7761, + "step": 192370 + }, + { + "epoch": 1.2290609866731406, + "grad_norm": 0.6737679839134216, + "learning_rate": 3.243002396971992e-05, + "loss": 0.9582, + "step": 192380 + }, + { + "epoch": 1.2291248738228793, + "grad_norm": 0.962618350982666, + "learning_rate": 3.242532637303643e-05, + "loss": 1.1989, + "step": 192390 + }, + { + "epoch": 1.229188760972618, + "grad_norm": 0.9641563296318054, + "learning_rate": 3.2420628953345105e-05, + "loss": 0.74, + "step": 192400 + }, + { + "epoch": 1.2292526481223567, + "grad_norm": 0.8702797293663025, + "learning_rate": 3.241593171069326e-05, + "loss": 0.9423, + "step": 192410 + }, + { + "epoch": 1.2293165352720954, + "grad_norm": 1.1447423696517944, + "learning_rate": 3.241123464512819e-05, + "loss": 0.9131, + "step": 192420 + }, + { + "epoch": 1.229380422421834, + "grad_norm": 1.0794869661331177, + "learning_rate": 3.24065377566972e-05, + "loss": 0.7764, + "step": 192430 + }, + { + "epoch": 1.2294443095715728, + "grad_norm": 1.5134004354476929, + "learning_rate": 3.24018410454476e-05, + "loss": 1.0517, + "step": 192440 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 1.1577551364898682, + "learning_rate": 3.239714451142668e-05, + "loss": 0.8146, + "step": 192450 + }, + { + "epoch": 1.2295720838710502, + "grad_norm": 1.0094727277755737, + "learning_rate": 3.239244815468175e-05, + "loss": 0.8686, + "step": 
192460 + }, + { + "epoch": 1.229635971020789, + "grad_norm": 0.7291733026504517, + "learning_rate": 3.23877519752601e-05, + "loss": 0.9864, + "step": 192470 + }, + { + "epoch": 1.2296998581705276, + "grad_norm": 0.7421193718910217, + "learning_rate": 3.238305597320903e-05, + "loss": 0.9084, + "step": 192480 + }, + { + "epoch": 1.2297637453202663, + "grad_norm": 0.5920099020004272, + "learning_rate": 3.237836014857581e-05, + "loss": 0.9251, + "step": 192490 + }, + { + "epoch": 1.229827632470005, + "grad_norm": 0.7378367781639099, + "learning_rate": 3.2373664501407766e-05, + "loss": 0.98, + "step": 192500 + }, + { + "epoch": 1.2298915196197437, + "grad_norm": 1.1757078170776367, + "learning_rate": 3.236896903175216e-05, + "loss": 0.7089, + "step": 192510 + }, + { + "epoch": 1.2299554067694825, + "grad_norm": 0.9285265803337097, + "learning_rate": 3.236427373965629e-05, + "loss": 0.7968, + "step": 192520 + }, + { + "epoch": 1.2300192939192212, + "grad_norm": 1.0863308906555176, + "learning_rate": 3.235957862516745e-05, + "loss": 0.8758, + "step": 192530 + }, + { + "epoch": 1.2300831810689599, + "grad_norm": 1.2274580001831055, + "learning_rate": 3.2354883688332906e-05, + "loss": 0.8596, + "step": 192540 + }, + { + "epoch": 1.2301470682186986, + "grad_norm": 0.7428937554359436, + "learning_rate": 3.235018892919995e-05, + "loss": 0.8672, + "step": 192550 + }, + { + "epoch": 1.2302109553684373, + "grad_norm": 1.2199949026107788, + "learning_rate": 3.234549434781586e-05, + "loss": 0.8324, + "step": 192560 + }, + { + "epoch": 1.230274842518176, + "grad_norm": 1.1966480016708374, + "learning_rate": 3.234079994422791e-05, + "loss": 1.0522, + "step": 192570 + }, + { + "epoch": 1.2303387296679147, + "grad_norm": 0.7035223245620728, + "learning_rate": 3.233610571848339e-05, + "loss": 0.7794, + "step": 192580 + }, + { + "epoch": 1.2304026168176534, + "grad_norm": 0.9693596959114075, + "learning_rate": 3.2331411670629564e-05, + "loss": 0.7748, + "step": 192590 + }, + { + "epoch": 
1.230466503967392, + "grad_norm": 0.9353430867195129, + "learning_rate": 3.2326717800713706e-05, + "loss": 0.9076, + "step": 192600 + }, + { + "epoch": 1.2305303911171306, + "grad_norm": 0.6712273359298706, + "learning_rate": 3.232202410878309e-05, + "loss": 0.8616, + "step": 192610 + }, + { + "epoch": 1.2305942782668695, + "grad_norm": 2.5598068237304688, + "learning_rate": 3.2317330594884986e-05, + "loss": 0.735, + "step": 192620 + }, + { + "epoch": 1.230658165416608, + "grad_norm": 1.0685700178146362, + "learning_rate": 3.2312637259066654e-05, + "loss": 0.955, + "step": 192630 + }, + { + "epoch": 1.230722052566347, + "grad_norm": 0.6315664052963257, + "learning_rate": 3.230794410137537e-05, + "loss": 0.6154, + "step": 192640 + }, + { + "epoch": 1.2307859397160854, + "grad_norm": 0.6316712498664856, + "learning_rate": 3.23032511218584e-05, + "loss": 1.101, + "step": 192650 + }, + { + "epoch": 1.2308498268658241, + "grad_norm": 1.0245057344436646, + "learning_rate": 3.2298558320563e-05, + "loss": 1.1672, + "step": 192660 + }, + { + "epoch": 1.2309137140155628, + "grad_norm": 0.8964295387268066, + "learning_rate": 3.2293865697536426e-05, + "loss": 0.916, + "step": 192670 + }, + { + "epoch": 1.2309776011653015, + "grad_norm": 1.661603331565857, + "learning_rate": 3.228917325282595e-05, + "loss": 0.8007, + "step": 192680 + }, + { + "epoch": 1.2310414883150402, + "grad_norm": 0.7393468022346497, + "learning_rate": 3.2284480986478813e-05, + "loss": 0.7766, + "step": 192690 + }, + { + "epoch": 1.231105375464779, + "grad_norm": 1.1308892965316772, + "learning_rate": 3.227978889854229e-05, + "loss": 0.8523, + "step": 192700 + }, + { + "epoch": 1.2311692626145176, + "grad_norm": 0.988743782043457, + "learning_rate": 3.2275096989063616e-05, + "loss": 0.9271, + "step": 192710 + }, + { + "epoch": 1.2312331497642564, + "grad_norm": 0.7368068695068359, + "learning_rate": 3.2270405258090054e-05, + "loss": 0.7516, + "step": 192720 + }, + { + "epoch": 1.231297036913995, + 
"grad_norm": 0.8602904677391052, + "learning_rate": 3.2266182852874775e-05, + "loss": 0.6705, + "step": 192730 + }, + { + "epoch": 1.2313609240637338, + "grad_norm": 0.8390285968780518, + "learning_rate": 3.226149146119108e-05, + "loss": 0.7796, + "step": 192740 + }, + { + "epoch": 1.2314248112134725, + "grad_norm": 0.7270960211753845, + "learning_rate": 3.225680024814951e-05, + "loss": 1.0052, + "step": 192750 + }, + { + "epoch": 1.2314886983632112, + "grad_norm": 0.8915246725082397, + "learning_rate": 3.2252109213797317e-05, + "loss": 0.8711, + "step": 192760 + }, + { + "epoch": 1.2315525855129499, + "grad_norm": 0.6008066534996033, + "learning_rate": 3.2247418358181734e-05, + "loss": 0.7466, + "step": 192770 + }, + { + "epoch": 1.2316164726626886, + "grad_norm": 0.7765604257583618, + "learning_rate": 3.224272768135002e-05, + "loss": 0.8973, + "step": 192780 + }, + { + "epoch": 1.2316803598124273, + "grad_norm": 0.9872602820396423, + "learning_rate": 3.223803718334939e-05, + "loss": 1.0003, + "step": 192790 + }, + { + "epoch": 1.231744246962166, + "grad_norm": 0.7912330031394958, + "learning_rate": 3.22333468642271e-05, + "loss": 1.015, + "step": 192800 + }, + { + "epoch": 1.2318081341119047, + "grad_norm": 0.8799734711647034, + "learning_rate": 3.222865672403037e-05, + "loss": 0.6711, + "step": 192810 + }, + { + "epoch": 1.2318720212616434, + "grad_norm": 0.6278074383735657, + "learning_rate": 3.2223966762806446e-05, + "loss": 0.9729, + "step": 192820 + }, + { + "epoch": 1.2319359084113821, + "grad_norm": 1.1327438354492188, + "learning_rate": 3.221927698060255e-05, + "loss": 1.0065, + "step": 192830 + }, + { + "epoch": 1.2319997955611208, + "grad_norm": 1.7792538404464722, + "learning_rate": 3.221458737746592e-05, + "loss": 0.7122, + "step": 192840 + }, + { + "epoch": 1.2320636827108595, + "grad_norm": 0.876568078994751, + "learning_rate": 3.220989795344378e-05, + "loss": 0.9008, + "step": 192850 + }, + { + "epoch": 1.2321275698605982, + "grad_norm": 
0.969321608543396, + "learning_rate": 3.2205208708583355e-05, + "loss": 1.1909, + "step": 192860 + }, + { + "epoch": 1.232191457010337, + "grad_norm": 0.681924045085907, + "learning_rate": 3.220051964293188e-05, + "loss": 0.9973, + "step": 192870 + }, + { + "epoch": 1.2322553441600756, + "grad_norm": 1.640181541442871, + "learning_rate": 3.2195830756536574e-05, + "loss": 0.7027, + "step": 192880 + }, + { + "epoch": 1.2323192313098144, + "grad_norm": 0.5599543452262878, + "learning_rate": 3.2191142049444646e-05, + "loss": 0.7875, + "step": 192890 + }, + { + "epoch": 1.232383118459553, + "grad_norm": 1.001563310623169, + "learning_rate": 3.218645352170333e-05, + "loss": 0.8589, + "step": 192900 + }, + { + "epoch": 1.2324470056092918, + "grad_norm": 1.0089075565338135, + "learning_rate": 3.2181765173359836e-05, + "loss": 0.8123, + "step": 192910 + }, + { + "epoch": 1.2325108927590305, + "grad_norm": 1.5100098848342896, + "learning_rate": 3.217707700446138e-05, + "loss": 0.7626, + "step": 192920 + }, + { + "epoch": 1.2325747799087692, + "grad_norm": 0.9354732632637024, + "learning_rate": 3.2172389015055184e-05, + "loss": 0.8715, + "step": 192930 + }, + { + "epoch": 1.2326386670585079, + "grad_norm": 1.4577165842056274, + "learning_rate": 3.216770120518846e-05, + "loss": 0.8773, + "step": 192940 + }, + { + "epoch": 1.2327025542082466, + "grad_norm": 0.8571488857269287, + "learning_rate": 3.21630135749084e-05, + "loss": 0.8343, + "step": 192950 + }, + { + "epoch": 1.2327664413579853, + "grad_norm": 1.2611160278320312, + "learning_rate": 3.2158326124262225e-05, + "loss": 0.674, + "step": 192960 + }, + { + "epoch": 1.232830328507724, + "grad_norm": 0.894373893737793, + "learning_rate": 3.215363885329714e-05, + "loss": 1.2393, + "step": 192970 + }, + { + "epoch": 1.2328942156574627, + "grad_norm": 0.5427113175392151, + "learning_rate": 3.2148951762060356e-05, + "loss": 0.794, + "step": 192980 + }, + { + "epoch": 1.2329581028072014, + "grad_norm": 2.207989454269409, + 
"learning_rate": 3.214426485059906e-05, + "loss": 0.8682, + "step": 192990 + }, + { + "epoch": 1.2330219899569401, + "grad_norm": 1.0269194841384888, + "learning_rate": 3.213957811896048e-05, + "loss": 1.0283, + "step": 193000 + }, + { + "epoch": 1.2330858771066788, + "grad_norm": 0.8881884217262268, + "learning_rate": 3.213489156719179e-05, + "loss": 0.9296, + "step": 193010 + }, + { + "epoch": 1.2331497642564175, + "grad_norm": 1.8312245607376099, + "learning_rate": 3.2130205195340204e-05, + "loss": 0.627, + "step": 193020 + }, + { + "epoch": 1.2332136514061562, + "grad_norm": 1.116809368133545, + "learning_rate": 3.21255190034529e-05, + "loss": 0.8914, + "step": 193030 + }, + { + "epoch": 1.233277538555895, + "grad_norm": 1.0772712230682373, + "learning_rate": 3.2120832991577094e-05, + "loss": 0.7899, + "step": 193040 + }, + { + "epoch": 1.2333414257056337, + "grad_norm": 1.190745234489441, + "learning_rate": 3.2116147159759966e-05, + "loss": 0.895, + "step": 193050 + }, + { + "epoch": 1.2334053128553724, + "grad_norm": 0.9631375670433044, + "learning_rate": 3.21114615080487e-05, + "loss": 0.9266, + "step": 193060 + }, + { + "epoch": 1.233469200005111, + "grad_norm": 0.6763384342193604, + "learning_rate": 3.2106776036490494e-05, + "loss": 1.1094, + "step": 193070 + }, + { + "epoch": 1.2335330871548496, + "grad_norm": 0.7202142477035522, + "learning_rate": 3.210209074513253e-05, + "loss": 0.9108, + "step": 193080 + }, + { + "epoch": 1.2335969743045885, + "grad_norm": 1.647141933441162, + "learning_rate": 3.2097405634022005e-05, + "loss": 1.1182, + "step": 193090 + }, + { + "epoch": 1.233660861454327, + "grad_norm": 1.490446925163269, + "learning_rate": 3.209272070320609e-05, + "loss": 1.0184, + "step": 193100 + }, + { + "epoch": 1.233724748604066, + "grad_norm": 0.8899498581886292, + "learning_rate": 3.208803595273198e-05, + "loss": 0.7708, + "step": 193110 + }, + { + "epoch": 1.2337886357538044, + "grad_norm": 1.018667221069336, + "learning_rate": 
3.2083351382646834e-05, + "loss": 1.0222, + "step": 193120 + }, + { + "epoch": 1.2338525229035433, + "grad_norm": 1.8434399366378784, + "learning_rate": 3.2078666992997834e-05, + "loss": 0.7171, + "step": 193130 + }, + { + "epoch": 1.2339164100532818, + "grad_norm": 0.5759685039520264, + "learning_rate": 3.207398278383217e-05, + "loss": 0.8113, + "step": 193140 + }, + { + "epoch": 1.2339802972030205, + "grad_norm": 0.9592820405960083, + "learning_rate": 3.206929875519701e-05, + "loss": 0.9567, + "step": 193150 + }, + { + "epoch": 1.2340441843527592, + "grad_norm": 1.4874789714813232, + "learning_rate": 3.206461490713951e-05, + "loss": 1.1218, + "step": 193160 + }, + { + "epoch": 1.234108071502498, + "grad_norm": 0.6673001050949097, + "learning_rate": 3.205993123970687e-05, + "loss": 0.7205, + "step": 193170 + }, + { + "epoch": 1.2341719586522366, + "grad_norm": 0.860629141330719, + "learning_rate": 3.205524775294624e-05, + "loss": 0.8202, + "step": 193180 + }, + { + "epoch": 1.2342358458019753, + "grad_norm": 0.6842666268348694, + "learning_rate": 3.205056444690478e-05, + "loss": 1.0571, + "step": 193190 + }, + { + "epoch": 1.234299732951714, + "grad_norm": 1.0597096681594849, + "learning_rate": 3.2045881321629664e-05, + "loss": 0.8567, + "step": 193200 + }, + { + "epoch": 1.2343636201014527, + "grad_norm": 0.901559591293335, + "learning_rate": 3.2041198377168066e-05, + "loss": 1.1343, + "step": 193210 + }, + { + "epoch": 1.2344275072511914, + "grad_norm": 0.7798748016357422, + "learning_rate": 3.203651561356714e-05, + "loss": 0.8617, + "step": 193220 + }, + { + "epoch": 1.2344913944009301, + "grad_norm": 0.8212858438491821, + "learning_rate": 3.203183303087403e-05, + "loss": 0.8774, + "step": 193230 + }, + { + "epoch": 1.2345552815506688, + "grad_norm": 1.2697757482528687, + "learning_rate": 3.202715062913592e-05, + "loss": 0.8944, + "step": 193240 + }, + { + "epoch": 1.2346191687004076, + "grad_norm": 1.0339276790618896, + "learning_rate": 3.202246840839994e-05, 
+ "loss": 0.9764, + "step": 193250 + }, + { + "epoch": 1.2346830558501463, + "grad_norm": 1.3854345083236694, + "learning_rate": 3.201778636871325e-05, + "loss": 0.9081, + "step": 193260 + }, + { + "epoch": 1.234746942999885, + "grad_norm": 1.7363767623901367, + "learning_rate": 3.201310451012303e-05, + "loss": 0.9234, + "step": 193270 + }, + { + "epoch": 1.2348108301496237, + "grad_norm": 0.8943231105804443, + "learning_rate": 3.200842283267638e-05, + "loss": 0.9892, + "step": 193280 + }, + { + "epoch": 1.2348747172993624, + "grad_norm": 0.6131458282470703, + "learning_rate": 3.200374133642049e-05, + "loss": 0.6881, + "step": 193290 + }, + { + "epoch": 1.234938604449101, + "grad_norm": 0.9070971608161926, + "learning_rate": 3.19990600214025e-05, + "loss": 0.8429, + "step": 193300 + }, + { + "epoch": 1.2350024915988398, + "grad_norm": 3.0522189140319824, + "learning_rate": 3.199437888766954e-05, + "loss": 1.1211, + "step": 193310 + }, + { + "epoch": 1.2350663787485785, + "grad_norm": 1.3165589570999146, + "learning_rate": 3.198969793526877e-05, + "loss": 0.7646, + "step": 193320 + }, + { + "epoch": 1.2351302658983172, + "grad_norm": 1.1292219161987305, + "learning_rate": 3.1985017164247325e-05, + "loss": 0.8684, + "step": 193330 + }, + { + "epoch": 1.235194153048056, + "grad_norm": 1.0225328207015991, + "learning_rate": 3.198033657465233e-05, + "loss": 0.8399, + "step": 193340 + }, + { + "epoch": 1.2352580401977946, + "grad_norm": 0.7963989973068237, + "learning_rate": 3.1975656166530946e-05, + "loss": 0.824, + "step": 193350 + }, + { + "epoch": 1.2353219273475333, + "grad_norm": 1.6515822410583496, + "learning_rate": 3.197097593993029e-05, + "loss": 0.8579, + "step": 193360 + }, + { + "epoch": 1.235385814497272, + "grad_norm": 0.9961569905281067, + "learning_rate": 3.196629589489751e-05, + "loss": 1.0444, + "step": 193370 + }, + { + "epoch": 1.2354497016470107, + "grad_norm": 0.921463131904602, + "learning_rate": 3.196161603147972e-05, + "loss": 1.0651, + "step": 
193380 + }, + { + "epoch": 1.2355135887967494, + "grad_norm": 0.5503140091896057, + "learning_rate": 3.195693634972408e-05, + "loss": 0.7384, + "step": 193390 + }, + { + "epoch": 1.2355774759464881, + "grad_norm": 1.171606183052063, + "learning_rate": 3.1952256849677684e-05, + "loss": 0.8194, + "step": 193400 + }, + { + "epoch": 1.2356413630962269, + "grad_norm": 0.7877267599105835, + "learning_rate": 3.194757753138769e-05, + "loss": 0.8236, + "step": 193410 + }, + { + "epoch": 1.2357052502459656, + "grad_norm": 0.6800931096076965, + "learning_rate": 3.19428983949012e-05, + "loss": 0.9972, + "step": 193420 + }, + { + "epoch": 1.2357691373957043, + "grad_norm": 1.1945538520812988, + "learning_rate": 3.1938219440265355e-05, + "loss": 1.0245, + "step": 193430 + }, + { + "epoch": 1.235833024545443, + "grad_norm": 0.9393901824951172, + "learning_rate": 3.1933540667527256e-05, + "loss": 1.0416, + "step": 193440 + }, + { + "epoch": 1.2358969116951817, + "grad_norm": 0.9092146754264832, + "learning_rate": 3.192886207673404e-05, + "loss": 1.0766, + "step": 193450 + }, + { + "epoch": 1.2359607988449204, + "grad_norm": 0.8779313564300537, + "learning_rate": 3.19241836679328e-05, + "loss": 0.9729, + "step": 193460 + }, + { + "epoch": 1.236024685994659, + "grad_norm": 1.0780725479125977, + "learning_rate": 3.191950544117068e-05, + "loss": 1.0057, + "step": 193470 + }, + { + "epoch": 1.2360885731443978, + "grad_norm": 1.7977213859558105, + "learning_rate": 3.1914827396494776e-05, + "loss": 0.767, + "step": 193480 + }, + { + "epoch": 1.2361524602941365, + "grad_norm": 1.097962498664856, + "learning_rate": 3.191014953395221e-05, + "loss": 0.8727, + "step": 193490 + }, + { + "epoch": 1.2362163474438752, + "grad_norm": 1.1607288122177124, + "learning_rate": 3.190547185359008e-05, + "loss": 0.9234, + "step": 193500 + }, + { + "epoch": 1.236280234593614, + "grad_norm": 0.9202963709831238, + "learning_rate": 3.1900794355455514e-05, + "loss": 1.3214, + "step": 193510 + }, + { + "epoch": 
1.2363441217433526, + "grad_norm": 0.9579522609710693, + "learning_rate": 3.1896117039595606e-05, + "loss": 0.7031, + "step": 193520 + }, + { + "epoch": 1.2364080088930913, + "grad_norm": 0.6659561395645142, + "learning_rate": 3.189143990605746e-05, + "loss": 0.8339, + "step": 193530 + }, + { + "epoch": 1.23647189604283, + "grad_norm": 1.919846534729004, + "learning_rate": 3.188676295488817e-05, + "loss": 0.8283, + "step": 193540 + }, + { + "epoch": 1.2365357831925687, + "grad_norm": 0.9194838404655457, + "learning_rate": 3.1882086186134866e-05, + "loss": 0.8513, + "step": 193550 + }, + { + "epoch": 1.2365996703423074, + "grad_norm": 0.7087141275405884, + "learning_rate": 3.187740959984461e-05, + "loss": 0.8013, + "step": 193560 + }, + { + "epoch": 1.236663557492046, + "grad_norm": 1.0234582424163818, + "learning_rate": 3.187273319606453e-05, + "loss": 0.8943, + "step": 193570 + }, + { + "epoch": 1.2367274446417849, + "grad_norm": 0.5449272990226746, + "learning_rate": 3.18680569748417e-05, + "loss": 0.8737, + "step": 193580 + }, + { + "epoch": 1.2367913317915233, + "grad_norm": 0.695446789264679, + "learning_rate": 3.186338093622323e-05, + "loss": 1.0306, + "step": 193590 + }, + { + "epoch": 1.2368552189412623, + "grad_norm": 0.7840051651000977, + "learning_rate": 3.185870508025619e-05, + "loss": 0.701, + "step": 193600 + }, + { + "epoch": 1.2369191060910008, + "grad_norm": 1.039993405342102, + "learning_rate": 3.185402940698769e-05, + "loss": 0.8228, + "step": 193610 + }, + { + "epoch": 1.2369829932407397, + "grad_norm": 0.6685113906860352, + "learning_rate": 3.1849353916464826e-05, + "loss": 0.9071, + "step": 193620 + }, + { + "epoch": 1.2370468803904782, + "grad_norm": 1.0278228521347046, + "learning_rate": 3.1844678608734664e-05, + "loss": 0.729, + "step": 193630 + }, + { + "epoch": 1.2371107675402169, + "grad_norm": 0.9203469753265381, + "learning_rate": 3.1840003483844296e-05, + "loss": 0.6591, + "step": 193640 + }, + { + "epoch": 1.2371746546899556, + 
"grad_norm": 0.6178497076034546, + "learning_rate": 3.1835328541840796e-05, + "loss": 0.8599, + "step": 193650 + }, + { + "epoch": 1.2372385418396943, + "grad_norm": 0.9682593941688538, + "learning_rate": 3.183065378277126e-05, + "loss": 0.8228, + "step": 193660 + }, + { + "epoch": 1.237302428989433, + "grad_norm": 0.5989915132522583, + "learning_rate": 3.1825979206682753e-05, + "loss": 0.8924, + "step": 193670 + }, + { + "epoch": 1.2373663161391717, + "grad_norm": 0.9581364989280701, + "learning_rate": 3.182130481362237e-05, + "loss": 0.9859, + "step": 193680 + }, + { + "epoch": 1.2374302032889104, + "grad_norm": 1.2858854532241821, + "learning_rate": 3.181663060363717e-05, + "loss": 0.9602, + "step": 193690 + }, + { + "epoch": 1.237494090438649, + "grad_norm": 1.281391978263855, + "learning_rate": 3.181195657677422e-05, + "loss": 0.7789, + "step": 193700 + }, + { + "epoch": 1.2375579775883878, + "grad_norm": 0.7628605365753174, + "learning_rate": 3.180728273308061e-05, + "loss": 0.8923, + "step": 193710 + }, + { + "epoch": 1.2376218647381265, + "grad_norm": 0.8544741272926331, + "learning_rate": 3.180260907260339e-05, + "loss": 1.0046, + "step": 193720 + }, + { + "epoch": 1.2376857518878652, + "grad_norm": 0.8687821626663208, + "learning_rate": 3.179793559538966e-05, + "loss": 0.7476, + "step": 193730 + }, + { + "epoch": 1.237749639037604, + "grad_norm": 0.9045094847679138, + "learning_rate": 3.179326230148646e-05, + "loss": 0.8623, + "step": 193740 + }, + { + "epoch": 1.2378135261873426, + "grad_norm": 0.7554922699928284, + "learning_rate": 3.1788589190940856e-05, + "loss": 0.7176, + "step": 193750 + }, + { + "epoch": 1.2378774133370813, + "grad_norm": 1.8358988761901855, + "learning_rate": 3.17839162637999e-05, + "loss": 0.8954, + "step": 193760 + }, + { + "epoch": 1.23794130048682, + "grad_norm": 1.0413012504577637, + "learning_rate": 3.177924352011069e-05, + "loss": 0.6721, + "step": 193770 + }, + { + "epoch": 1.2380051876365588, + "grad_norm": 
0.7290290594100952, + "learning_rate": 3.177457095992025e-05, + "loss": 0.8537, + "step": 193780 + }, + { + "epoch": 1.2380690747862975, + "grad_norm": 1.435036540031433, + "learning_rate": 3.1769898583275646e-05, + "loss": 1.1552, + "step": 193790 + }, + { + "epoch": 1.2381329619360362, + "grad_norm": 1.10448157787323, + "learning_rate": 3.176522639022394e-05, + "loss": 0.6483, + "step": 193800 + }, + { + "epoch": 1.2381968490857749, + "grad_norm": 0.7884492874145508, + "learning_rate": 3.1760554380812165e-05, + "loss": 1.0546, + "step": 193810 + }, + { + "epoch": 1.2382607362355136, + "grad_norm": 0.8081900477409363, + "learning_rate": 3.17558825550874e-05, + "loss": 0.8096, + "step": 193820 + }, + { + "epoch": 1.2383246233852523, + "grad_norm": 1.044958233833313, + "learning_rate": 3.175121091309669e-05, + "loss": 0.8731, + "step": 193830 + }, + { + "epoch": 1.238388510534991, + "grad_norm": 0.8754411935806274, + "learning_rate": 3.1746539454887055e-05, + "loss": 0.9596, + "step": 193840 + }, + { + "epoch": 1.2384523976847297, + "grad_norm": 0.7443397045135498, + "learning_rate": 3.174186818050557e-05, + "loss": 0.8056, + "step": 193850 + }, + { + "epoch": 1.2385162848344684, + "grad_norm": 1.0024755001068115, + "learning_rate": 3.173719708999926e-05, + "loss": 0.9443, + "step": 193860 + }, + { + "epoch": 1.2385801719842071, + "grad_norm": 1.0295993089675903, + "learning_rate": 3.1732526183415186e-05, + "loss": 0.8562, + "step": 193870 + }, + { + "epoch": 1.2386440591339458, + "grad_norm": 2.135509729385376, + "learning_rate": 3.172785546080037e-05, + "loss": 0.8922, + "step": 193880 + }, + { + "epoch": 1.2387079462836845, + "grad_norm": 1.0635528564453125, + "learning_rate": 3.1723184922201854e-05, + "loss": 0.835, + "step": 193890 + }, + { + "epoch": 1.2387718334334232, + "grad_norm": 0.9661139249801636, + "learning_rate": 3.1718514567666685e-05, + "loss": 0.7667, + "step": 193900 + }, + { + "epoch": 1.238835720583162, + "grad_norm": 0.840607225894928, + 
"learning_rate": 3.1713844397241886e-05, + "loss": 0.7449, + "step": 193910 + }, + { + "epoch": 1.2388996077329006, + "grad_norm": 0.5099063515663147, + "learning_rate": 3.1709174410974504e-05, + "loss": 0.8591, + "step": 193920 + }, + { + "epoch": 1.2389634948826393, + "grad_norm": 0.8879222273826599, + "learning_rate": 3.1704504608911554e-05, + "loss": 0.8582, + "step": 193930 + }, + { + "epoch": 1.239027382032378, + "grad_norm": 0.518216609954834, + "learning_rate": 3.169983499110006e-05, + "loss": 0.9629, + "step": 193940 + }, + { + "epoch": 1.2390912691821168, + "grad_norm": 1.1863969564437866, + "learning_rate": 3.169516555758707e-05, + "loss": 0.9316, + "step": 193950 + }, + { + "epoch": 1.2391551563318555, + "grad_norm": 0.9996239542961121, + "learning_rate": 3.169049630841959e-05, + "loss": 0.7029, + "step": 193960 + }, + { + "epoch": 1.2392190434815942, + "grad_norm": 1.6813340187072754, + "learning_rate": 3.168582724364466e-05, + "loss": 0.7484, + "step": 193970 + }, + { + "epoch": 1.2392829306313329, + "grad_norm": 1.458324909210205, + "learning_rate": 3.168115836330929e-05, + "loss": 0.7864, + "step": 193980 + }, + { + "epoch": 1.2393468177810716, + "grad_norm": 1.1674745082855225, + "learning_rate": 3.167648966746051e-05, + "loss": 0.8352, + "step": 193990 + }, + { + "epoch": 1.2394107049308103, + "grad_norm": 5.070311546325684, + "learning_rate": 3.167182115614532e-05, + "loss": 1.0867, + "step": 194000 + }, + { + "epoch": 1.239474592080549, + "grad_norm": 1.2760869264602661, + "learning_rate": 3.1667152829410755e-05, + "loss": 0.8498, + "step": 194010 + }, + { + "epoch": 1.2395384792302877, + "grad_norm": 1.25545334815979, + "learning_rate": 3.166248468730382e-05, + "loss": 0.7311, + "step": 194020 + }, + { + "epoch": 1.2396023663800264, + "grad_norm": 1.5020123720169067, + "learning_rate": 3.1657816729871524e-05, + "loss": 1.0177, + "step": 194030 + }, + { + "epoch": 1.2396662535297651, + "grad_norm": 1.1968601942062378, + "learning_rate": 
3.1653148957160886e-05, + "loss": 0.7715, + "step": 194040 + }, + { + "epoch": 1.2397301406795038, + "grad_norm": 0.8993906378746033, + "learning_rate": 3.1648481369218905e-05, + "loss": 0.9558, + "step": 194050 + }, + { + "epoch": 1.2397940278292423, + "grad_norm": 1.380210280418396, + "learning_rate": 3.164381396609261e-05, + "loss": 0.8604, + "step": 194060 + }, + { + "epoch": 1.2398579149789812, + "grad_norm": 0.8667689561843872, + "learning_rate": 3.163914674782897e-05, + "loss": 0.7385, + "step": 194070 + }, + { + "epoch": 1.2399218021287197, + "grad_norm": 1.3134424686431885, + "learning_rate": 3.163447971447501e-05, + "loss": 0.8272, + "step": 194080 + }, + { + "epoch": 1.2399856892784586, + "grad_norm": 0.7698752284049988, + "learning_rate": 3.162981286607773e-05, + "loss": 0.7033, + "step": 194090 + }, + { + "epoch": 1.2400495764281971, + "grad_norm": 0.7156646847724915, + "learning_rate": 3.162514620268413e-05, + "loss": 1.1074, + "step": 194100 + }, + { + "epoch": 1.2401134635779358, + "grad_norm": 0.8076853156089783, + "learning_rate": 3.16204797243412e-05, + "loss": 0.7992, + "step": 194110 + }, + { + "epoch": 1.2401773507276745, + "grad_norm": 1.0264087915420532, + "learning_rate": 3.161581343109594e-05, + "loss": 0.8523, + "step": 194120 + }, + { + "epoch": 1.2402412378774132, + "grad_norm": 1.4501597881317139, + "learning_rate": 3.1611147322995335e-05, + "loss": 0.9584, + "step": 194130 + }, + { + "epoch": 1.240305125027152, + "grad_norm": 1.5734119415283203, + "learning_rate": 3.160648140008639e-05, + "loss": 1.0022, + "step": 194140 + }, + { + "epoch": 1.2403690121768907, + "grad_norm": 0.6637634038925171, + "learning_rate": 3.160181566241609e-05, + "loss": 0.9012, + "step": 194150 + }, + { + "epoch": 1.2404328993266294, + "grad_norm": 0.6105542182922363, + "learning_rate": 3.1597150110031436e-05, + "loss": 0.75, + "step": 194160 + }, + { + "epoch": 1.240496786476368, + "grad_norm": 0.6703608632087708, + "learning_rate": 3.159248474297939e-05, + 
"loss": 0.7869, + "step": 194170 + }, + { + "epoch": 1.2405606736261068, + "grad_norm": 1.018967628479004, + "learning_rate": 3.158781956130695e-05, + "loss": 0.6713, + "step": 194180 + }, + { + "epoch": 1.2406245607758455, + "grad_norm": 1.2115850448608398, + "learning_rate": 3.1583154565061094e-05, + "loss": 0.848, + "step": 194190 + }, + { + "epoch": 1.2406884479255842, + "grad_norm": 1.2993463277816772, + "learning_rate": 3.157848975428881e-05, + "loss": 0.9696, + "step": 194200 + }, + { + "epoch": 1.240752335075323, + "grad_norm": 1.075815200805664, + "learning_rate": 3.157382512903707e-05, + "loss": 0.8145, + "step": 194210 + }, + { + "epoch": 1.2408162222250616, + "grad_norm": 2.959564447402954, + "learning_rate": 3.1569160689352844e-05, + "loss": 0.8707, + "step": 194220 + }, + { + "epoch": 1.2408801093748003, + "grad_norm": 0.7935780882835388, + "learning_rate": 3.156449643528312e-05, + "loss": 0.8066, + "step": 194230 + }, + { + "epoch": 1.240943996524539, + "grad_norm": 1.5406337976455688, + "learning_rate": 3.155983236687486e-05, + "loss": 0.8783, + "step": 194240 + }, + { + "epoch": 1.2410078836742777, + "grad_norm": 3.1429214477539062, + "learning_rate": 3.155516848417505e-05, + "loss": 0.8014, + "step": 194250 + }, + { + "epoch": 1.2410717708240164, + "grad_norm": 0.884060263633728, + "learning_rate": 3.155050478723065e-05, + "loss": 0.5811, + "step": 194260 + }, + { + "epoch": 1.2411356579737551, + "grad_norm": 1.2400538921356201, + "learning_rate": 3.1545841276088625e-05, + "loss": 0.9368, + "step": 194270 + }, + { + "epoch": 1.2411995451234938, + "grad_norm": 0.5689629912376404, + "learning_rate": 3.154117795079594e-05, + "loss": 0.6744, + "step": 194280 + }, + { + "epoch": 1.2412634322732325, + "grad_norm": 1.01374089717865, + "learning_rate": 3.153651481139956e-05, + "loss": 0.7638, + "step": 194290 + }, + { + "epoch": 1.2413273194229713, + "grad_norm": 1.1598334312438965, + "learning_rate": 3.153185185794646e-05, + "loss": 0.8611, + "step": 
194300 + }, + { + "epoch": 1.24139120657271, + "grad_norm": 1.4664697647094727, + "learning_rate": 3.152718909048359e-05, + "loss": 0.6583, + "step": 194310 + }, + { + "epoch": 1.2414550937224487, + "grad_norm": 0.7975323796272278, + "learning_rate": 3.152252650905789e-05, + "loss": 0.8507, + "step": 194320 + }, + { + "epoch": 1.2415189808721874, + "grad_norm": 1.023056983947754, + "learning_rate": 3.151786411371634e-05, + "loss": 0.9125, + "step": 194330 + }, + { + "epoch": 1.241582868021926, + "grad_norm": 0.8665074706077576, + "learning_rate": 3.151320190450589e-05, + "loss": 0.9165, + "step": 194340 + }, + { + "epoch": 1.2416467551716648, + "grad_norm": 1.0078295469284058, + "learning_rate": 3.1508539881473495e-05, + "loss": 1.0755, + "step": 194350 + }, + { + "epoch": 1.2417106423214035, + "grad_norm": 1.1269526481628418, + "learning_rate": 3.1503878044666095e-05, + "loss": 1.0677, + "step": 194360 + }, + { + "epoch": 1.2417745294711422, + "grad_norm": 0.7636091709136963, + "learning_rate": 3.1499216394130646e-05, + "loss": 1.1119, + "step": 194370 + }, + { + "epoch": 1.241838416620881, + "grad_norm": 0.6014677882194519, + "learning_rate": 3.14945549299141e-05, + "loss": 0.768, + "step": 194380 + }, + { + "epoch": 1.2419023037706196, + "grad_norm": 1.1704468727111816, + "learning_rate": 3.1489893652063384e-05, + "loss": 0.8032, + "step": 194390 + }, + { + "epoch": 1.2419661909203583, + "grad_norm": 1.918944239616394, + "learning_rate": 3.148523256062548e-05, + "loss": 0.7275, + "step": 194400 + }, + { + "epoch": 1.242030078070097, + "grad_norm": 1.5561200380325317, + "learning_rate": 3.148057165564728e-05, + "loss": 1.0547, + "step": 194410 + }, + { + "epoch": 1.2420939652198357, + "grad_norm": 0.7839997410774231, + "learning_rate": 3.147591093717575e-05, + "loss": 0.8712, + "step": 194420 + }, + { + "epoch": 1.2421578523695744, + "grad_norm": 1.108949065208435, + "learning_rate": 3.147125040525781e-05, + "loss": 0.7229, + "step": 194430 + }, + { + "epoch": 
1.2422217395193131, + "grad_norm": 0.8365166783332825, + "learning_rate": 3.146659005994042e-05, + "loss": 0.8297, + "step": 194440 + }, + { + "epoch": 1.2422856266690518, + "grad_norm": 0.9354850053787231, + "learning_rate": 3.146192990127049e-05, + "loss": 0.8969, + "step": 194450 + }, + { + "epoch": 1.2423495138187906, + "grad_norm": 0.6852608323097229, + "learning_rate": 3.145726992929497e-05, + "loss": 0.8833, + "step": 194460 + }, + { + "epoch": 1.2424134009685293, + "grad_norm": 0.6809893846511841, + "learning_rate": 3.145261014406079e-05, + "loss": 0.8947, + "step": 194470 + }, + { + "epoch": 1.242477288118268, + "grad_norm": 1.083614706993103, + "learning_rate": 3.1447950545614854e-05, + "loss": 0.8988, + "step": 194480 + }, + { + "epoch": 1.2425411752680067, + "grad_norm": 1.200301170349121, + "learning_rate": 3.144329113400413e-05, + "loss": 1.0784, + "step": 194490 + }, + { + "epoch": 1.2426050624177454, + "grad_norm": 1.089137077331543, + "learning_rate": 3.14386319092755e-05, + "loss": 1.0893, + "step": 194500 + }, + { + "epoch": 1.242668949567484, + "grad_norm": 0.8329066038131714, + "learning_rate": 3.1433972871475914e-05, + "loss": 1.0404, + "step": 194510 + }, + { + "epoch": 1.2427328367172228, + "grad_norm": 1.1481367349624634, + "learning_rate": 3.142931402065228e-05, + "loss": 0.7052, + "step": 194520 + }, + { + "epoch": 1.2427967238669615, + "grad_norm": 0.6459367871284485, + "learning_rate": 3.142465535685152e-05, + "loss": 0.8926, + "step": 194530 + }, + { + "epoch": 1.2428606110167002, + "grad_norm": 0.748914897441864, + "learning_rate": 3.141999688012055e-05, + "loss": 0.7519, + "step": 194540 + }, + { + "epoch": 1.2429244981664387, + "grad_norm": 0.689520001411438, + "learning_rate": 3.141533859050628e-05, + "loss": 0.9843, + "step": 194550 + }, + { + "epoch": 1.2429883853161776, + "grad_norm": 0.8296200633049011, + "learning_rate": 3.141068048805563e-05, + "loss": 0.9837, + "step": 194560 + }, + { + "epoch": 1.243052272465916, + 
"grad_norm": 0.9028075933456421, + "learning_rate": 3.140602257281552e-05, + "loss": 1.0621, + "step": 194570 + }, + { + "epoch": 1.243116159615655, + "grad_norm": 1.247193455696106, + "learning_rate": 3.1401364844832846e-05, + "loss": 1.0044, + "step": 194580 + }, + { + "epoch": 1.2431800467653935, + "grad_norm": 1.976901650428772, + "learning_rate": 3.139670730415451e-05, + "loss": 0.6906, + "step": 194590 + }, + { + "epoch": 1.2432439339151322, + "grad_norm": 0.7390189170837402, + "learning_rate": 3.139204995082743e-05, + "loss": 0.9386, + "step": 194600 + }, + { + "epoch": 1.243307821064871, + "grad_norm": 0.6892824769020081, + "learning_rate": 3.138739278489851e-05, + "loss": 0.8764, + "step": 194610 + }, + { + "epoch": 1.2433717082146096, + "grad_norm": 2.279017686843872, + "learning_rate": 3.138273580641464e-05, + "loss": 0.7605, + "step": 194620 + }, + { + "epoch": 1.2434355953643483, + "grad_norm": 0.8297634720802307, + "learning_rate": 3.137807901542272e-05, + "loss": 1.045, + "step": 194630 + }, + { + "epoch": 1.243499482514087, + "grad_norm": 0.7811027765274048, + "learning_rate": 3.137342241196967e-05, + "loss": 0.8654, + "step": 194640 + }, + { + "epoch": 1.2435633696638257, + "grad_norm": 1.1974806785583496, + "learning_rate": 3.136876599610235e-05, + "loss": 0.9548, + "step": 194650 + }, + { + "epoch": 1.2436272568135645, + "grad_norm": 1.1165138483047485, + "learning_rate": 3.136410976786769e-05, + "loss": 0.6052, + "step": 194660 + }, + { + "epoch": 1.2436911439633032, + "grad_norm": 0.7963413000106812, + "learning_rate": 3.135945372731257e-05, + "loss": 0.9678, + "step": 194670 + }, + { + "epoch": 1.2437550311130419, + "grad_norm": 1.1966108083724976, + "learning_rate": 3.135479787448387e-05, + "loss": 0.9356, + "step": 194680 + }, + { + "epoch": 1.2438189182627806, + "grad_norm": 0.6879404783248901, + "learning_rate": 3.135014220942849e-05, + "loss": 0.7895, + "step": 194690 + }, + { + "epoch": 1.2438828054125193, + "grad_norm": 
0.9736181497573853, + "learning_rate": 3.1345486732193306e-05, + "loss": 0.9982, + "step": 194700 + }, + { + "epoch": 1.243946692562258, + "grad_norm": 0.9534584283828735, + "learning_rate": 3.1340831442825214e-05, + "loss": 0.8743, + "step": 194710 + }, + { + "epoch": 1.2440105797119967, + "grad_norm": 1.5264173746109009, + "learning_rate": 3.133617634137109e-05, + "loss": 0.9591, + "step": 194720 + }, + { + "epoch": 1.2440744668617354, + "grad_norm": 1.1958099603652954, + "learning_rate": 3.133152142787782e-05, + "loss": 0.7427, + "step": 194730 + }, + { + "epoch": 1.244138354011474, + "grad_norm": 1.0329030752182007, + "learning_rate": 3.132686670239228e-05, + "loss": 1.0823, + "step": 194740 + }, + { + "epoch": 1.2442022411612128, + "grad_norm": 0.9985544085502625, + "learning_rate": 3.132221216496134e-05, + "loss": 0.8161, + "step": 194750 + }, + { + "epoch": 1.2442661283109515, + "grad_norm": 2.1537814140319824, + "learning_rate": 3.131755781563189e-05, + "loss": 0.9614, + "step": 194760 + }, + { + "epoch": 1.2443300154606902, + "grad_norm": 0.9693751335144043, + "learning_rate": 3.1312903654450796e-05, + "loss": 1.1127, + "step": 194770 + }, + { + "epoch": 1.244393902610429, + "grad_norm": 0.8665310740470886, + "learning_rate": 3.130824968146492e-05, + "loss": 0.9338, + "step": 194780 + }, + { + "epoch": 1.2444577897601676, + "grad_norm": 1.027868390083313, + "learning_rate": 3.130359589672115e-05, + "loss": 0.9843, + "step": 194790 + }, + { + "epoch": 1.2445216769099063, + "grad_norm": 1.299623966217041, + "learning_rate": 3.1298942300266344e-05, + "loss": 0.7385, + "step": 194800 + }, + { + "epoch": 1.244585564059645, + "grad_norm": 0.7882144451141357, + "learning_rate": 3.129428889214736e-05, + "loss": 0.6872, + "step": 194810 + }, + { + "epoch": 1.2446494512093838, + "grad_norm": 1.2492918968200684, + "learning_rate": 3.1289635672411076e-05, + "loss": 0.8676, + "step": 194820 + }, + { + "epoch": 1.2447133383591225, + "grad_norm": 1.1216018199920654, + 
"learning_rate": 3.1284982641104344e-05, + "loss": 0.9722, + "step": 194830 + }, + { + "epoch": 1.2447772255088612, + "grad_norm": 0.9031000137329102, + "learning_rate": 3.128032979827403e-05, + "loss": 0.8253, + "step": 194840 + }, + { + "epoch": 1.2448411126585999, + "grad_norm": 2.528519630432129, + "learning_rate": 3.1275677143966985e-05, + "loss": 1.207, + "step": 194850 + }, + { + "epoch": 1.2449049998083386, + "grad_norm": 0.9558800458908081, + "learning_rate": 3.127102467823007e-05, + "loss": 0.9207, + "step": 194860 + }, + { + "epoch": 1.2449688869580773, + "grad_norm": 0.5063762664794922, + "learning_rate": 3.1266372401110134e-05, + "loss": 0.8639, + "step": 194870 + }, + { + "epoch": 1.245032774107816, + "grad_norm": 0.9158902764320374, + "learning_rate": 3.1261720312654044e-05, + "loss": 0.9413, + "step": 194880 + }, + { + "epoch": 1.2450966612575547, + "grad_norm": 0.824600338935852, + "learning_rate": 3.125706841290866e-05, + "loss": 0.8742, + "step": 194890 + }, + { + "epoch": 1.2451605484072934, + "grad_norm": 0.9135797619819641, + "learning_rate": 3.12524167019208e-05, + "loss": 0.712, + "step": 194900 + }, + { + "epoch": 1.245224435557032, + "grad_norm": 0.5630869269371033, + "learning_rate": 3.124776517973731e-05, + "loss": 0.7903, + "step": 194910 + }, + { + "epoch": 1.2452883227067708, + "grad_norm": 1.2710615396499634, + "learning_rate": 3.124311384640505e-05, + "loss": 0.8565, + "step": 194920 + }, + { + "epoch": 1.2453522098565095, + "grad_norm": 1.194273829460144, + "learning_rate": 3.123846270197087e-05, + "loss": 0.7865, + "step": 194930 + }, + { + "epoch": 1.2454160970062482, + "grad_norm": 1.1291714906692505, + "learning_rate": 3.123381174648159e-05, + "loss": 0.6668, + "step": 194940 + }, + { + "epoch": 1.245479984155987, + "grad_norm": 1.230466365814209, + "learning_rate": 3.1229160979984065e-05, + "loss": 0.7462, + "step": 194950 + }, + { + "epoch": 1.2455438713057256, + "grad_norm": 1.0248101949691772, + "learning_rate": 
3.122451040252513e-05, + "loss": 0.8264, + "step": 194960 + }, + { + "epoch": 1.2456077584554643, + "grad_norm": 0.7084385752677917, + "learning_rate": 3.1219860014151616e-05, + "loss": 0.9541, + "step": 194970 + }, + { + "epoch": 1.245671645605203, + "grad_norm": 0.9096771478652954, + "learning_rate": 3.121520981491035e-05, + "loss": 0.8213, + "step": 194980 + }, + { + "epoch": 1.2457355327549418, + "grad_norm": 0.7235396504402161, + "learning_rate": 3.121055980484819e-05, + "loss": 0.681, + "step": 194990 + }, + { + "epoch": 1.2457994199046805, + "grad_norm": 1.0630961656570435, + "learning_rate": 3.120590998401194e-05, + "loss": 0.8318, + "step": 195000 + }, + { + "epoch": 1.2458633070544192, + "grad_norm": 0.9689163565635681, + "learning_rate": 3.120126035244844e-05, + "loss": 0.7171, + "step": 195010 + }, + { + "epoch": 1.2459271942041577, + "grad_norm": 1.0845943689346313, + "learning_rate": 3.11966109102045e-05, + "loss": 0.8903, + "step": 195020 + }, + { + "epoch": 1.2459910813538966, + "grad_norm": 1.0844838619232178, + "learning_rate": 3.1191961657326965e-05, + "loss": 0.9605, + "step": 195030 + }, + { + "epoch": 1.246054968503635, + "grad_norm": 0.8601891994476318, + "learning_rate": 3.118731259386265e-05, + "loss": 0.784, + "step": 195040 + }, + { + "epoch": 1.246118855653374, + "grad_norm": 0.6311177015304565, + "learning_rate": 3.1182663719858364e-05, + "loss": 0.7194, + "step": 195050 + }, + { + "epoch": 1.2461827428031125, + "grad_norm": 0.9953409433364868, + "learning_rate": 3.117801503536094e-05, + "loss": 1.0471, + "step": 195060 + }, + { + "epoch": 1.2462466299528514, + "grad_norm": 0.5984851121902466, + "learning_rate": 3.117336654041718e-05, + "loss": 0.7776, + "step": 195070 + }, + { + "epoch": 1.2463105171025899, + "grad_norm": 1.1203337907791138, + "learning_rate": 3.116871823507391e-05, + "loss": 0.8142, + "step": 195080 + }, + { + "epoch": 1.2463744042523286, + "grad_norm": 0.659214437007904, + "learning_rate": 3.1164070119377944e-05, + 
"loss": 0.78, + "step": 195090 + }, + { + "epoch": 1.2464382914020673, + "grad_norm": 1.9500774145126343, + "learning_rate": 3.115942219337609e-05, + "loss": 0.9543, + "step": 195100 + }, + { + "epoch": 1.246502178551806, + "grad_norm": 0.9088325500488281, + "learning_rate": 3.1154774457115144e-05, + "loss": 0.7535, + "step": 195110 + }, + { + "epoch": 1.2465660657015447, + "grad_norm": 0.9878364205360413, + "learning_rate": 3.1150126910641926e-05, + "loss": 0.8623, + "step": 195120 + }, + { + "epoch": 1.2466299528512834, + "grad_norm": 0.9298221468925476, + "learning_rate": 3.114547955400324e-05, + "loss": 0.7283, + "step": 195130 + }, + { + "epoch": 1.2466938400010221, + "grad_norm": 0.98885577917099, + "learning_rate": 3.1140832387245885e-05, + "loss": 0.6204, + "step": 195140 + }, + { + "epoch": 1.2467577271507608, + "grad_norm": 1.1176702976226807, + "learning_rate": 3.113618541041666e-05, + "loss": 0.7693, + "step": 195150 + }, + { + "epoch": 1.2468216143004995, + "grad_norm": 0.5629855394363403, + "learning_rate": 3.1131538623562375e-05, + "loss": 0.8783, + "step": 195160 + }, + { + "epoch": 1.2468855014502382, + "grad_norm": 0.6804969310760498, + "learning_rate": 3.112689202672981e-05, + "loss": 1.0256, + "step": 195170 + }, + { + "epoch": 1.246949388599977, + "grad_norm": 0.5467301607131958, + "learning_rate": 3.1122245619965764e-05, + "loss": 0.853, + "step": 195180 + }, + { + "epoch": 1.2470132757497157, + "grad_norm": 0.6404921412467957, + "learning_rate": 3.111759940331704e-05, + "loss": 0.7456, + "step": 195190 + }, + { + "epoch": 1.2470771628994544, + "grad_norm": 1.025201439857483, + "learning_rate": 3.111295337683044e-05, + "loss": 0.8196, + "step": 195200 + }, + { + "epoch": 1.247141050049193, + "grad_norm": 0.9093812108039856, + "learning_rate": 3.110830754055273e-05, + "loss": 0.8695, + "step": 195210 + }, + { + "epoch": 1.2472049371989318, + "grad_norm": 0.4639185667037964, + "learning_rate": 3.110366189453071e-05, + "loss": 0.8906, + "step": 
195220 + }, + { + "epoch": 1.2472688243486705, + "grad_norm": 1.9653066396713257, + "learning_rate": 3.1099016438811156e-05, + "loss": 0.7307, + "step": 195230 + }, + { + "epoch": 1.2473327114984092, + "grad_norm": 1.3371989727020264, + "learning_rate": 3.1094371173440864e-05, + "loss": 0.9541, + "step": 195240 + }, + { + "epoch": 1.247396598648148, + "grad_norm": 1.0733586549758911, + "learning_rate": 3.108972609846661e-05, + "loss": 0.583, + "step": 195250 + }, + { + "epoch": 1.2474604857978866, + "grad_norm": 1.862831950187683, + "learning_rate": 3.108508121393517e-05, + "loss": 1.1434, + "step": 195260 + }, + { + "epoch": 1.2475243729476253, + "grad_norm": 0.9113034605979919, + "learning_rate": 3.108043651989333e-05, + "loss": 0.9859, + "step": 195270 + }, + { + "epoch": 1.247588260097364, + "grad_norm": 1.056965708732605, + "learning_rate": 3.107579201638786e-05, + "loss": 1.1087, + "step": 195280 + }, + { + "epoch": 1.2476521472471027, + "grad_norm": 0.8885679841041565, + "learning_rate": 3.107114770346554e-05, + "loss": 0.9199, + "step": 195290 + }, + { + "epoch": 1.2477160343968414, + "grad_norm": 0.5740584135055542, + "learning_rate": 3.106650358117314e-05, + "loss": 0.7694, + "step": 195300 + }, + { + "epoch": 1.2477799215465801, + "grad_norm": 1.5246422290802002, + "learning_rate": 3.106185964955742e-05, + "loss": 0.8454, + "step": 195310 + }, + { + "epoch": 1.2478438086963188, + "grad_norm": 0.8860229849815369, + "learning_rate": 3.105721590866516e-05, + "loss": 1.0609, + "step": 195320 + }, + { + "epoch": 1.2479076958460575, + "grad_norm": 0.7222634553909302, + "learning_rate": 3.105257235854312e-05, + "loss": 0.8697, + "step": 195330 + }, + { + "epoch": 1.2479715829957962, + "grad_norm": 1.1808580160140991, + "learning_rate": 3.1047928999238074e-05, + "loss": 0.7241, + "step": 195340 + }, + { + "epoch": 1.248035470145535, + "grad_norm": 0.7724068760871887, + "learning_rate": 3.1043285830796776e-05, + "loss": 0.8333, + "step": 195350 + }, + { + 
"epoch": 1.2480993572952737, + "grad_norm": 1.1032763719558716, + "learning_rate": 3.1038642853266e-05, + "loss": 0.802, + "step": 195360 + }, + { + "epoch": 1.2481632444450124, + "grad_norm": 0.927266001701355, + "learning_rate": 3.1034000066692496e-05, + "loss": 0.8883, + "step": 195370 + }, + { + "epoch": 1.248227131594751, + "grad_norm": 0.8661271333694458, + "learning_rate": 3.1029357471123e-05, + "loss": 0.9052, + "step": 195380 + }, + { + "epoch": 1.2482910187444898, + "grad_norm": 1.4251922369003296, + "learning_rate": 3.10247150666043e-05, + "loss": 0.9302, + "step": 195390 + }, + { + "epoch": 1.2483549058942285, + "grad_norm": 0.845050573348999, + "learning_rate": 3.102007285318313e-05, + "loss": 0.737, + "step": 195400 + }, + { + "epoch": 1.2484187930439672, + "grad_norm": 1.1283615827560425, + "learning_rate": 3.101543083090624e-05, + "loss": 0.6329, + "step": 195410 + }, + { + "epoch": 1.248482680193706, + "grad_norm": 1.0217286348342896, + "learning_rate": 3.1010788999820396e-05, + "loss": 1.0233, + "step": 195420 + }, + { + "epoch": 1.2485465673434446, + "grad_norm": 0.9749916791915894, + "learning_rate": 3.100614735997233e-05, + "loss": 0.9356, + "step": 195430 + }, + { + "epoch": 1.2486104544931833, + "grad_norm": 0.8937061429023743, + "learning_rate": 3.10015059114088e-05, + "loss": 0.7443, + "step": 195440 + }, + { + "epoch": 1.248674341642922, + "grad_norm": 0.9025242328643799, + "learning_rate": 3.0996864654176525e-05, + "loss": 0.6304, + "step": 195450 + }, + { + "epoch": 1.2487382287926607, + "grad_norm": 2.0243749618530273, + "learning_rate": 3.099222358832228e-05, + "loss": 0.8513, + "step": 195460 + }, + { + "epoch": 1.2488021159423994, + "grad_norm": 1.7450834512710571, + "learning_rate": 3.0987582713892784e-05, + "loss": 0.7545, + "step": 195470 + }, + { + "epoch": 1.2488660030921381, + "grad_norm": 1.119706392288208, + "learning_rate": 3.098294203093477e-05, + "loss": 0.6262, + "step": 195480 + }, + { + "epoch": 1.2489298902418768, + 
"grad_norm": 1.065361499786377, + "learning_rate": 3.097830153949498e-05, + "loss": 0.8959, + "step": 195490 + }, + { + "epoch": 1.2489937773916155, + "grad_norm": 0.8525824546813965, + "learning_rate": 3.097366123962015e-05, + "loss": 0.8344, + "step": 195500 + }, + { + "epoch": 1.249057664541354, + "grad_norm": 0.8765125870704651, + "learning_rate": 3.096902113135702e-05, + "loss": 1.0526, + "step": 195510 + }, + { + "epoch": 1.249121551691093, + "grad_norm": 1.3569093942642212, + "learning_rate": 3.09643812147523e-05, + "loss": 0.7687, + "step": 195520 + }, + { + "epoch": 1.2491854388408314, + "grad_norm": 1.1045929193496704, + "learning_rate": 3.0959741489852746e-05, + "loss": 0.8376, + "step": 195530 + }, + { + "epoch": 1.2492493259905704, + "grad_norm": 0.7480252981185913, + "learning_rate": 3.095510195670506e-05, + "loss": 0.8225, + "step": 195540 + }, + { + "epoch": 1.2493132131403089, + "grad_norm": 1.0368316173553467, + "learning_rate": 3.095046261535597e-05, + "loss": 0.8449, + "step": 195550 + }, + { + "epoch": 1.2493771002900478, + "grad_norm": 1.2551803588867188, + "learning_rate": 3.0945823465852204e-05, + "loss": 0.7941, + "step": 195560 + }, + { + "epoch": 1.2494409874397863, + "grad_norm": 1.1370221376419067, + "learning_rate": 3.094118450824048e-05, + "loss": 0.8795, + "step": 195570 + }, + { + "epoch": 1.249504874589525, + "grad_norm": 1.0258110761642456, + "learning_rate": 3.0936545742567514e-05, + "loss": 0.7045, + "step": 195580 + }, + { + "epoch": 1.2495687617392637, + "grad_norm": 1.3879752159118652, + "learning_rate": 3.0931907168880027e-05, + "loss": 0.9711, + "step": 195590 + }, + { + "epoch": 1.2496326488890024, + "grad_norm": 1.1215720176696777, + "learning_rate": 3.0927268787224734e-05, + "loss": 0.7599, + "step": 195600 + }, + { + "epoch": 1.249696536038741, + "grad_norm": 0.6473504304885864, + "learning_rate": 3.092263059764834e-05, + "loss": 1.1675, + "step": 195610 + }, + { + "epoch": 1.2497604231884798, + "grad_norm": 
1.1090569496154785, + "learning_rate": 3.091799260019757e-05, + "loss": 0.9287, + "step": 195620 + }, + { + "epoch": 1.2498243103382185, + "grad_norm": 1.0651631355285645, + "learning_rate": 3.0913354794919105e-05, + "loss": 0.772, + "step": 195630 + }, + { + "epoch": 1.2498881974879572, + "grad_norm": 0.8213653564453125, + "learning_rate": 3.090871718185968e-05, + "loss": 0.8394, + "step": 195640 + }, + { + "epoch": 1.249952084637696, + "grad_norm": 1.845961332321167, + "learning_rate": 3.090407976106599e-05, + "loss": 0.7424, + "step": 195650 + }, + { + "epoch": 1.2500159717874346, + "grad_norm": 1.1385325193405151, + "learning_rate": 3.089944253258473e-05, + "loss": 1.0301, + "step": 195660 + }, + { + "epoch": 1.2500798589371733, + "grad_norm": 1.0003492832183838, + "learning_rate": 3.089480549646262e-05, + "loss": 0.8332, + "step": 195670 + }, + { + "epoch": 1.250143746086912, + "grad_norm": 1.6224372386932373, + "learning_rate": 3.089016865274634e-05, + "loss": 0.5775, + "step": 195680 + }, + { + "epoch": 1.2502076332366507, + "grad_norm": 0.7729224562644958, + "learning_rate": 3.08855320014826e-05, + "loss": 0.9885, + "step": 195690 + }, + { + "epoch": 1.2502715203863894, + "grad_norm": 0.7621885538101196, + "learning_rate": 3.088089554271808e-05, + "loss": 0.8704, + "step": 195700 + }, + { + "epoch": 1.2503354075361282, + "grad_norm": 0.81306391954422, + "learning_rate": 3.0876259276499475e-05, + "loss": 0.9436, + "step": 195710 + }, + { + "epoch": 1.2503992946858669, + "grad_norm": 0.8978168964385986, + "learning_rate": 3.087162320287349e-05, + "loss": 0.8368, + "step": 195720 + }, + { + "epoch": 1.2504631818356056, + "grad_norm": 0.8647493124008179, + "learning_rate": 3.086698732188682e-05, + "loss": 0.63, + "step": 195730 + }, + { + "epoch": 1.2505270689853443, + "grad_norm": 0.9556788206100464, + "learning_rate": 3.086235163358613e-05, + "loss": 0.7618, + "step": 195740 + }, + { + "epoch": 1.250590956135083, + "grad_norm": 1.103699803352356, + 
"learning_rate": 3.0857716138018115e-05, + "loss": 0.6418, + "step": 195750 + }, + { + "epoch": 1.2506548432848217, + "grad_norm": 0.795481264591217, + "learning_rate": 3.0853080835229465e-05, + "loss": 0.9007, + "step": 195760 + }, + { + "epoch": 1.2507187304345604, + "grad_norm": 0.680173397064209, + "learning_rate": 3.084844572526685e-05, + "loss": 1.0055, + "step": 195770 + }, + { + "epoch": 1.250782617584299, + "grad_norm": 0.8150850534439087, + "learning_rate": 3.0843810808176956e-05, + "loss": 0.5794, + "step": 195780 + }, + { + "epoch": 1.2508465047340378, + "grad_norm": 1.4309477806091309, + "learning_rate": 3.083917608400646e-05, + "loss": 0.9091, + "step": 195790 + }, + { + "epoch": 1.2509103918837765, + "grad_norm": 0.845729410648346, + "learning_rate": 3.083454155280204e-05, + "loss": 0.8298, + "step": 195800 + }, + { + "epoch": 1.2509742790335152, + "grad_norm": 0.8186609148979187, + "learning_rate": 3.0829907214610366e-05, + "loss": 1.1078, + "step": 195810 + }, + { + "epoch": 1.251038166183254, + "grad_norm": 0.772532045841217, + "learning_rate": 3.082527306947811e-05, + "loss": 0.9443, + "step": 195820 + }, + { + "epoch": 1.2511020533329926, + "grad_norm": 0.7840969562530518, + "learning_rate": 3.082063911745194e-05, + "loss": 0.7895, + "step": 195830 + }, + { + "epoch": 1.2511659404827313, + "grad_norm": 0.6535977125167847, + "learning_rate": 3.081600535857853e-05, + "loss": 0.9439, + "step": 195840 + }, + { + "epoch": 1.25122982763247, + "grad_norm": 1.039899468421936, + "learning_rate": 3.081137179290454e-05, + "loss": 0.8581, + "step": 195850 + }, + { + "epoch": 1.2512937147822087, + "grad_norm": 0.915335476398468, + "learning_rate": 3.080673842047666e-05, + "loss": 1.1498, + "step": 195860 + }, + { + "epoch": 1.2513576019319474, + "grad_norm": 0.7430047392845154, + "learning_rate": 3.0802105241341494e-05, + "loss": 0.6901, + "step": 195870 + }, + { + "epoch": 1.2514214890816862, + "grad_norm": 1.866835594177246, + "learning_rate": 
3.0797472255545755e-05, + "loss": 0.9969, + "step": 195880 + }, + { + "epoch": 1.2514853762314249, + "grad_norm": 1.2250245809555054, + "learning_rate": 3.079283946313608e-05, + "loss": 0.8529, + "step": 195890 + }, + { + "epoch": 1.2515492633811636, + "grad_norm": 1.3015787601470947, + "learning_rate": 3.078820686415912e-05, + "loss": 0.9146, + "step": 195900 + }, + { + "epoch": 1.2516131505309023, + "grad_norm": 0.7178229093551636, + "learning_rate": 3.078357445866155e-05, + "loss": 0.9116, + "step": 195910 + }, + { + "epoch": 1.251677037680641, + "grad_norm": 0.6988886594772339, + "learning_rate": 3.0778942246690004e-05, + "loss": 0.7955, + "step": 195920 + }, + { + "epoch": 1.2517409248303797, + "grad_norm": 0.7773096561431885, + "learning_rate": 3.077431022829113e-05, + "loss": 0.7112, + "step": 195930 + }, + { + "epoch": 1.2518048119801184, + "grad_norm": 0.7745444178581238, + "learning_rate": 3.076967840351159e-05, + "loss": 0.777, + "step": 195940 + }, + { + "epoch": 1.251868699129857, + "grad_norm": 0.8006591796875, + "learning_rate": 3.076504677239803e-05, + "loss": 0.851, + "step": 195950 + }, + { + "epoch": 1.2519325862795958, + "grad_norm": 0.7855653166770935, + "learning_rate": 3.0760415334997084e-05, + "loss": 0.8963, + "step": 195960 + }, + { + "epoch": 1.2519964734293345, + "grad_norm": 1.0454212427139282, + "learning_rate": 3.075578409135541e-05, + "loss": 0.9647, + "step": 195970 + }, + { + "epoch": 1.252060360579073, + "grad_norm": 0.722151517868042, + "learning_rate": 3.075115304151963e-05, + "loss": 1.154, + "step": 195980 + }, + { + "epoch": 1.252124247728812, + "grad_norm": 0.9353591203689575, + "learning_rate": 3.074652218553639e-05, + "loss": 0.9055, + "step": 195990 + }, + { + "epoch": 1.2521881348785504, + "grad_norm": 0.6113839745521545, + "learning_rate": 3.0741891523452334e-05, + "loss": 0.7152, + "step": 196000 + }, + { + "epoch": 1.2522520220282893, + "grad_norm": 0.8380521535873413, + "learning_rate": 3.0737261055314085e-05, + 
"loss": 0.8254, + "step": 196010 + }, + { + "epoch": 1.2523159091780278, + "grad_norm": 1.1471387147903442, + "learning_rate": 3.073309379985188e-05, + "loss": 1.0999, + "step": 196020 + }, + { + "epoch": 1.2523797963277667, + "grad_norm": 0.9497846364974976, + "learning_rate": 3.072846370033915e-05, + "loss": 0.6518, + "step": 196030 + }, + { + "epoch": 1.2524436834775052, + "grad_norm": 1.842544436454773, + "learning_rate": 3.0723833794907464e-05, + "loss": 0.7074, + "step": 196040 + }, + { + "epoch": 1.2525075706272442, + "grad_norm": 1.0247862339019775, + "learning_rate": 3.071920408360344e-05, + "loss": 0.8442, + "step": 196050 + }, + { + "epoch": 1.2525714577769826, + "grad_norm": 1.2822681665420532, + "learning_rate": 3.071457456647372e-05, + "loss": 0.7831, + "step": 196060 + }, + { + "epoch": 1.2526353449267216, + "grad_norm": 1.1187118291854858, + "learning_rate": 3.070994524356492e-05, + "loss": 0.7251, + "step": 196070 + }, + { + "epoch": 1.25269923207646, + "grad_norm": 1.0392087697982788, + "learning_rate": 3.070531611492366e-05, + "loss": 0.6043, + "step": 196080 + }, + { + "epoch": 1.2527631192261988, + "grad_norm": 0.8084442615509033, + "learning_rate": 3.070068718059655e-05, + "loss": 1.0564, + "step": 196090 + }, + { + "epoch": 1.2528270063759375, + "grad_norm": 0.8950974941253662, + "learning_rate": 3.069605844063023e-05, + "loss": 0.7615, + "step": 196100 + }, + { + "epoch": 1.2528908935256762, + "grad_norm": 0.8876667022705078, + "learning_rate": 3.069142989507129e-05, + "loss": 0.9022, + "step": 196110 + }, + { + "epoch": 1.2529547806754149, + "grad_norm": 0.6729691028594971, + "learning_rate": 3.0686801543966356e-05, + "loss": 1.014, + "step": 196120 + }, + { + "epoch": 1.2530186678251536, + "grad_norm": 1.1825615167617798, + "learning_rate": 3.0682173387362046e-05, + "loss": 0.7734, + "step": 196130 + }, + { + "epoch": 1.2530825549748923, + "grad_norm": 2.5809266567230225, + "learning_rate": 3.0677545425304955e-05, + "loss": 0.7741, + 
"step": 196140 + }, + { + "epoch": 1.253146442124631, + "grad_norm": 1.2367600202560425, + "learning_rate": 3.0672917657841696e-05, + "loss": 0.8362, + "step": 196150 + }, + { + "epoch": 1.2532103292743697, + "grad_norm": 1.5456008911132812, + "learning_rate": 3.066829008501888e-05, + "loss": 0.8762, + "step": 196160 + }, + { + "epoch": 1.2532742164241084, + "grad_norm": 0.9506556987762451, + "learning_rate": 3.06636627068831e-05, + "loss": 0.7799, + "step": 196170 + }, + { + "epoch": 1.2533381035738471, + "grad_norm": 1.2111282348632812, + "learning_rate": 3.065903552348098e-05, + "loss": 0.8307, + "step": 196180 + }, + { + "epoch": 1.2534019907235858, + "grad_norm": 1.244231939315796, + "learning_rate": 3.0654408534859094e-05, + "loss": 0.7969, + "step": 196190 + }, + { + "epoch": 1.2534658778733245, + "grad_norm": 0.7318935990333557, + "learning_rate": 3.064978174106406e-05, + "loss": 0.9228, + "step": 196200 + }, + { + "epoch": 1.2535297650230632, + "grad_norm": 0.7732114195823669, + "learning_rate": 3.0645155142142455e-05, + "loss": 1.0287, + "step": 196210 + }, + { + "epoch": 1.253593652172802, + "grad_norm": 0.8616588115692139, + "learning_rate": 3.064052873814088e-05, + "loss": 0.8152, + "step": 196220 + }, + { + "epoch": 1.2536575393225406, + "grad_norm": 2.2946550846099854, + "learning_rate": 3.063590252910594e-05, + "loss": 0.753, + "step": 196230 + }, + { + "epoch": 1.2537214264722794, + "grad_norm": 0.6799231171607971, + "learning_rate": 3.0631276515084205e-05, + "loss": 0.8045, + "step": 196240 + }, + { + "epoch": 1.253785313622018, + "grad_norm": 1.3932530879974365, + "learning_rate": 3.062665069612228e-05, + "loss": 0.623, + "step": 196250 + }, + { + "epoch": 1.2538492007717568, + "grad_norm": 1.0541167259216309, + "learning_rate": 3.062202507226674e-05, + "loss": 0.9539, + "step": 196260 + }, + { + "epoch": 1.2539130879214955, + "grad_norm": 0.8246195912361145, + "learning_rate": 3.061739964356417e-05, + "loss": 0.7437, + "step": 196270 + }, + { + 
"epoch": 1.2539769750712342, + "grad_norm": 1.299216866493225, + "learning_rate": 3.0612774410061154e-05, + "loss": 0.882, + "step": 196280 + }, + { + "epoch": 1.2540408622209729, + "grad_norm": 0.7007962465286255, + "learning_rate": 3.060814937180427e-05, + "loss": 0.5721, + "step": 196290 + }, + { + "epoch": 1.2541047493707116, + "grad_norm": 0.92581707239151, + "learning_rate": 3.06035245288401e-05, + "loss": 0.9306, + "step": 196300 + }, + { + "epoch": 1.2541686365204503, + "grad_norm": 0.7394659519195557, + "learning_rate": 3.059889988121521e-05, + "loss": 0.8613, + "step": 196310 + }, + { + "epoch": 1.254232523670189, + "grad_norm": 0.8280680179595947, + "learning_rate": 3.05942754289762e-05, + "loss": 0.9097, + "step": 196320 + }, + { + "epoch": 1.2542964108199277, + "grad_norm": 0.8643571734428406, + "learning_rate": 3.058965117216961e-05, + "loss": 0.8487, + "step": 196330 + }, + { + "epoch": 1.2543602979696664, + "grad_norm": 0.7133888602256775, + "learning_rate": 3.0585027110842033e-05, + "loss": 1.0621, + "step": 196340 + }, + { + "epoch": 1.2544241851194051, + "grad_norm": 0.9498386979103088, + "learning_rate": 3.0580403245040016e-05, + "loss": 0.8696, + "step": 196350 + }, + { + "epoch": 1.2544880722691438, + "grad_norm": 0.786381721496582, + "learning_rate": 3.0575779574810147e-05, + "loss": 0.9355, + "step": 196360 + }, + { + "epoch": 1.2545519594188825, + "grad_norm": 0.961892306804657, + "learning_rate": 3.0571156100198986e-05, + "loss": 0.7257, + "step": 196370 + }, + { + "epoch": 1.2546158465686212, + "grad_norm": 2.5090837478637695, + "learning_rate": 3.056653282125309e-05, + "loss": 0.9214, + "step": 196380 + }, + { + "epoch": 1.25467973371836, + "grad_norm": 0.7858637571334839, + "learning_rate": 3.056190973801902e-05, + "loss": 0.8509, + "step": 196390 + }, + { + "epoch": 1.2547436208680987, + "grad_norm": 1.216647744178772, + "learning_rate": 3.0557286850543345e-05, + "loss": 0.8276, + "step": 196400 + }, + { + "epoch": 1.2548075080178374, 
+ "grad_norm": 0.8572155237197876, + "learning_rate": 3.05526641588726e-05, + "loss": 0.6625, + "step": 196410 + }, + { + "epoch": 1.254871395167576, + "grad_norm": 1.4202823638916016, + "learning_rate": 3.054804166305335e-05, + "loss": 0.6364, + "step": 196420 + }, + { + "epoch": 1.2549352823173148, + "grad_norm": 0.9645159244537354, + "learning_rate": 3.0543419363132154e-05, + "loss": 0.9696, + "step": 196430 + }, + { + "epoch": 1.2549991694670535, + "grad_norm": 0.9839354753494263, + "learning_rate": 3.053879725915556e-05, + "loss": 0.9092, + "step": 196440 + }, + { + "epoch": 1.255063056616792, + "grad_norm": 1.0340949296951294, + "learning_rate": 3.053417535117011e-05, + "loss": 0.8126, + "step": 196450 + }, + { + "epoch": 1.255126943766531, + "grad_norm": 1.2581630945205688, + "learning_rate": 3.052955363922235e-05, + "loss": 0.691, + "step": 196460 + }, + { + "epoch": 1.2551908309162694, + "grad_norm": 0.8256975412368774, + "learning_rate": 3.052493212335884e-05, + "loss": 0.8336, + "step": 196470 + }, + { + "epoch": 1.2552547180660083, + "grad_norm": 1.2465929985046387, + "learning_rate": 3.052031080362611e-05, + "loss": 1.2096, + "step": 196480 + }, + { + "epoch": 1.2553186052157468, + "grad_norm": 1.2942495346069336, + "learning_rate": 3.05156896800707e-05, + "loss": 0.8418, + "step": 196490 + }, + { + "epoch": 1.2553824923654857, + "grad_norm": 1.565579891204834, + "learning_rate": 3.051106875273915e-05, + "loss": 0.8584, + "step": 196500 + }, + { + "epoch": 1.2554463795152242, + "grad_norm": 0.9140028357505798, + "learning_rate": 3.0506448021678004e-05, + "loss": 1.2363, + "step": 196510 + }, + { + "epoch": 1.2555102666649631, + "grad_norm": 1.4598805904388428, + "learning_rate": 3.050182748693378e-05, + "loss": 0.7464, + "step": 196520 + }, + { + "epoch": 1.2555741538147016, + "grad_norm": 1.033338189125061, + "learning_rate": 3.049720714855303e-05, + "loss": 0.9934, + "step": 196530 + }, + { + "epoch": 1.2556380409644405, + "grad_norm": 
2.08406925201416, + "learning_rate": 3.0492587006582267e-05, + "loss": 1.2789, + "step": 196540 + }, + { + "epoch": 1.255701928114179, + "grad_norm": 0.5521570444107056, + "learning_rate": 3.0487967061068036e-05, + "loss": 0.7427, + "step": 196550 + }, + { + "epoch": 1.255765815263918, + "grad_norm": 1.137140154838562, + "learning_rate": 3.0483347312056853e-05, + "loss": 0.769, + "step": 196560 + }, + { + "epoch": 1.2558297024136564, + "grad_norm": 1.1652560234069824, + "learning_rate": 3.0478727759595248e-05, + "loss": 0.7821, + "step": 196570 + }, + { + "epoch": 1.2558935895633951, + "grad_norm": 1.1104927062988281, + "learning_rate": 3.0474108403729752e-05, + "loss": 0.8623, + "step": 196580 + }, + { + "epoch": 1.2559574767131338, + "grad_norm": 0.8415306210517883, + "learning_rate": 3.0469489244506865e-05, + "loss": 0.9975, + "step": 196590 + }, + { + "epoch": 1.2560213638628726, + "grad_norm": 1.3708555698394775, + "learning_rate": 3.0464870281973123e-05, + "loss": 0.773, + "step": 196600 + }, + { + "epoch": 1.2560852510126113, + "grad_norm": 1.2786868810653687, + "learning_rate": 3.046025151617503e-05, + "loss": 0.8677, + "step": 196610 + }, + { + "epoch": 1.25614913816235, + "grad_norm": 1.9700566530227661, + "learning_rate": 3.0455632947159117e-05, + "loss": 0.8535, + "step": 196620 + }, + { + "epoch": 1.2562130253120887, + "grad_norm": 0.4788389205932617, + "learning_rate": 3.0451014574971892e-05, + "loss": 1.1064, + "step": 196630 + }, + { + "epoch": 1.2562769124618274, + "grad_norm": 0.41164281964302063, + "learning_rate": 3.0446396399659855e-05, + "loss": 0.9192, + "step": 196640 + }, + { + "epoch": 1.256340799611566, + "grad_norm": 0.979625940322876, + "learning_rate": 3.0441778421269523e-05, + "loss": 0.9667, + "step": 196650 + }, + { + "epoch": 1.2564046867613048, + "grad_norm": 0.9145374298095703, + "learning_rate": 3.0437160639847405e-05, + "loss": 0.9318, + "step": 196660 + }, + { + "epoch": 1.2564685739110435, + "grad_norm": 1.1360008716583252, + 
"learning_rate": 3.043254305544e-05, + "loss": 0.8037, + "step": 196670 + }, + { + "epoch": 1.2565324610607822, + "grad_norm": 0.9261152744293213, + "learning_rate": 3.0427925668093804e-05, + "loss": 1.0606, + "step": 196680 + }, + { + "epoch": 1.256596348210521, + "grad_norm": 3.820808172225952, + "learning_rate": 3.0423308477855344e-05, + "loss": 0.8752, + "step": 196690 + }, + { + "epoch": 1.2566602353602596, + "grad_norm": 1.1207785606384277, + "learning_rate": 3.04186914847711e-05, + "loss": 0.869, + "step": 196700 + }, + { + "epoch": 1.2567241225099983, + "grad_norm": 0.5529508590698242, + "learning_rate": 3.041407468888758e-05, + "loss": 0.8173, + "step": 196710 + }, + { + "epoch": 1.256788009659737, + "grad_norm": 0.9832448959350586, + "learning_rate": 3.0409458090251265e-05, + "loss": 0.8848, + "step": 196720 + }, + { + "epoch": 1.2568518968094757, + "grad_norm": 0.6877294778823853, + "learning_rate": 3.040484168890866e-05, + "loss": 0.7783, + "step": 196730 + }, + { + "epoch": 1.2569157839592144, + "grad_norm": 0.9312469959259033, + "learning_rate": 3.0400225484906243e-05, + "loss": 0.7148, + "step": 196740 + }, + { + "epoch": 1.2569796711089531, + "grad_norm": 0.7746081352233887, + "learning_rate": 3.0395609478290522e-05, + "loss": 0.9964, + "step": 196750 + }, + { + "epoch": 1.2570435582586919, + "grad_norm": 1.5182709693908691, + "learning_rate": 3.0390993669107966e-05, + "loss": 0.9878, + "step": 196760 + }, + { + "epoch": 1.2571074454084306, + "grad_norm": 1.004671573638916, + "learning_rate": 3.0386378057405067e-05, + "loss": 1.0418, + "step": 196770 + }, + { + "epoch": 1.2571713325581693, + "grad_norm": 0.7396956086158752, + "learning_rate": 3.0381762643228316e-05, + "loss": 0.9656, + "step": 196780 + }, + { + "epoch": 1.257235219707908, + "grad_norm": 1.0894277095794678, + "learning_rate": 3.0377147426624186e-05, + "loss": 0.947, + "step": 196790 + }, + { + "epoch": 1.2572991068576467, + "grad_norm": 0.967881441116333, + "learning_rate": 
3.0372532407639155e-05, + "loss": 0.9679, + "step": 196800 + }, + { + "epoch": 1.2573629940073854, + "grad_norm": 1.0434962511062622, + "learning_rate": 3.0367917586319704e-05, + "loss": 0.7369, + "step": 196810 + }, + { + "epoch": 1.257426881157124, + "grad_norm": 0.7365828156471252, + "learning_rate": 3.0363302962712305e-05, + "loss": 0.8695, + "step": 196820 + }, + { + "epoch": 1.2574907683068628, + "grad_norm": 0.7892202734947205, + "learning_rate": 3.0358688536863433e-05, + "loss": 0.8961, + "step": 196830 + }, + { + "epoch": 1.2575546554566015, + "grad_norm": 0.9244031310081482, + "learning_rate": 3.0354074308819563e-05, + "loss": 0.8052, + "step": 196840 + }, + { + "epoch": 1.2576185426063402, + "grad_norm": 0.9582728147506714, + "learning_rate": 3.0349460278627163e-05, + "loss": 0.9504, + "step": 196850 + }, + { + "epoch": 1.257682429756079, + "grad_norm": 1.100187063217163, + "learning_rate": 3.0344846446332692e-05, + "loss": 0.7798, + "step": 196860 + }, + { + "epoch": 1.2577463169058176, + "grad_norm": 0.7992842197418213, + "learning_rate": 3.0340232811982628e-05, + "loss": 0.9233, + "step": 196870 + }, + { + "epoch": 1.2578102040555563, + "grad_norm": 1.033463716506958, + "learning_rate": 3.0335619375623408e-05, + "loss": 0.8229, + "step": 196880 + }, + { + "epoch": 1.257874091205295, + "grad_norm": 1.035979986190796, + "learning_rate": 3.033100613730153e-05, + "loss": 1.1535, + "step": 196890 + }, + { + "epoch": 1.2579379783550337, + "grad_norm": 0.9329689741134644, + "learning_rate": 3.0326393097063432e-05, + "loss": 0.7973, + "step": 196900 + }, + { + "epoch": 1.2580018655047724, + "grad_norm": 1.8775664567947388, + "learning_rate": 3.032178025495558e-05, + "loss": 0.84, + "step": 196910 + }, + { + "epoch": 1.2580657526545111, + "grad_norm": 0.7317743301391602, + "learning_rate": 3.0317167611024423e-05, + "loss": 0.8763, + "step": 196920 + }, + { + "epoch": 1.2581296398042499, + "grad_norm": 0.7808213233947754, + "learning_rate": 
3.031255516531642e-05, + "loss": 0.8849, + "step": 196930 + }, + { + "epoch": 1.2581935269539883, + "grad_norm": 0.7983580827713013, + "learning_rate": 3.0307942917878014e-05, + "loss": 0.9162, + "step": 196940 + }, + { + "epoch": 1.2582574141037273, + "grad_norm": 1.1718703508377075, + "learning_rate": 3.0303330868755663e-05, + "loss": 0.6528, + "step": 196950 + }, + { + "epoch": 1.2583213012534658, + "grad_norm": 0.7717203497886658, + "learning_rate": 3.02987190179958e-05, + "loss": 0.7946, + "step": 196960 + }, + { + "epoch": 1.2583851884032047, + "grad_norm": 1.1155431270599365, + "learning_rate": 3.029410736564489e-05, + "loss": 0.9417, + "step": 196970 + }, + { + "epoch": 1.2584490755529432, + "grad_norm": 1.0586241483688354, + "learning_rate": 3.0289495911749387e-05, + "loss": 0.7738, + "step": 196980 + }, + { + "epoch": 1.258512962702682, + "grad_norm": 1.0919402837753296, + "learning_rate": 3.0284884656355695e-05, + "loss": 1.0467, + "step": 196990 + }, + { + "epoch": 1.2585768498524206, + "grad_norm": 0.8078431487083435, + "learning_rate": 3.028027359951025e-05, + "loss": 1.0393, + "step": 197000 + }, + { + "epoch": 1.2586407370021595, + "grad_norm": 1.106803297996521, + "learning_rate": 3.0275662741259527e-05, + "loss": 0.8736, + "step": 197010 + }, + { + "epoch": 1.258704624151898, + "grad_norm": 0.7414875030517578, + "learning_rate": 3.0271052081649942e-05, + "loss": 0.7568, + "step": 197020 + }, + { + "epoch": 1.258768511301637, + "grad_norm": 0.8508109450340271, + "learning_rate": 3.026644162072793e-05, + "loss": 0.989, + "step": 197030 + }, + { + "epoch": 1.2588323984513754, + "grad_norm": 1.1377967596054077, + "learning_rate": 3.0261831358539926e-05, + "loss": 0.908, + "step": 197040 + }, + { + "epoch": 1.2588962856011143, + "grad_norm": 0.7796303629875183, + "learning_rate": 3.0257221295132354e-05, + "loss": 0.6397, + "step": 197050 + }, + { + "epoch": 1.2589601727508528, + "grad_norm": 1.2880197763442993, + "learning_rate": 3.025261143055164e-05, 
+ "loss": 0.8341, + "step": 197060 + }, + { + "epoch": 1.2590240599005915, + "grad_norm": 0.8077268600463867, + "learning_rate": 3.024800176484422e-05, + "loss": 0.5744, + "step": 197070 + }, + { + "epoch": 1.2590879470503302, + "grad_norm": 0.8637357354164124, + "learning_rate": 3.0243392298056505e-05, + "loss": 1.15, + "step": 197080 + }, + { + "epoch": 1.259151834200069, + "grad_norm": 1.3049520254135132, + "learning_rate": 3.0238783030234925e-05, + "loss": 1.1268, + "step": 197090 + }, + { + "epoch": 1.2592157213498076, + "grad_norm": 0.9427032470703125, + "learning_rate": 3.0234173961425894e-05, + "loss": 0.8156, + "step": 197100 + }, + { + "epoch": 1.2592796084995463, + "grad_norm": 0.559723973274231, + "learning_rate": 3.0229565091675826e-05, + "loss": 0.9744, + "step": 197110 + }, + { + "epoch": 1.259343495649285, + "grad_norm": 0.7008159160614014, + "learning_rate": 3.0224956421031146e-05, + "loss": 0.9577, + "step": 197120 + }, + { + "epoch": 1.2594073827990238, + "grad_norm": 1.0198113918304443, + "learning_rate": 3.0220347949538264e-05, + "loss": 0.8379, + "step": 197130 + }, + { + "epoch": 1.2594712699487625, + "grad_norm": 1.9242488145828247, + "learning_rate": 3.0215739677243593e-05, + "loss": 0.9567, + "step": 197140 + }, + { + "epoch": 1.2595351570985012, + "grad_norm": 1.6917319297790527, + "learning_rate": 3.0211131604193532e-05, + "loss": 0.7219, + "step": 197150 + }, + { + "epoch": 1.2595990442482399, + "grad_norm": 0.9386038780212402, + "learning_rate": 3.02065237304345e-05, + "loss": 1.1366, + "step": 197160 + }, + { + "epoch": 1.2596629313979786, + "grad_norm": 1.1322776079177856, + "learning_rate": 3.020191605601289e-05, + "loss": 0.8448, + "step": 197170 + }, + { + "epoch": 1.2597268185477173, + "grad_norm": 1.149288296699524, + "learning_rate": 3.0197308580975126e-05, + "loss": 0.9294, + "step": 197180 + }, + { + "epoch": 1.259790705697456, + "grad_norm": 0.8767491579055786, + "learning_rate": 3.0192701305367587e-05, + "loss": 0.7238, + 
"step": 197190 + }, + { + "epoch": 1.2598545928471947, + "grad_norm": 0.973002552986145, + "learning_rate": 3.0188094229236674e-05, + "loss": 0.8918, + "step": 197200 + }, + { + "epoch": 1.2599184799969334, + "grad_norm": 2.295564651489258, + "learning_rate": 3.0183487352628802e-05, + "loss": 0.8087, + "step": 197210 + }, + { + "epoch": 1.259982367146672, + "grad_norm": 0.9704807996749878, + "learning_rate": 3.017888067559036e-05, + "loss": 0.7005, + "step": 197220 + }, + { + "epoch": 1.2600462542964108, + "grad_norm": 0.8905156850814819, + "learning_rate": 3.0174274198167728e-05, + "loss": 1.0, + "step": 197230 + }, + { + "epoch": 1.2601101414461495, + "grad_norm": 1.1902166604995728, + "learning_rate": 3.016966792040732e-05, + "loss": 1.016, + "step": 197240 + }, + { + "epoch": 1.2601740285958882, + "grad_norm": 1.5252759456634521, + "learning_rate": 3.0165061842355503e-05, + "loss": 0.8924, + "step": 197250 + }, + { + "epoch": 1.260237915745627, + "grad_norm": 0.6883949041366577, + "learning_rate": 3.016045596405867e-05, + "loss": 1.0273, + "step": 197260 + }, + { + "epoch": 1.2603018028953656, + "grad_norm": 1.1482892036437988, + "learning_rate": 3.0155850285563213e-05, + "loss": 1.0745, + "step": 197270 + }, + { + "epoch": 1.2603656900451043, + "grad_norm": 1.319387435913086, + "learning_rate": 3.0151244806915513e-05, + "loss": 0.9949, + "step": 197280 + }, + { + "epoch": 1.260429577194843, + "grad_norm": 1.022769570350647, + "learning_rate": 3.0146639528161947e-05, + "loss": 0.764, + "step": 197290 + }, + { + "epoch": 1.2604934643445818, + "grad_norm": 0.9216936230659485, + "learning_rate": 3.0142034449348898e-05, + "loss": 1.0093, + "step": 197300 + }, + { + "epoch": 1.2605573514943205, + "grad_norm": 1.382753610610962, + "learning_rate": 3.013742957052274e-05, + "loss": 0.8119, + "step": 197310 + }, + { + "epoch": 1.2606212386440592, + "grad_norm": 0.9630887508392334, + "learning_rate": 3.013282489172985e-05, + "loss": 0.9822, + "step": 197320 + }, + { + 
"epoch": 1.2606851257937979, + "grad_norm": 1.4023778438568115, + "learning_rate": 3.0128220413016604e-05, + "loss": 0.8795, + "step": 197330 + }, + { + "epoch": 1.2607490129435366, + "grad_norm": 0.5781784653663635, + "learning_rate": 3.0123616134429368e-05, + "loss": 0.9546, + "step": 197340 + }, + { + "epoch": 1.2608129000932753, + "grad_norm": 1.133649468421936, + "learning_rate": 3.0119012056014513e-05, + "loss": 0.5692, + "step": 197350 + }, + { + "epoch": 1.260876787243014, + "grad_norm": 0.9575822353363037, + "learning_rate": 3.0114408177818405e-05, + "loss": 0.7463, + "step": 197360 + }, + { + "epoch": 1.2609406743927527, + "grad_norm": 1.02314293384552, + "learning_rate": 3.0109804499887406e-05, + "loss": 0.8035, + "step": 197370 + }, + { + "epoch": 1.2610045615424914, + "grad_norm": 0.8817002177238464, + "learning_rate": 3.0105201022267894e-05, + "loss": 1.0917, + "step": 197380 + }, + { + "epoch": 1.2610684486922301, + "grad_norm": 0.5148343443870544, + "learning_rate": 3.010059774500621e-05, + "loss": 0.7236, + "step": 197390 + }, + { + "epoch": 1.2611323358419688, + "grad_norm": 0.890557050704956, + "learning_rate": 3.0095994668148725e-05, + "loss": 0.8603, + "step": 197400 + }, + { + "epoch": 1.2611962229917075, + "grad_norm": 1.0954136848449707, + "learning_rate": 3.0091391791741784e-05, + "loss": 1.0766, + "step": 197410 + }, + { + "epoch": 1.2612601101414462, + "grad_norm": 1.0484198331832886, + "learning_rate": 3.008678911583176e-05, + "loss": 0.9275, + "step": 197420 + }, + { + "epoch": 1.2613239972911847, + "grad_norm": 1.0964504480361938, + "learning_rate": 3.0082186640465e-05, + "loss": 0.8913, + "step": 197430 + }, + { + "epoch": 1.2613878844409236, + "grad_norm": 0.7645079493522644, + "learning_rate": 3.0077584365687848e-05, + "loss": 1.0927, + "step": 197440 + }, + { + "epoch": 1.2614517715906621, + "grad_norm": 0.9771102666854858, + "learning_rate": 3.007298229154666e-05, + "loss": 0.9772, + "step": 197450 + }, + { + "epoch": 
1.261515658740401, + "grad_norm": 0.7499563694000244, + "learning_rate": 3.0068380418087792e-05, + "loss": 0.8616, + "step": 197460 + }, + { + "epoch": 1.2615795458901395, + "grad_norm": 0.7899210453033447, + "learning_rate": 3.0063778745357563e-05, + "loss": 0.7892, + "step": 197470 + }, + { + "epoch": 1.2616434330398785, + "grad_norm": 0.6827421188354492, + "learning_rate": 3.005917727340233e-05, + "loss": 0.8407, + "step": 197480 + }, + { + "epoch": 1.261707320189617, + "grad_norm": 1.049285888671875, + "learning_rate": 3.0054576002268432e-05, + "loss": 0.9158, + "step": 197490 + }, + { + "epoch": 1.2617712073393559, + "grad_norm": 0.9220422506332397, + "learning_rate": 3.004997493200221e-05, + "loss": 0.8226, + "step": 197500 + }, + { + "epoch": 1.2618350944890944, + "grad_norm": 0.9056112170219421, + "learning_rate": 3.004537406265e-05, + "loss": 0.7789, + "step": 197510 + }, + { + "epoch": 1.2618989816388333, + "grad_norm": 0.9944071769714355, + "learning_rate": 3.0040773394258128e-05, + "loss": 0.8665, + "step": 197520 + }, + { + "epoch": 1.2619628687885718, + "grad_norm": 1.2284094095230103, + "learning_rate": 3.0036172926872937e-05, + "loss": 1.025, + "step": 197530 + }, + { + "epoch": 1.2620267559383105, + "grad_norm": 0.7492192387580872, + "learning_rate": 3.0031572660540764e-05, + "loss": 1.1059, + "step": 197540 + }, + { + "epoch": 1.2620906430880492, + "grad_norm": 0.6289995312690735, + "learning_rate": 3.0026972595307924e-05, + "loss": 0.9333, + "step": 197550 + }, + { + "epoch": 1.262154530237788, + "grad_norm": 0.9772926568984985, + "learning_rate": 3.002237273122075e-05, + "loss": 0.8707, + "step": 197560 + }, + { + "epoch": 1.2622184173875266, + "grad_norm": 1.2405903339385986, + "learning_rate": 3.0017773068325566e-05, + "loss": 0.7973, + "step": 197570 + }, + { + "epoch": 1.2622823045372653, + "grad_norm": 0.9798763990402222, + "learning_rate": 3.00131736066687e-05, + "loss": 0.9111, + "step": 197580 + }, + { + "epoch": 1.262346191687004, + 
"grad_norm": 0.9784902334213257, + "learning_rate": 3.000857434629646e-05, + "loss": 0.9779, + "step": 197590 + }, + { + "epoch": 1.2624100788367427, + "grad_norm": 0.873577892780304, + "learning_rate": 3.0003975287255172e-05, + "loss": 0.8118, + "step": 197600 + }, + { + "epoch": 1.2624739659864814, + "grad_norm": 1.508326768875122, + "learning_rate": 2.9999376429591154e-05, + "loss": 1.0005, + "step": 197610 + }, + { + "epoch": 1.2625378531362201, + "grad_norm": 0.8667922616004944, + "learning_rate": 2.9994777773350713e-05, + "loss": 0.8489, + "step": 197620 + }, + { + "epoch": 1.2626017402859588, + "grad_norm": 1.183371901512146, + "learning_rate": 2.9990179318580176e-05, + "loss": 0.8954, + "step": 197630 + }, + { + "epoch": 1.2626656274356975, + "grad_norm": 0.7158766984939575, + "learning_rate": 2.998558106532584e-05, + "loss": 0.9686, + "step": 197640 + }, + { + "epoch": 1.2627295145854363, + "grad_norm": 0.7767731547355652, + "learning_rate": 2.998098301363401e-05, + "loss": 1.0142, + "step": 197650 + }, + { + "epoch": 1.262793401735175, + "grad_norm": 1.035164475440979, + "learning_rate": 2.9976385163551012e-05, + "loss": 0.9737, + "step": 197660 + }, + { + "epoch": 1.2628572888849137, + "grad_norm": 0.8004717230796814, + "learning_rate": 2.9971787515123135e-05, + "loss": 0.935, + "step": 197670 + }, + { + "epoch": 1.2629211760346524, + "grad_norm": 1.0571773052215576, + "learning_rate": 2.9967190068396677e-05, + "loss": 0.9205, + "step": 197680 + }, + { + "epoch": 1.262985063184391, + "grad_norm": 0.6689577102661133, + "learning_rate": 2.9962592823417955e-05, + "loss": 0.977, + "step": 197690 + }, + { + "epoch": 1.2630489503341298, + "grad_norm": 0.8524081110954285, + "learning_rate": 2.9957995780233256e-05, + "loss": 0.9482, + "step": 197700 + }, + { + "epoch": 1.2631128374838685, + "grad_norm": 1.0077905654907227, + "learning_rate": 2.9953398938888878e-05, + "loss": 0.8617, + "step": 197710 + }, + { + "epoch": 1.2631767246336072, + "grad_norm": 
1.403054118156433, + "learning_rate": 2.9948802299431113e-05, + "loss": 0.8789, + "step": 197720 + }, + { + "epoch": 1.263240611783346, + "grad_norm": 0.8395767211914062, + "learning_rate": 2.994420586190625e-05, + "loss": 1.0282, + "step": 197730 + }, + { + "epoch": 1.2633044989330846, + "grad_norm": 1.5448747873306274, + "learning_rate": 2.9939609626360588e-05, + "loss": 0.8616, + "step": 197740 + }, + { + "epoch": 1.2633683860828233, + "grad_norm": 3.1555674076080322, + "learning_rate": 2.9935013592840423e-05, + "loss": 1.0513, + "step": 197750 + }, + { + "epoch": 1.263432273232562, + "grad_norm": 1.0118703842163086, + "learning_rate": 2.9930417761392015e-05, + "loss": 0.9609, + "step": 197760 + }, + { + "epoch": 1.2634961603823007, + "grad_norm": 2.897057294845581, + "learning_rate": 2.9925822132061677e-05, + "loss": 1.2673, + "step": 197770 + }, + { + "epoch": 1.2635600475320394, + "grad_norm": 0.9232097864151001, + "learning_rate": 2.9921226704895667e-05, + "loss": 0.9313, + "step": 197780 + }, + { + "epoch": 1.2636239346817781, + "grad_norm": 0.918526291847229, + "learning_rate": 2.9916631479940278e-05, + "loss": 0.8895, + "step": 197790 + }, + { + "epoch": 1.2636878218315168, + "grad_norm": 1.1380006074905396, + "learning_rate": 2.9912036457241788e-05, + "loss": 0.8376, + "step": 197800 + }, + { + "epoch": 1.2637517089812556, + "grad_norm": 0.881756603717804, + "learning_rate": 2.990744163684646e-05, + "loss": 1.0668, + "step": 197810 + }, + { + "epoch": 1.2638155961309943, + "grad_norm": 1.0361684560775757, + "learning_rate": 2.9902847018800584e-05, + "loss": 1.0976, + "step": 197820 + }, + { + "epoch": 1.263879483280733, + "grad_norm": 0.8286011219024658, + "learning_rate": 2.9898252603150424e-05, + "loss": 1.007, + "step": 197830 + }, + { + "epoch": 1.2639433704304717, + "grad_norm": 1.2168878316879272, + "learning_rate": 2.9893658389942252e-05, + "loss": 0.677, + "step": 197840 + }, + { + "epoch": 1.2640072575802104, + "grad_norm": 1.848334789276123, + 
"learning_rate": 2.9889064379222332e-05, + "loss": 0.9353, + "step": 197850 + }, + { + "epoch": 1.264071144729949, + "grad_norm": 0.9026688933372498, + "learning_rate": 2.9884470571036937e-05, + "loss": 0.9686, + "step": 197860 + }, + { + "epoch": 1.2641350318796878, + "grad_norm": 0.7856665849685669, + "learning_rate": 2.987987696543232e-05, + "loss": 0.7682, + "step": 197870 + }, + { + "epoch": 1.2641989190294265, + "grad_norm": 3.0376346111297607, + "learning_rate": 2.9875283562454748e-05, + "loss": 0.7571, + "step": 197880 + }, + { + "epoch": 1.2642628061791652, + "grad_norm": 0.8250278830528259, + "learning_rate": 2.987069036215049e-05, + "loss": 0.8977, + "step": 197890 + }, + { + "epoch": 1.264326693328904, + "grad_norm": 0.7953085899353027, + "learning_rate": 2.9866097364565783e-05, + "loss": 0.7127, + "step": 197900 + }, + { + "epoch": 1.2643905804786426, + "grad_norm": 0.7985715866088867, + "learning_rate": 2.98615045697469e-05, + "loss": 0.9748, + "step": 197910 + }, + { + "epoch": 1.264454467628381, + "grad_norm": 1.1099581718444824, + "learning_rate": 2.9856911977740088e-05, + "loss": 0.9364, + "step": 197920 + }, + { + "epoch": 1.26451835477812, + "grad_norm": 0.980712354183197, + "learning_rate": 2.9852319588591588e-05, + "loss": 0.7622, + "step": 197930 + }, + { + "epoch": 1.2645822419278585, + "grad_norm": 0.9018016457557678, + "learning_rate": 2.984772740234767e-05, + "loss": 0.7392, + "step": 197940 + }, + { + "epoch": 1.2646461290775974, + "grad_norm": 1.323249101638794, + "learning_rate": 2.984313541905459e-05, + "loss": 0.8726, + "step": 197950 + }, + { + "epoch": 1.264710016227336, + "grad_norm": 0.9077749848365784, + "learning_rate": 2.9838543638758554e-05, + "loss": 0.6612, + "step": 197960 + }, + { + "epoch": 1.2647739033770748, + "grad_norm": 0.6576750874519348, + "learning_rate": 2.9833952061505832e-05, + "loss": 0.7659, + "step": 197970 + }, + { + "epoch": 1.2648377905268133, + "grad_norm": 0.9078711867332458, + "learning_rate": 
2.982936068734265e-05, + "loss": 0.9886, + "step": 197980 + }, + { + "epoch": 1.2649016776765523, + "grad_norm": 1.642651081085205, + "learning_rate": 2.982476951631526e-05, + "loss": 0.9448, + "step": 197990 + }, + { + "epoch": 1.2649655648262907, + "grad_norm": 0.8564273118972778, + "learning_rate": 2.9820178548469896e-05, + "loss": 0.9367, + "step": 198000 + }, + { + "epoch": 1.2650294519760297, + "grad_norm": 1.8555928468704224, + "learning_rate": 2.9815587783852794e-05, + "loss": 0.9377, + "step": 198010 + }, + { + "epoch": 1.2650933391257682, + "grad_norm": 0.5984758138656616, + "learning_rate": 2.981099722251018e-05, + "loss": 0.7801, + "step": 198020 + }, + { + "epoch": 1.2651572262755069, + "grad_norm": 0.771589457988739, + "learning_rate": 2.980640686448829e-05, + "loss": 0.845, + "step": 198030 + }, + { + "epoch": 1.2652211134252456, + "grad_norm": 0.9007082581520081, + "learning_rate": 2.9801816709833353e-05, + "loss": 0.8925, + "step": 198040 + }, + { + "epoch": 1.2652850005749843, + "grad_norm": 1.667065978050232, + "learning_rate": 2.979722675859159e-05, + "loss": 0.8729, + "step": 198050 + }, + { + "epoch": 1.265348887724723, + "grad_norm": 1.2028627395629883, + "learning_rate": 2.979263701080924e-05, + "loss": 1.0106, + "step": 198060 + }, + { + "epoch": 1.2654127748744617, + "grad_norm": 1.319518804550171, + "learning_rate": 2.9788047466532515e-05, + "loss": 0.9549, + "step": 198070 + }, + { + "epoch": 1.2654766620242004, + "grad_norm": 0.5411072373390198, + "learning_rate": 2.978345812580764e-05, + "loss": 0.6616, + "step": 198080 + }, + { + "epoch": 1.265540549173939, + "grad_norm": 0.7059124708175659, + "learning_rate": 2.977886898868083e-05, + "loss": 0.6686, + "step": 198090 + }, + { + "epoch": 1.2656044363236778, + "grad_norm": 1.225595235824585, + "learning_rate": 2.97742800551983e-05, + "loss": 0.7636, + "step": 198100 + }, + { + "epoch": 1.2656683234734165, + "grad_norm": 1.0952637195587158, + "learning_rate": 2.9769691325406273e-05, + 
"loss": 1.007, + "step": 198110 + }, + { + "epoch": 1.2657322106231552, + "grad_norm": 0.895312488079071, + "learning_rate": 2.976510279935095e-05, + "loss": 0.8403, + "step": 198120 + }, + { + "epoch": 1.265796097772894, + "grad_norm": 0.6350535154342651, + "learning_rate": 2.9760514477078554e-05, + "loss": 0.7502, + "step": 198130 + }, + { + "epoch": 1.2658599849226326, + "grad_norm": 0.737232506275177, + "learning_rate": 2.975592635863529e-05, + "loss": 0.839, + "step": 198140 + }, + { + "epoch": 1.2659238720723713, + "grad_norm": 0.6077031493186951, + "learning_rate": 2.975133844406735e-05, + "loss": 0.7285, + "step": 198150 + }, + { + "epoch": 1.26598775922211, + "grad_norm": 0.9517359137535095, + "learning_rate": 2.9746750733420958e-05, + "loss": 0.7235, + "step": 198160 + }, + { + "epoch": 1.2660516463718487, + "grad_norm": 0.7495896220207214, + "learning_rate": 2.9742163226742304e-05, + "loss": 0.8919, + "step": 198170 + }, + { + "epoch": 1.2661155335215875, + "grad_norm": 2.226393699645996, + "learning_rate": 2.9737575924077593e-05, + "loss": 0.9916, + "step": 198180 + }, + { + "epoch": 1.2661794206713262, + "grad_norm": 0.949053168296814, + "learning_rate": 2.973298882547302e-05, + "loss": 0.6971, + "step": 198190 + }, + { + "epoch": 1.2662433078210649, + "grad_norm": 1.5963283777236938, + "learning_rate": 2.972840193097478e-05, + "loss": 0.8806, + "step": 198200 + }, + { + "epoch": 1.2663071949708036, + "grad_norm": 1.0817798376083374, + "learning_rate": 2.9723815240629083e-05, + "loss": 1.0479, + "step": 198210 + }, + { + "epoch": 1.2663710821205423, + "grad_norm": 0.6202261447906494, + "learning_rate": 2.9719228754482097e-05, + "loss": 0.6736, + "step": 198220 + }, + { + "epoch": 1.266434969270281, + "grad_norm": 1.1600061655044556, + "learning_rate": 2.9714642472580024e-05, + "loss": 0.9442, + "step": 198230 + }, + { + "epoch": 1.2664988564200197, + "grad_norm": 1.1812655925750732, + "learning_rate": 2.9710056394969056e-05, + "loss": 0.8706, + "step": 
198240 + }, + { + "epoch": 1.2665627435697584, + "grad_norm": 1.3576058149337769, + "learning_rate": 2.9705470521695368e-05, + "loss": 0.9426, + "step": 198250 + }, + { + "epoch": 1.266626630719497, + "grad_norm": 1.4901412725448608, + "learning_rate": 2.9700884852805133e-05, + "loss": 0.8986, + "step": 198260 + }, + { + "epoch": 1.2666905178692358, + "grad_norm": 0.6664626002311707, + "learning_rate": 2.9696299388344572e-05, + "loss": 1.1324, + "step": 198270 + }, + { + "epoch": 1.2667544050189745, + "grad_norm": 0.7542868256568909, + "learning_rate": 2.969171412835983e-05, + "loss": 0.9587, + "step": 198280 + }, + { + "epoch": 1.2668182921687132, + "grad_norm": 1.228543996810913, + "learning_rate": 2.96871290728971e-05, + "loss": 0.9034, + "step": 198290 + }, + { + "epoch": 1.266882179318452, + "grad_norm": 0.6784017086029053, + "learning_rate": 2.968254422200256e-05, + "loss": 0.6943, + "step": 198300 + }, + { + "epoch": 1.2669460664681906, + "grad_norm": 0.9931260347366333, + "learning_rate": 2.967795957572237e-05, + "loss": 0.9826, + "step": 198310 + }, + { + "epoch": 1.2670099536179293, + "grad_norm": 1.9696242809295654, + "learning_rate": 2.967337513410271e-05, + "loss": 0.8181, + "step": 198320 + }, + { + "epoch": 1.267073840767668, + "grad_norm": 2.127666473388672, + "learning_rate": 2.9668790897189748e-05, + "loss": 0.7966, + "step": 198330 + }, + { + "epoch": 1.2671377279174068, + "grad_norm": 0.9904428720474243, + "learning_rate": 2.9664206865029652e-05, + "loss": 0.8374, + "step": 198340 + }, + { + "epoch": 1.2672016150671455, + "grad_norm": 0.7887634038925171, + "learning_rate": 2.965962303766858e-05, + "loss": 0.8897, + "step": 198350 + }, + { + "epoch": 1.2672655022168842, + "grad_norm": 1.2394057512283325, + "learning_rate": 2.9655039415152708e-05, + "loss": 1.0418, + "step": 198360 + }, + { + "epoch": 1.2673293893666229, + "grad_norm": 1.0656827688217163, + "learning_rate": 2.965045599752818e-05, + "loss": 0.764, + "step": 198370 + }, + { + 
"epoch": 1.2673932765163616, + "grad_norm": 0.8545356392860413, + "learning_rate": 2.9645872784841176e-05, + "loss": 0.7794, + "step": 198380 + }, + { + "epoch": 1.2674571636661003, + "grad_norm": 0.9308279752731323, + "learning_rate": 2.964128977713784e-05, + "loss": 0.8551, + "step": 198390 + }, + { + "epoch": 1.267521050815839, + "grad_norm": 1.182393193244934, + "learning_rate": 2.9636706974464324e-05, + "loss": 0.7615, + "step": 198400 + }, + { + "epoch": 1.2675849379655775, + "grad_norm": 0.7643315196037292, + "learning_rate": 2.9632124376866787e-05, + "loss": 0.8983, + "step": 198410 + }, + { + "epoch": 1.2676488251153164, + "grad_norm": 0.9876617789268494, + "learning_rate": 2.9627541984391377e-05, + "loss": 0.9168, + "step": 198420 + }, + { + "epoch": 1.2677127122650549, + "grad_norm": 1.2725698947906494, + "learning_rate": 2.962295979708426e-05, + "loss": 0.8674, + "step": 198430 + }, + { + "epoch": 1.2677765994147938, + "grad_norm": 0.755490243434906, + "learning_rate": 2.961837781499155e-05, + "loss": 0.8002, + "step": 198440 + }, + { + "epoch": 1.2678404865645323, + "grad_norm": 0.9564643502235413, + "learning_rate": 2.9613796038159407e-05, + "loss": 0.7711, + "step": 198450 + }, + { + "epoch": 1.2679043737142712, + "grad_norm": 0.7563437223434448, + "learning_rate": 2.9609214466633984e-05, + "loss": 0.8209, + "step": 198460 + }, + { + "epoch": 1.2679682608640097, + "grad_norm": 1.0872384309768677, + "learning_rate": 2.96046331004614e-05, + "loss": 0.9514, + "step": 198470 + }, + { + "epoch": 1.2680321480137486, + "grad_norm": 0.8535314798355103, + "learning_rate": 2.9600051939687812e-05, + "loss": 0.914, + "step": 198480 + }, + { + "epoch": 1.2680960351634871, + "grad_norm": 1.0216346979141235, + "learning_rate": 2.9595470984359352e-05, + "loss": 0.9734, + "step": 198490 + }, + { + "epoch": 1.268159922313226, + "grad_norm": 1.756752610206604, + "learning_rate": 2.9590890234522147e-05, + "loss": 0.9822, + "step": 198500 + }, + { + "epoch": 
1.2682238094629645, + "grad_norm": 1.0920852422714233, + "learning_rate": 2.9586309690222337e-05, + "loss": 1.1895, + "step": 198510 + }, + { + "epoch": 1.2682876966127032, + "grad_norm": 0.9064481854438782, + "learning_rate": 2.9581729351506036e-05, + "loss": 0.8215, + "step": 198520 + }, + { + "epoch": 1.268351583762442, + "grad_norm": 0.8382233381271362, + "learning_rate": 2.9577149218419398e-05, + "loss": 0.9083, + "step": 198530 + }, + { + "epoch": 1.2684154709121807, + "grad_norm": 1.0321544408798218, + "learning_rate": 2.9572569291008534e-05, + "loss": 1.1281, + "step": 198540 + }, + { + "epoch": 1.2684793580619194, + "grad_norm": 0.7525801658630371, + "learning_rate": 2.956798956931957e-05, + "loss": 1.0749, + "step": 198550 + }, + { + "epoch": 1.268543245211658, + "grad_norm": 0.8359514474868774, + "learning_rate": 2.956341005339862e-05, + "loss": 0.9472, + "step": 198560 + }, + { + "epoch": 1.2686071323613968, + "grad_norm": 0.6227117776870728, + "learning_rate": 2.9558830743291822e-05, + "loss": 0.8116, + "step": 198570 + }, + { + "epoch": 1.2686710195111355, + "grad_norm": 1.3645145893096924, + "learning_rate": 2.9554251639045266e-05, + "loss": 0.9916, + "step": 198580 + }, + { + "epoch": 1.2687349066608742, + "grad_norm": 0.8045088052749634, + "learning_rate": 2.954967274070509e-05, + "loss": 0.7611, + "step": 198590 + }, + { + "epoch": 1.268798793810613, + "grad_norm": 0.8705744743347168, + "learning_rate": 2.9545094048317412e-05, + "loss": 0.9175, + "step": 198600 + }, + { + "epoch": 1.2688626809603516, + "grad_norm": 0.8257783055305481, + "learning_rate": 2.954051556192833e-05, + "loss": 0.7481, + "step": 198610 + }, + { + "epoch": 1.2689265681100903, + "grad_norm": 1.0180158615112305, + "learning_rate": 2.9535937281583947e-05, + "loss": 0.9748, + "step": 198620 + }, + { + "epoch": 1.268990455259829, + "grad_norm": 0.809568464756012, + "learning_rate": 2.953135920733039e-05, + "loss": 0.8703, + "step": 198630 + }, + { + "epoch": 1.2690543424095677, 
+ "grad_norm": 1.0721912384033203, + "learning_rate": 2.952678133921375e-05, + "loss": 0.8546, + "step": 198640 + }, + { + "epoch": 1.2691182295593064, + "grad_norm": 1.1542046070098877, + "learning_rate": 2.9522203677280136e-05, + "loss": 0.7243, + "step": 198650 + }, + { + "epoch": 1.2691821167090451, + "grad_norm": 0.7112246751785278, + "learning_rate": 2.9517626221575645e-05, + "loss": 1.0079, + "step": 198660 + }, + { + "epoch": 1.2692460038587838, + "grad_norm": 0.9132784008979797, + "learning_rate": 2.9513048972146373e-05, + "loss": 0.8804, + "step": 198670 + }, + { + "epoch": 1.2693098910085225, + "grad_norm": 0.778862476348877, + "learning_rate": 2.950847192903843e-05, + "loss": 0.9576, + "step": 198680 + }, + { + "epoch": 1.2693737781582612, + "grad_norm": 0.9121737480163574, + "learning_rate": 2.9503895092297894e-05, + "loss": 0.9902, + "step": 198690 + }, + { + "epoch": 1.269437665308, + "grad_norm": 0.9337990880012512, + "learning_rate": 2.949931846197087e-05, + "loss": 0.9189, + "step": 198700 + }, + { + "epoch": 1.2695015524577387, + "grad_norm": 1.054032325744629, + "learning_rate": 2.9494742038103444e-05, + "loss": 0.8774, + "step": 198710 + }, + { + "epoch": 1.2695654396074774, + "grad_norm": 0.9600640535354614, + "learning_rate": 2.949016582074171e-05, + "loss": 0.6799, + "step": 198720 + }, + { + "epoch": 1.269629326757216, + "grad_norm": 2.320675849914551, + "learning_rate": 2.9485589809931746e-05, + "loss": 0.714, + "step": 198730 + }, + { + "epoch": 1.2696932139069548, + "grad_norm": 0.8881000876426697, + "learning_rate": 2.9481014005719644e-05, + "loss": 0.9684, + "step": 198740 + }, + { + "epoch": 1.2697571010566935, + "grad_norm": null, + "learning_rate": 2.9476895958607998e-05, + "loss": 0.995, + "step": 198750 + }, + { + "epoch": 1.2698209882064322, + "grad_norm": 0.66578608751297, + "learning_rate": 2.9472320547058774e-05, + "loss": 0.9148, + "step": 198760 + }, + { + "epoch": 1.269884875356171, + "grad_norm": 0.7851095795631409, + 
"learning_rate": 2.946774534224105e-05, + "loss": 1.02, + "step": 198770 + }, + { + "epoch": 1.2699487625059096, + "grad_norm": 0.8484801650047302, + "learning_rate": 2.9463170344200885e-05, + "loss": 0.8683, + "step": 198780 + }, + { + "epoch": 1.2700126496556483, + "grad_norm": 1.1238106489181519, + "learning_rate": 2.9458595552984368e-05, + "loss": 0.7311, + "step": 198790 + }, + { + "epoch": 1.270076536805387, + "grad_norm": 0.7110521197319031, + "learning_rate": 2.945402096863756e-05, + "loss": 0.6694, + "step": 198800 + }, + { + "epoch": 1.2701404239551257, + "grad_norm": 1.1236708164215088, + "learning_rate": 2.9449446591206536e-05, + "loss": 0.7267, + "step": 198810 + }, + { + "epoch": 1.2702043111048644, + "grad_norm": 1.026677131652832, + "learning_rate": 2.9444872420737362e-05, + "loss": 0.9999, + "step": 198820 + }, + { + "epoch": 1.2702681982546031, + "grad_norm": 1.0339289903640747, + "learning_rate": 2.9440298457276105e-05, + "loss": 0.7496, + "step": 198830 + }, + { + "epoch": 1.2703320854043418, + "grad_norm": 0.7419828176498413, + "learning_rate": 2.943572470086884e-05, + "loss": 1.0398, + "step": 198840 + }, + { + "epoch": 1.2703959725540805, + "grad_norm": 1.2208056449890137, + "learning_rate": 2.9431151151561607e-05, + "loss": 0.6542, + "step": 198850 + }, + { + "epoch": 1.2704598597038192, + "grad_norm": 1.2347742319107056, + "learning_rate": 2.942657780940048e-05, + "loss": 0.9713, + "step": 198860 + }, + { + "epoch": 1.270523746853558, + "grad_norm": 0.9808398485183716, + "learning_rate": 2.9422004674431514e-05, + "loss": 0.8772, + "step": 198870 + }, + { + "epoch": 1.2705876340032964, + "grad_norm": 0.9095727801322937, + "learning_rate": 2.941743174670076e-05, + "loss": 1.0907, + "step": 198880 + }, + { + "epoch": 1.2706515211530354, + "grad_norm": 0.946310818195343, + "learning_rate": 2.9412859026254276e-05, + "loss": 0.7978, + "step": 198890 + }, + { + "epoch": 1.2707154083027739, + "grad_norm": 1.1127965450286865, + "learning_rate": 
2.9408286513138102e-05, + "loss": 1.0265, + "step": 198900 + }, + { + "epoch": 1.2707792954525128, + "grad_norm": 0.7210429310798645, + "learning_rate": 2.9403714207398303e-05, + "loss": 0.8068, + "step": 198910 + }, + { + "epoch": 1.2708431826022513, + "grad_norm": 1.0879623889923096, + "learning_rate": 2.939914210908093e-05, + "loss": 0.852, + "step": 198920 + }, + { + "epoch": 1.2709070697519902, + "grad_norm": 1.1112303733825684, + "learning_rate": 2.939457021823201e-05, + "loss": 1.1495, + "step": 198930 + }, + { + "epoch": 1.2709709569017287, + "grad_norm": 0.6538977026939392, + "learning_rate": 2.93899985348976e-05, + "loss": 0.9615, + "step": 198940 + }, + { + "epoch": 1.2710348440514676, + "grad_norm": 1.4446771144866943, + "learning_rate": 2.9385427059123732e-05, + "loss": 1.1292, + "step": 198950 + }, + { + "epoch": 1.271098731201206, + "grad_norm": 1.2213329076766968, + "learning_rate": 2.9380855790956448e-05, + "loss": 0.9452, + "step": 198960 + }, + { + "epoch": 1.271162618350945, + "grad_norm": 0.6651431322097778, + "learning_rate": 2.9376284730441784e-05, + "loss": 0.957, + "step": 198970 + }, + { + "epoch": 1.2712265055006835, + "grad_norm": 1.4860223531723022, + "learning_rate": 2.9371713877625772e-05, + "loss": 0.8377, + "step": 198980 + }, + { + "epoch": 1.2712903926504224, + "grad_norm": 0.955312192440033, + "learning_rate": 2.936714323255445e-05, + "loss": 0.9017, + "step": 198990 + }, + { + "epoch": 1.271354279800161, + "grad_norm": 0.6875607967376709, + "learning_rate": 2.9362572795273846e-05, + "loss": 0.6367, + "step": 199000 + }, + { + "epoch": 1.2714181669498996, + "grad_norm": 0.8261491060256958, + "learning_rate": 2.935800256582999e-05, + "loss": 0.8404, + "step": 199010 + }, + { + "epoch": 1.2714820540996383, + "grad_norm": 1.1034409999847412, + "learning_rate": 2.93534325442689e-05, + "loss": 0.7114, + "step": 199020 + }, + { + "epoch": 1.271545941249377, + "grad_norm": 0.9923743605613708, + "learning_rate": 2.9348862730636616e-05, + 
"loss": 0.9492, + "step": 199030 + }, + { + "epoch": 1.2716098283991157, + "grad_norm": 1.0676493644714355, + "learning_rate": 2.934429312497914e-05, + "loss": 0.8737, + "step": 199040 + }, + { + "epoch": 1.2716737155488544, + "grad_norm": 1.1333630084991455, + "learning_rate": 2.9339723727342505e-05, + "loss": 1.0908, + "step": 199050 + }, + { + "epoch": 1.2717376026985932, + "grad_norm": 3.124783515930176, + "learning_rate": 2.933515453777273e-05, + "loss": 0.8235, + "step": 199060 + }, + { + "epoch": 1.2718014898483319, + "grad_norm": 1.512219786643982, + "learning_rate": 2.9330585556315833e-05, + "loss": 0.8718, + "step": 199070 + }, + { + "epoch": 1.2718653769980706, + "grad_norm": 0.9839432239532471, + "learning_rate": 2.9326016783017806e-05, + "loss": 0.9304, + "step": 199080 + }, + { + "epoch": 1.2719292641478093, + "grad_norm": 0.8138784766197205, + "learning_rate": 2.9321448217924686e-05, + "loss": 0.884, + "step": 199090 + }, + { + "epoch": 1.271993151297548, + "grad_norm": 0.846782386302948, + "learning_rate": 2.931687986108247e-05, + "loss": 0.9937, + "step": 199100 + }, + { + "epoch": 1.2720570384472867, + "grad_norm": 1.5242908000946045, + "learning_rate": 2.931231171253716e-05, + "loss": 0.8606, + "step": 199110 + }, + { + "epoch": 1.2721209255970254, + "grad_norm": 0.8409215211868286, + "learning_rate": 2.9307743772334773e-05, + "loss": 1.0581, + "step": 199120 + }, + { + "epoch": 1.272184812746764, + "grad_norm": 0.7096886038780212, + "learning_rate": 2.9303176040521306e-05, + "loss": 0.6941, + "step": 199130 + }, + { + "epoch": 1.2722486998965028, + "grad_norm": 0.7271231412887573, + "learning_rate": 2.9298608517142762e-05, + "loss": 0.9551, + "step": 199140 + }, + { + "epoch": 1.2723125870462415, + "grad_norm": 0.9017435312271118, + "learning_rate": 2.929404120224514e-05, + "loss": 1.4412, + "step": 199150 + }, + { + "epoch": 1.2723764741959802, + "grad_norm": 2.8430840969085693, + "learning_rate": 2.9289474095874436e-05, + "loss": 0.8965, + 
"step": 199160 + }, + { + "epoch": 1.272440361345719, + "grad_norm": 0.8325692415237427, + "learning_rate": 2.9284907198076643e-05, + "loss": 0.7069, + "step": 199170 + }, + { + "epoch": 1.2725042484954576, + "grad_norm": 0.8542130589485168, + "learning_rate": 2.9280340508897765e-05, + "loss": 0.8619, + "step": 199180 + }, + { + "epoch": 1.2725681356451963, + "grad_norm": 0.831527829170227, + "learning_rate": 2.9275774028383773e-05, + "loss": 0.8887, + "step": 199190 + }, + { + "epoch": 1.272632022794935, + "grad_norm": 1.026160478591919, + "learning_rate": 2.9271207756580665e-05, + "loss": 0.8894, + "step": 199200 + }, + { + "epoch": 1.2726959099446737, + "grad_norm": 0.9291976094245911, + "learning_rate": 2.9266641693534437e-05, + "loss": 0.9255, + "step": 199210 + }, + { + "epoch": 1.2727597970944124, + "grad_norm": 0.7681089043617249, + "learning_rate": 2.9262075839291046e-05, + "loss": 0.7491, + "step": 199220 + }, + { + "epoch": 1.2728236842441512, + "grad_norm": 1.4214307069778442, + "learning_rate": 2.9257510193896504e-05, + "loss": 0.8776, + "step": 199230 + }, + { + "epoch": 1.2728875713938899, + "grad_norm": 1.1611994504928589, + "learning_rate": 2.9252944757396776e-05, + "loss": 0.8153, + "step": 199240 + }, + { + "epoch": 1.2729514585436286, + "grad_norm": 1.6443861722946167, + "learning_rate": 2.924837952983785e-05, + "loss": 0.7736, + "step": 199250 + }, + { + "epoch": 1.2730153456933673, + "grad_norm": 0.7098362445831299, + "learning_rate": 2.9243814511265686e-05, + "loss": 0.8077, + "step": 199260 + }, + { + "epoch": 1.273079232843106, + "grad_norm": 0.9944263696670532, + "learning_rate": 2.923924970172628e-05, + "loss": 0.8855, + "step": 199270 + }, + { + "epoch": 1.2731431199928447, + "grad_norm": 0.6278926134109497, + "learning_rate": 2.923468510126558e-05, + "loss": 0.6761, + "step": 199280 + }, + { + "epoch": 1.2732070071425834, + "grad_norm": 1.4945859909057617, + "learning_rate": 2.9230120709929567e-05, + "loss": 1.3176, + "step": 199290 + 
}, + { + "epoch": 1.273270894292322, + "grad_norm": 0.8660402894020081, + "learning_rate": 2.922555652776421e-05, + "loss": 0.8399, + "step": 199300 + }, + { + "epoch": 1.2733347814420608, + "grad_norm": 0.6661750674247742, + "learning_rate": 2.922099255481547e-05, + "loss": 0.8871, + "step": 199310 + }, + { + "epoch": 1.2733986685917995, + "grad_norm": 1.4136496782302856, + "learning_rate": 2.921642879112931e-05, + "loss": 0.6499, + "step": 199320 + }, + { + "epoch": 1.2734625557415382, + "grad_norm": 2.2149500846862793, + "learning_rate": 2.921186523675169e-05, + "loss": 0.7559, + "step": 199330 + }, + { + "epoch": 1.273526442891277, + "grad_norm": 0.7235051989555359, + "learning_rate": 2.920730189172858e-05, + "loss": 0.8331, + "step": 199340 + }, + { + "epoch": 1.2735903300410156, + "grad_norm": 0.7026883363723755, + "learning_rate": 2.920273875610592e-05, + "loss": 0.7484, + "step": 199350 + }, + { + "epoch": 1.2736542171907543, + "grad_norm": 1.0162301063537598, + "learning_rate": 2.9198175829929674e-05, + "loss": 0.9959, + "step": 199360 + }, + { + "epoch": 1.2737181043404928, + "grad_norm": 0.7359153628349304, + "learning_rate": 2.9193613113245794e-05, + "loss": 0.8109, + "step": 199370 + }, + { + "epoch": 1.2737819914902317, + "grad_norm": 0.8668492436408997, + "learning_rate": 2.918905060610022e-05, + "loss": 1.1629, + "step": 199380 + }, + { + "epoch": 1.2738458786399702, + "grad_norm": 1.0416288375854492, + "learning_rate": 2.9184488308538933e-05, + "loss": 1.0551, + "step": 199390 + }, + { + "epoch": 1.2739097657897092, + "grad_norm": 1.0758754014968872, + "learning_rate": 2.9179926220607833e-05, + "loss": 0.8616, + "step": 199400 + }, + { + "epoch": 1.2739736529394476, + "grad_norm": 1.3386660814285278, + "learning_rate": 2.9175364342352906e-05, + "loss": 0.8793, + "step": 199410 + }, + { + "epoch": 1.2740375400891866, + "grad_norm": 0.882012665271759, + "learning_rate": 2.9170802673820064e-05, + "loss": 0.808, + "step": 199420 + }, + { + "epoch": 
1.274101427238925, + "grad_norm": 0.8384029865264893, + "learning_rate": 2.916624121505528e-05, + "loss": 1.3155, + "step": 199430 + }, + { + "epoch": 1.274165314388664, + "grad_norm": 1.40079665184021, + "learning_rate": 2.916167996610444e-05, + "loss": 0.7067, + "step": 199440 + }, + { + "epoch": 1.2742292015384025, + "grad_norm": 0.5675820112228394, + "learning_rate": 2.9157118927013537e-05, + "loss": 0.6397, + "step": 199450 + }, + { + "epoch": 1.2742930886881414, + "grad_norm": 1.4623277187347412, + "learning_rate": 2.9152558097828454e-05, + "loss": 0.7759, + "step": 199460 + }, + { + "epoch": 1.2743569758378799, + "grad_norm": 0.9035980701446533, + "learning_rate": 2.914799747859517e-05, + "loss": 0.9244, + "step": 199470 + }, + { + "epoch": 1.2744208629876188, + "grad_norm": 1.0479638576507568, + "learning_rate": 2.9143437069359568e-05, + "loss": 0.6907, + "step": 199480 + }, + { + "epoch": 1.2744847501373573, + "grad_norm": 1.0720633268356323, + "learning_rate": 2.9138876870167624e-05, + "loss": 1.023, + "step": 199490 + }, + { + "epoch": 1.274548637287096, + "grad_norm": 1.4346104860305786, + "learning_rate": 2.9134316881065217e-05, + "loss": 0.9242, + "step": 199500 + }, + { + "epoch": 1.2746125244368347, + "grad_norm": 0.823266863822937, + "learning_rate": 2.9129757102098305e-05, + "loss": 0.7421, + "step": 199510 + }, + { + "epoch": 1.2746764115865734, + "grad_norm": 1.276611566543579, + "learning_rate": 2.9125197533312776e-05, + "loss": 1.0202, + "step": 199520 + }, + { + "epoch": 1.2747402987363121, + "grad_norm": 1.1702380180358887, + "learning_rate": 2.9120638174754567e-05, + "loss": 1.0277, + "step": 199530 + }, + { + "epoch": 1.2748041858860508, + "grad_norm": 1.1897376775741577, + "learning_rate": 2.9116079026469617e-05, + "loss": 1.0578, + "step": 199540 + }, + { + "epoch": 1.2748680730357895, + "grad_norm": 0.6272988319396973, + "learning_rate": 2.911152008850382e-05, + "loss": 1.0736, + "step": 199550 + }, + { + "epoch": 1.2749319601855282, + 
"grad_norm": 1.393921136856079, + "learning_rate": 2.9106961360903084e-05, + "loss": 0.923, + "step": 199560 + }, + { + "epoch": 1.274995847335267, + "grad_norm": 0.8716452121734619, + "learning_rate": 2.91024028437133e-05, + "loss": 0.6029, + "step": 199570 + }, + { + "epoch": 1.2750597344850056, + "grad_norm": 2.0013890266418457, + "learning_rate": 2.9097844536980425e-05, + "loss": 0.7438, + "step": 199580 + }, + { + "epoch": 1.2751236216347444, + "grad_norm": 0.7331926822662354, + "learning_rate": 2.909328644075031e-05, + "loss": 0.9836, + "step": 199590 + }, + { + "epoch": 1.275187508784483, + "grad_norm": 0.9569740891456604, + "learning_rate": 2.908872855506891e-05, + "loss": 1.0353, + "step": 199600 + }, + { + "epoch": 1.2752513959342218, + "grad_norm": 0.787571907043457, + "learning_rate": 2.9084170879982088e-05, + "loss": 0.8915, + "step": 199610 + }, + { + "epoch": 1.2753152830839605, + "grad_norm": 1.9850256443023682, + "learning_rate": 2.9079613415535777e-05, + "loss": 0.9739, + "step": 199620 + }, + { + "epoch": 1.2753791702336992, + "grad_norm": 1.550048828125, + "learning_rate": 2.9075056161775837e-05, + "loss": 0.9745, + "step": 199630 + }, + { + "epoch": 1.2754430573834379, + "grad_norm": 0.9888909459114075, + "learning_rate": 2.9070499118748208e-05, + "loss": 0.5989, + "step": 199640 + }, + { + "epoch": 1.2755069445331766, + "grad_norm": 1.1229398250579834, + "learning_rate": 2.906594228649873e-05, + "loss": 1.1172, + "step": 199650 + }, + { + "epoch": 1.2755708316829153, + "grad_norm": 0.9776995182037354, + "learning_rate": 2.906138566507333e-05, + "loss": 0.9073, + "step": 199660 + }, + { + "epoch": 1.275634718832654, + "grad_norm": 1.1696735620498657, + "learning_rate": 2.9056829254517916e-05, + "loss": 0.723, + "step": 199670 + }, + { + "epoch": 1.2756986059823927, + "grad_norm": 1.5058003664016724, + "learning_rate": 2.9052273054878322e-05, + "loss": 1.1077, + "step": 199680 + }, + { + "epoch": 1.2757624931321314, + "grad_norm": 
1.1655230522155762, + "learning_rate": 2.9047717066200486e-05, + "loss": 1.0817, + "step": 199690 + }, + { + "epoch": 1.2758263802818701, + "grad_norm": 0.8833178281784058, + "learning_rate": 2.904316128853024e-05, + "loss": 0.6168, + "step": 199700 + }, + { + "epoch": 1.2758902674316088, + "grad_norm": 0.8839700222015381, + "learning_rate": 2.9038605721913513e-05, + "loss": 0.7247, + "step": 199710 + }, + { + "epoch": 1.2759541545813475, + "grad_norm": 1.2364578247070312, + "learning_rate": 2.9034050366396143e-05, + "loss": 0.9347, + "step": 199720 + }, + { + "epoch": 1.2760180417310862, + "grad_norm": 0.8872677683830261, + "learning_rate": 2.902949522202404e-05, + "loss": 0.786, + "step": 199730 + }, + { + "epoch": 1.276081928880825, + "grad_norm": 0.7343273758888245, + "learning_rate": 2.902494028884305e-05, + "loss": 1.0002, + "step": 199740 + }, + { + "epoch": 1.2761458160305637, + "grad_norm": 1.0674399137496948, + "learning_rate": 2.9020385566899067e-05, + "loss": 0.7747, + "step": 199750 + }, + { + "epoch": 1.2762097031803024, + "grad_norm": 1.0332404375076294, + "learning_rate": 2.9015831056237935e-05, + "loss": 0.82, + "step": 199760 + }, + { + "epoch": 1.276273590330041, + "grad_norm": 1.2516356706619263, + "learning_rate": 2.9011276756905557e-05, + "loss": 1.0702, + "step": 199770 + }, + { + "epoch": 1.2763374774797798, + "grad_norm": 0.9094335436820984, + "learning_rate": 2.900672266894776e-05, + "loss": 1.0714, + "step": 199780 + }, + { + "epoch": 1.2764013646295185, + "grad_norm": 0.7820824980735779, + "learning_rate": 2.9002168792410456e-05, + "loss": 0.9105, + "step": 199790 + }, + { + "epoch": 1.2764652517792572, + "grad_norm": 0.857791006565094, + "learning_rate": 2.899761512733945e-05, + "loss": 0.7399, + "step": 199800 + }, + { + "epoch": 1.2765291389289959, + "grad_norm": 0.8450620770454407, + "learning_rate": 2.8993061673780654e-05, + "loss": 0.6665, + "step": 199810 + }, + { + "epoch": 1.2765930260787346, + "grad_norm": 0.7609385251998901, + 
"learning_rate": 2.898850843177987e-05, + "loss": 0.8598, + "step": 199820 + }, + { + "epoch": 1.2766569132284733, + "grad_norm": 4.130343914031982, + "learning_rate": 2.898395540138301e-05, + "loss": 0.8879, + "step": 199830 + }, + { + "epoch": 1.276720800378212, + "grad_norm": 1.2083723545074463, + "learning_rate": 2.8979402582635883e-05, + "loss": 0.747, + "step": 199840 + }, + { + "epoch": 1.2767846875279507, + "grad_norm": 1.0473322868347168, + "learning_rate": 2.8974849975584356e-05, + "loss": 0.8937, + "step": 199850 + }, + { + "epoch": 1.2768485746776892, + "grad_norm": 2.8024537563323975, + "learning_rate": 2.8970297580274298e-05, + "loss": 0.9525, + "step": 199860 + }, + { + "epoch": 1.2769124618274281, + "grad_norm": 0.6386640071868896, + "learning_rate": 2.896574539675152e-05, + "loss": 0.6814, + "step": 199870 + }, + { + "epoch": 1.2769763489771666, + "grad_norm": 0.840392529964447, + "learning_rate": 2.8961193425061893e-05, + "loss": 0.8324, + "step": 199880 + }, + { + "epoch": 1.2770402361269055, + "grad_norm": 1.0705910921096802, + "learning_rate": 2.895664166525124e-05, + "loss": 0.9371, + "step": 199890 + }, + { + "epoch": 1.277104123276644, + "grad_norm": 1.714959979057312, + "learning_rate": 2.8952090117365427e-05, + "loss": 1.107, + "step": 199900 + }, + { + "epoch": 1.277168010426383, + "grad_norm": 1.1649478673934937, + "learning_rate": 2.8947538781450257e-05, + "loss": 0.9453, + "step": 199910 + }, + { + "epoch": 1.2772318975761214, + "grad_norm": 2.98532772064209, + "learning_rate": 2.89429876575516e-05, + "loss": 0.9335, + "step": 199920 + }, + { + "epoch": 1.2772957847258604, + "grad_norm": 1.2889913320541382, + "learning_rate": 2.893843674571526e-05, + "loss": 0.8936, + "step": 199930 + }, + { + "epoch": 1.2773596718755988, + "grad_norm": 0.9181677103042603, + "learning_rate": 2.8933886045987102e-05, + "loss": 0.76, + "step": 199940 + }, + { + "epoch": 1.2774235590253378, + "grad_norm": 1.176107406616211, + "learning_rate": 
2.8929335558412918e-05, + "loss": 0.8422, + "step": 199950 + }, + { + "epoch": 1.2774874461750763, + "grad_norm": 1.2638664245605469, + "learning_rate": 2.892478528303857e-05, + "loss": 0.8383, + "step": 199960 + }, + { + "epoch": 1.277551333324815, + "grad_norm": 1.9292041063308716, + "learning_rate": 2.8920235219909842e-05, + "loss": 1.1438, + "step": 199970 + }, + { + "epoch": 1.2776152204745537, + "grad_norm": 0.9688105583190918, + "learning_rate": 2.8915685369072608e-05, + "loss": 1.0188, + "step": 199980 + }, + { + "epoch": 1.2776791076242924, + "grad_norm": 0.9021857380867004, + "learning_rate": 2.8911135730572643e-05, + "loss": 0.698, + "step": 199990 + }, + { + "epoch": 1.277742994774031, + "grad_norm": 1.147022008895874, + "learning_rate": 2.890658630445581e-05, + "loss": 0.8454, + "step": 200000 + }, + { + "epoch": 1.2778068819237698, + "grad_norm": 0.6684355139732361, + "learning_rate": 2.890203709076787e-05, + "loss": 0.8473, + "step": 200010 + }, + { + "epoch": 1.2778707690735085, + "grad_norm": 4.026229381561279, + "learning_rate": 2.8897488089554692e-05, + "loss": 0.7693, + "step": 200020 + }, + { + "epoch": 1.2779346562232472, + "grad_norm": 1.4114011526107788, + "learning_rate": 2.889293930086205e-05, + "loss": 1.0826, + "step": 200030 + }, + { + "epoch": 1.277998543372986, + "grad_norm": 1.0624281167984009, + "learning_rate": 2.8888390724735788e-05, + "loss": 0.7916, + "step": 200040 + }, + { + "epoch": 1.2780624305227246, + "grad_norm": 0.7880474328994751, + "learning_rate": 2.888384236122169e-05, + "loss": 0.9133, + "step": 200050 + }, + { + "epoch": 1.2781263176724633, + "grad_norm": 1.1904906034469604, + "learning_rate": 2.887929421036556e-05, + "loss": 0.9532, + "step": 200060 + }, + { + "epoch": 1.278190204822202, + "grad_norm": 0.9358816146850586, + "learning_rate": 2.8874746272213217e-05, + "loss": 1.1367, + "step": 200070 + }, + { + "epoch": 1.2782540919719407, + "grad_norm": 0.8555280566215515, + "learning_rate": 2.887019854681044e-05, 
+ "loss": 0.7676, + "step": 200080 + }, + { + "epoch": 1.2783179791216794, + "grad_norm": 0.6473132371902466, + "learning_rate": 2.8865651034203068e-05, + "loss": 0.9187, + "step": 200090 + }, + { + "epoch": 1.2783818662714181, + "grad_norm": 5.737834453582764, + "learning_rate": 2.8861103734436846e-05, + "loss": 0.7291, + "step": 200100 + }, + { + "epoch": 1.2784457534211568, + "grad_norm": 0.8243213891983032, + "learning_rate": 2.885655664755762e-05, + "loss": 0.8604, + "step": 200110 + }, + { + "epoch": 1.2785096405708956, + "grad_norm": 1.6044676303863525, + "learning_rate": 2.8852009773611137e-05, + "loss": 0.7958, + "step": 200120 + }, + { + "epoch": 1.2785735277206343, + "grad_norm": 0.990822434425354, + "learning_rate": 2.8847463112643236e-05, + "loss": 1.2024, + "step": 200130 + }, + { + "epoch": 1.278637414870373, + "grad_norm": 0.9312714338302612, + "learning_rate": 2.884291666469966e-05, + "loss": 0.8222, + "step": 200140 + }, + { + "epoch": 1.2787013020201117, + "grad_norm": 0.9203009605407715, + "learning_rate": 2.8838370429826235e-05, + "loss": 0.8486, + "step": 200150 + }, + { + "epoch": 1.2787651891698504, + "grad_norm": 0.8465953469276428, + "learning_rate": 2.883382440806871e-05, + "loss": 0.7348, + "step": 200160 + }, + { + "epoch": 1.278829076319589, + "grad_norm": 2.1238362789154053, + "learning_rate": 2.8829278599472903e-05, + "loss": 0.9623, + "step": 200170 + }, + { + "epoch": 1.2788929634693278, + "grad_norm": 0.8500341773033142, + "learning_rate": 2.8824733004084558e-05, + "loss": 0.8904, + "step": 200180 + }, + { + "epoch": 1.2789568506190665, + "grad_norm": 0.94657963514328, + "learning_rate": 2.882018762194947e-05, + "loss": 0.7947, + "step": 200190 + }, + { + "epoch": 1.2790207377688052, + "grad_norm": 0.8682937622070312, + "learning_rate": 2.8815642453113435e-05, + "loss": 0.9038, + "step": 200200 + }, + { + "epoch": 1.279084624918544, + "grad_norm": 0.8084182739257812, + "learning_rate": 2.8811097497622185e-05, + "loss": 0.9295, + 
"step": 200210 + }, + { + "epoch": 1.2791485120682826, + "grad_norm": 1.071580171585083, + "learning_rate": 2.8806552755521532e-05, + "loss": 0.8428, + "step": 200220 + }, + { + "epoch": 1.2792123992180213, + "grad_norm": 1.3232358694076538, + "learning_rate": 2.8802008226857214e-05, + "loss": 1.0202, + "step": 200230 + }, + { + "epoch": 1.27927628636776, + "grad_norm": 1.0317996740341187, + "learning_rate": 2.8797463911675028e-05, + "loss": 1.0649, + "step": 200240 + }, + { + "epoch": 1.2793401735174987, + "grad_norm": 1.255834937095642, + "learning_rate": 2.8792919810020706e-05, + "loss": 0.7789, + "step": 200250 + }, + { + "epoch": 1.2794040606672374, + "grad_norm": 1.0513917207717896, + "learning_rate": 2.8788375921940047e-05, + "loss": 0.9803, + "step": 200260 + }, + { + "epoch": 1.2794679478169761, + "grad_norm": 0.9030227065086365, + "learning_rate": 2.8783832247478776e-05, + "loss": 0.9298, + "step": 200270 + }, + { + "epoch": 1.2795318349667149, + "grad_norm": 0.9306374788284302, + "learning_rate": 2.8779288786682685e-05, + "loss": 0.8644, + "step": 200280 + }, + { + "epoch": 1.2795957221164536, + "grad_norm": 1.2579879760742188, + "learning_rate": 2.8774745539597498e-05, + "loss": 0.7549, + "step": 200290 + }, + { + "epoch": 1.2796596092661923, + "grad_norm": 1.0946049690246582, + "learning_rate": 2.8770202506269007e-05, + "loss": 1.0356, + "step": 200300 + }, + { + "epoch": 1.279723496415931, + "grad_norm": 0.580696702003479, + "learning_rate": 2.876565968674292e-05, + "loss": 0.9136, + "step": 200310 + }, + { + "epoch": 1.2797873835656697, + "grad_norm": 0.6239465475082397, + "learning_rate": 2.8761117081065025e-05, + "loss": 0.6196, + "step": 200320 + }, + { + "epoch": 1.2798512707154084, + "grad_norm": 1.3467539548873901, + "learning_rate": 2.875657468928104e-05, + "loss": 0.7165, + "step": 200330 + }, + { + "epoch": 1.279915157865147, + "grad_norm": 0.8384279608726501, + "learning_rate": 2.8752032511436745e-05, + "loss": 0.8655, + "step": 200340 + }, 
+ { + "epoch": 1.2799790450148856, + "grad_norm": 1.4086518287658691, + "learning_rate": 2.874749054757785e-05, + "loss": 0.9132, + "step": 200350 + }, + { + "epoch": 1.2800429321646245, + "grad_norm": 1.0973162651062012, + "learning_rate": 2.8742948797750124e-05, + "loss": 0.7084, + "step": 200360 + }, + { + "epoch": 1.280106819314363, + "grad_norm": 0.7938587665557861, + "learning_rate": 2.873840726199928e-05, + "loss": 0.7629, + "step": 200370 + }, + { + "epoch": 1.280170706464102, + "grad_norm": 0.7673661708831787, + "learning_rate": 2.8733865940371062e-05, + "loss": 0.7815, + "step": 200380 + }, + { + "epoch": 1.2802345936138404, + "grad_norm": 0.8518702387809753, + "learning_rate": 2.8729324832911236e-05, + "loss": 0.8724, + "step": 200390 + }, + { + "epoch": 1.2802984807635793, + "grad_norm": 1.0190377235412598, + "learning_rate": 2.872478393966549e-05, + "loss": 0.6522, + "step": 200400 + }, + { + "epoch": 1.2803623679133178, + "grad_norm": 0.7286405563354492, + "learning_rate": 2.8720243260679598e-05, + "loss": 0.6123, + "step": 200410 + }, + { + "epoch": 1.2804262550630567, + "grad_norm": 0.8743591904640198, + "learning_rate": 2.8715702795999245e-05, + "loss": 0.9299, + "step": 200420 + }, + { + "epoch": 1.2804901422127952, + "grad_norm": 1.6980204582214355, + "learning_rate": 2.8711162545670195e-05, + "loss": 0.752, + "step": 200430 + }, + { + "epoch": 1.2805540293625342, + "grad_norm": 0.8956275582313538, + "learning_rate": 2.8706622509738133e-05, + "loss": 0.862, + "step": 200440 + }, + { + "epoch": 1.2806179165122726, + "grad_norm": 1.4533246755599976, + "learning_rate": 2.8702082688248834e-05, + "loss": 1.0016, + "step": 200450 + }, + { + "epoch": 1.2806818036620113, + "grad_norm": 2.4909677505493164, + "learning_rate": 2.8697543081247958e-05, + "loss": 1.2088, + "step": 200460 + }, + { + "epoch": 1.28074569081175, + "grad_norm": 1.0458720922470093, + "learning_rate": 2.8693003688781283e-05, + "loss": 0.8815, + "step": 200470 + }, + { + "epoch": 
1.2808095779614888, + "grad_norm": 0.624754011631012, + "learning_rate": 2.8688464510894464e-05, + "loss": 1.0686, + "step": 200480 + }, + { + "epoch": 1.2808734651112275, + "grad_norm": 2.7535908222198486, + "learning_rate": 2.868392554763327e-05, + "loss": 0.7738, + "step": 200490 + }, + { + "epoch": 1.2809373522609662, + "grad_norm": 1.5427417755126953, + "learning_rate": 2.867938679904336e-05, + "loss": 0.9673, + "step": 200500 + }, + { + "epoch": 1.2810012394107049, + "grad_norm": 1.1224215030670166, + "learning_rate": 2.8674848265170495e-05, + "loss": 0.9368, + "step": 200510 + }, + { + "epoch": 1.2810651265604436, + "grad_norm": 0.9177605509757996, + "learning_rate": 2.867030994606036e-05, + "loss": 0.715, + "step": 200520 + }, + { + "epoch": 1.2811290137101823, + "grad_norm": 1.9078919887542725, + "learning_rate": 2.8665771841758632e-05, + "loss": 0.6437, + "step": 200530 + }, + { + "epoch": 1.281192900859921, + "grad_norm": 0.9317499399185181, + "learning_rate": 2.866123395231106e-05, + "loss": 1.0355, + "step": 200540 + }, + { + "epoch": 1.2812567880096597, + "grad_norm": 0.7947119474411011, + "learning_rate": 2.86566962777633e-05, + "loss": 0.9663, + "step": 200550 + }, + { + "epoch": 1.2813206751593984, + "grad_norm": 1.0166733264923096, + "learning_rate": 2.8652158818161096e-05, + "loss": 1.0026, + "step": 200560 + }, + { + "epoch": 1.281384562309137, + "grad_norm": 0.824131965637207, + "learning_rate": 2.86476215735501e-05, + "loss": 0.8661, + "step": 200570 + }, + { + "epoch": 1.2814484494588758, + "grad_norm": 0.5033380389213562, + "learning_rate": 2.864308454397605e-05, + "loss": 0.7417, + "step": 200580 + }, + { + "epoch": 1.2815123366086145, + "grad_norm": 0.7473288774490356, + "learning_rate": 2.8638547729484587e-05, + "loss": 0.8514, + "step": 200590 + }, + { + "epoch": 1.2815762237583532, + "grad_norm": 1.0465642213821411, + "learning_rate": 2.8634011130121456e-05, + "loss": 0.8228, + "step": 200600 + }, + { + "epoch": 1.281640110908092, + 
"grad_norm": 0.9337648749351501, + "learning_rate": 2.8629474745932294e-05, + "loss": 0.9006, + "step": 200610 + }, + { + "epoch": 1.2817039980578306, + "grad_norm": 0.7650608420372009, + "learning_rate": 2.8624938576962833e-05, + "loss": 0.9708, + "step": 200620 + }, + { + "epoch": 1.2817678852075693, + "grad_norm": 1.056187391281128, + "learning_rate": 2.8620402623258715e-05, + "loss": 0.8352, + "step": 200630 + }, + { + "epoch": 1.281831772357308, + "grad_norm": 1.137027382850647, + "learning_rate": 2.861586688486565e-05, + "loss": 0.7863, + "step": 200640 + }, + { + "epoch": 1.2818956595070468, + "grad_norm": 1.4955174922943115, + "learning_rate": 2.861133136182929e-05, + "loss": 0.9078, + "step": 200650 + }, + { + "epoch": 1.2819595466567855, + "grad_norm": 0.8363558053970337, + "learning_rate": 2.860679605419535e-05, + "loss": 0.8063, + "step": 200660 + }, + { + "epoch": 1.2820234338065242, + "grad_norm": 0.9371567368507385, + "learning_rate": 2.8602260962009453e-05, + "loss": 1.1893, + "step": 200670 + }, + { + "epoch": 1.2820873209562629, + "grad_norm": 0.6184449195861816, + "learning_rate": 2.8597726085317323e-05, + "loss": 0.8258, + "step": 200680 + }, + { + "epoch": 1.2821512081060016, + "grad_norm": 1.352858543395996, + "learning_rate": 2.859319142416459e-05, + "loss": 0.9911, + "step": 200690 + }, + { + "epoch": 1.2822150952557403, + "grad_norm": 0.9840089678764343, + "learning_rate": 2.858865697859694e-05, + "loss": 0.8355, + "step": 200700 + }, + { + "epoch": 1.282278982405479, + "grad_norm": 0.8363627195358276, + "learning_rate": 2.858412274866006e-05, + "loss": 0.7131, + "step": 200710 + }, + { + "epoch": 1.2823428695552177, + "grad_norm": 0.9808521866798401, + "learning_rate": 2.8579588734399565e-05, + "loss": 0.7284, + "step": 200720 + }, + { + "epoch": 1.2824067567049564, + "grad_norm": 1.230603814125061, + "learning_rate": 2.8575054935861158e-05, + "loss": 0.9945, + "step": 200730 + }, + { + "epoch": 1.2824706438546951, + "grad_norm": 
1.1982591152191162, + "learning_rate": 2.8570521353090473e-05, + "loss": 1.0323, + "step": 200740 + }, + { + "epoch": 1.2825345310044338, + "grad_norm": 1.8492811918258667, + "learning_rate": 2.856598798613319e-05, + "loss": 0.8135, + "step": 200750 + }, + { + "epoch": 1.2825984181541725, + "grad_norm": 1.0338141918182373, + "learning_rate": 2.856145483503494e-05, + "loss": 1.0249, + "step": 200760 + }, + { + "epoch": 1.2826623053039112, + "grad_norm": 1.3097282648086548, + "learning_rate": 2.8556921899841394e-05, + "loss": 0.8617, + "step": 200770 + }, + { + "epoch": 1.28272619245365, + "grad_norm": 0.9267164468765259, + "learning_rate": 2.8552389180598183e-05, + "loss": 0.9097, + "step": 200780 + }, + { + "epoch": 1.2827900796033886, + "grad_norm": 3.339057683944702, + "learning_rate": 2.8547856677350992e-05, + "loss": 0.9968, + "step": 200790 + }, + { + "epoch": 1.2828539667531274, + "grad_norm": 0.5326239466667175, + "learning_rate": 2.8543324390145416e-05, + "loss": 0.7836, + "step": 200800 + }, + { + "epoch": 1.282917853902866, + "grad_norm": 1.0044302940368652, + "learning_rate": 2.853924551641376e-05, + "loss": 0.8953, + "step": 200810 + }, + { + "epoch": 1.2829817410526045, + "grad_norm": 1.041451334953308, + "learning_rate": 2.8534713639813047e-05, + "loss": 0.7091, + "step": 200820 + }, + { + "epoch": 1.2830456282023435, + "grad_norm": 0.7834107279777527, + "learning_rate": 2.853018197938635e-05, + "loss": 1.0314, + "step": 200830 + }, + { + "epoch": 1.283109515352082, + "grad_norm": 0.8394607901573181, + "learning_rate": 2.8525650535179306e-05, + "loss": 0.8523, + "step": 200840 + }, + { + "epoch": 1.2831734025018209, + "grad_norm": 1.2169694900512695, + "learning_rate": 2.852111930723752e-05, + "loss": 0.82, + "step": 200850 + }, + { + "epoch": 1.2832372896515594, + "grad_norm": 0.7641561031341553, + "learning_rate": 2.8516588295606673e-05, + "loss": 0.9652, + "step": 200860 + }, + { + "epoch": 1.2833011768012983, + "grad_norm": 1.29936683177948, + 
"learning_rate": 2.8512057500332333e-05, + "loss": 0.6658, + "step": 200870 + }, + { + "epoch": 1.2833650639510368, + "grad_norm": 0.9101029634475708, + "learning_rate": 2.8507526921460193e-05, + "loss": 0.6327, + "step": 200880 + }, + { + "epoch": 1.2834289511007757, + "grad_norm": 1.0572748184204102, + "learning_rate": 2.8502996559035833e-05, + "loss": 1.1782, + "step": 200890 + }, + { + "epoch": 1.2834928382505142, + "grad_norm": 1.0327401161193848, + "learning_rate": 2.8498466413104906e-05, + "loss": 0.5408, + "step": 200900 + }, + { + "epoch": 1.2835567254002531, + "grad_norm": 1.2162421941757202, + "learning_rate": 2.849393648371301e-05, + "loss": 1.127, + "step": 200910 + }, + { + "epoch": 1.2836206125499916, + "grad_norm": 1.2150144577026367, + "learning_rate": 2.8489406770905802e-05, + "loss": 0.8937, + "step": 200920 + }, + { + "epoch": 1.2836844996997305, + "grad_norm": 1.1148834228515625, + "learning_rate": 2.848487727472885e-05, + "loss": 0.9379, + "step": 200930 + }, + { + "epoch": 1.283748386849469, + "grad_norm": 1.6276755332946777, + "learning_rate": 2.8480347995227824e-05, + "loss": 0.83, + "step": 200940 + }, + { + "epoch": 1.2838122739992077, + "grad_norm": 0.8462209701538086, + "learning_rate": 2.8475818932448284e-05, + "loss": 0.9984, + "step": 200950 + }, + { + "epoch": 1.2838761611489464, + "grad_norm": 0.7349916696548462, + "learning_rate": 2.8471290086435896e-05, + "loss": 0.9028, + "step": 200960 + }, + { + "epoch": 1.2839400482986851, + "grad_norm": 1.0558744668960571, + "learning_rate": 2.846676145723621e-05, + "loss": 1.2691, + "step": 200970 + }, + { + "epoch": 1.2840039354484238, + "grad_norm": 0.8234534859657288, + "learning_rate": 2.8462233044894898e-05, + "loss": 0.6873, + "step": 200980 + }, + { + "epoch": 1.2840678225981625, + "grad_norm": 1.1932810544967651, + "learning_rate": 2.8457704849457513e-05, + "loss": 0.9897, + "step": 200990 + }, + { + "epoch": 1.2841317097479013, + "grad_norm": 0.8133253455162048, + "learning_rate": 
2.845317687096969e-05, + "loss": 0.6946, + "step": 201000 + }, + { + "epoch": 1.28419559689764, + "grad_norm": 0.9386922717094421, + "learning_rate": 2.8448649109476987e-05, + "loss": 0.8382, + "step": 201010 + }, + { + "epoch": 1.2842594840473787, + "grad_norm": 0.8915868997573853, + "learning_rate": 2.8444121565025066e-05, + "loss": 0.8555, + "step": 201020 + }, + { + "epoch": 1.2843233711971174, + "grad_norm": 1.2032678127288818, + "learning_rate": 2.8439594237659466e-05, + "loss": 0.8164, + "step": 201030 + }, + { + "epoch": 1.284387258346856, + "grad_norm": 0.8416345119476318, + "learning_rate": 2.8435067127425808e-05, + "loss": 1.003, + "step": 201040 + }, + { + "epoch": 1.2844511454965948, + "grad_norm": 1.1380306482315063, + "learning_rate": 2.8430540234369694e-05, + "loss": 0.8604, + "step": 201050 + }, + { + "epoch": 1.2845150326463335, + "grad_norm": 1.1704416275024414, + "learning_rate": 2.842601355853668e-05, + "loss": 0.831, + "step": 201060 + }, + { + "epoch": 1.2845789197960722, + "grad_norm": 1.0205214023590088, + "learning_rate": 2.84214870999724e-05, + "loss": 0.7554, + "step": 201070 + }, + { + "epoch": 1.284642806945811, + "grad_norm": 1.102547526359558, + "learning_rate": 2.8416960858722385e-05, + "loss": 1.1079, + "step": 201080 + }, + { + "epoch": 1.2847066940955496, + "grad_norm": 1.2095087766647339, + "learning_rate": 2.841243483483227e-05, + "loss": 0.8655, + "step": 201090 + }, + { + "epoch": 1.2847705812452883, + "grad_norm": 1.368201494216919, + "learning_rate": 2.840790902834759e-05, + "loss": 0.8103, + "step": 201100 + }, + { + "epoch": 1.284834468395027, + "grad_norm": 0.7406502962112427, + "learning_rate": 2.840338343931397e-05, + "loss": 1.0044, + "step": 201110 + }, + { + "epoch": 1.2848983555447657, + "grad_norm": 1.0951242446899414, + "learning_rate": 2.8398858067776946e-05, + "loss": 0.7321, + "step": 201120 + }, + { + "epoch": 1.2849622426945044, + "grad_norm": 0.8413940072059631, + "learning_rate": 2.839433291378212e-05, + 
"loss": 0.9743, + "step": 201130 + }, + { + "epoch": 1.2850261298442431, + "grad_norm": 0.8289148211479187, + "learning_rate": 2.8389807977375037e-05, + "loss": 0.7121, + "step": 201140 + }, + { + "epoch": 1.2850900169939818, + "grad_norm": 2.189805030822754, + "learning_rate": 2.8385283258601304e-05, + "loss": 0.986, + "step": 201150 + }, + { + "epoch": 1.2851539041437205, + "grad_norm": 1.2476427555084229, + "learning_rate": 2.8380758757506463e-05, + "loss": 1.1473, + "step": 201160 + }, + { + "epoch": 1.2852177912934593, + "grad_norm": 1.2025208473205566, + "learning_rate": 2.8376234474136065e-05, + "loss": 0.8451, + "step": 201170 + }, + { + "epoch": 1.285281678443198, + "grad_norm": 1.0035260915756226, + "learning_rate": 2.8371710408535722e-05, + "loss": 0.7409, + "step": 201180 + }, + { + "epoch": 1.2853455655929367, + "grad_norm": 1.0282871723175049, + "learning_rate": 2.8367186560750936e-05, + "loss": 0.9917, + "step": 201190 + }, + { + "epoch": 1.2854094527426754, + "grad_norm": 0.9853460192680359, + "learning_rate": 2.8362662930827323e-05, + "loss": 1.0054, + "step": 201200 + }, + { + "epoch": 1.285473339892414, + "grad_norm": 0.8255294561386108, + "learning_rate": 2.835813951881039e-05, + "loss": 0.9538, + "step": 201210 + }, + { + "epoch": 1.2855372270421528, + "grad_norm": 1.0002115964889526, + "learning_rate": 2.8353616324745737e-05, + "loss": 0.7627, + "step": 201220 + }, + { + "epoch": 1.2856011141918915, + "grad_norm": 0.8844050765037537, + "learning_rate": 2.834909334867888e-05, + "loss": 0.5698, + "step": 201230 + }, + { + "epoch": 1.2856650013416302, + "grad_norm": 0.8857330083847046, + "learning_rate": 2.8344570590655394e-05, + "loss": 0.9497, + "step": 201240 + }, + { + "epoch": 1.285728888491369, + "grad_norm": 1.0770478248596191, + "learning_rate": 2.83400480507208e-05, + "loss": 0.8344, + "step": 201250 + }, + { + "epoch": 1.2857927756411076, + "grad_norm": 1.0126256942749023, + "learning_rate": 2.8335525728920676e-05, + "loss": 0.9859, + 
"step": 201260 + }, + { + "epoch": 1.2858566627908463, + "grad_norm": 1.0135153532028198, + "learning_rate": 2.833100362530054e-05, + "loss": 1.059, + "step": 201270 + }, + { + "epoch": 1.285920549940585, + "grad_norm": 0.8402441740036011, + "learning_rate": 2.8326481739905958e-05, + "loss": 1.0928, + "step": 201280 + }, + { + "epoch": 1.2859844370903237, + "grad_norm": 0.8969540596008301, + "learning_rate": 2.832196007278244e-05, + "loss": 0.8163, + "step": 201290 + }, + { + "epoch": 1.2860483242400624, + "grad_norm": 1.4373033046722412, + "learning_rate": 2.831743862397555e-05, + "loss": 0.7645, + "step": 201300 + }, + { + "epoch": 1.286112211389801, + "grad_norm": 2.0069103240966797, + "learning_rate": 2.831291739353079e-05, + "loss": 0.7204, + "step": 201310 + }, + { + "epoch": 1.2861760985395398, + "grad_norm": 0.9032400846481323, + "learning_rate": 2.8308396381493747e-05, + "loss": 0.9155, + "step": 201320 + }, + { + "epoch": 1.2862399856892783, + "grad_norm": 1.127811312675476, + "learning_rate": 2.830387558790989e-05, + "loss": 0.7727, + "step": 201330 + }, + { + "epoch": 1.2863038728390173, + "grad_norm": 1.0291417837142944, + "learning_rate": 2.829935501282479e-05, + "loss": 0.886, + "step": 201340 + }, + { + "epoch": 1.2863677599887557, + "grad_norm": 1.3500255346298218, + "learning_rate": 2.8294834656283952e-05, + "loss": 1.0935, + "step": 201350 + }, + { + "epoch": 1.2864316471384947, + "grad_norm": 0.9647353291511536, + "learning_rate": 2.8290314518332895e-05, + "loss": 1.2346, + "step": 201360 + }, + { + "epoch": 1.2864955342882332, + "grad_norm": 1.557719111442566, + "learning_rate": 2.828579459901718e-05, + "loss": 0.8093, + "step": 201370 + }, + { + "epoch": 1.286559421437972, + "grad_norm": 0.6619776487350464, + "learning_rate": 2.8281274898382275e-05, + "loss": 0.8722, + "step": 201380 + }, + { + "epoch": 1.2866233085877106, + "grad_norm": 0.9074650406837463, + "learning_rate": 2.8276755416473744e-05, + "loss": 0.6571, + "step": 201390 + }, + { 
+ "epoch": 1.2866871957374495, + "grad_norm": 1.4630404710769653, + "learning_rate": 2.8272236153337055e-05, + "loss": 1.0, + "step": 201400 + }, + { + "epoch": 1.286751082887188, + "grad_norm": 0.8344331979751587, + "learning_rate": 2.8267717109017765e-05, + "loss": 0.7465, + "step": 201410 + }, + { + "epoch": 1.286814970036927, + "grad_norm": 0.973581850528717, + "learning_rate": 2.8263198283561347e-05, + "loss": 0.8632, + "step": 201420 + }, + { + "epoch": 1.2868788571866654, + "grad_norm": 1.26335608959198, + "learning_rate": 2.825867967701335e-05, + "loss": 0.8703, + "step": 201430 + }, + { + "epoch": 1.286942744336404, + "grad_norm": 0.9915609955787659, + "learning_rate": 2.8254161289419233e-05, + "loss": 0.8515, + "step": 201440 + }, + { + "epoch": 1.2870066314861428, + "grad_norm": 1.6091701984405518, + "learning_rate": 2.824964312082455e-05, + "loss": 1.0284, + "step": 201450 + }, + { + "epoch": 1.2870705186358815, + "grad_norm": 0.7491897940635681, + "learning_rate": 2.8245125171274755e-05, + "loss": 0.9658, + "step": 201460 + }, + { + "epoch": 1.2871344057856202, + "grad_norm": 1.554604411125183, + "learning_rate": 2.8240607440815388e-05, + "loss": 0.8597, + "step": 201470 + }, + { + "epoch": 1.287198292935359, + "grad_norm": 0.7069583535194397, + "learning_rate": 2.8236089929491912e-05, + "loss": 0.7607, + "step": 201480 + }, + { + "epoch": 1.2872621800850976, + "grad_norm": 0.6213060617446899, + "learning_rate": 2.8231572637349856e-05, + "loss": 0.9623, + "step": 201490 + }, + { + "epoch": 1.2873260672348363, + "grad_norm": 1.7921992540359497, + "learning_rate": 2.822705556443468e-05, + "loss": 0.6583, + "step": 201500 + }, + { + "epoch": 1.287389954384575, + "grad_norm": 0.7730182409286499, + "learning_rate": 2.8222538710791903e-05, + "loss": 0.6463, + "step": 201510 + }, + { + "epoch": 1.2874538415343137, + "grad_norm": 0.7597857117652893, + "learning_rate": 2.8218022076466987e-05, + "loss": 0.8251, + "step": 201520 + }, + { + "epoch": 
1.2875177286840525, + "grad_norm": 1.0613079071044922, + "learning_rate": 2.821350566150545e-05, + "loss": 1.0157, + "step": 201530 + }, + { + "epoch": 1.2875816158337912, + "grad_norm": 1.0213360786437988, + "learning_rate": 2.820898946595274e-05, + "loss": 0.807, + "step": 201540 + }, + { + "epoch": 1.2876455029835299, + "grad_norm": 0.9103221297264099, + "learning_rate": 2.820447348985436e-05, + "loss": 0.7805, + "step": 201550 + }, + { + "epoch": 1.2877093901332686, + "grad_norm": 1.3538568019866943, + "learning_rate": 2.8199957733255806e-05, + "loss": 0.8029, + "step": 201560 + }, + { + "epoch": 1.2877732772830073, + "grad_norm": 0.81965571641922, + "learning_rate": 2.8195442196202517e-05, + "loss": 1.0121, + "step": 201570 + }, + { + "epoch": 1.287837164432746, + "grad_norm": 0.8054186105728149, + "learning_rate": 2.819092687874001e-05, + "loss": 0.7716, + "step": 201580 + }, + { + "epoch": 1.2879010515824847, + "grad_norm": 1.1727731227874756, + "learning_rate": 2.8186411780913713e-05, + "loss": 1.0159, + "step": 201590 + }, + { + "epoch": 1.2879649387322234, + "grad_norm": 0.9019149541854858, + "learning_rate": 2.8181896902769146e-05, + "loss": 1.0994, + "step": 201600 + }, + { + "epoch": 1.288028825881962, + "grad_norm": 1.1492351293563843, + "learning_rate": 2.8177382244351736e-05, + "loss": 1.0837, + "step": 201610 + }, + { + "epoch": 1.2880927130317008, + "grad_norm": 0.8392195105552673, + "learning_rate": 2.817286780570698e-05, + "loss": 1.1313, + "step": 201620 + }, + { + "epoch": 1.2881566001814395, + "grad_norm": 2.194507598876953, + "learning_rate": 2.8168353586880304e-05, + "loss": 0.9563, + "step": 201630 + }, + { + "epoch": 1.2882204873311782, + "grad_norm": 1.052587866783142, + "learning_rate": 2.816383958791724e-05, + "loss": 0.7306, + "step": 201640 + }, + { + "epoch": 1.288284374480917, + "grad_norm": 3.5647919178009033, + "learning_rate": 2.8159325808863164e-05, + "loss": 0.7595, + "step": 201650 + }, + { + "epoch": 1.2883482616306556, + 
"grad_norm": 0.9285479187965393, + "learning_rate": 2.8154812249763592e-05, + "loss": 0.7623, + "step": 201660 + }, + { + "epoch": 1.2884121487803943, + "grad_norm": 0.8144568800926208, + "learning_rate": 2.8150298910663934e-05, + "loss": 0.7509, + "step": 201670 + }, + { + "epoch": 1.288476035930133, + "grad_norm": 0.8013102412223816, + "learning_rate": 2.8145785791609674e-05, + "loss": 0.7349, + "step": 201680 + }, + { + "epoch": 1.2885399230798718, + "grad_norm": 1.1229387521743774, + "learning_rate": 2.8141272892646276e-05, + "loss": 1.0336, + "step": 201690 + }, + { + "epoch": 1.2886038102296105, + "grad_norm": 1.2837129831314087, + "learning_rate": 2.8136760213819148e-05, + "loss": 0.8879, + "step": 201700 + }, + { + "epoch": 1.2886676973793492, + "grad_norm": 2.003021717071533, + "learning_rate": 2.813224775517378e-05, + "loss": 0.7515, + "step": 201710 + }, + { + "epoch": 1.2887315845290879, + "grad_norm": 1.0468463897705078, + "learning_rate": 2.812773551675558e-05, + "loss": 1.0192, + "step": 201720 + }, + { + "epoch": 1.2887954716788266, + "grad_norm": 1.0019460916519165, + "learning_rate": 2.812322349861002e-05, + "loss": 1.0245, + "step": 201730 + }, + { + "epoch": 1.2888593588285653, + "grad_norm": 1.5761841535568237, + "learning_rate": 2.8118711700782506e-05, + "loss": 0.7242, + "step": 201740 + }, + { + "epoch": 1.288923245978304, + "grad_norm": 0.9731926918029785, + "learning_rate": 2.811420012331852e-05, + "loss": 0.9004, + "step": 201750 + }, + { + "epoch": 1.2889871331280427, + "grad_norm": 1.1265745162963867, + "learning_rate": 2.8109688766263446e-05, + "loss": 0.6219, + "step": 201760 + }, + { + "epoch": 1.2890510202777814, + "grad_norm": 1.1114789247512817, + "learning_rate": 2.8105177629662772e-05, + "loss": 0.8625, + "step": 201770 + }, + { + "epoch": 1.28911490742752, + "grad_norm": 1.0359795093536377, + "learning_rate": 2.8100666713561873e-05, + "loss": 0.6437, + "step": 201780 + }, + { + "epoch": 1.2891787945772588, + "grad_norm": 
1.1450669765472412, + "learning_rate": 2.8096156018006226e-05, + "loss": 0.9989, + "step": 201790 + }, + { + "epoch": 1.2892426817269973, + "grad_norm": 0.6831559538841248, + "learning_rate": 2.8091645543041222e-05, + "loss": 0.7288, + "step": 201800 + }, + { + "epoch": 1.2893065688767362, + "grad_norm": 1.1486716270446777, + "learning_rate": 2.808713528871232e-05, + "loss": 0.8573, + "step": 201810 + }, + { + "epoch": 1.2893704560264747, + "grad_norm": 0.6990073323249817, + "learning_rate": 2.8082625255064903e-05, + "loss": 0.9046, + "step": 201820 + }, + { + "epoch": 1.2894343431762136, + "grad_norm": 0.7559660077095032, + "learning_rate": 2.807811544214443e-05, + "loss": 0.7093, + "step": 201830 + }, + { + "epoch": 1.2894982303259521, + "grad_norm": 1.0000022649765015, + "learning_rate": 2.807360584999628e-05, + "loss": 1.1475, + "step": 201840 + }, + { + "epoch": 1.289562117475691, + "grad_norm": 2.2029664516448975, + "learning_rate": 2.8069096478665912e-05, + "loss": 0.8044, + "step": 201850 + }, + { + "epoch": 1.2896260046254295, + "grad_norm": 1.135852575302124, + "learning_rate": 2.8064587328198687e-05, + "loss": 0.7948, + "step": 201860 + }, + { + "epoch": 1.2896898917751685, + "grad_norm": 1.2041714191436768, + "learning_rate": 2.806007839864005e-05, + "loss": 0.6547, + "step": 201870 + }, + { + "epoch": 1.289753778924907, + "grad_norm": 0.8794607520103455, + "learning_rate": 2.8055569690035422e-05, + "loss": 0.8086, + "step": 201880 + }, + { + "epoch": 1.2898176660746459, + "grad_norm": 1.313482642173767, + "learning_rate": 2.8051061202430174e-05, + "loss": 1.059, + "step": 201890 + }, + { + "epoch": 1.2898815532243844, + "grad_norm": 1.017690896987915, + "learning_rate": 2.8046552935869748e-05, + "loss": 0.8971, + "step": 201900 + }, + { + "epoch": 1.2899454403741233, + "grad_norm": 0.9315025806427002, + "learning_rate": 2.8042044890399503e-05, + "loss": 0.8328, + "step": 201910 + }, + { + "epoch": 1.2900093275238618, + "grad_norm": 1.0727083683013916, 
+ "learning_rate": 2.803753706606489e-05, + "loss": 0.8185, + "step": 201920 + }, + { + "epoch": 1.2900732146736005, + "grad_norm": 0.7859057784080505, + "learning_rate": 2.8033029462911254e-05, + "loss": 0.8365, + "step": 201930 + }, + { + "epoch": 1.2901371018233392, + "grad_norm": 0.6739004254341125, + "learning_rate": 2.8028522080984032e-05, + "loss": 1.0226, + "step": 201940 + }, + { + "epoch": 1.290200988973078, + "grad_norm": 0.8840377926826477, + "learning_rate": 2.8024014920328585e-05, + "loss": 0.9998, + "step": 201950 + }, + { + "epoch": 1.2902648761228166, + "grad_norm": 0.8898053169250488, + "learning_rate": 2.8019507980990335e-05, + "loss": 0.8163, + "step": 201960 + }, + { + "epoch": 1.2903287632725553, + "grad_norm": 0.9021614789962769, + "learning_rate": 2.801500126301464e-05, + "loss": 0.9241, + "step": 201970 + }, + { + "epoch": 1.290392650422294, + "grad_norm": 0.8183213472366333, + "learning_rate": 2.801049476644692e-05, + "loss": 1.012, + "step": 201980 + }, + { + "epoch": 1.2904565375720327, + "grad_norm": 0.7091869115829468, + "learning_rate": 2.8005988491332523e-05, + "loss": 0.7812, + "step": 201990 + }, + { + "epoch": 1.2905204247217714, + "grad_norm": 0.7986821532249451, + "learning_rate": 2.800148243771687e-05, + "loss": 0.67, + "step": 202000 + }, + { + "epoch": 1.2905843118715101, + "grad_norm": 1.1651382446289062, + "learning_rate": 2.79969766056453e-05, + "loss": 0.8387, + "step": 202010 + }, + { + "epoch": 1.2906481990212488, + "grad_norm": 0.9483960866928101, + "learning_rate": 2.799247099516323e-05, + "loss": 0.8228, + "step": 202020 + }, + { + "epoch": 1.2907120861709875, + "grad_norm": 0.8649300932884216, + "learning_rate": 2.798796560631599e-05, + "loss": 0.8634, + "step": 202030 + }, + { + "epoch": 1.2907759733207262, + "grad_norm": 0.8618859052658081, + "learning_rate": 2.7983460439149013e-05, + "loss": 0.6676, + "step": 202040 + }, + { + "epoch": 1.290839860470465, + "grad_norm": 0.7927425503730774, + "learning_rate": 
2.7978955493707605e-05, + "loss": 1.071, + "step": 202050 + }, + { + "epoch": 1.2909037476202037, + "grad_norm": 1.109482765197754, + "learning_rate": 2.7974450770037193e-05, + "loss": 0.8577, + "step": 202060 + }, + { + "epoch": 1.2909676347699424, + "grad_norm": 0.8940995931625366, + "learning_rate": 2.796994626818309e-05, + "loss": 0.9325, + "step": 202070 + }, + { + "epoch": 1.291031521919681, + "grad_norm": 2.62735652923584, + "learning_rate": 2.7965441988190694e-05, + "loss": 1.0661, + "step": 202080 + }, + { + "epoch": 1.2910954090694198, + "grad_norm": 0.9144634008407593, + "learning_rate": 2.7960937930105378e-05, + "loss": 0.7727, + "step": 202090 + }, + { + "epoch": 1.2911592962191585, + "grad_norm": 2.9492197036743164, + "learning_rate": 2.795643409397246e-05, + "loss": 0.943, + "step": 202100 + }, + { + "epoch": 1.2912231833688972, + "grad_norm": 1.3305987119674683, + "learning_rate": 2.7951930479837342e-05, + "loss": 1.0945, + "step": 202110 + }, + { + "epoch": 1.291287070518636, + "grad_norm": 0.779472291469574, + "learning_rate": 2.7947427087745336e-05, + "loss": 0.8876, + "step": 202120 + }, + { + "epoch": 1.2913509576683746, + "grad_norm": 0.6640340685844421, + "learning_rate": 2.7942923917741864e-05, + "loss": 0.6774, + "step": 202130 + }, + { + "epoch": 1.2914148448181133, + "grad_norm": 2.7910993099212646, + "learning_rate": 2.7938420969872187e-05, + "loss": 0.8141, + "step": 202140 + }, + { + "epoch": 1.291478731967852, + "grad_norm": 4.312468528747559, + "learning_rate": 2.7933918244181716e-05, + "loss": 0.9578, + "step": 202150 + }, + { + "epoch": 1.2915426191175907, + "grad_norm": 1.027847170829773, + "learning_rate": 2.792941574071576e-05, + "loss": 0.9301, + "step": 202160 + }, + { + "epoch": 1.2916065062673294, + "grad_norm": 0.7103813886642456, + "learning_rate": 2.7924913459519697e-05, + "loss": 0.9317, + "step": 202170 + }, + { + "epoch": 1.2916703934170681, + "grad_norm": 1.4262120723724365, + "learning_rate": 2.792041140063884e-05, + 
"loss": 0.7602, + "step": 202180 + }, + { + "epoch": 1.2917342805668068, + "grad_norm": 1.1603199243545532, + "learning_rate": 2.791590956411856e-05, + "loss": 0.6214, + "step": 202190 + }, + { + "epoch": 1.2917981677165455, + "grad_norm": 0.8781431913375854, + "learning_rate": 2.7911407950004155e-05, + "loss": 0.8384, + "step": 202200 + }, + { + "epoch": 1.2918620548662842, + "grad_norm": 0.8929979205131531, + "learning_rate": 2.790690655834098e-05, + "loss": 0.8108, + "step": 202210 + }, + { + "epoch": 1.291925942016023, + "grad_norm": 0.9821604490280151, + "learning_rate": 2.790240538917439e-05, + "loss": 0.886, + "step": 202220 + }, + { + "epoch": 1.2919898291657617, + "grad_norm": 0.861404299736023, + "learning_rate": 2.789790444254967e-05, + "loss": 0.8588, + "step": 202230 + }, + { + "epoch": 1.2920537163155004, + "grad_norm": 1.100280523300171, + "learning_rate": 2.7893403718512202e-05, + "loss": 0.9536, + "step": 202240 + }, + { + "epoch": 1.292117603465239, + "grad_norm": 1.1146948337554932, + "learning_rate": 2.7888903217107258e-05, + "loss": 0.8328, + "step": 202250 + }, + { + "epoch": 1.2921814906149778, + "grad_norm": 0.7769910097122192, + "learning_rate": 2.7884402938380205e-05, + "loss": 0.9671, + "step": 202260 + }, + { + "epoch": 1.2922453777647165, + "grad_norm": 0.9938095211982727, + "learning_rate": 2.7879902882376335e-05, + "loss": 0.9469, + "step": 202270 + }, + { + "epoch": 1.2923092649144552, + "grad_norm": 0.9267186522483826, + "learning_rate": 2.7875403049140998e-05, + "loss": 0.9497, + "step": 202280 + }, + { + "epoch": 1.2923731520641937, + "grad_norm": 0.7188292741775513, + "learning_rate": 2.7870903438719464e-05, + "loss": 0.9467, + "step": 202290 + }, + { + "epoch": 1.2924370392139326, + "grad_norm": 0.8244433999061584, + "learning_rate": 2.786640405115711e-05, + "loss": 0.7807, + "step": 202300 + }, + { + "epoch": 1.292500926363671, + "grad_norm": 1.0395586490631104, + "learning_rate": 2.7861904886499186e-05, + "loss": 1.056, + 
"step": 202310 + }, + { + "epoch": 1.29256481351341, + "grad_norm": 0.6310107707977295, + "learning_rate": 2.7857405944791055e-05, + "loss": 0.951, + "step": 202320 + }, + { + "epoch": 1.2926287006631485, + "grad_norm": 1.7220028638839722, + "learning_rate": 2.7852907226077984e-05, + "loss": 0.9764, + "step": 202330 + }, + { + "epoch": 1.2926925878128874, + "grad_norm": 1.0622987747192383, + "learning_rate": 2.784840873040531e-05, + "loss": 0.6364, + "step": 202340 + }, + { + "epoch": 1.292756474962626, + "grad_norm": 2.590120792388916, + "learning_rate": 2.7843910457818313e-05, + "loss": 0.7963, + "step": 202350 + }, + { + "epoch": 1.2928203621123648, + "grad_norm": 0.7834983468055725, + "learning_rate": 2.7839412408362318e-05, + "loss": 0.926, + "step": 202360 + }, + { + "epoch": 1.2928842492621033, + "grad_norm": 0.9251220226287842, + "learning_rate": 2.7834914582082595e-05, + "loss": 0.8344, + "step": 202370 + }, + { + "epoch": 1.2929481364118423, + "grad_norm": 0.9570052027702332, + "learning_rate": 2.7830416979024476e-05, + "loss": 0.8473, + "step": 202380 + }, + { + "epoch": 1.2930120235615807, + "grad_norm": 0.9787630438804626, + "learning_rate": 2.7825919599233217e-05, + "loss": 0.8888, + "step": 202390 + }, + { + "epoch": 1.2930759107113194, + "grad_norm": 0.9614351391792297, + "learning_rate": 2.782142244275414e-05, + "loss": 0.8092, + "step": 202400 + }, + { + "epoch": 1.2931397978610581, + "grad_norm": 0.7499952912330627, + "learning_rate": 2.781692550963254e-05, + "loss": 0.7004, + "step": 202410 + }, + { + "epoch": 1.2932036850107969, + "grad_norm": 1.7899943590164185, + "learning_rate": 2.781242879991367e-05, + "loss": 1.1293, + "step": 202420 + }, + { + "epoch": 1.2932675721605356, + "grad_norm": 1.5646029710769653, + "learning_rate": 2.780793231364286e-05, + "loss": 0.8839, + "step": 202430 + }, + { + "epoch": 1.2933314593102743, + "grad_norm": 0.5450949668884277, + "learning_rate": 2.7803436050865346e-05, + "loss": 0.8994, + "step": 202440 + }, + 
{ + "epoch": 1.293395346460013, + "grad_norm": 0.9725117087364197, + "learning_rate": 2.779894001162645e-05, + "loss": 0.7777, + "step": 202450 + }, + { + "epoch": 1.2934592336097517, + "grad_norm": 1.1340949535369873, + "learning_rate": 2.7794444195971426e-05, + "loss": 0.7504, + "step": 202460 + }, + { + "epoch": 1.2935231207594904, + "grad_norm": 0.9177945852279663, + "learning_rate": 2.778994860394557e-05, + "loss": 0.7452, + "step": 202470 + }, + { + "epoch": 1.293587007909229, + "grad_norm": 1.466374158859253, + "learning_rate": 2.778545323559413e-05, + "loss": 0.7265, + "step": 202480 + }, + { + "epoch": 1.2936508950589678, + "grad_norm": 1.491048812866211, + "learning_rate": 2.7780958090962416e-05, + "loss": 0.8083, + "step": 202490 + }, + { + "epoch": 1.2937147822087065, + "grad_norm": 0.8648363947868347, + "learning_rate": 2.7776463170095657e-05, + "loss": 0.8844, + "step": 202500 + }, + { + "epoch": 1.2937786693584452, + "grad_norm": 0.8939248919487, + "learning_rate": 2.7771968473039156e-05, + "loss": 0.7783, + "step": 202510 + }, + { + "epoch": 1.293842556508184, + "grad_norm": 1.2932512760162354, + "learning_rate": 2.7767473999838146e-05, + "loss": 0.8104, + "step": 202520 + }, + { + "epoch": 1.2939064436579226, + "grad_norm": 2.392772912979126, + "learning_rate": 2.776297975053792e-05, + "loss": 0.9477, + "step": 202530 + }, + { + "epoch": 1.2939703308076613, + "grad_norm": 0.9002924561500549, + "learning_rate": 2.7758485725183715e-05, + "loss": 0.7206, + "step": 202540 + }, + { + "epoch": 1.2940342179574, + "grad_norm": 0.7732874751091003, + "learning_rate": 2.775399192382081e-05, + "loss": 0.7847, + "step": 202550 + }, + { + "epoch": 1.2940981051071387, + "grad_norm": 0.7313796877861023, + "learning_rate": 2.774949834649444e-05, + "loss": 0.7352, + "step": 202560 + }, + { + "epoch": 1.2941619922568774, + "grad_norm": 0.8885713815689087, + "learning_rate": 2.774500499324989e-05, + "loss": 0.5844, + "step": 202570 + }, + { + "epoch": 
1.2942258794066162, + "grad_norm": 1.0987149477005005, + "learning_rate": 2.7740511864132367e-05, + "loss": 0.8516, + "step": 202580 + }, + { + "epoch": 1.2942897665563549, + "grad_norm": 1.1210942268371582, + "learning_rate": 2.773601895918717e-05, + "loss": 0.9402, + "step": 202590 + }, + { + "epoch": 1.2943536537060936, + "grad_norm": 0.91703200340271, + "learning_rate": 2.7731526278459508e-05, + "loss": 1.0145, + "step": 202600 + }, + { + "epoch": 1.2944175408558323, + "grad_norm": 1.1130025386810303, + "learning_rate": 2.7727033821994658e-05, + "loss": 0.9353, + "step": 202610 + }, + { + "epoch": 1.294481428005571, + "grad_norm": 1.0177847146987915, + "learning_rate": 2.7722541589837847e-05, + "loss": 0.9186, + "step": 202620 + }, + { + "epoch": 1.2945453151553097, + "grad_norm": 1.2630997896194458, + "learning_rate": 2.771804958203429e-05, + "loss": 0.938, + "step": 202630 + }, + { + "epoch": 1.2946092023050484, + "grad_norm": 0.7808921337127686, + "learning_rate": 2.7713557798629274e-05, + "loss": 0.8576, + "step": 202640 + }, + { + "epoch": 1.294673089454787, + "grad_norm": 0.8194741010665894, + "learning_rate": 2.7709066239667992e-05, + "loss": 0.6407, + "step": 202650 + }, + { + "epoch": 1.2947369766045258, + "grad_norm": 0.8097875714302063, + "learning_rate": 2.770457490519572e-05, + "loss": 1.0252, + "step": 202660 + }, + { + "epoch": 1.2948008637542645, + "grad_norm": 1.127073884010315, + "learning_rate": 2.770008379525764e-05, + "loss": 0.8304, + "step": 202670 + }, + { + "epoch": 1.2948647509040032, + "grad_norm": 1.1275410652160645, + "learning_rate": 2.769559290989904e-05, + "loss": 0.8219, + "step": 202680 + }, + { + "epoch": 1.294928638053742, + "grad_norm": 1.8037633895874023, + "learning_rate": 2.7691102249165084e-05, + "loss": 1.0246, + "step": 202690 + }, + { + "epoch": 1.2949925252034806, + "grad_norm": 1.1666561365127563, + "learning_rate": 2.7686611813101048e-05, + "loss": 0.7787, + "step": 202700 + }, + { + "epoch": 1.2950564123532193, + 
"grad_norm": 1.1435728073120117, + "learning_rate": 2.768212160175212e-05, + "loss": 0.8529, + "step": 202710 + }, + { + "epoch": 1.295120299502958, + "grad_norm": 1.3430248498916626, + "learning_rate": 2.7677631615163535e-05, + "loss": 1.0102, + "step": 202720 + }, + { + "epoch": 1.2951841866526967, + "grad_norm": 2.008981704711914, + "learning_rate": 2.7673141853380534e-05, + "loss": 0.9389, + "step": 202730 + }, + { + "epoch": 1.2952480738024355, + "grad_norm": 1.6296296119689941, + "learning_rate": 2.7668652316448284e-05, + "loss": 1.104, + "step": 202740 + }, + { + "epoch": 1.2953119609521742, + "grad_norm": 0.7582008838653564, + "learning_rate": 2.7664163004412046e-05, + "loss": 0.8314, + "step": 202750 + }, + { + "epoch": 1.2953758481019129, + "grad_norm": 0.9481918215751648, + "learning_rate": 2.7659673917317e-05, + "loss": 0.798, + "step": 202760 + }, + { + "epoch": 1.2954397352516516, + "grad_norm": 0.6187413930892944, + "learning_rate": 2.7655185055208365e-05, + "loss": 0.9045, + "step": 202770 + }, + { + "epoch": 1.29550362240139, + "grad_norm": 0.6845240592956543, + "learning_rate": 2.7650696418131338e-05, + "loss": 0.8514, + "step": 202780 + }, + { + "epoch": 1.295567509551129, + "grad_norm": 1.132522702217102, + "learning_rate": 2.7646208006131158e-05, + "loss": 0.9722, + "step": 202790 + }, + { + "epoch": 1.2956313967008675, + "grad_norm": 1.2485088109970093, + "learning_rate": 2.7641719819252976e-05, + "loss": 0.9917, + "step": 202800 + }, + { + "epoch": 1.2956952838506064, + "grad_norm": 0.7616110444068909, + "learning_rate": 2.763723185754204e-05, + "loss": 0.745, + "step": 202810 + }, + { + "epoch": 1.2957591710003449, + "grad_norm": 1.2537869215011597, + "learning_rate": 2.7632744121043506e-05, + "loss": 0.8795, + "step": 202820 + }, + { + "epoch": 1.2958230581500838, + "grad_norm": 0.6776690483093262, + "learning_rate": 2.7628256609802604e-05, + "loss": 1.0657, + "step": 202830 + }, + { + "epoch": 1.2958869452998223, + "grad_norm": 
1.1423174142837524, + "learning_rate": 2.76237693238645e-05, + "loss": 0.8462, + "step": 202840 + }, + { + "epoch": 1.2959508324495612, + "grad_norm": 1.0729666948318481, + "learning_rate": 2.7619282263274414e-05, + "loss": 0.6347, + "step": 202850 + }, + { + "epoch": 1.2960147195992997, + "grad_norm": 0.7181875109672546, + "learning_rate": 2.76147954280775e-05, + "loss": 0.8179, + "step": 202860 + }, + { + "epoch": 1.2960786067490386, + "grad_norm": 1.6496679782867432, + "learning_rate": 2.7610308818318975e-05, + "loss": 0.7566, + "step": 202870 + }, + { + "epoch": 1.2961424938987771, + "grad_norm": 0.7662792205810547, + "learning_rate": 2.760582243404399e-05, + "loss": 0.8662, + "step": 202880 + }, + { + "epoch": 1.2962063810485158, + "grad_norm": 1.0914605855941772, + "learning_rate": 2.7601336275297774e-05, + "loss": 1.068, + "step": 202890 + }, + { + "epoch": 1.2962702681982545, + "grad_norm": 1.0450533628463745, + "learning_rate": 2.7596850342125457e-05, + "loss": 0.7495, + "step": 202900 + }, + { + "epoch": 1.2963341553479932, + "grad_norm": 0.9505026936531067, + "learning_rate": 2.759236463457226e-05, + "loss": 0.7982, + "step": 202910 + }, + { + "epoch": 1.296398042497732, + "grad_norm": 0.6942030787467957, + "learning_rate": 2.7587879152683316e-05, + "loss": 1.0429, + "step": 202920 + }, + { + "epoch": 1.2964619296474706, + "grad_norm": 0.8163788914680481, + "learning_rate": 2.7583393896503817e-05, + "loss": 1.1042, + "step": 202930 + }, + { + "epoch": 1.2965258167972094, + "grad_norm": 1.385325312614441, + "learning_rate": 2.7578908866078957e-05, + "loss": 0.8366, + "step": 202940 + }, + { + "epoch": 1.296589703946948, + "grad_norm": 0.7578178644180298, + "learning_rate": 2.757442406145385e-05, + "loss": 1.1626, + "step": 202950 + }, + { + "epoch": 1.2966535910966868, + "grad_norm": 0.9005945324897766, + "learning_rate": 2.7569939482673724e-05, + "loss": 0.8325, + "step": 202960 + }, + { + "epoch": 1.2967174782464255, + "grad_norm": 0.9879159331321716, + 
"learning_rate": 2.7565455129783692e-05, + "loss": 1.0183, + "step": 202970 + }, + { + "epoch": 1.2967813653961642, + "grad_norm": 0.6550315618515015, + "learning_rate": 2.7560971002828954e-05, + "loss": 0.6492, + "step": 202980 + }, + { + "epoch": 1.2968452525459029, + "grad_norm": 0.9671928882598877, + "learning_rate": 2.7556487101854632e-05, + "loss": 0.997, + "step": 202990 + }, + { + "epoch": 1.2969091396956416, + "grad_norm": 1.200095772743225, + "learning_rate": 2.755200342690592e-05, + "loss": 0.9367, + "step": 203000 + }, + { + "epoch": 1.2969730268453803, + "grad_norm": 0.5706303715705872, + "learning_rate": 2.7547519978027936e-05, + "loss": 0.8241, + "step": 203010 + }, + { + "epoch": 1.297036913995119, + "grad_norm": 3.405540704727173, + "learning_rate": 2.7543036755265868e-05, + "loss": 0.7368, + "step": 203020 + }, + { + "epoch": 1.2971008011448577, + "grad_norm": 0.9094310998916626, + "learning_rate": 2.7538553758664825e-05, + "loss": 0.755, + "step": 203030 + }, + { + "epoch": 1.2971646882945964, + "grad_norm": 0.6244282126426697, + "learning_rate": 2.7534070988269998e-05, + "loss": 0.7651, + "step": 203040 + }, + { + "epoch": 1.2972285754443351, + "grad_norm": 0.7120745182037354, + "learning_rate": 2.7529588444126498e-05, + "loss": 1.2066, + "step": 203050 + }, + { + "epoch": 1.2972924625940738, + "grad_norm": 0.8211696743965149, + "learning_rate": 2.75251061262795e-05, + "loss": 0.8375, + "step": 203060 + }, + { + "epoch": 1.2973563497438125, + "grad_norm": 1.2300297021865845, + "learning_rate": 2.7520624034774102e-05, + "loss": 0.9132, + "step": 203070 + }, + { + "epoch": 1.2974202368935512, + "grad_norm": 0.8189200758934021, + "learning_rate": 2.751614216965549e-05, + "loss": 0.7244, + "step": 203080 + }, + { + "epoch": 1.29748412404329, + "grad_norm": 0.8568248152732849, + "learning_rate": 2.7511660530968763e-05, + "loss": 0.9051, + "step": 203090 + }, + { + "epoch": 1.2975480111930286, + "grad_norm": 0.8858509659767151, + "learning_rate": 
2.7507179118759087e-05, + "loss": 0.7753, + "step": 203100 + }, + { + "epoch": 1.2976118983427674, + "grad_norm": 1.5307356119155884, + "learning_rate": 2.750269793307157e-05, + "loss": 1.024, + "step": 203110 + }, + { + "epoch": 1.297675785492506, + "grad_norm": 1.075430989265442, + "learning_rate": 2.7498216973951328e-05, + "loss": 0.9816, + "step": 203120 + }, + { + "epoch": 1.2977396726422448, + "grad_norm": 0.9090355634689331, + "learning_rate": 2.7493736241443536e-05, + "loss": 0.7039, + "step": 203130 + }, + { + "epoch": 1.2978035597919835, + "grad_norm": 0.9631739854812622, + "learning_rate": 2.7489703775977403e-05, + "loss": 0.9271, + "step": 203140 + }, + { + "epoch": 1.2978674469417222, + "grad_norm": 0.9455099105834961, + "learning_rate": 2.748522347415753e-05, + "loss": 0.9177, + "step": 203150 + }, + { + "epoch": 1.2979313340914609, + "grad_norm": 0.8888641595840454, + "learning_rate": 2.7480743399080912e-05, + "loss": 1.0949, + "step": 203160 + }, + { + "epoch": 1.2979952212411996, + "grad_norm": 0.8014704585075378, + "learning_rate": 2.7476263550792703e-05, + "loss": 0.776, + "step": 203170 + }, + { + "epoch": 1.2980591083909383, + "grad_norm": 0.8085066080093384, + "learning_rate": 2.747178392933799e-05, + "loss": 1.003, + "step": 203180 + }, + { + "epoch": 1.298122995540677, + "grad_norm": 1.0630215406417847, + "learning_rate": 2.746730453476193e-05, + "loss": 0.9934, + "step": 203190 + }, + { + "epoch": 1.2981868826904157, + "grad_norm": 0.7368289232254028, + "learning_rate": 2.7462825367109574e-05, + "loss": 0.9137, + "step": 203200 + }, + { + "epoch": 1.2982507698401544, + "grad_norm": 0.7954838275909424, + "learning_rate": 2.745834642642609e-05, + "loss": 0.7045, + "step": 203210 + }, + { + "epoch": 1.2983146569898931, + "grad_norm": 0.7497609853744507, + "learning_rate": 2.7453867712756542e-05, + "loss": 1.1313, + "step": 203220 + }, + { + "epoch": 1.2983785441396318, + "grad_norm": 1.072345495223999, + "learning_rate": 
2.7449389226146066e-05, + "loss": 0.8416, + "step": 203230 + }, + { + "epoch": 1.2984424312893705, + "grad_norm": 0.7450870871543884, + "learning_rate": 2.7444910966639735e-05, + "loss": 0.8321, + "step": 203240 + }, + { + "epoch": 1.298506318439109, + "grad_norm": 1.326153039932251, + "learning_rate": 2.7440432934282677e-05, + "loss": 0.9216, + "step": 203250 + }, + { + "epoch": 1.298570205588848, + "grad_norm": 1.1570442914962769, + "learning_rate": 2.7435955129119984e-05, + "loss": 0.8476, + "step": 203260 + }, + { + "epoch": 1.2986340927385864, + "grad_norm": 1.0663647651672363, + "learning_rate": 2.7431477551196716e-05, + "loss": 0.9252, + "step": 203270 + }, + { + "epoch": 1.2986979798883254, + "grad_norm": 0.7784600257873535, + "learning_rate": 2.742700020055802e-05, + "loss": 0.9039, + "step": 203280 + }, + { + "epoch": 1.2987618670380638, + "grad_norm": 1.4422059059143066, + "learning_rate": 2.742252307724893e-05, + "loss": 0.875, + "step": 203290 + }, + { + "epoch": 1.2988257541878028, + "grad_norm": 1.8490077257156372, + "learning_rate": 2.7418046181314595e-05, + "loss": 1.1439, + "step": 203300 + }, + { + "epoch": 1.2988896413375413, + "grad_norm": 0.5135200619697571, + "learning_rate": 2.7413569512800048e-05, + "loss": 1.1235, + "step": 203310 + }, + { + "epoch": 1.2989535284872802, + "grad_norm": 1.370787501335144, + "learning_rate": 2.7409093071750415e-05, + "loss": 0.7495, + "step": 203320 + }, + { + "epoch": 1.2990174156370187, + "grad_norm": 1.1378347873687744, + "learning_rate": 2.7404616858210742e-05, + "loss": 0.7128, + "step": 203330 + }, + { + "epoch": 1.2990813027867576, + "grad_norm": 0.702448308467865, + "learning_rate": 2.7400140872226145e-05, + "loss": 0.6843, + "step": 203340 + }, + { + "epoch": 1.299145189936496, + "grad_norm": 0.7693853378295898, + "learning_rate": 2.7395665113841655e-05, + "loss": 1.0961, + "step": 203350 + }, + { + "epoch": 1.299209077086235, + "grad_norm": 0.9960201978683472, + "learning_rate": 
2.7391189583102393e-05, + "loss": 1.2116, + "step": 203360 + }, + { + "epoch": 1.2992729642359735, + "grad_norm": 1.000061273574829, + "learning_rate": 2.73867142800534e-05, + "loss": 1.019, + "step": 203370 + }, + { + "epoch": 1.2993368513857122, + "grad_norm": 0.744613766670227, + "learning_rate": 2.7382239204739752e-05, + "loss": 0.8507, + "step": 203380 + }, + { + "epoch": 1.299400738535451, + "grad_norm": 1.8533433675765991, + "learning_rate": 2.7377764357206542e-05, + "loss": 0.6657, + "step": 203390 + }, + { + "epoch": 1.2994646256851896, + "grad_norm": 0.7121914029121399, + "learning_rate": 2.7373289737498798e-05, + "loss": 1.226, + "step": 203400 + }, + { + "epoch": 1.2995285128349283, + "grad_norm": 0.8955390453338623, + "learning_rate": 2.736881534566162e-05, + "loss": 0.8282, + "step": 203410 + }, + { + "epoch": 1.299592399984667, + "grad_norm": 1.5612226724624634, + "learning_rate": 2.7364341181740027e-05, + "loss": 0.7724, + "step": 203420 + }, + { + "epoch": 1.2996562871344057, + "grad_norm": 0.8913554549217224, + "learning_rate": 2.7359867245779124e-05, + "loss": 0.7936, + "step": 203430 + }, + { + "epoch": 1.2997201742841444, + "grad_norm": 0.7717186808586121, + "learning_rate": 2.7355393537823925e-05, + "loss": 1.1769, + "step": 203440 + }, + { + "epoch": 1.2997840614338831, + "grad_norm": 1.203664779663086, + "learning_rate": 2.7350920057919527e-05, + "loss": 0.7913, + "step": 203450 + }, + { + "epoch": 1.2998479485836218, + "grad_norm": 1.1021778583526611, + "learning_rate": 2.7346446806110938e-05, + "loss": 0.7538, + "step": 203460 + }, + { + "epoch": 1.2999118357333606, + "grad_norm": 1.160457730293274, + "learning_rate": 2.7341973782443242e-05, + "loss": 0.7314, + "step": 203470 + }, + { + "epoch": 1.2999757228830993, + "grad_norm": 0.8621507287025452, + "learning_rate": 2.7337500986961452e-05, + "loss": 0.7591, + "step": 203480 + }, + { + "epoch": 1.300039610032838, + "grad_norm": 0.8076770901679993, + "learning_rate": 
2.7333028419710654e-05, + "loss": 0.8353, + "step": 203490 + }, + { + "epoch": 1.3001034971825767, + "grad_norm": 0.6418294310569763, + "learning_rate": 2.732855608073585e-05, + "loss": 0.8851, + "step": 203500 + }, + { + "epoch": 1.3001673843323154, + "grad_norm": 0.958096444606781, + "learning_rate": 2.7324083970082127e-05, + "loss": 0.8157, + "step": 203510 + }, + { + "epoch": 1.300231271482054, + "grad_norm": 0.9913421869277954, + "learning_rate": 2.7319612087794465e-05, + "loss": 0.7058, + "step": 203520 + }, + { + "epoch": 1.3002951586317928, + "grad_norm": 0.5462644100189209, + "learning_rate": 2.731514043391795e-05, + "loss": 0.8737, + "step": 203530 + }, + { + "epoch": 1.3003590457815315, + "grad_norm": 0.8152220845222473, + "learning_rate": 2.7310669008497585e-05, + "loss": 0.9175, + "step": 203540 + }, + { + "epoch": 1.3004229329312702, + "grad_norm": 0.9578920602798462, + "learning_rate": 2.7306197811578426e-05, + "loss": 0.9223, + "step": 203550 + }, + { + "epoch": 1.300486820081009, + "grad_norm": 0.9416980743408203, + "learning_rate": 2.730172684320547e-05, + "loss": 0.7785, + "step": 203560 + }, + { + "epoch": 1.3005507072307476, + "grad_norm": 1.7427188158035278, + "learning_rate": 2.7297256103423763e-05, + "loss": 0.7719, + "step": 203570 + }, + { + "epoch": 1.3006145943804863, + "grad_norm": 0.680845320224762, + "learning_rate": 2.729278559227835e-05, + "loss": 0.6889, + "step": 203580 + }, + { + "epoch": 1.300678481530225, + "grad_norm": 0.7768273949623108, + "learning_rate": 2.7288315309814205e-05, + "loss": 1.0102, + "step": 203590 + }, + { + "epoch": 1.3007423686799637, + "grad_norm": 1.1281657218933105, + "learning_rate": 2.728384525607639e-05, + "loss": 0.8399, + "step": 203600 + }, + { + "epoch": 1.3008062558297024, + "grad_norm": 1.1821376085281372, + "learning_rate": 2.7279375431109894e-05, + "loss": 0.8216, + "step": 203610 + }, + { + "epoch": 1.3008701429794411, + "grad_norm": 0.8288279175758362, + "learning_rate": 
2.7274905834959762e-05, + "loss": 0.7453, + "step": 203620 + }, + { + "epoch": 1.3009340301291799, + "grad_norm": 0.7814729809761047, + "learning_rate": 2.7270436467670967e-05, + "loss": 0.9919, + "step": 203630 + }, + { + "epoch": 1.3009979172789186, + "grad_norm": 0.95287024974823, + "learning_rate": 2.7265967329288557e-05, + "loss": 0.8718, + "step": 203640 + }, + { + "epoch": 1.3010618044286573, + "grad_norm": 1.3219366073608398, + "learning_rate": 2.7261498419857513e-05, + "loss": 1.092, + "step": 203650 + }, + { + "epoch": 1.301125691578396, + "grad_norm": 0.7606185078620911, + "learning_rate": 2.7257029739422855e-05, + "loss": 0.995, + "step": 203660 + }, + { + "epoch": 1.3011895787281347, + "grad_norm": 1.000482201576233, + "learning_rate": 2.7252561288029577e-05, + "loss": 0.8025, + "step": 203670 + }, + { + "epoch": 1.3012534658778734, + "grad_norm": 1.1020547151565552, + "learning_rate": 2.72480930657227e-05, + "loss": 0.7672, + "step": 203680 + }, + { + "epoch": 1.301317353027612, + "grad_norm": 1.0821468830108643, + "learning_rate": 2.7243625072547196e-05, + "loss": 0.6712, + "step": 203690 + }, + { + "epoch": 1.3013812401773508, + "grad_norm": 0.9761326313018799, + "learning_rate": 2.7239157308548093e-05, + "loss": 0.96, + "step": 203700 + }, + { + "epoch": 1.3014451273270895, + "grad_norm": 0.9703685641288757, + "learning_rate": 2.723468977377034e-05, + "loss": 0.6992, + "step": 203710 + }, + { + "epoch": 1.3015090144768282, + "grad_norm": 0.81026691198349, + "learning_rate": 2.7230222468258982e-05, + "loss": 0.8019, + "step": 203720 + }, + { + "epoch": 1.301572901626567, + "grad_norm": 1.1991046667099, + "learning_rate": 2.7225755392058977e-05, + "loss": 1.0236, + "step": 203730 + }, + { + "epoch": 1.3016367887763054, + "grad_norm": 0.9381265044212341, + "learning_rate": 2.72212885452153e-05, + "loss": 0.6681, + "step": 203740 + }, + { + "epoch": 1.3017006759260443, + "grad_norm": 0.7405054569244385, + "learning_rate": 2.7216821927772972e-05, + 
"loss": 0.8605, + "step": 203750 + }, + { + "epoch": 1.3017645630757828, + "grad_norm": 1.3777233362197876, + "learning_rate": 2.7212355539776947e-05, + "loss": 1.4742, + "step": 203760 + }, + { + "epoch": 1.3018284502255217, + "grad_norm": 1.1950933933258057, + "learning_rate": 2.7207889381272224e-05, + "loss": 0.794, + "step": 203770 + }, + { + "epoch": 1.3018923373752602, + "grad_norm": 0.6299750804901123, + "learning_rate": 2.720342345230376e-05, + "loss": 0.6164, + "step": 203780 + }, + { + "epoch": 1.3019562245249992, + "grad_norm": 0.9684973955154419, + "learning_rate": 2.7198957752916566e-05, + "loss": 1.3586, + "step": 203790 + }, + { + "epoch": 1.3020201116747376, + "grad_norm": 0.8507335782051086, + "learning_rate": 2.7194492283155566e-05, + "loss": 0.9001, + "step": 203800 + }, + { + "epoch": 1.3020839988244766, + "grad_norm": 1.953037977218628, + "learning_rate": 2.719002704306578e-05, + "loss": 1.129, + "step": 203810 + }, + { + "epoch": 1.302147885974215, + "grad_norm": 0.7484040260314941, + "learning_rate": 2.7185562032692137e-05, + "loss": 1.016, + "step": 203820 + }, + { + "epoch": 1.302211773123954, + "grad_norm": 1.1943165063858032, + "learning_rate": 2.7181097252079636e-05, + "loss": 0.9488, + "step": 203830 + }, + { + "epoch": 1.3022756602736925, + "grad_norm": 1.1450726985931396, + "learning_rate": 2.7176632701273215e-05, + "loss": 0.8312, + "step": 203840 + }, + { + "epoch": 1.3023395474234314, + "grad_norm": 0.670720636844635, + "learning_rate": 2.7172168380317864e-05, + "loss": 0.9008, + "step": 203850 + }, + { + "epoch": 1.3024034345731699, + "grad_norm": 1.0023084878921509, + "learning_rate": 2.7167704289258504e-05, + "loss": 0.8328, + "step": 203860 + }, + { + "epoch": 1.3024673217229086, + "grad_norm": 0.6697094440460205, + "learning_rate": 2.7163240428140136e-05, + "loss": 0.6307, + "step": 203870 + }, + { + "epoch": 1.3025312088726473, + "grad_norm": 0.6451807022094727, + "learning_rate": 2.715877679700768e-05, + "loss": 0.7335, + 
"step": 203880 + }, + { + "epoch": 1.302595096022386, + "grad_norm": 1.0496819019317627, + "learning_rate": 2.7154313395906118e-05, + "loss": 0.7024, + "step": 203890 + }, + { + "epoch": 1.3026589831721247, + "grad_norm": 1.360062599182129, + "learning_rate": 2.714985022488036e-05, + "loss": 0.8303, + "step": 203900 + }, + { + "epoch": 1.3027228703218634, + "grad_norm": 1.0181471109390259, + "learning_rate": 2.7145387283975375e-05, + "loss": 0.9936, + "step": 203910 + }, + { + "epoch": 1.302786757471602, + "grad_norm": 0.9711088538169861, + "learning_rate": 2.7140924573236142e-05, + "loss": 0.8272, + "step": 203920 + }, + { + "epoch": 1.3028506446213408, + "grad_norm": 1.731081247329712, + "learning_rate": 2.7136462092707556e-05, + "loss": 0.9242, + "step": 203930 + }, + { + "epoch": 1.3029145317710795, + "grad_norm": 1.1637697219848633, + "learning_rate": 2.7131999842434586e-05, + "loss": 1.0044, + "step": 203940 + }, + { + "epoch": 1.3029784189208182, + "grad_norm": 0.9074548482894897, + "learning_rate": 2.7127537822462146e-05, + "loss": 0.7322, + "step": 203950 + }, + { + "epoch": 1.303042306070557, + "grad_norm": 1.384960651397705, + "learning_rate": 2.7123076032835214e-05, + "loss": 1.0899, + "step": 203960 + }, + { + "epoch": 1.3031061932202956, + "grad_norm": 1.675883412361145, + "learning_rate": 2.711861447359867e-05, + "loss": 0.9829, + "step": 203970 + }, + { + "epoch": 1.3031700803700343, + "grad_norm": 1.1326029300689697, + "learning_rate": 2.7114153144797494e-05, + "loss": 0.7603, + "step": 203980 + }, + { + "epoch": 1.303233967519773, + "grad_norm": 0.8392053842544556, + "learning_rate": 2.7109692046476576e-05, + "loss": 0.947, + "step": 203990 + }, + { + "epoch": 1.3032978546695118, + "grad_norm": 0.8956617712974548, + "learning_rate": 2.7105231178680883e-05, + "loss": 0.9789, + "step": 204000 + }, + { + "epoch": 1.3033617418192505, + "grad_norm": 0.7766060829162598, + "learning_rate": 2.71007705414553e-05, + "loss": 0.7753, + "step": 204010 + }, + { 
+ "epoch": 1.3034256289689892, + "grad_norm": 0.7224856019020081, + "learning_rate": 2.7096310134844783e-05, + "loss": 0.815, + "step": 204020 + }, + { + "epoch": 1.3034895161187279, + "grad_norm": 1.0599384307861328, + "learning_rate": 2.7091849958894223e-05, + "loss": 0.866, + "step": 204030 + }, + { + "epoch": 1.3035534032684666, + "grad_norm": 2.4030120372772217, + "learning_rate": 2.7087390013648574e-05, + "loss": 0.8354, + "step": 204040 + }, + { + "epoch": 1.3036172904182053, + "grad_norm": 1.1198670864105225, + "learning_rate": 2.7082930299152704e-05, + "loss": 0.6503, + "step": 204050 + }, + { + "epoch": 1.303681177567944, + "grad_norm": 1.2229379415512085, + "learning_rate": 2.7078470815451574e-05, + "loss": 1.0611, + "step": 204060 + }, + { + "epoch": 1.3037450647176827, + "grad_norm": 0.9799173474311829, + "learning_rate": 2.7074011562590053e-05, + "loss": 0.837, + "step": 204070 + }, + { + "epoch": 1.3038089518674214, + "grad_norm": 1.169978380203247, + "learning_rate": 2.7069552540613084e-05, + "loss": 1.0953, + "step": 204080 + }, + { + "epoch": 1.3038728390171601, + "grad_norm": 0.8275418877601624, + "learning_rate": 2.706509374956555e-05, + "loss": 0.8888, + "step": 204090 + }, + { + "epoch": 1.3039367261668988, + "grad_norm": 0.7325239777565002, + "learning_rate": 2.7060635189492355e-05, + "loss": 0.6354, + "step": 204100 + }, + { + "epoch": 1.3040006133166375, + "grad_norm": 1.0800108909606934, + "learning_rate": 2.705617686043843e-05, + "loss": 0.8502, + "step": 204110 + }, + { + "epoch": 1.3040645004663762, + "grad_norm": 1.2910447120666504, + "learning_rate": 2.705171876244864e-05, + "loss": 0.7463, + "step": 204120 + }, + { + "epoch": 1.304128387616115, + "grad_norm": 0.8230818510055542, + "learning_rate": 2.7047260895567906e-05, + "loss": 0.7637, + "step": 204130 + }, + { + "epoch": 1.3041922747658536, + "grad_norm": 1.0600754022598267, + "learning_rate": 2.70428032598411e-05, + "loss": 0.8405, + "step": 204140 + }, + { + "epoch": 
1.3042561619155923, + "grad_norm": 1.061686635017395, + "learning_rate": 2.703834585531314e-05, + "loss": 0.8679, + "step": 204150 + }, + { + "epoch": 1.304320049065331, + "grad_norm": 0.8024836778640747, + "learning_rate": 2.7033888682028885e-05, + "loss": 1.0255, + "step": 204160 + }, + { + "epoch": 1.3043839362150698, + "grad_norm": 0.7017011642456055, + "learning_rate": 2.7029431740033252e-05, + "loss": 0.5799, + "step": 204170 + }, + { + "epoch": 1.3044478233648085, + "grad_norm": 0.8407653570175171, + "learning_rate": 2.7024975029371102e-05, + "loss": 0.6548, + "step": 204180 + }, + { + "epoch": 1.3045117105145472, + "grad_norm": 1.2621287107467651, + "learning_rate": 2.702051855008735e-05, + "loss": 0.821, + "step": 204190 + }, + { + "epoch": 1.3045755976642859, + "grad_norm": 0.8817234039306641, + "learning_rate": 2.701606230222683e-05, + "loss": 0.8289, + "step": 204200 + }, + { + "epoch": 1.3046394848140246, + "grad_norm": 1.5565587282180786, + "learning_rate": 2.7011606285834467e-05, + "loss": 1.0017, + "step": 204210 + }, + { + "epoch": 1.3047033719637633, + "grad_norm": 0.8994980454444885, + "learning_rate": 2.7007150500955124e-05, + "loss": 1.0877, + "step": 204220 + }, + { + "epoch": 1.3047672591135018, + "grad_norm": 1.5029292106628418, + "learning_rate": 2.7002694947633643e-05, + "loss": 0.7565, + "step": 204230 + }, + { + "epoch": 1.3048311462632407, + "grad_norm": 0.6795468330383301, + "learning_rate": 2.6998239625914934e-05, + "loss": 0.7578, + "step": 204240 + }, + { + "epoch": 1.3048950334129792, + "grad_norm": 2.024456262588501, + "learning_rate": 2.6993784535843842e-05, + "loss": 0.8655, + "step": 204250 + }, + { + "epoch": 1.3049589205627181, + "grad_norm": 1.2158528566360474, + "learning_rate": 2.6989329677465257e-05, + "loss": 0.7881, + "step": 204260 + }, + { + "epoch": 1.3050228077124566, + "grad_norm": 1.367469310760498, + "learning_rate": 2.698487505082401e-05, + "loss": 0.8068, + "step": 204270 + }, + { + "epoch": 1.3050866948621955, 
+ "grad_norm": 0.9547830820083618, + "learning_rate": 2.6980420655965e-05, + "loss": 1.0195, + "step": 204280 + }, + { + "epoch": 1.305150582011934, + "grad_norm": 1.4500901699066162, + "learning_rate": 2.6975966492933047e-05, + "loss": 0.8647, + "step": 204290 + }, + { + "epoch": 1.305214469161673, + "grad_norm": 0.7481394410133362, + "learning_rate": 2.697151256177306e-05, + "loss": 0.9651, + "step": 204300 + }, + { + "epoch": 1.3052783563114114, + "grad_norm": 1.3653556108474731, + "learning_rate": 2.696705886252984e-05, + "loss": 0.8868, + "step": 204310 + }, + { + "epoch": 1.3053422434611504, + "grad_norm": 0.5968084335327148, + "learning_rate": 2.6962605395248276e-05, + "loss": 1.1226, + "step": 204320 + }, + { + "epoch": 1.3054061306108888, + "grad_norm": 0.7193194627761841, + "learning_rate": 2.6958152159973195e-05, + "loss": 0.8533, + "step": 204330 + }, + { + "epoch": 1.3054700177606278, + "grad_norm": 0.8077860474586487, + "learning_rate": 2.6953699156749467e-05, + "loss": 0.7623, + "step": 204340 + }, + { + "epoch": 1.3055339049103662, + "grad_norm": 1.3645210266113281, + "learning_rate": 2.6949246385621912e-05, + "loss": 0.7674, + "step": 204350 + }, + { + "epoch": 1.305597792060105, + "grad_norm": 0.9736807942390442, + "learning_rate": 2.694479384663541e-05, + "loss": 0.8348, + "step": 204360 + }, + { + "epoch": 1.3056616792098437, + "grad_norm": 0.9883267283439636, + "learning_rate": 2.694034153983475e-05, + "loss": 0.8904, + "step": 204370 + }, + { + "epoch": 1.3057255663595824, + "grad_norm": 1.2597261667251587, + "learning_rate": 2.693588946526483e-05, + "loss": 0.7717, + "step": 204380 + }, + { + "epoch": 1.305789453509321, + "grad_norm": 1.2701901197433472, + "learning_rate": 2.6931437622970434e-05, + "loss": 0.9456, + "step": 204390 + }, + { + "epoch": 1.3058533406590598, + "grad_norm": 0.84967440366745, + "learning_rate": 2.6926986012996436e-05, + "loss": 0.7379, + "step": 204400 + }, + { + "epoch": 1.3059172278087985, + "grad_norm": 
0.866371750831604, + "learning_rate": 2.692253463538763e-05, + "loss": 1.0045, + "step": 204410 + }, + { + "epoch": 1.3059811149585372, + "grad_norm": 0.6633437871932983, + "learning_rate": 2.6918083490188865e-05, + "loss": 0.8149, + "step": 204420 + }, + { + "epoch": 1.306045002108276, + "grad_norm": 1.0880591869354248, + "learning_rate": 2.6913632577444993e-05, + "loss": 0.7891, + "step": 204430 + }, + { + "epoch": 1.3061088892580146, + "grad_norm": 0.9105168581008911, + "learning_rate": 2.6909181897200785e-05, + "loss": 0.7461, + "step": 204440 + }, + { + "epoch": 1.3061727764077533, + "grad_norm": 1.9122934341430664, + "learning_rate": 2.690473144950112e-05, + "loss": 0.7552, + "step": 204450 + }, + { + "epoch": 1.306236663557492, + "grad_norm": 0.7733789682388306, + "learning_rate": 2.6900281234390763e-05, + "loss": 0.7394, + "step": 204460 + }, + { + "epoch": 1.3063005507072307, + "grad_norm": 0.8355092406272888, + "learning_rate": 2.6895831251914584e-05, + "loss": 0.8982, + "step": 204470 + }, + { + "epoch": 1.3063644378569694, + "grad_norm": 0.7982395887374878, + "learning_rate": 2.6891381502117346e-05, + "loss": 0.7494, + "step": 204480 + }, + { + "epoch": 1.3064283250067081, + "grad_norm": 1.543968677520752, + "learning_rate": 2.688693198504391e-05, + "loss": 0.8552, + "step": 204490 + }, + { + "epoch": 1.3064922121564468, + "grad_norm": 0.8467128276824951, + "learning_rate": 2.6882482700739047e-05, + "loss": 0.7735, + "step": 204500 + }, + { + "epoch": 1.3065560993061855, + "grad_norm": 0.7143219709396362, + "learning_rate": 2.6878033649247603e-05, + "loss": 0.9257, + "step": 204510 + }, + { + "epoch": 1.3066199864559243, + "grad_norm": 0.6512458324432373, + "learning_rate": 2.687358483061434e-05, + "loss": 0.886, + "step": 204520 + }, + { + "epoch": 1.306683873605663, + "grad_norm": 1.0182723999023438, + "learning_rate": 2.6869136244884108e-05, + "loss": 0.7258, + "step": 204530 + }, + { + "epoch": 1.3067477607554017, + "grad_norm": 0.6728656888008118, 
+ "learning_rate": 2.6864687892101658e-05, + "loss": 0.8172, + "step": 204540 + }, + { + "epoch": 1.3068116479051404, + "grad_norm": 3.287576675415039, + "learning_rate": 2.6860239772311845e-05, + "loss": 0.6821, + "step": 204550 + }, + { + "epoch": 1.306875535054879, + "grad_norm": 0.7472635507583618, + "learning_rate": 2.685579188555941e-05, + "loss": 0.9219, + "step": 204560 + }, + { + "epoch": 1.3069394222046178, + "grad_norm": 0.9223730564117432, + "learning_rate": 2.6851344231889196e-05, + "loss": 0.7348, + "step": 204570 + }, + { + "epoch": 1.3070033093543565, + "grad_norm": 0.9666236042976379, + "learning_rate": 2.6846896811345956e-05, + "loss": 1.095, + "step": 204580 + }, + { + "epoch": 1.3070671965040952, + "grad_norm": 0.946058452129364, + "learning_rate": 2.6842449623974508e-05, + "loss": 0.8144, + "step": 204590 + }, + { + "epoch": 1.307131083653834, + "grad_norm": 0.9167945981025696, + "learning_rate": 2.6838002669819616e-05, + "loss": 1.1801, + "step": 204600 + }, + { + "epoch": 1.3071949708035726, + "grad_norm": 1.565016746520996, + "learning_rate": 2.6833555948926088e-05, + "loss": 0.8639, + "step": 204610 + }, + { + "epoch": 1.3072588579533113, + "grad_norm": 1.3720842599868774, + "learning_rate": 2.6829109461338675e-05, + "loss": 0.6832, + "step": 204620 + }, + { + "epoch": 1.30732274510305, + "grad_norm": 0.6299474835395813, + "learning_rate": 2.6824663207102174e-05, + "loss": 0.6936, + "step": 204630 + }, + { + "epoch": 1.3073866322527887, + "grad_norm": 5.6102752685546875, + "learning_rate": 2.6820217186261387e-05, + "loss": 0.8758, + "step": 204640 + }, + { + "epoch": 1.3074505194025274, + "grad_norm": 0.8517172336578369, + "learning_rate": 2.6815771398861044e-05, + "loss": 0.8169, + "step": 204650 + }, + { + "epoch": 1.3075144065522661, + "grad_norm": 0.85049968957901, + "learning_rate": 2.6811325844945957e-05, + "loss": 0.8638, + "step": 204660 + }, + { + "epoch": 1.3075782937020048, + "grad_norm": 1.035893201828003, + "learning_rate": 
2.680688052456086e-05, + "loss": 0.7654, + "step": 204670 + }, + { + "epoch": 1.3076421808517436, + "grad_norm": 1.053261399269104, + "learning_rate": 2.6802435437750573e-05, + "loss": 0.8929, + "step": 204680 + }, + { + "epoch": 1.3077060680014823, + "grad_norm": 0.8119083046913147, + "learning_rate": 2.6797990584559794e-05, + "loss": 0.825, + "step": 204690 + }, + { + "epoch": 1.307769955151221, + "grad_norm": 1.3222981691360474, + "learning_rate": 2.6793545965033375e-05, + "loss": 0.7915, + "step": 204700 + }, + { + "epoch": 1.3078338423009597, + "grad_norm": 1.0780036449432373, + "learning_rate": 2.678910157921598e-05, + "loss": 0.7689, + "step": 204710 + }, + { + "epoch": 1.3078977294506982, + "grad_norm": 0.8080828785896301, + "learning_rate": 2.6784657427152427e-05, + "loss": 0.8974, + "step": 204720 + }, + { + "epoch": 1.307961616600437, + "grad_norm": 1.2234289646148682, + "learning_rate": 2.678021350888744e-05, + "loss": 0.7415, + "step": 204730 + }, + { + "epoch": 1.3080255037501756, + "grad_norm": 2.8002803325653076, + "learning_rate": 2.67757698244658e-05, + "loss": 0.8183, + "step": 204740 + }, + { + "epoch": 1.3080893908999145, + "grad_norm": 1.115430474281311, + "learning_rate": 2.677132637393226e-05, + "loss": 1.0415, + "step": 204750 + }, + { + "epoch": 1.308153278049653, + "grad_norm": 1.0875638723373413, + "learning_rate": 2.6766883157331545e-05, + "loss": 0.8105, + "step": 204760 + }, + { + "epoch": 1.308217165199392, + "grad_norm": 1.2501674890518188, + "learning_rate": 2.6762440174708432e-05, + "loss": 0.8226, + "step": 204770 + }, + { + "epoch": 1.3082810523491304, + "grad_norm": 0.7301515340805054, + "learning_rate": 2.6757997426107627e-05, + "loss": 0.8192, + "step": 204780 + }, + { + "epoch": 1.3083449394988693, + "grad_norm": 1.4526420831680298, + "learning_rate": 2.675355491157392e-05, + "loss": 0.9808, + "step": 204790 + }, + { + "epoch": 1.3084088266486078, + "grad_norm": 1.0374690294265747, + "learning_rate": 2.6749112631152005e-05, 
+ "loss": 0.8933, + "step": 204800 + }, + { + "epoch": 1.3084727137983467, + "grad_norm": 1.1937415599822998, + "learning_rate": 2.6744670584886665e-05, + "loss": 0.6994, + "step": 204810 + }, + { + "epoch": 1.3085366009480852, + "grad_norm": 0.8651727437973022, + "learning_rate": 2.6740228772822583e-05, + "loss": 0.8431, + "step": 204820 + }, + { + "epoch": 1.308600488097824, + "grad_norm": 0.766620397567749, + "learning_rate": 2.6735787195004545e-05, + "loss": 0.9116, + "step": 204830 + }, + { + "epoch": 1.3086643752475626, + "grad_norm": 2.3932688236236572, + "learning_rate": 2.6731345851477237e-05, + "loss": 0.937, + "step": 204840 + }, + { + "epoch": 1.3087282623973013, + "grad_norm": 0.8344107866287231, + "learning_rate": 2.6726904742285425e-05, + "loss": 0.7876, + "step": 204850 + }, + { + "epoch": 1.30879214954704, + "grad_norm": 0.6391538381576538, + "learning_rate": 2.6722463867473796e-05, + "loss": 1.1981, + "step": 204860 + }, + { + "epoch": 1.3088560366967787, + "grad_norm": 0.9423249363899231, + "learning_rate": 2.6718023227087114e-05, + "loss": 1.0876, + "step": 204870 + }, + { + "epoch": 1.3089199238465175, + "grad_norm": 2.713463306427002, + "learning_rate": 2.671358282117006e-05, + "loss": 0.8284, + "step": 204880 + }, + { + "epoch": 1.3089838109962562, + "grad_norm": 0.9338700175285339, + "learning_rate": 2.6709142649767393e-05, + "loss": 0.7454, + "step": 204890 + }, + { + "epoch": 1.3090476981459949, + "grad_norm": 0.5116655230522156, + "learning_rate": 2.6704702712923783e-05, + "loss": 0.6845, + "step": 204900 + }, + { + "epoch": 1.3091115852957336, + "grad_norm": 1.5850673913955688, + "learning_rate": 2.6700263010684e-05, + "loss": 1.094, + "step": 204910 + }, + { + "epoch": 1.3091754724454723, + "grad_norm": 0.7147300839424133, + "learning_rate": 2.6695823543092703e-05, + "loss": 0.7666, + "step": 204920 + }, + { + "epoch": 1.309239359595211, + "grad_norm": 0.9105631709098816, + "learning_rate": 2.6691384310194632e-05, + "loss": 1.0769, + 
"step": 204930 + }, + { + "epoch": 1.3093032467449497, + "grad_norm": 0.7940059304237366, + "learning_rate": 2.6686945312034483e-05, + "loss": 0.7876, + "step": 204940 + }, + { + "epoch": 1.3093671338946884, + "grad_norm": 0.9063543081283569, + "learning_rate": 2.6682506548656956e-05, + "loss": 0.8683, + "step": 204950 + }, + { + "epoch": 1.309431021044427, + "grad_norm": 0.8429856300354004, + "learning_rate": 2.6678068020106777e-05, + "loss": 0.8658, + "step": 204960 + }, + { + "epoch": 1.3094949081941658, + "grad_norm": 0.8013402819633484, + "learning_rate": 2.6673629726428616e-05, + "loss": 1.0001, + "step": 204970 + }, + { + "epoch": 1.3095587953439045, + "grad_norm": 1.4775573015213013, + "learning_rate": 2.66691916676672e-05, + "loss": 0.6973, + "step": 204980 + }, + { + "epoch": 1.3096226824936432, + "grad_norm": 0.7938587665557861, + "learning_rate": 2.6664753843867186e-05, + "loss": 0.8371, + "step": 204990 + }, + { + "epoch": 1.309686569643382, + "grad_norm": 1.0357427597045898, + "learning_rate": 2.6660316255073313e-05, + "loss": 0.6869, + "step": 205000 + }, + { + "epoch": 1.3097504567931206, + "grad_norm": 1.279012680053711, + "learning_rate": 2.6655878901330222e-05, + "loss": 0.9997, + "step": 205010 + }, + { + "epoch": 1.3098143439428593, + "grad_norm": 1.9539459943771362, + "learning_rate": 2.665144178268265e-05, + "loss": 0.7651, + "step": 205020 + }, + { + "epoch": 1.309878231092598, + "grad_norm": 0.8567860126495361, + "learning_rate": 2.6647004899175244e-05, + "loss": 0.8133, + "step": 205030 + }, + { + "epoch": 1.3099421182423368, + "grad_norm": 0.858010470867157, + "learning_rate": 2.664256825085271e-05, + "loss": 0.9452, + "step": 205040 + }, + { + "epoch": 1.3100060053920755, + "grad_norm": 1.1622644662857056, + "learning_rate": 2.6638131837759705e-05, + "loss": 0.7909, + "step": 205050 + }, + { + "epoch": 1.3100698925418142, + "grad_norm": 1.1447198390960693, + "learning_rate": 2.663369565994095e-05, + "loss": 1.0433, + "step": 205060 + }, 
+ { + "epoch": 1.3101337796915529, + "grad_norm": 1.075270175933838, + "learning_rate": 2.6629259717441067e-05, + "loss": 0.9877, + "step": 205070 + }, + { + "epoch": 1.3101976668412916, + "grad_norm": 0.7950900197029114, + "learning_rate": 2.6624824010304778e-05, + "loss": 0.7474, + "step": 205080 + }, + { + "epoch": 1.3102615539910303, + "grad_norm": 0.9394268989562988, + "learning_rate": 2.662038853857671e-05, + "loss": 0.8652, + "step": 205090 + }, + { + "epoch": 1.310325441140769, + "grad_norm": 0.8892121911048889, + "learning_rate": 2.661595330230159e-05, + "loss": 0.628, + "step": 205100 + }, + { + "epoch": 1.3103893282905077, + "grad_norm": 1.6581279039382935, + "learning_rate": 2.6611518301524017e-05, + "loss": 1.0087, + "step": 205110 + }, + { + "epoch": 1.3104532154402464, + "grad_norm": 1.5723330974578857, + "learning_rate": 2.6607083536288714e-05, + "loss": 0.8922, + "step": 205120 + }, + { + "epoch": 1.310517102589985, + "grad_norm": 1.2891085147857666, + "learning_rate": 2.6602649006640297e-05, + "loss": 1.057, + "step": 205130 + }, + { + "epoch": 1.3105809897397238, + "grad_norm": 0.9596815705299377, + "learning_rate": 2.6598214712623466e-05, + "loss": 0.9955, + "step": 205140 + }, + { + "epoch": 1.3106448768894625, + "grad_norm": 1.300654649734497, + "learning_rate": 2.6593780654282836e-05, + "loss": 0.6487, + "step": 205150 + }, + { + "epoch": 1.3107087640392012, + "grad_norm": 0.7440917491912842, + "learning_rate": 2.6589346831663086e-05, + "loss": 1.0035, + "step": 205160 + }, + { + "epoch": 1.31077265118894, + "grad_norm": 1.055013656616211, + "learning_rate": 2.6584913244808886e-05, + "loss": 0.8536, + "step": 205170 + }, + { + "epoch": 1.3108365383386786, + "grad_norm": 0.8633022308349609, + "learning_rate": 2.6580479893764842e-05, + "loss": 0.8186, + "step": 205180 + }, + { + "epoch": 1.3109004254884173, + "grad_norm": 0.9468743801116943, + "learning_rate": 2.6576046778575673e-05, + "loss": 1.0472, + "step": 205190 + }, + { + "epoch": 
1.310964312638156, + "grad_norm": 1.479753851890564, + "learning_rate": 2.657161389928594e-05, + "loss": 1.029, + "step": 205200 + }, + { + "epoch": 1.3110281997878945, + "grad_norm": 0.9764959812164307, + "learning_rate": 2.6567181255940333e-05, + "loss": 0.7093, + "step": 205210 + }, + { + "epoch": 1.3110920869376335, + "grad_norm": 0.9881764054298401, + "learning_rate": 2.6563192078698378e-05, + "loss": 1.1946, + "step": 205220 + }, + { + "epoch": 1.311155974087372, + "grad_norm": 0.648379385471344, + "learning_rate": 2.655875988376958e-05, + "loss": 1.0451, + "step": 205230 + }, + { + "epoch": 1.3112198612371109, + "grad_norm": 0.9674725532531738, + "learning_rate": 2.6554327924914335e-05, + "loss": 1.0126, + "step": 205240 + }, + { + "epoch": 1.3112837483868494, + "grad_norm": 0.6536328792572021, + "learning_rate": 2.6549896202177305e-05, + "loss": 0.7331, + "step": 205250 + }, + { + "epoch": 1.3113476355365883, + "grad_norm": 0.9326198101043701, + "learning_rate": 2.654546471560309e-05, + "loss": 0.8201, + "step": 205260 + }, + { + "epoch": 1.3114115226863268, + "grad_norm": 0.8836464285850525, + "learning_rate": 2.6541033465236335e-05, + "loss": 0.7147, + "step": 205270 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.9043899178504944, + "learning_rate": 2.653660245112169e-05, + "loss": 0.8064, + "step": 205280 + }, + { + "epoch": 1.3115392969858042, + "grad_norm": 0.5837283134460449, + "learning_rate": 2.6532171673303736e-05, + "loss": 1.2369, + "step": 205290 + }, + { + "epoch": 1.311603184135543, + "grad_norm": 1.130934238433838, + "learning_rate": 2.652774113182713e-05, + "loss": 0.8727, + "step": 205300 + }, + { + "epoch": 1.3116670712852816, + "grad_norm": 1.0953987836837769, + "learning_rate": 2.652331082673647e-05, + "loss": 1.0385, + "step": 205310 + }, + { + "epoch": 1.3117309584350203, + "grad_norm": 0.9843935966491699, + "learning_rate": 2.651888075807639e-05, + "loss": 0.8724, + "step": 205320 + }, + { + "epoch": 1.311794845584759, + 
"grad_norm": 1.0219281911849976, + "learning_rate": 2.6514450925891476e-05, + "loss": 0.8231, + "step": 205330 + }, + { + "epoch": 1.3118587327344977, + "grad_norm": 0.6493175625801086, + "learning_rate": 2.65100213302264e-05, + "loss": 1.2553, + "step": 205340 + }, + { + "epoch": 1.3119226198842364, + "grad_norm": 1.3615765571594238, + "learning_rate": 2.6505591971125694e-05, + "loss": 1.0284, + "step": 205350 + }, + { + "epoch": 1.3119865070339751, + "grad_norm": 1.0743480920791626, + "learning_rate": 2.650116284863402e-05, + "loss": 0.8086, + "step": 205360 + }, + { + "epoch": 1.3120503941837138, + "grad_norm": 0.7754031419754028, + "learning_rate": 2.6496733962795944e-05, + "loss": 0.8282, + "step": 205370 + }, + { + "epoch": 1.3121142813334525, + "grad_norm": 1.0241520404815674, + "learning_rate": 2.6492305313656108e-05, + "loss": 1.0423, + "step": 205380 + }, + { + "epoch": 1.3121781684831912, + "grad_norm": 0.8662487864494324, + "learning_rate": 2.648787690125908e-05, + "loss": 0.9171, + "step": 205390 + }, + { + "epoch": 1.31224205563293, + "grad_norm": 1.0339150428771973, + "learning_rate": 2.6483448725649473e-05, + "loss": 0.9033, + "step": 205400 + }, + { + "epoch": 1.3123059427826687, + "grad_norm": 0.7531540393829346, + "learning_rate": 2.64790207868719e-05, + "loss": 0.8135, + "step": 205410 + }, + { + "epoch": 1.3123698299324074, + "grad_norm": 0.8095221519470215, + "learning_rate": 2.6474593084970913e-05, + "loss": 0.7887, + "step": 205420 + }, + { + "epoch": 1.312433717082146, + "grad_norm": 1.9569956064224243, + "learning_rate": 2.647016561999115e-05, + "loss": 0.6692, + "step": 205430 + }, + { + "epoch": 1.3124976042318848, + "grad_norm": 1.539527177810669, + "learning_rate": 2.6465738391977158e-05, + "loss": 1.0463, + "step": 205440 + }, + { + "epoch": 1.3125614913816235, + "grad_norm": 0.9961333870887756, + "learning_rate": 2.6461311400973553e-05, + "loss": 0.868, + "step": 205450 + }, + { + "epoch": 1.3126253785313622, + "grad_norm": 
0.5948057174682617, + "learning_rate": 2.6456884647024894e-05, + "loss": 0.6536, + "step": 205460 + }, + { + "epoch": 1.312689265681101, + "grad_norm": 1.2141871452331543, + "learning_rate": 2.6452458130175784e-05, + "loss": 0.9659, + "step": 205470 + }, + { + "epoch": 1.3127531528308396, + "grad_norm": 0.9608542919158936, + "learning_rate": 2.644803185047078e-05, + "loss": 0.7377, + "step": 205480 + }, + { + "epoch": 1.3128170399805783, + "grad_norm": 1.4741836786270142, + "learning_rate": 2.6443605807954492e-05, + "loss": 0.9431, + "step": 205490 + }, + { + "epoch": 1.312880927130317, + "grad_norm": 1.3734500408172607, + "learning_rate": 2.6439180002671448e-05, + "loss": 0.8823, + "step": 205500 + }, + { + "epoch": 1.3129448142800557, + "grad_norm": 1.110561490058899, + "learning_rate": 2.6434754434666264e-05, + "loss": 0.8809, + "step": 205510 + }, + { + "epoch": 1.3130087014297944, + "grad_norm": 0.6751860976219177, + "learning_rate": 2.6430329103983475e-05, + "loss": 0.9978, + "step": 205520 + }, + { + "epoch": 1.3130725885795331, + "grad_norm": 0.9452330470085144, + "learning_rate": 2.642590401066768e-05, + "loss": 0.893, + "step": 205530 + }, + { + "epoch": 1.3131364757292718, + "grad_norm": 1.2708438634872437, + "learning_rate": 2.6421479154763406e-05, + "loss": 0.8002, + "step": 205540 + }, + { + "epoch": 1.3132003628790105, + "grad_norm": 1.1103713512420654, + "learning_rate": 2.6417054536315254e-05, + "loss": 0.832, + "step": 205550 + }, + { + "epoch": 1.3132642500287492, + "grad_norm": 1.1159241199493408, + "learning_rate": 2.6412630155367747e-05, + "loss": 0.9068, + "step": 205560 + }, + { + "epoch": 1.313328137178488, + "grad_norm": 1.3691654205322266, + "learning_rate": 2.6408206011965474e-05, + "loss": 0.8826, + "step": 205570 + }, + { + "epoch": 1.3133920243282267, + "grad_norm": 0.7377268075942993, + "learning_rate": 2.6403782106152964e-05, + "loss": 0.774, + "step": 205580 + }, + { + "epoch": 1.3134559114779654, + "grad_norm": 1.0901238918304443, 
+ "learning_rate": 2.6399358437974776e-05, + "loss": 0.798, + "step": 205590 + }, + { + "epoch": 1.313519798627704, + "grad_norm": 0.6787892580032349, + "learning_rate": 2.6394935007475492e-05, + "loss": 0.8506, + "step": 205600 + }, + { + "epoch": 1.3135836857774428, + "grad_norm": 0.7032554149627686, + "learning_rate": 2.639051181469961e-05, + "loss": 0.8875, + "step": 205610 + }, + { + "epoch": 1.3136475729271815, + "grad_norm": 0.7459577918052673, + "learning_rate": 2.6386088859691714e-05, + "loss": 0.9456, + "step": 205620 + }, + { + "epoch": 1.3137114600769202, + "grad_norm": 1.2270710468292236, + "learning_rate": 2.6381666142496324e-05, + "loss": 0.8654, + "step": 205630 + }, + { + "epoch": 1.313775347226659, + "grad_norm": 1.247017741203308, + "learning_rate": 2.6377243663158e-05, + "loss": 0.9573, + "step": 205640 + }, + { + "epoch": 1.3138392343763976, + "grad_norm": 1.0125858783721924, + "learning_rate": 2.6372821421721255e-05, + "loss": 0.9125, + "step": 205650 + }, + { + "epoch": 1.3139031215261363, + "grad_norm": 0.7184951901435852, + "learning_rate": 2.6368399418230656e-05, + "loss": 1.0898, + "step": 205660 + }, + { + "epoch": 1.313967008675875, + "grad_norm": 0.9848329424858093, + "learning_rate": 2.6363977652730698e-05, + "loss": 1.1097, + "step": 205670 + }, + { + "epoch": 1.3140308958256135, + "grad_norm": 1.3619470596313477, + "learning_rate": 2.6359556125265948e-05, + "loss": 0.9542, + "step": 205680 + }, + { + "epoch": 1.3140947829753524, + "grad_norm": 1.8601772785186768, + "learning_rate": 2.6355134835880912e-05, + "loss": 1.1239, + "step": 205690 + }, + { + "epoch": 1.314158670125091, + "grad_norm": 1.0715014934539795, + "learning_rate": 2.6350713784620135e-05, + "loss": 0.8293, + "step": 205700 + }, + { + "epoch": 1.3142225572748298, + "grad_norm": 1.2173364162445068, + "learning_rate": 2.634629297152811e-05, + "loss": 0.6619, + "step": 205710 + }, + { + "epoch": 1.3142864444245683, + "grad_norm": 0.6807637214660645, + "learning_rate": 
2.6341872396649404e-05, + "loss": 0.8755, + "step": 205720 + }, + { + "epoch": 1.3143503315743073, + "grad_norm": 1.259724736213684, + "learning_rate": 2.6337452060028485e-05, + "loss": 0.9458, + "step": 205730 + }, + { + "epoch": 1.3144142187240457, + "grad_norm": 3.6456003189086914, + "learning_rate": 2.6333031961709914e-05, + "loss": 0.9435, + "step": 205740 + }, + { + "epoch": 1.3144781058737847, + "grad_norm": 1.5074976682662964, + "learning_rate": 2.632861210173817e-05, + "loss": 0.8298, + "step": 205750 + }, + { + "epoch": 1.3145419930235231, + "grad_norm": 1.0483545064926147, + "learning_rate": 2.6324192480157793e-05, + "loss": 0.8742, + "step": 205760 + }, + { + "epoch": 1.314605880173262, + "grad_norm": 0.7130438089370728, + "learning_rate": 2.6319773097013266e-05, + "loss": 1.0824, + "step": 205770 + }, + { + "epoch": 1.3146697673230006, + "grad_norm": 0.7423893809318542, + "learning_rate": 2.631535395234913e-05, + "loss": 0.9463, + "step": 205780 + }, + { + "epoch": 1.3147336544727395, + "grad_norm": 0.800720751285553, + "learning_rate": 2.6310935046209857e-05, + "loss": 0.9104, + "step": 205790 + }, + { + "epoch": 1.314797541622478, + "grad_norm": 1.0936715602874756, + "learning_rate": 2.6306516378639957e-05, + "loss": 0.7183, + "step": 205800 + }, + { + "epoch": 1.3148614287722167, + "grad_norm": 1.0465011596679688, + "learning_rate": 2.6302097949683947e-05, + "loss": 0.7369, + "step": 205810 + }, + { + "epoch": 1.3149253159219554, + "grad_norm": 0.7417481541633606, + "learning_rate": 2.6297679759386318e-05, + "loss": 0.7971, + "step": 205820 + }, + { + "epoch": 1.314989203071694, + "grad_norm": 0.9712346792221069, + "learning_rate": 2.629326180779156e-05, + "loss": 0.5761, + "step": 205830 + }, + { + "epoch": 1.3150530902214328, + "grad_norm": 1.158448576927185, + "learning_rate": 2.6288844094944142e-05, + "loss": 0.8257, + "step": 205840 + }, + { + "epoch": 1.3151169773711715, + "grad_norm": 0.7787433862686157, + "learning_rate": 
2.62844266208886e-05, + "loss": 0.7885, + "step": 205850 + }, + { + "epoch": 1.3151808645209102, + "grad_norm": 1.6316027641296387, + "learning_rate": 2.628000938566938e-05, + "loss": 0.9431, + "step": 205860 + }, + { + "epoch": 1.315244751670649, + "grad_norm": 1.5881561040878296, + "learning_rate": 2.6275592389331004e-05, + "loss": 0.5653, + "step": 205870 + }, + { + "epoch": 1.3153086388203876, + "grad_norm": 1.7636117935180664, + "learning_rate": 2.627117563191792e-05, + "loss": 0.6936, + "step": 205880 + }, + { + "epoch": 1.3153725259701263, + "grad_norm": 1.057289719581604, + "learning_rate": 2.6266759113474644e-05, + "loss": 0.6354, + "step": 205890 + }, + { + "epoch": 1.315436413119865, + "grad_norm": 1.322198510169983, + "learning_rate": 2.6262342834045617e-05, + "loss": 0.964, + "step": 205900 + }, + { + "epoch": 1.3155003002696037, + "grad_norm": 1.3690789937973022, + "learning_rate": 2.6257926793675346e-05, + "loss": 0.7329, + "step": 205910 + }, + { + "epoch": 1.3155641874193424, + "grad_norm": 0.6922479271888733, + "learning_rate": 2.625351099240828e-05, + "loss": 0.8405, + "step": 205920 + }, + { + "epoch": 1.3156280745690812, + "grad_norm": 0.8695868849754333, + "learning_rate": 2.62490954302889e-05, + "loss": 0.9644, + "step": 205930 + }, + { + "epoch": 1.3156919617188199, + "grad_norm": 1.797025442123413, + "learning_rate": 2.6244680107361695e-05, + "loss": 0.8523, + "step": 205940 + }, + { + "epoch": 1.3157558488685586, + "grad_norm": 0.9863460063934326, + "learning_rate": 2.6240265023671096e-05, + "loss": 0.754, + "step": 205950 + }, + { + "epoch": 1.3158197360182973, + "grad_norm": 1.008402943611145, + "learning_rate": 2.62358501792616e-05, + "loss": 0.8783, + "step": 205960 + }, + { + "epoch": 1.315883623168036, + "grad_norm": 3.882580518722534, + "learning_rate": 2.623143557417763e-05, + "loss": 0.753, + "step": 205970 + }, + { + "epoch": 1.3159475103177747, + "grad_norm": 1.0112920999526978, + "learning_rate": 2.622702120846369e-05, + 
"loss": 0.8311, + "step": 205980 + }, + { + "epoch": 1.3160113974675134, + "grad_norm": 1.3579504489898682, + "learning_rate": 2.6222607082164196e-05, + "loss": 1.0214, + "step": 205990 + }, + { + "epoch": 1.316075284617252, + "grad_norm": 0.6648353934288025, + "learning_rate": 2.6218193195323636e-05, + "loss": 0.9964, + "step": 206000 + }, + { + "epoch": 1.3161391717669908, + "grad_norm": 1.2353508472442627, + "learning_rate": 2.6213779547986422e-05, + "loss": 0.9256, + "step": 206010 + }, + { + "epoch": 1.3162030589167295, + "grad_norm": 0.7448508143424988, + "learning_rate": 2.6209366140197045e-05, + "loss": 1.0117, + "step": 206020 + }, + { + "epoch": 1.3162669460664682, + "grad_norm": 1.2980650663375854, + "learning_rate": 2.620495297199992e-05, + "loss": 0.8087, + "step": 206030 + }, + { + "epoch": 1.316330833216207, + "grad_norm": 1.022688388824463, + "learning_rate": 2.6200540043439516e-05, + "loss": 1.1366, + "step": 206040 + }, + { + "epoch": 1.3163947203659456, + "grad_norm": 1.5544421672821045, + "learning_rate": 2.6196127354560252e-05, + "loss": 0.7394, + "step": 206050 + }, + { + "epoch": 1.3164586075156843, + "grad_norm": 1.0281933546066284, + "learning_rate": 2.6191714905406596e-05, + "loss": 0.8466, + "step": 206060 + }, + { + "epoch": 1.316522494665423, + "grad_norm": 1.1553758382797241, + "learning_rate": 2.6187302696022942e-05, + "loss": 0.9573, + "step": 206070 + }, + { + "epoch": 1.3165863818151617, + "grad_norm": 0.6869220733642578, + "learning_rate": 2.6182890726453774e-05, + "loss": 0.9941, + "step": 206080 + }, + { + "epoch": 1.3166502689649004, + "grad_norm": 0.8922250866889954, + "learning_rate": 2.6178478996743483e-05, + "loss": 0.8216, + "step": 206090 + }, + { + "epoch": 1.3167141561146392, + "grad_norm": 1.8326857089996338, + "learning_rate": 2.6174067506936534e-05, + "loss": 0.7848, + "step": 206100 + }, + { + "epoch": 1.3167780432643779, + "grad_norm": 1.1187126636505127, + "learning_rate": 2.6169656257077323e-05, + "loss": 0.8663, 
+ "step": 206110 + }, + { + "epoch": 1.3168419304141166, + "grad_norm": 1.288608193397522, + "learning_rate": 2.6165245247210284e-05, + "loss": 0.9236, + "step": 206120 + }, + { + "epoch": 1.3169058175638553, + "grad_norm": 0.970199704170227, + "learning_rate": 2.616083447737987e-05, + "loss": 0.9909, + "step": 206130 + }, + { + "epoch": 1.316969704713594, + "grad_norm": 1.1030552387237549, + "learning_rate": 2.6156423947630448e-05, + "loss": 0.8332, + "step": 206140 + }, + { + "epoch": 1.3170335918633327, + "grad_norm": 1.9698622226715088, + "learning_rate": 2.615201365800649e-05, + "loss": 0.8564, + "step": 206150 + }, + { + "epoch": 1.3170974790130714, + "grad_norm": 0.5434449315071106, + "learning_rate": 2.6147603608552374e-05, + "loss": 0.834, + "step": 206160 + }, + { + "epoch": 1.3171613661628099, + "grad_norm": 0.7695097327232361, + "learning_rate": 2.614319379931254e-05, + "loss": 0.984, + "step": 206170 + }, + { + "epoch": 1.3172252533125488, + "grad_norm": 0.7618531584739685, + "learning_rate": 2.613878423033136e-05, + "loss": 0.8815, + "step": 206180 + }, + { + "epoch": 1.3172891404622873, + "grad_norm": 1.5356720685958862, + "learning_rate": 2.613437490165328e-05, + "loss": 0.7811, + "step": 206190 + }, + { + "epoch": 1.3173530276120262, + "grad_norm": 0.938105583190918, + "learning_rate": 2.6129965813322676e-05, + "loss": 0.8789, + "step": 206200 + }, + { + "epoch": 1.3174169147617647, + "grad_norm": 0.8890140652656555, + "learning_rate": 2.612555696538399e-05, + "loss": 1.1062, + "step": 206210 + }, + { + "epoch": 1.3174808019115036, + "grad_norm": 0.8505265116691589, + "learning_rate": 2.6121148357881582e-05, + "loss": 1.0629, + "step": 206220 + }, + { + "epoch": 1.3175446890612421, + "grad_norm": 0.7777497172355652, + "learning_rate": 2.6116739990859884e-05, + "loss": 0.8503, + "step": 206230 + }, + { + "epoch": 1.317608576210981, + "grad_norm": 1.8511273860931396, + "learning_rate": 2.611233186436326e-05, + "loss": 0.874, + "step": 206240 + }, + { 
+ "epoch": 1.3176724633607195, + "grad_norm": 0.8401346206665039, + "learning_rate": 2.6107923978436134e-05, + "loss": 0.6313, + "step": 206250 + }, + { + "epoch": 1.3177363505104585, + "grad_norm": 0.7148954272270203, + "learning_rate": 2.6103516333122868e-05, + "loss": 1.1435, + "step": 206260 + }, + { + "epoch": 1.317800237660197, + "grad_norm": 1.3933757543563843, + "learning_rate": 2.6099108928467885e-05, + "loss": 0.8873, + "step": 206270 + }, + { + "epoch": 1.3178641248099359, + "grad_norm": 0.9285461902618408, + "learning_rate": 2.6094701764515528e-05, + "loss": 0.8671, + "step": 206280 + }, + { + "epoch": 1.3179280119596744, + "grad_norm": 0.5819171667098999, + "learning_rate": 2.6090294841310227e-05, + "loss": 1.0112, + "step": 206290 + }, + { + "epoch": 1.317991899109413, + "grad_norm": 0.757918119430542, + "learning_rate": 2.6085888158896328e-05, + "loss": 0.814, + "step": 206300 + }, + { + "epoch": 1.3180557862591518, + "grad_norm": 0.880970299243927, + "learning_rate": 2.6081481717318236e-05, + "loss": 0.7497, + "step": 206310 + }, + { + "epoch": 1.3181196734088905, + "grad_norm": 1.6692066192626953, + "learning_rate": 2.607707551662032e-05, + "loss": 0.9548, + "step": 206320 + }, + { + "epoch": 1.3181835605586292, + "grad_norm": 0.8854753971099854, + "learning_rate": 2.607266955684693e-05, + "loss": 1.0597, + "step": 206330 + }, + { + "epoch": 1.3182474477083679, + "grad_norm": 0.6867426037788391, + "learning_rate": 2.6068263838042473e-05, + "loss": 0.7143, + "step": 206340 + }, + { + "epoch": 1.3183113348581066, + "grad_norm": 0.73616623878479, + "learning_rate": 2.6063858360251293e-05, + "loss": 0.752, + "step": 206350 + }, + { + "epoch": 1.3183752220078453, + "grad_norm": 1.009392261505127, + "learning_rate": 2.6059453123517775e-05, + "loss": 0.662, + "step": 206360 + }, + { + "epoch": 1.318439109157584, + "grad_norm": 0.6935415267944336, + "learning_rate": 2.6055048127886263e-05, + "loss": 1.0914, + "step": 206370 + }, + { + "epoch": 
1.3185029963073227, + "grad_norm": 0.9365649819374084, + "learning_rate": 2.605064337340115e-05, + "loss": 0.7753, + "step": 206380 + }, + { + "epoch": 1.3185668834570614, + "grad_norm": 1.1650468111038208, + "learning_rate": 2.6046238860106754e-05, + "loss": 0.7423, + "step": 206390 + }, + { + "epoch": 1.3186307706068001, + "grad_norm": 1.0778311491012573, + "learning_rate": 2.6041834588047475e-05, + "loss": 0.9013, + "step": 206400 + }, + { + "epoch": 1.3186946577565388, + "grad_norm": 0.8691341876983643, + "learning_rate": 2.603743055726764e-05, + "loss": 0.6258, + "step": 206410 + }, + { + "epoch": 1.3187585449062775, + "grad_norm": 0.8959952592849731, + "learning_rate": 2.603302676781162e-05, + "loss": 0.9495, + "step": 206420 + }, + { + "epoch": 1.3188224320560162, + "grad_norm": 0.8454727530479431, + "learning_rate": 2.6028623219723734e-05, + "loss": 1.005, + "step": 206430 + }, + { + "epoch": 1.318886319205755, + "grad_norm": 1.0210155248641968, + "learning_rate": 2.6024219913048353e-05, + "loss": 1.0072, + "step": 206440 + }, + { + "epoch": 1.3189502063554936, + "grad_norm": 1.145677089691162, + "learning_rate": 2.601981684782984e-05, + "loss": 1.0031, + "step": 206450 + }, + { + "epoch": 1.3190140935052324, + "grad_norm": 1.3724578619003296, + "learning_rate": 2.6015414024112494e-05, + "loss": 0.9964, + "step": 206460 + }, + { + "epoch": 1.319077980654971, + "grad_norm": 1.4045820236206055, + "learning_rate": 2.6011011441940703e-05, + "loss": 0.9096, + "step": 206470 + }, + { + "epoch": 1.3191418678047098, + "grad_norm": 1.1010231971740723, + "learning_rate": 2.6006609101358758e-05, + "loss": 0.731, + "step": 206480 + }, + { + "epoch": 1.3192057549544485, + "grad_norm": 0.9111044406890869, + "learning_rate": 2.6002207002411038e-05, + "loss": 0.9891, + "step": 206490 + }, + { + "epoch": 1.3192696421041872, + "grad_norm": 1.2783539295196533, + "learning_rate": 2.5997805145141834e-05, + "loss": 0.8415, + "step": 206500 + }, + { + "epoch": 1.3193335292539259, 
+ "grad_norm": 1.8971495628356934, + "learning_rate": 2.5993403529595518e-05, + "loss": 0.746, + "step": 206510 + }, + { + "epoch": 1.3193974164036646, + "grad_norm": 1.0325102806091309, + "learning_rate": 2.5989002155816376e-05, + "loss": 0.716, + "step": 206520 + }, + { + "epoch": 1.3194613035534033, + "grad_norm": 0.8873870372772217, + "learning_rate": 2.5984601023848776e-05, + "loss": 0.8466, + "step": 206530 + }, + { + "epoch": 1.319525190703142, + "grad_norm": 0.7020831108093262, + "learning_rate": 2.5980200133736998e-05, + "loss": 1.012, + "step": 206540 + }, + { + "epoch": 1.3195890778528807, + "grad_norm": 1.0816673040390015, + "learning_rate": 2.5975799485525404e-05, + "loss": 1.2432, + "step": 206550 + }, + { + "epoch": 1.3196529650026194, + "grad_norm": 1.4802582263946533, + "learning_rate": 2.5971399079258273e-05, + "loss": 0.7478, + "step": 206560 + }, + { + "epoch": 1.3197168521523581, + "grad_norm": 0.6974547505378723, + "learning_rate": 2.5966998914979964e-05, + "loss": 0.7853, + "step": 206570 + }, + { + "epoch": 1.3197807393020968, + "grad_norm": 1.2791091203689575, + "learning_rate": 2.5962598992734743e-05, + "loss": 1.0, + "step": 206580 + }, + { + "epoch": 1.3198446264518355, + "grad_norm": 1.0931358337402344, + "learning_rate": 2.5958199312566967e-05, + "loss": 0.7855, + "step": 206590 + }, + { + "epoch": 1.3199085136015742, + "grad_norm": 0.5698302388191223, + "learning_rate": 2.5953799874520907e-05, + "loss": 0.6276, + "step": 206600 + }, + { + "epoch": 1.319972400751313, + "grad_norm": 1.2574687004089355, + "learning_rate": 2.59494006786409e-05, + "loss": 1.0715, + "step": 206610 + }, + { + "epoch": 1.3200362879010517, + "grad_norm": 0.8968929648399353, + "learning_rate": 2.594500172497122e-05, + "loss": 0.9868, + "step": 206620 + }, + { + "epoch": 1.3201001750507904, + "grad_norm": 0.8700588345527649, + "learning_rate": 2.59406030135562e-05, + "loss": 0.9474, + "step": 206630 + }, + { + "epoch": 1.320164062200529, + "grad_norm": 
0.9864295125007629, + "learning_rate": 2.5936204544440102e-05, + "loss": 0.7433, + "step": 206640 + }, + { + "epoch": 1.3202279493502678, + "grad_norm": 0.8823724389076233, + "learning_rate": 2.593180631766724e-05, + "loss": 0.5835, + "step": 206650 + }, + { + "epoch": 1.3202918365000063, + "grad_norm": 0.9572489261627197, + "learning_rate": 2.5927408333281932e-05, + "loss": 0.9716, + "step": 206660 + }, + { + "epoch": 1.3203557236497452, + "grad_norm": 1.1817903518676758, + "learning_rate": 2.5923010591328435e-05, + "loss": 1.1069, + "step": 206670 + }, + { + "epoch": 1.3204196107994837, + "grad_norm": 0.7431257367134094, + "learning_rate": 2.5918613091851062e-05, + "loss": 0.7068, + "step": 206680 + }, + { + "epoch": 1.3204834979492226, + "grad_norm": 1.2763636112213135, + "learning_rate": 2.5914215834894072e-05, + "loss": 0.9138, + "step": 206690 + }, + { + "epoch": 1.320547385098961, + "grad_norm": 1.2803268432617188, + "learning_rate": 2.5909818820501785e-05, + "loss": 0.863, + "step": 206700 + }, + { + "epoch": 1.3206112722487, + "grad_norm": 1.223968267440796, + "learning_rate": 2.5905422048718443e-05, + "loss": 0.9294, + "step": 206710 + }, + { + "epoch": 1.3206751593984385, + "grad_norm": 1.168885588645935, + "learning_rate": 2.5901025519588363e-05, + "loss": 1.1139, + "step": 206720 + }, + { + "epoch": 1.3207390465481774, + "grad_norm": 0.922627866268158, + "learning_rate": 2.5896629233155788e-05, + "loss": 0.8245, + "step": 206730 + }, + { + "epoch": 1.320802933697916, + "grad_norm": 1.188184380531311, + "learning_rate": 2.589223318946503e-05, + "loss": 0.9801, + "step": 206740 + }, + { + "epoch": 1.3208668208476548, + "grad_norm": 0.7618042230606079, + "learning_rate": 2.588783738856032e-05, + "loss": 0.855, + "step": 206750 + }, + { + "epoch": 1.3209307079973933, + "grad_norm": 1.4691100120544434, + "learning_rate": 2.5883441830485965e-05, + "loss": 0.986, + "step": 206760 + }, + { + "epoch": 1.3209945951471322, + "grad_norm": 0.8927893042564392, + 
"learning_rate": 2.5879046515286198e-05, + "loss": 0.8979, + "step": 206770 + }, + { + "epoch": 1.3210584822968707, + "grad_norm": 0.6572344303131104, + "learning_rate": 2.5874651443005317e-05, + "loss": 0.7915, + "step": 206780 + }, + { + "epoch": 1.3211223694466094, + "grad_norm": 1.3234648704528809, + "learning_rate": 2.5870256613687573e-05, + "loss": 1.2454, + "step": 206790 + }, + { + "epoch": 1.3211862565963481, + "grad_norm": 1.0111384391784668, + "learning_rate": 2.586586202737719e-05, + "loss": 0.983, + "step": 206800 + }, + { + "epoch": 1.3212501437460868, + "grad_norm": 0.7284995317459106, + "learning_rate": 2.586146768411848e-05, + "loss": 0.597, + "step": 206810 + }, + { + "epoch": 1.3213140308958256, + "grad_norm": 0.5933718681335449, + "learning_rate": 2.5857073583955654e-05, + "loss": 0.9336, + "step": 206820 + }, + { + "epoch": 1.3213779180455643, + "grad_norm": 1.2970045804977417, + "learning_rate": 2.5852679726933006e-05, + "loss": 0.9286, + "step": 206830 + }, + { + "epoch": 1.321441805195303, + "grad_norm": 1.1350326538085938, + "learning_rate": 2.5848286113094744e-05, + "loss": 0.9883, + "step": 206840 + }, + { + "epoch": 1.3215056923450417, + "grad_norm": 0.8951261043548584, + "learning_rate": 2.5843892742485148e-05, + "loss": 0.8113, + "step": 206850 + }, + { + "epoch": 1.3215695794947804, + "grad_norm": 1.196889042854309, + "learning_rate": 2.583949961514843e-05, + "loss": 0.6894, + "step": 206860 + }, + { + "epoch": 1.321633466644519, + "grad_norm": 1.293573260307312, + "learning_rate": 2.5835106731128885e-05, + "loss": 1.0643, + "step": 206870 + }, + { + "epoch": 1.3216973537942578, + "grad_norm": 0.8331345915794373, + "learning_rate": 2.5830714090470686e-05, + "loss": 0.7408, + "step": 206880 + }, + { + "epoch": 1.3217612409439965, + "grad_norm": 2.4497036933898926, + "learning_rate": 2.5826321693218135e-05, + "loss": 0.9284, + "step": 206890 + }, + { + "epoch": 1.3218251280937352, + "grad_norm": 0.882736086845398, + "learning_rate": 
2.582192953941542e-05, + "loss": 0.8527, + "step": 206900 + }, + { + "epoch": 1.321889015243474, + "grad_norm": 0.72117680311203, + "learning_rate": 2.5817537629106802e-05, + "loss": 0.8667, + "step": 206910 + }, + { + "epoch": 1.3219529023932126, + "grad_norm": 1.2809302806854248, + "learning_rate": 2.5813145962336488e-05, + "loss": 0.9314, + "step": 206920 + }, + { + "epoch": 1.3220167895429513, + "grad_norm": 1.0508041381835938, + "learning_rate": 2.580875453914874e-05, + "loss": 0.8098, + "step": 206930 + }, + { + "epoch": 1.32208067669269, + "grad_norm": 1.1330662965774536, + "learning_rate": 2.5804363359587735e-05, + "loss": 0.8803, + "step": 206940 + }, + { + "epoch": 1.3221445638424287, + "grad_norm": 0.9646334052085876, + "learning_rate": 2.579997242369775e-05, + "loss": 0.8861, + "step": 206950 + }, + { + "epoch": 1.3222084509921674, + "grad_norm": 0.8501851558685303, + "learning_rate": 2.579558173152295e-05, + "loss": 0.9793, + "step": 206960 + }, + { + "epoch": 1.3222723381419061, + "grad_norm": 1.1417040824890137, + "learning_rate": 2.5791191283107586e-05, + "loss": 0.8623, + "step": 206970 + }, + { + "epoch": 1.3223362252916449, + "grad_norm": 1.8951231241226196, + "learning_rate": 2.5786801078495893e-05, + "loss": 0.9489, + "step": 206980 + }, + { + "epoch": 1.3224001124413836, + "grad_norm": 1.1195141077041626, + "learning_rate": 2.5782411117732043e-05, + "loss": 1.0253, + "step": 206990 + }, + { + "epoch": 1.3224639995911223, + "grad_norm": 0.8514233827590942, + "learning_rate": 2.5778021400860274e-05, + "loss": 1.1297, + "step": 207000 + }, + { + "epoch": 1.322527886740861, + "grad_norm": 0.8984963297843933, + "learning_rate": 2.5773631927924768e-05, + "loss": 0.8752, + "step": 207010 + }, + { + "epoch": 1.3225917738905997, + "grad_norm": 0.8698433637619019, + "learning_rate": 2.5769242698969774e-05, + "loss": 0.6839, + "step": 207020 + }, + { + "epoch": 1.3226556610403384, + "grad_norm": 1.0516668558120728, + "learning_rate": 
2.576485371403944e-05, + "loss": 0.8271, + "step": 207030 + }, + { + "epoch": 1.322719548190077, + "grad_norm": 0.9294512867927551, + "learning_rate": 2.5760464973178023e-05, + "loss": 1.1857, + "step": 207040 + }, + { + "epoch": 1.3227834353398158, + "grad_norm": 0.8164847493171692, + "learning_rate": 2.575607647642967e-05, + "loss": 0.8921, + "step": 207050 + }, + { + "epoch": 1.3228473224895545, + "grad_norm": 0.8850873112678528, + "learning_rate": 2.575168822383862e-05, + "loss": 0.9315, + "step": 207060 + }, + { + "epoch": 1.3229112096392932, + "grad_norm": 0.9530342817306519, + "learning_rate": 2.5747300215449027e-05, + "loss": 0.8774, + "step": 207070 + }, + { + "epoch": 1.322975096789032, + "grad_norm": 0.5504788756370544, + "learning_rate": 2.5742912451305124e-05, + "loss": 1.1385, + "step": 207080 + }, + { + "epoch": 1.3230389839387706, + "grad_norm": 0.5386658310890198, + "learning_rate": 2.573852493145106e-05, + "loss": 0.9027, + "step": 207090 + }, + { + "epoch": 1.3231028710885093, + "grad_norm": 0.6537347435951233, + "learning_rate": 2.5734137655931055e-05, + "loss": 1.0127, + "step": 207100 + }, + { + "epoch": 1.323166758238248, + "grad_norm": 0.8527598977088928, + "learning_rate": 2.572975062478926e-05, + "loss": 0.8942, + "step": 207110 + }, + { + "epoch": 1.3232306453879867, + "grad_norm": 0.6852341294288635, + "learning_rate": 2.5725363838069887e-05, + "loss": 0.9198, + "step": 207120 + }, + { + "epoch": 1.3232945325377254, + "grad_norm": 0.8571737408638, + "learning_rate": 2.572097729581709e-05, + "loss": 0.8471, + "step": 207130 + }, + { + "epoch": 1.3233584196874641, + "grad_norm": 0.8678430914878845, + "learning_rate": 2.571659099807507e-05, + "loss": 0.77, + "step": 207140 + }, + { + "epoch": 1.3234223068372026, + "grad_norm": 0.5002248883247375, + "learning_rate": 2.5712204944887962e-05, + "loss": 0.666, + "step": 207150 + }, + { + "epoch": 1.3234861939869416, + "grad_norm": 2.0342235565185547, + "learning_rate": 2.570781913629999e-05, + 
"loss": 0.5886, + "step": 207160 + }, + { + "epoch": 1.32355008113668, + "grad_norm": 1.233255386352539, + "learning_rate": 2.5703433572355272e-05, + "loss": 0.8278, + "step": 207170 + }, + { + "epoch": 1.323613968286419, + "grad_norm": 0.849678099155426, + "learning_rate": 2.5699048253098002e-05, + "loss": 0.7862, + "step": 207180 + }, + { + "epoch": 1.3236778554361575, + "grad_norm": 1.1559913158416748, + "learning_rate": 2.5694663178572352e-05, + "loss": 0.847, + "step": 207190 + }, + { + "epoch": 1.3237417425858964, + "grad_norm": 0.5825319290161133, + "learning_rate": 2.5690278348822462e-05, + "loss": 0.9429, + "step": 207200 + }, + { + "epoch": 1.3238056297356349, + "grad_norm": 1.2116233110427856, + "learning_rate": 2.568589376389251e-05, + "loss": 0.8407, + "step": 207210 + }, + { + "epoch": 1.3238695168853738, + "grad_norm": 0.8955128788948059, + "learning_rate": 2.5681509423826626e-05, + "loss": 0.8776, + "step": 207220 + }, + { + "epoch": 1.3239334040351123, + "grad_norm": 1.1272040605545044, + "learning_rate": 2.5677125328669e-05, + "loss": 0.7028, + "step": 207230 + }, + { + "epoch": 1.3239972911848512, + "grad_norm": 1.5354359149932861, + "learning_rate": 2.5672741478463746e-05, + "loss": 0.9998, + "step": 207240 + }, + { + "epoch": 1.3240611783345897, + "grad_norm": 1.0237005949020386, + "learning_rate": 2.5668357873255043e-05, + "loss": 1.1591, + "step": 207250 + }, + { + "epoch": 1.3241250654843284, + "grad_norm": 0.8075418472290039, + "learning_rate": 2.566397451308702e-05, + "loss": 0.7585, + "step": 207260 + }, + { + "epoch": 1.324188952634067, + "grad_norm": 0.9698383808135986, + "learning_rate": 2.5659591398003834e-05, + "loss": 0.8286, + "step": 207270 + }, + { + "epoch": 1.3242528397838058, + "grad_norm": 0.953010618686676, + "learning_rate": 2.565520852804963e-05, + "loss": 0.7961, + "step": 207280 + }, + { + "epoch": 1.3243167269335445, + "grad_norm": 1.0039968490600586, + "learning_rate": 2.5650825903268515e-05, + "loss": 0.836, + "step": 
207290 + }, + { + "epoch": 1.3243806140832832, + "grad_norm": 0.7803800702095032, + "learning_rate": 2.5646443523704666e-05, + "loss": 1.1, + "step": 207300 + }, + { + "epoch": 1.324444501233022, + "grad_norm": 0.7452454566955566, + "learning_rate": 2.5642061389402177e-05, + "loss": 0.8007, + "step": 207310 + }, + { + "epoch": 1.3245083883827606, + "grad_norm": 1.357801914215088, + "learning_rate": 2.5637679500405225e-05, + "loss": 1.034, + "step": 207320 + }, + { + "epoch": 1.3245722755324993, + "grad_norm": 0.8555988669395447, + "learning_rate": 2.5633297856757898e-05, + "loss": 0.7433, + "step": 207330 + }, + { + "epoch": 1.324636162682238, + "grad_norm": 0.8428859114646912, + "learning_rate": 2.562891645850436e-05, + "loss": 0.8675, + "step": 207340 + }, + { + "epoch": 1.3247000498319768, + "grad_norm": 1.2644280195236206, + "learning_rate": 2.5624535305688703e-05, + "loss": 0.9043, + "step": 207350 + }, + { + "epoch": 1.3247639369817155, + "grad_norm": 0.636249303817749, + "learning_rate": 2.562015439835508e-05, + "loss": 1.1278, + "step": 207360 + }, + { + "epoch": 1.3248278241314542, + "grad_norm": 2.0940535068511963, + "learning_rate": 2.5615773736547564e-05, + "loss": 1.1164, + "step": 207370 + }, + { + "epoch": 1.3248917112811929, + "grad_norm": 0.7779254913330078, + "learning_rate": 2.5611393320310335e-05, + "loss": 1.0982, + "step": 207380 + }, + { + "epoch": 1.3249555984309316, + "grad_norm": 1.049269676208496, + "learning_rate": 2.5607013149687452e-05, + "loss": 0.8899, + "step": 207390 + }, + { + "epoch": 1.3250194855806703, + "grad_norm": 0.7360807061195374, + "learning_rate": 2.5602633224723067e-05, + "loss": 0.9655, + "step": 207400 + }, + { + "epoch": 1.325083372730409, + "grad_norm": 1.3488494157791138, + "learning_rate": 2.559825354546125e-05, + "loss": 0.8571, + "step": 207410 + }, + { + "epoch": 1.3251472598801477, + "grad_norm": 2.69454026222229, + "learning_rate": 2.5593874111946152e-05, + "loss": 0.9587, + "step": 207420 + }, + { + 
"epoch": 1.3252111470298864, + "grad_norm": 1.0082372426986694, + "learning_rate": 2.5589494924221836e-05, + "loss": 0.7106, + "step": 207430 + }, + { + "epoch": 1.325275034179625, + "grad_norm": 0.892281174659729, + "learning_rate": 2.558511598233245e-05, + "loss": 0.8623, + "step": 207440 + }, + { + "epoch": 1.3253389213293638, + "grad_norm": 1.0475205183029175, + "learning_rate": 2.5580737286322044e-05, + "loss": 0.901, + "step": 207450 + }, + { + "epoch": 1.3254028084791025, + "grad_norm": 0.6185365319252014, + "learning_rate": 2.5576358836234754e-05, + "loss": 0.8671, + "step": 207460 + }, + { + "epoch": 1.3254666956288412, + "grad_norm": 1.0132888555526733, + "learning_rate": 2.5571980632114644e-05, + "loss": 0.9183, + "step": 207470 + }, + { + "epoch": 1.32553058277858, + "grad_norm": 1.1423455476760864, + "learning_rate": 2.5567602674005842e-05, + "loss": 0.8692, + "step": 207480 + }, + { + "epoch": 1.3255944699283186, + "grad_norm": 1.2508022785186768, + "learning_rate": 2.55632249619524e-05, + "loss": 0.9008, + "step": 207490 + }, + { + "epoch": 1.3256583570780573, + "grad_norm": 1.0700236558914185, + "learning_rate": 2.5558847495998416e-05, + "loss": 0.9502, + "step": 207500 + }, + { + "epoch": 1.325722244227796, + "grad_norm": 0.6072807908058167, + "learning_rate": 2.5554470276188013e-05, + "loss": 0.8926, + "step": 207510 + }, + { + "epoch": 1.3257861313775348, + "grad_norm": 1.2106003761291504, + "learning_rate": 2.5550093302565216e-05, + "loss": 0.7399, + "step": 207520 + }, + { + "epoch": 1.3258500185272735, + "grad_norm": 0.6438818573951721, + "learning_rate": 2.5545716575174154e-05, + "loss": 0.8666, + "step": 207530 + }, + { + "epoch": 1.3259139056770122, + "grad_norm": 0.6634151935577393, + "learning_rate": 2.5541340094058863e-05, + "loss": 0.9733, + "step": 207540 + }, + { + "epoch": 1.3259777928267509, + "grad_norm": 0.9782295227050781, + "learning_rate": 2.553696385926345e-05, + "loss": 0.8119, + "step": 207550 + }, + { + "epoch": 
1.3260416799764896, + "grad_norm": 1.1243504285812378, + "learning_rate": 2.5532587870831965e-05, + "loss": 0.8401, + "step": 207560 + }, + { + "epoch": 1.3261055671262283, + "grad_norm": 1.0808581113815308, + "learning_rate": 2.5528212128808504e-05, + "loss": 0.9154, + "step": 207570 + }, + { + "epoch": 1.326169454275967, + "grad_norm": 1.1705023050308228, + "learning_rate": 2.5523836633237098e-05, + "loss": 0.9226, + "step": 207580 + }, + { + "epoch": 1.3262333414257057, + "grad_norm": 0.9224709272384644, + "learning_rate": 2.5519461384161848e-05, + "loss": 0.7217, + "step": 207590 + }, + { + "epoch": 1.3262972285754444, + "grad_norm": 0.6616361141204834, + "learning_rate": 2.5515086381626785e-05, + "loss": 0.928, + "step": 207600 + }, + { + "epoch": 1.3263611157251831, + "grad_norm": 1.029366374015808, + "learning_rate": 2.5510711625675997e-05, + "loss": 0.9758, + "step": 207610 + }, + { + "epoch": 1.3264250028749218, + "grad_norm": 1.4462729692459106, + "learning_rate": 2.550633711635352e-05, + "loss": 0.9038, + "step": 207620 + }, + { + "epoch": 1.3264888900246605, + "grad_norm": 1.2146345376968384, + "learning_rate": 2.550196285370343e-05, + "loss": 0.8447, + "step": 207630 + }, + { + "epoch": 1.326552777174399, + "grad_norm": 0.9503288865089417, + "learning_rate": 2.549758883776975e-05, + "loss": 0.7413, + "step": 207640 + }, + { + "epoch": 1.326616664324138, + "grad_norm": 0.978327751159668, + "learning_rate": 2.549321506859657e-05, + "loss": 0.8494, + "step": 207650 + }, + { + "epoch": 1.3266805514738764, + "grad_norm": 0.7345404028892517, + "learning_rate": 2.5488841546227893e-05, + "loss": 0.7034, + "step": 207660 + }, + { + "epoch": 1.3267444386236154, + "grad_norm": 0.9454354643821716, + "learning_rate": 2.5484468270707812e-05, + "loss": 0.7887, + "step": 207670 + }, + { + "epoch": 1.3268083257733538, + "grad_norm": 0.9941151142120361, + "learning_rate": 2.5480095242080322e-05, + "loss": 0.792, + "step": 207680 + }, + { + "epoch": 1.3268722129230928, + 
"grad_norm": 1.8501225709915161, + "learning_rate": 2.54757224603895e-05, + "loss": 0.7835, + "step": 207690 + }, + { + "epoch": 1.3269361000728312, + "grad_norm": 1.1302766799926758, + "learning_rate": 2.5471349925679355e-05, + "loss": 0.953, + "step": 207700 + }, + { + "epoch": 1.3269999872225702, + "grad_norm": 1.4207725524902344, + "learning_rate": 2.546697763799394e-05, + "loss": 1.0259, + "step": 207710 + }, + { + "epoch": 1.3270638743723087, + "grad_norm": 0.6076548099517822, + "learning_rate": 2.5462605597377297e-05, + "loss": 0.8364, + "step": 207720 + }, + { + "epoch": 1.3271277615220476, + "grad_norm": 0.9731430411338806, + "learning_rate": 2.5458233803873427e-05, + "loss": 0.8385, + "step": 207730 + }, + { + "epoch": 1.327191648671786, + "grad_norm": 1.4916883707046509, + "learning_rate": 2.5453862257526395e-05, + "loss": 0.8845, + "step": 207740 + }, + { + "epoch": 1.3272555358215248, + "grad_norm": 0.616757869720459, + "learning_rate": 2.5449490958380185e-05, + "loss": 0.7308, + "step": 207750 + }, + { + "epoch": 1.3273194229712635, + "grad_norm": 1.0742515325546265, + "learning_rate": 2.5445119906478855e-05, + "loss": 1.1297, + "step": 207760 + }, + { + "epoch": 1.3273833101210022, + "grad_norm": 1.1675541400909424, + "learning_rate": 2.544074910186641e-05, + "loss": 0.8094, + "step": 207770 + }, + { + "epoch": 1.327447197270741, + "grad_norm": 1.1597243547439575, + "learning_rate": 2.543637854458688e-05, + "loss": 0.8895, + "step": 207780 + }, + { + "epoch": 1.3275110844204796, + "grad_norm": 1.3750035762786865, + "learning_rate": 2.5432008234684236e-05, + "loss": 0.9638, + "step": 207790 + }, + { + "epoch": 1.3275749715702183, + "grad_norm": 0.6728596687316895, + "learning_rate": 2.542763817220255e-05, + "loss": 0.7371, + "step": 207800 + }, + { + "epoch": 1.327638858719957, + "grad_norm": 0.9936996698379517, + "learning_rate": 2.5423268357185788e-05, + "loss": 1.0607, + "step": 207810 + }, + { + "epoch": 1.3277027458696957, + "grad_norm": 
1.2502750158309937, + "learning_rate": 2.541889878967797e-05, + "loss": 0.758, + "step": 207820 + }, + { + "epoch": 1.3277666330194344, + "grad_norm": 2.021584987640381, + "learning_rate": 2.5414529469723132e-05, + "loss": 0.7352, + "step": 207830 + }, + { + "epoch": 1.3278305201691731, + "grad_norm": 1.755197525024414, + "learning_rate": 2.5410160397365222e-05, + "loss": 0.8273, + "step": 207840 + }, + { + "epoch": 1.3278944073189118, + "grad_norm": 1.087971568107605, + "learning_rate": 2.54057915726483e-05, + "loss": 0.7056, + "step": 207850 + }, + { + "epoch": 1.3279582944686505, + "grad_norm": 0.8218057155609131, + "learning_rate": 2.5401422995616313e-05, + "loss": 0.9178, + "step": 207860 + }, + { + "epoch": 1.3280221816183893, + "grad_norm": 0.8078576922416687, + "learning_rate": 2.5397054666313293e-05, + "loss": 0.8177, + "step": 207870 + }, + { + "epoch": 1.328086068768128, + "grad_norm": 1.413970947265625, + "learning_rate": 2.53926865847832e-05, + "loss": 0.9126, + "step": 207880 + }, + { + "epoch": 1.3281499559178667, + "grad_norm": 0.8575447797775269, + "learning_rate": 2.5388318751070062e-05, + "loss": 0.9419, + "step": 207890 + }, + { + "epoch": 1.3282138430676054, + "grad_norm": 0.9678406715393066, + "learning_rate": 2.5383951165217824e-05, + "loss": 0.5925, + "step": 207900 + }, + { + "epoch": 1.328277730217344, + "grad_norm": 0.5872654914855957, + "learning_rate": 2.5379583827270513e-05, + "loss": 0.6653, + "step": 207910 + }, + { + "epoch": 1.3283416173670828, + "grad_norm": 0.8543681502342224, + "learning_rate": 2.5375216737272078e-05, + "loss": 0.9675, + "step": 207920 + }, + { + "epoch": 1.3284055045168215, + "grad_norm": 0.8474524021148682, + "learning_rate": 2.5370849895266525e-05, + "loss": 0.9284, + "step": 207930 + }, + { + "epoch": 1.3284693916665602, + "grad_norm": 1.456979751586914, + "learning_rate": 2.5366483301297804e-05, + "loss": 1.139, + "step": 207940 + }, + { + "epoch": 1.328533278816299, + "grad_norm": 0.748672366142273, + 
"learning_rate": 2.536211695540993e-05, + "loss": 0.8772, + "step": 207950 + }, + { + "epoch": 1.3285971659660376, + "grad_norm": 1.5592775344848633, + "learning_rate": 2.5357750857646832e-05, + "loss": 1.3006, + "step": 207960 + }, + { + "epoch": 1.3286610531157763, + "grad_norm": 1.2369399070739746, + "learning_rate": 2.535338500805252e-05, + "loss": 0.7419, + "step": 207970 + }, + { + "epoch": 1.328724940265515, + "grad_norm": 1.2126493453979492, + "learning_rate": 2.5349019406670932e-05, + "loss": 0.9519, + "step": 207980 + }, + { + "epoch": 1.3287888274152537, + "grad_norm": 1.2261261940002441, + "learning_rate": 2.5344654053546057e-05, + "loss": 0.6931, + "step": 207990 + }, + { + "epoch": 1.3288527145649924, + "grad_norm": 0.8413811326026917, + "learning_rate": 2.5340288948721823e-05, + "loss": 0.9658, + "step": 208000 + }, + { + "epoch": 1.3289166017147311, + "grad_norm": 1.077040433883667, + "learning_rate": 2.5335924092242235e-05, + "loss": 0.8505, + "step": 208010 + }, + { + "epoch": 1.3289804888644698, + "grad_norm": 0.8440773487091064, + "learning_rate": 2.5331559484151213e-05, + "loss": 0.8107, + "step": 208020 + }, + { + "epoch": 1.3290443760142086, + "grad_norm": 0.4764833450317383, + "learning_rate": 2.5327195124492725e-05, + "loss": 0.8189, + "step": 208030 + }, + { + "epoch": 1.3291082631639473, + "grad_norm": 1.1367080211639404, + "learning_rate": 2.5322831013310742e-05, + "loss": 0.9064, + "step": 208040 + }, + { + "epoch": 1.329172150313686, + "grad_norm": 1.0161020755767822, + "learning_rate": 2.5318467150649195e-05, + "loss": 0.9505, + "step": 208050 + }, + { + "epoch": 1.3292360374634247, + "grad_norm": 1.983578085899353, + "learning_rate": 2.531410353655204e-05, + "loss": 0.8435, + "step": 208060 + }, + { + "epoch": 1.3292999246131634, + "grad_norm": 1.1380125284194946, + "learning_rate": 2.5309740171063205e-05, + "loss": 0.9795, + "step": 208070 + }, + { + "epoch": 1.329363811762902, + "grad_norm": 1.1479755640029907, + "learning_rate": 
2.5305377054226663e-05, + "loss": 1.4879, + "step": 208080 + }, + { + "epoch": 1.3294276989126408, + "grad_norm": 1.1302154064178467, + "learning_rate": 2.5301014186086324e-05, + "loss": 0.8148, + "step": 208090 + }, + { + "epoch": 1.3294915860623795, + "grad_norm": 0.8572195768356323, + "learning_rate": 2.5296651566686157e-05, + "loss": 0.9606, + "step": 208100 + }, + { + "epoch": 1.329555473212118, + "grad_norm": 0.7420187592506409, + "learning_rate": 2.5292289196070063e-05, + "loss": 0.8116, + "step": 208110 + }, + { + "epoch": 1.329619360361857, + "grad_norm": 0.8453270196914673, + "learning_rate": 2.5287927074282007e-05, + "loss": 0.6985, + "step": 208120 + }, + { + "epoch": 1.3296832475115954, + "grad_norm": 1.6888545751571655, + "learning_rate": 2.528356520136589e-05, + "loss": 1.1483, + "step": 208130 + }, + { + "epoch": 1.3297471346613343, + "grad_norm": 0.7026654481887817, + "learning_rate": 2.527920357736567e-05, + "loss": 0.8764, + "step": 208140 + }, + { + "epoch": 1.3298110218110728, + "grad_norm": 0.9377880096435547, + "learning_rate": 2.527484220232524e-05, + "loss": 0.8799, + "step": 208150 + }, + { + "epoch": 1.3298749089608117, + "grad_norm": 1.017162561416626, + "learning_rate": 2.5270481076288554e-05, + "loss": 0.8685, + "step": 208160 + }, + { + "epoch": 1.3299387961105502, + "grad_norm": 1.0247211456298828, + "learning_rate": 2.5266120199299504e-05, + "loss": 0.7848, + "step": 208170 + }, + { + "epoch": 1.3300026832602891, + "grad_norm": 0.6646064519882202, + "learning_rate": 2.5261759571402033e-05, + "loss": 0.9928, + "step": 208180 + }, + { + "epoch": 1.3300665704100276, + "grad_norm": 1.235183596611023, + "learning_rate": 2.525739919264003e-05, + "loss": 0.8701, + "step": 208190 + }, + { + "epoch": 1.3301304575597666, + "grad_norm": 0.9530540108680725, + "learning_rate": 2.5253039063057443e-05, + "loss": 0.8677, + "step": 208200 + }, + { + "epoch": 1.330194344709505, + "grad_norm": 1.1182188987731934, + "learning_rate": 
2.5248679182698143e-05, + "loss": 0.9195, + "step": 208210 + }, + { + "epoch": 1.330258231859244, + "grad_norm": 0.6461467146873474, + "learning_rate": 2.524431955160607e-05, + "loss": 0.9312, + "step": 208220 + }, + { + "epoch": 1.3303221190089825, + "grad_norm": 0.6050586700439453, + "learning_rate": 2.5239960169825105e-05, + "loss": 0.8003, + "step": 208230 + }, + { + "epoch": 1.3303860061587212, + "grad_norm": 1.3474839925765991, + "learning_rate": 2.523560103739916e-05, + "loss": 0.9264, + "step": 208240 + }, + { + "epoch": 1.3304498933084599, + "grad_norm": 0.6612098813056946, + "learning_rate": 2.523124215437217e-05, + "loss": 0.7328, + "step": 208250 + }, + { + "epoch": 1.3305137804581986, + "grad_norm": 0.7815212607383728, + "learning_rate": 2.522688352078796e-05, + "loss": 0.7472, + "step": 208260 + }, + { + "epoch": 1.3305776676079373, + "grad_norm": 1.1188582181930542, + "learning_rate": 2.522252513669049e-05, + "loss": 0.9148, + "step": 208270 + }, + { + "epoch": 1.330641554757676, + "grad_norm": 0.8124456405639648, + "learning_rate": 2.5218167002123605e-05, + "loss": 0.7962, + "step": 208280 + }, + { + "epoch": 1.3307054419074147, + "grad_norm": 1.276268720626831, + "learning_rate": 2.5213809117131237e-05, + "loss": 0.7497, + "step": 208290 + }, + { + "epoch": 1.3307693290571534, + "grad_norm": 0.8970780968666077, + "learning_rate": 2.5209451481757242e-05, + "loss": 0.9581, + "step": 208300 + }, + { + "epoch": 1.330833216206892, + "grad_norm": 2.7623441219329834, + "learning_rate": 2.5205094096045524e-05, + "loss": 0.9534, + "step": 208310 + }, + { + "epoch": 1.3308971033566308, + "grad_norm": 1.1253652572631836, + "learning_rate": 2.520073696003995e-05, + "loss": 1.0629, + "step": 208320 + }, + { + "epoch": 1.3309609905063695, + "grad_norm": 0.8003132939338684, + "learning_rate": 2.519638007378442e-05, + "loss": 0.7378, + "step": 208330 + }, + { + "epoch": 1.3310248776561082, + "grad_norm": 1.132372498512268, + "learning_rate": 
2.5192023437322787e-05, + "loss": 0.7477, + "step": 208340 + }, + { + "epoch": 1.331088764805847, + "grad_norm": 1.6704978942871094, + "learning_rate": 2.5187667050698936e-05, + "loss": 0.8368, + "step": 208350 + }, + { + "epoch": 1.3311526519555856, + "grad_norm": 1.237809419631958, + "learning_rate": 2.518331091395677e-05, + "loss": 0.8616, + "step": 208360 + }, + { + "epoch": 1.3312165391053243, + "grad_norm": 1.075128436088562, + "learning_rate": 2.5178955027140112e-05, + "loss": 0.8805, + "step": 208370 + }, + { + "epoch": 1.331280426255063, + "grad_norm": 1.0682311058044434, + "learning_rate": 2.5174599390292865e-05, + "loss": 1.074, + "step": 208380 + }, + { + "epoch": 1.3313443134048017, + "grad_norm": 0.772784948348999, + "learning_rate": 2.5170244003458864e-05, + "loss": 0.8137, + "step": 208390 + }, + { + "epoch": 1.3314082005545405, + "grad_norm": 0.9435346126556396, + "learning_rate": 2.5165888866682004e-05, + "loss": 1.0276, + "step": 208400 + }, + { + "epoch": 1.3314720877042792, + "grad_norm": 1.0530027151107788, + "learning_rate": 2.51615339800061e-05, + "loss": 0.8332, + "step": 208410 + }, + { + "epoch": 1.3315359748540179, + "grad_norm": 0.9279080629348755, + "learning_rate": 2.5157179343475068e-05, + "loss": 0.6883, + "step": 208420 + }, + { + "epoch": 1.3315998620037566, + "grad_norm": 0.7981129884719849, + "learning_rate": 2.5152824957132715e-05, + "loss": 0.7635, + "step": 208430 + }, + { + "epoch": 1.3316637491534953, + "grad_norm": 0.923291027545929, + "learning_rate": 2.514847082102292e-05, + "loss": 1.2482, + "step": 208440 + }, + { + "epoch": 1.331727636303234, + "grad_norm": 1.3127217292785645, + "learning_rate": 2.514411693518951e-05, + "loss": 0.8983, + "step": 208450 + }, + { + "epoch": 1.3317915234529727, + "grad_norm": 0.8736202716827393, + "learning_rate": 2.5139763299676362e-05, + "loss": 0.8583, + "step": 208460 + }, + { + "epoch": 1.3318554106027114, + "grad_norm": 0.777912437915802, + "learning_rate": 2.5135409914527285e-05, 
+ "loss": 0.8301, + "step": 208470 + }, + { + "epoch": 1.33191929775245, + "grad_norm": 1.0512322187423706, + "learning_rate": 2.5131056779786165e-05, + "loss": 1.0007, + "step": 208480 + }, + { + "epoch": 1.3319831849021888, + "grad_norm": 0.9216824769973755, + "learning_rate": 2.5126703895496794e-05, + "loss": 0.7905, + "step": 208490 + }, + { + "epoch": 1.3320470720519275, + "grad_norm": 1.1229199171066284, + "learning_rate": 2.5122351261703058e-05, + "loss": 0.8234, + "step": 208500 + }, + { + "epoch": 1.3321109592016662, + "grad_norm": 0.9037683606147766, + "learning_rate": 2.5117998878448746e-05, + "loss": 0.955, + "step": 208510 + }, + { + "epoch": 1.332174846351405, + "grad_norm": 0.6385436058044434, + "learning_rate": 2.5113646745777726e-05, + "loss": 1.1041, + "step": 208520 + }, + { + "epoch": 1.3322387335011436, + "grad_norm": 0.9948113560676575, + "learning_rate": 2.5109294863733795e-05, + "loss": 0.8906, + "step": 208530 + }, + { + "epoch": 1.3323026206508823, + "grad_norm": 0.6039278507232666, + "learning_rate": 2.510494323236082e-05, + "loss": 0.8755, + "step": 208540 + }, + { + "epoch": 1.332366507800621, + "grad_norm": 1.256967306137085, + "learning_rate": 2.5100591851702583e-05, + "loss": 0.8707, + "step": 208550 + }, + { + "epoch": 1.3324303949503598, + "grad_norm": 0.8238212466239929, + "learning_rate": 2.5096240721802926e-05, + "loss": 0.9786, + "step": 208560 + }, + { + "epoch": 1.3324942821000985, + "grad_norm": 1.652024507522583, + "learning_rate": 2.5091889842705696e-05, + "loss": 0.9173, + "step": 208570 + }, + { + "epoch": 1.3325581692498372, + "grad_norm": 1.1673933267593384, + "learning_rate": 2.5087539214454658e-05, + "loss": 0.8108, + "step": 208580 + }, + { + "epoch": 1.3326220563995759, + "grad_norm": 0.9864590167999268, + "learning_rate": 2.5083188837093674e-05, + "loss": 0.8847, + "step": 208590 + }, + { + "epoch": 1.3326859435493144, + "grad_norm": 1.695041298866272, + "learning_rate": 2.507883871066651e-05, + "loss": 0.7886, + 
"step": 208600 + }, + { + "epoch": 1.3327498306990533, + "grad_norm": 0.8688923120498657, + "learning_rate": 2.5074488835217026e-05, + "loss": 1.2028, + "step": 208610 + }, + { + "epoch": 1.3328137178487918, + "grad_norm": 2.205828905105591, + "learning_rate": 2.507013921078898e-05, + "loss": 0.7474, + "step": 208620 + }, + { + "epoch": 1.3328776049985307, + "grad_norm": 0.9385578632354736, + "learning_rate": 2.5065789837426225e-05, + "loss": 0.7994, + "step": 208630 + }, + { + "epoch": 1.3329414921482692, + "grad_norm": 1.1013610363006592, + "learning_rate": 2.5061440715172513e-05, + "loss": 0.9858, + "step": 208640 + }, + { + "epoch": 1.333005379298008, + "grad_norm": 0.8305615186691284, + "learning_rate": 2.5057091844071683e-05, + "loss": 0.7782, + "step": 208650 + }, + { + "epoch": 1.3330692664477466, + "grad_norm": 0.9627121686935425, + "learning_rate": 2.505274322416751e-05, + "loss": 0.8877, + "step": 208660 + }, + { + "epoch": 1.3331331535974855, + "grad_norm": 1.865303635597229, + "learning_rate": 2.5048394855503798e-05, + "loss": 0.8416, + "step": 208670 + }, + { + "epoch": 1.333197040747224, + "grad_norm": 1.4973645210266113, + "learning_rate": 2.5044046738124326e-05, + "loss": 0.9163, + "step": 208680 + }, + { + "epoch": 1.333260927896963, + "grad_norm": 0.8996737003326416, + "learning_rate": 2.5039698872072913e-05, + "loss": 0.8238, + "step": 208690 + }, + { + "epoch": 1.3333248150467014, + "grad_norm": 1.8005520105361938, + "learning_rate": 2.503535125739331e-05, + "loss": 0.653, + "step": 208700 + }, + { + "epoch": 1.3333887021964403, + "grad_norm": 1.0006129741668701, + "learning_rate": 2.503100389412933e-05, + "loss": 0.8249, + "step": 208710 + }, + { + "epoch": 1.3334525893461788, + "grad_norm": 1.0600533485412598, + "learning_rate": 2.5026656782324724e-05, + "loss": 1.1548, + "step": 208720 + }, + { + "epoch": 1.3335164764959175, + "grad_norm": 1.0672171115875244, + "learning_rate": 2.5022309922023312e-05, + "loss": 1.0526, + "step": 208730 + }, 
+ { + "epoch": 1.3335803636456562, + "grad_norm": 0.8205702900886536, + "learning_rate": 2.5017963313268843e-05, + "loss": 0.7387, + "step": 208740 + }, + { + "epoch": 1.333644250795395, + "grad_norm": 0.9449298977851868, + "learning_rate": 2.501361695610508e-05, + "loss": 1.11, + "step": 208750 + }, + { + "epoch": 1.3337081379451337, + "grad_norm": 0.9433137774467468, + "learning_rate": 2.5009270850575828e-05, + "loss": 0.8742, + "step": 208760 + }, + { + "epoch": 1.3337720250948724, + "grad_norm": 1.927285075187683, + "learning_rate": 2.5004924996724822e-05, + "loss": 0.7368, + "step": 208770 + }, + { + "epoch": 1.333835912244611, + "grad_norm": 1.1574492454528809, + "learning_rate": 2.5000579394595864e-05, + "loss": 1.0002, + "step": 208780 + }, + { + "epoch": 1.3338997993943498, + "grad_norm": 0.8089993596076965, + "learning_rate": 2.499623404423268e-05, + "loss": 0.9104, + "step": 208790 + }, + { + "epoch": 1.3339636865440885, + "grad_norm": 1.1149111986160278, + "learning_rate": 2.4991888945679066e-05, + "loss": 0.926, + "step": 208800 + }, + { + "epoch": 1.3340275736938272, + "grad_norm": 1.1031670570373535, + "learning_rate": 2.498754409897875e-05, + "loss": 0.7789, + "step": 208810 + }, + { + "epoch": 1.334091460843566, + "grad_norm": 0.6810122728347778, + "learning_rate": 2.498319950417552e-05, + "loss": 0.9843, + "step": 208820 + }, + { + "epoch": 1.3341553479933046, + "grad_norm": 1.146023154258728, + "learning_rate": 2.4978855161313097e-05, + "loss": 0.8365, + "step": 208830 + }, + { + "epoch": 1.3342192351430433, + "grad_norm": 1.486267328262329, + "learning_rate": 2.4974511070435268e-05, + "loss": 0.8662, + "step": 208840 + }, + { + "epoch": 1.334283122292782, + "grad_norm": 0.8784306049346924, + "learning_rate": 2.497016723158574e-05, + "loss": 0.8443, + "step": 208850 + }, + { + "epoch": 1.3343470094425207, + "grad_norm": 1.1221381425857544, + "learning_rate": 2.4965823644808307e-05, + "loss": 0.9196, + "step": 208860 + }, + { + "epoch": 
1.3344108965922594, + "grad_norm": 1.6030218601226807, + "learning_rate": 2.4961480310146663e-05, + "loss": 0.8168, + "step": 208870 + }, + { + "epoch": 1.3344747837419981, + "grad_norm": 1.1350743770599365, + "learning_rate": 2.4957137227644577e-05, + "loss": 0.8172, + "step": 208880 + }, + { + "epoch": 1.3345386708917368, + "grad_norm": 0.664034903049469, + "learning_rate": 2.49527943973458e-05, + "loss": 0.72, + "step": 208890 + }, + { + "epoch": 1.3346025580414755, + "grad_norm": 1.3050806522369385, + "learning_rate": 2.4948451819294034e-05, + "loss": 0.9105, + "step": 208900 + }, + { + "epoch": 1.3346664451912142, + "grad_norm": 0.4084964096546173, + "learning_rate": 2.4944109493533052e-05, + "loss": 0.764, + "step": 208910 + }, + { + "epoch": 1.334730332340953, + "grad_norm": 0.9133265614509583, + "learning_rate": 2.4939767420106547e-05, + "loss": 0.7507, + "step": 208920 + }, + { + "epoch": 1.3347942194906917, + "grad_norm": 0.8358080983161926, + "learning_rate": 2.4935425599058275e-05, + "loss": 0.8011, + "step": 208930 + }, + { + "epoch": 1.3348581066404304, + "grad_norm": 0.9465192556381226, + "learning_rate": 2.493108403043194e-05, + "loss": 1.0144, + "step": 208940 + }, + { + "epoch": 1.334921993790169, + "grad_norm": 1.0894412994384766, + "learning_rate": 2.4926742714271295e-05, + "loss": 0.9632, + "step": 208950 + }, + { + "epoch": 1.3349858809399078, + "grad_norm": 0.7485235929489136, + "learning_rate": 2.492240165062002e-05, + "loss": 0.6174, + "step": 208960 + }, + { + "epoch": 1.3350497680896465, + "grad_norm": 1.1640737056732178, + "learning_rate": 2.4918060839521874e-05, + "loss": 1.0708, + "step": 208970 + }, + { + "epoch": 1.3351136552393852, + "grad_norm": 0.9745331406593323, + "learning_rate": 2.4913720281020537e-05, + "loss": 0.9084, + "step": 208980 + }, + { + "epoch": 1.335177542389124, + "grad_norm": 0.9204380512237549, + "learning_rate": 2.4909379975159764e-05, + "loss": 0.9714, + "step": 208990 + }, + { + "epoch": 1.3352414295388626, + 
"grad_norm": 1.0989751815795898, + "learning_rate": 2.4905039921983213e-05, + "loss": 0.7627, + "step": 209000 + }, + { + "epoch": 1.3353053166886013, + "grad_norm": 1.1908149719238281, + "learning_rate": 2.4900700121534642e-05, + "loss": 0.8905, + "step": 209010 + }, + { + "epoch": 1.33536920383834, + "grad_norm": 1.3680340051651, + "learning_rate": 2.489636057385772e-05, + "loss": 0.8965, + "step": 209020 + }, + { + "epoch": 1.3354330909880787, + "grad_norm": 0.7258527874946594, + "learning_rate": 2.4892021278996182e-05, + "loss": 0.8439, + "step": 209030 + }, + { + "epoch": 1.3354969781378174, + "grad_norm": 1.1028822660446167, + "learning_rate": 2.4887682236993686e-05, + "loss": 0.8114, + "step": 209040 + }, + { + "epoch": 1.3355608652875561, + "grad_norm": 0.7421342730522156, + "learning_rate": 2.488334344789398e-05, + "loss": 0.7569, + "step": 209050 + }, + { + "epoch": 1.3356247524372948, + "grad_norm": 0.9636204242706299, + "learning_rate": 2.4879004911740716e-05, + "loss": 0.8479, + "step": 209060 + }, + { + "epoch": 1.3356886395870335, + "grad_norm": 1.148268222808838, + "learning_rate": 2.487466662857762e-05, + "loss": 0.9857, + "step": 209070 + }, + { + "epoch": 1.3357525267367722, + "grad_norm": 0.6279143691062927, + "learning_rate": 2.4870328598448352e-05, + "loss": 0.6136, + "step": 209080 + }, + { + "epoch": 1.3358164138865107, + "grad_norm": 0.8338180780410767, + "learning_rate": 2.486599082139662e-05, + "loss": 0.9912, + "step": 209090 + }, + { + "epoch": 1.3358803010362497, + "grad_norm": 0.9658007621765137, + "learning_rate": 2.4861653297466114e-05, + "loss": 1.1272, + "step": 209100 + }, + { + "epoch": 1.3359441881859881, + "grad_norm": 1.018320083618164, + "learning_rate": 2.4857316026700488e-05, + "loss": 0.6616, + "step": 209110 + }, + { + "epoch": 1.336008075335727, + "grad_norm": 1.224833607673645, + "learning_rate": 2.4852979009143467e-05, + "loss": 0.8024, + "step": 209120 + }, + { + "epoch": 1.3360719624854656, + "grad_norm": 
0.8727512955665588, + "learning_rate": 2.484864224483868e-05, + "loss": 0.7102, + "step": 209130 + }, + { + "epoch": 1.3361358496352045, + "grad_norm": 0.7761167883872986, + "learning_rate": 2.484430573382985e-05, + "loss": 1.2143, + "step": 209140 + }, + { + "epoch": 1.336199736784943, + "grad_norm": 1.4156831502914429, + "learning_rate": 2.4839969476160596e-05, + "loss": 0.6088, + "step": 209150 + }, + { + "epoch": 1.336263623934682, + "grad_norm": 0.7810295820236206, + "learning_rate": 2.4835633471874635e-05, + "loss": 0.787, + "step": 209160 + }, + { + "epoch": 1.3363275110844204, + "grad_norm": 0.8101314306259155, + "learning_rate": 2.48312977210156e-05, + "loss": 1.043, + "step": 209170 + }, + { + "epoch": 1.3363913982341593, + "grad_norm": 0.8262963891029358, + "learning_rate": 2.4826962223627192e-05, + "loss": 0.9108, + "step": 209180 + }, + { + "epoch": 1.3364552853838978, + "grad_norm": 1.052907109260559, + "learning_rate": 2.4822626979753028e-05, + "loss": 0.891, + "step": 209190 + }, + { + "epoch": 1.3365191725336365, + "grad_norm": 0.8523666262626648, + "learning_rate": 2.4818291989436815e-05, + "loss": 0.6906, + "step": 209200 + }, + { + "epoch": 1.3365830596833752, + "grad_norm": 1.5421022176742554, + "learning_rate": 2.4813957252722165e-05, + "loss": 0.85, + "step": 209210 + }, + { + "epoch": 1.336646946833114, + "grad_norm": 1.0445187091827393, + "learning_rate": 2.4809622769652775e-05, + "loss": 0.8081, + "step": 209220 + }, + { + "epoch": 1.3367108339828526, + "grad_norm": 1.041160225868225, + "learning_rate": 2.4805288540272276e-05, + "loss": 1.0529, + "step": 209230 + }, + { + "epoch": 1.3367747211325913, + "grad_norm": 1.4466737508773804, + "learning_rate": 2.480138795076988e-05, + "loss": 0.8208, + "step": 209240 + }, + { + "epoch": 1.33683860828233, + "grad_norm": 0.7779701948165894, + "learning_rate": 2.4797054203518528e-05, + "loss": 0.7357, + "step": 209250 + }, + { + "epoch": 1.3369024954320687, + "grad_norm": 0.940628707408905, + 
"learning_rate": 2.4792720710082623e-05, + "loss": 1.0128, + "step": 209260 + }, + { + "epoch": 1.3369663825818074, + "grad_norm": 1.1309727430343628, + "learning_rate": 2.478882078303863e-05, + "loss": 1.0985, + "step": 209270 + }, + { + "epoch": 1.3370302697315462, + "grad_norm": 1.0964093208312988, + "learning_rate": 2.4784487771972347e-05, + "loss": 0.6566, + "step": 209280 + }, + { + "epoch": 1.3370941568812849, + "grad_norm": 1.1902695894241333, + "learning_rate": 2.4780155014848105e-05, + "loss": 0.9892, + "step": 209290 + }, + { + "epoch": 1.3371580440310236, + "grad_norm": 1.3145884275436401, + "learning_rate": 2.47758225117095e-05, + "loss": 0.8356, + "step": 209300 + }, + { + "epoch": 1.3372219311807623, + "grad_norm": 0.8866621851921082, + "learning_rate": 2.4771490262600205e-05, + "loss": 0.9547, + "step": 209310 + }, + { + "epoch": 1.337285818330501, + "grad_norm": 1.3845595121383667, + "learning_rate": 2.476715826756381e-05, + "loss": 0.9326, + "step": 209320 + }, + { + "epoch": 1.3373497054802397, + "grad_norm": 1.0303796529769897, + "learning_rate": 2.4762826526643962e-05, + "loss": 0.9338, + "step": 209330 + }, + { + "epoch": 1.3374135926299784, + "grad_norm": 0.978714644908905, + "learning_rate": 2.4758495039884305e-05, + "loss": 0.8114, + "step": 209340 + }, + { + "epoch": 1.337477479779717, + "grad_norm": 0.685970664024353, + "learning_rate": 2.4754163807328416e-05, + "loss": 0.9129, + "step": 209350 + }, + { + "epoch": 1.3375413669294558, + "grad_norm": 0.3362213969230652, + "learning_rate": 2.474983282901996e-05, + "loss": 0.79, + "step": 209360 + }, + { + "epoch": 1.3376052540791945, + "grad_norm": 0.7439454793930054, + "learning_rate": 2.474550210500251e-05, + "loss": 1.0737, + "step": 209370 + }, + { + "epoch": 1.3376691412289332, + "grad_norm": 1.3877562284469604, + "learning_rate": 2.4741171635319722e-05, + "loss": 0.6211, + "step": 209380 + }, + { + "epoch": 1.337733028378672, + "grad_norm": 0.8978985548019409, + "learning_rate": 
2.4736841420015174e-05, + "loss": 0.9921, + "step": 209390 + }, + { + "epoch": 1.3377969155284106, + "grad_norm": 0.743375301361084, + "learning_rate": 2.4732511459132502e-05, + "loss": 0.7833, + "step": 209400 + }, + { + "epoch": 1.3378608026781493, + "grad_norm": 1.1048918962478638, + "learning_rate": 2.472818175271528e-05, + "loss": 0.8042, + "step": 209410 + }, + { + "epoch": 1.337924689827888, + "grad_norm": 0.9322056770324707, + "learning_rate": 2.472385230080715e-05, + "loss": 0.9332, + "step": 209420 + }, + { + "epoch": 1.3379885769776267, + "grad_norm": 1.3366330862045288, + "learning_rate": 2.4719523103451673e-05, + "loss": 1.0764, + "step": 209430 + }, + { + "epoch": 1.3380524641273654, + "grad_norm": 0.6746212244033813, + "learning_rate": 2.4715194160692494e-05, + "loss": 0.6417, + "step": 209440 + }, + { + "epoch": 1.3381163512771042, + "grad_norm": 1.4820467233657837, + "learning_rate": 2.4710865472573164e-05, + "loss": 0.866, + "step": 209450 + }, + { + "epoch": 1.3381802384268429, + "grad_norm": 0.6888427138328552, + "learning_rate": 2.4706537039137316e-05, + "loss": 1.2135, + "step": 209460 + }, + { + "epoch": 1.3382441255765816, + "grad_norm": 0.9417890310287476, + "learning_rate": 2.47022088604285e-05, + "loss": 1.0028, + "step": 209470 + }, + { + "epoch": 1.3383080127263203, + "grad_norm": 1.1863046884536743, + "learning_rate": 2.4697880936490348e-05, + "loss": 0.746, + "step": 209480 + }, + { + "epoch": 1.338371899876059, + "grad_norm": 0.9177879095077515, + "learning_rate": 2.4693553267366405e-05, + "loss": 0.805, + "step": 209490 + }, + { + "epoch": 1.3384357870257977, + "grad_norm": 0.8089203238487244, + "learning_rate": 2.468922585310029e-05, + "loss": 0.7571, + "step": 209500 + }, + { + "epoch": 1.3384996741755364, + "grad_norm": 1.0447206497192383, + "learning_rate": 2.468489869373555e-05, + "loss": 0.9604, + "step": 209510 + }, + { + "epoch": 1.338563561325275, + "grad_norm": 0.8918728828430176, + "learning_rate": 2.4680571789315803e-05, 
+ "loss": 0.878, + "step": 209520 + }, + { + "epoch": 1.3386274484750138, + "grad_norm": 1.284941554069519, + "learning_rate": 2.46762451398846e-05, + "loss": 0.8817, + "step": 209530 + }, + { + "epoch": 1.3386913356247525, + "grad_norm": 0.9816265106201172, + "learning_rate": 2.46719187454855e-05, + "loss": 0.8537, + "step": 209540 + }, + { + "epoch": 1.3387552227744912, + "grad_norm": 1.0117629766464233, + "learning_rate": 2.4667592606162106e-05, + "loss": 0.9611, + "step": 209550 + }, + { + "epoch": 1.33881910992423, + "grad_norm": 1.9839173555374146, + "learning_rate": 2.466326672195795e-05, + "loss": 0.6887, + "step": 209560 + }, + { + "epoch": 1.3388829970739686, + "grad_norm": 0.913004994392395, + "learning_rate": 2.4658941092916642e-05, + "loss": 0.8255, + "step": 209570 + }, + { + "epoch": 1.3389468842237071, + "grad_norm": 1.4669578075408936, + "learning_rate": 2.4654615719081696e-05, + "loss": 0.8509, + "step": 209580 + }, + { + "epoch": 1.339010771373446, + "grad_norm": 0.9293825030326843, + "learning_rate": 2.4650290600496716e-05, + "loss": 0.8569, + "step": 209590 + }, + { + "epoch": 1.3390746585231845, + "grad_norm": 1.2991247177124023, + "learning_rate": 2.4645965737205224e-05, + "loss": 0.8354, + "step": 209600 + }, + { + "epoch": 1.3391385456729235, + "grad_norm": 1.1067110300064087, + "learning_rate": 2.4641641129250807e-05, + "loss": 0.9723, + "step": 209610 + }, + { + "epoch": 1.339202432822662, + "grad_norm": 1.071905255317688, + "learning_rate": 2.4637316776676987e-05, + "loss": 0.7582, + "step": 209620 + }, + { + "epoch": 1.3392663199724009, + "grad_norm": 0.8266701102256775, + "learning_rate": 2.463299267952735e-05, + "loss": 0.965, + "step": 209630 + }, + { + "epoch": 1.3393302071221393, + "grad_norm": 0.879081130027771, + "learning_rate": 2.4628668837845397e-05, + "loss": 0.7896, + "step": 209640 + }, + { + "epoch": 1.3393940942718783, + "grad_norm": 0.9813835620880127, + "learning_rate": 2.4624345251674702e-05, + "loss": 1.0807, + 
"step": 209650 + }, + { + "epoch": 1.3394579814216168, + "grad_norm": 1.115027904510498, + "learning_rate": 2.462002192105882e-05, + "loss": 0.8159, + "step": 209660 + }, + { + "epoch": 1.3395218685713557, + "grad_norm": 1.6251577138900757, + "learning_rate": 2.4615698846041258e-05, + "loss": 0.7014, + "step": 209670 + }, + { + "epoch": 1.3395857557210942, + "grad_norm": 0.8698475360870361, + "learning_rate": 2.4611376026665584e-05, + "loss": 0.942, + "step": 209680 + }, + { + "epoch": 1.3396496428708329, + "grad_norm": 1.2194809913635254, + "learning_rate": 2.46070534629753e-05, + "loss": 0.6803, + "step": 209690 + }, + { + "epoch": 1.3397135300205716, + "grad_norm": 1.7993695735931396, + "learning_rate": 2.4602731155013974e-05, + "loss": 0.9341, + "step": 209700 + }, + { + "epoch": 1.3397774171703103, + "grad_norm": 0.7888948917388916, + "learning_rate": 2.4598409102825103e-05, + "loss": 0.9482, + "step": 209710 + }, + { + "epoch": 1.339841304320049, + "grad_norm": 1.112903356552124, + "learning_rate": 2.4594087306452244e-05, + "loss": 0.7849, + "step": 209720 + }, + { + "epoch": 1.3399051914697877, + "grad_norm": 1.238926649093628, + "learning_rate": 2.458976576593888e-05, + "loss": 1.0857, + "step": 209730 + }, + { + "epoch": 1.3399690786195264, + "grad_norm": 1.2833373546600342, + "learning_rate": 2.4585444481328584e-05, + "loss": 0.9552, + "step": 209740 + }, + { + "epoch": 1.3400329657692651, + "grad_norm": 0.7105987668037415, + "learning_rate": 2.458112345266483e-05, + "loss": 0.7341, + "step": 209750 + }, + { + "epoch": 1.3400968529190038, + "grad_norm": 0.7378994226455688, + "learning_rate": 2.4576802679991173e-05, + "loss": 0.9365, + "step": 209760 + }, + { + "epoch": 1.3401607400687425, + "grad_norm": 0.8128080368041992, + "learning_rate": 2.4572482163351086e-05, + "loss": 0.9999, + "step": 209770 + }, + { + "epoch": 1.3402246272184812, + "grad_norm": 1.1463675498962402, + "learning_rate": 2.4568161902788118e-05, + "loss": 0.8544, + "step": 209780 + }, 
+ { + "epoch": 1.34028851436822, + "grad_norm": 0.8119245171546936, + "learning_rate": 2.4563841898345745e-05, + "loss": 0.8122, + "step": 209790 + }, + { + "epoch": 1.3403524015179586, + "grad_norm": 0.8801420331001282, + "learning_rate": 2.4559522150067504e-05, + "loss": 0.8097, + "step": 209800 + }, + { + "epoch": 1.3404162886676974, + "grad_norm": 0.726452648639679, + "learning_rate": 2.4555202657996875e-05, + "loss": 0.8997, + "step": 209810 + }, + { + "epoch": 1.340480175817436, + "grad_norm": 1.294712781906128, + "learning_rate": 2.4550883422177378e-05, + "loss": 0.964, + "step": 209820 + }, + { + "epoch": 1.3405440629671748, + "grad_norm": 1.0049042701721191, + "learning_rate": 2.4546564442652487e-05, + "loss": 0.6612, + "step": 209830 + }, + { + "epoch": 1.3406079501169135, + "grad_norm": 0.6928468942642212, + "learning_rate": 2.4542245719465706e-05, + "loss": 0.8433, + "step": 209840 + }, + { + "epoch": 1.3406718372666522, + "grad_norm": 0.8258686065673828, + "learning_rate": 2.4537927252660565e-05, + "loss": 1.0434, + "step": 209850 + }, + { + "epoch": 1.3407357244163909, + "grad_norm": 1.3018081188201904, + "learning_rate": 2.4533609042280496e-05, + "loss": 0.9601, + "step": 209860 + }, + { + "epoch": 1.3407996115661296, + "grad_norm": 1.6450886726379395, + "learning_rate": 2.4529291088369038e-05, + "loss": 0.9166, + "step": 209870 + }, + { + "epoch": 1.3408634987158683, + "grad_norm": 1.0318559408187866, + "learning_rate": 2.4524973390969635e-05, + "loss": 1.085, + "step": 209880 + }, + { + "epoch": 1.340927385865607, + "grad_norm": 1.9742481708526611, + "learning_rate": 2.452065595012581e-05, + "loss": 0.9039, + "step": 209890 + }, + { + "epoch": 1.3409912730153457, + "grad_norm": 0.6447263956069946, + "learning_rate": 2.4516338765880996e-05, + "loss": 0.6018, + "step": 209900 + }, + { + "epoch": 1.3410551601650844, + "grad_norm": 1.2177265882492065, + "learning_rate": 2.451202183827872e-05, + "loss": 0.8312, + "step": 209910 + }, + { + "epoch": 
1.3411190473148231, + "grad_norm": 0.8734244704246521, + "learning_rate": 2.450770516736242e-05, + "loss": 0.9552, + "step": 209920 + }, + { + "epoch": 1.3411829344645618, + "grad_norm": 1.1779773235321045, + "learning_rate": 2.45033887531756e-05, + "loss": 0.8273, + "step": 209930 + }, + { + "epoch": 1.3412468216143005, + "grad_norm": 0.9692370891571045, + "learning_rate": 2.449907259576169e-05, + "loss": 0.9669, + "step": 209940 + }, + { + "epoch": 1.3413107087640392, + "grad_norm": 0.7844962477684021, + "learning_rate": 2.44947566951642e-05, + "loss": 0.7729, + "step": 209950 + }, + { + "epoch": 1.341374595913778, + "grad_norm": 0.7607170939445496, + "learning_rate": 2.449044105142656e-05, + "loss": 0.6972, + "step": 209960 + }, + { + "epoch": 1.3414384830635167, + "grad_norm": 1.2908620834350586, + "learning_rate": 2.4486125664592263e-05, + "loss": 1.1027, + "step": 209970 + }, + { + "epoch": 1.3415023702132554, + "grad_norm": 1.3262430429458618, + "learning_rate": 2.4481810534704734e-05, + "loss": 0.7157, + "step": 209980 + }, + { + "epoch": 1.341566257362994, + "grad_norm": 0.7177751660346985, + "learning_rate": 2.4477495661807476e-05, + "loss": 0.7883, + "step": 209990 + }, + { + "epoch": 1.3416301445127328, + "grad_norm": 0.7598410248756409, + "learning_rate": 2.447318104594391e-05, + "loss": 0.798, + "step": 210000 + }, + { + "epoch": 1.3416940316624715, + "grad_norm": 0.940240740776062, + "learning_rate": 2.4468866687157477e-05, + "loss": 1.0023, + "step": 210010 + }, + { + "epoch": 1.3417579188122102, + "grad_norm": 0.5641880631446838, + "learning_rate": 2.4464552585491662e-05, + "loss": 0.7427, + "step": 210020 + }, + { + "epoch": 1.3418218059619489, + "grad_norm": 1.3816797733306885, + "learning_rate": 2.446023874098987e-05, + "loss": 0.8589, + "step": 210030 + }, + { + "epoch": 1.3418856931116876, + "grad_norm": 0.8124895691871643, + "learning_rate": 2.4455925153695598e-05, + "loss": 0.9625, + "step": 210040 + }, + { + "epoch": 1.341949580261426, + 
"grad_norm": 1.486721158027649, + "learning_rate": 2.445161182365223e-05, + "loss": 0.8313, + "step": 210050 + }, + { + "epoch": 1.342013467411165, + "grad_norm": 0.8006391525268555, + "learning_rate": 2.444729875090326e-05, + "loss": 0.9999, + "step": 210060 + }, + { + "epoch": 1.3420773545609035, + "grad_norm": 0.6810546517372131, + "learning_rate": 2.4442985935492073e-05, + "loss": 1.0687, + "step": 210070 + }, + { + "epoch": 1.3421412417106424, + "grad_norm": 0.5805924534797668, + "learning_rate": 2.443867337746215e-05, + "loss": 0.958, + "step": 210080 + }, + { + "epoch": 1.342205128860381, + "grad_norm": 1.3511457443237305, + "learning_rate": 2.443436107685688e-05, + "loss": 0.795, + "step": 210090 + }, + { + "epoch": 1.3422690160101198, + "grad_norm": 0.9503484964370728, + "learning_rate": 2.4430049033719733e-05, + "loss": 0.8793, + "step": 210100 + }, + { + "epoch": 1.3423329031598583, + "grad_norm": 1.1696393489837646, + "learning_rate": 2.4425737248094094e-05, + "loss": 1.1012, + "step": 210110 + }, + { + "epoch": 1.3423967903095972, + "grad_norm": 0.622090220451355, + "learning_rate": 2.442142572002342e-05, + "loss": 0.9457, + "step": 210120 + }, + { + "epoch": 1.3424606774593357, + "grad_norm": 0.7924548983573914, + "learning_rate": 2.4417114449551104e-05, + "loss": 0.9871, + "step": 210130 + }, + { + "epoch": 1.3425245646090747, + "grad_norm": 1.0553364753723145, + "learning_rate": 2.4412803436720595e-05, + "loss": 0.7615, + "step": 210140 + }, + { + "epoch": 1.3425884517588131, + "grad_norm": 1.3900631666183472, + "learning_rate": 2.4408492681575273e-05, + "loss": 0.8291, + "step": 210150 + }, + { + "epoch": 1.342652338908552, + "grad_norm": 1.1718361377716064, + "learning_rate": 2.4404182184158563e-05, + "loss": 0.808, + "step": 210160 + }, + { + "epoch": 1.3427162260582906, + "grad_norm": 0.7470996975898743, + "learning_rate": 2.4399871944513907e-05, + "loss": 0.7203, + "step": 210170 + }, + { + "epoch": 1.3427801132080293, + "grad_norm": 
0.9686959981918335, + "learning_rate": 2.439556196268467e-05, + "loss": 1.1038, + "step": 210180 + }, + { + "epoch": 1.342844000357768, + "grad_norm": 1.1691725254058838, + "learning_rate": 2.439125223871429e-05, + "loss": 1.2793, + "step": 210190 + }, + { + "epoch": 1.3429078875075067, + "grad_norm": 0.729895293712616, + "learning_rate": 2.4386942772646138e-05, + "loss": 0.9453, + "step": 210200 + }, + { + "epoch": 1.3429717746572454, + "grad_norm": 1.0728123188018799, + "learning_rate": 2.438263356452365e-05, + "loss": 1.0757, + "step": 210210 + }, + { + "epoch": 1.343035661806984, + "grad_norm": 1.7532767057418823, + "learning_rate": 2.4378324614390187e-05, + "loss": 0.9886, + "step": 210220 + }, + { + "epoch": 1.3430995489567228, + "grad_norm": 0.6979058384895325, + "learning_rate": 2.4374015922289177e-05, + "loss": 0.8571, + "step": 210230 + }, + { + "epoch": 1.3431634361064615, + "grad_norm": 1.2367197275161743, + "learning_rate": 2.4369707488263978e-05, + "loss": 0.8007, + "step": 210240 + }, + { + "epoch": 1.3432273232562002, + "grad_norm": 1.1389049291610718, + "learning_rate": 2.436539931235802e-05, + "loss": 0.9297, + "step": 210250 + }, + { + "epoch": 1.343291210405939, + "grad_norm": 0.5948600172996521, + "learning_rate": 2.4361091394614644e-05, + "loss": 0.7676, + "step": 210260 + }, + { + "epoch": 1.3433550975556776, + "grad_norm": 0.8651537299156189, + "learning_rate": 2.4356783735077276e-05, + "loss": 1.1049, + "step": 210270 + }, + { + "epoch": 1.3434189847054163, + "grad_norm": 1.134677529335022, + "learning_rate": 2.4352476333789266e-05, + "loss": 0.8082, + "step": 210280 + }, + { + "epoch": 1.343482871855155, + "grad_norm": 0.868580162525177, + "learning_rate": 2.434816919079403e-05, + "loss": 0.9725, + "step": 210290 + }, + { + "epoch": 1.3435467590048937, + "grad_norm": 0.828855037689209, + "learning_rate": 2.4343862306134897e-05, + "loss": 0.8408, + "step": 210300 + }, + { + "epoch": 1.3436106461546324, + "grad_norm": 0.8323947787284851, + 
"learning_rate": 2.4339555679855284e-05, + "loss": 0.9766, + "step": 210310 + }, + { + "epoch": 1.3436745333043711, + "grad_norm": 1.219912052154541, + "learning_rate": 2.4335249311998533e-05, + "loss": 1.0028, + "step": 210320 + }, + { + "epoch": 1.3437384204541098, + "grad_norm": 0.6117528080940247, + "learning_rate": 2.4330943202608037e-05, + "loss": 0.7249, + "step": 210330 + }, + { + "epoch": 1.3438023076038486, + "grad_norm": 0.8942903876304626, + "learning_rate": 2.4326637351727134e-05, + "loss": 0.8577, + "step": 210340 + }, + { + "epoch": 1.3438661947535873, + "grad_norm": 0.9196701049804688, + "learning_rate": 2.432233175939922e-05, + "loss": 0.9215, + "step": 210350 + }, + { + "epoch": 1.343930081903326, + "grad_norm": 0.6849532723426819, + "learning_rate": 2.4318026425667623e-05, + "loss": 0.7433, + "step": 210360 + }, + { + "epoch": 1.3439939690530647, + "grad_norm": 0.8016952872276306, + "learning_rate": 2.4313721350575713e-05, + "loss": 0.8727, + "step": 210370 + }, + { + "epoch": 1.3440578562028034, + "grad_norm": 0.9626837372779846, + "learning_rate": 2.430941653416688e-05, + "loss": 0.8245, + "step": 210380 + }, + { + "epoch": 1.344121743352542, + "grad_norm": 0.9934982061386108, + "learning_rate": 2.430511197648442e-05, + "loss": 1.0403, + "step": 210390 + }, + { + "epoch": 1.3441856305022808, + "grad_norm": 0.8192433714866638, + "learning_rate": 2.4300807677571736e-05, + "loss": 1.1374, + "step": 210400 + }, + { + "epoch": 1.3442495176520195, + "grad_norm": 2.489893913269043, + "learning_rate": 2.429650363747213e-05, + "loss": 0.7958, + "step": 210410 + }, + { + "epoch": 1.3443134048017582, + "grad_norm": 0.5742020010948181, + "learning_rate": 2.4292199856228986e-05, + "loss": 0.7759, + "step": 210420 + }, + { + "epoch": 1.344377291951497, + "grad_norm": 0.7302964925765991, + "learning_rate": 2.4287896333885613e-05, + "loss": 0.7775, + "step": 210430 + }, + { + "epoch": 1.3444411791012356, + "grad_norm": 1.0013583898544312, + "learning_rate": 
2.428359307048539e-05, + "loss": 0.7784, + "step": 210440 + }, + { + "epoch": 1.3445050662509743, + "grad_norm": 1.9199076890945435, + "learning_rate": 2.4279290066071608e-05, + "loss": 0.913, + "step": 210450 + }, + { + "epoch": 1.344568953400713, + "grad_norm": 0.9353756904602051, + "learning_rate": 2.4274987320687648e-05, + "loss": 0.7027, + "step": 210460 + }, + { + "epoch": 1.3446328405504517, + "grad_norm": 0.9756304621696472, + "learning_rate": 2.42706848343768e-05, + "loss": 0.9233, + "step": 210470 + }, + { + "epoch": 1.3446967277001904, + "grad_norm": 1.185735821723938, + "learning_rate": 2.4266382607182435e-05, + "loss": 0.8367, + "step": 210480 + }, + { + "epoch": 1.3447606148499291, + "grad_norm": 0.7765511870384216, + "learning_rate": 2.4262080639147865e-05, + "loss": 0.6595, + "step": 210490 + }, + { + "epoch": 1.3448245019996679, + "grad_norm": 1.5717637538909912, + "learning_rate": 2.4257778930316384e-05, + "loss": 0.703, + "step": 210500 + }, + { + "epoch": 1.3448883891494066, + "grad_norm": 1.0303682088851929, + "learning_rate": 2.4253477480731362e-05, + "loss": 0.8976, + "step": 210510 + }, + { + "epoch": 1.3449522762991453, + "grad_norm": 0.7455207109451294, + "learning_rate": 2.4249176290436077e-05, + "loss": 0.8332, + "step": 210520 + }, + { + "epoch": 1.345016163448884, + "grad_norm": 0.8036611080169678, + "learning_rate": 2.424487535947388e-05, + "loss": 0.7954, + "step": 210530 + }, + { + "epoch": 1.3450800505986225, + "grad_norm": 0.8801621198654175, + "learning_rate": 2.4240574687888052e-05, + "loss": 0.9002, + "step": 210540 + }, + { + "epoch": 1.3451439377483614, + "grad_norm": 1.1013474464416504, + "learning_rate": 2.4236274275721943e-05, + "loss": 1.0208, + "step": 210550 + }, + { + "epoch": 1.3452078248980999, + "grad_norm": 0.9770920276641846, + "learning_rate": 2.423197412301882e-05, + "loss": 0.8401, + "step": 210560 + }, + { + "epoch": 1.3452717120478388, + "grad_norm": 0.9802312254905701, + "learning_rate": 
2.4227674229822028e-05, + "loss": 0.6977, + "step": 210570 + }, + { + "epoch": 1.3453355991975773, + "grad_norm": 1.036690354347229, + "learning_rate": 2.4223374596174838e-05, + "loss": 0.7917, + "step": 210580 + }, + { + "epoch": 1.3453994863473162, + "grad_norm": 0.9629283547401428, + "learning_rate": 2.421907522212058e-05, + "loss": 0.7148, + "step": 210590 + }, + { + "epoch": 1.3454633734970547, + "grad_norm": 0.5816007256507874, + "learning_rate": 2.4214776107702518e-05, + "loss": 0.9514, + "step": 210600 + }, + { + "epoch": 1.3455272606467936, + "grad_norm": 1.4846789836883545, + "learning_rate": 2.4210477252963993e-05, + "loss": 0.8683, + "step": 210610 + }, + { + "epoch": 1.345591147796532, + "grad_norm": 1.0084872245788574, + "learning_rate": 2.4206178657948246e-05, + "loss": 0.6668, + "step": 210620 + }, + { + "epoch": 1.345655034946271, + "grad_norm": 0.884223997592926, + "learning_rate": 2.4201880322698622e-05, + "loss": 1.055, + "step": 210630 + }, + { + "epoch": 1.3457189220960095, + "grad_norm": 0.749989926815033, + "learning_rate": 2.419758224725836e-05, + "loss": 0.7163, + "step": 210640 + }, + { + "epoch": 1.3457828092457484, + "grad_norm": 0.8538677096366882, + "learning_rate": 2.4193284431670786e-05, + "loss": 0.8633, + "step": 210650 + }, + { + "epoch": 1.345846696395487, + "grad_norm": 1.240678310394287, + "learning_rate": 2.4188986875979146e-05, + "loss": 0.7806, + "step": 210660 + }, + { + "epoch": 1.3459105835452256, + "grad_norm": 0.6129598021507263, + "learning_rate": 2.4184689580226756e-05, + "loss": 0.6788, + "step": 210670 + }, + { + "epoch": 1.3459744706949643, + "grad_norm": 1.047533631324768, + "learning_rate": 2.4180392544456852e-05, + "loss": 0.5952, + "step": 210680 + }, + { + "epoch": 1.346038357844703, + "grad_norm": 0.9103485941886902, + "learning_rate": 2.4176095768712737e-05, + "loss": 0.7303, + "step": 210690 + }, + { + "epoch": 1.3461022449944418, + "grad_norm": 0.8905709981918335, + "learning_rate": 
2.4171799253037698e-05, + "loss": 0.7106, + "step": 210700 + }, + { + "epoch": 1.3461661321441805, + "grad_norm": 1.0654971599578857, + "learning_rate": 2.416750299747496e-05, + "loss": 0.9144, + "step": 210710 + }, + { + "epoch": 1.3462300192939192, + "grad_norm": 0.9546240568161011, + "learning_rate": 2.416320700206784e-05, + "loss": 0.7629, + "step": 210720 + }, + { + "epoch": 1.3462939064436579, + "grad_norm": 0.566611111164093, + "learning_rate": 2.4158911266859556e-05, + "loss": 0.8748, + "step": 210730 + }, + { + "epoch": 1.3463577935933966, + "grad_norm": 0.8656834363937378, + "learning_rate": 2.4154615791893415e-05, + "loss": 0.7819, + "step": 210740 + }, + { + "epoch": 1.3464216807431353, + "grad_norm": 0.9609455466270447, + "learning_rate": 2.4150320577212628e-05, + "loss": 1.2611, + "step": 210750 + }, + { + "epoch": 1.346485567892874, + "grad_norm": 1.36478853225708, + "learning_rate": 2.4146025622860498e-05, + "loss": 0.7677, + "step": 210760 + }, + { + "epoch": 1.3465494550426127, + "grad_norm": 1.402571439743042, + "learning_rate": 2.4141730928880235e-05, + "loss": 0.6639, + "step": 210770 + }, + { + "epoch": 1.3466133421923514, + "grad_norm": 0.8779550194740295, + "learning_rate": 2.413743649531513e-05, + "loss": 0.778, + "step": 210780 + }, + { + "epoch": 1.34667722934209, + "grad_norm": 0.9404392242431641, + "learning_rate": 2.41331423222084e-05, + "loss": 0.8239, + "step": 210790 + }, + { + "epoch": 1.3467411164918288, + "grad_norm": 0.8661839365959167, + "learning_rate": 2.412884840960332e-05, + "loss": 1.1463, + "step": 210800 + }, + { + "epoch": 1.3468050036415675, + "grad_norm": 0.6455258727073669, + "learning_rate": 2.41245547575431e-05, + "loss": 0.6678, + "step": 210810 + }, + { + "epoch": 1.3468688907913062, + "grad_norm": 0.808452308177948, + "learning_rate": 2.4120261366071018e-05, + "loss": 0.591, + "step": 210820 + }, + { + "epoch": 1.346932777941045, + "grad_norm": 0.7696453928947449, + "learning_rate": 2.4115968235230275e-05, + 
"loss": 0.788, + "step": 210830 + }, + { + "epoch": 1.3469966650907836, + "grad_norm": 0.9432682991027832, + "learning_rate": 2.4111675365064146e-05, + "loss": 0.8129, + "step": 210840 + }, + { + "epoch": 1.3470605522405223, + "grad_norm": 1.2885863780975342, + "learning_rate": 2.4107382755615822e-05, + "loss": 0.9396, + "step": 210850 + }, + { + "epoch": 1.347124439390261, + "grad_norm": 0.49041441082954407, + "learning_rate": 2.410309040692857e-05, + "loss": 0.8879, + "step": 210860 + }, + { + "epoch": 1.3471883265399998, + "grad_norm": 1.4990586042404175, + "learning_rate": 2.4098798319045583e-05, + "loss": 0.8044, + "step": 210870 + }, + { + "epoch": 1.3472522136897385, + "grad_norm": 0.9001103043556213, + "learning_rate": 2.4094506492010127e-05, + "loss": 0.8259, + "step": 210880 + }, + { + "epoch": 1.3473161008394772, + "grad_norm": 0.582728385925293, + "learning_rate": 2.409021492586538e-05, + "loss": 0.8523, + "step": 210890 + }, + { + "epoch": 1.3473799879892159, + "grad_norm": 1.2756515741348267, + "learning_rate": 2.408592362065459e-05, + "loss": 0.9167, + "step": 210900 + }, + { + "epoch": 1.3474438751389546, + "grad_norm": 0.7740147709846497, + "learning_rate": 2.4081632576420975e-05, + "loss": 0.7101, + "step": 210910 + }, + { + "epoch": 1.3475077622886933, + "grad_norm": 1.3967020511627197, + "learning_rate": 2.4077341793207737e-05, + "loss": 1.3073, + "step": 210920 + }, + { + "epoch": 1.347571649438432, + "grad_norm": 0.6911699771881104, + "learning_rate": 2.4073051271058106e-05, + "loss": 0.8531, + "step": 210930 + }, + { + "epoch": 1.3476355365881707, + "grad_norm": 0.7950164675712585, + "learning_rate": 2.4068761010015256e-05, + "loss": 0.8932, + "step": 210940 + }, + { + "epoch": 1.3476994237379094, + "grad_norm": 0.8173822164535522, + "learning_rate": 2.406447101012244e-05, + "loss": 1.1364, + "step": 210950 + }, + { + "epoch": 1.3477633108876481, + "grad_norm": 1.062247633934021, + "learning_rate": 2.4060181271422823e-05, + "loss": 0.8704, + 
"step": 210960 + }, + { + "epoch": 1.3478271980373868, + "grad_norm": 0.691670298576355, + "learning_rate": 2.405589179395965e-05, + "loss": 1.0231, + "step": 210970 + }, + { + "epoch": 1.3478910851871255, + "grad_norm": 1.0676474571228027, + "learning_rate": 2.405160257777606e-05, + "loss": 0.8037, + "step": 210980 + }, + { + "epoch": 1.3479549723368642, + "grad_norm": 0.8413828015327454, + "learning_rate": 2.4047313622915295e-05, + "loss": 0.9708, + "step": 210990 + }, + { + "epoch": 1.348018859486603, + "grad_norm": 1.0489099025726318, + "learning_rate": 2.404302492942052e-05, + "loss": 0.9631, + "step": 211000 + }, + { + "epoch": 1.3480827466363416, + "grad_norm": 0.7359494566917419, + "learning_rate": 2.403873649733494e-05, + "loss": 0.7335, + "step": 211010 + }, + { + "epoch": 1.3481466337860804, + "grad_norm": 1.3242499828338623, + "learning_rate": 2.4034448326701763e-05, + "loss": 0.9638, + "step": 211020 + }, + { + "epoch": 1.3482105209358188, + "grad_norm": 0.7859591245651245, + "learning_rate": 2.403016041756413e-05, + "loss": 1.0956, + "step": 211030 + }, + { + "epoch": 1.3482744080855578, + "grad_norm": 0.9521056413650513, + "learning_rate": 2.4025872769965275e-05, + "loss": 0.795, + "step": 211040 + }, + { + "epoch": 1.3483382952352962, + "grad_norm": 0.9843899011611938, + "learning_rate": 2.4021585383948325e-05, + "loss": 0.9966, + "step": 211050 + }, + { + "epoch": 1.3484021823850352, + "grad_norm": 2.194084644317627, + "learning_rate": 2.401729825955651e-05, + "loss": 0.9103, + "step": 211060 + }, + { + "epoch": 1.3484660695347737, + "grad_norm": 0.6446147561073303, + "learning_rate": 2.4013011396832956e-05, + "loss": 0.9175, + "step": 211070 + }, + { + "epoch": 1.3485299566845126, + "grad_norm": 1.1317280530929565, + "learning_rate": 2.400872479582088e-05, + "loss": 1.0431, + "step": 211080 + }, + { + "epoch": 1.348593843834251, + "grad_norm": 0.8024947047233582, + "learning_rate": 2.4004438456563417e-05, + "loss": 0.7175, + "step": 211090 + }, + 
{ + "epoch": 1.34865773098399, + "grad_norm": 0.7585810422897339, + "learning_rate": 2.4000152379103764e-05, + "loss": 0.8535, + "step": 211100 + }, + { + "epoch": 1.3487216181337285, + "grad_norm": 0.8179787397384644, + "learning_rate": 2.399586656348505e-05, + "loss": 0.7568, + "step": 211110 + }, + { + "epoch": 1.3487855052834674, + "grad_norm": 0.7237686514854431, + "learning_rate": 2.3991581009750475e-05, + "loss": 0.8521, + "step": 211120 + }, + { + "epoch": 1.348849392433206, + "grad_norm": 0.956449568271637, + "learning_rate": 2.398729571794316e-05, + "loss": 0.962, + "step": 211130 + }, + { + "epoch": 1.3489132795829448, + "grad_norm": 0.7600199580192566, + "learning_rate": 2.39830106881063e-05, + "loss": 0.9539, + "step": 211140 + }, + { + "epoch": 1.3489771667326833, + "grad_norm": 0.7863625288009644, + "learning_rate": 2.397872592028301e-05, + "loss": 0.7805, + "step": 211150 + }, + { + "epoch": 1.349041053882422, + "grad_norm": 0.7679553627967834, + "learning_rate": 2.3974441414516485e-05, + "loss": 0.9939, + "step": 211160 + }, + { + "epoch": 1.3491049410321607, + "grad_norm": 1.3117929697036743, + "learning_rate": 2.3970157170849827e-05, + "loss": 0.8474, + "step": 211170 + }, + { + "epoch": 1.3491688281818994, + "grad_norm": 1.0582443475723267, + "learning_rate": 2.3965873189326226e-05, + "loss": 0.9592, + "step": 211180 + }, + { + "epoch": 1.3492327153316381, + "grad_norm": 0.9994162917137146, + "learning_rate": 2.3961589469988788e-05, + "loss": 0.8485, + "step": 211190 + }, + { + "epoch": 1.3492966024813768, + "grad_norm": 0.60932457447052, + "learning_rate": 2.3957306012880686e-05, + "loss": 0.7063, + "step": 211200 + }, + { + "epoch": 1.3493604896311155, + "grad_norm": 0.7182226181030273, + "learning_rate": 2.3953022818045023e-05, + "loss": 0.847, + "step": 211210 + }, + { + "epoch": 1.3494243767808543, + "grad_norm": 1.0658320188522339, + "learning_rate": 2.3948739885524957e-05, + "loss": 0.9415, + "step": 211220 + }, + { + "epoch": 
1.349488263930593, + "grad_norm": 0.8645941019058228, + "learning_rate": 2.3944457215363636e-05, + "loss": 0.6805, + "step": 211230 + }, + { + "epoch": 1.3495521510803317, + "grad_norm": 1.8405535221099854, + "learning_rate": 2.3940174807604154e-05, + "loss": 0.9702, + "step": 211240 + }, + { + "epoch": 1.3496160382300704, + "grad_norm": 0.7858190536499023, + "learning_rate": 2.3935892662289678e-05, + "loss": 0.9197, + "step": 211250 + }, + { + "epoch": 1.349679925379809, + "grad_norm": 2.9478893280029297, + "learning_rate": 2.3931610779463286e-05, + "loss": 0.9379, + "step": 211260 + }, + { + "epoch": 1.3497438125295478, + "grad_norm": 0.8578464984893799, + "learning_rate": 2.392732915916815e-05, + "loss": 0.7668, + "step": 211270 + }, + { + "epoch": 1.3498076996792865, + "grad_norm": 0.8595908880233765, + "learning_rate": 2.3923047801447345e-05, + "loss": 0.7818, + "step": 211280 + }, + { + "epoch": 1.3498715868290252, + "grad_norm": 1.156435489654541, + "learning_rate": 2.3918766706344026e-05, + "loss": 1.0211, + "step": 211290 + }, + { + "epoch": 1.349935473978764, + "grad_norm": 1.146933674812317, + "learning_rate": 2.391448587390127e-05, + "loss": 1.0301, + "step": 211300 + }, + { + "epoch": 1.3499993611285026, + "grad_norm": 0.7327816486358643, + "learning_rate": 2.391020530416223e-05, + "loss": 0.7104, + "step": 211310 + }, + { + "epoch": 1.3500632482782413, + "grad_norm": 1.090005874633789, + "learning_rate": 2.3905924997169978e-05, + "loss": 0.887, + "step": 211320 + }, + { + "epoch": 1.35012713542798, + "grad_norm": 0.7504577040672302, + "learning_rate": 2.3901644952967643e-05, + "loss": 0.6344, + "step": 211330 + }, + { + "epoch": 1.3501910225777187, + "grad_norm": 0.6553131341934204, + "learning_rate": 2.389736517159831e-05, + "loss": 0.7824, + "step": 211340 + }, + { + "epoch": 1.3502549097274574, + "grad_norm": 0.6806291937828064, + "learning_rate": 2.3893085653105112e-05, + "loss": 0.7066, + "step": 211350 + }, + { + "epoch": 1.3503187968771961, + 
"grad_norm": 0.8419098258018494, + "learning_rate": 2.388880639753111e-05, + "loss": 0.8939, + "step": 211360 + }, + { + "epoch": 1.3503826840269348, + "grad_norm": 1.017172932624817, + "learning_rate": 2.3884527404919434e-05, + "loss": 1.1268, + "step": 211370 + }, + { + "epoch": 1.3504465711766735, + "grad_norm": 0.920089066028595, + "learning_rate": 2.3880248675313138e-05, + "loss": 1.1174, + "step": 211380 + }, + { + "epoch": 1.3505104583264123, + "grad_norm": 0.7588639259338379, + "learning_rate": 2.3875970208755354e-05, + "loss": 0.9587, + "step": 211390 + }, + { + "epoch": 1.350574345476151, + "grad_norm": 0.907305121421814, + "learning_rate": 2.3871692005289137e-05, + "loss": 0.9076, + "step": 211400 + }, + { + "epoch": 1.3506382326258897, + "grad_norm": 0.9567956924438477, + "learning_rate": 2.38674140649576e-05, + "loss": 1.0364, + "step": 211410 + }, + { + "epoch": 1.3507021197756284, + "grad_norm": 0.8349950313568115, + "learning_rate": 2.3863136387803793e-05, + "loss": 0.8609, + "step": 211420 + }, + { + "epoch": 1.350766006925367, + "grad_norm": 0.9572453498840332, + "learning_rate": 2.3858858973870813e-05, + "loss": 1.0533, + "step": 211430 + }, + { + "epoch": 1.3508298940751058, + "grad_norm": 1.308901309967041, + "learning_rate": 2.385458182320176e-05, + "loss": 0.8949, + "step": 211440 + }, + { + "epoch": 1.3508937812248445, + "grad_norm": 0.664740264415741, + "learning_rate": 2.3850304935839668e-05, + "loss": 0.951, + "step": 211450 + }, + { + "epoch": 1.3509576683745832, + "grad_norm": 0.962526261806488, + "learning_rate": 2.3846028311827666e-05, + "loss": 0.6875, + "step": 211460 + }, + { + "epoch": 1.351021555524322, + "grad_norm": 0.8609249591827393, + "learning_rate": 2.3841751951208745e-05, + "loss": 0.9577, + "step": 211470 + }, + { + "epoch": 1.3510854426740606, + "grad_norm": 0.6966132521629333, + "learning_rate": 2.3837475854026033e-05, + "loss": 0.7883, + "step": 211480 + }, + { + "epoch": 1.3511493298237993, + "grad_norm": 
1.8291178941726685, + "learning_rate": 2.3833200020322554e-05, + "loss": 1.0099, + "step": 211490 + }, + { + "epoch": 1.351213216973538, + "grad_norm": 0.8845551609992981, + "learning_rate": 2.3828924450141403e-05, + "loss": 0.7502, + "step": 211500 + }, + { + "epoch": 1.3512771041232767, + "grad_norm": 1.426044225692749, + "learning_rate": 2.3824649143525613e-05, + "loss": 0.6654, + "step": 211510 + }, + { + "epoch": 1.3513409912730152, + "grad_norm": 0.8587886691093445, + "learning_rate": 2.3820374100518263e-05, + "loss": 0.9827, + "step": 211520 + }, + { + "epoch": 1.3514048784227541, + "grad_norm": 0.9691550135612488, + "learning_rate": 2.381609932116238e-05, + "loss": 0.9936, + "step": 211530 + }, + { + "epoch": 1.3514687655724926, + "grad_norm": 1.3499754667282104, + "learning_rate": 2.3811824805501025e-05, + "loss": 0.834, + "step": 211540 + }, + { + "epoch": 1.3515326527222316, + "grad_norm": 0.6362777948379517, + "learning_rate": 2.3807550553577274e-05, + "loss": 0.7756, + "step": 211550 + }, + { + "epoch": 1.35159653987197, + "grad_norm": 0.6540228724479675, + "learning_rate": 2.3803276565434124e-05, + "loss": 1.3985, + "step": 211560 + }, + { + "epoch": 1.351660427021709, + "grad_norm": 1.1397900581359863, + "learning_rate": 2.3799002841114666e-05, + "loss": 0.9984, + "step": 211570 + }, + { + "epoch": 1.3517243141714474, + "grad_norm": 0.7781268358230591, + "learning_rate": 2.3794729380661896e-05, + "loss": 0.8131, + "step": 211580 + }, + { + "epoch": 1.3517882013211864, + "grad_norm": 1.5782052278518677, + "learning_rate": 2.3790883491896017e-05, + "loss": 0.8044, + "step": 211590 + }, + { + "epoch": 1.3518520884709249, + "grad_norm": 0.9964745044708252, + "learning_rate": 2.3786610532908554e-05, + "loss": 0.7436, + "step": 211600 + }, + { + "epoch": 1.3519159756206638, + "grad_norm": 1.558613657951355, + "learning_rate": 2.3782337837912644e-05, + "loss": 0.9581, + "step": 211610 + }, + { + "epoch": 1.3519798627704023, + "grad_norm": 
0.7820762991905212, + "learning_rate": 2.377806540695124e-05, + "loss": 0.9374, + "step": 211620 + }, + { + "epoch": 1.352043749920141, + "grad_norm": 1.2613965272903442, + "learning_rate": 2.3773793240067416e-05, + "loss": 0.9243, + "step": 211630 + }, + { + "epoch": 1.3521076370698797, + "grad_norm": 1.1297177076339722, + "learning_rate": 2.3769521337304163e-05, + "loss": 0.9963, + "step": 211640 + }, + { + "epoch": 1.3521715242196184, + "grad_norm": 0.7416729927062988, + "learning_rate": 2.376524969870454e-05, + "loss": 0.8046, + "step": 211650 + }, + { + "epoch": 1.352235411369357, + "grad_norm": 0.9396662712097168, + "learning_rate": 2.3760978324311533e-05, + "loss": 0.7937, + "step": 211660 + }, + { + "epoch": 1.3522992985190958, + "grad_norm": 1.451461672782898, + "learning_rate": 2.375670721416818e-05, + "loss": 0.806, + "step": 211670 + }, + { + "epoch": 1.3523631856688345, + "grad_norm": 1.6874157190322876, + "learning_rate": 2.37524363683175e-05, + "loss": 0.7712, + "step": 211680 + }, + { + "epoch": 1.3524270728185732, + "grad_norm": 1.7415262460708618, + "learning_rate": 2.3748165786802472e-05, + "loss": 0.7358, + "step": 211690 + }, + { + "epoch": 1.352490959968312, + "grad_norm": 0.7011155486106873, + "learning_rate": 2.374389546966615e-05, + "loss": 0.8004, + "step": 211700 + }, + { + "epoch": 1.3525548471180506, + "grad_norm": 1.013753056526184, + "learning_rate": 2.37396254169515e-05, + "loss": 0.6766, + "step": 211710 + }, + { + "epoch": 1.3526187342677893, + "grad_norm": 1.1585613489151, + "learning_rate": 2.373535562870155e-05, + "loss": 0.8866, + "step": 211720 + }, + { + "epoch": 1.352682621417528, + "grad_norm": 0.6112300157546997, + "learning_rate": 2.3731086104959277e-05, + "loss": 0.7901, + "step": 211730 + }, + { + "epoch": 1.3527465085672667, + "grad_norm": 1.580859661102295, + "learning_rate": 2.3726816845767712e-05, + "loss": 1.0197, + "step": 211740 + }, + { + "epoch": 1.3528103957170055, + "grad_norm": 0.9176729321479797, + 
"learning_rate": 2.3722547851169813e-05, + "loss": 1.0305, + "step": 211750 + }, + { + "epoch": 1.3528742828667442, + "grad_norm": 0.78628009557724, + "learning_rate": 2.3718279121208608e-05, + "loss": 0.8649, + "step": 211760 + }, + { + "epoch": 1.3529381700164829, + "grad_norm": 1.3608671426773071, + "learning_rate": 2.371401065592705e-05, + "loss": 0.8869, + "step": 211770 + }, + { + "epoch": 1.3530020571662216, + "grad_norm": 0.8958773612976074, + "learning_rate": 2.3709742455368168e-05, + "loss": 0.7032, + "step": 211780 + }, + { + "epoch": 1.3530659443159603, + "grad_norm": 0.802314817905426, + "learning_rate": 2.37054745195749e-05, + "loss": 1.0902, + "step": 211790 + }, + { + "epoch": 1.353129831465699, + "grad_norm": 0.7494603991508484, + "learning_rate": 2.3701206848590267e-05, + "loss": 0.7383, + "step": 211800 + }, + { + "epoch": 1.3531937186154377, + "grad_norm": 0.6451514959335327, + "learning_rate": 2.369693944245722e-05, + "loss": 0.7629, + "step": 211810 + }, + { + "epoch": 1.3532576057651764, + "grad_norm": 0.7738144993782043, + "learning_rate": 2.3692672301218764e-05, + "loss": 0.8307, + "step": 211820 + }, + { + "epoch": 1.353321492914915, + "grad_norm": 0.7524031400680542, + "learning_rate": 2.368840542491784e-05, + "loss": 0.9163, + "step": 211830 + }, + { + "epoch": 1.3533853800646538, + "grad_norm": 0.9877924919128418, + "learning_rate": 2.3684138813597456e-05, + "loss": 1.0076, + "step": 211840 + }, + { + "epoch": 1.3534492672143925, + "grad_norm": 1.3271013498306274, + "learning_rate": 2.3679872467300545e-05, + "loss": 1.0871, + "step": 211850 + }, + { + "epoch": 1.3535131543641312, + "grad_norm": 0.9618971943855286, + "learning_rate": 2.3675606386070083e-05, + "loss": 0.6661, + "step": 211860 + }, + { + "epoch": 1.35357704151387, + "grad_norm": 0.9295185208320618, + "learning_rate": 2.3671340569949057e-05, + "loss": 0.5962, + "step": 211870 + }, + { + "epoch": 1.3536409286636086, + "grad_norm": 1.1782742738723755, + "learning_rate": 
2.3667075018980396e-05, + "loss": 0.8708, + "step": 211880 + }, + { + "epoch": 1.3537048158133473, + "grad_norm": 0.8071572780609131, + "learning_rate": 2.3662809733207082e-05, + "loss": 0.6242, + "step": 211890 + }, + { + "epoch": 1.353768702963086, + "grad_norm": 1.3956265449523926, + "learning_rate": 2.3658544712672042e-05, + "loss": 0.9122, + "step": 211900 + }, + { + "epoch": 1.3538325901128248, + "grad_norm": 0.816125750541687, + "learning_rate": 2.3654279957418268e-05, + "loss": 0.703, + "step": 211910 + }, + { + "epoch": 1.3538964772625635, + "grad_norm": 1.574976921081543, + "learning_rate": 2.365001546748866e-05, + "loss": 0.8301, + "step": 211920 + }, + { + "epoch": 1.3539603644123022, + "grad_norm": 0.6849200129508972, + "learning_rate": 2.3645751242926223e-05, + "loss": 0.8724, + "step": 211930 + }, + { + "epoch": 1.3540242515620409, + "grad_norm": 0.5813919901847839, + "learning_rate": 2.3641487283773844e-05, + "loss": 0.8879, + "step": 211940 + }, + { + "epoch": 1.3540881387117796, + "grad_norm": 0.691142201423645, + "learning_rate": 2.363722359007451e-05, + "loss": 0.9655, + "step": 211950 + }, + { + "epoch": 1.3541520258615183, + "grad_norm": 1.5124849081039429, + "learning_rate": 2.3632960161871126e-05, + "loss": 0.8739, + "step": 211960 + }, + { + "epoch": 1.354215913011257, + "grad_norm": 0.6597105264663696, + "learning_rate": 2.3629123303522642e-05, + "loss": 1.048, + "step": 211970 + }, + { + "epoch": 1.3542798001609957, + "grad_norm": 0.793196976184845, + "learning_rate": 2.3624860379879876e-05, + "loss": 0.6684, + "step": 211980 + }, + { + "epoch": 1.3543436873107344, + "grad_norm": 0.7942538857460022, + "learning_rate": 2.362059772185759e-05, + "loss": 1.0311, + "step": 211990 + }, + { + "epoch": 1.354407574460473, + "grad_norm": 1.052597999572754, + "learning_rate": 2.361633532949872e-05, + "loss": 1.015, + "step": 212000 + }, + { + "epoch": 1.3544714616102116, + "grad_norm": 1.1230298280715942, + "learning_rate": 2.3612073202846163e-05, + 
"loss": 1.0466, + "step": 212010 + }, + { + "epoch": 1.3545353487599505, + "grad_norm": 0.7834064364433289, + "learning_rate": 2.3607811341942875e-05, + "loss": 0.7204, + "step": 212020 + }, + { + "epoch": 1.354599235909689, + "grad_norm": 0.6216392517089844, + "learning_rate": 2.3603549746831737e-05, + "loss": 0.9645, + "step": 212030 + }, + { + "epoch": 1.354663123059428, + "grad_norm": 1.3307925462722778, + "learning_rate": 2.359928841755571e-05, + "loss": 1.0167, + "step": 212040 + }, + { + "epoch": 1.3547270102091664, + "grad_norm": 1.176498293876648, + "learning_rate": 2.3595027354157673e-05, + "loss": 0.6712, + "step": 212050 + }, + { + "epoch": 1.3547908973589053, + "grad_norm": 1.0552092790603638, + "learning_rate": 2.359076655668057e-05, + "loss": 0.9531, + "step": 212060 + }, + { + "epoch": 1.3548547845086438, + "grad_norm": 0.6958808302879333, + "learning_rate": 2.3586506025167275e-05, + "loss": 0.6582, + "step": 212070 + }, + { + "epoch": 1.3549186716583828, + "grad_norm": 0.9148417115211487, + "learning_rate": 2.3582245759660735e-05, + "loss": 1.134, + "step": 212080 + }, + { + "epoch": 1.3549825588081212, + "grad_norm": 0.9399345517158508, + "learning_rate": 2.357798576020382e-05, + "loss": 0.8481, + "step": 212090 + }, + { + "epoch": 1.3550464459578602, + "grad_norm": 1.0423952341079712, + "learning_rate": 2.357372602683946e-05, + "loss": 0.8451, + "step": 212100 + }, + { + "epoch": 1.3551103331075987, + "grad_norm": 1.2409332990646362, + "learning_rate": 2.3569466559610527e-05, + "loss": 0.8279, + "step": 212110 + }, + { + "epoch": 1.3551742202573374, + "grad_norm": 0.8522923588752747, + "learning_rate": 2.3565207358559948e-05, + "loss": 0.8555, + "step": 212120 + }, + { + "epoch": 1.355238107407076, + "grad_norm": 1.4603486061096191, + "learning_rate": 2.3560948423730578e-05, + "loss": 0.8882, + "step": 212130 + }, + { + "epoch": 1.3553019945568148, + "grad_norm": 1.0698403120040894, + "learning_rate": 2.3556689755165357e-05, + "loss": 0.9946, + 
"step": 212140 + }, + { + "epoch": 1.3553658817065535, + "grad_norm": 0.9794582724571228, + "learning_rate": 2.3552431352907123e-05, + "loss": 0.7777, + "step": 212150 + }, + { + "epoch": 1.3554297688562922, + "grad_norm": 1.0823343992233276, + "learning_rate": 2.3548173216998805e-05, + "loss": 1.0112, + "step": 212160 + }, + { + "epoch": 1.355493656006031, + "grad_norm": 0.6649702787399292, + "learning_rate": 2.3543915347483247e-05, + "loss": 0.7617, + "step": 212170 + }, + { + "epoch": 1.3555575431557696, + "grad_norm": 1.082112431526184, + "learning_rate": 2.3539657744403366e-05, + "loss": 0.7136, + "step": 212180 + }, + { + "epoch": 1.3556214303055083, + "grad_norm": 0.6287432312965393, + "learning_rate": 2.3535400407802012e-05, + "loss": 0.5495, + "step": 212190 + }, + { + "epoch": 1.355685317455247, + "grad_norm": 0.8478794097900391, + "learning_rate": 2.353114333772206e-05, + "loss": 0.8829, + "step": 212200 + }, + { + "epoch": 1.3557492046049857, + "grad_norm": 0.7227674126625061, + "learning_rate": 2.3526886534206422e-05, + "loss": 0.9989, + "step": 212210 + }, + { + "epoch": 1.3558130917547244, + "grad_norm": 1.2565025091171265, + "learning_rate": 2.3522629997297914e-05, + "loss": 0.9998, + "step": 212220 + }, + { + "epoch": 1.3558769789044631, + "grad_norm": 1.0990771055221558, + "learning_rate": 2.351837372703945e-05, + "loss": 0.8897, + "step": 212230 + }, + { + "epoch": 1.3559408660542018, + "grad_norm": 1.4418489933013916, + "learning_rate": 2.351411772347387e-05, + "loss": 1.0007, + "step": 212240 + }, + { + "epoch": 1.3560047532039405, + "grad_norm": 1.40555739402771, + "learning_rate": 2.3509861986644045e-05, + "loss": 0.8061, + "step": 212250 + }, + { + "epoch": 1.3560686403536792, + "grad_norm": 0.9722267985343933, + "learning_rate": 2.3505606516592798e-05, + "loss": 0.8834, + "step": 212260 + }, + { + "epoch": 1.356132527503418, + "grad_norm": 0.7908661961555481, + "learning_rate": 2.350135131336304e-05, + "loss": 0.8076, + "step": 212270 + }, 
+ { + "epoch": 1.3561964146531567, + "grad_norm": 1.0190848112106323, + "learning_rate": 2.3497096376997578e-05, + "loss": 0.8599, + "step": 212280 + }, + { + "epoch": 1.3562603018028954, + "grad_norm": 1.968063473701477, + "learning_rate": 2.3492841707539305e-05, + "loss": 0.9196, + "step": 212290 + }, + { + "epoch": 1.356324188952634, + "grad_norm": 1.0753601789474487, + "learning_rate": 2.3488587305031023e-05, + "loss": 0.8658, + "step": 212300 + }, + { + "epoch": 1.3563880761023728, + "grad_norm": 1.183738112449646, + "learning_rate": 2.348433316951561e-05, + "loss": 0.9116, + "step": 212310 + }, + { + "epoch": 1.3564519632521115, + "grad_norm": 0.7145233154296875, + "learning_rate": 2.3480079301035918e-05, + "loss": 0.8693, + "step": 212320 + }, + { + "epoch": 1.3565158504018502, + "grad_norm": 0.6723766326904297, + "learning_rate": 2.3475825699634745e-05, + "loss": 1.0396, + "step": 212330 + }, + { + "epoch": 1.356579737551589, + "grad_norm": 0.8253989219665527, + "learning_rate": 2.347157236535498e-05, + "loss": 0.9733, + "step": 212340 + }, + { + "epoch": 1.3566436247013276, + "grad_norm": 0.9459612369537354, + "learning_rate": 2.346731929823941e-05, + "loss": 0.796, + "step": 212350 + }, + { + "epoch": 1.3567075118510663, + "grad_norm": 0.9628978371620178, + "learning_rate": 2.346306649833091e-05, + "loss": 0.7765, + "step": 212360 + }, + { + "epoch": 1.356771399000805, + "grad_norm": 0.74045330286026, + "learning_rate": 2.3458813965672267e-05, + "loss": 0.7261, + "step": 212370 + }, + { + "epoch": 1.3568352861505437, + "grad_norm": 0.8961535692214966, + "learning_rate": 2.345456170030635e-05, + "loss": 1.1465, + "step": 212380 + }, + { + "epoch": 1.3568991733002824, + "grad_norm": 1.105933427810669, + "learning_rate": 2.345030970227594e-05, + "loss": 0.8767, + "step": 212390 + }, + { + "epoch": 1.3569630604500211, + "grad_norm": 0.8771437406539917, + "learning_rate": 2.3446057971623902e-05, + "loss": 0.7039, + "step": 212400 + }, + { + "epoch": 
1.3570269475997598, + "grad_norm": 1.0957084894180298, + "learning_rate": 2.344180650839301e-05, + "loss": 0.8449, + "step": 212410 + }, + { + "epoch": 1.3570908347494985, + "grad_norm": 0.6853296756744385, + "learning_rate": 2.3437555312626126e-05, + "loss": 0.8821, + "step": 212420 + }, + { + "epoch": 1.3571547218992372, + "grad_norm": 1.5772802829742432, + "learning_rate": 2.3433304384366017e-05, + "loss": 0.9634, + "step": 212430 + }, + { + "epoch": 1.357218609048976, + "grad_norm": 0.9938908815383911, + "learning_rate": 2.3429053723655535e-05, + "loss": 0.7178, + "step": 212440 + }, + { + "epoch": 1.3572824961987147, + "grad_norm": 0.8694483637809753, + "learning_rate": 2.3424803330537455e-05, + "loss": 0.9839, + "step": 212450 + }, + { + "epoch": 1.3573463833484534, + "grad_norm": 0.9842444062232971, + "learning_rate": 2.3420553205054607e-05, + "loss": 0.7376, + "step": 212460 + }, + { + "epoch": 1.357410270498192, + "grad_norm": 0.7344105243682861, + "learning_rate": 2.341630334724977e-05, + "loss": 0.9092, + "step": 212470 + }, + { + "epoch": 1.3574741576479306, + "grad_norm": 1.4985830783843994, + "learning_rate": 2.3412053757165774e-05, + "loss": 0.7086, + "step": 212480 + }, + { + "epoch": 1.3575380447976695, + "grad_norm": 0.49012845754623413, + "learning_rate": 2.340780443484538e-05, + "loss": 1.0866, + "step": 212490 + }, + { + "epoch": 1.357601931947408, + "grad_norm": 0.8759766221046448, + "learning_rate": 2.3403555380331415e-05, + "loss": 1.0279, + "step": 212500 + }, + { + "epoch": 1.357665819097147, + "grad_norm": 0.5767194032669067, + "learning_rate": 2.3399306593666647e-05, + "loss": 0.678, + "step": 212510 + }, + { + "epoch": 1.3577297062468854, + "grad_norm": 0.6936454176902771, + "learning_rate": 2.3395058074893868e-05, + "loss": 1.0519, + "step": 212520 + }, + { + "epoch": 1.3577935933966243, + "grad_norm": 0.7158231139183044, + "learning_rate": 2.3390809824055892e-05, + "loss": 1.1214, + "step": 212530 + }, + { + "epoch": 
1.3578574805463628, + "grad_norm": 1.0264630317687988, + "learning_rate": 2.3386561841195458e-05, + "loss": 1.0875, + "step": 212540 + }, + { + "epoch": 1.3579213676961017, + "grad_norm": 1.9632465839385986, + "learning_rate": 2.338231412635539e-05, + "loss": 0.889, + "step": 212550 + }, + { + "epoch": 1.3579852548458402, + "grad_norm": 1.0401647090911865, + "learning_rate": 2.3378066679578436e-05, + "loss": 1.2824, + "step": 212560 + }, + { + "epoch": 1.3580491419955791, + "grad_norm": 0.8090847730636597, + "learning_rate": 2.3373819500907396e-05, + "loss": 0.8421, + "step": 212570 + }, + { + "epoch": 1.3581130291453176, + "grad_norm": 1.2447673082351685, + "learning_rate": 2.3369572590385004e-05, + "loss": 1.0739, + "step": 212580 + }, + { + "epoch": 1.3581769162950565, + "grad_norm": 1.3129416704177856, + "learning_rate": 2.3365325948054077e-05, + "loss": 1.0271, + "step": 212590 + }, + { + "epoch": 1.358240803444795, + "grad_norm": 0.7778066396713257, + "learning_rate": 2.3361079573957346e-05, + "loss": 1.0992, + "step": 212600 + }, + { + "epoch": 1.3583046905945337, + "grad_norm": 0.9241915941238403, + "learning_rate": 2.33568334681376e-05, + "loss": 0.8088, + "step": 212610 + }, + { + "epoch": 1.3583685777442724, + "grad_norm": 0.736827552318573, + "learning_rate": 2.335258763063758e-05, + "loss": 0.821, + "step": 212620 + }, + { + "epoch": 1.3584324648940111, + "grad_norm": 1.1152037382125854, + "learning_rate": 2.3348342061500067e-05, + "loss": 0.8219, + "step": 212630 + }, + { + "epoch": 1.3584963520437499, + "grad_norm": 0.8919548392295837, + "learning_rate": 2.3344096760767793e-05, + "loss": 1.0119, + "step": 212640 + }, + { + "epoch": 1.3585602391934886, + "grad_norm": 1.2961301803588867, + "learning_rate": 2.333985172848354e-05, + "loss": 0.7603, + "step": 212650 + }, + { + "epoch": 1.3586241263432273, + "grad_norm": 1.0925780534744263, + "learning_rate": 2.3335606964690032e-05, + "loss": 0.9653, + "step": 212660 + }, + { + "epoch": 1.358688013492966, 
+ "grad_norm": 1.0221478939056396, + "learning_rate": 2.3331362469430045e-05, + "loss": 0.8009, + "step": 212670 + }, + { + "epoch": 1.3587519006427047, + "grad_norm": 0.905896008014679, + "learning_rate": 2.3327118242746288e-05, + "loss": 0.7346, + "step": 212680 + }, + { + "epoch": 1.3588157877924434, + "grad_norm": 0.7704225778579712, + "learning_rate": 2.3322874284681552e-05, + "loss": 0.9561, + "step": 212690 + }, + { + "epoch": 1.358879674942182, + "grad_norm": 1.1265662908554077, + "learning_rate": 2.3318630595278522e-05, + "loss": 0.9499, + "step": 212700 + }, + { + "epoch": 1.3589435620919208, + "grad_norm": 0.8070195317268372, + "learning_rate": 2.331438717457997e-05, + "loss": 0.8712, + "step": 212710 + }, + { + "epoch": 1.3590074492416595, + "grad_norm": 0.9437048435211182, + "learning_rate": 2.3310144022628644e-05, + "loss": 0.8336, + "step": 212720 + }, + { + "epoch": 1.3590713363913982, + "grad_norm": 0.9660305976867676, + "learning_rate": 2.3305901139467257e-05, + "loss": 0.7254, + "step": 212730 + }, + { + "epoch": 1.359135223541137, + "grad_norm": 0.7670518159866333, + "learning_rate": 2.3301658525138543e-05, + "loss": 0.8708, + "step": 212740 + }, + { + "epoch": 1.3591991106908756, + "grad_norm": 0.7318940758705139, + "learning_rate": 2.32974161796852e-05, + "loss": 0.8674, + "step": 212750 + }, + { + "epoch": 1.3592629978406143, + "grad_norm": 0.7752392888069153, + "learning_rate": 2.329317410315e-05, + "loss": 0.9143, + "step": 212760 + }, + { + "epoch": 1.359326884990353, + "grad_norm": 1.292202353477478, + "learning_rate": 2.328893229557562e-05, + "loss": 0.7913, + "step": 212770 + }, + { + "epoch": 1.3593907721400917, + "grad_norm": 0.8490725159645081, + "learning_rate": 2.3284690757004824e-05, + "loss": 1.0284, + "step": 212780 + }, + { + "epoch": 1.3594546592898304, + "grad_norm": 0.8979281783103943, + "learning_rate": 2.3280449487480284e-05, + "loss": 1.2277, + "step": 212790 + }, + { + "epoch": 1.3595185464395692, + "grad_norm": 
0.689542293548584, + "learning_rate": 2.327620848704475e-05, + "loss": 0.9464, + "step": 212800 + }, + { + "epoch": 1.3595824335893079, + "grad_norm": 0.6301685571670532, + "learning_rate": 2.327196775574089e-05, + "loss": 1.0883, + "step": 212810 + }, + { + "epoch": 1.3596463207390466, + "grad_norm": 0.6643343567848206, + "learning_rate": 2.326772729361147e-05, + "loss": 0.7041, + "step": 212820 + }, + { + "epoch": 1.3597102078887853, + "grad_norm": 1.3023102283477783, + "learning_rate": 2.3263487100699132e-05, + "loss": 1.058, + "step": 212830 + }, + { + "epoch": 1.359774095038524, + "grad_norm": 0.8597766757011414, + "learning_rate": 2.3259247177046618e-05, + "loss": 0.8037, + "step": 212840 + }, + { + "epoch": 1.3598379821882627, + "grad_norm": 0.8535871505737305, + "learning_rate": 2.3255007522696638e-05, + "loss": 0.8148, + "step": 212850 + }, + { + "epoch": 1.3599018693380014, + "grad_norm": 1.6547738313674927, + "learning_rate": 2.3250768137691843e-05, + "loss": 0.8184, + "step": 212860 + }, + { + "epoch": 1.35996575648774, + "grad_norm": 0.9821819067001343, + "learning_rate": 2.324652902207498e-05, + "loss": 0.8821, + "step": 212870 + }, + { + "epoch": 1.3600296436374788, + "grad_norm": 0.9379867315292358, + "learning_rate": 2.324229017588869e-05, + "loss": 1.3132, + "step": 212880 + }, + { + "epoch": 1.3600935307872175, + "grad_norm": 1.1494848728179932, + "learning_rate": 2.3238051599175714e-05, + "loss": 1.2216, + "step": 212890 + }, + { + "epoch": 1.3601574179369562, + "grad_norm": 1.0652223825454712, + "learning_rate": 2.3233813291978684e-05, + "loss": 1.2067, + "step": 212900 + }, + { + "epoch": 1.360221305086695, + "grad_norm": 0.7528551816940308, + "learning_rate": 2.3229575254340335e-05, + "loss": 0.7722, + "step": 212910 + }, + { + "epoch": 1.3602851922364336, + "grad_norm": 1.1288870573043823, + "learning_rate": 2.32253374863033e-05, + "loss": 0.669, + "step": 212920 + }, + { + "epoch": 1.3603490793861723, + "grad_norm": 1.154624581336975, + 
"learning_rate": 2.32210999879103e-05, + "loss": 0.9427, + "step": 212930 + }, + { + "epoch": 1.360412966535911, + "grad_norm": 1.0726332664489746, + "learning_rate": 2.3216862759203973e-05, + "loss": 0.9466, + "step": 212940 + }, + { + "epoch": 1.3604768536856497, + "grad_norm": 1.285915732383728, + "learning_rate": 2.321262580022703e-05, + "loss": 1.0116, + "step": 212950 + }, + { + "epoch": 1.3605407408353885, + "grad_norm": 0.9262716174125671, + "learning_rate": 2.3208389111022095e-05, + "loss": 0.6734, + "step": 212960 + }, + { + "epoch": 1.360604627985127, + "grad_norm": 0.6906473636627197, + "learning_rate": 2.3204152691631874e-05, + "loss": 0.8864, + "step": 212970 + }, + { + "epoch": 1.3606685151348659, + "grad_norm": 1.7085462808609009, + "learning_rate": 2.319991654209901e-05, + "loss": 0.9229, + "step": 212980 + }, + { + "epoch": 1.3607324022846043, + "grad_norm": 0.7899442315101624, + "learning_rate": 2.3195680662466183e-05, + "loss": 0.6897, + "step": 212990 + }, + { + "epoch": 1.3607962894343433, + "grad_norm": 0.758979082107544, + "learning_rate": 2.3191445052776024e-05, + "loss": 0.9965, + "step": 213000 + }, + { + "epoch": 1.3608601765840818, + "grad_norm": 1.0996798276901245, + "learning_rate": 2.3187209713071222e-05, + "loss": 1.0065, + "step": 213010 + }, + { + "epoch": 1.3609240637338207, + "grad_norm": 0.710588812828064, + "learning_rate": 2.31829746433944e-05, + "loss": 0.7498, + "step": 213020 + }, + { + "epoch": 1.3609879508835592, + "grad_norm": 0.9244322180747986, + "learning_rate": 2.3178739843788244e-05, + "loss": 1.0425, + "step": 213030 + }, + { + "epoch": 1.361051838033298, + "grad_norm": 1.1750261783599854, + "learning_rate": 2.317450531429536e-05, + "loss": 0.7303, + "step": 213040 + }, + { + "epoch": 1.3611157251830366, + "grad_norm": 0.7922186255455017, + "learning_rate": 2.3170271054958416e-05, + "loss": 1.1356, + "step": 213050 + }, + { + "epoch": 1.3611796123327755, + "grad_norm": 0.9548922777175903, + "learning_rate": 
2.3166037065820067e-05, + "loss": 0.8428, + "step": 213060 + }, + { + "epoch": 1.361243499482514, + "grad_norm": 0.9790717959403992, + "learning_rate": 2.3161803346922927e-05, + "loss": 0.7591, + "step": 213070 + }, + { + "epoch": 1.361307386632253, + "grad_norm": 0.8037964701652527, + "learning_rate": 2.315756989830966e-05, + "loss": 0.6687, + "step": 213080 + }, + { + "epoch": 1.3613712737819914, + "grad_norm": 1.0827137231826782, + "learning_rate": 2.3153336720022867e-05, + "loss": 0.9129, + "step": 213090 + }, + { + "epoch": 1.3614351609317301, + "grad_norm": 0.7812821865081787, + "learning_rate": 2.314910381210522e-05, + "loss": 0.8242, + "step": 213100 + }, + { + "epoch": 1.3614990480814688, + "grad_norm": 0.9069188833236694, + "learning_rate": 2.3144871174599308e-05, + "loss": 0.7806, + "step": 213110 + }, + { + "epoch": 1.3615629352312075, + "grad_norm": 1.1667894124984741, + "learning_rate": 2.31406388075478e-05, + "loss": 1.0365, + "step": 213120 + }, + { + "epoch": 1.3616268223809462, + "grad_norm": 0.7389217019081116, + "learning_rate": 2.313640671099327e-05, + "loss": 1.1612, + "step": 213130 + }, + { + "epoch": 1.361690709530685, + "grad_norm": 0.9983386993408203, + "learning_rate": 2.3132174884978388e-05, + "loss": 1.0671, + "step": 213140 + }, + { + "epoch": 1.3617545966804236, + "grad_norm": 1.3475415706634521, + "learning_rate": 2.3127943329545727e-05, + "loss": 0.9481, + "step": 213150 + }, + { + "epoch": 1.3618184838301624, + "grad_norm": 0.8335529565811157, + "learning_rate": 2.3123712044737946e-05, + "loss": 0.688, + "step": 213160 + }, + { + "epoch": 1.361882370979901, + "grad_norm": 2.321277379989624, + "learning_rate": 2.311948103059761e-05, + "loss": 0.8938, + "step": 213170 + }, + { + "epoch": 1.3619462581296398, + "grad_norm": 0.7873269319534302, + "learning_rate": 2.311525028716738e-05, + "loss": 0.8673, + "step": 213180 + }, + { + "epoch": 1.3620101452793785, + "grad_norm": 0.9356293082237244, + "learning_rate": 2.311101981448982e-05, 
+ "loss": 0.7351, + "step": 213190 + }, + { + "epoch": 1.3620740324291172, + "grad_norm": 0.7549158334732056, + "learning_rate": 2.3106789612607567e-05, + "loss": 0.7732, + "step": 213200 + }, + { + "epoch": 1.3621379195788559, + "grad_norm": 1.441564679145813, + "learning_rate": 2.3102559681563214e-05, + "loss": 0.9888, + "step": 213210 + }, + { + "epoch": 1.3622018067285946, + "grad_norm": 1.071210503578186, + "learning_rate": 2.309833002139933e-05, + "loss": 0.6508, + "step": 213220 + }, + { + "epoch": 1.3622656938783333, + "grad_norm": 0.7152714729309082, + "learning_rate": 2.309410063215856e-05, + "loss": 1.1104, + "step": 213230 + }, + { + "epoch": 1.362329581028072, + "grad_norm": 0.8112555742263794, + "learning_rate": 2.308987151388345e-05, + "loss": 0.8675, + "step": 213240 + }, + { + "epoch": 1.3623934681778107, + "grad_norm": 0.9434011578559875, + "learning_rate": 2.3085642666616637e-05, + "loss": 0.9279, + "step": 213250 + }, + { + "epoch": 1.3624573553275494, + "grad_norm": 1.1366301774978638, + "learning_rate": 2.3081414090400666e-05, + "loss": 0.7816, + "step": 213260 + }, + { + "epoch": 1.3625212424772881, + "grad_norm": 1.0150545835494995, + "learning_rate": 2.3077185785278166e-05, + "loss": 0.9145, + "step": 213270 + }, + { + "epoch": 1.3625851296270268, + "grad_norm": 0.6110414266586304, + "learning_rate": 2.3072957751291675e-05, + "loss": 0.7733, + "step": 213280 + }, + { + "epoch": 1.3626490167767655, + "grad_norm": 0.844383955001831, + "learning_rate": 2.306872998848381e-05, + "loss": 0.8608, + "step": 213290 + }, + { + "epoch": 1.3627129039265042, + "grad_norm": 0.8418871760368347, + "learning_rate": 2.3064502496897118e-05, + "loss": 0.9518, + "step": 213300 + }, + { + "epoch": 1.362776791076243, + "grad_norm": 0.8400182723999023, + "learning_rate": 2.3060275276574206e-05, + "loss": 0.8602, + "step": 213310 + }, + { + "epoch": 1.3628406782259816, + "grad_norm": 1.9541168212890625, + "learning_rate": 2.3056048327557604e-05, + "loss": 0.8702, + 
"step": 213320 + }, + { + "epoch": 1.3629045653757204, + "grad_norm": 1.0552854537963867, + "learning_rate": 2.305182164988993e-05, + "loss": 0.9381, + "step": 213330 + }, + { + "epoch": 1.362968452525459, + "grad_norm": 0.7301352024078369, + "learning_rate": 2.3047595243613705e-05, + "loss": 0.7445, + "step": 213340 + }, + { + "epoch": 1.3630323396751978, + "grad_norm": 0.8270401954650879, + "learning_rate": 2.3043369108771535e-05, + "loss": 0.6635, + "step": 213350 + }, + { + "epoch": 1.3630962268249365, + "grad_norm": 1.0124050378799438, + "learning_rate": 2.303914324540594e-05, + "loss": 0.7198, + "step": 213360 + }, + { + "epoch": 1.3631601139746752, + "grad_norm": 1.3121830224990845, + "learning_rate": 2.3034917653559497e-05, + "loss": 0.8133, + "step": 213370 + }, + { + "epoch": 1.3632240011244139, + "grad_norm": 0.9895945191383362, + "learning_rate": 2.3030692333274777e-05, + "loss": 0.7254, + "step": 213380 + }, + { + "epoch": 1.3632878882741526, + "grad_norm": 0.8924286365509033, + "learning_rate": 2.30264672845943e-05, + "loss": 0.7883, + "step": 213390 + }, + { + "epoch": 1.3633517754238913, + "grad_norm": 0.6295365691184998, + "learning_rate": 2.3022242507560647e-05, + "loss": 0.8513, + "step": 213400 + }, + { + "epoch": 1.36341566257363, + "grad_norm": 0.8795775771141052, + "learning_rate": 2.301801800221634e-05, + "loss": 0.9533, + "step": 213410 + }, + { + "epoch": 1.3634795497233687, + "grad_norm": 0.703488290309906, + "learning_rate": 2.3013793768603948e-05, + "loss": 0.8536, + "step": 213420 + }, + { + "epoch": 1.3635434368731074, + "grad_norm": 1.8506830930709839, + "learning_rate": 2.300956980676598e-05, + "loss": 0.9916, + "step": 213430 + }, + { + "epoch": 1.3636073240228461, + "grad_norm": 0.8599974513053894, + "learning_rate": 2.3005346116745014e-05, + "loss": 0.81, + "step": 213440 + }, + { + "epoch": 1.3636712111725848, + "grad_norm": 1.0625780820846558, + "learning_rate": 2.3001122698583548e-05, + "loss": 0.6802, + "step": 213450 + }, + 
{ + "epoch": 1.3637350983223233, + "grad_norm": 0.860063910484314, + "learning_rate": 2.299689955232415e-05, + "loss": 0.7409, + "step": 213460 + }, + { + "epoch": 1.3637989854720622, + "grad_norm": 1.2501275539398193, + "learning_rate": 2.2992676678009324e-05, + "loss": 0.9112, + "step": 213470 + }, + { + "epoch": 1.3638628726218007, + "grad_norm": 1.930945634841919, + "learning_rate": 2.298845407568162e-05, + "loss": 0.7647, + "step": 213480 + }, + { + "epoch": 1.3639267597715397, + "grad_norm": 1.4821895360946655, + "learning_rate": 2.2984231745383533e-05, + "loss": 1.1456, + "step": 213490 + }, + { + "epoch": 1.3639906469212781, + "grad_norm": 2.2362570762634277, + "learning_rate": 2.2980009687157628e-05, + "loss": 1.0458, + "step": 213500 + }, + { + "epoch": 1.364054534071017, + "grad_norm": 1.25023353099823, + "learning_rate": 2.297578790104638e-05, + "loss": 1.0971, + "step": 213510 + }, + { + "epoch": 1.3641184212207556, + "grad_norm": 0.988797664642334, + "learning_rate": 2.297156638709234e-05, + "loss": 0.8612, + "step": 213520 + }, + { + "epoch": 1.3641823083704945, + "grad_norm": 0.8659194707870483, + "learning_rate": 2.2967345145338e-05, + "loss": 0.9736, + "step": 213530 + }, + { + "epoch": 1.364246195520233, + "grad_norm": 0.8853728175163269, + "learning_rate": 2.2963124175825896e-05, + "loss": 0.9972, + "step": 213540 + }, + { + "epoch": 1.364310082669972, + "grad_norm": 1.851197361946106, + "learning_rate": 2.2958903478598504e-05, + "loss": 0.9088, + "step": 213550 + }, + { + "epoch": 1.3643739698197104, + "grad_norm": 0.7157253623008728, + "learning_rate": 2.2954683053698344e-05, + "loss": 0.7841, + "step": 213560 + }, + { + "epoch": 1.3644378569694493, + "grad_norm": 1.3922441005706787, + "learning_rate": 2.295046290116794e-05, + "loss": 0.8863, + "step": 213570 + }, + { + "epoch": 1.3645017441191878, + "grad_norm": 0.6246136426925659, + "learning_rate": 2.2946243021049763e-05, + "loss": 0.8969, + "step": 213580 + }, + { + "epoch": 
1.3645656312689265, + "grad_norm": 1.028562307357788, + "learning_rate": 2.2942023413386344e-05, + "loss": 0.7831, + "step": 213590 + }, + { + "epoch": 1.3646295184186652, + "grad_norm": 0.7829952836036682, + "learning_rate": 2.2937804078220132e-05, + "loss": 0.8312, + "step": 213600 + }, + { + "epoch": 1.364693405568404, + "grad_norm": 0.6332594752311707, + "learning_rate": 2.2933585015593666e-05, + "loss": 1.2098, + "step": 213610 + }, + { + "epoch": 1.3647572927181426, + "grad_norm": 0.7257765531539917, + "learning_rate": 2.2929366225549398e-05, + "loss": 0.6906, + "step": 213620 + }, + { + "epoch": 1.3648211798678813, + "grad_norm": 0.8252881169319153, + "learning_rate": 2.292514770812985e-05, + "loss": 0.729, + "step": 213630 + }, + { + "epoch": 1.36488506701762, + "grad_norm": 1.1068850755691528, + "learning_rate": 2.2920929463377474e-05, + "loss": 0.8345, + "step": 213640 + }, + { + "epoch": 1.3649489541673587, + "grad_norm": 1.2068063020706177, + "learning_rate": 2.2916711491334773e-05, + "loss": 0.9002, + "step": 213650 + }, + { + "epoch": 1.3650128413170974, + "grad_norm": 1.1177639961242676, + "learning_rate": 2.2912493792044204e-05, + "loss": 0.7525, + "step": 213660 + }, + { + "epoch": 1.3650767284668361, + "grad_norm": 1.1064202785491943, + "learning_rate": 2.2908276365548276e-05, + "loss": 0.847, + "step": 213670 + }, + { + "epoch": 1.3651406156165748, + "grad_norm": 1.1914925575256348, + "learning_rate": 2.2904059211889423e-05, + "loss": 0.9992, + "step": 213680 + }, + { + "epoch": 1.3652045027663136, + "grad_norm": 1.1300867795944214, + "learning_rate": 2.2899842331110155e-05, + "loss": 1.0911, + "step": 213690 + }, + { + "epoch": 1.3652683899160523, + "grad_norm": 1.6770097017288208, + "learning_rate": 2.2895625723252916e-05, + "loss": 1.1771, + "step": 213700 + }, + { + "epoch": 1.365332277065791, + "grad_norm": 0.8899660110473633, + "learning_rate": 2.2891409388360165e-05, + "loss": 0.7521, + "step": 213710 + }, + { + "epoch": 
1.3653961642155297, + "grad_norm": 0.9477804899215698, + "learning_rate": 2.2887193326474383e-05, + "loss": 0.7287, + "step": 213720 + }, + { + "epoch": 1.3654600513652684, + "grad_norm": 2.1322531700134277, + "learning_rate": 2.2882977537638007e-05, + "loss": 0.9309, + "step": 213730 + }, + { + "epoch": 1.365523938515007, + "grad_norm": 1.0908210277557373, + "learning_rate": 2.287876202189352e-05, + "loss": 0.8519, + "step": 213740 + }, + { + "epoch": 1.3655878256647458, + "grad_norm": 0.8647819757461548, + "learning_rate": 2.2874546779283344e-05, + "loss": 0.6995, + "step": 213750 + }, + { + "epoch": 1.3656517128144845, + "grad_norm": 1.0555542707443237, + "learning_rate": 2.2870331809849978e-05, + "loss": 0.8651, + "step": 213760 + }, + { + "epoch": 1.3657155999642232, + "grad_norm": 0.5738083720207214, + "learning_rate": 2.286611711363581e-05, + "loss": 0.8654, + "step": 213770 + }, + { + "epoch": 1.365779487113962, + "grad_norm": 0.8562591671943665, + "learning_rate": 2.286190269068334e-05, + "loss": 0.862, + "step": 213780 + }, + { + "epoch": 1.3658433742637006, + "grad_norm": 1.0389251708984375, + "learning_rate": 2.285768854103497e-05, + "loss": 0.9576, + "step": 213790 + }, + { + "epoch": 1.3659072614134393, + "grad_norm": 1.2807830572128296, + "learning_rate": 2.2853474664733178e-05, + "loss": 1.1481, + "step": 213800 + }, + { + "epoch": 1.365971148563178, + "grad_norm": 1.054508090019226, + "learning_rate": 2.284926106182036e-05, + "loss": 1.0283, + "step": 213810 + }, + { + "epoch": 1.3660350357129167, + "grad_norm": 0.9369847774505615, + "learning_rate": 2.284504773233899e-05, + "loss": 0.6915, + "step": 213820 + }, + { + "epoch": 1.3660989228626554, + "grad_norm": 0.5408312082290649, + "learning_rate": 2.2840834676331464e-05, + "loss": 0.6874, + "step": 213830 + }, + { + "epoch": 1.3661628100123941, + "grad_norm": 1.0407315492630005, + "learning_rate": 2.2836621893840247e-05, + "loss": 1.0371, + "step": 213840 + }, + { + "epoch": 1.3662266971621329, + 
"grad_norm": 0.866386890411377, + "learning_rate": 2.2832409384907726e-05, + "loss": 0.8674, + "step": 213850 + }, + { + "epoch": 1.3662905843118716, + "grad_norm": 1.1507446765899658, + "learning_rate": 2.282819714957637e-05, + "loss": 0.6694, + "step": 213860 + }, + { + "epoch": 1.3663544714616103, + "grad_norm": 1.4874440431594849, + "learning_rate": 2.282398518788856e-05, + "loss": 0.7711, + "step": 213870 + }, + { + "epoch": 1.366418358611349, + "grad_norm": 0.7873359322547913, + "learning_rate": 2.281977349988672e-05, + "loss": 0.9726, + "step": 213880 + }, + { + "epoch": 1.3664822457610877, + "grad_norm": 0.7608168125152588, + "learning_rate": 2.2815562085613294e-05, + "loss": 0.8677, + "step": 213890 + }, + { + "epoch": 1.3665461329108264, + "grad_norm": 1.3678696155548096, + "learning_rate": 2.2811350945110665e-05, + "loss": 1.1596, + "step": 213900 + }, + { + "epoch": 1.366610020060565, + "grad_norm": 1.2935272455215454, + "learning_rate": 2.2807140078421268e-05, + "loss": 0.8756, + "step": 213910 + }, + { + "epoch": 1.3666739072103038, + "grad_norm": 1.2438666820526123, + "learning_rate": 2.2802929485587476e-05, + "loss": 0.9217, + "step": 213920 + }, + { + "epoch": 1.3667377943600425, + "grad_norm": 1.114298939704895, + "learning_rate": 2.2798719166651734e-05, + "loss": 1.0421, + "step": 213930 + }, + { + "epoch": 1.3668016815097812, + "grad_norm": 0.7132619023323059, + "learning_rate": 2.2794509121656406e-05, + "loss": 0.7425, + "step": 213940 + }, + { + "epoch": 1.3668655686595197, + "grad_norm": 0.914767861366272, + "learning_rate": 2.2790299350643917e-05, + "loss": 0.7722, + "step": 213950 + }, + { + "epoch": 1.3669294558092586, + "grad_norm": 0.7639245390892029, + "learning_rate": 2.2786089853656645e-05, + "loss": 0.6271, + "step": 213960 + }, + { + "epoch": 1.366993342958997, + "grad_norm": 0.6949672102928162, + "learning_rate": 2.2781880630737e-05, + "loss": 0.8217, + "step": 213970 + }, + { + "epoch": 1.367057230108736, + "grad_norm": 
0.8127015233039856, + "learning_rate": 2.2777671681927355e-05, + "loss": 1.0723, + "step": 213980 + }, + { + "epoch": 1.3671211172584745, + "grad_norm": 1.067530632019043, + "learning_rate": 2.2773463007270118e-05, + "loss": 0.7072, + "step": 213990 + }, + { + "epoch": 1.3671850044082134, + "grad_norm": 1.0031954050064087, + "learning_rate": 2.276925460680765e-05, + "loss": 0.8714, + "step": 214000 + }, + { + "epoch": 1.367248891557952, + "grad_norm": 1.203041672706604, + "learning_rate": 2.276504648058236e-05, + "loss": 0.8492, + "step": 214010 + }, + { + "epoch": 1.3673127787076909, + "grad_norm": 0.9548153877258301, + "learning_rate": 2.27608386286366e-05, + "loss": 1.481, + "step": 214020 + }, + { + "epoch": 1.3673766658574293, + "grad_norm": 1.148245096206665, + "learning_rate": 2.2756631051012772e-05, + "loss": 1.1608, + "step": 214030 + }, + { + "epoch": 1.3674405530071683, + "grad_norm": 0.7084174156188965, + "learning_rate": 2.2752423747753227e-05, + "loss": 0.8583, + "step": 214040 + }, + { + "epoch": 1.3675044401569068, + "grad_norm": 1.07353937625885, + "learning_rate": 2.2748216718900363e-05, + "loss": 0.9307, + "step": 214050 + }, + { + "epoch": 1.3675683273066455, + "grad_norm": 0.5779439806938171, + "learning_rate": 2.274400996449651e-05, + "loss": 0.8697, + "step": 214060 + }, + { + "epoch": 1.3676322144563842, + "grad_norm": 0.9872804284095764, + "learning_rate": 2.2739803484584077e-05, + "loss": 0.8438, + "step": 214070 + }, + { + "epoch": 1.3676961016061229, + "grad_norm": 0.7021150588989258, + "learning_rate": 2.2735597279205388e-05, + "loss": 0.8816, + "step": 214080 + }, + { + "epoch": 1.3677599887558616, + "grad_norm": 1.5829733610153198, + "learning_rate": 2.2731391348402824e-05, + "loss": 0.9133, + "step": 214090 + }, + { + "epoch": 1.3678238759056003, + "grad_norm": 1.0218287706375122, + "learning_rate": 2.272718569221876e-05, + "loss": 1.0297, + "step": 214100 + }, + { + "epoch": 1.367887763055339, + "grad_norm": 1.0792396068572998, + 
"learning_rate": 2.2722980310695508e-05, + "loss": 0.9565, + "step": 214110 + }, + { + "epoch": 1.3679516502050777, + "grad_norm": 0.630308210849762, + "learning_rate": 2.2718775203875465e-05, + "loss": 0.7774, + "step": 214120 + }, + { + "epoch": 1.3680155373548164, + "grad_norm": 1.0366673469543457, + "learning_rate": 2.271457037180093e-05, + "loss": 0.9403, + "step": 214130 + }, + { + "epoch": 1.368079424504555, + "grad_norm": 4.574321746826172, + "learning_rate": 2.27103658145143e-05, + "loss": 0.7539, + "step": 214140 + }, + { + "epoch": 1.3681433116542938, + "grad_norm": 1.278727650642395, + "learning_rate": 2.270616153205788e-05, + "loss": 0.7088, + "step": 214150 + }, + { + "epoch": 1.3682071988040325, + "grad_norm": 1.062374472618103, + "learning_rate": 2.2701957524474042e-05, + "loss": 0.7578, + "step": 214160 + }, + { + "epoch": 1.3682710859537712, + "grad_norm": 1.1994844675064087, + "learning_rate": 2.2697753791805087e-05, + "loss": 0.9603, + "step": 214170 + }, + { + "epoch": 1.36833497310351, + "grad_norm": 0.8804671168327332, + "learning_rate": 2.2693550334093412e-05, + "loss": 0.8427, + "step": 214180 + }, + { + "epoch": 1.3683988602532486, + "grad_norm": 1.0678255558013916, + "learning_rate": 2.268934715138127e-05, + "loss": 0.7044, + "step": 214190 + }, + { + "epoch": 1.3684627474029873, + "grad_norm": 0.7160975933074951, + "learning_rate": 2.2685144243711048e-05, + "loss": 0.7304, + "step": 214200 + }, + { + "epoch": 1.368526634552726, + "grad_norm": 2.113140821456909, + "learning_rate": 2.2680941611125032e-05, + "loss": 0.9366, + "step": 214210 + }, + { + "epoch": 1.3685905217024648, + "grad_norm": 0.808111310005188, + "learning_rate": 2.2676739253665574e-05, + "loss": 0.674, + "step": 214220 + }, + { + "epoch": 1.3686544088522035, + "grad_norm": 0.9904662370681763, + "learning_rate": 2.2672537171375007e-05, + "loss": 1.0068, + "step": 214230 + }, + { + "epoch": 1.3687182960019422, + "grad_norm": 1.1128408908843994, + "learning_rate": 
2.2668335364295613e-05, + "loss": 0.7818, + "step": 214240 + }, + { + "epoch": 1.3687821831516809, + "grad_norm": 0.9577661752700806, + "learning_rate": 2.2664133832469746e-05, + "loss": 1.0456, + "step": 214250 + }, + { + "epoch": 1.3688460703014196, + "grad_norm": 0.9240872859954834, + "learning_rate": 2.265993257593968e-05, + "loss": 1.0572, + "step": 214260 + }, + { + "epoch": 1.3689099574511583, + "grad_norm": 1.0088399648666382, + "learning_rate": 2.2655731594747764e-05, + "loss": 0.9687, + "step": 214270 + }, + { + "epoch": 1.368973844600897, + "grad_norm": 0.9549551010131836, + "learning_rate": 2.2651530888936274e-05, + "loss": 1.1469, + "step": 214280 + }, + { + "epoch": 1.3690377317506357, + "grad_norm": 0.9399086833000183, + "learning_rate": 2.264733045854754e-05, + "loss": 1.0533, + "step": 214290 + }, + { + "epoch": 1.3691016189003744, + "grad_norm": 0.9131207466125488, + "learning_rate": 2.2643130303623834e-05, + "loss": 0.8803, + "step": 214300 + }, + { + "epoch": 1.3691655060501131, + "grad_norm": 0.5964922308921814, + "learning_rate": 2.2638930424207493e-05, + "loss": 0.8778, + "step": 214310 + }, + { + "epoch": 1.3692293931998518, + "grad_norm": 1.3003700971603394, + "learning_rate": 2.263473082034077e-05, + "loss": 0.6881, + "step": 214320 + }, + { + "epoch": 1.3692932803495905, + "grad_norm": 1.2452034950256348, + "learning_rate": 2.263053149206601e-05, + "loss": 0.4928, + "step": 214330 + }, + { + "epoch": 1.3693571674993292, + "grad_norm": 1.003343105316162, + "learning_rate": 2.262633243942545e-05, + "loss": 1.0223, + "step": 214340 + }, + { + "epoch": 1.369421054649068, + "grad_norm": 1.2331476211547852, + "learning_rate": 2.2622133662461424e-05, + "loss": 0.7122, + "step": 214350 + }, + { + "epoch": 1.3694849417988066, + "grad_norm": 1.6258493661880493, + "learning_rate": 2.2617935161216182e-05, + "loss": 0.8656, + "step": 214360 + }, + { + "epoch": 1.3695488289485453, + "grad_norm": 1.027157187461853, + "learning_rate": 
2.2613736935732034e-05, + "loss": 0.7493, + "step": 214370 + }, + { + "epoch": 1.369612716098284, + "grad_norm": 0.9395402669906616, + "learning_rate": 2.2609538986051232e-05, + "loss": 1.0075, + "step": 214380 + }, + { + "epoch": 1.3696766032480228, + "grad_norm": 1.0628516674041748, + "learning_rate": 2.2605341312216088e-05, + "loss": 0.8902, + "step": 214390 + }, + { + "epoch": 1.3697404903977615, + "grad_norm": 1.0474220514297485, + "learning_rate": 2.260114391426884e-05, + "loss": 0.8962, + "step": 214400 + }, + { + "epoch": 1.3698043775475002, + "grad_norm": 0.9830883741378784, + "learning_rate": 2.2596946792251772e-05, + "loss": 0.9756, + "step": 214410 + }, + { + "epoch": 1.3698682646972389, + "grad_norm": 1.4582486152648926, + "learning_rate": 2.2592749946207175e-05, + "loss": 0.7602, + "step": 214420 + }, + { + "epoch": 1.3699321518469776, + "grad_norm": 1.0433655977249146, + "learning_rate": 2.2588553376177283e-05, + "loss": 0.853, + "step": 214430 + }, + { + "epoch": 1.369996038996716, + "grad_norm": 0.9852280616760254, + "learning_rate": 2.2584357082204387e-05, + "loss": 1.0361, + "step": 214440 + }, + { + "epoch": 1.370059926146455, + "grad_norm": 0.9670477509498596, + "learning_rate": 2.2580161064330717e-05, + "loss": 0.8454, + "step": 214450 + }, + { + "epoch": 1.3701238132961935, + "grad_norm": 0.7297714948654175, + "learning_rate": 2.2575965322598562e-05, + "loss": 0.7035, + "step": 214460 + }, + { + "epoch": 1.3701877004459324, + "grad_norm": 1.3668649196624756, + "learning_rate": 2.2571769857050147e-05, + "loss": 0.8924, + "step": 214470 + }, + { + "epoch": 1.370251587595671, + "grad_norm": 1.1395583152770996, + "learning_rate": 2.2567574667727754e-05, + "loss": 0.76, + "step": 214480 + }, + { + "epoch": 1.3703154747454098, + "grad_norm": 0.6072282791137695, + "learning_rate": 2.2563379754673604e-05, + "loss": 0.9047, + "step": 214490 + }, + { + "epoch": 1.3703793618951483, + "grad_norm": 0.9671122431755066, + "learning_rate": 
2.2559185117929966e-05, + "loss": 0.6726, + "step": 214500 + }, + { + "epoch": 1.3704432490448872, + "grad_norm": 1.3676090240478516, + "learning_rate": 2.2554990757539057e-05, + "loss": 0.9646, + "step": 214510 + }, + { + "epoch": 1.3705071361946257, + "grad_norm": 0.5860486626625061, + "learning_rate": 2.2550796673543158e-05, + "loss": 0.8328, + "step": 214520 + }, + { + "epoch": 1.3705710233443646, + "grad_norm": 0.5600946545600891, + "learning_rate": 2.2546602865984463e-05, + "loss": 0.8783, + "step": 214530 + }, + { + "epoch": 1.3706349104941031, + "grad_norm": 0.7447695136070251, + "learning_rate": 2.254240933490524e-05, + "loss": 0.7817, + "step": 214540 + }, + { + "epoch": 1.3706987976438418, + "grad_norm": 0.5831778049468994, + "learning_rate": 2.2538216080347696e-05, + "loss": 0.5659, + "step": 214550 + }, + { + "epoch": 1.3707626847935805, + "grad_norm": 0.9292523264884949, + "learning_rate": 2.253402310235409e-05, + "loss": 0.8999, + "step": 214560 + }, + { + "epoch": 1.3708265719433192, + "grad_norm": 1.09566068649292, + "learning_rate": 2.2529830400966618e-05, + "loss": 0.6296, + "step": 214570 + }, + { + "epoch": 1.370890459093058, + "grad_norm": 0.8166540265083313, + "learning_rate": 2.2525637976227536e-05, + "loss": 0.9464, + "step": 214580 + }, + { + "epoch": 1.3709543462427967, + "grad_norm": 0.8931240439414978, + "learning_rate": 2.2521445828179028e-05, + "loss": 0.9223, + "step": 214590 + }, + { + "epoch": 1.3710182333925354, + "grad_norm": 0.9479050040245056, + "learning_rate": 2.2517253956863356e-05, + "loss": 0.7455, + "step": 214600 + }, + { + "epoch": 1.371082120542274, + "grad_norm": 0.949516236782074, + "learning_rate": 2.251306236232269e-05, + "loss": 0.7871, + "step": 214610 + }, + { + "epoch": 1.3711460076920128, + "grad_norm": 0.8050344586372375, + "learning_rate": 2.2508871044599268e-05, + "loss": 0.9032, + "step": 214620 + }, + { + "epoch": 1.3712098948417515, + "grad_norm": 0.5675163865089417, + "learning_rate": 
2.2504680003735314e-05, + "loss": 0.9212, + "step": 214630 + }, + { + "epoch": 1.3712737819914902, + "grad_norm": 1.4616913795471191, + "learning_rate": 2.2500489239772997e-05, + "loss": 0.9347, + "step": 214640 + }, + { + "epoch": 1.371337669141229, + "grad_norm": 1.066999077796936, + "learning_rate": 2.249629875275457e-05, + "loss": 0.8779, + "step": 214650 + }, + { + "epoch": 1.3714015562909676, + "grad_norm": 1.2175489664077759, + "learning_rate": 2.2492108542722185e-05, + "loss": 1.121, + "step": 214660 + }, + { + "epoch": 1.3714654434407063, + "grad_norm": 1.4719328880310059, + "learning_rate": 2.2487918609718105e-05, + "loss": 0.9578, + "step": 214670 + }, + { + "epoch": 1.371529330590445, + "grad_norm": 1.204595923423767, + "learning_rate": 2.2483728953784445e-05, + "loss": 0.8123, + "step": 214680 + }, + { + "epoch": 1.3715932177401837, + "grad_norm": 0.9094199538230896, + "learning_rate": 2.247953957496346e-05, + "loss": 0.7774, + "step": 214690 + }, + { + "epoch": 1.3716571048899224, + "grad_norm": 0.7834303379058838, + "learning_rate": 2.2475350473297303e-05, + "loss": 0.8452, + "step": 214700 + }, + { + "epoch": 1.3717209920396611, + "grad_norm": 0.8282777667045593, + "learning_rate": 2.24711616488282e-05, + "loss": 0.996, + "step": 214710 + }, + { + "epoch": 1.3717848791893998, + "grad_norm": 2.026803731918335, + "learning_rate": 2.2466973101598288e-05, + "loss": 1.13, + "step": 214720 + }, + { + "epoch": 1.3718487663391385, + "grad_norm": 0.8557829856872559, + "learning_rate": 2.246278483164978e-05, + "loss": 0.6976, + "step": 214730 + }, + { + "epoch": 1.3719126534888773, + "grad_norm": 0.7997044920921326, + "learning_rate": 2.2458596839024875e-05, + "loss": 0.7832, + "step": 214740 + }, + { + "epoch": 1.371976540638616, + "grad_norm": 0.9678969383239746, + "learning_rate": 2.24544091237657e-05, + "loss": 0.7593, + "step": 214750 + }, + { + "epoch": 1.3720404277883547, + "grad_norm": 1.38689124584198, + "learning_rate": 2.245022168591448e-05, + 
"loss": 0.6785, + "step": 214760 + }, + { + "epoch": 1.3721043149380934, + "grad_norm": 0.7496582865715027, + "learning_rate": 2.2446034525513343e-05, + "loss": 0.7313, + "step": 214770 + }, + { + "epoch": 1.372168202087832, + "grad_norm": 1.183701992034912, + "learning_rate": 2.244184764260449e-05, + "loss": 1.1634, + "step": 214780 + }, + { + "epoch": 1.3722320892375708, + "grad_norm": 1.503637671470642, + "learning_rate": 2.2437661037230063e-05, + "loss": 0.9871, + "step": 214790 + }, + { + "epoch": 1.3722959763873095, + "grad_norm": 0.6041831970214844, + "learning_rate": 2.2433474709432245e-05, + "loss": 0.7255, + "step": 214800 + }, + { + "epoch": 1.3723598635370482, + "grad_norm": 1.6717568635940552, + "learning_rate": 2.2429288659253166e-05, + "loss": 0.8908, + "step": 214810 + }, + { + "epoch": 1.372423750686787, + "grad_norm": 1.4359157085418701, + "learning_rate": 2.2425102886735023e-05, + "loss": 1.1106, + "step": 214820 + }, + { + "epoch": 1.3724876378365256, + "grad_norm": 1.5275332927703857, + "learning_rate": 2.2420917391919933e-05, + "loss": 1.1028, + "step": 214830 + }, + { + "epoch": 1.3725515249862643, + "grad_norm": 1.115195870399475, + "learning_rate": 2.241673217485008e-05, + "loss": 1.1771, + "step": 214840 + }, + { + "epoch": 1.372615412136003, + "grad_norm": 0.9098389148712158, + "learning_rate": 2.241254723556758e-05, + "loss": 0.8661, + "step": 214850 + }, + { + "epoch": 1.3726792992857417, + "grad_norm": 0.8639233708381653, + "learning_rate": 2.240836257411461e-05, + "loss": 0.9158, + "step": 214860 + }, + { + "epoch": 1.3727431864354804, + "grad_norm": 0.9827595949172974, + "learning_rate": 2.2404178190533286e-05, + "loss": 0.832, + "step": 214870 + }, + { + "epoch": 1.3728070735852191, + "grad_norm": 1.564755916595459, + "learning_rate": 2.239999408486578e-05, + "loss": 1.0676, + "step": 214880 + }, + { + "epoch": 1.3728709607349578, + "grad_norm": 1.3222931623458862, + "learning_rate": 2.2395810257154186e-05, + "loss": 0.8444, + 
"step": 214890 + }, + { + "epoch": 1.3729348478846966, + "grad_norm": 1.785544991493225, + "learning_rate": 2.239162670744069e-05, + "loss": 0.7951, + "step": 214900 + }, + { + "epoch": 1.372998735034435, + "grad_norm": 0.7356242537498474, + "learning_rate": 2.2387443435767375e-05, + "loss": 0.6265, + "step": 214910 + }, + { + "epoch": 1.373062622184174, + "grad_norm": 1.0977994203567505, + "learning_rate": 2.2383260442176413e-05, + "loss": 0.9025, + "step": 214920 + }, + { + "epoch": 1.3731265093339124, + "grad_norm": 0.8271603584289551, + "learning_rate": 2.2379077726709892e-05, + "loss": 1.1127, + "step": 214930 + }, + { + "epoch": 1.3731903964836514, + "grad_norm": 1.3959122896194458, + "learning_rate": 2.2374895289409953e-05, + "loss": 0.9131, + "step": 214940 + }, + { + "epoch": 1.3732542836333899, + "grad_norm": 0.8944525718688965, + "learning_rate": 2.237071313031873e-05, + "loss": 0.8691, + "step": 214950 + }, + { + "epoch": 1.3733181707831288, + "grad_norm": 1.157148003578186, + "learning_rate": 2.2366531249478316e-05, + "loss": 0.861, + "step": 214960 + }, + { + "epoch": 1.3733820579328673, + "grad_norm": 1.1422433853149414, + "learning_rate": 2.2362349646930858e-05, + "loss": 0.9311, + "step": 214970 + }, + { + "epoch": 1.3734459450826062, + "grad_norm": 0.9563591480255127, + "learning_rate": 2.2358168322718433e-05, + "loss": 0.5491, + "step": 214980 + }, + { + "epoch": 1.3735098322323447, + "grad_norm": 1.180920958518982, + "learning_rate": 2.2353987276883182e-05, + "loss": 0.8662, + "step": 214990 + }, + { + "epoch": 1.3735737193820836, + "grad_norm": 0.8338070511817932, + "learning_rate": 2.2349806509467174e-05, + "loss": 0.8664, + "step": 215000 + }, + { + "epoch": 1.373637606531822, + "grad_norm": 0.6759228706359863, + "learning_rate": 2.2345626020512556e-05, + "loss": 0.7186, + "step": 215010 + }, + { + "epoch": 1.373701493681561, + "grad_norm": 0.5735316872596741, + "learning_rate": 2.2341445810061395e-05, + "loss": 0.702, + "step": 215020 + }, + 
{ + "epoch": 1.3737653808312995, + "grad_norm": 0.9189698696136475, + "learning_rate": 2.2337265878155816e-05, + "loss": 0.9387, + "step": 215030 + }, + { + "epoch": 1.3738292679810382, + "grad_norm": 0.998117208480835, + "learning_rate": 2.2333086224837886e-05, + "loss": 1.0075, + "step": 215040 + }, + { + "epoch": 1.373893155130777, + "grad_norm": 0.8722880482673645, + "learning_rate": 2.232890685014973e-05, + "loss": 0.7019, + "step": 215050 + }, + { + "epoch": 1.3739570422805156, + "grad_norm": 0.9833821058273315, + "learning_rate": 2.23247277541334e-05, + "loss": 1.0301, + "step": 215060 + }, + { + "epoch": 1.3740209294302543, + "grad_norm": 0.890064001083374, + "learning_rate": 2.2320548936831025e-05, + "loss": 0.7118, + "step": 215070 + }, + { + "epoch": 1.374084816579993, + "grad_norm": 0.7501860857009888, + "learning_rate": 2.2316370398284646e-05, + "loss": 0.8444, + "step": 215080 + }, + { + "epoch": 1.3741487037297317, + "grad_norm": 1.1047126054763794, + "learning_rate": 2.2312192138536393e-05, + "loss": 0.8822, + "step": 215090 + }, + { + "epoch": 1.3742125908794705, + "grad_norm": 1.0687460899353027, + "learning_rate": 2.2308014157628294e-05, + "loss": 0.9311, + "step": 215100 + }, + { + "epoch": 1.3742764780292092, + "grad_norm": 1.225905179977417, + "learning_rate": 2.2303836455602468e-05, + "loss": 0.8496, + "step": 215110 + }, + { + "epoch": 1.3743403651789479, + "grad_norm": 1.1884292364120483, + "learning_rate": 2.2299659032500953e-05, + "loss": 1.0965, + "step": 215120 + }, + { + "epoch": 1.3744042523286866, + "grad_norm": 1.1230326890945435, + "learning_rate": 2.2295481888365856e-05, + "loss": 0.7909, + "step": 215130 + }, + { + "epoch": 1.3744681394784253, + "grad_norm": 0.9136203527450562, + "learning_rate": 2.22913050232392e-05, + "loss": 0.9625, + "step": 215140 + }, + { + "epoch": 1.374532026628164, + "grad_norm": 1.0029051303863525, + "learning_rate": 2.2287128437163095e-05, + "loss": 0.7709, + "step": 215150 + }, + { + "epoch": 
1.3745959137779027, + "grad_norm": 1.027402639389038, + "learning_rate": 2.2282952130179584e-05, + "loss": 0.6512, + "step": 215160 + }, + { + "epoch": 1.3746598009276414, + "grad_norm": 2.28485107421875, + "learning_rate": 2.2278776102330706e-05, + "loss": 0.948, + "step": 215170 + }, + { + "epoch": 1.37472368807738, + "grad_norm": 0.8364056348800659, + "learning_rate": 2.2274600353658555e-05, + "loss": 0.9034, + "step": 215180 + }, + { + "epoch": 1.3747875752271188, + "grad_norm": 0.9878214001655579, + "learning_rate": 2.227042488420514e-05, + "loss": 0.9154, + "step": 215190 + }, + { + "epoch": 1.3748514623768575, + "grad_norm": 0.9499024152755737, + "learning_rate": 2.2266249694012553e-05, + "loss": 0.7886, + "step": 215200 + }, + { + "epoch": 1.3749153495265962, + "grad_norm": 1.2275502681732178, + "learning_rate": 2.226207478312281e-05, + "loss": 0.8554, + "step": 215210 + }, + { + "epoch": 1.374979236676335, + "grad_norm": 0.6945877075195312, + "learning_rate": 2.2257900151577992e-05, + "loss": 0.7988, + "step": 215220 + }, + { + "epoch": 1.3750431238260736, + "grad_norm": 1.13132905960083, + "learning_rate": 2.22537257994201e-05, + "loss": 1.0158, + "step": 215230 + }, + { + "epoch": 1.3751070109758123, + "grad_norm": 0.9972148537635803, + "learning_rate": 2.224955172669121e-05, + "loss": 0.9711, + "step": 215240 + }, + { + "epoch": 1.375170898125551, + "grad_norm": 1.326120138168335, + "learning_rate": 2.224537793343332e-05, + "loss": 1.03, + "step": 215250 + }, + { + "epoch": 1.3752347852752898, + "grad_norm": 1.3188718557357788, + "learning_rate": 2.224120441968849e-05, + "loss": 0.9548, + "step": 215260 + }, + { + "epoch": 1.3752986724250285, + "grad_norm": 1.372585654258728, + "learning_rate": 2.2237031185498764e-05, + "loss": 0.7345, + "step": 215270 + }, + { + "epoch": 1.3753625595747672, + "grad_norm": 0.8943201303482056, + "learning_rate": 2.223285823090613e-05, + "loss": 0.9941, + "step": 215280 + }, + { + "epoch": 1.3754264467245059, + 
"grad_norm": 0.7395158410072327, + "learning_rate": 2.2228685555952655e-05, + "loss": 1.0877, + "step": 215290 + }, + { + "epoch": 1.3754903338742446, + "grad_norm": 0.8535774350166321, + "learning_rate": 2.2224513160680327e-05, + "loss": 1.0003, + "step": 215300 + }, + { + "epoch": 1.3755542210239833, + "grad_norm": 0.8217079639434814, + "learning_rate": 2.22203410451312e-05, + "loss": 0.9577, + "step": 215310 + }, + { + "epoch": 1.375618108173722, + "grad_norm": 0.6563711166381836, + "learning_rate": 2.221616920934725e-05, + "loss": 0.8651, + "step": 215320 + }, + { + "epoch": 1.3756819953234607, + "grad_norm": 0.7311186194419861, + "learning_rate": 2.221199765337053e-05, + "loss": 0.7602, + "step": 215330 + }, + { + "epoch": 1.3757458824731994, + "grad_norm": 0.7395635843276978, + "learning_rate": 2.220782637724302e-05, + "loss": 0.8297, + "step": 215340 + }, + { + "epoch": 1.375809769622938, + "grad_norm": 0.8966172933578491, + "learning_rate": 2.2203655381006755e-05, + "loss": 0.7369, + "step": 215350 + }, + { + "epoch": 1.3758736567726768, + "grad_norm": 1.1480859518051147, + "learning_rate": 2.219948466470371e-05, + "loss": 0.5877, + "step": 215360 + }, + { + "epoch": 1.3759375439224155, + "grad_norm": 0.6096826195716858, + "learning_rate": 2.219531422837593e-05, + "loss": 1.1601, + "step": 215370 + }, + { + "epoch": 1.3760014310721542, + "grad_norm": 1.1459797620773315, + "learning_rate": 2.2191144072065362e-05, + "loss": 0.733, + "step": 215380 + }, + { + "epoch": 1.376065318221893, + "grad_norm": 0.884146511554718, + "learning_rate": 2.218697419581406e-05, + "loss": 0.9174, + "step": 215390 + }, + { + "epoch": 1.3761292053716314, + "grad_norm": 1.501900553703308, + "learning_rate": 2.2182804599663963e-05, + "loss": 0.822, + "step": 215400 + }, + { + "epoch": 1.3761930925213703, + "grad_norm": 0.6949141621589661, + "learning_rate": 2.217863528365711e-05, + "loss": 0.7998, + "step": 215410 + }, + { + "epoch": 1.3762569796711088, + "grad_norm": 
0.9651074409484863, + "learning_rate": 2.217446624783545e-05, + "loss": 0.8932, + "step": 215420 + }, + { + "epoch": 1.3763208668208478, + "grad_norm": 0.8191916942596436, + "learning_rate": 2.217029749224101e-05, + "loss": 0.8928, + "step": 215430 + }, + { + "epoch": 1.3763847539705862, + "grad_norm": 1.7742459774017334, + "learning_rate": 2.2166129016915726e-05, + "loss": 0.9563, + "step": 215440 + }, + { + "epoch": 1.3764486411203252, + "grad_norm": 1.2688603401184082, + "learning_rate": 2.216196082190162e-05, + "loss": 0.7006, + "step": 215450 + }, + { + "epoch": 1.3765125282700637, + "grad_norm": 1.0408973693847656, + "learning_rate": 2.2157792907240637e-05, + "loss": 0.8796, + "step": 215460 + }, + { + "epoch": 1.3765764154198026, + "grad_norm": 1.0547093152999878, + "learning_rate": 2.215362527297477e-05, + "loss": 1.0913, + "step": 215470 + }, + { + "epoch": 1.376640302569541, + "grad_norm": 0.9978485107421875, + "learning_rate": 2.2149457919146e-05, + "loss": 0.9159, + "step": 215480 + }, + { + "epoch": 1.37670418971928, + "grad_norm": 0.8262386918067932, + "learning_rate": 2.2145290845796268e-05, + "loss": 0.8887, + "step": 215490 + }, + { + "epoch": 1.3767680768690185, + "grad_norm": 0.5713022351264954, + "learning_rate": 2.2141124052967572e-05, + "loss": 0.6874, + "step": 215500 + }, + { + "epoch": 1.3768319640187574, + "grad_norm": 0.887799859046936, + "learning_rate": 2.2136957540701836e-05, + "loss": 0.9792, + "step": 215510 + }, + { + "epoch": 1.3768958511684959, + "grad_norm": 3.2510287761688232, + "learning_rate": 2.2132791309041066e-05, + "loss": 0.7196, + "step": 215520 + }, + { + "epoch": 1.3769597383182346, + "grad_norm": 1.2461315393447876, + "learning_rate": 2.212862535802717e-05, + "loss": 0.7465, + "step": 215530 + }, + { + "epoch": 1.3770236254679733, + "grad_norm": 0.8961158394813538, + "learning_rate": 2.212445968770216e-05, + "loss": 0.8395, + "step": 215540 + }, + { + "epoch": 1.377087512617712, + "grad_norm": 0.9440718293190002, + 
"learning_rate": 2.212029429810793e-05, + "loss": 0.9693, + "step": 215550 + }, + { + "epoch": 1.3771513997674507, + "grad_norm": 5.130351543426514, + "learning_rate": 2.2116129189286472e-05, + "loss": 0.8738, + "step": 215560 + }, + { + "epoch": 1.3772152869171894, + "grad_norm": 1.188360571861267, + "learning_rate": 2.2111964361279704e-05, + "loss": 0.996, + "step": 215570 + }, + { + "epoch": 1.3772791740669281, + "grad_norm": 1.9239073991775513, + "learning_rate": 2.21077998141296e-05, + "loss": 0.7404, + "step": 215580 + }, + { + "epoch": 1.3773430612166668, + "grad_norm": 1.9211300611495972, + "learning_rate": 2.2103635547878053e-05, + "loss": 0.7482, + "step": 215590 + }, + { + "epoch": 1.3774069483664055, + "grad_norm": 0.9710404276847839, + "learning_rate": 2.209947156256706e-05, + "loss": 0.7785, + "step": 215600 + }, + { + "epoch": 1.3774708355161442, + "grad_norm": 1.1858057975769043, + "learning_rate": 2.20953078582385e-05, + "loss": 0.8877, + "step": 215610 + }, + { + "epoch": 1.377534722665883, + "grad_norm": 0.7331709861755371, + "learning_rate": 2.2091144434934352e-05, + "loss": 0.8328, + "step": 215620 + }, + { + "epoch": 1.3775986098156217, + "grad_norm": 0.984566330909729, + "learning_rate": 2.2086981292696506e-05, + "loss": 0.7231, + "step": 215630 + }, + { + "epoch": 1.3776624969653604, + "grad_norm": 1.0698328018188477, + "learning_rate": 2.2082818431566926e-05, + "loss": 0.9652, + "step": 215640 + }, + { + "epoch": 1.377726384115099, + "grad_norm": 0.6931266784667969, + "learning_rate": 2.2078655851587514e-05, + "loss": 0.7886, + "step": 215650 + }, + { + "epoch": 1.3777902712648378, + "grad_norm": 0.5347517132759094, + "learning_rate": 2.207449355280018e-05, + "loss": 0.8593, + "step": 215660 + }, + { + "epoch": 1.3778541584145765, + "grad_norm": 1.114911675453186, + "learning_rate": 2.2070331535246868e-05, + "loss": 0.6501, + "step": 215670 + }, + { + "epoch": 1.3779180455643152, + "grad_norm": 1.0739355087280273, + "learning_rate": 
2.206616979896946e-05, + "loss": 0.8551, + "step": 215680 + }, + { + "epoch": 1.377981932714054, + "grad_norm": 1.1093593835830688, + "learning_rate": 2.206200834400991e-05, + "loss": 0.9642, + "step": 215690 + }, + { + "epoch": 1.3780458198637926, + "grad_norm": 1.3782254457473755, + "learning_rate": 2.205784717041009e-05, + "loss": 1.0764, + "step": 215700 + }, + { + "epoch": 1.3781097070135313, + "grad_norm": 0.859857439994812, + "learning_rate": 2.205368627821194e-05, + "loss": 0.8035, + "step": 215710 + }, + { + "epoch": 1.37817359416327, + "grad_norm": 1.3140043020248413, + "learning_rate": 2.2049525667457322e-05, + "loss": 0.8525, + "step": 215720 + }, + { + "epoch": 1.3782374813130087, + "grad_norm": 0.9456557035446167, + "learning_rate": 2.2045365338188185e-05, + "loss": 1.048, + "step": 215730 + }, + { + "epoch": 1.3783013684627474, + "grad_norm": 0.9949221611022949, + "learning_rate": 2.2041205290446383e-05, + "loss": 0.7744, + "step": 215740 + }, + { + "epoch": 1.3783652556124861, + "grad_norm": 2.190789222717285, + "learning_rate": 2.2037045524273847e-05, + "loss": 0.9398, + "step": 215750 + }, + { + "epoch": 1.3784291427622248, + "grad_norm": 2.9107577800750732, + "learning_rate": 2.203288603971244e-05, + "loss": 1.0338, + "step": 215760 + }, + { + "epoch": 1.3784930299119635, + "grad_norm": 1.2673238515853882, + "learning_rate": 2.202872683680408e-05, + "loss": 0.6364, + "step": 215770 + }, + { + "epoch": 1.3785569170617022, + "grad_norm": 2.4107301235198975, + "learning_rate": 2.2024567915590627e-05, + "loss": 0.8876, + "step": 215780 + }, + { + "epoch": 1.378620804211441, + "grad_norm": 0.8848767876625061, + "learning_rate": 2.2020409276113972e-05, + "loss": 0.9391, + "step": 215790 + }, + { + "epoch": 1.3786846913611797, + "grad_norm": 1.3917646408081055, + "learning_rate": 2.201625091841602e-05, + "loss": 1.2833, + "step": 215800 + }, + { + "epoch": 1.3787485785109184, + "grad_norm": 0.8002511858940125, + "learning_rate": 2.2012092842538618e-05, 
+ "loss": 0.7029, + "step": 215810 + }, + { + "epoch": 1.378812465660657, + "grad_norm": 0.7884570360183716, + "learning_rate": 2.2007935048523664e-05, + "loss": 0.9076, + "step": 215820 + }, + { + "epoch": 1.3788763528103958, + "grad_norm": 1.1183714866638184, + "learning_rate": 2.2003777536413007e-05, + "loss": 0.8994, + "step": 215830 + }, + { + "epoch": 1.3789402399601345, + "grad_norm": 1.8624705076217651, + "learning_rate": 2.1999620306248547e-05, + "loss": 1.0381, + "step": 215840 + }, + { + "epoch": 1.3790041271098732, + "grad_norm": 1.2032179832458496, + "learning_rate": 2.199546335807212e-05, + "loss": 0.7494, + "step": 215850 + }, + { + "epoch": 1.379068014259612, + "grad_norm": 0.9324236512184143, + "learning_rate": 2.1991306691925616e-05, + "loss": 0.5596, + "step": 215860 + }, + { + "epoch": 1.3791319014093506, + "grad_norm": 1.2062602043151855, + "learning_rate": 2.1987150307850874e-05, + "loss": 0.8037, + "step": 215870 + }, + { + "epoch": 1.3791957885590893, + "grad_norm": 1.706902027130127, + "learning_rate": 2.198299420588978e-05, + "loss": 0.7597, + "step": 215880 + }, + { + "epoch": 1.3792596757088278, + "grad_norm": 0.8762345314025879, + "learning_rate": 2.197883838608415e-05, + "loss": 0.9189, + "step": 215890 + }, + { + "epoch": 1.3793235628585667, + "grad_norm": 1.1141246557235718, + "learning_rate": 2.1974682848475874e-05, + "loss": 0.7046, + "step": 215900 + }, + { + "epoch": 1.3793874500083052, + "grad_norm": 2.5568530559539795, + "learning_rate": 2.1970527593106777e-05, + "loss": 0.9037, + "step": 215910 + }, + { + "epoch": 1.3794513371580441, + "grad_norm": 0.8247811198234558, + "learning_rate": 2.1966372620018733e-05, + "loss": 0.8501, + "step": 215920 + }, + { + "epoch": 1.3795152243077826, + "grad_norm": 0.7716491222381592, + "learning_rate": 2.196221792925355e-05, + "loss": 0.8452, + "step": 215930 + }, + { + "epoch": 1.3795791114575215, + "grad_norm": 0.950640082359314, + "learning_rate": 2.1958063520853107e-05, + "loss": 0.8706, 
+ "step": 215940 + }, + { + "epoch": 1.37964299860726, + "grad_norm": 0.7813429236412048, + "learning_rate": 2.1953909394859202e-05, + "loss": 0.8404, + "step": 215950 + }, + { + "epoch": 1.379706885756999, + "grad_norm": 1.2660363912582397, + "learning_rate": 2.194975555131371e-05, + "loss": 0.9184, + "step": 215960 + }, + { + "epoch": 1.3797707729067374, + "grad_norm": 0.8251976370811462, + "learning_rate": 2.1945601990258434e-05, + "loss": 0.9045, + "step": 215970 + }, + { + "epoch": 1.3798346600564764, + "grad_norm": 1.511618971824646, + "learning_rate": 2.1941448711735234e-05, + "loss": 0.639, + "step": 215980 + }, + { + "epoch": 1.3798985472062149, + "grad_norm": 0.873430609703064, + "learning_rate": 2.1937711002663826e-05, + "loss": 0.763, + "step": 215990 + }, + { + "epoch": 1.3799624343559538, + "grad_norm": 0.9361000657081604, + "learning_rate": 2.1933558261066743e-05, + "loss": 0.8217, + "step": 216000 + }, + { + "epoch": 1.3800263215056923, + "grad_norm": 1.2457658052444458, + "learning_rate": 2.1929405802123038e-05, + "loss": 0.9674, + "step": 216010 + }, + { + "epoch": 1.380090208655431, + "grad_norm": 1.4045180082321167, + "learning_rate": 2.1925253625874474e-05, + "loss": 0.628, + "step": 216020 + }, + { + "epoch": 1.3801540958051697, + "grad_norm": 0.8839975595474243, + "learning_rate": 2.192110173236292e-05, + "loss": 0.7358, + "step": 216030 + }, + { + "epoch": 1.3802179829549084, + "grad_norm": 0.8986994028091431, + "learning_rate": 2.1916950121630144e-05, + "loss": 0.7854, + "step": 216040 + }, + { + "epoch": 1.380281870104647, + "grad_norm": 1.0271767377853394, + "learning_rate": 2.1912798793717986e-05, + "loss": 1.0156, + "step": 216050 + }, + { + "epoch": 1.3803457572543858, + "grad_norm": 1.1156021356582642, + "learning_rate": 2.190864774866823e-05, + "loss": 0.8847, + "step": 216060 + }, + { + "epoch": 1.3804096444041245, + "grad_norm": 1.4262725114822388, + "learning_rate": 2.1904496986522715e-05, + "loss": 0.8734, + "step": 216070 + }, + 
{ + "epoch": 1.3804735315538632, + "grad_norm": 0.9161853790283203, + "learning_rate": 2.19003465073232e-05, + "loss": 0.8119, + "step": 216080 + }, + { + "epoch": 1.380537418703602, + "grad_norm": 0.8312424421310425, + "learning_rate": 2.1896196311111523e-05, + "loss": 1.0997, + "step": 216090 + }, + { + "epoch": 1.3806013058533406, + "grad_norm": 1.6136534214019775, + "learning_rate": 2.1892046397929444e-05, + "loss": 0.9357, + "step": 216100 + }, + { + "epoch": 1.3806651930030793, + "grad_norm": 0.7072123289108276, + "learning_rate": 2.188789676781878e-05, + "loss": 0.8362, + "step": 216110 + }, + { + "epoch": 1.380729080152818, + "grad_norm": 1.436051845550537, + "learning_rate": 2.1883747420821327e-05, + "loss": 0.806, + "step": 216120 + }, + { + "epoch": 1.3807929673025567, + "grad_norm": 1.851639986038208, + "learning_rate": 2.1879598356978848e-05, + "loss": 1.0181, + "step": 216130 + }, + { + "epoch": 1.3808568544522954, + "grad_norm": 1.6991009712219238, + "learning_rate": 2.187544957633316e-05, + "loss": 0.9347, + "step": 216140 + }, + { + "epoch": 1.3809207416020342, + "grad_norm": 0.9016327261924744, + "learning_rate": 2.1871301078926e-05, + "loss": 1.0082, + "step": 216150 + }, + { + "epoch": 1.3809846287517729, + "grad_norm": 0.9443797469139099, + "learning_rate": 2.1867152864799194e-05, + "loss": 0.725, + "step": 216160 + }, + { + "epoch": 1.3810485159015116, + "grad_norm": 1.1066298484802246, + "learning_rate": 2.1863004933994484e-05, + "loss": 0.6321, + "step": 216170 + }, + { + "epoch": 1.3811124030512503, + "grad_norm": 1.2663660049438477, + "learning_rate": 2.1858857286553676e-05, + "loss": 0.9728, + "step": 216180 + }, + { + "epoch": 1.381176290200989, + "grad_norm": 2.669292688369751, + "learning_rate": 2.1854709922518495e-05, + "loss": 1.0648, + "step": 216190 + }, + { + "epoch": 1.3812401773507277, + "grad_norm": 1.2658430337905884, + "learning_rate": 2.1850562841930756e-05, + "loss": 1.0093, + "step": 216200 + }, + { + "epoch": 
1.3813040645004664, + "grad_norm": 0.9764119982719421, + "learning_rate": 2.184641604483218e-05, + "loss": 0.9526, + "step": 216210 + }, + { + "epoch": 1.381367951650205, + "grad_norm": 1.3725136518478394, + "learning_rate": 2.1842269531264575e-05, + "loss": 0.7896, + "step": 216220 + }, + { + "epoch": 1.3814318387999438, + "grad_norm": 0.7538588047027588, + "learning_rate": 2.1838123301269653e-05, + "loss": 0.6495, + "step": 216230 + }, + { + "epoch": 1.3814957259496825, + "grad_norm": 1.4654730558395386, + "learning_rate": 2.1833977354889212e-05, + "loss": 0.9431, + "step": 216240 + }, + { + "epoch": 1.3815596130994212, + "grad_norm": 1.0827220678329468, + "learning_rate": 2.182983169216497e-05, + "loss": 1.0752, + "step": 216250 + }, + { + "epoch": 1.38162350024916, + "grad_norm": 1.0800297260284424, + "learning_rate": 2.182568631313871e-05, + "loss": 0.8834, + "step": 216260 + }, + { + "epoch": 1.3816873873988986, + "grad_norm": 3.10566782951355, + "learning_rate": 2.1821541217852164e-05, + "loss": 0.7927, + "step": 216270 + }, + { + "epoch": 1.3817512745486373, + "grad_norm": 1.3620527982711792, + "learning_rate": 2.1817396406347056e-05, + "loss": 0.9694, + "step": 216280 + }, + { + "epoch": 1.381815161698376, + "grad_norm": 0.870901882648468, + "learning_rate": 2.181325187866517e-05, + "loss": 1.1438, + "step": 216290 + }, + { + "epoch": 1.3818790488481147, + "grad_norm": 1.473087191581726, + "learning_rate": 2.1809107634848202e-05, + "loss": 0.726, + "step": 216300 + }, + { + "epoch": 1.3819429359978534, + "grad_norm": 0.7571781277656555, + "learning_rate": 2.1804963674937924e-05, + "loss": 0.7195, + "step": 216310 + }, + { + "epoch": 1.3820068231475922, + "grad_norm": 1.2076596021652222, + "learning_rate": 2.180081999897604e-05, + "loss": 0.91, + "step": 216320 + }, + { + "epoch": 1.3820707102973309, + "grad_norm": 1.4176630973815918, + "learning_rate": 2.1796676607004307e-05, + "loss": 0.8324, + "step": 216330 + }, + { + "epoch": 1.3821345974470696, + 
"grad_norm": 1.1328399181365967, + "learning_rate": 2.179253349906443e-05, + "loss": 1.0424, + "step": 216340 + }, + { + "epoch": 1.3821984845968083, + "grad_norm": 0.8762274980545044, + "learning_rate": 2.1788390675198157e-05, + "loss": 1.0355, + "step": 216350 + }, + { + "epoch": 1.382262371746547, + "grad_norm": 0.8054455518722534, + "learning_rate": 2.1784248135447177e-05, + "loss": 0.832, + "step": 216360 + }, + { + "epoch": 1.3823262588962857, + "grad_norm": 1.6195545196533203, + "learning_rate": 2.1780105879853247e-05, + "loss": 0.8238, + "step": 216370 + }, + { + "epoch": 1.3823901460460242, + "grad_norm": 1.0791467428207397, + "learning_rate": 2.1775963908458047e-05, + "loss": 0.8072, + "step": 216380 + }, + { + "epoch": 1.382454033195763, + "grad_norm": 0.9370194673538208, + "learning_rate": 2.1771822221303323e-05, + "loss": 0.9024, + "step": 216390 + }, + { + "epoch": 1.3825179203455016, + "grad_norm": 0.7822071313858032, + "learning_rate": 2.176768081843076e-05, + "loss": 0.6919, + "step": 216400 + }, + { + "epoch": 1.3825818074952405, + "grad_norm": 0.9446224570274353, + "learning_rate": 2.1763539699882087e-05, + "loss": 0.9925, + "step": 216410 + }, + { + "epoch": 1.382645694644979, + "grad_norm": 0.71107017993927, + "learning_rate": 2.1759398865698977e-05, + "loss": 0.9031, + "step": 216420 + }, + { + "epoch": 1.382709581794718, + "grad_norm": 0.7870688438415527, + "learning_rate": 2.175525831592316e-05, + "loss": 0.8776, + "step": 216430 + }, + { + "epoch": 1.3827734689444564, + "grad_norm": 1.1367087364196777, + "learning_rate": 2.1751118050596336e-05, + "loss": 0.8533, + "step": 216440 + }, + { + "epoch": 1.3828373560941953, + "grad_norm": 0.8634241819381714, + "learning_rate": 2.1746978069760184e-05, + "loss": 1.0664, + "step": 216450 + }, + { + "epoch": 1.3829012432439338, + "grad_norm": 0.9241101741790771, + "learning_rate": 2.1742838373456415e-05, + "loss": 0.9777, + "step": 216460 + }, + { + "epoch": 1.3829651303936727, + "grad_norm": 
0.8248175382614136, + "learning_rate": 2.1738698961726694e-05, + "loss": 0.7993, + "step": 216470 + }, + { + "epoch": 1.3830290175434112, + "grad_norm": 0.4851258397102356, + "learning_rate": 2.1734559834612745e-05, + "loss": 0.8707, + "step": 216480 + }, + { + "epoch": 1.38309290469315, + "grad_norm": 0.9273546934127808, + "learning_rate": 2.173042099215621e-05, + "loss": 0.8059, + "step": 216490 + }, + { + "epoch": 1.3831567918428886, + "grad_norm": 0.8645648956298828, + "learning_rate": 2.172628243439881e-05, + "loss": 1.1275, + "step": 216500 + }, + { + "epoch": 1.3832206789926274, + "grad_norm": 0.38949495553970337, + "learning_rate": 2.172214416138219e-05, + "loss": 0.745, + "step": 216510 + }, + { + "epoch": 1.383284566142366, + "grad_norm": 1.0118781328201294, + "learning_rate": 2.171800617314806e-05, + "loss": 0.8089, + "step": 216520 + }, + { + "epoch": 1.3833484532921048, + "grad_norm": 0.7357847690582275, + "learning_rate": 2.1713868469738057e-05, + "loss": 0.8804, + "step": 216530 + }, + { + "epoch": 1.3834123404418435, + "grad_norm": 2.8220245838165283, + "learning_rate": 2.1709731051193887e-05, + "loss": 0.9014, + "step": 216540 + }, + { + "epoch": 1.3834762275915822, + "grad_norm": 0.9902445077896118, + "learning_rate": 2.1705593917557183e-05, + "loss": 0.8465, + "step": 216550 + }, + { + "epoch": 1.3835401147413209, + "grad_norm": 1.9937920570373535, + "learning_rate": 2.170145706886964e-05, + "loss": 1.14, + "step": 216560 + }, + { + "epoch": 1.3836040018910596, + "grad_norm": 1.017596960067749, + "learning_rate": 2.169732050517289e-05, + "loss": 0.7427, + "step": 216570 + }, + { + "epoch": 1.3836678890407983, + "grad_norm": 0.7377170324325562, + "learning_rate": 2.1693184226508635e-05, + "loss": 0.9442, + "step": 216580 + }, + { + "epoch": 1.383731776190537, + "grad_norm": 1.1950017213821411, + "learning_rate": 2.1689048232918475e-05, + "loss": 0.809, + "step": 216590 + }, + { + "epoch": 1.3837956633402757, + "grad_norm": 1.2431923151016235, + 
"learning_rate": 2.168491252444411e-05, + "loss": 0.748, + "step": 216600 + }, + { + "epoch": 1.3838595504900144, + "grad_norm": 0.7115759253501892, + "learning_rate": 2.168077710112716e-05, + "loss": 0.8706, + "step": 216610 + }, + { + "epoch": 1.3839234376397531, + "grad_norm": 0.7203143835067749, + "learning_rate": 2.16766419630093e-05, + "loss": 0.7823, + "step": 216620 + }, + { + "epoch": 1.3839873247894918, + "grad_norm": 0.8078294396400452, + "learning_rate": 2.1672507110132134e-05, + "loss": 0.8612, + "step": 216630 + }, + { + "epoch": 1.3840512119392305, + "grad_norm": 1.1752616167068481, + "learning_rate": 2.1668372542537336e-05, + "loss": 0.8119, + "step": 216640 + }, + { + "epoch": 1.3841150990889692, + "grad_norm": 1.060088872909546, + "learning_rate": 2.1664238260266556e-05, + "loss": 0.86, + "step": 216650 + }, + { + "epoch": 1.384178986238708, + "grad_norm": 4.633458614349365, + "learning_rate": 2.1660104263361385e-05, + "loss": 1.0264, + "step": 216660 + }, + { + "epoch": 1.3842428733884466, + "grad_norm": 0.9945595264434814, + "learning_rate": 2.1655970551863502e-05, + "loss": 0.8249, + "step": 216670 + }, + { + "epoch": 1.3843067605381854, + "grad_norm": 1.518704891204834, + "learning_rate": 2.1651837125814506e-05, + "loss": 0.8972, + "step": 216680 + }, + { + "epoch": 1.384370647687924, + "grad_norm": 1.1230610609054565, + "learning_rate": 2.1647703985256047e-05, + "loss": 0.8053, + "step": 216690 + }, + { + "epoch": 1.3844345348376628, + "grad_norm": 0.8968077898025513, + "learning_rate": 2.1643571130229718e-05, + "loss": 0.7231, + "step": 216700 + }, + { + "epoch": 1.3844984219874015, + "grad_norm": 0.8500282168388367, + "learning_rate": 2.1639438560777182e-05, + "loss": 0.9592, + "step": 216710 + }, + { + "epoch": 1.3845623091371402, + "grad_norm": 0.7221376895904541, + "learning_rate": 2.163530627694001e-05, + "loss": 0.987, + "step": 216720 + }, + { + "epoch": 1.3846261962868789, + "grad_norm": 1.0014212131500244, + "learning_rate": 
2.1631174278759864e-05, + "loss": 0.9718, + "step": 216730 + }, + { + "epoch": 1.3846900834366176, + "grad_norm": 1.3466345071792603, + "learning_rate": 2.162704256627832e-05, + "loss": 0.7581, + "step": 216740 + }, + { + "epoch": 1.3847539705863563, + "grad_norm": 0.6369860172271729, + "learning_rate": 2.1622911139537015e-05, + "loss": 0.7653, + "step": 216750 + }, + { + "epoch": 1.384817857736095, + "grad_norm": 0.9595978856086731, + "learning_rate": 2.1618779998577553e-05, + "loss": 0.7024, + "step": 216760 + }, + { + "epoch": 1.3848817448858337, + "grad_norm": 1.0387907028198242, + "learning_rate": 2.1614649143441508e-05, + "loss": 0.9721, + "step": 216770 + }, + { + "epoch": 1.3849456320355724, + "grad_norm": 1.3099180459976196, + "learning_rate": 2.1610518574170518e-05, + "loss": 0.833, + "step": 216780 + }, + { + "epoch": 1.3850095191853111, + "grad_norm": 0.8216334581375122, + "learning_rate": 2.160638829080615e-05, + "loss": 1.0301, + "step": 216790 + }, + { + "epoch": 1.3850734063350498, + "grad_norm": 1.4595752954483032, + "learning_rate": 2.1602258293390038e-05, + "loss": 0.6508, + "step": 216800 + }, + { + "epoch": 1.3851372934847885, + "grad_norm": 0.721744954586029, + "learning_rate": 2.1598128581963733e-05, + "loss": 0.9679, + "step": 216810 + }, + { + "epoch": 1.3852011806345272, + "grad_norm": 0.7369323372840881, + "learning_rate": 2.159399915656886e-05, + "loss": 0.9548, + "step": 216820 + }, + { + "epoch": 1.385265067784266, + "grad_norm": 0.7699748873710632, + "learning_rate": 2.1589870017246973e-05, + "loss": 0.74, + "step": 216830 + }, + { + "epoch": 1.3853289549340047, + "grad_norm": 0.5005737543106079, + "learning_rate": 2.158574116403969e-05, + "loss": 0.8124, + "step": 216840 + }, + { + "epoch": 1.3853928420837434, + "grad_norm": 1.0250499248504639, + "learning_rate": 2.158161259698856e-05, + "loss": 0.7027, + "step": 216850 + }, + { + "epoch": 1.385456729233482, + "grad_norm": 1.1445086002349854, + "learning_rate": 
2.1577484316135194e-05, + "loss": 1.0355, + "step": 216860 + }, + { + "epoch": 1.3855206163832205, + "grad_norm": 2.682488203048706, + "learning_rate": 2.1573356321521132e-05, + "loss": 1.2289, + "step": 216870 + }, + { + "epoch": 1.3855845035329595, + "grad_norm": 1.184380292892456, + "learning_rate": 2.156922861318798e-05, + "loss": 0.8661, + "step": 216880 + }, + { + "epoch": 1.385648390682698, + "grad_norm": 1.1919716596603394, + "learning_rate": 2.156510119117727e-05, + "loss": 0.789, + "step": 216890 + }, + { + "epoch": 1.385712277832437, + "grad_norm": 0.9529315233230591, + "learning_rate": 2.156097405553062e-05, + "loss": 0.776, + "step": 216900 + }, + { + "epoch": 1.3857761649821754, + "grad_norm": 1.4514542818069458, + "learning_rate": 2.1556847206289537e-05, + "loss": 0.7248, + "step": 216910 + }, + { + "epoch": 1.3858400521319143, + "grad_norm": 1.3809146881103516, + "learning_rate": 2.1552720643495632e-05, + "loss": 0.7629, + "step": 216920 + }, + { + "epoch": 1.3859039392816528, + "grad_norm": 1.508679986000061, + "learning_rate": 2.1548594367190422e-05, + "loss": 0.9625, + "step": 216930 + }, + { + "epoch": 1.3859678264313917, + "grad_norm": 0.9351204037666321, + "learning_rate": 2.15444683774155e-05, + "loss": 1.1496, + "step": 216940 + }, + { + "epoch": 1.3860317135811302, + "grad_norm": 0.6512327790260315, + "learning_rate": 2.1540342674212377e-05, + "loss": 0.9739, + "step": 216950 + }, + { + "epoch": 1.3860956007308691, + "grad_norm": 1.4669448137283325, + "learning_rate": 2.1536217257622627e-05, + "loss": 0.8527, + "step": 216960 + }, + { + "epoch": 1.3861594878806076, + "grad_norm": 0.9330663681030273, + "learning_rate": 2.1532092127687813e-05, + "loss": 1.2608, + "step": 216970 + }, + { + "epoch": 1.3862233750303463, + "grad_norm": 1.1834372282028198, + "learning_rate": 2.152796728444944e-05, + "loss": 1.0855, + "step": 216980 + }, + { + "epoch": 1.386287262180085, + "grad_norm": 1.1099574565887451, + "learning_rate": 2.1523842727949084e-05, 
+ "loss": 0.6813, + "step": 216990 + }, + { + "epoch": 1.3863511493298237, + "grad_norm": 0.8671126365661621, + "learning_rate": 2.151971845822825e-05, + "loss": 1.033, + "step": 217000 + }, + { + "epoch": 1.3864150364795624, + "grad_norm": 0.9332876205444336, + "learning_rate": 2.151559447532851e-05, + "loss": 0.8845, + "step": 217010 + }, + { + "epoch": 1.3864789236293011, + "grad_norm": 1.091829776763916, + "learning_rate": 2.151147077929136e-05, + "loss": 0.8085, + "step": 217020 + }, + { + "epoch": 1.3865428107790398, + "grad_norm": 1.0661778450012207, + "learning_rate": 2.150734737015836e-05, + "loss": 1.1284, + "step": 217030 + }, + { + "epoch": 1.3866066979287786, + "grad_norm": 1.2264496088027954, + "learning_rate": 2.1503224247971003e-05, + "loss": 1.2297, + "step": 217040 + }, + { + "epoch": 1.3866705850785173, + "grad_norm": 0.6267675757408142, + "learning_rate": 2.149910141277085e-05, + "loss": 0.7822, + "step": 217050 + }, + { + "epoch": 1.386734472228256, + "grad_norm": 1.2916646003723145, + "learning_rate": 2.1494978864599384e-05, + "loss": 0.7288, + "step": 217060 + }, + { + "epoch": 1.3867983593779947, + "grad_norm": 1.2757755517959595, + "learning_rate": 2.1490856603498165e-05, + "loss": 0.7771, + "step": 217070 + }, + { + "epoch": 1.3868622465277334, + "grad_norm": 1.0068482160568237, + "learning_rate": 2.1486734629508657e-05, + "loss": 0.8135, + "step": 217080 + }, + { + "epoch": 1.386926133677472, + "grad_norm": 0.8916890621185303, + "learning_rate": 2.1482612942672426e-05, + "loss": 0.9525, + "step": 217090 + }, + { + "epoch": 1.3869900208272108, + "grad_norm": 1.7053278684616089, + "learning_rate": 2.1478491543030925e-05, + "loss": 0.8083, + "step": 217100 + }, + { + "epoch": 1.3870539079769495, + "grad_norm": 0.7205654978752136, + "learning_rate": 2.1474370430625716e-05, + "loss": 0.8443, + "step": 217110 + }, + { + "epoch": 1.3871177951266882, + "grad_norm": 0.5934941172599792, + "learning_rate": 2.1470249605498254e-05, + "loss": 0.7754, + 
"step": 217120 + }, + { + "epoch": 1.387181682276427, + "grad_norm": 0.9044548273086548, + "learning_rate": 2.1466129067690076e-05, + "loss": 1.0092, + "step": 217130 + }, + { + "epoch": 1.3872455694261656, + "grad_norm": 1.3107562065124512, + "learning_rate": 2.146200881724265e-05, + "loss": 0.9626, + "step": 217140 + }, + { + "epoch": 1.3873094565759043, + "grad_norm": 1.0588704347610474, + "learning_rate": 2.14578888541975e-05, + "loss": 0.859, + "step": 217150 + }, + { + "epoch": 1.387373343725643, + "grad_norm": 1.0064996480941772, + "learning_rate": 2.1453769178596083e-05, + "loss": 0.8617, + "step": 217160 + }, + { + "epoch": 1.3874372308753817, + "grad_norm": 0.774937093257904, + "learning_rate": 2.1449649790479904e-05, + "loss": 0.8253, + "step": 217170 + }, + { + "epoch": 1.3875011180251204, + "grad_norm": 1.3672817945480347, + "learning_rate": 2.1445530689890475e-05, + "loss": 0.7635, + "step": 217180 + }, + { + "epoch": 1.3875650051748591, + "grad_norm": 1.0915000438690186, + "learning_rate": 2.1441411876869233e-05, + "loss": 0.922, + "step": 217190 + }, + { + "epoch": 1.3876288923245979, + "grad_norm": 0.8849831819534302, + "learning_rate": 2.14372933514577e-05, + "loss": 1.0391, + "step": 217200 + }, + { + "epoch": 1.3876927794743366, + "grad_norm": 0.9821801781654358, + "learning_rate": 2.1433175113697312e-05, + "loss": 0.7842, + "step": 217210 + }, + { + "epoch": 1.3877566666240753, + "grad_norm": 1.1746443510055542, + "learning_rate": 2.1429057163629584e-05, + "loss": 0.8189, + "step": 217220 + }, + { + "epoch": 1.387820553773814, + "grad_norm": 1.5725518465042114, + "learning_rate": 2.1424939501295947e-05, + "loss": 0.7708, + "step": 217230 + }, + { + "epoch": 1.3878844409235527, + "grad_norm": 0.9691241979598999, + "learning_rate": 2.142082212673793e-05, + "loss": 1.0128, + "step": 217240 + }, + { + "epoch": 1.3879483280732914, + "grad_norm": 1.709093689918518, + "learning_rate": 2.1416705039996924e-05, + "loss": 0.9637, + "step": 217250 + }, + { 
+ "epoch": 1.38801221522303, + "grad_norm": 1.2288743257522583, + "learning_rate": 2.1412999908047865e-05, + "loss": 0.9048, + "step": 217260 + }, + { + "epoch": 1.3880761023727688, + "grad_norm": 1.160415530204773, + "learning_rate": 2.1408883368273474e-05, + "loss": 0.8645, + "step": 217270 + }, + { + "epoch": 1.3881399895225075, + "grad_norm": 1.401985764503479, + "learning_rate": 2.140476711643636e-05, + "loss": 1.0005, + "step": 217280 + }, + { + "epoch": 1.3882038766722462, + "grad_norm": 1.4683265686035156, + "learning_rate": 2.1400651152577996e-05, + "loss": 0.7122, + "step": 217290 + }, + { + "epoch": 1.388267763821985, + "grad_norm": 0.975030243396759, + "learning_rate": 2.13965354767398e-05, + "loss": 1.0364, + "step": 217300 + }, + { + "epoch": 1.3883316509717236, + "grad_norm": 0.9857359528541565, + "learning_rate": 2.1392420088963254e-05, + "loss": 1.1408, + "step": 217310 + }, + { + "epoch": 1.3883955381214623, + "grad_norm": 1.1964335441589355, + "learning_rate": 2.138830498928977e-05, + "loss": 0.7693, + "step": 217320 + }, + { + "epoch": 1.388459425271201, + "grad_norm": 1.0191142559051514, + "learning_rate": 2.1384190177760828e-05, + "loss": 0.6866, + "step": 217330 + }, + { + "epoch": 1.3885233124209395, + "grad_norm": 1.0814679861068726, + "learning_rate": 2.1380075654417825e-05, + "loss": 0.9072, + "step": 217340 + }, + { + "epoch": 1.3885871995706784, + "grad_norm": 1.1120654344558716, + "learning_rate": 2.1375961419302237e-05, + "loss": 0.8842, + "step": 217350 + }, + { + "epoch": 1.388651086720417, + "grad_norm": 0.6735295653343201, + "learning_rate": 2.1371847472455465e-05, + "loss": 0.8524, + "step": 217360 + }, + { + "epoch": 1.3887149738701559, + "grad_norm": 0.8723472952842712, + "learning_rate": 2.1367733813918972e-05, + "loss": 0.901, + "step": 217370 + }, + { + "epoch": 1.3887788610198943, + "grad_norm": 1.0541127920150757, + "learning_rate": 2.1363620443734155e-05, + "loss": 1.0075, + "step": 217380 + }, + { + "epoch": 
1.3888427481696333, + "grad_norm": 0.9354879260063171, + "learning_rate": 2.1359507361942488e-05, + "loss": 0.7836, + "step": 217390 + }, + { + "epoch": 1.3889066353193718, + "grad_norm": 0.9547199606895447, + "learning_rate": 2.1355394568585326e-05, + "loss": 0.9017, + "step": 217400 + }, + { + "epoch": 1.3889705224691107, + "grad_norm": 1.1774011850357056, + "learning_rate": 2.135128206370412e-05, + "loss": 0.7809, + "step": 217410 + }, + { + "epoch": 1.3890344096188492, + "grad_norm": 0.6935076117515564, + "learning_rate": 2.1347169847340305e-05, + "loss": 0.9671, + "step": 217420 + }, + { + "epoch": 1.389098296768588, + "grad_norm": 1.1165943145751953, + "learning_rate": 2.1343057919535266e-05, + "loss": 0.8947, + "step": 217430 + }, + { + "epoch": 1.3891621839183266, + "grad_norm": 1.291062593460083, + "learning_rate": 2.1338946280330435e-05, + "loss": 0.8758, + "step": 217440 + }, + { + "epoch": 1.3892260710680655, + "grad_norm": 3.222094774246216, + "learning_rate": 2.1334834929767196e-05, + "loss": 1.3247, + "step": 217450 + }, + { + "epoch": 1.389289958217804, + "grad_norm": 0.9645674228668213, + "learning_rate": 2.133072386788699e-05, + "loss": 0.7852, + "step": 217460 + }, + { + "epoch": 1.3893538453675427, + "grad_norm": 2.5170841217041016, + "learning_rate": 2.1326613094731174e-05, + "loss": 0.8575, + "step": 217470 + }, + { + "epoch": 1.3894177325172814, + "grad_norm": 1.117675542831421, + "learning_rate": 2.1322502610341188e-05, + "loss": 0.9775, + "step": 217480 + }, + { + "epoch": 1.38948161966702, + "grad_norm": 0.6247068643569946, + "learning_rate": 2.1318392414758394e-05, + "loss": 0.7127, + "step": 217490 + }, + { + "epoch": 1.3895455068167588, + "grad_norm": 0.7332262992858887, + "learning_rate": 2.131428250802422e-05, + "loss": 0.9891, + "step": 217500 + }, + { + "epoch": 1.3896093939664975, + "grad_norm": 2.6846208572387695, + "learning_rate": 2.1310172890180018e-05, + "loss": 0.9265, + "step": 217510 + }, + { + "epoch": 1.3896732811162362, 
+ "grad_norm": 1.089535117149353, + "learning_rate": 2.1306063561267214e-05, + "loss": 0.8925, + "step": 217520 + }, + { + "epoch": 1.389737168265975, + "grad_norm": 0.637092649936676, + "learning_rate": 2.1301954521327154e-05, + "loss": 0.8965, + "step": 217530 + }, + { + "epoch": 1.3898010554157136, + "grad_norm": 0.7360275983810425, + "learning_rate": 2.1297845770401258e-05, + "loss": 0.8893, + "step": 217540 + }, + { + "epoch": 1.3898649425654523, + "grad_norm": 0.8324083089828491, + "learning_rate": 2.129373730853087e-05, + "loss": 0.9598, + "step": 217550 + }, + { + "epoch": 1.389928829715191, + "grad_norm": 2.202166795730591, + "learning_rate": 2.12896291357574e-05, + "loss": 0.873, + "step": 217560 + }, + { + "epoch": 1.3899927168649298, + "grad_norm": 0.8910554647445679, + "learning_rate": 2.1285521252122188e-05, + "loss": 0.9841, + "step": 217570 + }, + { + "epoch": 1.3900566040146685, + "grad_norm": 0.9161718487739563, + "learning_rate": 2.128141365766663e-05, + "loss": 0.7002, + "step": 217580 + }, + { + "epoch": 1.3901204911644072, + "grad_norm": 0.8528456091880798, + "learning_rate": 2.1277306352432063e-05, + "loss": 0.736, + "step": 217590 + }, + { + "epoch": 1.3901843783141459, + "grad_norm": 0.5835663080215454, + "learning_rate": 2.1273199336459893e-05, + "loss": 0.7411, + "step": 217600 + }, + { + "epoch": 1.3902482654638846, + "grad_norm": 1.2048559188842773, + "learning_rate": 2.1269092609791437e-05, + "loss": 0.9812, + "step": 217610 + }, + { + "epoch": 1.3903121526136233, + "grad_norm": 0.8390817642211914, + "learning_rate": 2.1264986172468077e-05, + "loss": 0.9444, + "step": 217620 + }, + { + "epoch": 1.390376039763362, + "grad_norm": 0.9608392715454102, + "learning_rate": 2.1260880024531176e-05, + "loss": 1.0438, + "step": 217630 + }, + { + "epoch": 1.3904399269131007, + "grad_norm": 0.9510350823402405, + "learning_rate": 2.1256774166022064e-05, + "loss": 0.7556, + "step": 217640 + }, + { + "epoch": 1.3905038140628394, + "grad_norm": 
1.1506633758544922, + "learning_rate": 2.1252668596982123e-05, + "loss": 0.8586, + "step": 217650 + }, + { + "epoch": 1.3905677012125781, + "grad_norm": 0.8959683775901794, + "learning_rate": 2.124856331745266e-05, + "loss": 0.9765, + "step": 217660 + }, + { + "epoch": 1.3906315883623168, + "grad_norm": 0.5582273602485657, + "learning_rate": 2.124445832747505e-05, + "loss": 0.8269, + "step": 217670 + }, + { + "epoch": 1.3906954755120555, + "grad_norm": 0.9217997193336487, + "learning_rate": 2.124035362709061e-05, + "loss": 1.1126, + "step": 217680 + }, + { + "epoch": 1.3907593626617942, + "grad_norm": 0.7768685817718506, + "learning_rate": 2.1236249216340704e-05, + "loss": 0.9109, + "step": 217690 + }, + { + "epoch": 1.390823249811533, + "grad_norm": 1.1810338497161865, + "learning_rate": 2.1232145095266635e-05, + "loss": 1.0181, + "step": 217700 + }, + { + "epoch": 1.3908871369612716, + "grad_norm": 1.0850905179977417, + "learning_rate": 2.122804126390977e-05, + "loss": 0.7316, + "step": 217710 + }, + { + "epoch": 1.3909510241110103, + "grad_norm": 1.9918285608291626, + "learning_rate": 2.1223937722311404e-05, + "loss": 0.9262, + "step": 217720 + }, + { + "epoch": 1.391014911260749, + "grad_norm": 0.724561333656311, + "learning_rate": 2.12198344705129e-05, + "loss": 0.6552, + "step": 217730 + }, + { + "epoch": 1.3910787984104878, + "grad_norm": 0.9518063068389893, + "learning_rate": 2.121573150855554e-05, + "loss": 0.9679, + "step": 217740 + }, + { + "epoch": 1.3911426855602265, + "grad_norm": 0.7545220851898193, + "learning_rate": 2.121162883648069e-05, + "loss": 0.6445, + "step": 217750 + }, + { + "epoch": 1.3912065727099652, + "grad_norm": 0.9066117405891418, + "learning_rate": 2.120752645432962e-05, + "loss": 0.9309, + "step": 217760 + }, + { + "epoch": 1.3912704598597039, + "grad_norm": 0.7924219369888306, + "learning_rate": 2.120342436214368e-05, + "loss": 0.858, + "step": 217770 + }, + { + "epoch": 1.3913343470094426, + "grad_norm": 0.6461116671562195, + 
"learning_rate": 2.1199322559964163e-05, + "loss": 0.8988, + "step": 217780 + }, + { + "epoch": 1.3913982341591813, + "grad_norm": 0.9200494885444641, + "learning_rate": 2.1195221047832392e-05, + "loss": 1.0004, + "step": 217790 + }, + { + "epoch": 1.39146212130892, + "grad_norm": 0.7647868394851685, + "learning_rate": 2.1191119825789652e-05, + "loss": 0.8146, + "step": 217800 + }, + { + "epoch": 1.3915260084586587, + "grad_norm": 1.1263049840927124, + "learning_rate": 2.118701889387726e-05, + "loss": 0.6912, + "step": 217810 + }, + { + "epoch": 1.3915898956083974, + "grad_norm": 0.7079087495803833, + "learning_rate": 2.118291825213653e-05, + "loss": 0.6851, + "step": 217820 + }, + { + "epoch": 1.391653782758136, + "grad_norm": 1.2798701524734497, + "learning_rate": 2.117881790060872e-05, + "loss": 0.8018, + "step": 217830 + }, + { + "epoch": 1.3917176699078748, + "grad_norm": 0.9899066090583801, + "learning_rate": 2.1174717839335172e-05, + "loss": 1.1333, + "step": 217840 + }, + { + "epoch": 1.3917815570576133, + "grad_norm": 0.3420279026031494, + "learning_rate": 2.1170618068357134e-05, + "loss": 0.6272, + "step": 217850 + }, + { + "epoch": 1.3918454442073522, + "grad_norm": 0.8472849130630493, + "learning_rate": 2.1166518587715932e-05, + "loss": 0.7776, + "step": 217860 + }, + { + "epoch": 1.3919093313570907, + "grad_norm": 1.085749864578247, + "learning_rate": 2.1162419397452814e-05, + "loss": 0.734, + "step": 217870 + }, + { + "epoch": 1.3919732185068296, + "grad_norm": 0.9656279683113098, + "learning_rate": 2.1158320497609117e-05, + "loss": 0.7615, + "step": 217880 + }, + { + "epoch": 1.3920371056565681, + "grad_norm": 0.6101782917976379, + "learning_rate": 2.1154221888226046e-05, + "loss": 1.0044, + "step": 217890 + }, + { + "epoch": 1.392100992806307, + "grad_norm": 1.6533408164978027, + "learning_rate": 2.1150123569344938e-05, + "loss": 1.1256, + "step": 217900 + }, + { + "epoch": 1.3921648799560455, + "grad_norm": 0.6959205865859985, + "learning_rate": 
2.114602554100702e-05, + "loss": 0.6763, + "step": 217910 + }, + { + "epoch": 1.3922287671057845, + "grad_norm": 1.275519847869873, + "learning_rate": 2.1141927803253608e-05, + "loss": 0.703, + "step": 217920 + }, + { + "epoch": 1.392292654255523, + "grad_norm": 1.3255060911178589, + "learning_rate": 2.113783035612593e-05, + "loss": 0.9429, + "step": 217930 + }, + { + "epoch": 1.3923565414052619, + "grad_norm": 0.5562800765037537, + "learning_rate": 2.1133733199665274e-05, + "loss": 0.695, + "step": 217940 + }, + { + "epoch": 1.3924204285550004, + "grad_norm": 1.1394197940826416, + "learning_rate": 2.112963633391291e-05, + "loss": 0.6777, + "step": 217950 + }, + { + "epoch": 1.392484315704739, + "grad_norm": 1.1498068571090698, + "learning_rate": 2.112553975891007e-05, + "loss": 0.7624, + "step": 217960 + }, + { + "epoch": 1.3925482028544778, + "grad_norm": 2.9445035457611084, + "learning_rate": 2.1121443474698038e-05, + "loss": 0.7222, + "step": 217970 + }, + { + "epoch": 1.3926120900042165, + "grad_norm": 0.9396551847457886, + "learning_rate": 2.111734748131804e-05, + "loss": 0.8654, + "step": 217980 + }, + { + "epoch": 1.3926759771539552, + "grad_norm": 0.7392997145652771, + "learning_rate": 2.111325177881136e-05, + "loss": 0.7883, + "step": 217990 + }, + { + "epoch": 1.392739864303694, + "grad_norm": 0.49229753017425537, + "learning_rate": 2.1109156367219202e-05, + "loss": 0.8635, + "step": 218000 + }, + { + "epoch": 1.3928037514534326, + "grad_norm": 1.3487190008163452, + "learning_rate": 2.1105061246582854e-05, + "loss": 1.0008, + "step": 218010 + }, + { + "epoch": 1.3928676386031713, + "grad_norm": 0.7436345219612122, + "learning_rate": 2.110096641694352e-05, + "loss": 0.9823, + "step": 218020 + }, + { + "epoch": 1.39293152575291, + "grad_norm": 0.8380775451660156, + "learning_rate": 2.1096871878342472e-05, + "loss": 0.9085, + "step": 218030 + }, + { + "epoch": 1.3929954129026487, + "grad_norm": 0.9696984887123108, + "learning_rate": 2.109277763082091e-05, + 
"loss": 1.1286, + "step": 218040 + }, + { + "epoch": 1.3930593000523874, + "grad_norm": 1.152116060256958, + "learning_rate": 2.1088683674420108e-05, + "loss": 0.8687, + "step": 218050 + }, + { + "epoch": 1.3931231872021261, + "grad_norm": 0.7602722644805908, + "learning_rate": 2.1084590009181248e-05, + "loss": 1.1443, + "step": 218060 + }, + { + "epoch": 1.3931870743518648, + "grad_norm": 0.7400329113006592, + "learning_rate": 2.1080496635145604e-05, + "loss": 0.7862, + "step": 218070 + }, + { + "epoch": 1.3932509615016035, + "grad_norm": 1.3810337781906128, + "learning_rate": 2.107640355235436e-05, + "loss": 0.8758, + "step": 218080 + }, + { + "epoch": 1.3933148486513423, + "grad_norm": 1.6315202713012695, + "learning_rate": 2.1072310760848773e-05, + "loss": 0.9272, + "step": 218090 + }, + { + "epoch": 1.393378735801081, + "grad_norm": 0.7120254635810852, + "learning_rate": 2.1068218260670024e-05, + "loss": 0.9861, + "step": 218100 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 1.1353204250335693, + "learning_rate": 2.1064126051859363e-05, + "loss": 0.7806, + "step": 218110 + }, + { + "epoch": 1.3935065101005584, + "grad_norm": 0.9815794825553894, + "learning_rate": 2.1060034134457967e-05, + "loss": 0.9848, + "step": 218120 + }, + { + "epoch": 1.393570397250297, + "grad_norm": 0.5680572390556335, + "learning_rate": 2.105594250850707e-05, + "loss": 0.9667, + "step": 218130 + }, + { + "epoch": 1.3936342844000358, + "grad_norm": 0.7751454710960388, + "learning_rate": 2.1051851174047886e-05, + "loss": 0.8767, + "step": 218140 + }, + { + "epoch": 1.3936981715497745, + "grad_norm": 1.4941065311431885, + "learning_rate": 2.1047760131121587e-05, + "loss": 0.9672, + "step": 218150 + }, + { + "epoch": 1.3937620586995132, + "grad_norm": 0.8780585527420044, + "learning_rate": 2.104366937976941e-05, + "loss": 0.9633, + "step": 218160 + }, + { + "epoch": 1.393825945849252, + "grad_norm": 0.7456322908401489, + "learning_rate": 2.103957892003251e-05, + "loss": 0.9585, + 
"step": 218170 + }, + { + "epoch": 1.3938898329989906, + "grad_norm": 1.5976117849349976, + "learning_rate": 2.1035488751952126e-05, + "loss": 0.8625, + "step": 218180 + }, + { + "epoch": 1.3939537201487293, + "grad_norm": 1.2155790328979492, + "learning_rate": 2.1031398875569414e-05, + "loss": 0.8107, + "step": 218190 + }, + { + "epoch": 1.394017607298468, + "grad_norm": 0.7908154726028442, + "learning_rate": 2.102730929092559e-05, + "loss": 0.7408, + "step": 218200 + }, + { + "epoch": 1.3940814944482067, + "grad_norm": 0.5561248660087585, + "learning_rate": 2.102321999806181e-05, + "loss": 0.8089, + "step": 218210 + }, + { + "epoch": 1.3941453815979454, + "grad_norm": 0.9182764291763306, + "learning_rate": 2.1019130997019286e-05, + "loss": 0.9739, + "step": 218220 + }, + { + "epoch": 1.3942092687476841, + "grad_norm": 1.1026109457015991, + "learning_rate": 2.1015042287839176e-05, + "loss": 1.1646, + "step": 218230 + }, + { + "epoch": 1.3942731558974228, + "grad_norm": 2.6774260997772217, + "learning_rate": 2.1010953870562676e-05, + "loss": 0.7264, + "step": 218240 + }, + { + "epoch": 1.3943370430471616, + "grad_norm": 0.9442703127861023, + "learning_rate": 2.1006865745230935e-05, + "loss": 0.8565, + "step": 218250 + }, + { + "epoch": 1.3944009301969003, + "grad_norm": 0.9139619469642639, + "learning_rate": 2.1002777911885156e-05, + "loss": 0.7589, + "step": 218260 + }, + { + "epoch": 1.394464817346639, + "grad_norm": 1.131667137145996, + "learning_rate": 2.0998690370566478e-05, + "loss": 0.8819, + "step": 218270 + }, + { + "epoch": 1.3945287044963777, + "grad_norm": 1.2039259672164917, + "learning_rate": 2.0994603121316086e-05, + "loss": 0.7831, + "step": 218280 + }, + { + "epoch": 1.3945925916461164, + "grad_norm": 0.6849585771560669, + "learning_rate": 2.0990516164175116e-05, + "loss": 0.7287, + "step": 218290 + }, + { + "epoch": 1.394656478795855, + "grad_norm": 1.0747158527374268, + "learning_rate": 2.0986429499184768e-05, + "loss": 1.0633, + "step": 218300 + 
}, + { + "epoch": 1.3947203659455938, + "grad_norm": 1.6333814859390259, + "learning_rate": 2.0982343126386157e-05, + "loss": 0.9955, + "step": 218310 + }, + { + "epoch": 1.3947842530953323, + "grad_norm": 2.6120517253875732, + "learning_rate": 2.0978257045820472e-05, + "loss": 0.8598, + "step": 218320 + }, + { + "epoch": 1.3948481402450712, + "grad_norm": 0.9668644070625305, + "learning_rate": 2.0974171257528828e-05, + "loss": 0.9431, + "step": 218330 + }, + { + "epoch": 1.3949120273948097, + "grad_norm": 0.8444498777389526, + "learning_rate": 2.0970085761552388e-05, + "loss": 1.2379, + "step": 218340 + }, + { + "epoch": 1.3949759145445486, + "grad_norm": 1.2543483972549438, + "learning_rate": 2.096600055793232e-05, + "loss": 1.2642, + "step": 218350 + }, + { + "epoch": 1.395039801694287, + "grad_norm": 0.5028043389320374, + "learning_rate": 2.096191564670974e-05, + "loss": 0.9692, + "step": 218360 + }, + { + "epoch": 1.395103688844026, + "grad_norm": 0.7030778527259827, + "learning_rate": 2.0957831027925795e-05, + "loss": 0.946, + "step": 218370 + }, + { + "epoch": 1.3951675759937645, + "grad_norm": 0.9780330061912537, + "learning_rate": 2.0953746701621597e-05, + "loss": 0.8594, + "step": 218380 + }, + { + "epoch": 1.3952314631435034, + "grad_norm": 0.7831057906150818, + "learning_rate": 2.0949662667838316e-05, + "loss": 0.8269, + "step": 218390 + }, + { + "epoch": 1.395295350293242, + "grad_norm": 1.1478424072265625, + "learning_rate": 2.0945578926617047e-05, + "loss": 0.8945, + "step": 218400 + }, + { + "epoch": 1.3953592374429808, + "grad_norm": 1.120334267616272, + "learning_rate": 2.094149547799895e-05, + "loss": 1.2116, + "step": 218410 + }, + { + "epoch": 1.3954231245927193, + "grad_norm": 0.8612017035484314, + "learning_rate": 2.0937412322025118e-05, + "loss": 0.8987, + "step": 218420 + }, + { + "epoch": 1.395487011742458, + "grad_norm": 1.0608981847763062, + "learning_rate": 2.0933329458736704e-05, + "loss": 0.8946, + "step": 218430 + }, + { + "epoch": 
1.3955508988921967, + "grad_norm": 3.73439884185791, + "learning_rate": 2.0929246888174793e-05, + "loss": 0.884, + "step": 218440 + }, + { + "epoch": 1.3956147860419355, + "grad_norm": 0.8932453393936157, + "learning_rate": 2.092516461038051e-05, + "loss": 0.9874, + "step": 218450 + }, + { + "epoch": 1.3956786731916742, + "grad_norm": 1.17633855342865, + "learning_rate": 2.092108262539499e-05, + "loss": 0.8117, + "step": 218460 + }, + { + "epoch": 1.3957425603414129, + "grad_norm": 1.3153350353240967, + "learning_rate": 2.0917000933259312e-05, + "loss": 1.0372, + "step": 218470 + }, + { + "epoch": 1.3958064474911516, + "grad_norm": 1.2601327896118164, + "learning_rate": 2.0912919534014608e-05, + "loss": 0.7329, + "step": 218480 + }, + { + "epoch": 1.3958703346408903, + "grad_norm": 0.7555443048477173, + "learning_rate": 2.0908838427701954e-05, + "loss": 0.9326, + "step": 218490 + }, + { + "epoch": 1.395934221790629, + "grad_norm": 0.595453679561615, + "learning_rate": 2.090475761436248e-05, + "loss": 0.9627, + "step": 218500 + }, + { + "epoch": 1.3959981089403677, + "grad_norm": 1.642309308052063, + "learning_rate": 2.090067709403725e-05, + "loss": 0.7874, + "step": 218510 + }, + { + "epoch": 1.3960619960901064, + "grad_norm": 4.310575008392334, + "learning_rate": 2.0896596866767397e-05, + "loss": 0.7205, + "step": 218520 + }, + { + "epoch": 1.396125883239845, + "grad_norm": 0.8452205061912537, + "learning_rate": 2.089251693259397e-05, + "loss": 0.9381, + "step": 218530 + }, + { + "epoch": 1.3961897703895838, + "grad_norm": 0.8055098056793213, + "learning_rate": 2.08884372915581e-05, + "loss": 0.7921, + "step": 218540 + }, + { + "epoch": 1.3962536575393225, + "grad_norm": 1.0302882194519043, + "learning_rate": 2.088435794370083e-05, + "loss": 1.0015, + "step": 218550 + }, + { + "epoch": 1.3963175446890612, + "grad_norm": 1.0790350437164307, + "learning_rate": 2.088027888906329e-05, + "loss": 0.7645, + "step": 218560 + }, + { + "epoch": 1.3963814318388, + 
"grad_norm": 0.9331994652748108, + "learning_rate": 2.087620012768651e-05, + "loss": 0.9206, + "step": 218570 + }, + { + "epoch": 1.3964453189885386, + "grad_norm": 1.6856683492660522, + "learning_rate": 2.0872121659611604e-05, + "loss": 1.0113, + "step": 218580 + }, + { + "epoch": 1.3965092061382773, + "grad_norm": 0.9520609974861145, + "learning_rate": 2.0868043484879625e-05, + "loss": 0.8135, + "step": 218590 + }, + { + "epoch": 1.396573093288016, + "grad_norm": 1.096556544303894, + "learning_rate": 2.086396560353166e-05, + "loss": 0.7393, + "step": 218600 + }, + { + "epoch": 1.3966369804377547, + "grad_norm": 0.6126187443733215, + "learning_rate": 2.0859888015608754e-05, + "loss": 1.0076, + "step": 218610 + }, + { + "epoch": 1.3967008675874935, + "grad_norm": 1.1883734464645386, + "learning_rate": 2.0855810721151998e-05, + "loss": 0.9263, + "step": 218620 + }, + { + "epoch": 1.3967647547372322, + "grad_norm": 1.0129221677780151, + "learning_rate": 2.0851733720202427e-05, + "loss": 1.2883, + "step": 218630 + }, + { + "epoch": 1.3968286418869709, + "grad_norm": 0.758244514465332, + "learning_rate": 2.084765701280113e-05, + "loss": 0.7941, + "step": 218640 + }, + { + "epoch": 1.3968925290367096, + "grad_norm": 0.7279469966888428, + "learning_rate": 2.084358059898913e-05, + "loss": 0.6846, + "step": 218650 + }, + { + "epoch": 1.3969564161864483, + "grad_norm": 0.8215343356132507, + "learning_rate": 2.0839504478807505e-05, + "loss": 0.7877, + "step": 218660 + }, + { + "epoch": 1.397020303336187, + "grad_norm": 1.332100510597229, + "learning_rate": 2.0835428652297305e-05, + "loss": 0.6285, + "step": 218670 + }, + { + "epoch": 1.3970841904859257, + "grad_norm": 0.7422265410423279, + "learning_rate": 2.0831353119499557e-05, + "loss": 1.1877, + "step": 218680 + }, + { + "epoch": 1.3971480776356644, + "grad_norm": 0.9069400429725647, + "learning_rate": 2.0827277880455336e-05, + "loss": 0.6081, + "step": 218690 + }, + { + "epoch": 1.397211964785403, + "grad_norm": 
1.2797025442123413, + "learning_rate": 2.0823202935205644e-05, + "loss": 1.1428, + "step": 218700 + }, + { + "epoch": 1.3972758519351418, + "grad_norm": 0.7592565417289734, + "learning_rate": 2.0819128283791557e-05, + "loss": 0.7437, + "step": 218710 + }, + { + "epoch": 1.3973397390848805, + "grad_norm": 0.8271276354789734, + "learning_rate": 2.081505392625408e-05, + "loss": 0.7639, + "step": 218720 + }, + { + "epoch": 1.3974036262346192, + "grad_norm": 0.9150506854057312, + "learning_rate": 2.081097986263427e-05, + "loss": 0.8553, + "step": 218730 + }, + { + "epoch": 1.397467513384358, + "grad_norm": 0.5852721929550171, + "learning_rate": 2.0806906092973134e-05, + "loss": 0.7293, + "step": 218740 + }, + { + "epoch": 1.3975314005340966, + "grad_norm": 1.0465751886367798, + "learning_rate": 2.0802832617311724e-05, + "loss": 1.1388, + "step": 218750 + }, + { + "epoch": 1.3975952876838353, + "grad_norm": 1.1258282661437988, + "learning_rate": 2.079875943569103e-05, + "loss": 1.0519, + "step": 218760 + }, + { + "epoch": 1.397659174833574, + "grad_norm": 1.0739339590072632, + "learning_rate": 2.0794686548152108e-05, + "loss": 0.7896, + "step": 218770 + }, + { + "epoch": 1.3977230619833128, + "grad_norm": 1.094150185585022, + "learning_rate": 2.0790613954735944e-05, + "loss": 0.6958, + "step": 218780 + }, + { + "epoch": 1.3977869491330515, + "grad_norm": 2.0920755863189697, + "learning_rate": 2.0786541655483582e-05, + "loss": 0.708, + "step": 218790 + }, + { + "epoch": 1.3978508362827902, + "grad_norm": 1.8577488660812378, + "learning_rate": 2.0782469650436005e-05, + "loss": 0.5569, + "step": 218800 + }, + { + "epoch": 1.3979147234325286, + "grad_norm": 1.0605297088623047, + "learning_rate": 2.0778397939634247e-05, + "loss": 0.6519, + "step": 218810 + }, + { + "epoch": 1.3979786105822676, + "grad_norm": 0.8613771200180054, + "learning_rate": 2.0774326523119288e-05, + "loss": 0.8127, + "step": 218820 + }, + { + "epoch": 1.398042497732006, + "grad_norm": 
1.1038992404937744, + "learning_rate": 2.077025540093216e-05, + "loss": 0.8351, + "step": 218830 + }, + { + "epoch": 1.398106384881745, + "grad_norm": 1.2433876991271973, + "learning_rate": 2.0766184573113833e-05, + "loss": 0.6526, + "step": 218840 + }, + { + "epoch": 1.3981702720314835, + "grad_norm": 0.8283995985984802, + "learning_rate": 2.0762114039705337e-05, + "loss": 0.8199, + "step": 218850 + }, + { + "epoch": 1.3982341591812224, + "grad_norm": 0.9119390845298767, + "learning_rate": 2.0758043800747646e-05, + "loss": 1.1715, + "step": 218860 + }, + { + "epoch": 1.3982980463309609, + "grad_norm": 1.2474699020385742, + "learning_rate": 2.0753973856281728e-05, + "loss": 0.8878, + "step": 218870 + }, + { + "epoch": 1.3983619334806998, + "grad_norm": 0.7916103005409241, + "learning_rate": 2.074990420634862e-05, + "loss": 0.8466, + "step": 218880 + }, + { + "epoch": 1.3984258206304383, + "grad_norm": 0.9151920676231384, + "learning_rate": 2.074583485098926e-05, + "loss": 0.9369, + "step": 218890 + }, + { + "epoch": 1.3984897077801772, + "grad_norm": 0.9492369890213013, + "learning_rate": 2.0741765790244673e-05, + "loss": 0.9729, + "step": 218900 + }, + { + "epoch": 1.3985535949299157, + "grad_norm": 0.7728332281112671, + "learning_rate": 2.0737697024155796e-05, + "loss": 0.8331, + "step": 218910 + }, + { + "epoch": 1.3986174820796544, + "grad_norm": 0.49122798442840576, + "learning_rate": 2.073362855276364e-05, + "loss": 0.9952, + "step": 218920 + }, + { + "epoch": 1.3986813692293931, + "grad_norm": 0.6504753232002258, + "learning_rate": 2.0729560376109147e-05, + "loss": 0.7918, + "step": 218930 + }, + { + "epoch": 1.3987452563791318, + "grad_norm": 0.7926653027534485, + "learning_rate": 2.0725492494233328e-05, + "loss": 0.7887, + "step": 218940 + }, + { + "epoch": 1.3988091435288705, + "grad_norm": 1.5223766565322876, + "learning_rate": 2.07214249071771e-05, + "loss": 0.766, + "step": 218950 + }, + { + "epoch": 1.3988730306786092, + "grad_norm": 
1.3729267120361328, + "learning_rate": 2.0717357614981476e-05, + "loss": 0.7495, + "step": 218960 + }, + { + "epoch": 1.398936917828348, + "grad_norm": 0.6617783308029175, + "learning_rate": 2.0713290617687375e-05, + "loss": 1.032, + "step": 218970 + }, + { + "epoch": 1.3990008049780867, + "grad_norm": 0.7680843472480774, + "learning_rate": 2.0709223915335774e-05, + "loss": 1.1359, + "step": 218980 + }, + { + "epoch": 1.3990646921278254, + "grad_norm": 0.45521071553230286, + "learning_rate": 2.0705157507967642e-05, + "loss": 0.7724, + "step": 218990 + }, + { + "epoch": 1.399128579277564, + "grad_norm": 1.3348525762557983, + "learning_rate": 2.0701091395623902e-05, + "loss": 0.9884, + "step": 219000 + }, + { + "epoch": 1.3991924664273028, + "grad_norm": 0.8209075331687927, + "learning_rate": 2.0697025578345535e-05, + "loss": 0.9539, + "step": 219010 + }, + { + "epoch": 1.3992563535770415, + "grad_norm": 1.1476972103118896, + "learning_rate": 2.069296005617346e-05, + "loss": 0.7315, + "step": 219020 + }, + { + "epoch": 1.3993202407267802, + "grad_norm": 1.3872950077056885, + "learning_rate": 2.068889482914864e-05, + "loss": 0.9604, + "step": 219030 + }, + { + "epoch": 1.399384127876519, + "grad_norm": 0.8013013601303101, + "learning_rate": 2.0684829897311998e-05, + "loss": 0.8888, + "step": 219040 + }, + { + "epoch": 1.3994480150262576, + "grad_norm": 0.8018562197685242, + "learning_rate": 2.0680765260704494e-05, + "loss": 0.7031, + "step": 219050 + }, + { + "epoch": 1.3995119021759963, + "grad_norm": 1.0981850624084473, + "learning_rate": 2.067670091936703e-05, + "loss": 0.8978, + "step": 219060 + }, + { + "epoch": 1.399575789325735, + "grad_norm": 1.376737356185913, + "learning_rate": 2.0672636873340572e-05, + "loss": 0.7644, + "step": 219070 + }, + { + "epoch": 1.3996396764754737, + "grad_norm": 1.0223459005355835, + "learning_rate": 2.0668573122666017e-05, + "loss": 0.9286, + "step": 219080 + }, + { + "epoch": 1.3997035636252124, + "grad_norm": 
0.9271156191825867, + "learning_rate": 2.0664509667384323e-05, + "loss": 0.9949, + "step": 219090 + }, + { + "epoch": 1.3997674507749511, + "grad_norm": 0.8813492655754089, + "learning_rate": 2.066044650753638e-05, + "loss": 0.7916, + "step": 219100 + }, + { + "epoch": 1.3998313379246898, + "grad_norm": 1.286502480506897, + "learning_rate": 2.065638364316314e-05, + "loss": 0.8053, + "step": 219110 + }, + { + "epoch": 1.3998952250744285, + "grad_norm": 0.827023446559906, + "learning_rate": 2.0652321074305487e-05, + "loss": 0.5385, + "step": 219120 + }, + { + "epoch": 1.3999591122241672, + "grad_norm": 2.0776171684265137, + "learning_rate": 2.0648258801004362e-05, + "loss": 0.8984, + "step": 219130 + }, + { + "epoch": 1.400022999373906, + "grad_norm": 0.699685275554657, + "learning_rate": 2.064419682330065e-05, + "loss": 0.9742, + "step": 219140 + }, + { + "epoch": 1.4000868865236447, + "grad_norm": 0.9540247321128845, + "learning_rate": 2.0640135141235288e-05, + "loss": 0.9716, + "step": 219150 + }, + { + "epoch": 1.4001507736733834, + "grad_norm": 0.72536700963974, + "learning_rate": 2.063607375484915e-05, + "loss": 0.8139, + "step": 219160 + }, + { + "epoch": 1.400214660823122, + "grad_norm": 1.4952839612960815, + "learning_rate": 2.063201266418317e-05, + "loss": 0.8069, + "step": 219170 + }, + { + "epoch": 1.4002785479728608, + "grad_norm": 0.8095305562019348, + "learning_rate": 2.062795186927821e-05, + "loss": 0.7897, + "step": 219180 + }, + { + "epoch": 1.4003424351225995, + "grad_norm": 2.51078462600708, + "learning_rate": 2.0623891370175187e-05, + "loss": 0.9514, + "step": 219190 + }, + { + "epoch": 1.4004063222723382, + "grad_norm": 0.8767799735069275, + "learning_rate": 2.0619831166915005e-05, + "loss": 0.9333, + "step": 219200 + }, + { + "epoch": 1.400470209422077, + "grad_norm": 0.9809701442718506, + "learning_rate": 2.0615771259538526e-05, + "loss": 0.9765, + "step": 219210 + }, + { + "epoch": 1.4005340965718156, + "grad_norm": 0.9264981746673584, + 
"learning_rate": 2.0611711648086668e-05, + "loss": 0.9022, + "step": 219220 + }, + { + "epoch": 1.4005979837215543, + "grad_norm": 0.977389395236969, + "learning_rate": 2.0607652332600285e-05, + "loss": 0.9624, + "step": 219230 + }, + { + "epoch": 1.400661870871293, + "grad_norm": 0.8509820699691772, + "learning_rate": 2.0603593313120286e-05, + "loss": 0.7289, + "step": 219240 + }, + { + "epoch": 1.4007257580210317, + "grad_norm": 0.7682042717933655, + "learning_rate": 2.0599534589687514e-05, + "loss": 0.7303, + "step": 219250 + }, + { + "epoch": 1.4007896451707704, + "grad_norm": 1.349259614944458, + "learning_rate": 2.059547616234288e-05, + "loss": 1.1691, + "step": 219260 + }, + { + "epoch": 1.4008535323205091, + "grad_norm": 1.2627943754196167, + "learning_rate": 2.059141803112723e-05, + "loss": 0.8081, + "step": 219270 + }, + { + "epoch": 1.4009174194702476, + "grad_norm": 1.5132968425750732, + "learning_rate": 2.0587360196081452e-05, + "loss": 0.8969, + "step": 219280 + }, + { + "epoch": 1.4009813066199865, + "grad_norm": 0.8733160495758057, + "learning_rate": 2.0583302657246388e-05, + "loss": 0.9716, + "step": 219290 + }, + { + "epoch": 1.401045193769725, + "grad_norm": 1.0363460779190063, + "learning_rate": 2.0579245414662934e-05, + "loss": 0.7691, + "step": 219300 + }, + { + "epoch": 1.401109080919464, + "grad_norm": 1.1736512184143066, + "learning_rate": 2.057518846837191e-05, + "loss": 0.7217, + "step": 219310 + }, + { + "epoch": 1.4011729680692024, + "grad_norm": 3.8237926959991455, + "learning_rate": 2.0571131818414213e-05, + "loss": 0.7578, + "step": 219320 + }, + { + "epoch": 1.4012368552189414, + "grad_norm": 0.9899071455001831, + "learning_rate": 2.0567075464830682e-05, + "loss": 0.6559, + "step": 219330 + }, + { + "epoch": 1.4013007423686799, + "grad_norm": 0.8879222869873047, + "learning_rate": 2.0563019407662143e-05, + "loss": 0.877, + "step": 219340 + }, + { + "epoch": 1.4013646295184188, + "grad_norm": 1.2862142324447632, + "learning_rate": 
2.0558963646949487e-05, + "loss": 0.8434, + "step": 219350 + }, + { + "epoch": 1.4014285166681573, + "grad_norm": 1.0837254524230957, + "learning_rate": 2.055490818273351e-05, + "loss": 0.8101, + "step": 219360 + }, + { + "epoch": 1.4014924038178962, + "grad_norm": 1.694610357284546, + "learning_rate": 2.055085301505511e-05, + "loss": 0.8474, + "step": 219370 + }, + { + "epoch": 1.4015562909676347, + "grad_norm": 0.8167562484741211, + "learning_rate": 2.054679814395507e-05, + "loss": 1.0352, + "step": 219380 + }, + { + "epoch": 1.4016201781173736, + "grad_norm": 1.1539112329483032, + "learning_rate": 2.0542743569474277e-05, + "loss": 0.8725, + "step": 219390 + }, + { + "epoch": 1.401684065267112, + "grad_norm": 1.7117409706115723, + "learning_rate": 2.0538689291653522e-05, + "loss": 0.7507, + "step": 219400 + }, + { + "epoch": 1.4017479524168508, + "grad_norm": 1.2127348184585571, + "learning_rate": 2.0534635310533673e-05, + "loss": 0.7178, + "step": 219410 + }, + { + "epoch": 1.4018118395665895, + "grad_norm": 0.7299874424934387, + "learning_rate": 2.0530581626155514e-05, + "loss": 0.8289, + "step": 219420 + }, + { + "epoch": 1.4018757267163282, + "grad_norm": 0.7044321894645691, + "learning_rate": 2.0526528238559915e-05, + "loss": 0.9426, + "step": 219430 + }, + { + "epoch": 1.401939613866067, + "grad_norm": 1.472915530204773, + "learning_rate": 2.0522475147787656e-05, + "loss": 0.8895, + "step": 219440 + }, + { + "epoch": 1.4020035010158056, + "grad_norm": 1.1369291543960571, + "learning_rate": 2.051842235387959e-05, + "loss": 0.7921, + "step": 219450 + }, + { + "epoch": 1.4020673881655443, + "grad_norm": 0.7624445557594299, + "learning_rate": 2.0514369856876503e-05, + "loss": 0.7382, + "step": 219460 + }, + { + "epoch": 1.402131275315283, + "grad_norm": 0.6840057373046875, + "learning_rate": 2.0510317656819233e-05, + "loss": 0.9609, + "step": 219470 + }, + { + "epoch": 1.4021951624650217, + "grad_norm": 0.5969876646995544, + "learning_rate": 
2.050626575374856e-05, + "loss": 0.9641, + "step": 219480 + }, + { + "epoch": 1.4022590496147604, + "grad_norm": 1.046563744544983, + "learning_rate": 2.0502214147705327e-05, + "loss": 0.9015, + "step": 219490 + }, + { + "epoch": 1.4023229367644992, + "grad_norm": 1.2152701616287231, + "learning_rate": 2.0498162838730295e-05, + "loss": 0.8255, + "step": 219500 + }, + { + "epoch": 1.4023868239142379, + "grad_norm": 0.8147159814834595, + "learning_rate": 2.0494111826864287e-05, + "loss": 0.7135, + "step": 219510 + }, + { + "epoch": 1.4024507110639766, + "grad_norm": 0.9122790098190308, + "learning_rate": 2.0490061112148123e-05, + "loss": 1.2854, + "step": 219520 + }, + { + "epoch": 1.4025145982137153, + "grad_norm": 0.8407679796218872, + "learning_rate": 2.048601069462255e-05, + "loss": 1.038, + "step": 219530 + }, + { + "epoch": 1.402578485363454, + "grad_norm": 0.7449984550476074, + "learning_rate": 2.04819605743284e-05, + "loss": 0.7232, + "step": 219540 + }, + { + "epoch": 1.4026423725131927, + "grad_norm": 1.1216732263565063, + "learning_rate": 2.0477910751306427e-05, + "loss": 0.7442, + "step": 219550 + }, + { + "epoch": 1.4027062596629314, + "grad_norm": 1.1450462341308594, + "learning_rate": 2.0473861225597445e-05, + "loss": 0.6044, + "step": 219560 + }, + { + "epoch": 1.40277014681267, + "grad_norm": 1.2385889291763306, + "learning_rate": 2.0469811997242218e-05, + "loss": 0.7567, + "step": 219570 + }, + { + "epoch": 1.4028340339624088, + "grad_norm": 0.7899599075317383, + "learning_rate": 2.046576306628154e-05, + "loss": 0.7281, + "step": 219580 + }, + { + "epoch": 1.4028979211121475, + "grad_norm": 0.6492139101028442, + "learning_rate": 2.0461714432756162e-05, + "loss": 0.8082, + "step": 219590 + }, + { + "epoch": 1.4029618082618862, + "grad_norm": 0.7320935726165771, + "learning_rate": 2.0457666096706896e-05, + "loss": 0.7874, + "step": 219600 + }, + { + "epoch": 1.403025695411625, + "grad_norm": 0.9208077192306519, + "learning_rate": 
2.0453618058174473e-05, + "loss": 1.1376, + "step": 219610 + }, + { + "epoch": 1.4030895825613636, + "grad_norm": 1.1169242858886719, + "learning_rate": 2.044957031719969e-05, + "loss": 0.8853, + "step": 219620 + }, + { + "epoch": 1.4031534697111023, + "grad_norm": 1.4261136054992676, + "learning_rate": 2.044552287382328e-05, + "loss": 0.9555, + "step": 219630 + }, + { + "epoch": 1.403217356860841, + "grad_norm": 1.1854610443115234, + "learning_rate": 2.0441475728086047e-05, + "loss": 0.7595, + "step": 219640 + }, + { + "epoch": 1.4032812440105797, + "grad_norm": 1.04494309425354, + "learning_rate": 2.0437428880028704e-05, + "loss": 0.8212, + "step": 219650 + }, + { + "epoch": 1.4033451311603184, + "grad_norm": 0.8896396160125732, + "learning_rate": 2.0433382329692048e-05, + "loss": 0.8674, + "step": 219660 + }, + { + "epoch": 1.4034090183100572, + "grad_norm": 0.9874494075775146, + "learning_rate": 2.042933607711679e-05, + "loss": 1.0137, + "step": 219670 + }, + { + "epoch": 1.4034729054597959, + "grad_norm": 0.7782787084579468, + "learning_rate": 2.0425290122343716e-05, + "loss": 0.9088, + "step": 219680 + }, + { + "epoch": 1.4035367926095346, + "grad_norm": 1.2257322072982788, + "learning_rate": 2.0421244465413543e-05, + "loss": 0.8651, + "step": 219690 + }, + { + "epoch": 1.4036006797592733, + "grad_norm": 1.2603285312652588, + "learning_rate": 2.0417199106367042e-05, + "loss": 0.8384, + "step": 219700 + }, + { + "epoch": 1.403664566909012, + "grad_norm": 1.0163233280181885, + "learning_rate": 2.041315404524492e-05, + "loss": 0.8613, + "step": 219710 + }, + { + "epoch": 1.4037284540587507, + "grad_norm": 1.0572268962860107, + "learning_rate": 2.0409109282087933e-05, + "loss": 1.0124, + "step": 219720 + }, + { + "epoch": 1.4037923412084894, + "grad_norm": 1.1266729831695557, + "learning_rate": 2.0405064816936837e-05, + "loss": 0.9166, + "step": 219730 + }, + { + "epoch": 1.403856228358228, + "grad_norm": 1.2244828939437866, + "learning_rate": 
2.0401020649832318e-05, + "loss": 0.8353, + "step": 219740 + }, + { + "epoch": 1.4039201155079668, + "grad_norm": 1.0080479383468628, + "learning_rate": 2.0396976780815153e-05, + "loss": 0.8262, + "step": 219750 + }, + { + "epoch": 1.4039840026577055, + "grad_norm": 0.8630098700523376, + "learning_rate": 2.039293320992602e-05, + "loss": 0.9429, + "step": 219760 + }, + { + "epoch": 1.404047889807444, + "grad_norm": 0.6371837258338928, + "learning_rate": 2.038888993720568e-05, + "loss": 0.8733, + "step": 219770 + }, + { + "epoch": 1.404111776957183, + "grad_norm": 0.805311381816864, + "learning_rate": 2.038484696269482e-05, + "loss": 0.6292, + "step": 219780 + }, + { + "epoch": 1.4041756641069214, + "grad_norm": 1.1514561176300049, + "learning_rate": 2.038080428643419e-05, + "loss": 1.1225, + "step": 219790 + }, + { + "epoch": 1.4042395512566603, + "grad_norm": 0.9797691702842712, + "learning_rate": 2.0376761908464464e-05, + "loss": 0.7642, + "step": 219800 + }, + { + "epoch": 1.4043034384063988, + "grad_norm": 0.9690388441085815, + "learning_rate": 2.03727198288264e-05, + "loss": 0.9549, + "step": 219810 + }, + { + "epoch": 1.4043673255561377, + "grad_norm": 0.8218302130699158, + "learning_rate": 2.036867804756067e-05, + "loss": 0.9085, + "step": 219820 + }, + { + "epoch": 1.4044312127058762, + "grad_norm": 0.8038190007209778, + "learning_rate": 2.0364636564707972e-05, + "loss": 0.8697, + "step": 219830 + }, + { + "epoch": 1.4044950998556152, + "grad_norm": 0.8588317632675171, + "learning_rate": 2.0360595380309038e-05, + "loss": 0.7064, + "step": 219840 + }, + { + "epoch": 1.4045589870053536, + "grad_norm": 1.052468180656433, + "learning_rate": 2.0356554494404534e-05, + "loss": 0.9069, + "step": 219850 + }, + { + "epoch": 1.4046228741550926, + "grad_norm": 0.8712472915649414, + "learning_rate": 2.0352513907035187e-05, + "loss": 0.697, + "step": 219860 + }, + { + "epoch": 1.404686761304831, + "grad_norm": 0.8167270421981812, + "learning_rate": 2.034847361824166e-05, 
+ "loss": 0.951, + "step": 219870 + }, + { + "epoch": 1.40475064845457, + "grad_norm": 0.6242989897727966, + "learning_rate": 2.034443362806467e-05, + "loss": 1.0533, + "step": 219880 + }, + { + "epoch": 1.4048145356043085, + "grad_norm": 0.9029278755187988, + "learning_rate": 2.0340393936544872e-05, + "loss": 0.9406, + "step": 219890 + }, + { + "epoch": 1.4048784227540472, + "grad_norm": 1.5568900108337402, + "learning_rate": 2.0336354543722986e-05, + "loss": 0.9028, + "step": 219900 + }, + { + "epoch": 1.4049423099037859, + "grad_norm": 1.0189552307128906, + "learning_rate": 2.033231544963965e-05, + "loss": 0.9335, + "step": 219910 + }, + { + "epoch": 1.4050061970535246, + "grad_norm": 0.8458846807479858, + "learning_rate": 2.032827665433559e-05, + "loss": 1.0823, + "step": 219920 + }, + { + "epoch": 1.4050700842032633, + "grad_norm": 1.2060385942459106, + "learning_rate": 2.032423815785143e-05, + "loss": 0.8751, + "step": 219930 + }, + { + "epoch": 1.405133971353002, + "grad_norm": 0.7118062376976013, + "learning_rate": 2.0320199960227882e-05, + "loss": 0.9541, + "step": 219940 + }, + { + "epoch": 1.4051978585027407, + "grad_norm": 0.9647814035415649, + "learning_rate": 2.031616206150558e-05, + "loss": 1.0803, + "step": 219950 + }, + { + "epoch": 1.4052617456524794, + "grad_norm": 1.1308926343917847, + "learning_rate": 2.0312124461725222e-05, + "loss": 0.9342, + "step": 219960 + }, + { + "epoch": 1.4053256328022181, + "grad_norm": 0.7198635935783386, + "learning_rate": 2.030808716092744e-05, + "loss": 1.0821, + "step": 219970 + }, + { + "epoch": 1.4053895199519568, + "grad_norm": 1.2182941436767578, + "learning_rate": 2.0304050159152928e-05, + "loss": 0.6881, + "step": 219980 + }, + { + "epoch": 1.4054534071016955, + "grad_norm": 0.7641770243644714, + "learning_rate": 2.0300013456442295e-05, + "loss": 0.8982, + "step": 219990 + }, + { + "epoch": 1.4055172942514342, + "grad_norm": 1.0956320762634277, + "learning_rate": 2.0295977052836245e-05, + "loss": 1.013, + 
"step": 220000 + }, + { + "epoch": 1.405581181401173, + "grad_norm": 1.0628162622451782, + "learning_rate": 2.029194094837539e-05, + "loss": 0.8651, + "step": 220010 + }, + { + "epoch": 1.4056450685509116, + "grad_norm": 1.4605565071105957, + "learning_rate": 2.02879051431004e-05, + "loss": 0.9023, + "step": 220020 + }, + { + "epoch": 1.4057089557006504, + "grad_norm": 1.8608735799789429, + "learning_rate": 2.0283869637051893e-05, + "loss": 1.0067, + "step": 220030 + }, + { + "epoch": 1.405772842850389, + "grad_norm": 1.1879396438598633, + "learning_rate": 2.0279834430270526e-05, + "loss": 0.8728, + "step": 220040 + }, + { + "epoch": 1.4058367300001278, + "grad_norm": 0.9974208474159241, + "learning_rate": 2.0275799522796962e-05, + "loss": 0.83, + "step": 220050 + }, + { + "epoch": 1.4059006171498665, + "grad_norm": 2.049335241317749, + "learning_rate": 2.0271764914671794e-05, + "loss": 1.1515, + "step": 220060 + }, + { + "epoch": 1.4059645042996052, + "grad_norm": 2.182713031768799, + "learning_rate": 2.0267730605935686e-05, + "loss": 0.9389, + "step": 220070 + }, + { + "epoch": 1.4060283914493439, + "grad_norm": 0.6502020955085754, + "learning_rate": 2.0263696596629235e-05, + "loss": 0.9203, + "step": 220080 + }, + { + "epoch": 1.4060922785990826, + "grad_norm": 0.9814128279685974, + "learning_rate": 2.0259662886793102e-05, + "loss": 0.9169, + "step": 220090 + }, + { + "epoch": 1.4061561657488213, + "grad_norm": 0.8696256279945374, + "learning_rate": 2.0255629476467873e-05, + "loss": 1.0024, + "step": 220100 + }, + { + "epoch": 1.40622005289856, + "grad_norm": 0.8340265154838562, + "learning_rate": 2.0251596365694213e-05, + "loss": 0.7545, + "step": 220110 + }, + { + "epoch": 1.4062839400482987, + "grad_norm": 1.115959882736206, + "learning_rate": 2.024756355451269e-05, + "loss": 0.9034, + "step": 220120 + }, + { + "epoch": 1.4063478271980374, + "grad_norm": 0.9420958161354065, + "learning_rate": 2.024353104296396e-05, + "loss": 1.0772, + "step": 220130 + }, + { 
+ "epoch": 1.4064117143477761, + "grad_norm": 0.8073174357414246, + "learning_rate": 2.02394988310886e-05, + "loss": 0.8894, + "step": 220140 + }, + { + "epoch": 1.4064756014975148, + "grad_norm": 0.7026411890983582, + "learning_rate": 2.0235466918927247e-05, + "loss": 0.8304, + "step": 220150 + }, + { + "epoch": 1.4065394886472535, + "grad_norm": 0.9653884768486023, + "learning_rate": 2.023143530652048e-05, + "loss": 1.1283, + "step": 220160 + }, + { + "epoch": 1.4066033757969922, + "grad_norm": 0.9749668836593628, + "learning_rate": 2.0227403993908928e-05, + "loss": 1.1303, + "step": 220170 + }, + { + "epoch": 1.406667262946731, + "grad_norm": 0.8139040470123291, + "learning_rate": 2.0223372981133154e-05, + "loss": 0.9092, + "step": 220180 + }, + { + "epoch": 1.4067311500964697, + "grad_norm": 2.1971898078918457, + "learning_rate": 2.02193422682338e-05, + "loss": 0.6938, + "step": 220190 + }, + { + "epoch": 1.4067950372462084, + "grad_norm": 1.9047714471817017, + "learning_rate": 2.0215311855251406e-05, + "loss": 0.8362, + "step": 220200 + }, + { + "epoch": 1.406858924395947, + "grad_norm": 2.2644927501678467, + "learning_rate": 2.0211281742226612e-05, + "loss": 1.3439, + "step": 220210 + }, + { + "epoch": 1.4069228115456858, + "grad_norm": 0.9895037412643433, + "learning_rate": 2.0207251929199966e-05, + "loss": 0.7277, + "step": 220220 + }, + { + "epoch": 1.4069866986954245, + "grad_norm": 1.0718899965286255, + "learning_rate": 2.0203222416212082e-05, + "loss": 1.074, + "step": 220230 + }, + { + "epoch": 1.4070505858451632, + "grad_norm": 1.2079740762710571, + "learning_rate": 2.019919320330351e-05, + "loss": 0.9072, + "step": 220240 + }, + { + "epoch": 1.4071144729949019, + "grad_norm": 0.6357789635658264, + "learning_rate": 2.019516429051484e-05, + "loss": 0.7015, + "step": 220250 + }, + { + "epoch": 1.4071783601446404, + "grad_norm": 1.1232203245162964, + "learning_rate": 2.0191135677886668e-05, + "loss": 0.8269, + "step": 220260 + }, + { + "epoch": 
1.4072422472943793, + "grad_norm": 1.065622091293335, + "learning_rate": 2.0187107365459535e-05, + "loss": 0.966, + "step": 220270 + }, + { + "epoch": 1.4073061344441178, + "grad_norm": 1.32682466506958, + "learning_rate": 2.0183079353274036e-05, + "loss": 0.8666, + "step": 220280 + }, + { + "epoch": 1.4073700215938567, + "grad_norm": 0.5304492712020874, + "learning_rate": 2.0179051641370712e-05, + "loss": 0.8609, + "step": 220290 + }, + { + "epoch": 1.4074339087435952, + "grad_norm": 1.2137643098831177, + "learning_rate": 2.017502422979015e-05, + "loss": 0.7503, + "step": 220300 + }, + { + "epoch": 1.4074977958933341, + "grad_norm": 0.9853841662406921, + "learning_rate": 2.01709971185729e-05, + "loss": 0.9774, + "step": 220310 + }, + { + "epoch": 1.4075616830430726, + "grad_norm": 0.9920171499252319, + "learning_rate": 2.0166970307759508e-05, + "loss": 0.8804, + "step": 220320 + }, + { + "epoch": 1.4076255701928115, + "grad_norm": 0.8332661986351013, + "learning_rate": 2.0162943797390522e-05, + "loss": 0.8944, + "step": 220330 + }, + { + "epoch": 1.40768945734255, + "grad_norm": 0.9016265273094177, + "learning_rate": 2.0158917587506522e-05, + "loss": 0.8822, + "step": 220340 + }, + { + "epoch": 1.407753344492289, + "grad_norm": 0.7224472165107727, + "learning_rate": 2.0154891678148013e-05, + "loss": 0.8487, + "step": 220350 + }, + { + "epoch": 1.4078172316420274, + "grad_norm": 0.9430965781211853, + "learning_rate": 2.0150866069355574e-05, + "loss": 0.737, + "step": 220360 + }, + { + "epoch": 1.4078811187917664, + "grad_norm": 1.0713948011398315, + "learning_rate": 2.014684076116975e-05, + "loss": 0.8471, + "step": 220370 + }, + { + "epoch": 1.4079450059415048, + "grad_norm": 1.0659370422363281, + "learning_rate": 2.0142815753631052e-05, + "loss": 0.7762, + "step": 220380 + }, + { + "epoch": 1.4080088930912436, + "grad_norm": 0.6916654706001282, + "learning_rate": 2.0138791046780044e-05, + "loss": 0.7892, + "step": 220390 + }, + { + "epoch": 1.4080727802409823, + 
"grad_norm": 1.019726037979126, + "learning_rate": 2.0134766640657228e-05, + "loss": 1.0982, + "step": 220400 + }, + { + "epoch": 1.408136667390721, + "grad_norm": 1.0331531763076782, + "learning_rate": 2.0130742535303164e-05, + "loss": 1.1579, + "step": 220410 + }, + { + "epoch": 1.4082005545404597, + "grad_norm": 1.8808170557022095, + "learning_rate": 2.0126718730758347e-05, + "loss": 0.8189, + "step": 220420 + }, + { + "epoch": 1.4082644416901984, + "grad_norm": 1.2670456171035767, + "learning_rate": 2.0122695227063332e-05, + "loss": 0.8223, + "step": 220430 + }, + { + "epoch": 1.408328328839937, + "grad_norm": 1.2577275037765503, + "learning_rate": 2.011867202425861e-05, + "loss": 0.8128, + "step": 220440 + }, + { + "epoch": 1.4083922159896758, + "grad_norm": 0.731774628162384, + "learning_rate": 2.0114649122384727e-05, + "loss": 1.4193, + "step": 220450 + }, + { + "epoch": 1.4084561031394145, + "grad_norm": 0.6962316632270813, + "learning_rate": 2.011062652148216e-05, + "loss": 0.7145, + "step": 220460 + }, + { + "epoch": 1.4085199902891532, + "grad_norm": 1.116998314857483, + "learning_rate": 2.010660422159147e-05, + "loss": 0.8743, + "step": 220470 + }, + { + "epoch": 1.408583877438892, + "grad_norm": 1.2085821628570557, + "learning_rate": 2.0102582222753114e-05, + "loss": 0.798, + "step": 220480 + }, + { + "epoch": 1.4086477645886306, + "grad_norm": 0.7292695641517639, + "learning_rate": 2.0098560525007638e-05, + "loss": 0.8246, + "step": 220490 + }, + { + "epoch": 1.4087116517383693, + "grad_norm": 1.5681432485580444, + "learning_rate": 2.0094539128395506e-05, + "loss": 0.7658, + "step": 220500 + }, + { + "epoch": 1.408775538888108, + "grad_norm": 0.6391398310661316, + "learning_rate": 2.0090518032957255e-05, + "loss": 1.0228, + "step": 220510 + }, + { + "epoch": 1.4088394260378467, + "grad_norm": 1.8824732303619385, + "learning_rate": 2.008649723873335e-05, + "loss": 0.9237, + "step": 220520 + }, + { + "epoch": 1.4089033131875854, + "grad_norm": 
0.56192547082901, + "learning_rate": 2.0082476745764304e-05, + "loss": 0.6959, + "step": 220530 + }, + { + "epoch": 1.4089672003373241, + "grad_norm": 0.6403698921203613, + "learning_rate": 2.007845655409059e-05, + "loss": 0.7584, + "step": 220540 + }, + { + "epoch": 1.4090310874870628, + "grad_norm": 1.1955901384353638, + "learning_rate": 2.007443666375272e-05, + "loss": 0.8017, + "step": 220550 + }, + { + "epoch": 1.4090949746368016, + "grad_norm": 1.067419409751892, + "learning_rate": 2.007041707479115e-05, + "loss": 0.8995, + "step": 220560 + }, + { + "epoch": 1.4091588617865403, + "grad_norm": 1.870528221130371, + "learning_rate": 2.0066397787246367e-05, + "loss": 0.6883, + "step": 220570 + }, + { + "epoch": 1.409222748936279, + "grad_norm": 1.0919605493545532, + "learning_rate": 2.0062378801158872e-05, + "loss": 0.974, + "step": 220580 + }, + { + "epoch": 1.4092866360860177, + "grad_norm": 1.08636474609375, + "learning_rate": 2.0058360116569103e-05, + "loss": 1.0044, + "step": 220590 + }, + { + "epoch": 1.4093505232357564, + "grad_norm": 0.803911030292511, + "learning_rate": 2.0054341733517574e-05, + "loss": 0.9509, + "step": 220600 + }, + { + "epoch": 1.409414410385495, + "grad_norm": 1.357932686805725, + "learning_rate": 2.0050323652044705e-05, + "loss": 0.8243, + "step": 220610 + }, + { + "epoch": 1.4094782975352338, + "grad_norm": 1.1059114933013916, + "learning_rate": 2.0046305872191013e-05, + "loss": 0.7825, + "step": 220620 + }, + { + "epoch": 1.4095421846849725, + "grad_norm": 1.688194751739502, + "learning_rate": 2.004228839399691e-05, + "loss": 0.8504, + "step": 220630 + }, + { + "epoch": 1.4096060718347112, + "grad_norm": 0.6670559644699097, + "learning_rate": 2.00382712175029e-05, + "loss": 0.828, + "step": 220640 + }, + { + "epoch": 1.40966995898445, + "grad_norm": 1.0450772047042847, + "learning_rate": 2.0034254342749402e-05, + "loss": 0.6982, + "step": 220650 + }, + { + "epoch": 1.4097338461341886, + "grad_norm": 1.279312252998352, + 
"learning_rate": 2.0030237769776906e-05, + "loss": 0.7851, + "step": 220660 + }, + { + "epoch": 1.4097977332839273, + "grad_norm": 0.7498508095741272, + "learning_rate": 2.0026221498625825e-05, + "loss": 0.6041, + "step": 220670 + }, + { + "epoch": 1.409861620433666, + "grad_norm": 0.9692347645759583, + "learning_rate": 2.0022205529336642e-05, + "loss": 1.1839, + "step": 220680 + }, + { + "epoch": 1.4099255075834047, + "grad_norm": 1.1649972200393677, + "learning_rate": 2.0018189861949764e-05, + "loss": 1.0362, + "step": 220690 + }, + { + "epoch": 1.4099893947331434, + "grad_norm": 0.7185434699058533, + "learning_rate": 2.0014174496505673e-05, + "loss": 0.7825, + "step": 220700 + }, + { + "epoch": 1.4100532818828821, + "grad_norm": 1.2547842264175415, + "learning_rate": 2.0010159433044766e-05, + "loss": 0.8287, + "step": 220710 + }, + { + "epoch": 1.4101171690326209, + "grad_norm": 0.8520128726959229, + "learning_rate": 2.000614467160752e-05, + "loss": 1.2777, + "step": 220720 + }, + { + "epoch": 1.4101810561823596, + "grad_norm": 1.134917974472046, + "learning_rate": 2.0002130212234322e-05, + "loss": 0.6963, + "step": 220730 + }, + { + "epoch": 1.4102449433320983, + "grad_norm": 0.7581837177276611, + "learning_rate": 1.999811605496565e-05, + "loss": 0.8686, + "step": 220740 + }, + { + "epoch": 1.4103088304818368, + "grad_norm": 0.9078113436698914, + "learning_rate": 1.999410219984188e-05, + "loss": 1.0735, + "step": 220750 + }, + { + "epoch": 1.4103727176315757, + "grad_norm": 2.1745991706848145, + "learning_rate": 1.9990088646903477e-05, + "loss": 0.8066, + "step": 220760 + }, + { + "epoch": 1.4104366047813142, + "grad_norm": 0.9876050353050232, + "learning_rate": 1.9986075396190828e-05, + "loss": 0.6116, + "step": 220770 + }, + { + "epoch": 1.410500491931053, + "grad_norm": 0.7553558945655823, + "learning_rate": 1.998206244774437e-05, + "loss": 0.8716, + "step": 220780 + }, + { + "epoch": 1.4105643790807916, + "grad_norm": 0.6577632427215576, + "learning_rate": 
1.9978049801604542e-05, + "loss": 0.9674, + "step": 220790 + }, + { + "epoch": 1.4106282662305305, + "grad_norm": 0.951414167881012, + "learning_rate": 1.997403745781169e-05, + "loss": 1.0595, + "step": 220800 + }, + { + "epoch": 1.410692153380269, + "grad_norm": 1.0069729089736938, + "learning_rate": 1.9970025416406278e-05, + "loss": 0.8617, + "step": 220810 + }, + { + "epoch": 1.410756040530008, + "grad_norm": 1.0184953212738037, + "learning_rate": 1.9966013677428668e-05, + "loss": 1.0118, + "step": 220820 + }, + { + "epoch": 1.4108199276797464, + "grad_norm": 0.9171404838562012, + "learning_rate": 1.996200224091931e-05, + "loss": 1.0464, + "step": 220830 + }, + { + "epoch": 1.4108838148294853, + "grad_norm": 1.1307326555252075, + "learning_rate": 1.995799110691855e-05, + "loss": 0.9237, + "step": 220840 + }, + { + "epoch": 1.4109477019792238, + "grad_norm": 2.443937063217163, + "learning_rate": 1.995438134499613e-05, + "loss": 0.8088, + "step": 220850 + }, + { + "epoch": 1.4110115891289625, + "grad_norm": 0.9654321074485779, + "learning_rate": 1.9950370785873044e-05, + "loss": 0.9652, + "step": 220860 + }, + { + "epoch": 1.4110754762787012, + "grad_norm": 0.7190786600112915, + "learning_rate": 1.994636052937574e-05, + "loss": 0.6677, + "step": 220870 + }, + { + "epoch": 1.41113936342844, + "grad_norm": 0.5666462182998657, + "learning_rate": 1.994235057554457e-05, + "loss": 0.7352, + "step": 220880 + }, + { + "epoch": 1.4112032505781786, + "grad_norm": 0.8541065454483032, + "learning_rate": 1.993834092441993e-05, + "loss": 0.7986, + "step": 220890 + }, + { + "epoch": 1.4112671377279173, + "grad_norm": 0.6696165204048157, + "learning_rate": 1.993433157604222e-05, + "loss": 0.8192, + "step": 220900 + }, + { + "epoch": 1.411331024877656, + "grad_norm": 1.8114312887191772, + "learning_rate": 1.9930322530451783e-05, + "loss": 1.041, + "step": 220910 + }, + { + "epoch": 1.4113949120273948, + "grad_norm": 0.6636319160461426, + "learning_rate": 1.9926313787689038e-05, + 
"loss": 0.6572, + "step": 220920 + }, + { + "epoch": 1.4114587991771335, + "grad_norm": 2.5401008129119873, + "learning_rate": 1.9922305347794308e-05, + "loss": 0.7537, + "step": 220930 + }, + { + "epoch": 1.4115226863268722, + "grad_norm": 0.8678221702575684, + "learning_rate": 1.991829721080802e-05, + "loss": 0.8484, + "step": 220940 + }, + { + "epoch": 1.4115865734766109, + "grad_norm": 1.05147385597229, + "learning_rate": 1.9914289376770463e-05, + "loss": 1.0464, + "step": 220950 + }, + { + "epoch": 1.4116504606263496, + "grad_norm": 0.7742120027542114, + "learning_rate": 1.991028184572206e-05, + "loss": 0.8125, + "step": 220960 + }, + { + "epoch": 1.4117143477760883, + "grad_norm": 0.7941192984580994, + "learning_rate": 1.9906274617703136e-05, + "loss": 0.805, + "step": 220970 + }, + { + "epoch": 1.411778234925827, + "grad_norm": 0.6988613605499268, + "learning_rate": 1.990226769275408e-05, + "loss": 0.7882, + "step": 220980 + }, + { + "epoch": 1.4118421220755657, + "grad_norm": 0.956760585308075, + "learning_rate": 1.9898261070915203e-05, + "loss": 0.7652, + "step": 220990 + }, + { + "epoch": 1.4119060092253044, + "grad_norm": 0.8377808332443237, + "learning_rate": 1.989425475222688e-05, + "loss": 0.8399, + "step": 221000 + }, + { + "epoch": 1.411969896375043, + "grad_norm": 0.8833522796630859, + "learning_rate": 1.9890248736729477e-05, + "loss": 1.0528, + "step": 221010 + }, + { + "epoch": 1.4120337835247818, + "grad_norm": 1.3832995891571045, + "learning_rate": 1.9886243024463298e-05, + "loss": 0.8216, + "step": 221020 + }, + { + "epoch": 1.4120976706745205, + "grad_norm": 1.0302939414978027, + "learning_rate": 1.9882237615468724e-05, + "loss": 0.9526, + "step": 221030 + }, + { + "epoch": 1.4121615578242592, + "grad_norm": 1.5530582666397095, + "learning_rate": 1.9878232509786054e-05, + "loss": 0.9041, + "step": 221040 + }, + { + "epoch": 1.412225444973998, + "grad_norm": 0.7477715015411377, + "learning_rate": 1.9874227707455657e-05, + "loss": 0.6952, + 
"step": 221050 + }, + { + "epoch": 1.4122893321237366, + "grad_norm": 1.5359477996826172, + "learning_rate": 1.9870223208517836e-05, + "loss": 0.7534, + "step": 221060 + }, + { + "epoch": 1.4123532192734753, + "grad_norm": 0.9684277772903442, + "learning_rate": 1.986621901301295e-05, + "loss": 1.0632, + "step": 221070 + }, + { + "epoch": 1.412417106423214, + "grad_norm": 0.8300984501838684, + "learning_rate": 1.9862215120981288e-05, + "loss": 0.9463, + "step": 221080 + }, + { + "epoch": 1.4124809935729528, + "grad_norm": 0.8654909729957581, + "learning_rate": 1.9858211532463212e-05, + "loss": 0.8561, + "step": 221090 + }, + { + "epoch": 1.4125448807226915, + "grad_norm": 1.0056959390640259, + "learning_rate": 1.9854208247499e-05, + "loss": 0.7763, + "step": 221100 + }, + { + "epoch": 1.4126087678724302, + "grad_norm": 1.648310899734497, + "learning_rate": 1.9850205266129013e-05, + "loss": 0.8435, + "step": 221110 + }, + { + "epoch": 1.4126726550221689, + "grad_norm": 0.861349880695343, + "learning_rate": 1.9846202588393526e-05, + "loss": 0.8127, + "step": 221120 + }, + { + "epoch": 1.4127365421719076, + "grad_norm": 1.2424954175949097, + "learning_rate": 1.984220021433288e-05, + "loss": 0.8182, + "step": 221130 + }, + { + "epoch": 1.4128004293216463, + "grad_norm": 0.825581967830658, + "learning_rate": 1.983819814398735e-05, + "loss": 0.9335, + "step": 221140 + }, + { + "epoch": 1.412864316471385, + "grad_norm": 0.7504009008407593, + "learning_rate": 1.983419637739728e-05, + "loss": 0.6404, + "step": 221150 + }, + { + "epoch": 1.4129282036211237, + "grad_norm": 1.168960690498352, + "learning_rate": 1.983019491460293e-05, + "loss": 0.7979, + "step": 221160 + }, + { + "epoch": 1.4129920907708624, + "grad_norm": 0.9512644410133362, + "learning_rate": 1.9826193755644636e-05, + "loss": 0.8454, + "step": 221170 + }, + { + "epoch": 1.4130559779206011, + "grad_norm": 0.5762017965316772, + "learning_rate": 1.9822192900562658e-05, + "loss": 0.9529, + "step": 221180 + }, + { 
+ "epoch": 1.4131198650703398, + "grad_norm": 0.9827634692192078, + "learning_rate": 1.9818192349397317e-05, + "loss": 0.5732, + "step": 221190 + }, + { + "epoch": 1.4131837522200785, + "grad_norm": 1.3534090518951416, + "learning_rate": 1.981419210218888e-05, + "loss": 0.9992, + "step": 221200 + }, + { + "epoch": 1.4132476393698172, + "grad_norm": 1.1091433763504028, + "learning_rate": 1.9810192158977635e-05, + "loss": 1.0782, + "step": 221210 + }, + { + "epoch": 1.413311526519556, + "grad_norm": 0.8881558179855347, + "learning_rate": 1.98061925198039e-05, + "loss": 0.9678, + "step": 221220 + }, + { + "epoch": 1.4133754136692946, + "grad_norm": 0.8820149898529053, + "learning_rate": 1.9802193184707907e-05, + "loss": 0.7579, + "step": 221230 + }, + { + "epoch": 1.4134393008190331, + "grad_norm": 0.8992303609848022, + "learning_rate": 1.9798194153729964e-05, + "loss": 0.9235, + "step": 221240 + }, + { + "epoch": 1.413503187968772, + "grad_norm": 1.0931174755096436, + "learning_rate": 1.9794195426910322e-05, + "loss": 0.7635, + "step": 221250 + }, + { + "epoch": 1.4135670751185105, + "grad_norm": 0.724915623664856, + "learning_rate": 1.9790197004289284e-05, + "loss": 0.9907, + "step": 221260 + }, + { + "epoch": 1.4136309622682495, + "grad_norm": 0.7087599039077759, + "learning_rate": 1.9786198885907075e-05, + "loss": 0.5267, + "step": 221270 + }, + { + "epoch": 1.413694849417988, + "grad_norm": 1.4535980224609375, + "learning_rate": 1.9782201071804e-05, + "loss": 1.1702, + "step": 221280 + }, + { + "epoch": 1.4137587365677269, + "grad_norm": 0.7310057878494263, + "learning_rate": 1.9778203562020287e-05, + "loss": 1.1342, + "step": 221290 + }, + { + "epoch": 1.4138226237174654, + "grad_norm": 1.2365026473999023, + "learning_rate": 1.9774206356596227e-05, + "loss": 0.7644, + "step": 221300 + }, + { + "epoch": 1.4138865108672043, + "grad_norm": 0.9791273474693298, + "learning_rate": 1.9770209455572043e-05, + "loss": 0.7834, + "step": 221310 + }, + { + "epoch": 
1.4139503980169428, + "grad_norm": 0.9656670093536377, + "learning_rate": 1.9766212858988014e-05, + "loss": 0.8897, + "step": 221320 + }, + { + "epoch": 1.4140142851666817, + "grad_norm": 1.2244353294372559, + "learning_rate": 1.976221656688436e-05, + "loss": 0.9983, + "step": 221330 + }, + { + "epoch": 1.4140781723164202, + "grad_norm": 1.1544291973114014, + "learning_rate": 1.975822057930137e-05, + "loss": 0.946, + "step": 221340 + }, + { + "epoch": 1.414142059466159, + "grad_norm": 1.3128889799118042, + "learning_rate": 1.975422489627924e-05, + "loss": 0.9674, + "step": 221350 + }, + { + "epoch": 1.4142059466158976, + "grad_norm": 0.965920090675354, + "learning_rate": 1.9750229517858243e-05, + "loss": 0.9747, + "step": 221360 + }, + { + "epoch": 1.4142698337656363, + "grad_norm": 0.6471821069717407, + "learning_rate": 1.9746234444078592e-05, + "loss": 0.7858, + "step": 221370 + }, + { + "epoch": 1.414333720915375, + "grad_norm": 1.2321809530258179, + "learning_rate": 1.974223967498055e-05, + "loss": 0.7847, + "step": 221380 + }, + { + "epoch": 1.4143976080651137, + "grad_norm": 1.0316810607910156, + "learning_rate": 1.9738245210604317e-05, + "loss": 0.6993, + "step": 221390 + }, + { + "epoch": 1.4144614952148524, + "grad_norm": 0.6123872399330139, + "learning_rate": 1.9734251050990148e-05, + "loss": 0.8941, + "step": 221400 + }, + { + "epoch": 1.4145253823645911, + "grad_norm": 0.8233139514923096, + "learning_rate": 1.9730257196178244e-05, + "loss": 0.732, + "step": 221410 + }, + { + "epoch": 1.4145892695143298, + "grad_norm": 0.9176562428474426, + "learning_rate": 1.972626364620885e-05, + "loss": 0.7941, + "step": 221420 + }, + { + "epoch": 1.4146531566640685, + "grad_norm": 1.1283241510391235, + "learning_rate": 1.9722270401122166e-05, + "loss": 1.0854, + "step": 221430 + }, + { + "epoch": 1.4147170438138073, + "grad_norm": 0.9821565747261047, + "learning_rate": 1.97182774609584e-05, + "loss": 0.7775, + "step": 221440 + }, + { + "epoch": 1.414780930963546, + 
"grad_norm": 1.1683138608932495, + "learning_rate": 1.9714284825757795e-05, + "loss": 1.1725, + "step": 221450 + }, + { + "epoch": 1.4148448181132847, + "grad_norm": 0.8943763375282288, + "learning_rate": 1.9710292495560527e-05, + "loss": 0.6814, + "step": 221460 + }, + { + "epoch": 1.4149087052630234, + "grad_norm": 1.9581735134124756, + "learning_rate": 1.9706300470406837e-05, + "loss": 0.6614, + "step": 221470 + }, + { + "epoch": 1.414972592412762, + "grad_norm": 1.9280062913894653, + "learning_rate": 1.9702308750336884e-05, + "loss": 0.8017, + "step": 221480 + }, + { + "epoch": 1.4150364795625008, + "grad_norm": 1.0093328952789307, + "learning_rate": 1.9698317335390916e-05, + "loss": 1.0391, + "step": 221490 + }, + { + "epoch": 1.4151003667122395, + "grad_norm": 0.77775639295578, + "learning_rate": 1.969432622560909e-05, + "loss": 0.9507, + "step": 221500 + }, + { + "epoch": 1.4151642538619782, + "grad_norm": 1.1072726249694824, + "learning_rate": 1.969033542103163e-05, + "loss": 0.7385, + "step": 221510 + }, + { + "epoch": 1.415228141011717, + "grad_norm": 1.4591403007507324, + "learning_rate": 1.96863449216987e-05, + "loss": 1.0868, + "step": 221520 + }, + { + "epoch": 1.4152920281614556, + "grad_norm": 1.0994980335235596, + "learning_rate": 1.9682354727650505e-05, + "loss": 0.8326, + "step": 221530 + }, + { + "epoch": 1.4153559153111943, + "grad_norm": 1.7113072872161865, + "learning_rate": 1.9678364838927238e-05, + "loss": 0.7034, + "step": 221540 + }, + { + "epoch": 1.415419802460933, + "grad_norm": 1.114270806312561, + "learning_rate": 1.9674375255569055e-05, + "loss": 0.7392, + "step": 221550 + }, + { + "epoch": 1.4154836896106717, + "grad_norm": 1.084441065788269, + "learning_rate": 1.9670385977616167e-05, + "loss": 0.9154, + "step": 221560 + }, + { + "epoch": 1.4155475767604104, + "grad_norm": 1.159225344657898, + "learning_rate": 1.966639700510871e-05, + "loss": 0.9085, + "step": 221570 + }, + { + "epoch": 1.4156114639101491, + "grad_norm": 
0.9540026783943176, + "learning_rate": 1.9662408338086897e-05, + "loss": 0.8693, + "step": 221580 + }, + { + "epoch": 1.4156753510598878, + "grad_norm": 0.7180683612823486, + "learning_rate": 1.9658419976590858e-05, + "loss": 0.7765, + "step": 221590 + }, + { + "epoch": 1.4157392382096265, + "grad_norm": 0.9916054606437683, + "learning_rate": 1.96544319206608e-05, + "loss": 0.8833, + "step": 221600 + }, + { + "epoch": 1.4158031253593653, + "grad_norm": 0.6650932431221008, + "learning_rate": 1.9650444170336846e-05, + "loss": 0.6361, + "step": 221610 + }, + { + "epoch": 1.415867012509104, + "grad_norm": 1.0005336999893188, + "learning_rate": 1.964645672565919e-05, + "loss": 0.7585, + "step": 221620 + }, + { + "epoch": 1.4159308996588427, + "grad_norm": 1.8848949670791626, + "learning_rate": 1.964246958666796e-05, + "loss": 0.7068, + "step": 221630 + }, + { + "epoch": 1.4159947868085814, + "grad_norm": 1.0205187797546387, + "learning_rate": 1.963848275340334e-05, + "loss": 0.865, + "step": 221640 + }, + { + "epoch": 1.41605867395832, + "grad_norm": 1.4636200666427612, + "learning_rate": 1.963449622590545e-05, + "loss": 1.0402, + "step": 221650 + }, + { + "epoch": 1.4161225611080588, + "grad_norm": 0.8260305523872375, + "learning_rate": 1.963051000421447e-05, + "loss": 0.7022, + "step": 221660 + }, + { + "epoch": 1.4161864482577975, + "grad_norm": 2.168660879135132, + "learning_rate": 1.9626524088370512e-05, + "loss": 0.871, + "step": 221670 + }, + { + "epoch": 1.4162503354075362, + "grad_norm": 1.222449541091919, + "learning_rate": 1.9622538478413747e-05, + "loss": 0.7785, + "step": 221680 + }, + { + "epoch": 1.416314222557275, + "grad_norm": 1.6448215246200562, + "learning_rate": 1.9618553174384284e-05, + "loss": 0.8298, + "step": 221690 + }, + { + "epoch": 1.4163781097070136, + "grad_norm": 0.6294973492622375, + "learning_rate": 1.961456817632229e-05, + "loss": 0.7221, + "step": 221700 + }, + { + "epoch": 1.416441996856752, + "grad_norm": 1.326272964477539, + 
"learning_rate": 1.9610583484267864e-05, + "loss": 0.8475, + "step": 221710 + }, + { + "epoch": 1.416505884006491, + "grad_norm": 0.6224335432052612, + "learning_rate": 1.9606599098261175e-05, + "loss": 0.9709, + "step": 221720 + }, + { + "epoch": 1.4165697711562295, + "grad_norm": 0.9180363416671753, + "learning_rate": 1.9602615018342307e-05, + "loss": 0.9115, + "step": 221730 + }, + { + "epoch": 1.4166336583059684, + "grad_norm": 0.8147386908531189, + "learning_rate": 1.9598631244551402e-05, + "loss": 0.8754, + "step": 221740 + }, + { + "epoch": 1.416697545455707, + "grad_norm": 0.6118002533912659, + "learning_rate": 1.95946477769286e-05, + "loss": 0.6263, + "step": 221750 + }, + { + "epoch": 1.4167614326054458, + "grad_norm": 0.5602393746376038, + "learning_rate": 1.959066461551398e-05, + "loss": 0.9374, + "step": 221760 + }, + { + "epoch": 1.4168253197551843, + "grad_norm": 0.7014377117156982, + "learning_rate": 1.9586681760347692e-05, + "loss": 0.8284, + "step": 221770 + }, + { + "epoch": 1.4168892069049233, + "grad_norm": 0.9873439073562622, + "learning_rate": 1.9582699211469814e-05, + "loss": 0.6845, + "step": 221780 + }, + { + "epoch": 1.4169530940546617, + "grad_norm": 0.8853866457939148, + "learning_rate": 1.957871696892049e-05, + "loss": 0.7459, + "step": 221790 + }, + { + "epoch": 1.4170169812044007, + "grad_norm": 0.8921706676483154, + "learning_rate": 1.957473503273978e-05, + "loss": 0.9808, + "step": 221800 + }, + { + "epoch": 1.4170808683541392, + "grad_norm": 1.1913554668426514, + "learning_rate": 1.9570753402967834e-05, + "loss": 0.7517, + "step": 221810 + }, + { + "epoch": 1.417144755503878, + "grad_norm": 0.9814984202384949, + "learning_rate": 1.9566772079644706e-05, + "loss": 0.8787, + "step": 221820 + }, + { + "epoch": 1.4172086426536166, + "grad_norm": 0.6580504179000854, + "learning_rate": 1.9562791062810533e-05, + "loss": 0.724, + "step": 221830 + }, + { + "epoch": 1.4172725298033553, + "grad_norm": 0.7003348469734192, + "learning_rate": 
1.9558810352505362e-05, + "loss": 0.8837, + "step": 221840 + }, + { + "epoch": 1.417336416953094, + "grad_norm": 1.384737253189087, + "learning_rate": 1.9554829948769326e-05, + "loss": 0.8948, + "step": 221850 + }, + { + "epoch": 1.4174003041028327, + "grad_norm": 0.7046149969100952, + "learning_rate": 1.9550849851642473e-05, + "loss": 0.6696, + "step": 221860 + }, + { + "epoch": 1.4174641912525714, + "grad_norm": 0.7566852569580078, + "learning_rate": 1.9546870061164922e-05, + "loss": 0.8452, + "step": 221870 + }, + { + "epoch": 1.41752807840231, + "grad_norm": 1.0079212188720703, + "learning_rate": 1.954289057737672e-05, + "loss": 0.7338, + "step": 221880 + }, + { + "epoch": 1.4175919655520488, + "grad_norm": 0.6846641898155212, + "learning_rate": 1.9538911400317976e-05, + "loss": 1.0205, + "step": 221890 + }, + { + "epoch": 1.4176558527017875, + "grad_norm": 0.8684399724006653, + "learning_rate": 1.9534932530028728e-05, + "loss": 0.7658, + "step": 221900 + }, + { + "epoch": 1.4177197398515262, + "grad_norm": 0.861835777759552, + "learning_rate": 1.953095396654908e-05, + "loss": 0.8338, + "step": 221910 + }, + { + "epoch": 1.417783627001265, + "grad_norm": 1.1581312417984009, + "learning_rate": 1.9526975709919092e-05, + "loss": 0.7955, + "step": 221920 + }, + { + "epoch": 1.4178475141510036, + "grad_norm": 1.2092033624649048, + "learning_rate": 1.9522997760178803e-05, + "loss": 0.6525, + "step": 221930 + }, + { + "epoch": 1.4179114013007423, + "grad_norm": 1.3593804836273193, + "learning_rate": 1.951902011736831e-05, + "loss": 0.9873, + "step": 221940 + }, + { + "epoch": 1.417975288450481, + "grad_norm": 1.110656976699829, + "learning_rate": 1.951504278152763e-05, + "loss": 0.7818, + "step": 221950 + }, + { + "epoch": 1.4180391756002197, + "grad_norm": 0.5778676271438599, + "learning_rate": 1.9511065752696866e-05, + "loss": 0.9688, + "step": 221960 + }, + { + "epoch": 1.4181030627499585, + "grad_norm": 0.8316310048103333, + "learning_rate": 
1.9507089030916027e-05, + "loss": 1.0128, + "step": 221970 + }, + { + "epoch": 1.4181669498996972, + "grad_norm": 0.8025345206260681, + "learning_rate": 1.95031126162252e-05, + "loss": 0.7758, + "step": 221980 + }, + { + "epoch": 1.4182308370494359, + "grad_norm": 0.9378303289413452, + "learning_rate": 1.9499136508664396e-05, + "loss": 1.0748, + "step": 221990 + }, + { + "epoch": 1.4182947241991746, + "grad_norm": 0.7751262784004211, + "learning_rate": 1.949516070827369e-05, + "loss": 0.761, + "step": 222000 + }, + { + "epoch": 1.4183586113489133, + "grad_norm": 1.126213550567627, + "learning_rate": 1.949118521509309e-05, + "loss": 0.851, + "step": 222010 + }, + { + "epoch": 1.418422498498652, + "grad_norm": 0.7336819171905518, + "learning_rate": 1.9487210029162663e-05, + "loss": 0.7735, + "step": 222020 + }, + { + "epoch": 1.4184863856483907, + "grad_norm": 1.2958660125732422, + "learning_rate": 1.9483235150522413e-05, + "loss": 0.7772, + "step": 222030 + }, + { + "epoch": 1.4185502727981294, + "grad_norm": 0.699873685836792, + "learning_rate": 1.9479260579212404e-05, + "loss": 0.7898, + "step": 222040 + }, + { + "epoch": 1.418614159947868, + "grad_norm": 1.1349695920944214, + "learning_rate": 1.9475286315272627e-05, + "loss": 0.8856, + "step": 222050 + }, + { + "epoch": 1.4186780470976068, + "grad_norm": 1.0587198734283447, + "learning_rate": 1.9471312358743126e-05, + "loss": 0.7165, + "step": 222060 + }, + { + "epoch": 1.4187419342473455, + "grad_norm": 0.7114539742469788, + "learning_rate": 1.9467338709663935e-05, + "loss": 0.83, + "step": 222070 + }, + { + "epoch": 1.4188058213970842, + "grad_norm": 1.09430992603302, + "learning_rate": 1.946336536807504e-05, + "loss": 0.9272, + "step": 222080 + }, + { + "epoch": 1.418869708546823, + "grad_norm": 1.2134573459625244, + "learning_rate": 1.9459392334016496e-05, + "loss": 0.9753, + "step": 222090 + }, + { + "epoch": 1.4189335956965616, + "grad_norm": 0.9118198156356812, + "learning_rate": 1.9455419607528268e-05, + 
"loss": 1.0493, + "step": 222100 + }, + { + "epoch": 1.4189974828463003, + "grad_norm": 0.5956824421882629, + "learning_rate": 1.945144718865042e-05, + "loss": 0.6476, + "step": 222110 + }, + { + "epoch": 1.419061369996039, + "grad_norm": 0.8295848965644836, + "learning_rate": 1.9447475077422895e-05, + "loss": 1.0601, + "step": 222120 + }, + { + "epoch": 1.4191252571457778, + "grad_norm": 0.9054329991340637, + "learning_rate": 1.9443503273885755e-05, + "loss": 0.7685, + "step": 222130 + }, + { + "epoch": 1.4191891442955165, + "grad_norm": 0.832612931728363, + "learning_rate": 1.9439531778078947e-05, + "loss": 0.9382, + "step": 222140 + }, + { + "epoch": 1.4192530314452552, + "grad_norm": 0.8252206444740295, + "learning_rate": 1.9435560590042512e-05, + "loss": 0.7185, + "step": 222150 + }, + { + "epoch": 1.4193169185949939, + "grad_norm": 0.9640223979949951, + "learning_rate": 1.943158970981641e-05, + "loss": 0.7232, + "step": 222160 + }, + { + "epoch": 1.4193808057447326, + "grad_norm": 1.1477142572402954, + "learning_rate": 1.9427619137440663e-05, + "loss": 0.9039, + "step": 222170 + }, + { + "epoch": 1.4194446928944713, + "grad_norm": 1.1876455545425415, + "learning_rate": 1.9423648872955218e-05, + "loss": 0.7363, + "step": 222180 + }, + { + "epoch": 1.41950858004421, + "grad_norm": 0.6598142385482788, + "learning_rate": 1.9419678916400104e-05, + "loss": 0.7316, + "step": 222190 + }, + { + "epoch": 1.4195724671939485, + "grad_norm": 0.8525545001029968, + "learning_rate": 1.9415709267815252e-05, + "loss": 0.891, + "step": 222200 + }, + { + "epoch": 1.4196363543436874, + "grad_norm": 0.7906132340431213, + "learning_rate": 1.9411739927240692e-05, + "loss": 0.8702, + "step": 222210 + }, + { + "epoch": 1.4197002414934259, + "grad_norm": 0.7197248935699463, + "learning_rate": 1.9407770894716354e-05, + "loss": 0.7535, + "step": 222220 + }, + { + "epoch": 1.4197641286431648, + "grad_norm": 0.9192577004432678, + "learning_rate": 1.940380217028225e-05, + "loss": 0.954, + 
"step": 222230 + }, + { + "epoch": 1.4198280157929033, + "grad_norm": 0.985266387462616, + "learning_rate": 1.9399833753978308e-05, + "loss": 0.7107, + "step": 222240 + }, + { + "epoch": 1.4198919029426422, + "grad_norm": 1.0751519203186035, + "learning_rate": 1.939586564584453e-05, + "loss": 0.8886, + "step": 222250 + }, + { + "epoch": 1.4199557900923807, + "grad_norm": 1.5335538387298584, + "learning_rate": 1.9391897845920842e-05, + "loss": 1.1657, + "step": 222260 + }, + { + "epoch": 1.4200196772421196, + "grad_norm": 1.431660532951355, + "learning_rate": 1.9387930354247224e-05, + "loss": 0.952, + "step": 222270 + }, + { + "epoch": 1.4200835643918581, + "grad_norm": 1.3148257732391357, + "learning_rate": 1.938396317086365e-05, + "loss": 0.9239, + "step": 222280 + }, + { + "epoch": 1.420147451541597, + "grad_norm": 2.135805606842041, + "learning_rate": 1.937999629581003e-05, + "loss": 0.8645, + "step": 222290 + }, + { + "epoch": 1.4202113386913355, + "grad_norm": 1.0679904222488403, + "learning_rate": 1.937602972912636e-05, + "loss": 0.799, + "step": 222300 + }, + { + "epoch": 1.4202752258410745, + "grad_norm": 0.8440000414848328, + "learning_rate": 1.9372063470852547e-05, + "loss": 0.9153, + "step": 222310 + }, + { + "epoch": 1.420339112990813, + "grad_norm": 1.7078415155410767, + "learning_rate": 1.936809752102857e-05, + "loss": 1.2632, + "step": 222320 + }, + { + "epoch": 1.4204030001405517, + "grad_norm": 2.0827977657318115, + "learning_rate": 1.936413187969433e-05, + "loss": 0.7392, + "step": 222330 + }, + { + "epoch": 1.4204668872902904, + "grad_norm": 1.0674457550048828, + "learning_rate": 1.936016654688981e-05, + "loss": 1.0084, + "step": 222340 + }, + { + "epoch": 1.420530774440029, + "grad_norm": 0.8765248656272888, + "learning_rate": 1.9356201522654892e-05, + "loss": 1.0464, + "step": 222350 + }, + { + "epoch": 1.4205946615897678, + "grad_norm": 1.205122947692871, + "learning_rate": 1.9352236807029562e-05, + "loss": 0.8186, + "step": 222360 + }, + { + 
"epoch": 1.4206585487395065, + "grad_norm": 1.0993788242340088, + "learning_rate": 1.93482724000537e-05, + "loss": 1.1414, + "step": 222370 + }, + { + "epoch": 1.4207224358892452, + "grad_norm": 0.7830728888511658, + "learning_rate": 1.9344308301767274e-05, + "loss": 0.6976, + "step": 222380 + }, + { + "epoch": 1.420786323038984, + "grad_norm": 2.2178773880004883, + "learning_rate": 1.9340344512210163e-05, + "loss": 0.8951, + "step": 222390 + }, + { + "epoch": 1.4208502101887226, + "grad_norm": 0.6909429430961609, + "learning_rate": 1.9336381031422325e-05, + "loss": 0.8694, + "step": 222400 + }, + { + "epoch": 1.4209140973384613, + "grad_norm": 1.1704915761947632, + "learning_rate": 1.933241785944366e-05, + "loss": 0.8337, + "step": 222410 + }, + { + "epoch": 1.4209779844882, + "grad_norm": 1.000505805015564, + "learning_rate": 1.9328454996314055e-05, + "loss": 0.9537, + "step": 222420 + }, + { + "epoch": 1.4210418716379387, + "grad_norm": 0.865317702293396, + "learning_rate": 1.932449244207346e-05, + "loss": 0.8764, + "step": 222430 + }, + { + "epoch": 1.4211057587876774, + "grad_norm": 2.7685742378234863, + "learning_rate": 1.9320530196761753e-05, + "loss": 0.965, + "step": 222440 + }, + { + "epoch": 1.4211696459374161, + "grad_norm": 1.276869773864746, + "learning_rate": 1.9316568260418867e-05, + "loss": 1.4253, + "step": 222450 + }, + { + "epoch": 1.4212335330871548, + "grad_norm": 1.2430510520935059, + "learning_rate": 1.931260663308466e-05, + "loss": 0.9924, + "step": 222460 + }, + { + "epoch": 1.4212974202368935, + "grad_norm": 1.0641168355941772, + "learning_rate": 1.9308645314799073e-05, + "loss": 1.1704, + "step": 222470 + }, + { + "epoch": 1.4213613073866322, + "grad_norm": 0.7764881253242493, + "learning_rate": 1.930468430560196e-05, + "loss": 0.7378, + "step": 222480 + }, + { + "epoch": 1.421425194536371, + "grad_norm": 1.2852715253829956, + "learning_rate": 1.9300723605533255e-05, + "loss": 1.1468, + "step": 222490 + }, + { + "epoch": 
1.4214890816861097, + "grad_norm": 1.244436264038086, + "learning_rate": 1.9296763214632796e-05, + "loss": 0.6906, + "step": 222500 + }, + { + "epoch": 1.4215529688358484, + "grad_norm": 0.7294203639030457, + "learning_rate": 1.9292803132940518e-05, + "loss": 0.6666, + "step": 222510 + }, + { + "epoch": 1.421616855985587, + "grad_norm": 1.364741325378418, + "learning_rate": 1.9288843360496255e-05, + "loss": 0.9499, + "step": 222520 + }, + { + "epoch": 1.4216807431353258, + "grad_norm": 1.0707260370254517, + "learning_rate": 1.9284883897339927e-05, + "loss": 0.9658, + "step": 222530 + }, + { + "epoch": 1.4217446302850645, + "grad_norm": 1.0712471008300781, + "learning_rate": 1.9280924743511382e-05, + "loss": 0.9658, + "step": 222540 + }, + { + "epoch": 1.4218085174348032, + "grad_norm": 1.0781949758529663, + "learning_rate": 1.9276965899050507e-05, + "loss": 1.0543, + "step": 222550 + }, + { + "epoch": 1.421872404584542, + "grad_norm": 0.5351880192756653, + "learning_rate": 1.9273007363997148e-05, + "loss": 0.7134, + "step": 222560 + }, + { + "epoch": 1.4219362917342806, + "grad_norm": 1.1996464729309082, + "learning_rate": 1.926904913839121e-05, + "loss": 0.8995, + "step": 222570 + }, + { + "epoch": 1.4220001788840193, + "grad_norm": 0.900219738483429, + "learning_rate": 1.9265091222272513e-05, + "loss": 0.9481, + "step": 222580 + }, + { + "epoch": 1.422064066033758, + "grad_norm": 2.1757538318634033, + "learning_rate": 1.926113361568094e-05, + "loss": 0.8004, + "step": 222590 + }, + { + "epoch": 1.4221279531834967, + "grad_norm": 0.8302018046379089, + "learning_rate": 1.9257176318656356e-05, + "loss": 0.8924, + "step": 222600 + }, + { + "epoch": 1.4221918403332354, + "grad_norm": 1.2007355690002441, + "learning_rate": 1.9253219331238586e-05, + "loss": 0.7996, + "step": 222610 + }, + { + "epoch": 1.4222557274829741, + "grad_norm": 1.2114557027816772, + "learning_rate": 1.924926265346752e-05, + "loss": 0.9217, + "step": 222620 + }, + { + "epoch": 1.4223196146327128, 
+ "grad_norm": 0.8614585995674133, + "learning_rate": 1.9245306285382957e-05, + "loss": 0.7761, + "step": 222630 + }, + { + "epoch": 1.4223835017824515, + "grad_norm": 1.270330786705017, + "learning_rate": 1.924135022702479e-05, + "loss": 0.5521, + "step": 222640 + }, + { + "epoch": 1.4224473889321902, + "grad_norm": 1.1912163496017456, + "learning_rate": 1.9237394478432818e-05, + "loss": 0.8739, + "step": 222650 + }, + { + "epoch": 1.422511276081929, + "grad_norm": 1.322155237197876, + "learning_rate": 1.9233439039646917e-05, + "loss": 0.8325, + "step": 222660 + }, + { + "epoch": 1.4225751632316677, + "grad_norm": 0.9432783722877502, + "learning_rate": 1.922948391070688e-05, + "loss": 0.7395, + "step": 222670 + }, + { + "epoch": 1.4226390503814064, + "grad_norm": 1.2607407569885254, + "learning_rate": 1.9225529091652577e-05, + "loss": 0.9211, + "step": 222680 + }, + { + "epoch": 1.4227029375311449, + "grad_norm": 0.6521483063697815, + "learning_rate": 1.9221574582523804e-05, + "loss": 0.7439, + "step": 222690 + }, + { + "epoch": 1.4227668246808838, + "grad_norm": 1.0877735614776611, + "learning_rate": 1.921762038336042e-05, + "loss": 1.1125, + "step": 222700 + }, + { + "epoch": 1.4228307118306223, + "grad_norm": 0.8707333207130432, + "learning_rate": 1.9213666494202216e-05, + "loss": 0.8505, + "step": 222710 + }, + { + "epoch": 1.4228945989803612, + "grad_norm": 0.8127386569976807, + "learning_rate": 1.9209712915089035e-05, + "loss": 1.0918, + "step": 222720 + }, + { + "epoch": 1.4229584861300997, + "grad_norm": 0.8321516513824463, + "learning_rate": 1.9205759646060668e-05, + "loss": 0.8188, + "step": 222730 + }, + { + "epoch": 1.4230223732798386, + "grad_norm": 0.5849027633666992, + "learning_rate": 1.920180668715696e-05, + "loss": 0.8457, + "step": 222740 + }, + { + "epoch": 1.423086260429577, + "grad_norm": 0.9630666375160217, + "learning_rate": 1.919785403841768e-05, + "loss": 1.0351, + "step": 222750 + }, + { + "epoch": 1.423150147579316, + "grad_norm": 
1.0991421937942505, + "learning_rate": 1.9193901699882683e-05, + "loss": 0.922, + "step": 222760 + }, + { + "epoch": 1.4232140347290545, + "grad_norm": 1.1798986196517944, + "learning_rate": 1.9189949671591724e-05, + "loss": 0.9662, + "step": 222770 + }, + { + "epoch": 1.4232779218787934, + "grad_norm": 0.5852562189102173, + "learning_rate": 1.9185997953584644e-05, + "loss": 0.8046, + "step": 222780 + }, + { + "epoch": 1.423341809028532, + "grad_norm": 0.9567358493804932, + "learning_rate": 1.9182046545901204e-05, + "loss": 0.9986, + "step": 222790 + }, + { + "epoch": 1.4234056961782708, + "grad_norm": 1.1330355405807495, + "learning_rate": 1.9178095448581224e-05, + "loss": 0.7967, + "step": 222800 + }, + { + "epoch": 1.4234695833280093, + "grad_norm": 1.0684672594070435, + "learning_rate": 1.9174144661664493e-05, + "loss": 1.1382, + "step": 222810 + }, + { + "epoch": 1.423533470477748, + "grad_norm": 0.8078852891921997, + "learning_rate": 1.9170194185190786e-05, + "loss": 1.0976, + "step": 222820 + }, + { + "epoch": 1.4235973576274867, + "grad_norm": 1.4376271963119507, + "learning_rate": 1.9166244019199913e-05, + "loss": 0.8219, + "step": 222830 + }, + { + "epoch": 1.4236612447772254, + "grad_norm": 0.7955139875411987, + "learning_rate": 1.9162294163731616e-05, + "loss": 0.9684, + "step": 222840 + }, + { + "epoch": 1.4237251319269641, + "grad_norm": 1.2134015560150146, + "learning_rate": 1.9158344618825713e-05, + "loss": 0.8365, + "step": 222850 + }, + { + "epoch": 1.4237890190767029, + "grad_norm": 0.7520911693572998, + "learning_rate": 1.9154395384521944e-05, + "loss": 0.8027, + "step": 222860 + }, + { + "epoch": 1.4238529062264416, + "grad_norm": 0.9657458066940308, + "learning_rate": 1.9150446460860118e-05, + "loss": 0.8404, + "step": 222870 + }, + { + "epoch": 1.4239167933761803, + "grad_norm": 1.2599238157272339, + "learning_rate": 1.9146497847879986e-05, + "loss": 0.7761, + "step": 222880 + }, + { + "epoch": 1.423980680525919, + "grad_norm": 
0.6829902529716492, + "learning_rate": 1.9142549545621307e-05, + "loss": 0.8754, + "step": 222890 + }, + { + "epoch": 1.4240445676756577, + "grad_norm": 1.3122057914733887, + "learning_rate": 1.9138601554123837e-05, + "loss": 0.9307, + "step": 222900 + }, + { + "epoch": 1.4241084548253964, + "grad_norm": 0.788246214389801, + "learning_rate": 1.9134653873427344e-05, + "loss": 0.6951, + "step": 222910 + }, + { + "epoch": 1.424172341975135, + "grad_norm": 1.1151480674743652, + "learning_rate": 1.913070650357161e-05, + "loss": 0.9483, + "step": 222920 + }, + { + "epoch": 1.4242362291248738, + "grad_norm": 0.9124578833580017, + "learning_rate": 1.9126759444596348e-05, + "loss": 0.6496, + "step": 222930 + }, + { + "epoch": 1.4243001162746125, + "grad_norm": 0.904630184173584, + "learning_rate": 1.9122812696541348e-05, + "loss": 0.72, + "step": 222940 + }, + { + "epoch": 1.4243640034243512, + "grad_norm": 1.0377484560012817, + "learning_rate": 1.9118866259446315e-05, + "loss": 0.7401, + "step": 222950 + }, + { + "epoch": 1.42442789057409, + "grad_norm": 0.8512157201766968, + "learning_rate": 1.9114920133351033e-05, + "loss": 0.9919, + "step": 222960 + }, + { + "epoch": 1.4244917777238286, + "grad_norm": 0.9018188714981079, + "learning_rate": 1.911097431829521e-05, + "loss": 0.6751, + "step": 222970 + }, + { + "epoch": 1.4245556648735673, + "grad_norm": 1.2587921619415283, + "learning_rate": 1.9107028814318616e-05, + "loss": 0.8381, + "step": 222980 + }, + { + "epoch": 1.424619552023306, + "grad_norm": 0.847082257270813, + "learning_rate": 1.910308362146095e-05, + "loss": 0.878, + "step": 222990 + }, + { + "epoch": 1.4246834391730447, + "grad_norm": 0.8812136650085449, + "learning_rate": 1.9099138739761973e-05, + "loss": 0.6974, + "step": 223000 + }, + { + "epoch": 1.4247473263227834, + "grad_norm": 0.986804187297821, + "learning_rate": 1.9095194169261394e-05, + "loss": 1.0691, + "step": 223010 + }, + { + "epoch": 1.4248112134725222, + "grad_norm": 0.7562146782875061, + 
"learning_rate": 1.909124990999896e-05, + "loss": 0.6436, + "step": 223020 + }, + { + "epoch": 1.4248751006222609, + "grad_norm": 1.4277758598327637, + "learning_rate": 1.9087305962014356e-05, + "loss": 0.8546, + "step": 223030 + }, + { + "epoch": 1.4249389877719996, + "grad_norm": 0.6766421794891357, + "learning_rate": 1.9083362325347348e-05, + "loss": 0.9694, + "step": 223040 + }, + { + "epoch": 1.4250028749217383, + "grad_norm": 1.1126245260238647, + "learning_rate": 1.9079419000037613e-05, + "loss": 0.8527, + "step": 223050 + }, + { + "epoch": 1.425066762071477, + "grad_norm": 1.1563136577606201, + "learning_rate": 1.907547598612489e-05, + "loss": 0.8045, + "step": 223060 + }, + { + "epoch": 1.4251306492212157, + "grad_norm": 0.8701530694961548, + "learning_rate": 1.9071533283648863e-05, + "loss": 1.1362, + "step": 223070 + }, + { + "epoch": 1.4251945363709544, + "grad_norm": 1.362408995628357, + "learning_rate": 1.9067590892649266e-05, + "loss": 0.8057, + "step": 223080 + }, + { + "epoch": 1.425258423520693, + "grad_norm": 1.4804731607437134, + "learning_rate": 1.906364881316578e-05, + "loss": 1.2727, + "step": 223090 + }, + { + "epoch": 1.4253223106704318, + "grad_norm": 1.0505961179733276, + "learning_rate": 1.9059707045238125e-05, + "loss": 0.7074, + "step": 223100 + }, + { + "epoch": 1.4253861978201705, + "grad_norm": 0.7411073446273804, + "learning_rate": 1.905576558890597e-05, + "loss": 0.8723, + "step": 223110 + }, + { + "epoch": 1.4254500849699092, + "grad_norm": 0.8923389911651611, + "learning_rate": 1.9051824444209033e-05, + "loss": 0.6809, + "step": 223120 + }, + { + "epoch": 1.425513972119648, + "grad_norm": 1.6303932666778564, + "learning_rate": 1.9047883611187006e-05, + "loss": 0.8095, + "step": 223130 + }, + { + "epoch": 1.4255778592693866, + "grad_norm": 2.7295801639556885, + "learning_rate": 1.9043943089879557e-05, + "loss": 0.8861, + "step": 223140 + }, + { + "epoch": 1.4256417464191253, + "grad_norm": 0.7593581676483154, + "learning_rate": 
1.9040002880326397e-05, + "loss": 0.8349, + "step": 223150 + }, + { + "epoch": 1.425705633568864, + "grad_norm": 0.8572759628295898, + "learning_rate": 1.9036062982567172e-05, + "loss": 1.0088, + "step": 223160 + }, + { + "epoch": 1.4257695207186027, + "grad_norm": 0.750809371471405, + "learning_rate": 1.9032123396641605e-05, + "loss": 0.8898, + "step": 223170 + }, + { + "epoch": 1.4258334078683412, + "grad_norm": 1.0740511417388916, + "learning_rate": 1.902818412258932e-05, + "loss": 0.7032, + "step": 223180 + }, + { + "epoch": 1.4258972950180802, + "grad_norm": 0.8483978509902954, + "learning_rate": 1.902424516045004e-05, + "loss": 0.8263, + "step": 223190 + }, + { + "epoch": 1.4259611821678186, + "grad_norm": 0.8091272115707397, + "learning_rate": 1.9020306510263392e-05, + "loss": 0.6744, + "step": 223200 + }, + { + "epoch": 1.4260250693175576, + "grad_norm": 0.9533054232597351, + "learning_rate": 1.901636817206907e-05, + "loss": 0.8155, + "step": 223210 + }, + { + "epoch": 1.426088956467296, + "grad_norm": 1.1179161071777344, + "learning_rate": 1.9012430145906714e-05, + "loss": 0.9055, + "step": 223220 + }, + { + "epoch": 1.426152843617035, + "grad_norm": 1.0814383029937744, + "learning_rate": 1.9008492431816005e-05, + "loss": 1.1812, + "step": 223230 + }, + { + "epoch": 1.4262167307667735, + "grad_norm": 1.3198683261871338, + "learning_rate": 1.900455502983658e-05, + "loss": 0.8627, + "step": 223240 + }, + { + "epoch": 1.4262806179165124, + "grad_norm": 2.693085193634033, + "learning_rate": 1.9000617940008107e-05, + "loss": 0.7759, + "step": 223250 + }, + { + "epoch": 1.4263445050662509, + "grad_norm": 1.1490528583526611, + "learning_rate": 1.899668116237022e-05, + "loss": 0.9932, + "step": 223260 + }, + { + "epoch": 1.4264083922159898, + "grad_norm": 1.3757820129394531, + "learning_rate": 1.8992744696962594e-05, + "loss": 0.7928, + "step": 223270 + }, + { + "epoch": 1.4264722793657283, + "grad_norm": 1.5062553882598877, + "learning_rate": 
1.898880854382483e-05, + "loss": 0.9733, + "step": 223280 + }, + { + "epoch": 1.426536166515467, + "grad_norm": 0.8266945481300354, + "learning_rate": 1.898487270299662e-05, + "loss": 0.8319, + "step": 223290 + }, + { + "epoch": 1.4266000536652057, + "grad_norm": 0.8893870115280151, + "learning_rate": 1.898093717451755e-05, + "loss": 0.7207, + "step": 223300 + }, + { + "epoch": 1.4266639408149444, + "grad_norm": 4.273061752319336, + "learning_rate": 1.8977001958427295e-05, + "loss": 0.9901, + "step": 223310 + }, + { + "epoch": 1.4267278279646831, + "grad_norm": 0.5634547472000122, + "learning_rate": 1.8973067054765453e-05, + "loss": 0.793, + "step": 223320 + }, + { + "epoch": 1.4267917151144218, + "grad_norm": 1.0069791078567505, + "learning_rate": 1.8969132463571664e-05, + "loss": 0.8604, + "step": 223330 + }, + { + "epoch": 1.4268556022641605, + "grad_norm": 1.3176125288009644, + "learning_rate": 1.896519818488558e-05, + "loss": 0.7742, + "step": 223340 + }, + { + "epoch": 1.4269194894138992, + "grad_norm": 1.01974356174469, + "learning_rate": 1.8961264218746776e-05, + "loss": 0.854, + "step": 223350 + }, + { + "epoch": 1.426983376563638, + "grad_norm": 1.006564974784851, + "learning_rate": 1.8957330565194915e-05, + "loss": 0.9466, + "step": 223360 + }, + { + "epoch": 1.4270472637133766, + "grad_norm": 0.7808417081832886, + "learning_rate": 1.8953397224269593e-05, + "loss": 0.8487, + "step": 223370 + }, + { + "epoch": 1.4271111508631154, + "grad_norm": 1.5651687383651733, + "learning_rate": 1.8949464196010414e-05, + "loss": 0.8292, + "step": 223380 + }, + { + "epoch": 1.427175038012854, + "grad_norm": 0.9617428779602051, + "learning_rate": 1.894553148045698e-05, + "loss": 0.9321, + "step": 223390 + }, + { + "epoch": 1.4272389251625928, + "grad_norm": 0.7266619801521301, + "learning_rate": 1.8941599077648925e-05, + "loss": 0.8823, + "step": 223400 + }, + { + "epoch": 1.4273028123123315, + "grad_norm": 0.8688862919807434, + "learning_rate": 1.8937666987625817e-05, 
+ "loss": 0.9043, + "step": 223410 + }, + { + "epoch": 1.4273666994620702, + "grad_norm": 1.1119881868362427, + "learning_rate": 1.89337352104273e-05, + "loss": 0.8779, + "step": 223420 + }, + { + "epoch": 1.4274305866118089, + "grad_norm": 1.5051636695861816, + "learning_rate": 1.8929803746092923e-05, + "loss": 1.0522, + "step": 223430 + }, + { + "epoch": 1.4274944737615476, + "grad_norm": 1.0099921226501465, + "learning_rate": 1.8925872594662304e-05, + "loss": 1.1269, + "step": 223440 + }, + { + "epoch": 1.4275583609112863, + "grad_norm": 1.209182620048523, + "learning_rate": 1.8921941756175045e-05, + "loss": 0.8918, + "step": 223450 + }, + { + "epoch": 1.427622248061025, + "grad_norm": 1.6870150566101074, + "learning_rate": 1.8918011230670708e-05, + "loss": 0.8373, + "step": 223460 + }, + { + "epoch": 1.4276861352107637, + "grad_norm": 1.0700960159301758, + "learning_rate": 1.8914081018188895e-05, + "loss": 0.8941, + "step": 223470 + }, + { + "epoch": 1.4277500223605024, + "grad_norm": 0.8452657461166382, + "learning_rate": 1.891015111876917e-05, + "loss": 0.8807, + "step": 223480 + }, + { + "epoch": 1.4278139095102411, + "grad_norm": 0.7489930987358093, + "learning_rate": 1.8906221532451134e-05, + "loss": 0.8034, + "step": 223490 + }, + { + "epoch": 1.4278777966599798, + "grad_norm": 1.563072919845581, + "learning_rate": 1.890229225927433e-05, + "loss": 0.7062, + "step": 223500 + }, + { + "epoch": 1.4279416838097185, + "grad_norm": 0.8351766467094421, + "learning_rate": 1.8898363299278364e-05, + "loss": 0.9484, + "step": 223510 + }, + { + "epoch": 1.4280055709594572, + "grad_norm": 0.9991663098335266, + "learning_rate": 1.8894434652502767e-05, + "loss": 0.7625, + "step": 223520 + }, + { + "epoch": 1.428069458109196, + "grad_norm": 0.950833261013031, + "learning_rate": 1.8890506318987144e-05, + "loss": 0.9521, + "step": 223530 + }, + { + "epoch": 1.4281333452589346, + "grad_norm": 0.8913379311561584, + "learning_rate": 1.8886578298771017e-05, + "loss": 0.7806, + 
"step": 223540 + }, + { + "epoch": 1.4281972324086734, + "grad_norm": 1.2399307489395142, + "learning_rate": 1.888265059189398e-05, + "loss": 0.7326, + "step": 223550 + }, + { + "epoch": 1.428261119558412, + "grad_norm": 0.7313088774681091, + "learning_rate": 1.887872319839556e-05, + "loss": 1.1297, + "step": 223560 + }, + { + "epoch": 1.4283250067081508, + "grad_norm": 1.8566665649414062, + "learning_rate": 1.887479611831533e-05, + "loss": 0.7582, + "step": 223570 + }, + { + "epoch": 1.4283888938578895, + "grad_norm": 1.242580533027649, + "learning_rate": 1.887086935169282e-05, + "loss": 0.946, + "step": 223580 + }, + { + "epoch": 1.4284527810076282, + "grad_norm": 0.9577218294143677, + "learning_rate": 1.8866942898567596e-05, + "loss": 0.9002, + "step": 223590 + }, + { + "epoch": 1.4285166681573669, + "grad_norm": 0.7212193608283997, + "learning_rate": 1.886301675897918e-05, + "loss": 0.8361, + "step": 223600 + }, + { + "epoch": 1.4285805553071056, + "grad_norm": 0.7417780160903931, + "learning_rate": 1.885909093296714e-05, + "loss": 0.8958, + "step": 223610 + }, + { + "epoch": 1.4286444424568443, + "grad_norm": 1.352882981300354, + "learning_rate": 1.8855165420570974e-05, + "loss": 0.7512, + "step": 223620 + }, + { + "epoch": 1.428708329606583, + "grad_norm": 0.7283663153648376, + "learning_rate": 1.8851240221830258e-05, + "loss": 0.7656, + "step": 223630 + }, + { + "epoch": 1.4287722167563217, + "grad_norm": 0.8927812576293945, + "learning_rate": 1.8847315336784477e-05, + "loss": 0.9373, + "step": 223640 + }, + { + "epoch": 1.4288361039060604, + "grad_norm": 1.8338420391082764, + "learning_rate": 1.8843390765473184e-05, + "loss": 0.7657, + "step": 223650 + }, + { + "epoch": 1.4288999910557991, + "grad_norm": 0.784820556640625, + "learning_rate": 1.8839466507935923e-05, + "loss": 0.9875, + "step": 223660 + }, + { + "epoch": 1.4289638782055376, + "grad_norm": 0.7442781329154968, + "learning_rate": 1.8835542564212168e-05, + "loss": 1.0937, + "step": 223670 + }, + 
{ + "epoch": 1.4290277653552765, + "grad_norm": 1.2108781337738037, + "learning_rate": 1.883161893434148e-05, + "loss": 0.791, + "step": 223680 + }, + { + "epoch": 1.429091652505015, + "grad_norm": 1.1448874473571777, + "learning_rate": 1.8827695618363334e-05, + "loss": 1.29, + "step": 223690 + }, + { + "epoch": 1.429155539654754, + "grad_norm": 0.8458133935928345, + "learning_rate": 1.882377261631728e-05, + "loss": 0.8695, + "step": 223700 + }, + { + "epoch": 1.4292194268044924, + "grad_norm": 0.9109838604927063, + "learning_rate": 1.8819849928242793e-05, + "loss": 0.8017, + "step": 223710 + }, + { + "epoch": 1.4292833139542314, + "grad_norm": 1.242732286453247, + "learning_rate": 1.8815927554179408e-05, + "loss": 0.7131, + "step": 223720 + }, + { + "epoch": 1.4293472011039698, + "grad_norm": 0.6500130295753479, + "learning_rate": 1.881200549416659e-05, + "loss": 0.7161, + "step": 223730 + }, + { + "epoch": 1.4294110882537088, + "grad_norm": 0.7395265102386475, + "learning_rate": 1.8808083748243878e-05, + "loss": 0.7819, + "step": 223740 + }, + { + "epoch": 1.4294749754034473, + "grad_norm": 0.9619272947311401, + "learning_rate": 1.880416231645073e-05, + "loss": 0.8497, + "step": 223750 + }, + { + "epoch": 1.4295388625531862, + "grad_norm": 0.80619215965271, + "learning_rate": 1.8800241198826675e-05, + "loss": 0.9684, + "step": 223760 + }, + { + "epoch": 1.4296027497029247, + "grad_norm": 1.2067186832427979, + "learning_rate": 1.879632039541116e-05, + "loss": 0.8832, + "step": 223770 + }, + { + "epoch": 1.4296666368526634, + "grad_norm": 0.5644568204879761, + "learning_rate": 1.8792399906243712e-05, + "loss": 0.9305, + "step": 223780 + }, + { + "epoch": 1.429730524002402, + "grad_norm": 0.5826699733734131, + "learning_rate": 1.8788479731363785e-05, + "loss": 1.0084, + "step": 223790 + }, + { + "epoch": 1.4297944111521408, + "grad_norm": 0.9067062735557556, + "learning_rate": 1.878455987081088e-05, + "loss": 0.9709, + "step": 223800 + }, + { + "epoch": 
1.4298582983018795, + "grad_norm": 1.0680228471755981, + "learning_rate": 1.8781032265095477e-05, + "loss": 0.7959, + "step": 223810 + }, + { + "epoch": 1.4299221854516182, + "grad_norm": 0.8127917647361755, + "learning_rate": 1.8777113001872633e-05, + "loss": 0.8962, + "step": 223820 + }, + { + "epoch": 1.429986072601357, + "grad_norm": 1.4489061832427979, + "learning_rate": 1.8773194053091285e-05, + "loss": 0.8774, + "step": 223830 + }, + { + "epoch": 1.4300499597510956, + "grad_norm": 1.1648927927017212, + "learning_rate": 1.876927541879088e-05, + "loss": 1.0562, + "step": 223840 + }, + { + "epoch": 1.4301138469008343, + "grad_norm": 1.0999516248703003, + "learning_rate": 1.8765357099010898e-05, + "loss": 0.6795, + "step": 223850 + }, + { + "epoch": 1.430177734050573, + "grad_norm": 0.9541165828704834, + "learning_rate": 1.876143909379078e-05, + "loss": 0.8735, + "step": 223860 + }, + { + "epoch": 1.4302416212003117, + "grad_norm": 0.7938796877861023, + "learning_rate": 1.875752140317001e-05, + "loss": 0.8024, + "step": 223870 + }, + { + "epoch": 1.4303055083500504, + "grad_norm": 1.1307613849639893, + "learning_rate": 1.875360402718801e-05, + "loss": 0.8533, + "step": 223880 + }, + { + "epoch": 1.4303693954997891, + "grad_norm": 1.4370204210281372, + "learning_rate": 1.8749686965884273e-05, + "loss": 0.81, + "step": 223890 + }, + { + "epoch": 1.4304332826495278, + "grad_norm": 0.9484549164772034, + "learning_rate": 1.8745770219298196e-05, + "loss": 1.0642, + "step": 223900 + }, + { + "epoch": 1.4304971697992666, + "grad_norm": 0.7258232831954956, + "learning_rate": 1.8741853787469278e-05, + "loss": 1.0068, + "step": 223910 + }, + { + "epoch": 1.4305610569490053, + "grad_norm": 0.8766778111457825, + "learning_rate": 1.8737937670436912e-05, + "loss": 0.7022, + "step": 223920 + }, + { + "epoch": 1.430624944098744, + "grad_norm": 0.6516861319541931, + "learning_rate": 1.873402186824058e-05, + "loss": 0.8096, + "step": 223930 + }, + { + "epoch": 1.4306888312484827, 
+ "grad_norm": 0.9304824471473694, + "learning_rate": 1.8730106380919676e-05, + "loss": 0.7141, + "step": 223940 + }, + { + "epoch": 1.4307527183982214, + "grad_norm": 0.8284528255462646, + "learning_rate": 1.8726191208513673e-05, + "loss": 0.7919, + "step": 223950 + }, + { + "epoch": 1.43081660554796, + "grad_norm": 1.0690491199493408, + "learning_rate": 1.8722276351061963e-05, + "loss": 0.8234, + "step": 223960 + }, + { + "epoch": 1.4308804926976988, + "grad_norm": 1.2148635387420654, + "learning_rate": 1.8718361808603984e-05, + "loss": 0.7975, + "step": 223970 + }, + { + "epoch": 1.4309443798474375, + "grad_norm": 0.83355712890625, + "learning_rate": 1.871444758117919e-05, + "loss": 1.0509, + "step": 223980 + }, + { + "epoch": 1.4310082669971762, + "grad_norm": 0.8912057280540466, + "learning_rate": 1.8710533668826953e-05, + "loss": 0.9617, + "step": 223990 + }, + { + "epoch": 1.431072154146915, + "grad_norm": 0.8748745918273926, + "learning_rate": 1.8706620071586745e-05, + "loss": 0.7828, + "step": 224000 + }, + { + "epoch": 1.4311360412966536, + "grad_norm": 0.9734835028648376, + "learning_rate": 1.870270678949791e-05, + "loss": 0.9372, + "step": 224010 + }, + { + "epoch": 1.4311999284463923, + "grad_norm": 1.132683277130127, + "learning_rate": 1.8698793822599915e-05, + "loss": 0.6881, + "step": 224020 + }, + { + "epoch": 1.431263815596131, + "grad_norm": 1.0529520511627197, + "learning_rate": 1.869488117093212e-05, + "loss": 0.9852, + "step": 224030 + }, + { + "epoch": 1.4313277027458697, + "grad_norm": 1.1331969499588013, + "learning_rate": 1.8690968834533984e-05, + "loss": 0.5732, + "step": 224040 + }, + { + "epoch": 1.4313915898956084, + "grad_norm": 1.1039819717407227, + "learning_rate": 1.8687056813444854e-05, + "loss": 0.8991, + "step": 224050 + }, + { + "epoch": 1.4314554770453471, + "grad_norm": 0.7155099511146545, + "learning_rate": 1.8683145107704163e-05, + "loss": 0.7931, + "step": 224060 + }, + { + "epoch": 1.4315193641950859, + "grad_norm": 
3.0403034687042236, + "learning_rate": 1.8679233717351284e-05, + "loss": 0.8228, + "step": 224070 + }, + { + "epoch": 1.4315832513448246, + "grad_norm": 1.0688437223434448, + "learning_rate": 1.8675322642425617e-05, + "loss": 1.0601, + "step": 224080 + }, + { + "epoch": 1.4316471384945633, + "grad_norm": 1.0098079442977905, + "learning_rate": 1.8671411882966567e-05, + "loss": 0.7916, + "step": 224090 + }, + { + "epoch": 1.431711025644302, + "grad_norm": 0.9161421060562134, + "learning_rate": 1.8667501439013487e-05, + "loss": 0.7391, + "step": 224100 + }, + { + "epoch": 1.4317749127940407, + "grad_norm": 0.9645119309425354, + "learning_rate": 1.8663591310605786e-05, + "loss": 1.0217, + "step": 224110 + }, + { + "epoch": 1.4318387999437794, + "grad_norm": 1.7049353122711182, + "learning_rate": 1.8659681497782823e-05, + "loss": 0.7817, + "step": 224120 + }, + { + "epoch": 1.431902687093518, + "grad_norm": 1.1278208494186401, + "learning_rate": 1.8655772000583987e-05, + "loss": 0.778, + "step": 224130 + }, + { + "epoch": 1.4319665742432566, + "grad_norm": 1.1097828149795532, + "learning_rate": 1.865186281904863e-05, + "loss": 0.9847, + "step": 224140 + }, + { + "epoch": 1.4320304613929955, + "grad_norm": 0.9624657034873962, + "learning_rate": 1.864795395321615e-05, + "loss": 0.6916, + "step": 224150 + }, + { + "epoch": 1.432094348542734, + "grad_norm": 0.8190516829490662, + "learning_rate": 1.8644045403125886e-05, + "loss": 0.793, + "step": 224160 + }, + { + "epoch": 1.432158235692473, + "grad_norm": 0.8392983675003052, + "learning_rate": 1.8640137168817223e-05, + "loss": 0.833, + "step": 224170 + }, + { + "epoch": 1.4322221228422114, + "grad_norm": 1.075068712234497, + "learning_rate": 1.8636229250329497e-05, + "loss": 1.0602, + "step": 224180 + }, + { + "epoch": 1.4322860099919503, + "grad_norm": 1.0051250457763672, + "learning_rate": 1.863232164770209e-05, + "loss": 0.7617, + "step": 224190 + }, + { + "epoch": 1.4323498971416888, + "grad_norm": 1.2177984714508057, + 
"learning_rate": 1.8628414360974323e-05, + "loss": 0.7633, + "step": 224200 + }, + { + "epoch": 1.4324137842914277, + "grad_norm": 0.8915864825248718, + "learning_rate": 1.862450739018558e-05, + "loss": 1.1079, + "step": 224210 + }, + { + "epoch": 1.4324776714411662, + "grad_norm": 0.4424319565296173, + "learning_rate": 1.8620600735375176e-05, + "loss": 0.646, + "step": 224220 + }, + { + "epoch": 1.4325415585909052, + "grad_norm": 1.0826150178909302, + "learning_rate": 1.8616694396582484e-05, + "loss": 0.9242, + "step": 224230 + }, + { + "epoch": 1.4326054457406436, + "grad_norm": 0.7918615937232971, + "learning_rate": 1.8612788373846817e-05, + "loss": 1.115, + "step": 224240 + }, + { + "epoch": 1.4326693328903826, + "grad_norm": 0.8647782206535339, + "learning_rate": 1.860888266720754e-05, + "loss": 0.8596, + "step": 224250 + }, + { + "epoch": 1.432733220040121, + "grad_norm": 0.8612581491470337, + "learning_rate": 1.8604977276703955e-05, + "loss": 0.7004, + "step": 224260 + }, + { + "epoch": 1.4327971071898598, + "grad_norm": 0.9314303398132324, + "learning_rate": 1.8601072202375423e-05, + "loss": 0.8431, + "step": 224270 + }, + { + "epoch": 1.4328609943395985, + "grad_norm": 0.630257248878479, + "learning_rate": 1.8597167444261247e-05, + "loss": 0.7244, + "step": 224280 + }, + { + "epoch": 1.4329248814893372, + "grad_norm": 0.8772875070571899, + "learning_rate": 1.8593263002400758e-05, + "loss": 0.7054, + "step": 224290 + }, + { + "epoch": 1.4329887686390759, + "grad_norm": 0.9034510254859924, + "learning_rate": 1.85893588768333e-05, + "loss": 0.7146, + "step": 224300 + }, + { + "epoch": 1.4330526557888146, + "grad_norm": 1.1122403144836426, + "learning_rate": 1.8585455067598156e-05, + "loss": 0.9885, + "step": 224310 + }, + { + "epoch": 1.4331165429385533, + "grad_norm": 0.8106998205184937, + "learning_rate": 1.8581551574734675e-05, + "loss": 1.0353, + "step": 224320 + }, + { + "epoch": 1.433180430088292, + "grad_norm": 0.9256942272186279, + "learning_rate": 
1.8577648398282127e-05, + "loss": 0.9948, + "step": 224330 + }, + { + "epoch": 1.4332443172380307, + "grad_norm": 1.1601678133010864, + "learning_rate": 1.8573745538279864e-05, + "loss": 0.9708, + "step": 224340 + }, + { + "epoch": 1.4333082043877694, + "grad_norm": 0.8076596260070801, + "learning_rate": 1.8569842994767156e-05, + "loss": 0.9267, + "step": 224350 + }, + { + "epoch": 1.433372091537508, + "grad_norm": 0.8912524580955505, + "learning_rate": 1.8565940767783336e-05, + "loss": 1.119, + "step": 224360 + }, + { + "epoch": 1.4334359786872468, + "grad_norm": 1.3349778652191162, + "learning_rate": 1.856203885736767e-05, + "loss": 0.9859, + "step": 224370 + }, + { + "epoch": 1.4334998658369855, + "grad_norm": 3.2930209636688232, + "learning_rate": 1.8558137263559484e-05, + "loss": 1.0891, + "step": 224380 + }, + { + "epoch": 1.4335637529867242, + "grad_norm": 0.980534017086029, + "learning_rate": 1.8554235986398045e-05, + "loss": 0.8293, + "step": 224390 + }, + { + "epoch": 1.433627640136463, + "grad_norm": 0.53654944896698, + "learning_rate": 1.8550335025922667e-05, + "loss": 0.896, + "step": 224400 + }, + { + "epoch": 1.4336915272862016, + "grad_norm": 1.1580787897109985, + "learning_rate": 1.854643438217261e-05, + "loss": 1.0501, + "step": 224410 + }, + { + "epoch": 1.4337554144359403, + "grad_norm": 0.7166798710823059, + "learning_rate": 1.8542534055187182e-05, + "loss": 0.7975, + "step": 224420 + }, + { + "epoch": 1.433819301585679, + "grad_norm": 0.838466227054596, + "learning_rate": 1.8538634045005637e-05, + "loss": 0.8701, + "step": 224430 + }, + { + "epoch": 1.4338831887354178, + "grad_norm": 1.2870148420333862, + "learning_rate": 1.8534734351667284e-05, + "loss": 0.8109, + "step": 224440 + }, + { + "epoch": 1.4339470758851565, + "grad_norm": 2.708895206451416, + "learning_rate": 1.853083497521136e-05, + "loss": 0.7707, + "step": 224450 + }, + { + "epoch": 1.4340109630348952, + "grad_norm": 0.9632241725921631, + "learning_rate": 1.8526935915677168e-05, 
+ "loss": 0.7554, + "step": 224460 + }, + { + "epoch": 1.4340748501846339, + "grad_norm": 1.179732084274292, + "learning_rate": 1.8523037173103942e-05, + "loss": 0.8122, + "step": 224470 + }, + { + "epoch": 1.4341387373343726, + "grad_norm": 1.2000482082366943, + "learning_rate": 1.8519138747530978e-05, + "loss": 0.8552, + "step": 224480 + }, + { + "epoch": 1.4342026244841113, + "grad_norm": 1.123895287513733, + "learning_rate": 1.8515240638997523e-05, + "loss": 0.9792, + "step": 224490 + }, + { + "epoch": 1.43426651163385, + "grad_norm": 0.743156373500824, + "learning_rate": 1.851134284754282e-05, + "loss": 1.0077, + "step": 224500 + }, + { + "epoch": 1.4343303987835887, + "grad_norm": 0.8791418671607971, + "learning_rate": 1.8507445373206143e-05, + "loss": 0.8268, + "step": 224510 + }, + { + "epoch": 1.4343942859333274, + "grad_norm": 0.8102611303329468, + "learning_rate": 1.850354821602673e-05, + "loss": 0.8767, + "step": 224520 + }, + { + "epoch": 1.4344581730830661, + "grad_norm": 1.0167714357376099, + "learning_rate": 1.8499651376043846e-05, + "loss": 0.9542, + "step": 224530 + }, + { + "epoch": 1.4345220602328048, + "grad_norm": 1.4743082523345947, + "learning_rate": 1.849575485329671e-05, + "loss": 0.7113, + "step": 224540 + }, + { + "epoch": 1.4345859473825435, + "grad_norm": 0.9514703154563904, + "learning_rate": 1.8491858647824595e-05, + "loss": 1.0415, + "step": 224550 + }, + { + "epoch": 1.4346498345322822, + "grad_norm": 0.9601063132286072, + "learning_rate": 1.8487962759666706e-05, + "loss": 0.7705, + "step": 224560 + }, + { + "epoch": 1.434713721682021, + "grad_norm": 0.943200409412384, + "learning_rate": 1.8484067188862304e-05, + "loss": 0.7712, + "step": 224570 + }, + { + "epoch": 1.4347776088317596, + "grad_norm": 0.9697480201721191, + "learning_rate": 1.8480171935450597e-05, + "loss": 0.8612, + "step": 224580 + }, + { + "epoch": 1.4348414959814983, + "grad_norm": 0.8412326574325562, + "learning_rate": 1.847627699947085e-05, + "loss": 0.9723, + 
"step": 224590 + }, + { + "epoch": 1.434905383131237, + "grad_norm": 1.015438199043274, + "learning_rate": 1.847238238096224e-05, + "loss": 0.7827, + "step": 224600 + }, + { + "epoch": 1.4349692702809758, + "grad_norm": 1.5689594745635986, + "learning_rate": 1.8468488079964018e-05, + "loss": 0.7896, + "step": 224610 + }, + { + "epoch": 1.4350331574307145, + "grad_norm": 0.921557605266571, + "learning_rate": 1.846459409651542e-05, + "loss": 0.7594, + "step": 224620 + }, + { + "epoch": 1.435097044580453, + "grad_norm": 1.0404754877090454, + "learning_rate": 1.846070043065562e-05, + "loss": 0.8123, + "step": 224630 + }, + { + "epoch": 1.4351609317301919, + "grad_norm": 1.056254267692566, + "learning_rate": 1.845680708242387e-05, + "loss": 1.0377, + "step": 224640 + }, + { + "epoch": 1.4352248188799304, + "grad_norm": 0.922844648361206, + "learning_rate": 1.8452914051859344e-05, + "loss": 0.9183, + "step": 224650 + }, + { + "epoch": 1.4352887060296693, + "grad_norm": 0.7790452837944031, + "learning_rate": 1.844902133900128e-05, + "loss": 0.8256, + "step": 224660 + }, + { + "epoch": 1.4353525931794078, + "grad_norm": 1.0011779069900513, + "learning_rate": 1.8445128943888858e-05, + "loss": 1.1991, + "step": 224670 + }, + { + "epoch": 1.4354164803291467, + "grad_norm": 0.6932054162025452, + "learning_rate": 1.8441236866561296e-05, + "loss": 1.0215, + "step": 224680 + }, + { + "epoch": 1.4354803674788852, + "grad_norm": 1.6760739088058472, + "learning_rate": 1.843734510705777e-05, + "loss": 1.1536, + "step": 224690 + }, + { + "epoch": 1.4355442546286241, + "grad_norm": 0.8367016315460205, + "learning_rate": 1.8433453665417493e-05, + "loss": 0.9362, + "step": 224700 + }, + { + "epoch": 1.4356081417783626, + "grad_norm": 0.8584269881248474, + "learning_rate": 1.8429562541679633e-05, + "loss": 0.8732, + "step": 224710 + }, + { + "epoch": 1.4356720289281015, + "grad_norm": 0.7792062163352966, + "learning_rate": 1.842567173588341e-05, + "loss": 0.7278, + "step": 224720 + }, + { 
+ "epoch": 1.43573591607784, + "grad_norm": 0.6521714925765991, + "learning_rate": 1.842178124806797e-05, + "loss": 0.8583, + "step": 224730 + }, + { + "epoch": 1.435799803227579, + "grad_norm": 0.612637460231781, + "learning_rate": 1.841789107827253e-05, + "loss": 0.5808, + "step": 224740 + }, + { + "epoch": 1.4358636903773174, + "grad_norm": 1.4582802057266235, + "learning_rate": 1.841400122653623e-05, + "loss": 0.7482, + "step": 224750 + }, + { + "epoch": 1.4359275775270561, + "grad_norm": 0.7831106781959534, + "learning_rate": 1.8410111692898286e-05, + "loss": 0.8334, + "step": 224760 + }, + { + "epoch": 1.4359914646767948, + "grad_norm": 1.045304536819458, + "learning_rate": 1.8406222477397822e-05, + "loss": 0.8801, + "step": 224770 + }, + { + "epoch": 1.4360553518265335, + "grad_norm": 0.7949215173721313, + "learning_rate": 1.840233358007405e-05, + "loss": 0.7311, + "step": 224780 + }, + { + "epoch": 1.4361192389762722, + "grad_norm": 0.5786644816398621, + "learning_rate": 1.83984450009661e-05, + "loss": 0.8258, + "step": 224790 + }, + { + "epoch": 1.436183126126011, + "grad_norm": 0.8888071775436401, + "learning_rate": 1.8394556740113162e-05, + "loss": 0.907, + "step": 224800 + }, + { + "epoch": 1.4362470132757497, + "grad_norm": 1.349461555480957, + "learning_rate": 1.8390668797554367e-05, + "loss": 1.0114, + "step": 224810 + }, + { + "epoch": 1.4363109004254884, + "grad_norm": 0.9334039688110352, + "learning_rate": 1.8386781173328877e-05, + "loss": 0.9106, + "step": 224820 + }, + { + "epoch": 1.436374787575227, + "grad_norm": 0.9088615775108337, + "learning_rate": 1.838289386747587e-05, + "loss": 1.1062, + "step": 224830 + }, + { + "epoch": 1.4364386747249658, + "grad_norm": 0.9782391786575317, + "learning_rate": 1.837900688003446e-05, + "loss": 0.9834, + "step": 224840 + }, + { + "epoch": 1.4365025618747045, + "grad_norm": 0.988823652267456, + "learning_rate": 1.8375120211043823e-05, + "loss": 0.743, + "step": 224850 + }, + { + "epoch": 
1.4365664490244432, + "grad_norm": 1.2976255416870117, + "learning_rate": 1.8371233860543063e-05, + "loss": 0.829, + "step": 224860 + }, + { + "epoch": 1.436630336174182, + "grad_norm": 0.9039852619171143, + "learning_rate": 1.8367347828571364e-05, + "loss": 0.9521, + "step": 224870 + }, + { + "epoch": 1.4366942233239206, + "grad_norm": 1.0037953853607178, + "learning_rate": 1.8363462115167818e-05, + "loss": 0.8553, + "step": 224880 + }, + { + "epoch": 1.4367581104736593, + "grad_norm": 1.209885835647583, + "learning_rate": 1.8359576720371595e-05, + "loss": 0.9151, + "step": 224890 + }, + { + "epoch": 1.436821997623398, + "grad_norm": 0.9278016090393066, + "learning_rate": 1.835569164422179e-05, + "loss": 0.8871, + "step": 224900 + }, + { + "epoch": 1.4368858847731367, + "grad_norm": 1.0095714330673218, + "learning_rate": 1.8351806886757565e-05, + "loss": 0.7992, + "step": 224910 + }, + { + "epoch": 1.4369497719228754, + "grad_norm": 1.1271530389785767, + "learning_rate": 1.8347922448018007e-05, + "loss": 0.8998, + "step": 224920 + }, + { + "epoch": 1.4370136590726141, + "grad_norm": 2.4450058937072754, + "learning_rate": 1.8344038328042267e-05, + "loss": 0.8528, + "step": 224930 + }, + { + "epoch": 1.4370775462223528, + "grad_norm": 1.007347583770752, + "learning_rate": 1.8340154526869437e-05, + "loss": 0.7698, + "step": 224940 + }, + { + "epoch": 1.4371414333720915, + "grad_norm": 1.0634715557098389, + "learning_rate": 1.833627104453865e-05, + "loss": 0.9392, + "step": 224950 + }, + { + "epoch": 1.4372053205218303, + "grad_norm": 0.6562912464141846, + "learning_rate": 1.8332387881088993e-05, + "loss": 1.0421, + "step": 224960 + }, + { + "epoch": 1.437269207671569, + "grad_norm": 0.7901965379714966, + "learning_rate": 1.832850503655961e-05, + "loss": 0.8792, + "step": 224970 + }, + { + "epoch": 1.4373330948213077, + "grad_norm": 0.9408652186393738, + "learning_rate": 1.832462251098957e-05, + "loss": 0.9602, + "step": 224980 + }, + { + "epoch": 1.4373969819710464, 
+ "grad_norm": 0.5572311878204346, + "learning_rate": 1.832074030441797e-05, + "loss": 0.8207, + "step": 224990 + }, + { + "epoch": 1.437460869120785, + "grad_norm": 0.9873363971710205, + "learning_rate": 1.831685841688394e-05, + "loss": 0.8795, + "step": 225000 + }, + { + "epoch": 1.4375247562705238, + "grad_norm": 1.398949146270752, + "learning_rate": 1.831297684842654e-05, + "loss": 1.1101, + "step": 225010 + }, + { + "epoch": 1.4375886434202625, + "grad_norm": 1.1199678182601929, + "learning_rate": 1.8309095599084893e-05, + "loss": 0.8379, + "step": 225020 + }, + { + "epoch": 1.4376525305700012, + "grad_norm": 0.8787399530410767, + "learning_rate": 1.8305214668898053e-05, + "loss": 1.0063, + "step": 225030 + }, + { + "epoch": 1.43771641771974, + "grad_norm": 0.870714545249939, + "learning_rate": 1.8301334057905134e-05, + "loss": 0.6885, + "step": 225040 + }, + { + "epoch": 1.4377803048694786, + "grad_norm": 0.7601709365844727, + "learning_rate": 1.8297453766145194e-05, + "loss": 0.7629, + "step": 225050 + }, + { + "epoch": 1.4378441920192173, + "grad_norm": 1.3224676847457886, + "learning_rate": 1.8293573793657332e-05, + "loss": 0.8404, + "step": 225060 + }, + { + "epoch": 1.437908079168956, + "grad_norm": 1.1418801546096802, + "learning_rate": 1.82896941404806e-05, + "loss": 0.7309, + "step": 225070 + }, + { + "epoch": 1.4379719663186947, + "grad_norm": 0.9552497863769531, + "learning_rate": 1.8285814806654096e-05, + "loss": 0.8668, + "step": 225080 + }, + { + "epoch": 1.4380358534684334, + "grad_norm": 1.5807976722717285, + "learning_rate": 1.8281935792216852e-05, + "loss": 0.9506, + "step": 225090 + }, + { + "epoch": 1.4380997406181721, + "grad_norm": 0.8164640665054321, + "learning_rate": 1.827805709720798e-05, + "loss": 0.7146, + "step": 225100 + }, + { + "epoch": 1.4381636277679108, + "grad_norm": 0.911655843257904, + "learning_rate": 1.8274178721666496e-05, + "loss": 0.8573, + "step": 225110 + }, + { + "epoch": 1.4382275149176493, + "grad_norm": 
0.742917537689209, + "learning_rate": 1.8270300665631497e-05, + "loss": 0.9148, + "step": 225120 + }, + { + "epoch": 1.4382914020673883, + "grad_norm": 0.615420937538147, + "learning_rate": 1.8266422929142002e-05, + "loss": 0.606, + "step": 225130 + }, + { + "epoch": 1.4383552892171267, + "grad_norm": 0.9167844653129578, + "learning_rate": 1.826254551223708e-05, + "loss": 0.944, + "step": 225140 + }, + { + "epoch": 1.4384191763668657, + "grad_norm": 0.9639812707901001, + "learning_rate": 1.8258668414955797e-05, + "loss": 0.6993, + "step": 225150 + }, + { + "epoch": 1.4384830635166042, + "grad_norm": 1.29408597946167, + "learning_rate": 1.825479163733717e-05, + "loss": 1.037, + "step": 225160 + }, + { + "epoch": 1.438546950666343, + "grad_norm": 0.8886136412620544, + "learning_rate": 1.8250915179420273e-05, + "loss": 0.841, + "step": 225170 + }, + { + "epoch": 1.4386108378160816, + "grad_norm": 1.0680568218231201, + "learning_rate": 1.8247039041244108e-05, + "loss": 0.8608, + "step": 225180 + }, + { + "epoch": 1.4386747249658205, + "grad_norm": 0.783477783203125, + "learning_rate": 1.824316322284775e-05, + "loss": 0.8877, + "step": 225190 + }, + { + "epoch": 1.438738612115559, + "grad_norm": 0.8530393838882446, + "learning_rate": 1.8239287724270187e-05, + "loss": 0.9734, + "step": 225200 + }, + { + "epoch": 1.438802499265298, + "grad_norm": 0.8309034109115601, + "learning_rate": 1.82354125455505e-05, + "loss": 0.9732, + "step": 225210 + }, + { + "epoch": 1.4388663864150364, + "grad_norm": 0.9732787609100342, + "learning_rate": 1.8231537686727667e-05, + "loss": 1.0601, + "step": 225220 + }, + { + "epoch": 1.4389302735647753, + "grad_norm": 1.3157035112380981, + "learning_rate": 1.822766314784075e-05, + "loss": 0.7097, + "step": 225230 + }, + { + "epoch": 1.4389941607145138, + "grad_norm": 1.3295843601226807, + "learning_rate": 1.8223788928928738e-05, + "loss": 0.7674, + "step": 225240 + }, + { + "epoch": 1.4390580478642525, + "grad_norm": 1.2309082746505737, + 
"learning_rate": 1.8219915030030675e-05, + "loss": 0.851, + "step": 225250 + }, + { + "epoch": 1.4391219350139912, + "grad_norm": 1.1842759847640991, + "learning_rate": 1.8216041451185545e-05, + "loss": 0.765, + "step": 225260 + }, + { + "epoch": 1.43918582216373, + "grad_norm": 1.7547402381896973, + "learning_rate": 1.8212168192432395e-05, + "loss": 0.9925, + "step": 225270 + }, + { + "epoch": 1.4392497093134686, + "grad_norm": 0.7879651188850403, + "learning_rate": 1.8208295253810186e-05, + "loss": 0.9249, + "step": 225280 + }, + { + "epoch": 1.4393135964632073, + "grad_norm": 0.8185400366783142, + "learning_rate": 1.8204422635357975e-05, + "loss": 0.8169, + "step": 225290 + }, + { + "epoch": 1.439377483612946, + "grad_norm": 1.1932501792907715, + "learning_rate": 1.8200550337114715e-05, + "loss": 0.9108, + "step": 225300 + }, + { + "epoch": 1.4394413707626847, + "grad_norm": 0.8779699206352234, + "learning_rate": 1.819667835911944e-05, + "loss": 0.9995, + "step": 225310 + }, + { + "epoch": 1.4395052579124235, + "grad_norm": 0.8722737431526184, + "learning_rate": 1.8192806701411107e-05, + "loss": 0.9463, + "step": 225320 + }, + { + "epoch": 1.4395691450621622, + "grad_norm": 1.0146985054016113, + "learning_rate": 1.8188935364028747e-05, + "loss": 0.7525, + "step": 225330 + }, + { + "epoch": 1.4396330322119009, + "grad_norm": 0.8703792095184326, + "learning_rate": 1.8185064347011317e-05, + "loss": 0.8022, + "step": 225340 + }, + { + "epoch": 1.4396969193616396, + "grad_norm": 0.5923279523849487, + "learning_rate": 1.8181193650397805e-05, + "loss": 1.019, + "step": 225350 + }, + { + "epoch": 1.4397608065113783, + "grad_norm": 0.685735821723938, + "learning_rate": 1.817732327422722e-05, + "loss": 0.927, + "step": 225360 + }, + { + "epoch": 1.439824693661117, + "grad_norm": 0.9039795398712158, + "learning_rate": 1.8173453218538505e-05, + "loss": 0.9359, + "step": 225370 + }, + { + "epoch": 1.4398885808108557, + "grad_norm": 1.2408785820007324, + "learning_rate": 
1.816958348337066e-05, + "loss": 0.8988, + "step": 225380 + }, + { + "epoch": 1.4399524679605944, + "grad_norm": 1.1909972429275513, + "learning_rate": 1.816571406876264e-05, + "loss": 1.0245, + "step": 225390 + }, + { + "epoch": 1.440016355110333, + "grad_norm": 1.1718319654464722, + "learning_rate": 1.8161844974753427e-05, + "loss": 0.7103, + "step": 225400 + }, + { + "epoch": 1.4400802422600718, + "grad_norm": 1.1767466068267822, + "learning_rate": 1.815797620138197e-05, + "loss": 1.0089, + "step": 225410 + }, + { + "epoch": 1.4401441294098105, + "grad_norm": 1.2119587659835815, + "learning_rate": 1.8154107748687254e-05, + "loss": 0.9051, + "step": 225420 + }, + { + "epoch": 1.4402080165595492, + "grad_norm": 0.7241948843002319, + "learning_rate": 1.815023961670821e-05, + "loss": 0.7867, + "step": 225430 + }, + { + "epoch": 1.440271903709288, + "grad_norm": 1.3505481481552124, + "learning_rate": 1.8146371805483813e-05, + "loss": 0.8559, + "step": 225440 + }, + { + "epoch": 1.4403357908590266, + "grad_norm": 0.9351546764373779, + "learning_rate": 1.8142504315053006e-05, + "loss": 1.0054, + "step": 225450 + }, + { + "epoch": 1.4403996780087653, + "grad_norm": 1.3706575632095337, + "learning_rate": 1.813863714545475e-05, + "loss": 1.1069, + "step": 225460 + }, + { + "epoch": 1.440463565158504, + "grad_norm": 0.9399012923240662, + "learning_rate": 1.8134770296727984e-05, + "loss": 0.8457, + "step": 225470 + }, + { + "epoch": 1.4405274523082428, + "grad_norm": 1.141176462173462, + "learning_rate": 1.8130903768911628e-05, + "loss": 0.6414, + "step": 225480 + }, + { + "epoch": 1.4405913394579815, + "grad_norm": 2.1579673290252686, + "learning_rate": 1.8127037562044662e-05, + "loss": 0.9051, + "step": 225490 + }, + { + "epoch": 1.4406552266077202, + "grad_norm": 1.083505392074585, + "learning_rate": 1.8123171676165985e-05, + "loss": 0.6264, + "step": 225500 + }, + { + "epoch": 1.4407191137574589, + "grad_norm": 1.1727869510650635, + "learning_rate": 
1.8119306111314567e-05, + "loss": 0.9338, + "step": 225510 + }, + { + "epoch": 1.4407830009071976, + "grad_norm": 0.8360462188720703, + "learning_rate": 1.8115440867529293e-05, + "loss": 0.9365, + "step": 225520 + }, + { + "epoch": 1.4408468880569363, + "grad_norm": 0.9862266182899475, + "learning_rate": 1.8111575944849135e-05, + "loss": 0.8103, + "step": 225530 + }, + { + "epoch": 1.440910775206675, + "grad_norm": 0.784279465675354, + "learning_rate": 1.8107711343312977e-05, + "loss": 0.8937, + "step": 225540 + }, + { + "epoch": 1.4409746623564137, + "grad_norm": 0.7874135971069336, + "learning_rate": 1.8103847062959772e-05, + "loss": 0.9192, + "step": 225550 + }, + { + "epoch": 1.4410385495061524, + "grad_norm": 0.7712077498435974, + "learning_rate": 1.8099983103828404e-05, + "loss": 0.7963, + "step": 225560 + }, + { + "epoch": 1.441102436655891, + "grad_norm": 1.632049560546875, + "learning_rate": 1.809611946595782e-05, + "loss": 1.0526, + "step": 225570 + }, + { + "epoch": 1.4411663238056298, + "grad_norm": 1.2270588874816895, + "learning_rate": 1.8092256149386904e-05, + "loss": 0.9838, + "step": 225580 + }, + { + "epoch": 1.4412302109553685, + "grad_norm": 0.9508424401283264, + "learning_rate": 1.808839315415458e-05, + "loss": 1.1952, + "step": 225590 + }, + { + "epoch": 1.4412940981051072, + "grad_norm": 1.569566249847412, + "learning_rate": 1.8084530480299734e-05, + "loss": 0.6849, + "step": 225600 + }, + { + "epoch": 1.4413579852548457, + "grad_norm": 1.066719651222229, + "learning_rate": 1.8080668127861294e-05, + "loss": 1.0987, + "step": 225610 + }, + { + "epoch": 1.4414218724045846, + "grad_norm": 3.388587236404419, + "learning_rate": 1.807680609687812e-05, + "loss": 0.8551, + "step": 225620 + }, + { + "epoch": 1.4414857595543231, + "grad_norm": 0.7674310207366943, + "learning_rate": 1.8072944387389144e-05, + "loss": 0.6982, + "step": 225630 + }, + { + "epoch": 1.441549646704062, + "grad_norm": 0.7472887635231018, + "learning_rate": 
1.8069082999433223e-05, + "loss": 0.8768, + "step": 225640 + }, + { + "epoch": 1.4416135338538005, + "grad_norm": 0.6819210052490234, + "learning_rate": 1.8065221933049277e-05, + "loss": 0.6991, + "step": 225650 + }, + { + "epoch": 1.4416774210035395, + "grad_norm": 1.1873059272766113, + "learning_rate": 1.8061361188276155e-05, + "loss": 0.9916, + "step": 225660 + }, + { + "epoch": 1.441741308153278, + "grad_norm": 0.9788697957992554, + "learning_rate": 1.8057500765152757e-05, + "loss": 0.6409, + "step": 225670 + }, + { + "epoch": 1.4418051953030169, + "grad_norm": 0.880154550075531, + "learning_rate": 1.8053640663717974e-05, + "loss": 1.0943, + "step": 225680 + }, + { + "epoch": 1.4418690824527554, + "grad_norm": 0.5648699402809143, + "learning_rate": 1.8049780884010658e-05, + "loss": 0.8134, + "step": 225690 + }, + { + "epoch": 1.4419329696024943, + "grad_norm": 1.1649391651153564, + "learning_rate": 1.8045921426069702e-05, + "loss": 0.9198, + "step": 225700 + }, + { + "epoch": 1.4419968567522328, + "grad_norm": 0.8868089914321899, + "learning_rate": 1.804206228993394e-05, + "loss": 0.7574, + "step": 225710 + }, + { + "epoch": 1.4420607439019715, + "grad_norm": 0.9655132293701172, + "learning_rate": 1.8038203475642286e-05, + "loss": 0.8014, + "step": 225720 + }, + { + "epoch": 1.4421246310517102, + "grad_norm": 1.2397197484970093, + "learning_rate": 1.8034344983233546e-05, + "loss": 0.7869, + "step": 225730 + }, + { + "epoch": 1.4421885182014489, + "grad_norm": 0.8126650452613831, + "learning_rate": 1.8030486812746634e-05, + "loss": 0.8521, + "step": 225740 + }, + { + "epoch": 1.4422524053511876, + "grad_norm": 1.6487863063812256, + "learning_rate": 1.802662896422035e-05, + "loss": 0.7892, + "step": 225750 + }, + { + "epoch": 1.4423162925009263, + "grad_norm": 0.9608396291732788, + "learning_rate": 1.8022771437693596e-05, + "loss": 0.8121, + "step": 225760 + }, + { + "epoch": 1.442380179650665, + "grad_norm": 1.257879614830017, + "learning_rate": 
1.8018914233205182e-05, + "loss": 1.0084, + "step": 225770 + }, + { + "epoch": 1.4424440668004037, + "grad_norm": 0.9482038617134094, + "learning_rate": 1.8015057350793984e-05, + "loss": 0.8868, + "step": 225780 + }, + { + "epoch": 1.4425079539501424, + "grad_norm": 0.8022263050079346, + "learning_rate": 1.8011200790498812e-05, + "loss": 0.7972, + "step": 225790 + }, + { + "epoch": 1.4425718410998811, + "grad_norm": 0.7212386727333069, + "learning_rate": 1.800734455235854e-05, + "loss": 0.8484, + "step": 225800 + }, + { + "epoch": 1.4426357282496198, + "grad_norm": 1.0760418176651, + "learning_rate": 1.800348863641197e-05, + "loss": 0.7978, + "step": 225810 + }, + { + "epoch": 1.4426996153993585, + "grad_norm": 0.711896538734436, + "learning_rate": 1.7999633042697962e-05, + "loss": 0.6596, + "step": 225820 + }, + { + "epoch": 1.4427635025490972, + "grad_norm": 1.0557421445846558, + "learning_rate": 1.799577777125532e-05, + "loss": 0.822, + "step": 225830 + }, + { + "epoch": 1.442827389698836, + "grad_norm": 0.8317104578018188, + "learning_rate": 1.7991922822122904e-05, + "loss": 0.9163, + "step": 225840 + }, + { + "epoch": 1.4428912768485747, + "grad_norm": 1.3670239448547363, + "learning_rate": 1.7988068195339493e-05, + "loss": 0.8233, + "step": 225850 + }, + { + "epoch": 1.4429551639983134, + "grad_norm": 0.8564910292625427, + "learning_rate": 1.7984213890943933e-05, + "loss": 0.8689, + "step": 225860 + }, + { + "epoch": 1.443019051148052, + "grad_norm": 1.9226771593093872, + "learning_rate": 1.7980359908975053e-05, + "loss": 0.7754, + "step": 225870 + }, + { + "epoch": 1.4430829382977908, + "grad_norm": 0.7119132280349731, + "learning_rate": 1.797650624947163e-05, + "loss": 1.0531, + "step": 225880 + }, + { + "epoch": 1.4431468254475295, + "grad_norm": 1.0629172325134277, + "learning_rate": 1.7972652912472514e-05, + "loss": 1.038, + "step": 225890 + }, + { + "epoch": 1.4432107125972682, + "grad_norm": 0.6397799253463745, + "learning_rate": 
1.7968799898016474e-05, + "loss": 0.8435, + "step": 225900 + }, + { + "epoch": 1.443274599747007, + "grad_norm": 1.2140674591064453, + "learning_rate": 1.7964947206142347e-05, + "loss": 1.0597, + "step": 225910 + }, + { + "epoch": 1.4433384868967456, + "grad_norm": 1.007383942604065, + "learning_rate": 1.7961094836888898e-05, + "loss": 0.9592, + "step": 225920 + }, + { + "epoch": 1.4434023740464843, + "grad_norm": 0.9357951879501343, + "learning_rate": 1.795724279029496e-05, + "loss": 0.8951, + "step": 225930 + }, + { + "epoch": 1.443466261196223, + "grad_norm": 1.1374599933624268, + "learning_rate": 1.7953391066399304e-05, + "loss": 1.0299, + "step": 225940 + }, + { + "epoch": 1.4435301483459617, + "grad_norm": 0.6394752264022827, + "learning_rate": 1.794953966524073e-05, + "loss": 0.6978, + "step": 225950 + }, + { + "epoch": 1.4435940354957004, + "grad_norm": 0.7873253226280212, + "learning_rate": 1.7945688586857996e-05, + "loss": 0.8146, + "step": 225960 + }, + { + "epoch": 1.4436579226454391, + "grad_norm": 0.937562882900238, + "learning_rate": 1.794183783128993e-05, + "loss": 1.2297, + "step": 225970 + }, + { + "epoch": 1.4437218097951778, + "grad_norm": 1.135293960571289, + "learning_rate": 1.7937987398575275e-05, + "loss": 0.7278, + "step": 225980 + }, + { + "epoch": 1.4437856969449165, + "grad_norm": 2.8310680389404297, + "learning_rate": 1.7934137288752823e-05, + "loss": 0.8589, + "step": 225990 + }, + { + "epoch": 1.4438495840946552, + "grad_norm": 0.6206763386726379, + "learning_rate": 1.7930287501861364e-05, + "loss": 0.7586, + "step": 226000 + }, + { + "epoch": 1.443913471244394, + "grad_norm": 1.2204722166061401, + "learning_rate": 1.7926438037939635e-05, + "loss": 0.7127, + "step": 226010 + }, + { + "epoch": 1.4439773583941327, + "grad_norm": 2.4062082767486572, + "learning_rate": 1.792258889702645e-05, + "loss": 0.8781, + "step": 226020 + }, + { + "epoch": 1.4440412455438714, + "grad_norm": 1.2518004179000854, + "learning_rate": 
1.791874007916052e-05, + "loss": 0.7874, + "step": 226030 + }, + { + "epoch": 1.44410513269361, + "grad_norm": 1.1318409442901611, + "learning_rate": 1.791489158438065e-05, + "loss": 0.6976, + "step": 226040 + }, + { + "epoch": 1.4441690198433488, + "grad_norm": 1.1307138204574585, + "learning_rate": 1.7911043412725565e-05, + "loss": 0.7689, + "step": 226050 + }, + { + "epoch": 1.4442329069930875, + "grad_norm": 0.7476751804351807, + "learning_rate": 1.7907195564234047e-05, + "loss": 0.6618, + "step": 226060 + }, + { + "epoch": 1.4442967941428262, + "grad_norm": 0.9509174227714539, + "learning_rate": 1.7903348038944816e-05, + "loss": 0.8388, + "step": 226070 + }, + { + "epoch": 1.444360681292565, + "grad_norm": 1.105707049369812, + "learning_rate": 1.789950083689666e-05, + "loss": 0.9766, + "step": 226080 + }, + { + "epoch": 1.4444245684423036, + "grad_norm": 0.9921726584434509, + "learning_rate": 1.7895653958128285e-05, + "loss": 1.2553, + "step": 226090 + }, + { + "epoch": 1.444488455592042, + "grad_norm": 0.7300543189048767, + "learning_rate": 1.7891807402678463e-05, + "loss": 0.8451, + "step": 226100 + }, + { + "epoch": 1.444552342741781, + "grad_norm": 1.0600464344024658, + "learning_rate": 1.78879611705859e-05, + "loss": 0.9788, + "step": 226110 + }, + { + "epoch": 1.4446162298915195, + "grad_norm": 0.6954507827758789, + "learning_rate": 1.7884115261889368e-05, + "loss": 0.9408, + "step": 226120 + }, + { + "epoch": 1.4446801170412584, + "grad_norm": 1.2967103719711304, + "learning_rate": 1.7880269676627558e-05, + "loss": 1.1109, + "step": 226130 + }, + { + "epoch": 1.444744004190997, + "grad_norm": 1.16764235496521, + "learning_rate": 1.7876424414839244e-05, + "loss": 1.3395, + "step": 226140 + }, + { + "epoch": 1.4448078913407358, + "grad_norm": 0.8881130218505859, + "learning_rate": 1.78725794765631e-05, + "loss": 0.7589, + "step": 226150 + }, + { + "epoch": 1.4448717784904743, + "grad_norm": 0.9801353812217712, + "learning_rate": 1.7868734861837898e-05, + 
"loss": 1.0322, + "step": 226160 + }, + { + "epoch": 1.4449356656402133, + "grad_norm": 0.7944816946983337, + "learning_rate": 1.786489057070232e-05, + "loss": 0.7409, + "step": 226170 + }, + { + "epoch": 1.4449995527899517, + "grad_norm": 0.832903265953064, + "learning_rate": 1.786104660319511e-05, + "loss": 1.2924, + "step": 226180 + }, + { + "epoch": 1.4450634399396907, + "grad_norm": 2.067941427230835, + "learning_rate": 1.7857202959354945e-05, + "loss": 1.0379, + "step": 226190 + }, + { + "epoch": 1.4451273270894291, + "grad_norm": 1.9666894674301147, + "learning_rate": 1.7853359639220558e-05, + "loss": 0.8461, + "step": 226200 + }, + { + "epoch": 1.4451912142391679, + "grad_norm": 5.305588245391846, + "learning_rate": 1.784951664283066e-05, + "loss": 0.9341, + "step": 226210 + }, + { + "epoch": 1.4452551013889066, + "grad_norm": 0.7343180179595947, + "learning_rate": 1.7845673970223932e-05, + "loss": 0.8269, + "step": 226220 + }, + { + "epoch": 1.4453189885386453, + "grad_norm": 1.2359371185302734, + "learning_rate": 1.78418316214391e-05, + "loss": 0.8709, + "step": 226230 + }, + { + "epoch": 1.445382875688384, + "grad_norm": 1.012980341911316, + "learning_rate": 1.7837989596514826e-05, + "loss": 0.7815, + "step": 226240 + }, + { + "epoch": 1.4454467628381227, + "grad_norm": 0.7704878449440002, + "learning_rate": 1.783414789548984e-05, + "loss": 1.1135, + "step": 226250 + }, + { + "epoch": 1.4455106499878614, + "grad_norm": 1.0692133903503418, + "learning_rate": 1.7830306518402796e-05, + "loss": 0.7503, + "step": 226260 + }, + { + "epoch": 1.4455745371376, + "grad_norm": 1.0675780773162842, + "learning_rate": 1.7826465465292407e-05, + "loss": 0.8263, + "step": 226270 + }, + { + "epoch": 1.4456384242873388, + "grad_norm": 2.0007741451263428, + "learning_rate": 1.7822624736197334e-05, + "loss": 0.7149, + "step": 226280 + }, + { + "epoch": 1.4457023114370775, + "grad_norm": 2.1513524055480957, + "learning_rate": 1.7818784331156285e-05, + "loss": 1.0759, + 
"step": 226290 + }, + { + "epoch": 1.4457661985868162, + "grad_norm": 1.7059158086776733, + "learning_rate": 1.78149442502079e-05, + "loss": 0.8618, + "step": 226300 + }, + { + "epoch": 1.445830085736555, + "grad_norm": 0.6934781074523926, + "learning_rate": 1.7811104493390885e-05, + "loss": 0.9491, + "step": 226310 + }, + { + "epoch": 1.4458939728862936, + "grad_norm": 1.6286593675613403, + "learning_rate": 1.7807265060743876e-05, + "loss": 0.7483, + "step": 226320 + }, + { + "epoch": 1.4459578600360323, + "grad_norm": 0.6673516035079956, + "learning_rate": 1.7803425952305585e-05, + "loss": 0.9391, + "step": 226330 + }, + { + "epoch": 1.446021747185771, + "grad_norm": 0.9682520031929016, + "learning_rate": 1.7799587168114623e-05, + "loss": 0.7737, + "step": 226340 + }, + { + "epoch": 1.4460856343355097, + "grad_norm": 0.745613157749176, + "learning_rate": 1.7795748708209696e-05, + "loss": 0.8105, + "step": 226350 + }, + { + "epoch": 1.4461495214852484, + "grad_norm": 0.8082693815231323, + "learning_rate": 1.7791910572629423e-05, + "loss": 0.7544, + "step": 226360 + }, + { + "epoch": 1.4462134086349872, + "grad_norm": 1.3045746088027954, + "learning_rate": 1.778807276141249e-05, + "loss": 0.8672, + "step": 226370 + }, + { + "epoch": 1.4462772957847259, + "grad_norm": 0.8008405566215515, + "learning_rate": 1.7784235274597515e-05, + "loss": 1.0634, + "step": 226380 + }, + { + "epoch": 1.4463411829344646, + "grad_norm": 1.4106078147888184, + "learning_rate": 1.7780398112223163e-05, + "loss": 0.9883, + "step": 226390 + }, + { + "epoch": 1.4464050700842033, + "grad_norm": 1.0247710943222046, + "learning_rate": 1.777656127432809e-05, + "loss": 0.9253, + "step": 226400 + }, + { + "epoch": 1.446468957233942, + "grad_norm": 0.7325378656387329, + "learning_rate": 1.777272476095091e-05, + "loss": 0.7098, + "step": 226410 + }, + { + "epoch": 1.4465328443836807, + "grad_norm": 1.0278514623641968, + "learning_rate": 1.7768888572130287e-05, + "loss": 0.8852, + "step": 226420 + }, 
+ { + "epoch": 1.4465967315334194, + "grad_norm": 1.6052353382110596, + "learning_rate": 1.776505270790484e-05, + "loss": 0.6666, + "step": 226430 + }, + { + "epoch": 1.446660618683158, + "grad_norm": 1.0923691987991333, + "learning_rate": 1.77612171683132e-05, + "loss": 0.8607, + "step": 226440 + }, + { + "epoch": 1.4467245058328968, + "grad_norm": 2.1576809883117676, + "learning_rate": 1.7757381953393975e-05, + "loss": 0.8478, + "step": 226450 + }, + { + "epoch": 1.4467883929826355, + "grad_norm": 0.8072786927223206, + "learning_rate": 1.7753547063185823e-05, + "loss": 0.6595, + "step": 226460 + }, + { + "epoch": 1.4468522801323742, + "grad_norm": 3.7347018718719482, + "learning_rate": 1.7749712497727334e-05, + "loss": 0.8379, + "step": 226470 + }, + { + "epoch": 1.446916167282113, + "grad_norm": 0.7923364043235779, + "learning_rate": 1.7745878257057158e-05, + "loss": 1.0198, + "step": 226480 + }, + { + "epoch": 1.4469800544318516, + "grad_norm": 1.173071026802063, + "learning_rate": 1.7742044341213874e-05, + "loss": 0.79, + "step": 226490 + }, + { + "epoch": 1.4470439415815903, + "grad_norm": 0.7277421951293945, + "learning_rate": 1.773821075023613e-05, + "loss": 1.0715, + "step": 226500 + }, + { + "epoch": 1.447107828731329, + "grad_norm": 5.723413467407227, + "learning_rate": 1.7734377484162496e-05, + "loss": 0.9474, + "step": 226510 + }, + { + "epoch": 1.4471717158810677, + "grad_norm": 1.176943302154541, + "learning_rate": 1.7730544543031592e-05, + "loss": 0.6639, + "step": 226520 + }, + { + "epoch": 1.4472356030308065, + "grad_norm": 2.4388535022735596, + "learning_rate": 1.772671192688204e-05, + "loss": 1.013, + "step": 226530 + }, + { + "epoch": 1.4472994901805452, + "grad_norm": 0.7879016995429993, + "learning_rate": 1.7722879635752405e-05, + "loss": 0.7353, + "step": 226540 + }, + { + "epoch": 1.4473633773302839, + "grad_norm": 0.557325541973114, + "learning_rate": 1.771904766968131e-05, + "loss": 0.9431, + "step": 226550 + }, + { + "epoch": 
1.4474272644800226, + "grad_norm": 1.2949154376983643, + "learning_rate": 1.7715216028707315e-05, + "loss": 0.9866, + "step": 226560 + }, + { + "epoch": 1.447491151629761, + "grad_norm": 0.670309841632843, + "learning_rate": 1.7711384712869038e-05, + "loss": 0.8569, + "step": 226570 + }, + { + "epoch": 1.4475550387795, + "grad_norm": 1.074346899986267, + "learning_rate": 1.770755372220504e-05, + "loss": 0.6498, + "step": 226580 + }, + { + "epoch": 1.4476189259292385, + "grad_norm": 0.6561318635940552, + "learning_rate": 1.770372305675393e-05, + "loss": 0.8314, + "step": 226590 + }, + { + "epoch": 1.4476828130789774, + "grad_norm": 1.1744028329849243, + "learning_rate": 1.7699892716554252e-05, + "loss": 0.9351, + "step": 226600 + }, + { + "epoch": 1.4477467002287159, + "grad_norm": 0.9757578372955322, + "learning_rate": 1.769606270164461e-05, + "loss": 0.7917, + "step": 226610 + }, + { + "epoch": 1.4478105873784548, + "grad_norm": 1.1972298622131348, + "learning_rate": 1.7692233012063552e-05, + "loss": 1.0842, + "step": 226620 + }, + { + "epoch": 1.4478744745281933, + "grad_norm": 0.9641197323799133, + "learning_rate": 1.7688403647849673e-05, + "loss": 0.7745, + "step": 226630 + }, + { + "epoch": 1.4479383616779322, + "grad_norm": 0.8222277760505676, + "learning_rate": 1.76845746090415e-05, + "loss": 0.985, + "step": 226640 + }, + { + "epoch": 1.4480022488276707, + "grad_norm": 1.6356098651885986, + "learning_rate": 1.768074589567764e-05, + "loss": 0.9724, + "step": 226650 + }, + { + "epoch": 1.4480661359774096, + "grad_norm": 0.8448604941368103, + "learning_rate": 1.7676917507796614e-05, + "loss": 0.8945, + "step": 226660 + }, + { + "epoch": 1.4481300231271481, + "grad_norm": 0.9055709838867188, + "learning_rate": 1.7673089445437004e-05, + "loss": 0.8706, + "step": 226670 + }, + { + "epoch": 1.448193910276887, + "grad_norm": 0.8757638335227966, + "learning_rate": 1.7669261708637336e-05, + "loss": 1.1719, + "step": 226680 + }, + { + "epoch": 1.4482577974266255, + 
"grad_norm": 0.6214796900749207, + "learning_rate": 1.766543429743619e-05, + "loss": 0.7492, + "step": 226690 + }, + { + "epoch": 1.4483216845763642, + "grad_norm": 1.0025808811187744, + "learning_rate": 1.766160721187207e-05, + "loss": 0.8919, + "step": 226700 + }, + { + "epoch": 1.448385571726103, + "grad_norm": 1.1682748794555664, + "learning_rate": 1.765778045198355e-05, + "loss": 1.2256, + "step": 226710 + }, + { + "epoch": 1.4484494588758416, + "grad_norm": 0.6261559128761292, + "learning_rate": 1.7653954017809178e-05, + "loss": 0.8631, + "step": 226720 + }, + { + "epoch": 1.4485133460255804, + "grad_norm": 0.8332759737968445, + "learning_rate": 1.7650127909387453e-05, + "loss": 0.6217, + "step": 226730 + }, + { + "epoch": 1.448577233175319, + "grad_norm": 0.7914277911186218, + "learning_rate": 1.764630212675695e-05, + "loss": 0.736, + "step": 226740 + }, + { + "epoch": 1.4486411203250578, + "grad_norm": 1.0079381465911865, + "learning_rate": 1.764247666995615e-05, + "loss": 0.7504, + "step": 226750 + }, + { + "epoch": 1.4487050074747965, + "grad_norm": 0.8110386729240417, + "learning_rate": 1.763865153902362e-05, + "loss": 0.7769, + "step": 226760 + }, + { + "epoch": 1.4487688946245352, + "grad_norm": 1.0138696432113647, + "learning_rate": 1.763482673399785e-05, + "loss": 0.9981, + "step": 226770 + }, + { + "epoch": 1.4488327817742739, + "grad_norm": 0.9576422572135925, + "learning_rate": 1.7631002254917388e-05, + "loss": 0.9525, + "step": 226780 + }, + { + "epoch": 1.4488966689240126, + "grad_norm": 1.2405056953430176, + "learning_rate": 1.7627178101820725e-05, + "loss": 1.0738, + "step": 226790 + }, + { + "epoch": 1.4489605560737513, + "grad_norm": 1.3423398733139038, + "learning_rate": 1.76233542747464e-05, + "loss": 0.7747, + "step": 226800 + }, + { + "epoch": 1.44902444322349, + "grad_norm": 1.0618234872817993, + "learning_rate": 1.761953077373289e-05, + "loss": 0.8785, + "step": 226810 + }, + { + "epoch": 1.4490883303732287, + "grad_norm": 
1.3146846294403076, + "learning_rate": 1.7615707598818738e-05, + "loss": 1.1244, + "step": 226820 + }, + { + "epoch": 1.4491522175229674, + "grad_norm": 1.4140381813049316, + "learning_rate": 1.7611884750042406e-05, + "loss": 0.8067, + "step": 226830 + }, + { + "epoch": 1.4492161046727061, + "grad_norm": 0.9436303377151489, + "learning_rate": 1.7608062227442435e-05, + "loss": 0.7848, + "step": 226840 + }, + { + "epoch": 1.4492799918224448, + "grad_norm": 1.2600364685058594, + "learning_rate": 1.760424003105728e-05, + "loss": 0.7632, + "step": 226850 + }, + { + "epoch": 1.4493438789721835, + "grad_norm": 0.9329853057861328, + "learning_rate": 1.7600418160925473e-05, + "loss": 0.8161, + "step": 226860 + }, + { + "epoch": 1.4494077661219222, + "grad_norm": 0.8663994073867798, + "learning_rate": 1.7596596617085463e-05, + "loss": 1.1164, + "step": 226870 + }, + { + "epoch": 1.449471653271661, + "grad_norm": 1.1688934564590454, + "learning_rate": 1.7592775399575782e-05, + "loss": 0.9746, + "step": 226880 + }, + { + "epoch": 1.4495355404213996, + "grad_norm": 1.3450571298599243, + "learning_rate": 1.758895450843487e-05, + "loss": 0.658, + "step": 226890 + }, + { + "epoch": 1.4495994275711384, + "grad_norm": 0.6766774654388428, + "learning_rate": 1.7585133943701242e-05, + "loss": 0.6398, + "step": 226900 + }, + { + "epoch": 1.449663314720877, + "grad_norm": 0.8949316143989563, + "learning_rate": 1.758131370541336e-05, + "loss": 0.8675, + "step": 226910 + }, + { + "epoch": 1.4497272018706158, + "grad_norm": 1.316635012626648, + "learning_rate": 1.7577493793609675e-05, + "loss": 0.8526, + "step": 226920 + }, + { + "epoch": 1.4497910890203545, + "grad_norm": 0.887630045413971, + "learning_rate": 1.7573674208328695e-05, + "loss": 0.8046, + "step": 226930 + }, + { + "epoch": 1.4498549761700932, + "grad_norm": 1.6671459674835205, + "learning_rate": 1.756985494960885e-05, + "loss": 0.8499, + "step": 226940 + }, + { + "epoch": 1.4499188633198319, + "grad_norm": 0.9612981081008911, 
+ "learning_rate": 1.756603601748864e-05, + "loss": 1.0565, + "step": 226950 + }, + { + "epoch": 1.4499827504695706, + "grad_norm": 0.6422664523124695, + "learning_rate": 1.7562217412006494e-05, + "loss": 0.7408, + "step": 226960 + }, + { + "epoch": 1.4500466376193093, + "grad_norm": 0.6129812002182007, + "learning_rate": 1.7558399133200893e-05, + "loss": 0.9403, + "step": 226970 + }, + { + "epoch": 1.450110524769048, + "grad_norm": 1.100138783454895, + "learning_rate": 1.7554581181110265e-05, + "loss": 0.9612, + "step": 226980 + }, + { + "epoch": 1.4501744119187867, + "grad_norm": 1.314050316810608, + "learning_rate": 1.7550763555773086e-05, + "loss": 0.9189, + "step": 226990 + }, + { + "epoch": 1.4502382990685254, + "grad_norm": 1.7175153493881226, + "learning_rate": 1.7546946257227774e-05, + "loss": 1.1167, + "step": 227000 + }, + { + "epoch": 1.4503021862182641, + "grad_norm": 0.9711244702339172, + "learning_rate": 1.754312928551281e-05, + "loss": 0.6749, + "step": 227010 + }, + { + "epoch": 1.4503660733680028, + "grad_norm": 1.205511212348938, + "learning_rate": 1.7539312640666593e-05, + "loss": 0.7577, + "step": 227020 + }, + { + "epoch": 1.4504299605177415, + "grad_norm": 1.204145908355713, + "learning_rate": 1.75354963227276e-05, + "loss": 0.7739, + "step": 227030 + }, + { + "epoch": 1.4504938476674802, + "grad_norm": 0.8410221338272095, + "learning_rate": 1.7531680331734225e-05, + "loss": 0.7122, + "step": 227040 + }, + { + "epoch": 1.450557734817219, + "grad_norm": 1.1859508752822876, + "learning_rate": 1.752786466772492e-05, + "loss": 0.937, + "step": 227050 + }, + { + "epoch": 1.4506216219669574, + "grad_norm": 0.9122669696807861, + "learning_rate": 1.752404933073813e-05, + "loss": 1.0059, + "step": 227060 + }, + { + "epoch": 1.4506855091166964, + "grad_norm": 0.9483950734138489, + "learning_rate": 1.7520234320812233e-05, + "loss": 0.7755, + "step": 227070 + }, + { + "epoch": 1.4507493962664348, + "grad_norm": 1.1015524864196777, + "learning_rate": 
1.7516419637985703e-05, + "loss": 0.9223, + "step": 227080 + }, + { + "epoch": 1.4508132834161738, + "grad_norm": 0.5838099718093872, + "learning_rate": 1.7512605282296907e-05, + "loss": 0.9171, + "step": 227090 + }, + { + "epoch": 1.4508771705659123, + "grad_norm": 1.1983695030212402, + "learning_rate": 1.75087912537843e-05, + "loss": 0.8051, + "step": 227100 + }, + { + "epoch": 1.4509410577156512, + "grad_norm": 0.9621850252151489, + "learning_rate": 1.7504977552486255e-05, + "loss": 0.806, + "step": 227110 + }, + { + "epoch": 1.4510049448653897, + "grad_norm": 0.7084076404571533, + "learning_rate": 1.7501164178441215e-05, + "loss": 0.8508, + "step": 227120 + }, + { + "epoch": 1.4510688320151286, + "grad_norm": 0.736041784286499, + "learning_rate": 1.7497351131687557e-05, + "loss": 0.6109, + "step": 227130 + }, + { + "epoch": 1.451132719164867, + "grad_norm": 0.7992726564407349, + "learning_rate": 1.74935384122637e-05, + "loss": 0.8049, + "step": 227140 + }, + { + "epoch": 1.451196606314606, + "grad_norm": 0.921377420425415, + "learning_rate": 1.7489726020208018e-05, + "loss": 0.6698, + "step": 227150 + }, + { + "epoch": 1.4512604934643445, + "grad_norm": 0.7148461937904358, + "learning_rate": 1.7485913955558942e-05, + "loss": 0.9502, + "step": 227160 + }, + { + "epoch": 1.4513243806140834, + "grad_norm": 0.5825332403182983, + "learning_rate": 1.7482102218354823e-05, + "loss": 0.7701, + "step": 227170 + }, + { + "epoch": 1.451388267763822, + "grad_norm": 1.0439517498016357, + "learning_rate": 1.747829080863408e-05, + "loss": 0.9723, + "step": 227180 + }, + { + "epoch": 1.4514521549135606, + "grad_norm": 0.8032168745994568, + "learning_rate": 1.7474479726435065e-05, + "loss": 0.6788, + "step": 227190 + }, + { + "epoch": 1.4515160420632993, + "grad_norm": 2.783485174179077, + "learning_rate": 1.74706689717962e-05, + "loss": 0.7815, + "step": 227200 + }, + { + "epoch": 1.451579929213038, + "grad_norm": 0.992385983467102, + "learning_rate": 1.7466858544755825e-05, + 
"loss": 0.7934, + "step": 227210 + }, + { + "epoch": 1.4516438163627767, + "grad_norm": 1.3935444355010986, + "learning_rate": 1.7463048445352343e-05, + "loss": 1.0874, + "step": 227220 + }, + { + "epoch": 1.4517077035125154, + "grad_norm": 0.7856149673461914, + "learning_rate": 1.7459238673624094e-05, + "loss": 0.7179, + "step": 227230 + }, + { + "epoch": 1.4517715906622541, + "grad_norm": 0.9185128808021545, + "learning_rate": 1.7455429229609465e-05, + "loss": 0.7735, + "step": 227240 + }, + { + "epoch": 1.4518354778119928, + "grad_norm": 2.548475980758667, + "learning_rate": 1.7451620113346834e-05, + "loss": 1.1837, + "step": 227250 + }, + { + "epoch": 1.4518993649617316, + "grad_norm": 1.0396366119384766, + "learning_rate": 1.7447811324874536e-05, + "loss": 1.1245, + "step": 227260 + }, + { + "epoch": 1.4519632521114703, + "grad_norm": 0.7229011654853821, + "learning_rate": 1.7444002864230945e-05, + "loss": 0.7909, + "step": 227270 + }, + { + "epoch": 1.452027139261209, + "grad_norm": 0.6902373433113098, + "learning_rate": 1.74401947314544e-05, + "loss": 0.7689, + "step": 227280 + }, + { + "epoch": 1.4520910264109477, + "grad_norm": 1.0050629377365112, + "learning_rate": 1.7436386926583283e-05, + "loss": 0.8924, + "step": 227290 + }, + { + "epoch": 1.4521549135606864, + "grad_norm": 1.3460990190505981, + "learning_rate": 1.74325794496559e-05, + "loss": 0.836, + "step": 227300 + }, + { + "epoch": 1.452218800710425, + "grad_norm": 0.8463780879974365, + "learning_rate": 1.7428772300710633e-05, + "loss": 0.7391, + "step": 227310 + }, + { + "epoch": 1.4522826878601638, + "grad_norm": 0.9536442756652832, + "learning_rate": 1.742496547978579e-05, + "loss": 1.1644, + "step": 227320 + }, + { + "epoch": 1.4523465750099025, + "grad_norm": 0.8862224817276001, + "learning_rate": 1.7421158986919744e-05, + "loss": 1.1116, + "step": 227330 + }, + { + "epoch": 1.4524104621596412, + "grad_norm": 0.8460167050361633, + "learning_rate": 1.7417352822150794e-05, + "loss": 0.783, + 
"step": 227340 + }, + { + "epoch": 1.45247434930938, + "grad_norm": 0.5743412375450134, + "learning_rate": 1.741354698551731e-05, + "loss": 0.7744, + "step": 227350 + }, + { + "epoch": 1.4525382364591186, + "grad_norm": 0.9879737496376038, + "learning_rate": 1.7409741477057578e-05, + "loss": 0.771, + "step": 227360 + }, + { + "epoch": 1.4526021236088573, + "grad_norm": 1.0089986324310303, + "learning_rate": 1.740593629680996e-05, + "loss": 0.955, + "step": 227370 + }, + { + "epoch": 1.452666010758596, + "grad_norm": 0.8803655505180359, + "learning_rate": 1.7402131444812748e-05, + "loss": 0.8349, + "step": 227380 + }, + { + "epoch": 1.4527298979083347, + "grad_norm": 1.1196205615997314, + "learning_rate": 1.739832692110429e-05, + "loss": 0.81, + "step": 227390 + }, + { + "epoch": 1.4527937850580734, + "grad_norm": 1.3624825477600098, + "learning_rate": 1.7394522725722885e-05, + "loss": 0.7161, + "step": 227400 + }, + { + "epoch": 1.4528576722078121, + "grad_norm": 0.8905812501907349, + "learning_rate": 1.739071885870682e-05, + "loss": 0.8102, + "step": 227410 + }, + { + "epoch": 1.4529215593575509, + "grad_norm": 0.9938466548919678, + "learning_rate": 1.7386915320094454e-05, + "loss": 0.7328, + "step": 227420 + }, + { + "epoch": 1.4529854465072896, + "grad_norm": 0.9256237149238586, + "learning_rate": 1.7383112109924042e-05, + "loss": 0.8058, + "step": 227430 + }, + { + "epoch": 1.4530493336570283, + "grad_norm": 1.2441478967666626, + "learning_rate": 1.7379309228233925e-05, + "loss": 0.9255, + "step": 227440 + }, + { + "epoch": 1.453113220806767, + "grad_norm": 0.9369105696678162, + "learning_rate": 1.737550667506237e-05, + "loss": 0.6952, + "step": 227450 + }, + { + "epoch": 1.4531771079565057, + "grad_norm": 1.246598720550537, + "learning_rate": 1.7371704450447706e-05, + "loss": 0.8575, + "step": 227460 + }, + { + "epoch": 1.4532409951062444, + "grad_norm": 1.074830412864685, + "learning_rate": 1.7367902554428183e-05, + "loss": 0.8451, + "step": 227470 + }, + { + 
"epoch": 1.453304882255983, + "grad_norm": 1.5546703338623047, + "learning_rate": 1.7364100987042135e-05, + "loss": 1.0819, + "step": 227480 + }, + { + "epoch": 1.4533687694057218, + "grad_norm": 0.923184335231781, + "learning_rate": 1.7360299748327806e-05, + "loss": 0.7363, + "step": 227490 + }, + { + "epoch": 1.4534326565554605, + "grad_norm": 0.8427280187606812, + "learning_rate": 1.735649883832351e-05, + "loss": 0.5411, + "step": 227500 + }, + { + "epoch": 1.4534965437051992, + "grad_norm": 0.8301636576652527, + "learning_rate": 1.7352698257067495e-05, + "loss": 0.9946, + "step": 227510 + }, + { + "epoch": 1.453560430854938, + "grad_norm": 0.5679813027381897, + "learning_rate": 1.7348898004598075e-05, + "loss": 0.7921, + "step": 227520 + }, + { + "epoch": 1.4536243180046766, + "grad_norm": 0.7748306393623352, + "learning_rate": 1.7345098080953475e-05, + "loss": 1.2827, + "step": 227530 + }, + { + "epoch": 1.4536882051544153, + "grad_norm": 2.134530782699585, + "learning_rate": 1.734129848617201e-05, + "loss": 0.7154, + "step": 227540 + }, + { + "epoch": 1.4537520923041538, + "grad_norm": 1.778371810913086, + "learning_rate": 1.7337499220291903e-05, + "loss": 0.9307, + "step": 227550 + }, + { + "epoch": 1.4538159794538927, + "grad_norm": 0.7461889982223511, + "learning_rate": 1.733370028335144e-05, + "loss": 0.6509, + "step": 227560 + }, + { + "epoch": 1.4538798666036312, + "grad_norm": 0.9498295783996582, + "learning_rate": 1.7329901675388887e-05, + "loss": 1.3496, + "step": 227570 + }, + { + "epoch": 1.4539437537533701, + "grad_norm": 0.7788515686988831, + "learning_rate": 1.7326103396442473e-05, + "loss": 0.757, + "step": 227580 + }, + { + "epoch": 1.4540076409031086, + "grad_norm": 0.7038117051124573, + "learning_rate": 1.732230544655048e-05, + "loss": 1.0964, + "step": 227590 + }, + { + "epoch": 1.4540715280528476, + "grad_norm": 0.7378969192504883, + "learning_rate": 1.731850782575113e-05, + "loss": 0.8474, + "step": 227600 + }, + { + "epoch": 
1.454135415202586, + "grad_norm": 1.4201772212982178, + "learning_rate": 1.731471053408269e-05, + "loss": 0.7944, + "step": 227610 + }, + { + "epoch": 1.454199302352325, + "grad_norm": 1.4799860715866089, + "learning_rate": 1.7310913571583375e-05, + "loss": 0.8094, + "step": 227620 + }, + { + "epoch": 1.4542631895020635, + "grad_norm": 1.1239525079727173, + "learning_rate": 1.7307116938291463e-05, + "loss": 1.1415, + "step": 227630 + }, + { + "epoch": 1.4543270766518024, + "grad_norm": 1.0189942121505737, + "learning_rate": 1.7303320634245148e-05, + "loss": 0.766, + "step": 227640 + }, + { + "epoch": 1.4543909638015409, + "grad_norm": 0.9115725755691528, + "learning_rate": 1.7299524659482697e-05, + "loss": 0.7498, + "step": 227650 + }, + { + "epoch": 1.4544548509512796, + "grad_norm": 0.8899692893028259, + "learning_rate": 1.7295729014042306e-05, + "loss": 0.8486, + "step": 227660 + }, + { + "epoch": 1.4545187381010183, + "grad_norm": 1.07687509059906, + "learning_rate": 1.7291933697962233e-05, + "loss": 1.0783, + "step": 227670 + }, + { + "epoch": 1.454582625250757, + "grad_norm": 0.5755985975265503, + "learning_rate": 1.7288138711280666e-05, + "loss": 0.8134, + "step": 227680 + }, + { + "epoch": 1.4546465124004957, + "grad_norm": 1.454787015914917, + "learning_rate": 1.728434405403586e-05, + "loss": 1.1284, + "step": 227690 + }, + { + "epoch": 1.4547103995502344, + "grad_norm": 0.8672309517860413, + "learning_rate": 1.7280549726265994e-05, + "loss": 0.8704, + "step": 227700 + }, + { + "epoch": 1.454774286699973, + "grad_norm": 3.574970245361328, + "learning_rate": 1.7276755728009318e-05, + "loss": 1.0213, + "step": 227710 + }, + { + "epoch": 1.4548381738497118, + "grad_norm": 1.3924107551574707, + "learning_rate": 1.7272962059304004e-05, + "loss": 1.0223, + "step": 227720 + }, + { + "epoch": 1.4549020609994505, + "grad_norm": 0.8274136781692505, + "learning_rate": 1.7269168720188296e-05, + "loss": 0.9841, + "step": 227730 + }, + { + "epoch": 1.4549659481491892, + 
"grad_norm": 2.5468506813049316, + "learning_rate": 1.726537571070035e-05, + "loss": 0.6942, + "step": 227740 + }, + { + "epoch": 1.455029835298928, + "grad_norm": 0.7780070304870605, + "learning_rate": 1.7261583030878414e-05, + "loss": 0.7812, + "step": 227750 + }, + { + "epoch": 1.4550937224486666, + "grad_norm": 1.2256584167480469, + "learning_rate": 1.725779068076064e-05, + "loss": 0.8563, + "step": 227760 + }, + { + "epoch": 1.4551576095984053, + "grad_norm": 1.1407594680786133, + "learning_rate": 1.725399866038524e-05, + "loss": 0.7595, + "step": 227770 + }, + { + "epoch": 1.455221496748144, + "grad_norm": 1.1632260084152222, + "learning_rate": 1.7250206969790416e-05, + "loss": 0.7344, + "step": 227780 + }, + { + "epoch": 1.4552853838978828, + "grad_norm": 0.7721037268638611, + "learning_rate": 1.7246415609014327e-05, + "loss": 0.7795, + "step": 227790 + }, + { + "epoch": 1.4553492710476215, + "grad_norm": 0.8553909659385681, + "learning_rate": 1.7242624578095184e-05, + "loss": 0.7139, + "step": 227800 + }, + { + "epoch": 1.4554131581973602, + "grad_norm": 0.8728143572807312, + "learning_rate": 1.7238833877071136e-05, + "loss": 0.8254, + "step": 227810 + }, + { + "epoch": 1.4554770453470989, + "grad_norm": 0.7298803329467773, + "learning_rate": 1.723504350598039e-05, + "loss": 0.7893, + "step": 227820 + }, + { + "epoch": 1.4555409324968376, + "grad_norm": 0.7812074422836304, + "learning_rate": 1.7231253464861087e-05, + "loss": 0.8362, + "step": 227830 + }, + { + "epoch": 1.4556048196465763, + "grad_norm": 0.685131847858429, + "learning_rate": 1.722746375375142e-05, + "loss": 0.8985, + "step": 227840 + }, + { + "epoch": 1.455668706796315, + "grad_norm": 0.671725332736969, + "learning_rate": 1.7223674372689535e-05, + "loss": 0.6595, + "step": 227850 + }, + { + "epoch": 1.4557325939460537, + "grad_norm": 2.0617687702178955, + "learning_rate": 1.721988532171362e-05, + "loss": 1.1418, + "step": 227860 + }, + { + "epoch": 1.4557964810957924, + "grad_norm": 
1.009682536125183, + "learning_rate": 1.7216096600861803e-05, + "loss": 0.8784, + "step": 227870 + }, + { + "epoch": 1.4558603682455311, + "grad_norm": 1.326684832572937, + "learning_rate": 1.721230821017227e-05, + "loss": 0.9218, + "step": 227880 + }, + { + "epoch": 1.4559242553952698, + "grad_norm": 0.7491835355758667, + "learning_rate": 1.720852014968316e-05, + "loss": 0.6816, + "step": 227890 + }, + { + "epoch": 1.4559881425450085, + "grad_norm": 1.153235912322998, + "learning_rate": 1.72047324194326e-05, + "loss": 0.6852, + "step": 227900 + }, + { + "epoch": 1.4560520296947472, + "grad_norm": 0.826145589351654, + "learning_rate": 1.7201323744592614e-05, + "loss": 0.819, + "step": 227910 + }, + { + "epoch": 1.456115916844486, + "grad_norm": 1.335146427154541, + "learning_rate": 1.719753664190043e-05, + "loss": 0.9684, + "step": 227920 + }, + { + "epoch": 1.4561798039942246, + "grad_norm": 1.198949933052063, + "learning_rate": 1.7193749869557446e-05, + "loss": 0.6834, + "step": 227930 + }, + { + "epoch": 1.4562436911439633, + "grad_norm": 1.216091275215149, + "learning_rate": 1.7190342056928825e-05, + "loss": 0.8983, + "step": 227940 + }, + { + "epoch": 1.456307578293702, + "grad_norm": 0.6597400307655334, + "learning_rate": 1.718655591235436e-05, + "loss": 0.7476, + "step": 227950 + }, + { + "epoch": 1.4563714654434408, + "grad_norm": 1.405436396598816, + "learning_rate": 1.7182770098239642e-05, + "loss": 0.5881, + "step": 227960 + }, + { + "epoch": 1.4564353525931795, + "grad_norm": 0.6479114294052124, + "learning_rate": 1.7178984614622833e-05, + "loss": 0.9238, + "step": 227970 + }, + { + "epoch": 1.4564992397429182, + "grad_norm": 0.8640186190605164, + "learning_rate": 1.717519946154203e-05, + "loss": 0.7785, + "step": 227980 + }, + { + "epoch": 1.4565631268926569, + "grad_norm": 2.1302618980407715, + "learning_rate": 1.717141463903539e-05, + "loss": 0.8929, + "step": 227990 + }, + { + "epoch": 1.4566270140423956, + "grad_norm": 0.9887571334838867, + 
"learning_rate": 1.7167630147140977e-05, + "loss": 0.8549, + "step": 228000 + }, + { + "epoch": 1.4566909011921343, + "grad_norm": 0.932548463344574, + "learning_rate": 1.7163845985896938e-05, + "loss": 1.0528, + "step": 228010 + }, + { + "epoch": 1.456754788341873, + "grad_norm": 0.897533655166626, + "learning_rate": 1.7160062155341395e-05, + "loss": 0.9261, + "step": 228020 + }, + { + "epoch": 1.4568186754916117, + "grad_norm": 1.0525333881378174, + "learning_rate": 1.7156278655512415e-05, + "loss": 0.8245, + "step": 228030 + }, + { + "epoch": 1.4568825626413502, + "grad_norm": 0.9059542417526245, + "learning_rate": 1.7152495486448144e-05, + "loss": 0.9365, + "step": 228040 + }, + { + "epoch": 1.4569464497910891, + "grad_norm": 0.9503512978553772, + "learning_rate": 1.7148712648186644e-05, + "loss": 1.0488, + "step": 228050 + }, + { + "epoch": 1.4570103369408276, + "grad_norm": 1.1666412353515625, + "learning_rate": 1.7144930140766042e-05, + "loss": 0.9226, + "step": 228060 + }, + { + "epoch": 1.4570742240905665, + "grad_norm": 1.5587124824523926, + "learning_rate": 1.7141147964224404e-05, + "loss": 0.9057, + "step": 228070 + }, + { + "epoch": 1.457138111240305, + "grad_norm": 4.082630634307861, + "learning_rate": 1.7137366118599846e-05, + "loss": 0.7744, + "step": 228080 + }, + { + "epoch": 1.457201998390044, + "grad_norm": 0.8567389845848083, + "learning_rate": 1.7133584603930423e-05, + "loss": 1.1713, + "step": 228090 + }, + { + "epoch": 1.4572658855397824, + "grad_norm": 0.7330909371376038, + "learning_rate": 1.712980342025426e-05, + "loss": 0.8745, + "step": 228100 + }, + { + "epoch": 1.4573297726895214, + "grad_norm": 0.8207191824913025, + "learning_rate": 1.712602256760939e-05, + "loss": 0.8402, + "step": 228110 + }, + { + "epoch": 1.4573936598392598, + "grad_norm": 0.7916305065155029, + "learning_rate": 1.712224204603394e-05, + "loss": 0.8344, + "step": 228120 + }, + { + "epoch": 1.4574575469889988, + "grad_norm": 1.1310572624206543, + "learning_rate": 
1.7118461855565925e-05, + "loss": 0.9267, + "step": 228130 + }, + { + "epoch": 1.4575214341387372, + "grad_norm": 1.3407129049301147, + "learning_rate": 1.711468199624347e-05, + "loss": 0.6926, + "step": 228140 + }, + { + "epoch": 1.457585321288476, + "grad_norm": 1.2362877130508423, + "learning_rate": 1.71109024681046e-05, + "loss": 0.8751, + "step": 228150 + }, + { + "epoch": 1.4576492084382147, + "grad_norm": 1.0826531648635864, + "learning_rate": 1.7107123271187414e-05, + "loss": 0.8926, + "step": 228160 + }, + { + "epoch": 1.4577130955879534, + "grad_norm": 0.696463942527771, + "learning_rate": 1.7103344405529932e-05, + "loss": 0.8336, + "step": 228170 + }, + { + "epoch": 1.457776982737692, + "grad_norm": 0.6624709367752075, + "learning_rate": 1.7099565871170248e-05, + "loss": 0.8775, + "step": 228180 + }, + { + "epoch": 1.4578408698874308, + "grad_norm": 0.9407052993774414, + "learning_rate": 1.70957876681464e-05, + "loss": 0.7396, + "step": 228190 + }, + { + "epoch": 1.4579047570371695, + "grad_norm": 0.8339453935623169, + "learning_rate": 1.7092009796496424e-05, + "loss": 0.7492, + "step": 228200 + }, + { + "epoch": 1.4579686441869082, + "grad_norm": 1.2756327390670776, + "learning_rate": 1.708823225625839e-05, + "loss": 0.8155, + "step": 228210 + }, + { + "epoch": 1.458032531336647, + "grad_norm": 1.3156120777130127, + "learning_rate": 1.7084455047470317e-05, + "loss": 0.7071, + "step": 228220 + }, + { + "epoch": 1.4580964184863856, + "grad_norm": 0.874346137046814, + "learning_rate": 1.708067817017027e-05, + "loss": 0.8141, + "step": 228230 + }, + { + "epoch": 1.4581603056361243, + "grad_norm": 0.44407644867897034, + "learning_rate": 1.7076901624396265e-05, + "loss": 0.9211, + "step": 228240 + }, + { + "epoch": 1.458224192785863, + "grad_norm": 1.41887629032135, + "learning_rate": 1.7073125410186346e-05, + "loss": 0.9911, + "step": 228250 + }, + { + "epoch": 1.4582880799356017, + "grad_norm": 1.2208428382873535, + "learning_rate": 1.7069349527578534e-05, 
+ "loss": 0.8122, + "step": 228260 + }, + { + "epoch": 1.4583519670853404, + "grad_norm": 1.1697001457214355, + "learning_rate": 1.7065573976610876e-05, + "loss": 1.1614, + "step": 228270 + }, + { + "epoch": 1.4584158542350791, + "grad_norm": 0.5886402130126953, + "learning_rate": 1.706179875732136e-05, + "loss": 0.728, + "step": 228280 + }, + { + "epoch": 1.4584797413848178, + "grad_norm": 0.9518161416053772, + "learning_rate": 1.7058023869748048e-05, + "loss": 0.8771, + "step": 228290 + }, + { + "epoch": 1.4585436285345565, + "grad_norm": 0.942564845085144, + "learning_rate": 1.7054249313928917e-05, + "loss": 1.1118, + "step": 228300 + }, + { + "epoch": 1.4586075156842953, + "grad_norm": 1.543463945388794, + "learning_rate": 1.7050475089902014e-05, + "loss": 0.8808, + "step": 228310 + }, + { + "epoch": 1.458671402834034, + "grad_norm": 2.35699200630188, + "learning_rate": 1.7046701197705313e-05, + "loss": 0.718, + "step": 228320 + }, + { + "epoch": 1.4587352899837727, + "grad_norm": 0.9139242768287659, + "learning_rate": 1.704292763737684e-05, + "loss": 0.8443, + "step": 228330 + }, + { + "epoch": 1.4587991771335114, + "grad_norm": 0.9385525584220886, + "learning_rate": 1.703915440895461e-05, + "loss": 0.9222, + "step": 228340 + }, + { + "epoch": 1.45886306428325, + "grad_norm": 0.983020007610321, + "learning_rate": 1.70353815124766e-05, + "loss": 0.964, + "step": 228350 + }, + { + "epoch": 1.4589269514329888, + "grad_norm": 0.8044835329055786, + "learning_rate": 1.7031608947980833e-05, + "loss": 0.8631, + "step": 228360 + }, + { + "epoch": 1.4589908385827275, + "grad_norm": 1.0413472652435303, + "learning_rate": 1.702783671550527e-05, + "loss": 1.0598, + "step": 228370 + }, + { + "epoch": 1.4590547257324662, + "grad_norm": 1.6564788818359375, + "learning_rate": 1.702406481508793e-05, + "loss": 0.8751, + "step": 228380 + }, + { + "epoch": 1.459118612882205, + "grad_norm": 1.0101597309112549, + "learning_rate": 1.702029324676677e-05, + "loss": 0.8339, + "step": 
228390 + }, + { + "epoch": 1.4591825000319436, + "grad_norm": 1.1082416772842407, + "learning_rate": 1.7016522010579806e-05, + "loss": 0.9407, + "step": 228400 + }, + { + "epoch": 1.4592463871816823, + "grad_norm": 0.9075333476066589, + "learning_rate": 1.7012751106564978e-05, + "loss": 0.8674, + "step": 228410 + }, + { + "epoch": 1.459310274331421, + "grad_norm": 1.3690296411514282, + "learning_rate": 1.700898053476031e-05, + "loss": 1.0353, + "step": 228420 + }, + { + "epoch": 1.4593741614811597, + "grad_norm": 0.9637429714202881, + "learning_rate": 1.7005210295203727e-05, + "loss": 0.9846, + "step": 228430 + }, + { + "epoch": 1.4594380486308984, + "grad_norm": 0.9706971049308777, + "learning_rate": 1.700144038793324e-05, + "loss": 0.7751, + "step": 228440 + }, + { + "epoch": 1.4595019357806371, + "grad_norm": 1.0732990503311157, + "learning_rate": 1.6997670812986776e-05, + "loss": 0.9152, + "step": 228450 + }, + { + "epoch": 1.4595658229303758, + "grad_norm": 1.644718885421753, + "learning_rate": 1.6993901570402337e-05, + "loss": 1.0086, + "step": 228460 + }, + { + "epoch": 1.4596297100801146, + "grad_norm": 1.098197340965271, + "learning_rate": 1.6990132660217845e-05, + "loss": 0.7548, + "step": 228470 + }, + { + "epoch": 1.4596935972298533, + "grad_norm": 0.7047260403633118, + "learning_rate": 1.6986364082471294e-05, + "loss": 0.9532, + "step": 228480 + }, + { + "epoch": 1.459757484379592, + "grad_norm": 0.8027589917182922, + "learning_rate": 1.6982595837200598e-05, + "loss": 0.8873, + "step": 228490 + }, + { + "epoch": 1.4598213715293307, + "grad_norm": 3.7240960597991943, + "learning_rate": 1.6978827924443747e-05, + "loss": 0.9685, + "step": 228500 + }, + { + "epoch": 1.4598852586790692, + "grad_norm": 1.015761375427246, + "learning_rate": 1.6975060344238645e-05, + "loss": 0.9572, + "step": 228510 + }, + { + "epoch": 1.459949145828808, + "grad_norm": 0.8119301795959473, + "learning_rate": 1.6971293096623276e-05, + "loss": 0.7355, + "step": 228520 + }, + { + 
"epoch": 1.4600130329785466, + "grad_norm": 1.2989156246185303, + "learning_rate": 1.696752618163554e-05, + "loss": 0.7846, + "step": 228530 + }, + { + "epoch": 1.4600769201282855, + "grad_norm": 0.833233118057251, + "learning_rate": 1.6963759599313394e-05, + "loss": 1.2733, + "step": 228540 + }, + { + "epoch": 1.460140807278024, + "grad_norm": 0.9700032472610474, + "learning_rate": 1.6959993349694785e-05, + "loss": 0.8575, + "step": 228550 + }, + { + "epoch": 1.460204694427763, + "grad_norm": 0.8506675958633423, + "learning_rate": 1.6956227432817613e-05, + "loss": 1.049, + "step": 228560 + }, + { + "epoch": 1.4602685815775014, + "grad_norm": 1.2259025573730469, + "learning_rate": 1.695246184871983e-05, + "loss": 0.7984, + "step": 228570 + }, + { + "epoch": 1.4603324687272403, + "grad_norm": 1.0135867595672607, + "learning_rate": 1.6948696597439333e-05, + "loss": 1.1484, + "step": 228580 + }, + { + "epoch": 1.4603963558769788, + "grad_norm": 0.8687092661857605, + "learning_rate": 1.694493167901407e-05, + "loss": 0.6181, + "step": 228590 + }, + { + "epoch": 1.4604602430267177, + "grad_norm": 1.1422390937805176, + "learning_rate": 1.6941167093481923e-05, + "loss": 1.0024, + "step": 228600 + }, + { + "epoch": 1.4605241301764562, + "grad_norm": 0.9797633290290833, + "learning_rate": 1.6937402840880846e-05, + "loss": 0.7148, + "step": 228610 + }, + { + "epoch": 1.4605880173261951, + "grad_norm": 0.5749666094779968, + "learning_rate": 1.69336389212487e-05, + "loss": 0.8237, + "step": 228620 + }, + { + "epoch": 1.4606519044759336, + "grad_norm": 0.5369682908058167, + "learning_rate": 1.692987533462344e-05, + "loss": 0.7458, + "step": 228630 + }, + { + "epoch": 1.4607157916256723, + "grad_norm": 1.1534923315048218, + "learning_rate": 1.6926112081042926e-05, + "loss": 0.7847, + "step": 228640 + }, + { + "epoch": 1.460779678775411, + "grad_norm": 0.7996623516082764, + "learning_rate": 1.6922349160545096e-05, + "loss": 0.8961, + "step": 228650 + }, + { + "epoch": 
1.4608435659251497, + "grad_norm": 1.0210667848587036, + "learning_rate": 1.6918586573167804e-05, + "loss": 0.7339, + "step": 228660 + }, + { + "epoch": 1.4609074530748885, + "grad_norm": 2.753202199935913, + "learning_rate": 1.691482431894899e-05, + "loss": 0.6541, + "step": 228670 + }, + { + "epoch": 1.4609713402246272, + "grad_norm": 0.8186949491500854, + "learning_rate": 1.6911062397926515e-05, + "loss": 0.6773, + "step": 228680 + }, + { + "epoch": 1.4610352273743659, + "grad_norm": 0.6040871739387512, + "learning_rate": 1.6907300810138245e-05, + "loss": 0.8906, + "step": 228690 + }, + { + "epoch": 1.4610991145241046, + "grad_norm": 0.9389637112617493, + "learning_rate": 1.6903539555622106e-05, + "loss": 0.9025, + "step": 228700 + }, + { + "epoch": 1.4611630016738433, + "grad_norm": 0.7031495571136475, + "learning_rate": 1.6899778634415934e-05, + "loss": 0.7657, + "step": 228710 + }, + { + "epoch": 1.461226888823582, + "grad_norm": 0.6498950123786926, + "learning_rate": 1.6896018046557655e-05, + "loss": 0.8931, + "step": 228720 + }, + { + "epoch": 1.4612907759733207, + "grad_norm": 2.9347357749938965, + "learning_rate": 1.6892257792085086e-05, + "loss": 0.6996, + "step": 228730 + }, + { + "epoch": 1.4613546631230594, + "grad_norm": 0.7390297651290894, + "learning_rate": 1.6888497871036148e-05, + "loss": 0.8342, + "step": 228740 + }, + { + "epoch": 1.461418550272798, + "grad_norm": 0.6747456192970276, + "learning_rate": 1.6884738283448658e-05, + "loss": 0.6615, + "step": 228750 + }, + { + "epoch": 1.4614824374225368, + "grad_norm": 1.3063819408416748, + "learning_rate": 1.6880979029360523e-05, + "loss": 0.9759, + "step": 228760 + }, + { + "epoch": 1.4615463245722755, + "grad_norm": 1.4095808267593384, + "learning_rate": 1.6877220108809567e-05, + "loss": 0.8749, + "step": 228770 + }, + { + "epoch": 1.4616102117220142, + "grad_norm": 0.8128808736801147, + "learning_rate": 1.6873461521833672e-05, + "loss": 0.9157, + "step": 228780 + }, + { + "epoch": 
1.461674098871753, + "grad_norm": 0.7370621562004089, + "learning_rate": 1.686970326847066e-05, + "loss": 0.8539, + "step": 228790 + }, + { + "epoch": 1.4617379860214916, + "grad_norm": 1.128772497177124, + "learning_rate": 1.6865945348758417e-05, + "loss": 1.0237, + "step": 228800 + }, + { + "epoch": 1.4618018731712303, + "grad_norm": 0.8914177417755127, + "learning_rate": 1.6862187762734755e-05, + "loss": 0.9627, + "step": 228810 + }, + { + "epoch": 1.461865760320969, + "grad_norm": 0.8150872588157654, + "learning_rate": 1.6858430510437544e-05, + "loss": 0.738, + "step": 228820 + }, + { + "epoch": 1.4619296474707077, + "grad_norm": 0.9016690850257874, + "learning_rate": 1.6854673591904597e-05, + "loss": 0.9736, + "step": 228830 + }, + { + "epoch": 1.4619935346204465, + "grad_norm": 0.8050869703292847, + "learning_rate": 1.685091700717377e-05, + "loss": 0.9467, + "step": 228840 + }, + { + "epoch": 1.4620574217701852, + "grad_norm": 1.197709560394287, + "learning_rate": 1.6847160756282875e-05, + "loss": 0.8094, + "step": 228850 + }, + { + "epoch": 1.4621213089199239, + "grad_norm": 0.8978109359741211, + "learning_rate": 1.6843404839269754e-05, + "loss": 1.0452, + "step": 228860 + }, + { + "epoch": 1.4621851960696626, + "grad_norm": 0.9194414019584656, + "learning_rate": 1.6839649256172245e-05, + "loss": 0.9372, + "step": 228870 + }, + { + "epoch": 1.4622490832194013, + "grad_norm": 0.8657510280609131, + "learning_rate": 1.683589400702814e-05, + "loss": 0.8444, + "step": 228880 + }, + { + "epoch": 1.46231297036914, + "grad_norm": 0.8180578351020813, + "learning_rate": 1.6832139091875293e-05, + "loss": 0.8992, + "step": 228890 + }, + { + "epoch": 1.4623768575188787, + "grad_norm": 0.7911380529403687, + "learning_rate": 1.6828384510751478e-05, + "loss": 0.9684, + "step": 228900 + }, + { + "epoch": 1.4624407446686174, + "grad_norm": 0.9842470288276672, + "learning_rate": 1.6824630263694553e-05, + "loss": 0.8405, + "step": 228910 + }, + { + "epoch": 1.462504631818356, + 
"grad_norm": 1.0976924896240234, + "learning_rate": 1.6820876350742277e-05, + "loss": 1.0123, + "step": 228920 + }, + { + "epoch": 1.4625685189680948, + "grad_norm": 0.8579769730567932, + "learning_rate": 1.6817122771932498e-05, + "loss": 0.728, + "step": 228930 + }, + { + "epoch": 1.4626324061178335, + "grad_norm": 0.9228938221931458, + "learning_rate": 1.6813369527302986e-05, + "loss": 0.9207, + "step": 228940 + }, + { + "epoch": 1.4626962932675722, + "grad_norm": 1.1510818004608154, + "learning_rate": 1.6809616616891567e-05, + "loss": 1.0057, + "step": 228950 + }, + { + "epoch": 1.462760180417311, + "grad_norm": 0.5346214771270752, + "learning_rate": 1.680586404073601e-05, + "loss": 0.9359, + "step": 228960 + }, + { + "epoch": 1.4628240675670496, + "grad_norm": 0.8829362392425537, + "learning_rate": 1.6802111798874133e-05, + "loss": 0.792, + "step": 228970 + }, + { + "epoch": 1.4628879547167883, + "grad_norm": 1.4143726825714111, + "learning_rate": 1.6798359891343697e-05, + "loss": 0.8763, + "step": 228980 + }, + { + "epoch": 1.462951841866527, + "grad_norm": 1.787758231163025, + "learning_rate": 1.6794608318182508e-05, + "loss": 0.9209, + "step": 228990 + }, + { + "epoch": 1.4630157290162655, + "grad_norm": 0.9715995788574219, + "learning_rate": 1.6790857079428334e-05, + "loss": 0.6902, + "step": 229000 + }, + { + "epoch": 1.4630796161660045, + "grad_norm": 0.8735285401344299, + "learning_rate": 1.6787106175118973e-05, + "loss": 1.0034, + "step": 229010 + }, + { + "epoch": 1.463143503315743, + "grad_norm": 2.247833728790283, + "learning_rate": 1.6783355605292166e-05, + "loss": 0.9573, + "step": 229020 + }, + { + "epoch": 1.4632073904654819, + "grad_norm": 1.173517107963562, + "learning_rate": 1.6779605369985724e-05, + "loss": 0.8683, + "step": 229030 + }, + { + "epoch": 1.4632712776152204, + "grad_norm": 0.9730435609817505, + "learning_rate": 1.6775855469237377e-05, + "loss": 0.7438, + "step": 229040 + }, + { + "epoch": 1.4633351647649593, + "grad_norm": 
1.0451921224594116, + "learning_rate": 1.6772105903084924e-05, + "loss": 1.0182, + "step": 229050 + }, + { + "epoch": 1.4633990519146978, + "grad_norm": 0.7566806077957153, + "learning_rate": 1.6768356671566098e-05, + "loss": 0.8164, + "step": 229060 + }, + { + "epoch": 1.4634629390644367, + "grad_norm": 0.6601537466049194, + "learning_rate": 1.6764607774718666e-05, + "loss": 0.7758, + "step": 229070 + }, + { + "epoch": 1.4635268262141752, + "grad_norm": 0.9205562472343445, + "learning_rate": 1.6760859212580403e-05, + "loss": 0.69, + "step": 229080 + }, + { + "epoch": 1.463590713363914, + "grad_norm": 1.6375494003295898, + "learning_rate": 1.6757110985189035e-05, + "loss": 0.8699, + "step": 229090 + }, + { + "epoch": 1.4636546005136526, + "grad_norm": 1.4572193622589111, + "learning_rate": 1.675336309258233e-05, + "loss": 0.7878, + "step": 229100 + }, + { + "epoch": 1.4637184876633915, + "grad_norm": 0.8702135682106018, + "learning_rate": 1.6749615534798003e-05, + "loss": 0.6901, + "step": 229110 + }, + { + "epoch": 1.46378237481313, + "grad_norm": 0.8251786828041077, + "learning_rate": 1.674586831187383e-05, + "loss": 0.7376, + "step": 229120 + }, + { + "epoch": 1.4638462619628687, + "grad_norm": 0.8899903297424316, + "learning_rate": 1.674212142384752e-05, + "loss": 0.7067, + "step": 229130 + }, + { + "epoch": 1.4639101491126074, + "grad_norm": 0.600141704082489, + "learning_rate": 1.673837487075683e-05, + "loss": 0.8646, + "step": 229140 + }, + { + "epoch": 1.4639740362623461, + "grad_norm": 1.1960006952285767, + "learning_rate": 1.673462865263948e-05, + "loss": 0.8086, + "step": 229150 + }, + { + "epoch": 1.4640379234120848, + "grad_norm": 2.2285172939300537, + "learning_rate": 1.67308827695332e-05, + "loss": 0.6467, + "step": 229160 + }, + { + "epoch": 1.4641018105618235, + "grad_norm": 1.7123388051986694, + "learning_rate": 1.6727137221475696e-05, + "loss": 0.9564, + "step": 229170 + }, + { + "epoch": 1.4641656977115622, + "grad_norm": 1.2680044174194336, + 
"learning_rate": 1.6723392008504707e-05, + "loss": 0.9674, + "step": 229180 + }, + { + "epoch": 1.464229584861301, + "grad_norm": 2.0865049362182617, + "learning_rate": 1.6719647130657966e-05, + "loss": 0.8682, + "step": 229190 + }, + { + "epoch": 1.4642934720110397, + "grad_norm": 0.9947324395179749, + "learning_rate": 1.6715902587973154e-05, + "loss": 0.6539, + "step": 229200 + }, + { + "epoch": 1.4643573591607784, + "grad_norm": 0.7031934261322021, + "learning_rate": 1.6712158380488007e-05, + "loss": 0.9585, + "step": 229210 + }, + { + "epoch": 1.464421246310517, + "grad_norm": 1.1023660898208618, + "learning_rate": 1.670841450824021e-05, + "loss": 0.8712, + "step": 229220 + }, + { + "epoch": 1.4644851334602558, + "grad_norm": 0.5734823942184448, + "learning_rate": 1.67046709712675e-05, + "loss": 0.7178, + "step": 229230 + }, + { + "epoch": 1.4645490206099945, + "grad_norm": 1.032142996788025, + "learning_rate": 1.6700927769607544e-05, + "loss": 0.7241, + "step": 229240 + }, + { + "epoch": 1.4646129077597332, + "grad_norm": 1.1997605562210083, + "learning_rate": 1.6697184903298062e-05, + "loss": 0.7722, + "step": 229250 + }, + { + "epoch": 1.464676794909472, + "grad_norm": 0.8793839812278748, + "learning_rate": 1.6693442372376727e-05, + "loss": 0.9902, + "step": 229260 + }, + { + "epoch": 1.4647406820592106, + "grad_norm": 0.8298856019973755, + "learning_rate": 1.6689700176881256e-05, + "loss": 0.7513, + "step": 229270 + }, + { + "epoch": 1.4648045692089493, + "grad_norm": 0.7414980530738831, + "learning_rate": 1.6685958316849304e-05, + "loss": 0.8875, + "step": 229280 + }, + { + "epoch": 1.464868456358688, + "grad_norm": 1.0754121541976929, + "learning_rate": 1.6682216792318595e-05, + "loss": 0.8855, + "step": 229290 + }, + { + "epoch": 1.4649323435084267, + "grad_norm": 0.7558190226554871, + "learning_rate": 1.6678475603326767e-05, + "loss": 0.7526, + "step": 229300 + }, + { + "epoch": 1.4649962306581654, + "grad_norm": 2.532780170440674, + "learning_rate": 
1.667473474991153e-05, + "loss": 1.0711, + "step": 229310 + }, + { + "epoch": 1.4650601178079041, + "grad_norm": 0.7748706340789795, + "learning_rate": 1.667099423211053e-05, + "loss": 0.7864, + "step": 229320 + }, + { + "epoch": 1.4651240049576428, + "grad_norm": 0.6396449208259583, + "learning_rate": 1.6667254049961472e-05, + "loss": 0.8154, + "step": 229330 + }, + { + "epoch": 1.4651878921073815, + "grad_norm": 0.7421424388885498, + "learning_rate": 1.6663514203501985e-05, + "loss": 0.8693, + "step": 229340 + }, + { + "epoch": 1.4652517792571202, + "grad_norm": 1.017627239227295, + "learning_rate": 1.6659774692769763e-05, + "loss": 0.8065, + "step": 229350 + }, + { + "epoch": 1.465315666406859, + "grad_norm": 1.8426539897918701, + "learning_rate": 1.6656035517802442e-05, + "loss": 0.882, + "step": 229360 + }, + { + "epoch": 1.4653795535565977, + "grad_norm": 0.957169234752655, + "learning_rate": 1.6652296678637704e-05, + "loss": 0.7843, + "step": 229370 + }, + { + "epoch": 1.4654434407063364, + "grad_norm": 1.5271551609039307, + "learning_rate": 1.6648558175313167e-05, + "loss": 0.8364, + "step": 229380 + }, + { + "epoch": 1.465507327856075, + "grad_norm": 0.784913957118988, + "learning_rate": 1.664482000786651e-05, + "loss": 0.7729, + "step": 229390 + }, + { + "epoch": 1.4655712150058138, + "grad_norm": 0.7997502088546753, + "learning_rate": 1.6641082176335383e-05, + "loss": 0.9061, + "step": 229400 + }, + { + "epoch": 1.4656351021555525, + "grad_norm": 0.7640141844749451, + "learning_rate": 1.6637344680757406e-05, + "loss": 0.8026, + "step": 229410 + }, + { + "epoch": 1.4656989893052912, + "grad_norm": 1.3364169597625732, + "learning_rate": 1.663360752117024e-05, + "loss": 0.9229, + "step": 229420 + }, + { + "epoch": 1.46576287645503, + "grad_norm": 1.1318371295928955, + "learning_rate": 1.6629870697611503e-05, + "loss": 0.9731, + "step": 229430 + }, + { + "epoch": 1.4658267636047686, + "grad_norm": 0.7715478539466858, + "learning_rate": 
1.6626134210118848e-05, + "loss": 0.8884, + "step": 229440 + }, + { + "epoch": 1.4658906507545073, + "grad_norm": 2.0042197704315186, + "learning_rate": 1.6622398058729883e-05, + "loss": 1.0224, + "step": 229450 + }, + { + "epoch": 1.465954537904246, + "grad_norm": 3.34346342086792, + "learning_rate": 1.6618662243482263e-05, + "loss": 0.908, + "step": 229460 + }, + { + "epoch": 1.4660184250539847, + "grad_norm": 0.6757034659385681, + "learning_rate": 1.6614926764413574e-05, + "loss": 1.0681, + "step": 229470 + }, + { + "epoch": 1.4660823122037234, + "grad_norm": 1.661514401435852, + "learning_rate": 1.6611191621561467e-05, + "loss": 1.1794, + "step": 229480 + }, + { + "epoch": 1.466146199353462, + "grad_norm": 0.9512356519699097, + "learning_rate": 1.6607456814963534e-05, + "loss": 0.7131, + "step": 229490 + }, + { + "epoch": 1.4662100865032008, + "grad_norm": 1.0155463218688965, + "learning_rate": 1.6603722344657413e-05, + "loss": 0.726, + "step": 229500 + }, + { + "epoch": 1.4662739736529393, + "grad_norm": 1.3397324085235596, + "learning_rate": 1.6599988210680683e-05, + "loss": 0.9391, + "step": 229510 + }, + { + "epoch": 1.4663378608026783, + "grad_norm": 1.2345083951950073, + "learning_rate": 1.659625441307099e-05, + "loss": 0.8885, + "step": 229520 + }, + { + "epoch": 1.4664017479524167, + "grad_norm": 1.044854998588562, + "learning_rate": 1.659252095186589e-05, + "loss": 0.778, + "step": 229530 + }, + { + "epoch": 1.4664656351021557, + "grad_norm": 0.9372422099113464, + "learning_rate": 1.6588787827103025e-05, + "loss": 0.7025, + "step": 229540 + }, + { + "epoch": 1.4665295222518941, + "grad_norm": 1.096632719039917, + "learning_rate": 1.658505503881996e-05, + "loss": 0.9675, + "step": 229550 + }, + { + "epoch": 1.466593409401633, + "grad_norm": 1.0477042198181152, + "learning_rate": 1.6581322587054304e-05, + "loss": 0.8676, + "step": 229560 + }, + { + "epoch": 1.4666572965513716, + "grad_norm": 1.234499216079712, + "learning_rate": 1.6577590471843628e-05, + 
"loss": 0.9303, + "step": 229570 + }, + { + "epoch": 1.4667211837011105, + "grad_norm": 0.9738841652870178, + "learning_rate": 1.6573858693225536e-05, + "loss": 0.7914, + "step": 229580 + }, + { + "epoch": 1.466785070850849, + "grad_norm": 0.8118196725845337, + "learning_rate": 1.6570127251237622e-05, + "loss": 0.9868, + "step": 229590 + }, + { + "epoch": 1.466848958000588, + "grad_norm": 0.6751111149787903, + "learning_rate": 1.6566396145917424e-05, + "loss": 0.801, + "step": 229600 + }, + { + "epoch": 1.4669128451503264, + "grad_norm": 1.2858651876449585, + "learning_rate": 1.656266537730256e-05, + "loss": 0.932, + "step": 229610 + }, + { + "epoch": 1.466976732300065, + "grad_norm": 0.7679579854011536, + "learning_rate": 1.6558934945430564e-05, + "loss": 1.07, + "step": 229620 + }, + { + "epoch": 1.4670406194498038, + "grad_norm": 0.7404538989067078, + "learning_rate": 1.6555204850339047e-05, + "loss": 0.7723, + "step": 229630 + }, + { + "epoch": 1.4671045065995425, + "grad_norm": 0.8464112281799316, + "learning_rate": 1.6551475092065543e-05, + "loss": 0.8583, + "step": 229640 + }, + { + "epoch": 1.4671683937492812, + "grad_norm": 1.2769086360931396, + "learning_rate": 1.654774567064763e-05, + "loss": 0.8978, + "step": 229650 + }, + { + "epoch": 1.46723228089902, + "grad_norm": 0.8352043032646179, + "learning_rate": 1.6544016586122835e-05, + "loss": 0.7327, + "step": 229660 + }, + { + "epoch": 1.4672961680487586, + "grad_norm": 0.8383384943008423, + "learning_rate": 1.6540287838528756e-05, + "loss": 1.1571, + "step": 229670 + }, + { + "epoch": 1.4673600551984973, + "grad_norm": 1.0359272956848145, + "learning_rate": 1.653655942790291e-05, + "loss": 0.8446, + "step": 229680 + }, + { + "epoch": 1.467423942348236, + "grad_norm": 2.1989591121673584, + "learning_rate": 1.6532831354282874e-05, + "loss": 1.0813, + "step": 229690 + }, + { + "epoch": 1.4674878294979747, + "grad_norm": 2.8567075729370117, + "learning_rate": 1.6529103617706165e-05, + "loss": 0.9508, + 
"step": 229700 + }, + { + "epoch": 1.4675517166477134, + "grad_norm": 1.0511747598648071, + "learning_rate": 1.652537621821034e-05, + "loss": 0.7498, + "step": 229710 + }, + { + "epoch": 1.4676156037974522, + "grad_norm": 1.0333011150360107, + "learning_rate": 1.6521649155832953e-05, + "loss": 0.8748, + "step": 229720 + }, + { + "epoch": 1.4676794909471909, + "grad_norm": 1.0516149997711182, + "learning_rate": 1.6517922430611503e-05, + "loss": 0.9187, + "step": 229730 + }, + { + "epoch": 1.4677433780969296, + "grad_norm": 0.6968880891799927, + "learning_rate": 1.6514196042583556e-05, + "loss": 0.8594, + "step": 229740 + }, + { + "epoch": 1.4678072652466683, + "grad_norm": 1.0043814182281494, + "learning_rate": 1.6510469991786608e-05, + "loss": 0.6192, + "step": 229750 + }, + { + "epoch": 1.467871152396407, + "grad_norm": 0.7312933802604675, + "learning_rate": 1.6506744278258217e-05, + "loss": 0.9998, + "step": 229760 + }, + { + "epoch": 1.4679350395461457, + "grad_norm": 1.1748160123825073, + "learning_rate": 1.6503018902035872e-05, + "loss": 1.02, + "step": 229770 + }, + { + "epoch": 1.4679989266958844, + "grad_norm": 0.9027350544929504, + "learning_rate": 1.649929386315712e-05, + "loss": 0.6065, + "step": 229780 + }, + { + "epoch": 1.468062813845623, + "grad_norm": 0.8316593766212463, + "learning_rate": 1.6495569161659454e-05, + "loss": 1.1349, + "step": 229790 + }, + { + "epoch": 1.4681267009953618, + "grad_norm": 0.7100276947021484, + "learning_rate": 1.6491844797580396e-05, + "loss": 0.8488, + "step": 229800 + }, + { + "epoch": 1.4681905881451005, + "grad_norm": 0.6455823183059692, + "learning_rate": 1.648812077095744e-05, + "loss": 0.9165, + "step": 229810 + }, + { + "epoch": 1.4682544752948392, + "grad_norm": 0.9278150796890259, + "learning_rate": 1.6484397081828105e-05, + "loss": 0.9549, + "step": 229820 + }, + { + "epoch": 1.468318362444578, + "grad_norm": 0.804965615272522, + "learning_rate": 1.6480673730229885e-05, + "loss": 0.8013, + "step": 229830 + }, 
+ { + "epoch": 1.4683822495943166, + "grad_norm": 2.876382350921631, + "learning_rate": 1.6476950716200284e-05, + "loss": 1.0493, + "step": 229840 + }, + { + "epoch": 1.4684461367440553, + "grad_norm": 0.7755618691444397, + "learning_rate": 1.6473228039776782e-05, + "loss": 0.9287, + "step": 229850 + }, + { + "epoch": 1.468510023893794, + "grad_norm": 1.1979588270187378, + "learning_rate": 1.646950570099689e-05, + "loss": 1.0427, + "step": 229860 + }, + { + "epoch": 1.4685739110435327, + "grad_norm": 0.9614741802215576, + "learning_rate": 1.6465783699898074e-05, + "loss": 0.8418, + "step": 229870 + }, + { + "epoch": 1.4686377981932714, + "grad_norm": 0.9963449835777283, + "learning_rate": 1.646206203651784e-05, + "loss": 0.8061, + "step": 229880 + }, + { + "epoch": 1.4687016853430102, + "grad_norm": 1.0735199451446533, + "learning_rate": 1.6458340710893632e-05, + "loss": 0.9624, + "step": 229890 + }, + { + "epoch": 1.4687655724927489, + "grad_norm": 0.8822026252746582, + "learning_rate": 1.6454619723062976e-05, + "loss": 1.2394, + "step": 229900 + }, + { + "epoch": 1.4688294596424876, + "grad_norm": 0.9153038859367371, + "learning_rate": 1.6450899073063303e-05, + "loss": 0.9479, + "step": 229910 + }, + { + "epoch": 1.4688933467922263, + "grad_norm": 1.3811986446380615, + "learning_rate": 1.6447178760932096e-05, + "loss": 0.9423, + "step": 229920 + }, + { + "epoch": 1.468957233941965, + "grad_norm": 1.9339368343353271, + "learning_rate": 1.6443458786706845e-05, + "loss": 0.8623, + "step": 229930 + }, + { + "epoch": 1.4690211210917037, + "grad_norm": 1.1579439640045166, + "learning_rate": 1.6439739150424982e-05, + "loss": 0.8004, + "step": 229940 + }, + { + "epoch": 1.4690850082414424, + "grad_norm": 0.6219136714935303, + "learning_rate": 1.643601985212399e-05, + "loss": 0.8937, + "step": 229950 + }, + { + "epoch": 1.469148895391181, + "grad_norm": 0.7490658760070801, + "learning_rate": 1.64323008918413e-05, + "loss": 0.8881, + "step": 229960 + }, + { + "epoch": 
1.4692127825409198, + "grad_norm": 0.9120808839797974, + "learning_rate": 1.64285822696144e-05, + "loss": 1.4033, + "step": 229970 + }, + { + "epoch": 1.4692766696906583, + "grad_norm": 1.0183281898498535, + "learning_rate": 1.6424863985480697e-05, + "loss": 0.9336, + "step": 229980 + }, + { + "epoch": 1.4693405568403972, + "grad_norm": 1.4285860061645508, + "learning_rate": 1.6421146039477685e-05, + "loss": 1.0088, + "step": 229990 + }, + { + "epoch": 1.4694044439901357, + "grad_norm": 0.916454553604126, + "learning_rate": 1.641742843164276e-05, + "loss": 1.2467, + "step": 230000 + }, + { + "epoch": 1.4694683311398746, + "grad_norm": 0.772546648979187, + "learning_rate": 1.64137111620134e-05, + "loss": 0.8583, + "step": 230010 + }, + { + "epoch": 1.4695322182896131, + "grad_norm": 0.8761088252067566, + "learning_rate": 1.640999423062701e-05, + "loss": 0.969, + "step": 230020 + }, + { + "epoch": 1.469596105439352, + "grad_norm": 1.112427830696106, + "learning_rate": 1.6406277637521055e-05, + "loss": 1.0489, + "step": 230030 + }, + { + "epoch": 1.4696599925890905, + "grad_norm": 0.8705644607543945, + "learning_rate": 1.6402561382732933e-05, + "loss": 0.9679, + "step": 230040 + }, + { + "epoch": 1.4697238797388295, + "grad_norm": 1.6397324800491333, + "learning_rate": 1.6398845466300094e-05, + "loss": 1.2361, + "step": 230050 + }, + { + "epoch": 1.469787766888568, + "grad_norm": 2.675877332687378, + "learning_rate": 1.6395129888259942e-05, + "loss": 0.8386, + "step": 230060 + }, + { + "epoch": 1.4698516540383069, + "grad_norm": 1.344370722770691, + "learning_rate": 1.6391414648649915e-05, + "loss": 0.76, + "step": 230070 + }, + { + "epoch": 1.4699155411880453, + "grad_norm": 1.809248447418213, + "learning_rate": 1.6387699747507402e-05, + "loss": 0.9025, + "step": 230080 + }, + { + "epoch": 1.469979428337784, + "grad_norm": 0.7666621804237366, + "learning_rate": 1.638398518486985e-05, + "loss": 0.9313, + "step": 230090 + }, + { + "epoch": 1.4700433154875228, + 
"grad_norm": 0.8414475917816162, + "learning_rate": 1.638027096077463e-05, + "loss": 0.844, + "step": 230100 + }, + { + "epoch": 1.4701072026372615, + "grad_norm": 0.6243281960487366, + "learning_rate": 1.637655707525917e-05, + "loss": 0.8231, + "step": 230110 + }, + { + "epoch": 1.4701710897870002, + "grad_norm": 0.6101037859916687, + "learning_rate": 1.637284352836089e-05, + "loss": 0.9168, + "step": 230120 + }, + { + "epoch": 1.4702349769367389, + "grad_norm": 1.7307742834091187, + "learning_rate": 1.636913032011715e-05, + "loss": 0.9839, + "step": 230130 + }, + { + "epoch": 1.4702988640864776, + "grad_norm": 0.975806713104248, + "learning_rate": 1.6365417450565374e-05, + "loss": 0.992, + "step": 230140 + }, + { + "epoch": 1.4703627512362163, + "grad_norm": 1.1599698066711426, + "learning_rate": 1.636170491974292e-05, + "loss": 0.9875, + "step": 230150 + }, + { + "epoch": 1.470426638385955, + "grad_norm": 0.5582944750785828, + "learning_rate": 1.635799272768722e-05, + "loss": 0.8191, + "step": 230160 + }, + { + "epoch": 1.4704905255356937, + "grad_norm": 1.2626965045928955, + "learning_rate": 1.6354280874435624e-05, + "loss": 0.7218, + "step": 230170 + }, + { + "epoch": 1.4705544126854324, + "grad_norm": 0.8063907623291016, + "learning_rate": 1.6350569360025538e-05, + "loss": 0.7508, + "step": 230180 + }, + { + "epoch": 1.4706182998351711, + "grad_norm": 0.47269266843795776, + "learning_rate": 1.634685818449432e-05, + "loss": 0.9022, + "step": 230190 + }, + { + "epoch": 1.4706821869849098, + "grad_norm": 1.0264742374420166, + "learning_rate": 1.634314734787936e-05, + "loss": 0.8665, + "step": 230200 + }, + { + "epoch": 1.4707460741346485, + "grad_norm": 1.1556494235992432, + "learning_rate": 1.6339436850218015e-05, + "loss": 0.7588, + "step": 230210 + }, + { + "epoch": 1.4708099612843872, + "grad_norm": 0.7982380986213684, + "learning_rate": 1.6335726691547674e-05, + "loss": 1.1942, + "step": 230220 + }, + { + "epoch": 1.470873848434126, + "grad_norm": 
0.853100597858429, + "learning_rate": 1.6332016871905676e-05, + "loss": 0.857, + "step": 230230 + }, + { + "epoch": 1.4709377355838646, + "grad_norm": 0.8381783366203308, + "learning_rate": 1.6328307391329394e-05, + "loss": 1.019, + "step": 230240 + }, + { + "epoch": 1.4710016227336034, + "grad_norm": 1.0367997884750366, + "learning_rate": 1.6324598249856204e-05, + "loss": 1.0161, + "step": 230250 + }, + { + "epoch": 1.471065509883342, + "grad_norm": 1.1487911939620972, + "learning_rate": 1.6320889447523425e-05, + "loss": 0.8627, + "step": 230260 + }, + { + "epoch": 1.4711293970330808, + "grad_norm": 1.105258822441101, + "learning_rate": 1.6317180984368442e-05, + "loss": 0.9296, + "step": 230270 + }, + { + "epoch": 1.4711932841828195, + "grad_norm": 0.8666543960571289, + "learning_rate": 1.631347286042857e-05, + "loss": 0.9805, + "step": 230280 + }, + { + "epoch": 1.4712571713325582, + "grad_norm": 1.2708656787872314, + "learning_rate": 1.630976507574119e-05, + "loss": 0.8902, + "step": 230290 + }, + { + "epoch": 1.4713210584822969, + "grad_norm": 0.9094721078872681, + "learning_rate": 1.6306057630343595e-05, + "loss": 0.9106, + "step": 230300 + }, + { + "epoch": 1.4713849456320356, + "grad_norm": 0.9870732426643372, + "learning_rate": 1.6302350524273175e-05, + "loss": 0.8808, + "step": 230310 + }, + { + "epoch": 1.4714488327817743, + "grad_norm": 0.5650733113288879, + "learning_rate": 1.629864375756722e-05, + "loss": 1.1285, + "step": 230320 + }, + { + "epoch": 1.471512719931513, + "grad_norm": 1.3587806224822998, + "learning_rate": 1.6294937330263093e-05, + "loss": 0.9673, + "step": 230330 + }, + { + "epoch": 1.4715766070812517, + "grad_norm": 1.2703903913497925, + "learning_rate": 1.629123124239809e-05, + "loss": 0.7315, + "step": 230340 + }, + { + "epoch": 1.4716404942309904, + "grad_norm": 0.8942323923110962, + "learning_rate": 1.6287525494009565e-05, + "loss": 0.8602, + "step": 230350 + }, + { + "epoch": 1.4717043813807291, + "grad_norm": 0.8783580660820007, 
+ "learning_rate": 1.628382008513481e-05, + "loss": 0.915, + "step": 230360 + }, + { + "epoch": 1.4717682685304678, + "grad_norm": 0.8455802202224731, + "learning_rate": 1.628011501581117e-05, + "loss": 0.7608, + "step": 230370 + }, + { + "epoch": 1.4718321556802065, + "grad_norm": 0.9563354253768921, + "learning_rate": 1.627641028607593e-05, + "loss": 0.799, + "step": 230380 + }, + { + "epoch": 1.4718960428299452, + "grad_norm": 1.0895625352859497, + "learning_rate": 1.6272705895966428e-05, + "loss": 0.7895, + "step": 230390 + }, + { + "epoch": 1.471959929979684, + "grad_norm": 1.0790303945541382, + "learning_rate": 1.626900184551994e-05, + "loss": 0.8456, + "step": 230400 + }, + { + "epoch": 1.4720238171294227, + "grad_norm": 1.095496654510498, + "learning_rate": 1.62652981347738e-05, + "loss": 0.9899, + "step": 230410 + }, + { + "epoch": 1.4720877042791614, + "grad_norm": 1.1831773519515991, + "learning_rate": 1.6261594763765282e-05, + "loss": 0.9879, + "step": 230420 + }, + { + "epoch": 1.4721515914289, + "grad_norm": 0.7561997771263123, + "learning_rate": 1.625789173253168e-05, + "loss": 0.9086, + "step": 230430 + }, + { + "epoch": 1.4722154785786388, + "grad_norm": 1.0318355560302734, + "learning_rate": 1.6254189041110328e-05, + "loss": 0.9287, + "step": 230440 + }, + { + "epoch": 1.4722793657283775, + "grad_norm": 0.9705587029457092, + "learning_rate": 1.6250486689538465e-05, + "loss": 0.8275, + "step": 230450 + }, + { + "epoch": 1.4723432528781162, + "grad_norm": 0.8411953449249268, + "learning_rate": 1.6246784677853415e-05, + "loss": 0.8982, + "step": 230460 + }, + { + "epoch": 1.4724071400278547, + "grad_norm": 0.7869654297828674, + "learning_rate": 1.624308300609243e-05, + "loss": 0.8577, + "step": 230470 + }, + { + "epoch": 1.4724710271775936, + "grad_norm": 1.2532565593719482, + "learning_rate": 1.6239381674292813e-05, + "loss": 1.009, + "step": 230480 + }, + { + "epoch": 1.472534914327332, + "grad_norm": 1.5746049880981445, + "learning_rate": 
1.6235680682491823e-05, + "loss": 0.9964, + "step": 230490 + }, + { + "epoch": 1.472598801477071, + "grad_norm": 0.7447236180305481, + "learning_rate": 1.6231980030726745e-05, + "loss": 0.8126, + "step": 230500 + }, + { + "epoch": 1.4726626886268095, + "grad_norm": 0.9821268916130066, + "learning_rate": 1.6228279719034835e-05, + "loss": 0.7112, + "step": 230510 + }, + { + "epoch": 1.4727265757765484, + "grad_norm": 1.128045678138733, + "learning_rate": 1.6224579747453372e-05, + "loss": 0.9051, + "step": 230520 + }, + { + "epoch": 1.472790462926287, + "grad_norm": 1.0752300024032593, + "learning_rate": 1.6220880116019598e-05, + "loss": 1.0858, + "step": 230530 + }, + { + "epoch": 1.4728543500760258, + "grad_norm": 1.2056881189346313, + "learning_rate": 1.6217180824770807e-05, + "loss": 0.8552, + "step": 230540 + }, + { + "epoch": 1.4729182372257643, + "grad_norm": 1.2044570446014404, + "learning_rate": 1.6213481873744207e-05, + "loss": 0.699, + "step": 230550 + }, + { + "epoch": 1.4729821243755032, + "grad_norm": 0.8839747905731201, + "learning_rate": 1.6209783262977095e-05, + "loss": 0.8121, + "step": 230560 + }, + { + "epoch": 1.4730460115252417, + "grad_norm": 1.1134631633758545, + "learning_rate": 1.6206084992506675e-05, + "loss": 0.6309, + "step": 230570 + }, + { + "epoch": 1.4731098986749804, + "grad_norm": 2.019157648086548, + "learning_rate": 1.6202387062370238e-05, + "loss": 0.6585, + "step": 230580 + }, + { + "epoch": 1.4731737858247191, + "grad_norm": 1.0488548278808594, + "learning_rate": 1.619868947260499e-05, + "loss": 1.065, + "step": 230590 + }, + { + "epoch": 1.4732376729744578, + "grad_norm": 1.4154844284057617, + "learning_rate": 1.619499222324819e-05, + "loss": 0.6922, + "step": 230600 + }, + { + "epoch": 1.4733015601241966, + "grad_norm": 0.7536170482635498, + "learning_rate": 1.6191295314337062e-05, + "loss": 0.7893, + "step": 230610 + }, + { + "epoch": 1.4733654472739353, + "grad_norm": 0.8556029796600342, + "learning_rate": 
1.6187598745908826e-05, + "loss": 0.7962, + "step": 230620 + }, + { + "epoch": 1.473429334423674, + "grad_norm": 1.414725661277771, + "learning_rate": 1.6183902518000744e-05, + "loss": 0.7572, + "step": 230630 + }, + { + "epoch": 1.4734932215734127, + "grad_norm": 1.4327750205993652, + "learning_rate": 1.6180206630649996e-05, + "loss": 0.8687, + "step": 230640 + }, + { + "epoch": 1.4735571087231514, + "grad_norm": 0.8788250088691711, + "learning_rate": 1.6176511083893843e-05, + "loss": 0.8099, + "step": 230650 + }, + { + "epoch": 1.47362099587289, + "grad_norm": 0.7542177438735962, + "learning_rate": 1.6172815877769472e-05, + "loss": 0.8901, + "step": 230660 + }, + { + "epoch": 1.4736848830226288, + "grad_norm": 0.8953860402107239, + "learning_rate": 1.616912101231412e-05, + "loss": 1.1104, + "step": 230670 + }, + { + "epoch": 1.4737487701723675, + "grad_norm": 0.8468190431594849, + "learning_rate": 1.6165426487564972e-05, + "loss": 1.037, + "step": 230680 + }, + { + "epoch": 1.4738126573221062, + "grad_norm": 0.5810591578483582, + "learning_rate": 1.6161732303559267e-05, + "loss": 0.755, + "step": 230690 + }, + { + "epoch": 1.473876544471845, + "grad_norm": 0.9378907084465027, + "learning_rate": 1.615803846033418e-05, + "loss": 0.9148, + "step": 230700 + }, + { + "epoch": 1.4739404316215836, + "grad_norm": 0.8563784956932068, + "learning_rate": 1.6154344957926937e-05, + "loss": 1.0995, + "step": 230710 + }, + { + "epoch": 1.4740043187713223, + "grad_norm": 0.8726912140846252, + "learning_rate": 1.6150651796374706e-05, + "loss": 0.922, + "step": 230720 + }, + { + "epoch": 1.474068205921061, + "grad_norm": 1.8336883783340454, + "learning_rate": 1.614695897571471e-05, + "loss": 1.0036, + "step": 230730 + }, + { + "epoch": 1.4741320930707997, + "grad_norm": 3.0733964443206787, + "learning_rate": 1.6143266495984105e-05, + "loss": 0.7163, + "step": 230740 + }, + { + "epoch": 1.4741959802205384, + "grad_norm": 0.9470083713531494, + "learning_rate": 
1.6139574357220116e-05, + "loss": 0.7829, + "step": 230750 + }, + { + "epoch": 1.4742598673702771, + "grad_norm": 0.9975496530532837, + "learning_rate": 1.613588255945989e-05, + "loss": 0.7918, + "step": 230760 + }, + { + "epoch": 1.4743237545200159, + "grad_norm": 1.0645257234573364, + "learning_rate": 1.6132191102740624e-05, + "loss": 1.0736, + "step": 230770 + }, + { + "epoch": 1.4743876416697546, + "grad_norm": 0.9394400715827942, + "learning_rate": 1.612849998709951e-05, + "loss": 0.8007, + "step": 230780 + }, + { + "epoch": 1.4744515288194933, + "grad_norm": 0.9448645710945129, + "learning_rate": 1.612480921257369e-05, + "loss": 0.85, + "step": 230790 + }, + { + "epoch": 1.474515415969232, + "grad_norm": 1.0276811122894287, + "learning_rate": 1.6121118779200356e-05, + "loss": 0.8035, + "step": 230800 + }, + { + "epoch": 1.4745793031189707, + "grad_norm": 0.8116633892059326, + "learning_rate": 1.6117428687016656e-05, + "loss": 0.6989, + "step": 230810 + }, + { + "epoch": 1.4746431902687094, + "grad_norm": 0.8088856935501099, + "learning_rate": 1.6113738936059774e-05, + "loss": 0.9791, + "step": 230820 + }, + { + "epoch": 1.474707077418448, + "grad_norm": 1.0470901727676392, + "learning_rate": 1.6110049526366843e-05, + "loss": 0.8447, + "step": 230830 + }, + { + "epoch": 1.4747709645681868, + "grad_norm": 1.2791444063186646, + "learning_rate": 1.610636045797505e-05, + "loss": 0.6795, + "step": 230840 + }, + { + "epoch": 1.4748348517179255, + "grad_norm": 0.9264209270477295, + "learning_rate": 1.610267173092151e-05, + "loss": 0.7939, + "step": 230850 + }, + { + "epoch": 1.4748987388676642, + "grad_norm": 0.9389378428459167, + "learning_rate": 1.6098983345243405e-05, + "loss": 0.6904, + "step": 230860 + }, + { + "epoch": 1.474962626017403, + "grad_norm": 1.025505781173706, + "learning_rate": 1.609529530097785e-05, + "loss": 0.8899, + "step": 230870 + }, + { + "epoch": 1.4750265131671416, + "grad_norm": 0.7423310279846191, + "learning_rate": 1.609160759816203e-05, 
+ "loss": 0.9437, + "step": 230880 + }, + { + "epoch": 1.4750904003168803, + "grad_norm": 0.9326030611991882, + "learning_rate": 1.608792023683303e-05, + "loss": 0.8262, + "step": 230890 + }, + { + "epoch": 1.475154287466619, + "grad_norm": 1.2312084436416626, + "learning_rate": 1.6084233217028033e-05, + "loss": 0.7731, + "step": 230900 + }, + { + "epoch": 1.4752181746163577, + "grad_norm": 0.9910838603973389, + "learning_rate": 1.6080546538784124e-05, + "loss": 0.965, + "step": 230910 + }, + { + "epoch": 1.4752820617660964, + "grad_norm": 0.8265828490257263, + "learning_rate": 1.6076860202138483e-05, + "loss": 0.8836, + "step": 230920 + }, + { + "epoch": 1.4753459489158351, + "grad_norm": 0.7746514081954956, + "learning_rate": 1.6073174207128185e-05, + "loss": 0.9498, + "step": 230930 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 1.3665982484817505, + "learning_rate": 1.606948855379039e-05, + "loss": 0.8485, + "step": 230940 + }, + { + "epoch": 1.4754737232153126, + "grad_norm": 0.9096262454986572, + "learning_rate": 1.6065803242162182e-05, + "loss": 1.0628, + "step": 230950 + }, + { + "epoch": 1.475537610365051, + "grad_norm": 1.2939201593399048, + "learning_rate": 1.6062118272280695e-05, + "loss": 0.9087, + "step": 230960 + }, + { + "epoch": 1.47560149751479, + "grad_norm": 0.8000929951667786, + "learning_rate": 1.6058433644183056e-05, + "loss": 0.9918, + "step": 230970 + }, + { + "epoch": 1.4756653846645285, + "grad_norm": 1.0761744976043701, + "learning_rate": 1.6054749357906336e-05, + "loss": 1.0643, + "step": 230980 + }, + { + "epoch": 1.4757292718142674, + "grad_norm": 0.8242314457893372, + "learning_rate": 1.6051065413487672e-05, + "loss": 0.7486, + "step": 230990 + }, + { + "epoch": 1.4757931589640059, + "grad_norm": 1.2581071853637695, + "learning_rate": 1.604738181096413e-05, + "loss": 0.9587, + "step": 231000 + }, + { + "epoch": 1.4758570461137448, + "grad_norm": 1.2790287733078003, + "learning_rate": 1.6043698550372842e-05, + "loss": 0.8309, 
+ "step": 231010 + }, + { + "epoch": 1.4759209332634833, + "grad_norm": 1.0953774452209473, + "learning_rate": 1.6040015631750877e-05, + "loss": 0.8387, + "step": 231020 + }, + { + "epoch": 1.4759848204132222, + "grad_norm": 0.9266490340232849, + "learning_rate": 1.6036333055135344e-05, + "loss": 0.6225, + "step": 231030 + }, + { + "epoch": 1.4760487075629607, + "grad_norm": 0.7282007336616516, + "learning_rate": 1.603265082056331e-05, + "loss": 1.0751, + "step": 231040 + }, + { + "epoch": 1.4761125947126996, + "grad_norm": 0.6650501489639282, + "learning_rate": 1.602896892807188e-05, + "loss": 1.058, + "step": 231050 + }, + { + "epoch": 1.476176481862438, + "grad_norm": 1.3024554252624512, + "learning_rate": 1.6025287377698105e-05, + "loss": 0.8926, + "step": 231060 + }, + { + "epoch": 1.4762403690121768, + "grad_norm": 0.6745061278343201, + "learning_rate": 1.6021606169479098e-05, + "loss": 0.8608, + "step": 231070 + }, + { + "epoch": 1.4763042561619155, + "grad_norm": 0.7933433055877686, + "learning_rate": 1.6017925303451898e-05, + "loss": 1.0975, + "step": 231080 + }, + { + "epoch": 1.4763681433116542, + "grad_norm": 0.8587223291397095, + "learning_rate": 1.6014244779653598e-05, + "loss": 0.8711, + "step": 231090 + }, + { + "epoch": 1.476432030461393, + "grad_norm": 1.0614300966262817, + "learning_rate": 1.6010564598121257e-05, + "loss": 1.0366, + "step": 231100 + }, + { + "epoch": 1.4764959176111316, + "grad_norm": 1.0521320104599, + "learning_rate": 1.6006884758891922e-05, + "loss": 1.0693, + "step": 231110 + }, + { + "epoch": 1.4765598047608703, + "grad_norm": 1.2860974073410034, + "learning_rate": 1.600320526200268e-05, + "loss": 0.6898, + "step": 231120 + }, + { + "epoch": 1.476623691910609, + "grad_norm": 4.338561534881592, + "learning_rate": 1.5999526107490557e-05, + "loss": 0.8247, + "step": 231130 + }, + { + "epoch": 1.4766875790603478, + "grad_norm": 0.7639591693878174, + "learning_rate": 1.5995847295392636e-05, + "loss": 1.2375, + "step": 231140 + }, 
+ { + "epoch": 1.4767514662100865, + "grad_norm": 0.5624483227729797, + "learning_rate": 1.5992168825745933e-05, + "loss": 1.2762, + "step": 231150 + }, + { + "epoch": 1.4768153533598252, + "grad_norm": 0.9101094603538513, + "learning_rate": 1.5988490698587534e-05, + "loss": 0.9293, + "step": 231160 + }, + { + "epoch": 1.4768792405095639, + "grad_norm": 1.2439855337142944, + "learning_rate": 1.5984812913954435e-05, + "loss": 0.8092, + "step": 231170 + }, + { + "epoch": 1.4769431276593026, + "grad_norm": 0.6541301012039185, + "learning_rate": 1.5981135471883713e-05, + "loss": 0.7527, + "step": 231180 + }, + { + "epoch": 1.4770070148090413, + "grad_norm": 1.1231944561004639, + "learning_rate": 1.5977458372412373e-05, + "loss": 0.7811, + "step": 231190 + }, + { + "epoch": 1.47707090195878, + "grad_norm": 1.1022666692733765, + "learning_rate": 1.5973781615577475e-05, + "loss": 0.9701, + "step": 231200 + }, + { + "epoch": 1.4771347891085187, + "grad_norm": 0.6377559900283813, + "learning_rate": 1.5970105201416024e-05, + "loss": 0.9913, + "step": 231210 + }, + { + "epoch": 1.4771986762582574, + "grad_norm": 1.0564450025558472, + "learning_rate": 1.5966429129965066e-05, + "loss": 1.0922, + "step": 231220 + }, + { + "epoch": 1.477262563407996, + "grad_norm": 1.1828644275665283, + "learning_rate": 1.5962753401261595e-05, + "loss": 0.9356, + "step": 231230 + }, + { + "epoch": 1.4773264505577348, + "grad_norm": 1.4484585523605347, + "learning_rate": 1.5959078015342654e-05, + "loss": 1.1591, + "step": 231240 + }, + { + "epoch": 1.4773903377074735, + "grad_norm": 0.9290236830711365, + "learning_rate": 1.5955402972245235e-05, + "loss": 0.9023, + "step": 231250 + }, + { + "epoch": 1.4774542248572122, + "grad_norm": 0.9976933002471924, + "learning_rate": 1.5951728272006377e-05, + "loss": 0.7068, + "step": 231260 + }, + { + "epoch": 1.477518112006951, + "grad_norm": 1.0432369709014893, + "learning_rate": 1.594805391466305e-05, + "loss": 0.8532, + "step": 231270 + }, + { + "epoch": 
1.4775819991566896, + "grad_norm": 0.8269700407981873, + "learning_rate": 1.5944379900252287e-05, + "loss": 0.9943, + "step": 231280 + }, + { + "epoch": 1.4776458863064283, + "grad_norm": 1.000536322593689, + "learning_rate": 1.594070622881109e-05, + "loss": 0.7274, + "step": 231290 + }, + { + "epoch": 1.477709773456167, + "grad_norm": 0.7897246479988098, + "learning_rate": 1.5937032900376437e-05, + "loss": 0.7177, + "step": 231300 + }, + { + "epoch": 1.4777736606059058, + "grad_norm": 1.0336436033248901, + "learning_rate": 1.5933359914985346e-05, + "loss": 0.8322, + "step": 231310 + }, + { + "epoch": 1.4778375477556445, + "grad_norm": 0.9047965407371521, + "learning_rate": 1.5929687272674775e-05, + "loss": 1.0183, + "step": 231320 + }, + { + "epoch": 1.4779014349053832, + "grad_norm": 1.1535193920135498, + "learning_rate": 1.5926014973481747e-05, + "loss": 0.5858, + "step": 231330 + }, + { + "epoch": 1.4779653220551219, + "grad_norm": 0.9659336805343628, + "learning_rate": 1.5922343017443203e-05, + "loss": 0.9971, + "step": 231340 + }, + { + "epoch": 1.4780292092048606, + "grad_norm": 1.0284655094146729, + "learning_rate": 1.5918671404596168e-05, + "loss": 1.1042, + "step": 231350 + }, + { + "epoch": 1.4780930963545993, + "grad_norm": 0.987298309803009, + "learning_rate": 1.5915000134977583e-05, + "loss": 0.8724, + "step": 231360 + }, + { + "epoch": 1.478156983504338, + "grad_norm": 1.0232453346252441, + "learning_rate": 1.5911329208624443e-05, + "loss": 0.9563, + "step": 231370 + }, + { + "epoch": 1.4782208706540767, + "grad_norm": 1.4888365268707275, + "learning_rate": 1.5907658625573695e-05, + "loss": 1.0152, + "step": 231380 + }, + { + "epoch": 1.4782847578038154, + "grad_norm": 0.9709943532943726, + "learning_rate": 1.5903988385862338e-05, + "loss": 0.7399, + "step": 231390 + }, + { + "epoch": 1.4783486449535541, + "grad_norm": 0.8116827011108398, + "learning_rate": 1.59003184895273e-05, + "loss": 1.0779, + "step": 231400 + }, + { + "epoch": 
1.4784125321032928, + "grad_norm": 0.9209198951721191, + "learning_rate": 1.5896648936605568e-05, + "loss": 0.6134, + "step": 231410 + }, + { + "epoch": 1.4784764192530315, + "grad_norm": 1.203497290611267, + "learning_rate": 1.5892979727134066e-05, + "loss": 0.625, + "step": 231420 + }, + { + "epoch": 1.47854030640277, + "grad_norm": 1.94428551197052, + "learning_rate": 1.5889310861149786e-05, + "loss": 0.7796, + "step": 231430 + }, + { + "epoch": 1.478604193552509, + "grad_norm": 1.7343497276306152, + "learning_rate": 1.5885642338689638e-05, + "loss": 0.6879, + "step": 231440 + }, + { + "epoch": 1.4786680807022474, + "grad_norm": 0.8659371733665466, + "learning_rate": 1.58819741597906e-05, + "loss": 0.7654, + "step": 231450 + }, + { + "epoch": 1.4787319678519864, + "grad_norm": 1.5597984790802002, + "learning_rate": 1.5878306324489584e-05, + "loss": 1.0089, + "step": 231460 + }, + { + "epoch": 1.4787958550017248, + "grad_norm": 1.151652455329895, + "learning_rate": 1.587463883282356e-05, + "loss": 0.6786, + "step": 231470 + }, + { + "epoch": 1.4788597421514638, + "grad_norm": 1.3175287246704102, + "learning_rate": 1.5870971684829426e-05, + "loss": 0.7708, + "step": 231480 + }, + { + "epoch": 1.4789236293012022, + "grad_norm": 0.7284002304077148, + "learning_rate": 1.5867304880544133e-05, + "loss": 0.7838, + "step": 231490 + }, + { + "epoch": 1.4789875164509412, + "grad_norm": 1.0089588165283203, + "learning_rate": 1.586363842000463e-05, + "loss": 0.864, + "step": 231500 + }, + { + "epoch": 1.4790514036006797, + "grad_norm": 0.8042920827865601, + "learning_rate": 1.58599723032478e-05, + "loss": 1.109, + "step": 231510 + }, + { + "epoch": 1.4791152907504186, + "grad_norm": 1.1528286933898926, + "learning_rate": 1.58563065303106e-05, + "loss": 0.851, + "step": 231520 + }, + { + "epoch": 1.479179177900157, + "grad_norm": 1.242755651473999, + "learning_rate": 1.5852641101229914e-05, + "loss": 1.0138, + "step": 231530 + }, + { + "epoch": 1.479243065049896, + 
"grad_norm": 0.9317694306373596, + "learning_rate": 1.5848976016042693e-05, + "loss": 0.8296, + "step": 231540 + }, + { + "epoch": 1.4793069521996345, + "grad_norm": 0.633034348487854, + "learning_rate": 1.5845311274785812e-05, + "loss": 0.6141, + "step": 231550 + }, + { + "epoch": 1.4793708393493732, + "grad_norm": 0.689298152923584, + "learning_rate": 1.5841646877496213e-05, + "loss": 0.964, + "step": 231560 + }, + { + "epoch": 1.479434726499112, + "grad_norm": 0.7968207597732544, + "learning_rate": 1.5837982824210763e-05, + "loss": 0.7972, + "step": 231570 + }, + { + "epoch": 1.4794986136488506, + "grad_norm": 1.0918278694152832, + "learning_rate": 1.583431911496641e-05, + "loss": 0.9565, + "step": 231580 + }, + { + "epoch": 1.4795625007985893, + "grad_norm": 0.8419539928436279, + "learning_rate": 1.583065574979999e-05, + "loss": 0.9199, + "step": 231590 + }, + { + "epoch": 1.479626387948328, + "grad_norm": 2.540273666381836, + "learning_rate": 1.582699272874843e-05, + "loss": 1.1954, + "step": 231600 + }, + { + "epoch": 1.4796902750980667, + "grad_norm": 0.8567215204238892, + "learning_rate": 1.582333005184863e-05, + "loss": 1.0835, + "step": 231610 + }, + { + "epoch": 1.4797541622478054, + "grad_norm": 0.7937307357788086, + "learning_rate": 1.581966771913745e-05, + "loss": 1.0136, + "step": 231620 + }, + { + "epoch": 1.4798180493975441, + "grad_norm": 0.733269214630127, + "learning_rate": 1.5816005730651802e-05, + "loss": 0.7862, + "step": 231630 + }, + { + "epoch": 1.4798819365472828, + "grad_norm": 0.793347179889679, + "learning_rate": 1.5812344086428526e-05, + "loss": 0.8984, + "step": 231640 + }, + { + "epoch": 1.4799458236970215, + "grad_norm": 0.6982327103614807, + "learning_rate": 1.5808682786504546e-05, + "loss": 0.8062, + "step": 231650 + }, + { + "epoch": 1.4800097108467603, + "grad_norm": 0.6589148044586182, + "learning_rate": 1.5805021830916695e-05, + "loss": 0.874, + "step": 231660 + }, + { + "epoch": 1.480073597996499, + "grad_norm": 
0.7669758200645447, + "learning_rate": 1.5801361219701865e-05, + "loss": 0.8848, + "step": 231670 + }, + { + "epoch": 1.4801374851462377, + "grad_norm": 0.7774679660797119, + "learning_rate": 1.5797700952896903e-05, + "loss": 0.8524, + "step": 231680 + }, + { + "epoch": 1.4802013722959764, + "grad_norm": 0.9099416136741638, + "learning_rate": 1.57940410305387e-05, + "loss": 0.9443, + "step": 231690 + }, + { + "epoch": 1.480265259445715, + "grad_norm": 1.5125083923339844, + "learning_rate": 1.579038145266408e-05, + "loss": 0.9126, + "step": 231700 + }, + { + "epoch": 1.4803291465954538, + "grad_norm": 1.3232501745224, + "learning_rate": 1.5786722219309924e-05, + "loss": 0.8163, + "step": 231710 + }, + { + "epoch": 1.4803930337451925, + "grad_norm": 1.0830581188201904, + "learning_rate": 1.5783063330513066e-05, + "loss": 0.8484, + "step": 231720 + }, + { + "epoch": 1.4804569208949312, + "grad_norm": 0.8370949625968933, + "learning_rate": 1.577940478631037e-05, + "loss": 0.8852, + "step": 231730 + }, + { + "epoch": 1.48052080804467, + "grad_norm": 0.7332857251167297, + "learning_rate": 1.577574658673866e-05, + "loss": 0.9428, + "step": 231740 + }, + { + "epoch": 1.4805846951944086, + "grad_norm": 1.0560225248336792, + "learning_rate": 1.5772088731834804e-05, + "loss": 0.9721, + "step": 231750 + }, + { + "epoch": 1.4806485823441473, + "grad_norm": 0.9546782970428467, + "learning_rate": 1.5768431221635615e-05, + "loss": 0.9743, + "step": 231760 + }, + { + "epoch": 1.480712469493886, + "grad_norm": 1.2282410860061646, + "learning_rate": 1.5764774056177957e-05, + "loss": 0.7588, + "step": 231770 + }, + { + "epoch": 1.4807763566436247, + "grad_norm": 0.8199455738067627, + "learning_rate": 1.576111723549862e-05, + "loss": 0.8427, + "step": 231780 + }, + { + "epoch": 1.4808402437933634, + "grad_norm": 0.8073819279670715, + "learning_rate": 1.5757460759634468e-05, + "loss": 0.9772, + "step": 231790 + }, + { + "epoch": 1.4809041309431021, + "grad_norm": 1.3390814065933228, + 
"learning_rate": 1.5753804628622292e-05, + "loss": 0.9785, + "step": 231800 + }, + { + "epoch": 1.4809680180928408, + "grad_norm": 1.1008869409561157, + "learning_rate": 1.5750148842498934e-05, + "loss": 0.8565, + "step": 231810 + }, + { + "epoch": 1.4810319052425795, + "grad_norm": 1.158144474029541, + "learning_rate": 1.5746493401301225e-05, + "loss": 0.9149, + "step": 231820 + }, + { + "epoch": 1.4810957923923183, + "grad_norm": 0.8972355127334595, + "learning_rate": 1.5742838305065945e-05, + "loss": 1.0507, + "step": 231830 + }, + { + "epoch": 1.481159679542057, + "grad_norm": 1.1104916334152222, + "learning_rate": 1.5739183553829935e-05, + "loss": 0.9427, + "step": 231840 + }, + { + "epoch": 1.4812235666917957, + "grad_norm": 1.1483949422836304, + "learning_rate": 1.5735529147629967e-05, + "loss": 0.642, + "step": 231850 + }, + { + "epoch": 1.4812874538415344, + "grad_norm": 0.7706599831581116, + "learning_rate": 1.573187508650289e-05, + "loss": 0.7168, + "step": 231860 + }, + { + "epoch": 1.481351340991273, + "grad_norm": 0.9245922565460205, + "learning_rate": 1.5728221370485452e-05, + "loss": 0.8697, + "step": 231870 + }, + { + "epoch": 1.4814152281410118, + "grad_norm": 1.2615220546722412, + "learning_rate": 1.5724567999614493e-05, + "loss": 0.9605, + "step": 231880 + }, + { + "epoch": 1.4814791152907505, + "grad_norm": 1.3631330728530884, + "learning_rate": 1.5720914973926766e-05, + "loss": 0.6942, + "step": 231890 + }, + { + "epoch": 1.4815430024404892, + "grad_norm": 1.0511490106582642, + "learning_rate": 1.57172622934591e-05, + "loss": 0.916, + "step": 231900 + }, + { + "epoch": 1.481606889590228, + "grad_norm": 1.2899891138076782, + "learning_rate": 1.5713609958248247e-05, + "loss": 0.8863, + "step": 231910 + }, + { + "epoch": 1.4816707767399664, + "grad_norm": 0.9524277448654175, + "learning_rate": 1.570995796833102e-05, + "loss": 0.7068, + "step": 231920 + }, + { + "epoch": 1.4817346638897053, + "grad_norm": 0.857600212097168, + "learning_rate": 
1.5706306323744163e-05, + "loss": 0.9073, + "step": 231930 + }, + { + "epoch": 1.4817985510394438, + "grad_norm": 1.0148301124572754, + "learning_rate": 1.570265502452449e-05, + "loss": 0.8351, + "step": 231940 + }, + { + "epoch": 1.4818624381891827, + "grad_norm": 1.1091690063476562, + "learning_rate": 1.569900407070873e-05, + "loss": 0.7854, + "step": 231950 + }, + { + "epoch": 1.4819263253389212, + "grad_norm": 1.0870037078857422, + "learning_rate": 1.5695353462333697e-05, + "loss": 0.6657, + "step": 231960 + }, + { + "epoch": 1.4819902124886601, + "grad_norm": 1.797438383102417, + "learning_rate": 1.569170319943611e-05, + "loss": 0.7927, + "step": 231970 + }, + { + "epoch": 1.4820540996383986, + "grad_norm": 1.677264928817749, + "learning_rate": 1.5688053282052767e-05, + "loss": 0.8669, + "step": 231980 + }, + { + "epoch": 1.4821179867881376, + "grad_norm": 2.095792770385742, + "learning_rate": 1.5684403710220402e-05, + "loss": 0.7737, + "step": 231990 + }, + { + "epoch": 1.482181873937876, + "grad_norm": 1.3605451583862305, + "learning_rate": 1.568075448397579e-05, + "loss": 0.7819, + "step": 232000 + }, + { + "epoch": 1.482245761087615, + "grad_norm": 1.5517606735229492, + "learning_rate": 1.5677105603355656e-05, + "loss": 0.8138, + "step": 232010 + }, + { + "epoch": 1.4823096482373534, + "grad_norm": 0.6776690483093262, + "learning_rate": 1.5673457068396763e-05, + "loss": 1.1062, + "step": 232020 + }, + { + "epoch": 1.4823735353870924, + "grad_norm": 0.8474202156066895, + "learning_rate": 1.566980887913587e-05, + "loss": 0.9323, + "step": 232030 + }, + { + "epoch": 1.4824374225368309, + "grad_norm": 0.6449154019355774, + "learning_rate": 1.5666161035609684e-05, + "loss": 0.7905, + "step": 232040 + }, + { + "epoch": 1.4825013096865696, + "grad_norm": 1.335546612739563, + "learning_rate": 1.5662513537854978e-05, + "loss": 1.0527, + "step": 232050 + }, + { + "epoch": 1.4825651968363083, + "grad_norm": 0.9183993935585022, + "learning_rate": 
1.565886638590845e-05, + "loss": 0.9243, + "step": 232060 + }, + { + "epoch": 1.482629083986047, + "grad_norm": 1.0847032070159912, + "learning_rate": 1.565521957980688e-05, + "loss": 0.7541, + "step": 232070 + }, + { + "epoch": 1.4826929711357857, + "grad_norm": 0.827590823173523, + "learning_rate": 1.5651573119586928e-05, + "loss": 0.8019, + "step": 232080 + }, + { + "epoch": 1.4827568582855244, + "grad_norm": 0.6743472218513489, + "learning_rate": 1.564792700528536e-05, + "loss": 1.0403, + "step": 232090 + }, + { + "epoch": 1.482820745435263, + "grad_norm": 1.1407054662704468, + "learning_rate": 1.564428123693888e-05, + "loss": 0.8504, + "step": 232100 + }, + { + "epoch": 1.4828846325850018, + "grad_norm": 3.8805408477783203, + "learning_rate": 1.564063581458422e-05, + "loss": 0.7055, + "step": 232110 + }, + { + "epoch": 1.4829485197347405, + "grad_norm": 1.3978850841522217, + "learning_rate": 1.5636990738258066e-05, + "loss": 0.9231, + "step": 232120 + }, + { + "epoch": 1.4830124068844792, + "grad_norm": 0.8537045121192932, + "learning_rate": 1.5633346007997147e-05, + "loss": 0.7011, + "step": 232130 + }, + { + "epoch": 1.483076294034218, + "grad_norm": 1.7283480167388916, + "learning_rate": 1.5629701623838176e-05, + "loss": 1.0444, + "step": 232140 + }, + { + "epoch": 1.4831401811839566, + "grad_norm": 0.9529137015342712, + "learning_rate": 1.5626057585817837e-05, + "loss": 0.7992, + "step": 232150 + }, + { + "epoch": 1.4832040683336953, + "grad_norm": 1.050119161605835, + "learning_rate": 1.56227782475784e-05, + "loss": 0.9607, + "step": 232160 + }, + { + "epoch": 1.483267955483434, + "grad_norm": 2.544686794281006, + "learning_rate": 1.5619134867322572e-05, + "loss": 0.9267, + "step": 232170 + }, + { + "epoch": 1.4833318426331727, + "grad_norm": 0.8504775762557983, + "learning_rate": 1.561549183331182e-05, + "loss": 0.6988, + "step": 232180 + }, + { + "epoch": 1.4833957297829115, + "grad_norm": 0.766586422920227, + "learning_rate": 1.561184914558279e-05, + 
"loss": 0.6566, + "step": 232190 + }, + { + "epoch": 1.4834596169326502, + "grad_norm": 0.7473874092102051, + "learning_rate": 1.5608206804172203e-05, + "loss": 0.6506, + "step": 232200 + }, + { + "epoch": 1.4835235040823889, + "grad_norm": 0.6732820868492126, + "learning_rate": 1.5604564809116735e-05, + "loss": 0.836, + "step": 232210 + }, + { + "epoch": 1.4835873912321276, + "grad_norm": 1.0650951862335205, + "learning_rate": 1.5600923160453053e-05, + "loss": 0.7287, + "step": 232220 + }, + { + "epoch": 1.4836512783818663, + "grad_norm": 0.8236666917800903, + "learning_rate": 1.559728185821781e-05, + "loss": 0.8019, + "step": 232230 + }, + { + "epoch": 1.483715165531605, + "grad_norm": 0.8332436084747314, + "learning_rate": 1.5593640902447725e-05, + "loss": 0.8261, + "step": 232240 + }, + { + "epoch": 1.4837790526813437, + "grad_norm": 0.6860572695732117, + "learning_rate": 1.5590000293179423e-05, + "loss": 0.6497, + "step": 232250 + }, + { + "epoch": 1.4838429398310824, + "grad_norm": 2.124331474304199, + "learning_rate": 1.558636003044959e-05, + "loss": 0.7933, + "step": 232260 + }, + { + "epoch": 1.483906826980821, + "grad_norm": 1.946094274520874, + "learning_rate": 1.5582720114294896e-05, + "loss": 0.741, + "step": 232270 + }, + { + "epoch": 1.4839707141305598, + "grad_norm": 0.8453720808029175, + "learning_rate": 1.5579080544751974e-05, + "loss": 0.7517, + "step": 232280 + }, + { + "epoch": 1.4840346012802985, + "grad_norm": 0.9035594463348389, + "learning_rate": 1.5575441321857503e-05, + "loss": 0.7278, + "step": 232290 + }, + { + "epoch": 1.4840984884300372, + "grad_norm": 0.765318751335144, + "learning_rate": 1.5571802445648104e-05, + "loss": 0.638, + "step": 232300 + }, + { + "epoch": 1.484162375579776, + "grad_norm": 1.1105114221572876, + "learning_rate": 1.556816391616045e-05, + "loss": 0.7516, + "step": 232310 + }, + { + "epoch": 1.4842262627295146, + "grad_norm": 0.6948691010475159, + "learning_rate": 1.5564525733431163e-05, + "loss": 0.7438, + 
"step": 232320 + }, + { + "epoch": 1.4842901498792533, + "grad_norm": 1.0976312160491943, + "learning_rate": 1.5560887897496907e-05, + "loss": 0.5559, + "step": 232330 + }, + { + "epoch": 1.484354037028992, + "grad_norm": 1.7774726152420044, + "learning_rate": 1.5557250408394287e-05, + "loss": 0.8347, + "step": 232340 + }, + { + "epoch": 1.4844179241787308, + "grad_norm": 1.5234465599060059, + "learning_rate": 1.5553613266159973e-05, + "loss": 1.2875, + "step": 232350 + }, + { + "epoch": 1.4844818113284695, + "grad_norm": 2.0421388149261475, + "learning_rate": 1.554997647083055e-05, + "loss": 0.8815, + "step": 232360 + }, + { + "epoch": 1.4845456984782082, + "grad_norm": 0.7968341112136841, + "learning_rate": 1.554634002244269e-05, + "loss": 0.8076, + "step": 232370 + }, + { + "epoch": 1.4846095856279469, + "grad_norm": 0.6294237971305847, + "learning_rate": 1.5542703921032975e-05, + "loss": 0.6622, + "step": 232380 + }, + { + "epoch": 1.4846734727776856, + "grad_norm": 0.8361502885818481, + "learning_rate": 1.5539068166638053e-05, + "loss": 0.7356, + "step": 232390 + }, + { + "epoch": 1.4847373599274243, + "grad_norm": 0.931691586971283, + "learning_rate": 1.553543275929452e-05, + "loss": 0.8819, + "step": 232400 + }, + { + "epoch": 1.4848012470771628, + "grad_norm": 0.998232901096344, + "learning_rate": 1.553179769903901e-05, + "loss": 0.7952, + "step": 232410 + }, + { + "epoch": 1.4848651342269017, + "grad_norm": 0.9501769542694092, + "learning_rate": 1.552816298590809e-05, + "loss": 0.7659, + "step": 232420 + }, + { + "epoch": 1.4849290213766402, + "grad_norm": 0.7583515048027039, + "learning_rate": 1.5524528619938417e-05, + "loss": 0.8152, + "step": 232430 + }, + { + "epoch": 1.484992908526379, + "grad_norm": 0.9512587189674377, + "learning_rate": 1.5520894601166546e-05, + "loss": 0.8091, + "step": 232440 + }, + { + "epoch": 1.4850567956761176, + "grad_norm": 1.126081109046936, + "learning_rate": 1.5517260929629097e-05, + "loss": 1.0074, + "step": 232450 + }, 
+ { + "epoch": 1.4851206828258565, + "grad_norm": 1.1219438314437866, + "learning_rate": 1.551362760536268e-05, + "loss": 1.0365, + "step": 232460 + }, + { + "epoch": 1.485184569975595, + "grad_norm": 0.7321421504020691, + "learning_rate": 1.5509994628403846e-05, + "loss": 0.8553, + "step": 232470 + }, + { + "epoch": 1.485248457125334, + "grad_norm": 1.2466015815734863, + "learning_rate": 1.5506361998789225e-05, + "loss": 0.8566, + "step": 232480 + }, + { + "epoch": 1.4853123442750724, + "grad_norm": 0.8446059823036194, + "learning_rate": 1.5502729716555364e-05, + "loss": 0.603, + "step": 232490 + }, + { + "epoch": 1.4853762314248113, + "grad_norm": 0.5884280800819397, + "learning_rate": 1.549909778173887e-05, + "loss": 0.8151, + "step": 232500 + }, + { + "epoch": 1.4854401185745498, + "grad_norm": 3.8547708988189697, + "learning_rate": 1.5495466194376295e-05, + "loss": 0.7316, + "step": 232510 + }, + { + "epoch": 1.4855040057242885, + "grad_norm": 2.5176239013671875, + "learning_rate": 1.5491834954504246e-05, + "loss": 0.8722, + "step": 232520 + }, + { + "epoch": 1.4855678928740272, + "grad_norm": 0.652862012386322, + "learning_rate": 1.5488204062159255e-05, + "loss": 0.7634, + "step": 232530 + }, + { + "epoch": 1.485631780023766, + "grad_norm": 0.9530776143074036, + "learning_rate": 1.548457351737792e-05, + "loss": 0.8714, + "step": 232540 + }, + { + "epoch": 1.4856956671735047, + "grad_norm": 0.7320083379745483, + "learning_rate": 1.5480943320196778e-05, + "loss": 0.705, + "step": 232550 + }, + { + "epoch": 1.4857595543232434, + "grad_norm": 0.5726275444030762, + "learning_rate": 1.547731347065241e-05, + "loss": 0.9791, + "step": 232560 + }, + { + "epoch": 1.485823441472982, + "grad_norm": 1.0993123054504395, + "learning_rate": 1.547368396878135e-05, + "loss": 0.8455, + "step": 232570 + }, + { + "epoch": 1.4858873286227208, + "grad_norm": 1.0467476844787598, + "learning_rate": 1.5470054814620178e-05, + "loss": 0.9816, + "step": 232580 + }, + { + "epoch": 
1.4859512157724595, + "grad_norm": 1.7535980939865112, + "learning_rate": 1.546642600820541e-05, + "loss": 0.7644, + "step": 232590 + }, + { + "epoch": 1.4860151029221982, + "grad_norm": 0.85832279920578, + "learning_rate": 1.5462797549573627e-05, + "loss": 0.7564, + "step": 232600 + }, + { + "epoch": 1.486078990071937, + "grad_norm": 0.8821678161621094, + "learning_rate": 1.545916943876133e-05, + "loss": 0.7779, + "step": 232610 + }, + { + "epoch": 1.4861428772216756, + "grad_norm": 0.9968520998954773, + "learning_rate": 1.5455541675805095e-05, + "loss": 0.7113, + "step": 232620 + }, + { + "epoch": 1.4862067643714143, + "grad_norm": 0.8889404535293579, + "learning_rate": 1.545191426074143e-05, + "loss": 0.8483, + "step": 232630 + }, + { + "epoch": 1.486270651521153, + "grad_norm": 0.8896504044532776, + "learning_rate": 1.5448287193606887e-05, + "loss": 0.7159, + "step": 232640 + }, + { + "epoch": 1.4863345386708917, + "grad_norm": 0.8445311784744263, + "learning_rate": 1.5444660474437972e-05, + "loss": 0.9071, + "step": 232650 + }, + { + "epoch": 1.4863984258206304, + "grad_norm": 1.0877915620803833, + "learning_rate": 1.544103410327122e-05, + "loss": 0.9653, + "step": 232660 + }, + { + "epoch": 1.4864623129703691, + "grad_norm": 0.7088839411735535, + "learning_rate": 1.543740808014316e-05, + "loss": 0.6883, + "step": 232670 + }, + { + "epoch": 1.4865262001201078, + "grad_norm": 0.8006494641304016, + "learning_rate": 1.5433782405090297e-05, + "loss": 0.6404, + "step": 232680 + }, + { + "epoch": 1.4865900872698465, + "grad_norm": 0.6382250189781189, + "learning_rate": 1.543015707814916e-05, + "loss": 0.9332, + "step": 232690 + }, + { + "epoch": 1.4866539744195852, + "grad_norm": 5.511012554168701, + "learning_rate": 1.542653209935625e-05, + "loss": 0.9581, + "step": 232700 + }, + { + "epoch": 1.486717861569324, + "grad_norm": 1.100386381149292, + "learning_rate": 1.542290746874807e-05, + "loss": 0.9676, + "step": 232710 + }, + { + "epoch": 1.4867817487190627, + 
"grad_norm": 1.28281569480896, + "learning_rate": 1.541928318636111e-05, + "loss": 0.925, + "step": 232720 + }, + { + "epoch": 1.4868456358688014, + "grad_norm": 0.6798058152198792, + "learning_rate": 1.54156592522319e-05, + "loss": 0.7567, + "step": 232730 + }, + { + "epoch": 1.48690952301854, + "grad_norm": 0.7467005252838135, + "learning_rate": 1.5412035666396906e-05, + "loss": 0.7877, + "step": 232740 + }, + { + "epoch": 1.4869734101682788, + "grad_norm": 1.0129773616790771, + "learning_rate": 1.5408412428892655e-05, + "loss": 0.9662, + "step": 232750 + }, + { + "epoch": 1.4870372973180175, + "grad_norm": 0.9531970024108887, + "learning_rate": 1.5404789539755593e-05, + "loss": 1.0394, + "step": 232760 + }, + { + "epoch": 1.4871011844677562, + "grad_norm": 1.0765858888626099, + "learning_rate": 1.5401166999022254e-05, + "loss": 0.8266, + "step": 232770 + }, + { + "epoch": 1.487165071617495, + "grad_norm": 1.456829309463501, + "learning_rate": 1.5397544806729076e-05, + "loss": 0.9925, + "step": 232780 + }, + { + "epoch": 1.4872289587672336, + "grad_norm": 1.1520856618881226, + "learning_rate": 1.5393922962912555e-05, + "loss": 0.8577, + "step": 232790 + }, + { + "epoch": 1.4872928459169723, + "grad_norm": 0.9173868894577026, + "learning_rate": 1.5390301467609187e-05, + "loss": 0.7145, + "step": 232800 + }, + { + "epoch": 1.487356733066711, + "grad_norm": 1.2395943403244019, + "learning_rate": 1.5386680320855408e-05, + "loss": 0.7307, + "step": 232810 + }, + { + "epoch": 1.4874206202164497, + "grad_norm": 1.308870792388916, + "learning_rate": 1.538305952268772e-05, + "loss": 0.8319, + "step": 232820 + }, + { + "epoch": 1.4874845073661884, + "grad_norm": 0.7490695118904114, + "learning_rate": 1.5379439073142553e-05, + "loss": 0.8522, + "step": 232830 + }, + { + "epoch": 1.4875483945159271, + "grad_norm": 0.7452239990234375, + "learning_rate": 1.53758189722564e-05, + "loss": 0.8708, + "step": 232840 + }, + { + "epoch": 1.4876122816656658, + "grad_norm": 
0.8576825261116028, + "learning_rate": 1.537219922006569e-05, + "loss": 0.7481, + "step": 232850 + }, + { + "epoch": 1.4876761688154045, + "grad_norm": 1.7907367944717407, + "learning_rate": 1.536857981660691e-05, + "loss": 0.9311, + "step": 232860 + }, + { + "epoch": 1.4877400559651432, + "grad_norm": 0.8764051795005798, + "learning_rate": 1.536496076191647e-05, + "loss": 1.0075, + "step": 232870 + }, + { + "epoch": 1.487803943114882, + "grad_norm": 1.157345175743103, + "learning_rate": 1.536134205603086e-05, + "loss": 0.8867, + "step": 232880 + }, + { + "epoch": 1.4878678302646207, + "grad_norm": 0.9898677468299866, + "learning_rate": 1.5357723698986482e-05, + "loss": 1.0325, + "step": 232890 + }, + { + "epoch": 1.4879317174143591, + "grad_norm": 1.3587260246276855, + "learning_rate": 1.5354105690819814e-05, + "loss": 0.9747, + "step": 232900 + }, + { + "epoch": 1.487995604564098, + "grad_norm": 0.8809471726417542, + "learning_rate": 1.5350488031567263e-05, + "loss": 0.8676, + "step": 232910 + }, + { + "epoch": 1.4880594917138366, + "grad_norm": 1.4196127653121948, + "learning_rate": 1.5346870721265283e-05, + "loss": 0.9157, + "step": 232920 + }, + { + "epoch": 1.4881233788635755, + "grad_norm": 0.9943186640739441, + "learning_rate": 1.5343253759950284e-05, + "loss": 0.7639, + "step": 232930 + }, + { + "epoch": 1.488187266013314, + "grad_norm": 0.735995888710022, + "learning_rate": 1.533963714765871e-05, + "loss": 0.7108, + "step": 232940 + }, + { + "epoch": 1.488251153163053, + "grad_norm": 1.09149169921875, + "learning_rate": 1.5336020884426967e-05, + "loss": 1.0359, + "step": 232950 + }, + { + "epoch": 1.4883150403127914, + "grad_norm": 0.6204273700714111, + "learning_rate": 1.533240497029149e-05, + "loss": 0.8711, + "step": 232960 + }, + { + "epoch": 1.4883789274625303, + "grad_norm": 1.1760767698287964, + "learning_rate": 1.5328789405288678e-05, + "loss": 0.6275, + "step": 232970 + }, + { + "epoch": 1.4884428146122688, + "grad_norm": 1.1136373281478882, + 
"learning_rate": 1.532517418945495e-05, + "loss": 1.0665, + "step": 232980 + }, + { + "epoch": 1.4885067017620077, + "grad_norm": 0.851905882358551, + "learning_rate": 1.5321559322826733e-05, + "loss": 0.8263, + "step": 232990 + }, + { + "epoch": 1.4885705889117462, + "grad_norm": 0.5786584615707397, + "learning_rate": 1.53179448054404e-05, + "loss": 1.2527, + "step": 233000 + }, + { + "epoch": 1.488634476061485, + "grad_norm": 1.1659579277038574, + "learning_rate": 1.5314330637332376e-05, + "loss": 0.7822, + "step": 233010 + }, + { + "epoch": 1.4886983632112236, + "grad_norm": 1.3709019422531128, + "learning_rate": 1.531071681853904e-05, + "loss": 0.9937, + "step": 233020 + }, + { + "epoch": 1.4887622503609623, + "grad_norm": 1.0024594068527222, + "learning_rate": 1.530710334909681e-05, + "loss": 1.1988, + "step": 233030 + }, + { + "epoch": 1.488826137510701, + "grad_norm": 0.6908731460571289, + "learning_rate": 1.5303490229042045e-05, + "loss": 0.8364, + "step": 233040 + }, + { + "epoch": 1.4888900246604397, + "grad_norm": 0.9514659643173218, + "learning_rate": 1.5299877458411168e-05, + "loss": 0.7613, + "step": 233050 + }, + { + "epoch": 1.4889539118101784, + "grad_norm": 1.1845754384994507, + "learning_rate": 1.529626503724053e-05, + "loss": 0.7988, + "step": 233060 + }, + { + "epoch": 1.4890177989599171, + "grad_norm": 1.0755010843276978, + "learning_rate": 1.529265296556654e-05, + "loss": 0.9096, + "step": 233070 + }, + { + "epoch": 1.4890816861096559, + "grad_norm": 0.8984807729721069, + "learning_rate": 1.5289041243425544e-05, + "loss": 0.8871, + "step": 233080 + }, + { + "epoch": 1.4891455732593946, + "grad_norm": 0.7457914352416992, + "learning_rate": 1.528542987085395e-05, + "loss": 0.8331, + "step": 233090 + }, + { + "epoch": 1.4892094604091333, + "grad_norm": 1.1087687015533447, + "learning_rate": 1.5281818847888097e-05, + "loss": 1.04, + "step": 233100 + }, + { + "epoch": 1.489273347558872, + "grad_norm": 1.566397786140442, + "learning_rate": 
1.5278208174564374e-05, + "loss": 0.8974, + "step": 233110 + }, + { + "epoch": 1.4893372347086107, + "grad_norm": 1.3691843748092651, + "learning_rate": 1.527459785091912e-05, + "loss": 0.7804, + "step": 233120 + }, + { + "epoch": 1.4894011218583494, + "grad_norm": 0.7337422370910645, + "learning_rate": 1.5270987876988723e-05, + "loss": 0.9106, + "step": 233130 + }, + { + "epoch": 1.489465009008088, + "grad_norm": 0.871205747127533, + "learning_rate": 1.5267378252809504e-05, + "loss": 0.9425, + "step": 233140 + }, + { + "epoch": 1.4895288961578268, + "grad_norm": 1.1365987062454224, + "learning_rate": 1.5263768978417858e-05, + "loss": 0.8911, + "step": 233150 + }, + { + "epoch": 1.4895927833075655, + "grad_norm": 0.6923983097076416, + "learning_rate": 1.5260160053850086e-05, + "loss": 0.7918, + "step": 233160 + }, + { + "epoch": 1.4896566704573042, + "grad_norm": 0.7337884902954102, + "learning_rate": 1.5256551479142572e-05, + "loss": 1.2812, + "step": 233170 + }, + { + "epoch": 1.489720557607043, + "grad_norm": 1.0790367126464844, + "learning_rate": 1.5252943254331648e-05, + "loss": 0.9928, + "step": 233180 + }, + { + "epoch": 1.4897844447567816, + "grad_norm": 1.5988062620162964, + "learning_rate": 1.5249335379453627e-05, + "loss": 0.795, + "step": 233190 + }, + { + "epoch": 1.4898483319065203, + "grad_norm": 1.0071760416030884, + "learning_rate": 1.5245727854544879e-05, + "loss": 1.0201, + "step": 233200 + }, + { + "epoch": 1.489912219056259, + "grad_norm": 1.4104787111282349, + "learning_rate": 1.5242120679641697e-05, + "loss": 0.7409, + "step": 233210 + }, + { + "epoch": 1.4899761062059977, + "grad_norm": 0.8305241465568542, + "learning_rate": 1.5238513854780451e-05, + "loss": 1.3258, + "step": 233220 + }, + { + "epoch": 1.4900399933557364, + "grad_norm": 0.5620672106742859, + "learning_rate": 1.5234907379997426e-05, + "loss": 0.8582, + "step": 233230 + }, + { + "epoch": 1.4901038805054752, + "grad_norm": 0.7753502130508423, + "learning_rate": 
1.5231301255328978e-05, + "loss": 0.6606, + "step": 233240 + }, + { + "epoch": 1.4901677676552139, + "grad_norm": 0.7875840067863464, + "learning_rate": 1.5227695480811388e-05, + "loss": 0.7296, + "step": 233250 + }, + { + "epoch": 1.4902316548049526, + "grad_norm": 1.440796136856079, + "learning_rate": 1.5224090056481e-05, + "loss": 0.7251, + "step": 233260 + }, + { + "epoch": 1.4902955419546913, + "grad_norm": 0.8261032104492188, + "learning_rate": 1.52204849823741e-05, + "loss": 0.771, + "step": 233270 + }, + { + "epoch": 1.49035942910443, + "grad_norm": 0.8172363638877869, + "learning_rate": 1.5216880258527017e-05, + "loss": 0.9547, + "step": 233280 + }, + { + "epoch": 1.4904233162541687, + "grad_norm": 0.6734633445739746, + "learning_rate": 1.5213275884976024e-05, + "loss": 0.8702, + "step": 233290 + }, + { + "epoch": 1.4904872034039074, + "grad_norm": 1.0001932382583618, + "learning_rate": 1.5209671861757441e-05, + "loss": 0.815, + "step": 233300 + }, + { + "epoch": 1.490551090553646, + "grad_norm": 1.8237123489379883, + "learning_rate": 1.5206068188907574e-05, + "loss": 0.8879, + "step": 233310 + }, + { + "epoch": 1.4906149777033848, + "grad_norm": 0.9669250845909119, + "learning_rate": 1.5202464866462691e-05, + "loss": 0.8936, + "step": 233320 + }, + { + "epoch": 1.4906788648531235, + "grad_norm": 1.2896833419799805, + "learning_rate": 1.5198861894459099e-05, + "loss": 0.843, + "step": 233330 + }, + { + "epoch": 1.4907427520028622, + "grad_norm": 0.9315965175628662, + "learning_rate": 1.519525927293306e-05, + "loss": 0.8093, + "step": 233340 + }, + { + "epoch": 1.490806639152601, + "grad_norm": 2.279897928237915, + "learning_rate": 1.5191657001920889e-05, + "loss": 1.1767, + "step": 233350 + }, + { + "epoch": 1.4908705263023396, + "grad_norm": 1.2854963541030884, + "learning_rate": 1.518805508145883e-05, + "loss": 0.711, + "step": 233360 + }, + { + "epoch": 1.4909344134520781, + "grad_norm": 0.6539024710655212, + "learning_rate": 1.518445351158319e-05, + 
"loss": 0.7295, + "step": 233370 + }, + { + "epoch": 1.490998300601817, + "grad_norm": 2.2135396003723145, + "learning_rate": 1.5180852292330206e-05, + "loss": 0.9475, + "step": 233380 + }, + { + "epoch": 1.4910621877515555, + "grad_norm": 0.8237118721008301, + "learning_rate": 1.5177251423736178e-05, + "loss": 0.7877, + "step": 233390 + }, + { + "epoch": 1.4911260749012945, + "grad_norm": 1.0154304504394531, + "learning_rate": 1.5173650905837339e-05, + "loss": 0.8719, + "step": 233400 + }, + { + "epoch": 1.491189962051033, + "grad_norm": 1.093376874923706, + "learning_rate": 1.5170050738669978e-05, + "loss": 0.7655, + "step": 233410 + }, + { + "epoch": 1.4912538492007719, + "grad_norm": 1.0671299695968628, + "learning_rate": 1.516645092227032e-05, + "loss": 0.7285, + "step": 233420 + }, + { + "epoch": 1.4913177363505103, + "grad_norm": 1.1775280237197876, + "learning_rate": 1.5162851456674659e-05, + "loss": 0.9614, + "step": 233430 + }, + { + "epoch": 1.4913816235002493, + "grad_norm": 1.378699779510498, + "learning_rate": 1.5159252341919206e-05, + "loss": 0.9049, + "step": 233440 + }, + { + "epoch": 1.4914455106499878, + "grad_norm": 1.5878591537475586, + "learning_rate": 1.515565357804023e-05, + "loss": 0.7096, + "step": 233450 + }, + { + "epoch": 1.4915093977997267, + "grad_norm": 0.9215556383132935, + "learning_rate": 1.515241499057849e-05, + "loss": 1.1814, + "step": 233460 + }, + { + "epoch": 1.4915732849494652, + "grad_norm": 0.6913306713104248, + "learning_rate": 1.5148816893464646e-05, + "loss": 0.7497, + "step": 233470 + }, + { + "epoch": 1.491637172099204, + "grad_norm": 1.190250039100647, + "learning_rate": 1.514521914733238e-05, + "loss": 0.7633, + "step": 233480 + }, + { + "epoch": 1.4917010592489426, + "grad_norm": 0.6808974146842957, + "learning_rate": 1.5141621752217893e-05, + "loss": 0.8693, + "step": 233490 + }, + { + "epoch": 1.4917649463986813, + "grad_norm": 1.5304328203201294, + "learning_rate": 1.513802470815745e-05, + "loss": 0.7401, + 
"step": 233500 + }, + { + "epoch": 1.49182883354842, + "grad_norm": 1.0490107536315918, + "learning_rate": 1.5134428015187242e-05, + "loss": 0.9076, + "step": 233510 + }, + { + "epoch": 1.4918927206981587, + "grad_norm": 0.7744476199150085, + "learning_rate": 1.5130831673343526e-05, + "loss": 0.7956, + "step": 233520 + }, + { + "epoch": 1.4919566078478974, + "grad_norm": 1.0777429342269897, + "learning_rate": 1.5127235682662477e-05, + "loss": 0.757, + "step": 233530 + }, + { + "epoch": 1.4920204949976361, + "grad_norm": 0.9591853618621826, + "learning_rate": 1.5123640043180359e-05, + "loss": 0.8078, + "step": 233540 + }, + { + "epoch": 1.4920843821473748, + "grad_norm": 1.0310931205749512, + "learning_rate": 1.5120044754933338e-05, + "loss": 1.0422, + "step": 233550 + }, + { + "epoch": 1.4921482692971135, + "grad_norm": 1.1196818351745605, + "learning_rate": 1.5116449817957656e-05, + "loss": 0.7267, + "step": 233560 + }, + { + "epoch": 1.4922121564468522, + "grad_norm": 1.0456836223602295, + "learning_rate": 1.5112855232289491e-05, + "loss": 0.7895, + "step": 233570 + }, + { + "epoch": 1.492276043596591, + "grad_norm": 1.1032707691192627, + "learning_rate": 1.5109260997965069e-05, + "loss": 0.8093, + "step": 233580 + }, + { + "epoch": 1.4923399307463296, + "grad_norm": 1.1063132286071777, + "learning_rate": 1.510566711502056e-05, + "loss": 0.7527, + "step": 233590 + }, + { + "epoch": 1.4924038178960684, + "grad_norm": 1.1770296096801758, + "learning_rate": 1.5102073583492183e-05, + "loss": 0.7738, + "step": 233600 + }, + { + "epoch": 1.492467705045807, + "grad_norm": 0.682502031326294, + "learning_rate": 1.5098480403416104e-05, + "loss": 1.0723, + "step": 233610 + }, + { + "epoch": 1.4925315921955458, + "grad_norm": 1.0063148736953735, + "learning_rate": 1.5094887574828536e-05, + "loss": 0.8741, + "step": 233620 + }, + { + "epoch": 1.4925954793452845, + "grad_norm": 1.2028182744979858, + "learning_rate": 1.5091295097765629e-05, + "loss": 0.9136, + "step": 233630 + 
}, + { + "epoch": 1.4926593664950232, + "grad_norm": 0.7433778643608093, + "learning_rate": 1.5087702972263584e-05, + "loss": 1.0283, + "step": 233640 + }, + { + "epoch": 1.4927232536447619, + "grad_norm": 1.0931978225708008, + "learning_rate": 1.5084111198358586e-05, + "loss": 0.8848, + "step": 233650 + }, + { + "epoch": 1.4927871407945006, + "grad_norm": 0.8081746697425842, + "learning_rate": 1.5080519776086782e-05, + "loss": 0.888, + "step": 233660 + }, + { + "epoch": 1.4928510279442393, + "grad_norm": 0.9746555685997009, + "learning_rate": 1.5076928705484366e-05, + "loss": 0.9802, + "step": 233670 + }, + { + "epoch": 1.492914915093978, + "grad_norm": 0.5713024139404297, + "learning_rate": 1.5073337986587476e-05, + "loss": 0.831, + "step": 233680 + }, + { + "epoch": 1.4929788022437167, + "grad_norm": 1.1723284721374512, + "learning_rate": 1.5069747619432307e-05, + "loss": 0.8193, + "step": 233690 + }, + { + "epoch": 1.4930426893934554, + "grad_norm": 0.9624750018119812, + "learning_rate": 1.506615760405498e-05, + "loss": 0.9194, + "step": 233700 + }, + { + "epoch": 1.4931065765431941, + "grad_norm": 0.8232372403144836, + "learning_rate": 1.5062567940491685e-05, + "loss": 0.9009, + "step": 233710 + }, + { + "epoch": 1.4931704636929328, + "grad_norm": 1.230167269706726, + "learning_rate": 1.5058978628778541e-05, + "loss": 0.9485, + "step": 233720 + }, + { + "epoch": 1.4932343508426715, + "grad_norm": 1.1337039470672607, + "learning_rate": 1.5055389668951725e-05, + "loss": 0.7884, + "step": 233730 + }, + { + "epoch": 1.4932982379924102, + "grad_norm": 0.8492588996887207, + "learning_rate": 1.5051801061047355e-05, + "loss": 0.7516, + "step": 233740 + }, + { + "epoch": 1.493362125142149, + "grad_norm": 0.8926807641983032, + "learning_rate": 1.5048212805101591e-05, + "loss": 1.144, + "step": 233750 + }, + { + "epoch": 1.4934260122918877, + "grad_norm": 0.6456555724143982, + "learning_rate": 1.504462490115055e-05, + "loss": 0.8384, + "step": 233760 + }, + { + "epoch": 
1.4934898994416264, + "grad_norm": 1.1378792524337769, + "learning_rate": 1.5041037349230392e-05, + "loss": 0.7143, + "step": 233770 + }, + { + "epoch": 1.493553786591365, + "grad_norm": 0.6030178070068359, + "learning_rate": 1.5037450149377214e-05, + "loss": 0.7133, + "step": 233780 + }, + { + "epoch": 1.4936176737411038, + "grad_norm": 0.9475358128547668, + "learning_rate": 1.5033863301627183e-05, + "loss": 0.8603, + "step": 233790 + }, + { + "epoch": 1.4936815608908425, + "grad_norm": 0.6618078351020813, + "learning_rate": 1.5030276806016375e-05, + "loss": 0.8643, + "step": 233800 + }, + { + "epoch": 1.4937454480405812, + "grad_norm": 0.6175677180290222, + "learning_rate": 1.502669066258095e-05, + "loss": 0.7618, + "step": 233810 + }, + { + "epoch": 1.4938093351903199, + "grad_norm": 0.7343020439147949, + "learning_rate": 1.5023104871357007e-05, + "loss": 0.7322, + "step": 233820 + }, + { + "epoch": 1.4938732223400586, + "grad_norm": 1.1418992280960083, + "learning_rate": 1.501951943238064e-05, + "loss": 0.8652, + "step": 233830 + }, + { + "epoch": 1.4939371094897973, + "grad_norm": 0.8838281035423279, + "learning_rate": 1.5015934345687992e-05, + "loss": 0.9093, + "step": 233840 + }, + { + "epoch": 1.494000996639536, + "grad_norm": 1.2746473550796509, + "learning_rate": 1.5012349611315136e-05, + "loss": 0.7186, + "step": 233850 + }, + { + "epoch": 1.4940648837892745, + "grad_norm": 0.8988287448883057, + "learning_rate": 1.5008765229298206e-05, + "loss": 0.6573, + "step": 233860 + }, + { + "epoch": 1.4941287709390134, + "grad_norm": 0.8221251964569092, + "learning_rate": 1.5005181199673263e-05, + "loss": 1.0266, + "step": 233870 + }, + { + "epoch": 1.494192658088752, + "grad_norm": 0.8387921452522278, + "learning_rate": 1.5001597522476435e-05, + "loss": 0.8873, + "step": 233880 + }, + { + "epoch": 1.4942565452384908, + "grad_norm": 1.367504358291626, + "learning_rate": 1.4998014197743781e-05, + "loss": 0.9954, + "step": 233890 + }, + { + "epoch": 
1.4943204323882293, + "grad_norm": 1.0656965970993042, + "learning_rate": 1.4994431225511424e-05, + "loss": 1.0383, + "step": 233900 + }, + { + "epoch": 1.4943843195379682, + "grad_norm": 1.9154045581817627, + "learning_rate": 1.4990848605815411e-05, + "loss": 1.1103, + "step": 233910 + }, + { + "epoch": 1.4944482066877067, + "grad_norm": 2.2200756072998047, + "learning_rate": 1.4987266338691858e-05, + "loss": 0.6776, + "step": 233920 + }, + { + "epoch": 1.4945120938374457, + "grad_norm": 1.3810445070266724, + "learning_rate": 1.4983684424176803e-05, + "loss": 0.96, + "step": 233930 + }, + { + "epoch": 1.4945759809871841, + "grad_norm": 0.8132758736610413, + "learning_rate": 1.498010286230636e-05, + "loss": 0.7585, + "step": 233940 + }, + { + "epoch": 1.494639868136923, + "grad_norm": 0.9625273942947388, + "learning_rate": 1.497652165311656e-05, + "loss": 0.9365, + "step": 233950 + }, + { + "epoch": 1.4947037552866616, + "grad_norm": 1.0185012817382812, + "learning_rate": 1.4972940796643487e-05, + "loss": 0.7643, + "step": 233960 + }, + { + "epoch": 1.4947676424364005, + "grad_norm": 0.7479864954948425, + "learning_rate": 1.4969360292923217e-05, + "loss": 0.6614, + "step": 233970 + }, + { + "epoch": 1.494831529586139, + "grad_norm": 2.9963157176971436, + "learning_rate": 1.496578014199178e-05, + "loss": 0.8242, + "step": 233980 + }, + { + "epoch": 1.4948954167358777, + "grad_norm": 1.1559045314788818, + "learning_rate": 1.4962200343885264e-05, + "loss": 0.8894, + "step": 233990 + }, + { + "epoch": 1.4949593038856164, + "grad_norm": 1.065392255783081, + "learning_rate": 1.495862089863968e-05, + "loss": 0.9941, + "step": 234000 + }, + { + "epoch": 1.495023191035355, + "grad_norm": 0.8827859163284302, + "learning_rate": 1.495504180629112e-05, + "loss": 1.0541, + "step": 234010 + }, + { + "epoch": 1.4950870781850938, + "grad_norm": 0.9255498647689819, + "learning_rate": 1.4951463066875594e-05, + "loss": 0.8875, + "step": 234020 + }, + { + "epoch": 1.4951509653348325, + 
"grad_norm": 0.7709739804267883, + "learning_rate": 1.4947884680429164e-05, + "loss": 0.8926, + "step": 234030 + }, + { + "epoch": 1.4952148524845712, + "grad_norm": 0.9358537197113037, + "learning_rate": 1.494430664698785e-05, + "loss": 0.8256, + "step": 234040 + }, + { + "epoch": 1.49527873963431, + "grad_norm": 0.5185632109642029, + "learning_rate": 1.4940728966587708e-05, + "loss": 0.7478, + "step": 234050 + }, + { + "epoch": 1.4953426267840486, + "grad_norm": 0.9753923416137695, + "learning_rate": 1.493715163926474e-05, + "loss": 0.8866, + "step": 234060 + }, + { + "epoch": 1.4954065139337873, + "grad_norm": 1.034305453300476, + "learning_rate": 1.4933574665055006e-05, + "loss": 1.0687, + "step": 234070 + }, + { + "epoch": 1.495470401083526, + "grad_norm": 0.6250910758972168, + "learning_rate": 1.4929998043994497e-05, + "loss": 0.9843, + "step": 234080 + }, + { + "epoch": 1.4955342882332647, + "grad_norm": 0.7841780781745911, + "learning_rate": 1.4926421776119265e-05, + "loss": 0.9004, + "step": 234090 + }, + { + "epoch": 1.4955981753830034, + "grad_norm": 1.2987459897994995, + "learning_rate": 1.492284586146529e-05, + "loss": 0.9964, + "step": 234100 + }, + { + "epoch": 1.4956620625327421, + "grad_norm": 0.9914219379425049, + "learning_rate": 1.4919270300068615e-05, + "loss": 0.8802, + "step": 234110 + }, + { + "epoch": 1.4957259496824808, + "grad_norm": 0.6687591075897217, + "learning_rate": 1.4915695091965232e-05, + "loss": 0.9669, + "step": 234120 + }, + { + "epoch": 1.4957898368322196, + "grad_norm": 0.5962859392166138, + "learning_rate": 1.4912120237191157e-05, + "loss": 0.8703, + "step": 234130 + }, + { + "epoch": 1.4958537239819583, + "grad_norm": 1.0896154642105103, + "learning_rate": 1.4908545735782376e-05, + "loss": 1.0735, + "step": 234140 + }, + { + "epoch": 1.495917611131697, + "grad_norm": 0.8879744410514832, + "learning_rate": 1.4904971587774896e-05, + "loss": 1.0103, + "step": 234150 + }, + { + "epoch": 1.4959814982814357, + "grad_norm": 
0.7168521285057068, + "learning_rate": 1.490139779320473e-05, + "loss": 0.8318, + "step": 234160 + }, + { + "epoch": 1.4960453854311744, + "grad_norm": 0.949070930480957, + "learning_rate": 1.4897824352107836e-05, + "loss": 0.7894, + "step": 234170 + }, + { + "epoch": 1.496109272580913, + "grad_norm": 1.325273036956787, + "learning_rate": 1.4894251264520238e-05, + "loss": 0.8188, + "step": 234180 + }, + { + "epoch": 1.4961731597306518, + "grad_norm": 0.9096631407737732, + "learning_rate": 1.4890678530477881e-05, + "loss": 0.9897, + "step": 234190 + }, + { + "epoch": 1.4962370468803905, + "grad_norm": 0.8928118348121643, + "learning_rate": 1.4887106150016784e-05, + "loss": 0.9482, + "step": 234200 + }, + { + "epoch": 1.4963009340301292, + "grad_norm": 0.6268375515937805, + "learning_rate": 1.4883534123172887e-05, + "loss": 0.9183, + "step": 234210 + }, + { + "epoch": 1.496364821179868, + "grad_norm": 1.260741114616394, + "learning_rate": 1.4879962449982199e-05, + "loss": 0.995, + "step": 234220 + }, + { + "epoch": 1.4964287083296066, + "grad_norm": 2.3379745483398438, + "learning_rate": 1.4876391130480654e-05, + "loss": 0.8229, + "step": 234230 + }, + { + "epoch": 1.4964925954793453, + "grad_norm": 1.0594456195831299, + "learning_rate": 1.4872820164704255e-05, + "loss": 1.0574, + "step": 234240 + }, + { + "epoch": 1.496556482629084, + "grad_norm": 1.1462024450302124, + "learning_rate": 1.486924955268893e-05, + "loss": 0.6305, + "step": 234250 + }, + { + "epoch": 1.4966203697788227, + "grad_norm": 1.3557465076446533, + "learning_rate": 1.4865679294470669e-05, + "loss": 1.038, + "step": 234260 + }, + { + "epoch": 1.4966842569285614, + "grad_norm": 0.7691382765769958, + "learning_rate": 1.4862109390085399e-05, + "loss": 0.7011, + "step": 234270 + }, + { + "epoch": 1.4967481440783001, + "grad_norm": 0.6914074420928955, + "learning_rate": 1.4858539839569096e-05, + "loss": 0.9041, + "step": 234280 + }, + { + "epoch": 1.4968120312280389, + "grad_norm": 1.4908119440078735, 
+ "learning_rate": 1.4854970642957688e-05, + "loss": 0.6648, + "step": 234290 + }, + { + "epoch": 1.4968759183777776, + "grad_norm": 0.6637123227119446, + "learning_rate": 1.4851401800287146e-05, + "loss": 0.8933, + "step": 234300 + }, + { + "epoch": 1.4969398055275163, + "grad_norm": 1.5094033479690552, + "learning_rate": 1.4847833311593395e-05, + "loss": 0.738, + "step": 234310 + }, + { + "epoch": 1.497003692677255, + "grad_norm": 1.5913399457931519, + "learning_rate": 1.4844265176912359e-05, + "loss": 0.9591, + "step": 234320 + }, + { + "epoch": 1.4970675798269937, + "grad_norm": 0.9007583260536194, + "learning_rate": 1.484069739628e-05, + "loss": 0.802, + "step": 234330 + }, + { + "epoch": 1.4971314669767324, + "grad_norm": 1.1307207345962524, + "learning_rate": 1.4837129969732222e-05, + "loss": 0.8646, + "step": 234340 + }, + { + "epoch": 1.4971953541264709, + "grad_norm": 1.7793842554092407, + "learning_rate": 1.4833562897304975e-05, + "loss": 1.0184, + "step": 234350 + }, + { + "epoch": 1.4972592412762098, + "grad_norm": 0.8876829743385315, + "learning_rate": 1.4829996179034167e-05, + "loss": 0.812, + "step": 234360 + }, + { + "epoch": 1.4973231284259483, + "grad_norm": 0.8620350956916809, + "learning_rate": 1.4826429814955734e-05, + "loss": 0.8405, + "step": 234370 + }, + { + "epoch": 1.4973870155756872, + "grad_norm": 1.2996463775634766, + "learning_rate": 1.4822863805105569e-05, + "loss": 0.9884, + "step": 234380 + }, + { + "epoch": 1.4974509027254257, + "grad_norm": 0.757505476474762, + "learning_rate": 1.4819298149519611e-05, + "loss": 1.1306, + "step": 234390 + }, + { + "epoch": 1.4975147898751646, + "grad_norm": 0.8766262531280518, + "learning_rate": 1.4815732848233744e-05, + "loss": 1.0607, + "step": 234400 + }, + { + "epoch": 1.497578677024903, + "grad_norm": 1.9996721744537354, + "learning_rate": 1.4812167901283896e-05, + "loss": 0.7488, + "step": 234410 + }, + { + "epoch": 1.497642564174642, + "grad_norm": 1.6408275365829468, + "learning_rate": 
1.4808603308705949e-05, + "loss": 1.0059, + "step": 234420 + }, + { + "epoch": 1.4977064513243805, + "grad_norm": 1.429452896118164, + "learning_rate": 1.480503907053582e-05, + "loss": 0.9959, + "step": 234430 + }, + { + "epoch": 1.4977703384741194, + "grad_norm": 0.7910223603248596, + "learning_rate": 1.4801475186809388e-05, + "loss": 0.9692, + "step": 234440 + }, + { + "epoch": 1.497834225623858, + "grad_norm": 1.11632239818573, + "learning_rate": 1.4797911657562562e-05, + "loss": 1.0808, + "step": 234450 + }, + { + "epoch": 1.4978981127735969, + "grad_norm": 0.9424408674240112, + "learning_rate": 1.4794348482831206e-05, + "loss": 0.8386, + "step": 234460 + }, + { + "epoch": 1.4979619999233353, + "grad_norm": 0.742712676525116, + "learning_rate": 1.4790785662651235e-05, + "loss": 0.7571, + "step": 234470 + }, + { + "epoch": 1.498025887073074, + "grad_norm": 0.9132349491119385, + "learning_rate": 1.4787223197058498e-05, + "loss": 0.6435, + "step": 234480 + }, + { + "epoch": 1.4980897742228128, + "grad_norm": 0.6201077103614807, + "learning_rate": 1.4783661086088885e-05, + "loss": 0.9236, + "step": 234490 + }, + { + "epoch": 1.4981536613725515, + "grad_norm": 1.0537291765213013, + "learning_rate": 1.4780099329778285e-05, + "loss": 1.0377, + "step": 234500 + }, + { + "epoch": 1.4982175485222902, + "grad_norm": 1.24350106716156, + "learning_rate": 1.4776537928162537e-05, + "loss": 0.8583, + "step": 234510 + }, + { + "epoch": 1.4982814356720289, + "grad_norm": 0.6610399484634399, + "learning_rate": 1.4772976881277544e-05, + "loss": 0.9588, + "step": 234520 + }, + { + "epoch": 1.4983453228217676, + "grad_norm": 0.6641136407852173, + "learning_rate": 1.4769416189159135e-05, + "loss": 0.8946, + "step": 234530 + }, + { + "epoch": 1.4984092099715063, + "grad_norm": 1.025057315826416, + "learning_rate": 1.4765855851843202e-05, + "loss": 0.8817, + "step": 234540 + }, + { + "epoch": 1.498473097121245, + "grad_norm": 0.7867838144302368, + "learning_rate": 
1.4762295869365561e-05, + "loss": 0.6613, + "step": 234550 + }, + { + "epoch": 1.4985369842709837, + "grad_norm": 1.0991227626800537, + "learning_rate": 1.4758736241762106e-05, + "loss": 0.6598, + "step": 234560 + }, + { + "epoch": 1.4986008714207224, + "grad_norm": 0.8666796088218689, + "learning_rate": 1.475517696906864e-05, + "loss": 0.7919, + "step": 234570 + }, + { + "epoch": 1.498664758570461, + "grad_norm": 1.0286558866500854, + "learning_rate": 1.475161805132106e-05, + "loss": 0.6867, + "step": 234580 + }, + { + "epoch": 1.4987286457201998, + "grad_norm": 1.2615046501159668, + "learning_rate": 1.474805948855516e-05, + "loss": 0.7035, + "step": 234590 + }, + { + "epoch": 1.4987925328699385, + "grad_norm": 0.8936821222305298, + "learning_rate": 1.4744501280806811e-05, + "loss": 0.986, + "step": 234600 + }, + { + "epoch": 1.4988564200196772, + "grad_norm": 1.254328727722168, + "learning_rate": 1.4740943428111825e-05, + "loss": 0.7812, + "step": 234610 + }, + { + "epoch": 1.498920307169416, + "grad_norm": 0.8320007920265198, + "learning_rate": 1.4737385930506053e-05, + "loss": 0.9266, + "step": 234620 + }, + { + "epoch": 1.4989841943191546, + "grad_norm": 1.5553789138793945, + "learning_rate": 1.4733828788025294e-05, + "loss": 0.7904, + "step": 234630 + }, + { + "epoch": 1.4990480814688933, + "grad_norm": 1.5870753526687622, + "learning_rate": 1.4730272000705408e-05, + "loss": 0.7808, + "step": 234640 + }, + { + "epoch": 1.499111968618632, + "grad_norm": 0.8670278191566467, + "learning_rate": 1.4726715568582184e-05, + "loss": 0.8087, + "step": 234650 + }, + { + "epoch": 1.4991758557683708, + "grad_norm": 0.8606697916984558, + "learning_rate": 1.4723159491691458e-05, + "loss": 0.7755, + "step": 234660 + }, + { + "epoch": 1.4992397429181095, + "grad_norm": 0.7828750014305115, + "learning_rate": 1.4719603770069023e-05, + "loss": 1.1492, + "step": 234670 + }, + { + "epoch": 1.4993036300678482, + "grad_norm": 0.6629054546356201, + "learning_rate": 
1.4716048403750698e-05, + "loss": 0.9579, + "step": 234680 + }, + { + "epoch": 1.4993675172175869, + "grad_norm": 1.172922968864441, + "learning_rate": 1.471249339277231e-05, + "loss": 0.9958, + "step": 234690 + }, + { + "epoch": 1.4994314043673256, + "grad_norm": 0.7931080460548401, + "learning_rate": 1.470893873716962e-05, + "loss": 0.6613, + "step": 234700 + }, + { + "epoch": 1.4994952915170643, + "grad_norm": 0.687065064907074, + "learning_rate": 1.4705384436978464e-05, + "loss": 0.6888, + "step": 234710 + }, + { + "epoch": 1.499559178666803, + "grad_norm": 0.707865297794342, + "learning_rate": 1.4701830492234609e-05, + "loss": 1.2256, + "step": 234720 + }, + { + "epoch": 1.4996230658165417, + "grad_norm": 0.6391942501068115, + "learning_rate": 1.4698276902973873e-05, + "loss": 0.6766, + "step": 234730 + }, + { + "epoch": 1.4996869529662804, + "grad_norm": 0.8546810746192932, + "learning_rate": 1.4694723669232014e-05, + "loss": 1.0149, + "step": 234740 + }, + { + "epoch": 1.4997508401160191, + "grad_norm": 0.8669940233230591, + "learning_rate": 1.4691170791044844e-05, + "loss": 0.7914, + "step": 234750 + }, + { + "epoch": 1.4998147272657578, + "grad_norm": 0.6280136108398438, + "learning_rate": 1.4687618268448116e-05, + "loss": 0.9951, + "step": 234760 + }, + { + "epoch": 1.4998786144154965, + "grad_norm": 0.7606225609779358, + "learning_rate": 1.468406610147764e-05, + "loss": 1.1205, + "step": 234770 + }, + { + "epoch": 1.4999425015652352, + "grad_norm": 0.9498753547668457, + "learning_rate": 1.4680514290169157e-05, + "loss": 1.101, + "step": 234780 + }, + { + "epoch": 1.500006388714974, + "grad_norm": 0.9360266923904419, + "learning_rate": 1.4676962834558472e-05, + "loss": 0.7562, + "step": 234790 + }, + { + "epoch": 1.5000702758647124, + "grad_norm": 1.285915493965149, + "learning_rate": 1.4673411734681309e-05, + "loss": 0.8137, + "step": 234800 + }, + { + "epoch": 1.5001341630144513, + "grad_norm": 1.4864213466644287, + "learning_rate": 
1.4669860990573448e-05, + "loss": 0.9683, + "step": 234810 + }, + { + "epoch": 1.5001980501641898, + "grad_norm": 0.9704619646072388, + "learning_rate": 1.4666310602270666e-05, + "loss": 0.7457, + "step": 234820 + }, + { + "epoch": 1.5002619373139288, + "grad_norm": 1.6511520147323608, + "learning_rate": 1.4662760569808686e-05, + "loss": 0.855, + "step": 234830 + }, + { + "epoch": 1.5003258244636672, + "grad_norm": 1.3069212436676025, + "learning_rate": 1.4659210893223301e-05, + "loss": 0.7079, + "step": 234840 + }, + { + "epoch": 1.5003897116134062, + "grad_norm": 2.1083788871765137, + "learning_rate": 1.4655661572550217e-05, + "loss": 0.933, + "step": 234850 + }, + { + "epoch": 1.5004535987631447, + "grad_norm": 0.7109601497650146, + "learning_rate": 1.4652112607825213e-05, + "loss": 0.923, + "step": 234860 + }, + { + "epoch": 1.5005174859128836, + "grad_norm": 0.9529529809951782, + "learning_rate": 1.4648563999084002e-05, + "loss": 0.6489, + "step": 234870 + }, + { + "epoch": 1.500581373062622, + "grad_norm": 0.627356767654419, + "learning_rate": 1.464501574636235e-05, + "loss": 0.8803, + "step": 234880 + }, + { + "epoch": 1.500645260212361, + "grad_norm": 1.132530927658081, + "learning_rate": 1.4641467849695961e-05, + "loss": 1.0272, + "step": 234890 + }, + { + "epoch": 1.5007091473620995, + "grad_norm": 1.142256259918213, + "learning_rate": 1.463792030912059e-05, + "loss": 0.8087, + "step": 234900 + }, + { + "epoch": 1.5007730345118384, + "grad_norm": 1.1030948162078857, + "learning_rate": 1.4634373124671947e-05, + "loss": 1.4046, + "step": 234910 + }, + { + "epoch": 1.500836921661577, + "grad_norm": 0.9625082612037659, + "learning_rate": 1.4630826296385775e-05, + "loss": 0.9084, + "step": 234920 + }, + { + "epoch": 1.5009008088113158, + "grad_norm": 0.7878156304359436, + "learning_rate": 1.4627279824297762e-05, + "loss": 0.7928, + "step": 234930 + }, + { + "epoch": 1.5009646959610543, + "grad_norm": 5.58283805847168, + "learning_rate": 1.462373370844366e-05, 
+ "loss": 1.0296, + "step": 234940 + }, + { + "epoch": 1.5010285831107932, + "grad_norm": 0.9159521460533142, + "learning_rate": 1.4620187948859149e-05, + "loss": 1.1049, + "step": 234950 + }, + { + "epoch": 1.5010924702605317, + "grad_norm": 0.9695411324501038, + "learning_rate": 1.4616642545579967e-05, + "loss": 0.7801, + "step": 234960 + }, + { + "epoch": 1.5011563574102706, + "grad_norm": 1.1427416801452637, + "learning_rate": 1.4613097498641792e-05, + "loss": 0.9557, + "step": 234970 + }, + { + "epoch": 1.5012202445600091, + "grad_norm": 0.6965705752372742, + "learning_rate": 1.4609552808080357e-05, + "loss": 0.6617, + "step": 234980 + }, + { + "epoch": 1.501284131709748, + "grad_norm": 2.3865509033203125, + "learning_rate": 1.4606008473931326e-05, + "loss": 1.207, + "step": 234990 + }, + { + "epoch": 1.5013480188594865, + "grad_norm": 0.8042628169059753, + "learning_rate": 1.4602464496230406e-05, + "loss": 1.0717, + "step": 235000 + }, + { + "epoch": 1.5014119060092253, + "grad_norm": 0.7267552018165588, + "learning_rate": 1.4598920875013312e-05, + "loss": 0.9661, + "step": 235010 + }, + { + "epoch": 1.501475793158964, + "grad_norm": 0.8529950380325317, + "learning_rate": 1.4595377610315692e-05, + "loss": 0.8902, + "step": 235020 + }, + { + "epoch": 1.5015396803087027, + "grad_norm": 0.893867552280426, + "learning_rate": 1.4591834702173262e-05, + "loss": 0.7361, + "step": 235030 + }, + { + "epoch": 1.5016035674584414, + "grad_norm": 1.094534158706665, + "learning_rate": 1.458829215062168e-05, + "loss": 1.2228, + "step": 235040 + }, + { + "epoch": 1.50166745460818, + "grad_norm": 0.9267030954360962, + "learning_rate": 1.4584749955696648e-05, + "loss": 0.9459, + "step": 235050 + }, + { + "epoch": 1.5017313417579188, + "grad_norm": 0.6565288305282593, + "learning_rate": 1.4581208117433804e-05, + "loss": 1.0574, + "step": 235060 + }, + { + "epoch": 1.5017952289076575, + "grad_norm": 1.656847357749939, + "learning_rate": 1.4577666635868848e-05, + "loss": 1.1281, + 
"step": 235070 + }, + { + "epoch": 1.5018591160573962, + "grad_norm": 1.3042356967926025, + "learning_rate": 1.4574125511037423e-05, + "loss": 0.8161, + "step": 235080 + }, + { + "epoch": 1.501923003207135, + "grad_norm": 1.1575617790222168, + "learning_rate": 1.4570584742975213e-05, + "loss": 0.7767, + "step": 235090 + }, + { + "epoch": 1.5019868903568736, + "grad_norm": 1.0763665437698364, + "learning_rate": 1.456704433171785e-05, + "loss": 1.3651, + "step": 235100 + }, + { + "epoch": 1.5020507775066123, + "grad_norm": 2.460211992263794, + "learning_rate": 1.456350427730102e-05, + "loss": 0.7748, + "step": 235110 + }, + { + "epoch": 1.502114664656351, + "grad_norm": 1.2127439975738525, + "learning_rate": 1.4559964579760348e-05, + "loss": 0.9433, + "step": 235120 + }, + { + "epoch": 1.5021785518060897, + "grad_norm": 1.1584923267364502, + "learning_rate": 1.4556425239131504e-05, + "loss": 1.0207, + "step": 235130 + }, + { + "epoch": 1.5022424389558284, + "grad_norm": 0.915944516658783, + "learning_rate": 1.455288625545011e-05, + "loss": 0.7177, + "step": 235140 + }, + { + "epoch": 1.5023063261055671, + "grad_norm": 0.9569301009178162, + "learning_rate": 1.454934762875183e-05, + "loss": 0.699, + "step": 235150 + }, + { + "epoch": 1.5023702132553058, + "grad_norm": 0.8162155151367188, + "learning_rate": 1.4545809359072271e-05, + "loss": 1.0925, + "step": 235160 + }, + { + "epoch": 1.5024341004050445, + "grad_norm": 0.8980828523635864, + "learning_rate": 1.4542271446447103e-05, + "loss": 0.7741, + "step": 235170 + }, + { + "epoch": 1.5024979875547833, + "grad_norm": 0.6045962572097778, + "learning_rate": 1.4538733890911916e-05, + "loss": 0.8077, + "step": 235180 + }, + { + "epoch": 1.502561874704522, + "grad_norm": 0.6471248865127563, + "learning_rate": 1.4535196692502379e-05, + "loss": 0.8041, + "step": 235190 + }, + { + "epoch": 1.5026257618542607, + "grad_norm": 0.9747743606567383, + "learning_rate": 1.4531659851254076e-05, + "loss": 0.8377, + "step": 235200 + }, 
+ { + "epoch": 1.5026896490039994, + "grad_norm": 0.8339213132858276, + "learning_rate": 1.452812336720264e-05, + "loss": 0.9018, + "step": 235210 + }, + { + "epoch": 1.502753536153738, + "grad_norm": 1.8665075302124023, + "learning_rate": 1.4524587240383703e-05, + "loss": 0.9894, + "step": 235220 + }, + { + "epoch": 1.5028174233034768, + "grad_norm": 1.8768938779830933, + "learning_rate": 1.4521051470832852e-05, + "loss": 0.9992, + "step": 235230 + }, + { + "epoch": 1.5028813104532155, + "grad_norm": 2.487178087234497, + "learning_rate": 1.4517516058585723e-05, + "loss": 0.8355, + "step": 235240 + }, + { + "epoch": 1.5029451976029542, + "grad_norm": 1.0076284408569336, + "learning_rate": 1.4513981003677885e-05, + "loss": 0.9543, + "step": 235250 + }, + { + "epoch": 1.503009084752693, + "grad_norm": 0.7256926894187927, + "learning_rate": 1.4510446306144977e-05, + "loss": 0.8898, + "step": 235260 + }, + { + "epoch": 1.5030729719024314, + "grad_norm": 1.6876107454299927, + "learning_rate": 1.4506911966022574e-05, + "loss": 0.8502, + "step": 235270 + }, + { + "epoch": 1.5031368590521703, + "grad_norm": 0.8799774646759033, + "learning_rate": 1.4503377983346272e-05, + "loss": 0.7909, + "step": 235280 + }, + { + "epoch": 1.5032007462019088, + "grad_norm": 0.8718019127845764, + "learning_rate": 1.449984435815165e-05, + "loss": 0.957, + "step": 235290 + }, + { + "epoch": 1.5032646333516477, + "grad_norm": 0.7004748582839966, + "learning_rate": 1.4496311090474324e-05, + "loss": 0.8612, + "step": 235300 + }, + { + "epoch": 1.5033285205013862, + "grad_norm": 1.0339982509613037, + "learning_rate": 1.4492778180349841e-05, + "loss": 0.7531, + "step": 235310 + }, + { + "epoch": 1.5033924076511251, + "grad_norm": 1.2925528287887573, + "learning_rate": 1.4489245627813819e-05, + "loss": 0.7904, + "step": 235320 + }, + { + "epoch": 1.5034562948008636, + "grad_norm": 1.0401469469070435, + "learning_rate": 1.4485713432901798e-05, + "loss": 0.903, + "step": 235330 + }, + { + "epoch": 
1.5035201819506026, + "grad_norm": 0.8769915103912354, + "learning_rate": 1.4482181595649369e-05, + "loss": 1.0074, + "step": 235340 + }, + { + "epoch": 1.503584069100341, + "grad_norm": 1.0945196151733398, + "learning_rate": 1.4478650116092107e-05, + "loss": 0.7037, + "step": 235350 + }, + { + "epoch": 1.50364795625008, + "grad_norm": 1.3890713453292847, + "learning_rate": 1.4475118994265563e-05, + "loss": 0.7796, + "step": 235360 + }, + { + "epoch": 1.5037118433998184, + "grad_norm": 1.254223346710205, + "learning_rate": 1.4471588230205313e-05, + "loss": 0.9753, + "step": 235370 + }, + { + "epoch": 1.5037757305495574, + "grad_norm": 2.11234974861145, + "learning_rate": 1.446805782394689e-05, + "loss": 0.7127, + "step": 235380 + }, + { + "epoch": 1.5038396176992959, + "grad_norm": 0.6228627562522888, + "learning_rate": 1.4464527775525883e-05, + "loss": 1.0625, + "step": 235390 + }, + { + "epoch": 1.5039035048490348, + "grad_norm": 0.8461014032363892, + "learning_rate": 1.4460998084977812e-05, + "loss": 1.0634, + "step": 235400 + }, + { + "epoch": 1.5039673919987733, + "grad_norm": 0.9258694648742676, + "learning_rate": 1.4457468752338244e-05, + "loss": 0.8858, + "step": 235410 + }, + { + "epoch": 1.5040312791485122, + "grad_norm": 1.5003474950790405, + "learning_rate": 1.44539397776427e-05, + "loss": 0.9238, + "step": 235420 + }, + { + "epoch": 1.5040951662982507, + "grad_norm": 0.9997551441192627, + "learning_rate": 1.4450411160926753e-05, + "loss": 0.9678, + "step": 235430 + }, + { + "epoch": 1.5041590534479896, + "grad_norm": 1.9879392385482788, + "learning_rate": 1.44468829022259e-05, + "loss": 1.0032, + "step": 235440 + }, + { + "epoch": 1.504222940597728, + "grad_norm": 1.0907074213027954, + "learning_rate": 1.4443355001575715e-05, + "loss": 0.8554, + "step": 235450 + }, + { + "epoch": 1.504286827747467, + "grad_norm": 1.0263760089874268, + "learning_rate": 1.4439827459011685e-05, + "loss": 0.7579, + "step": 235460 + }, + { + "epoch": 1.5043507148972055, + 
"grad_norm": 1.8271600008010864, + "learning_rate": 1.4436300274569375e-05, + "loss": 1.0961, + "step": 235470 + }, + { + "epoch": 1.5044146020469444, + "grad_norm": 0.9254994988441467, + "learning_rate": 1.4432773448284276e-05, + "loss": 0.6641, + "step": 235480 + }, + { + "epoch": 1.504478489196683, + "grad_norm": 1.155799388885498, + "learning_rate": 1.4429246980191929e-05, + "loss": 0.9395, + "step": 235490 + }, + { + "epoch": 1.5045423763464216, + "grad_norm": 1.6232599020004272, + "learning_rate": 1.4425720870327825e-05, + "loss": 0.7697, + "step": 235500 + }, + { + "epoch": 1.5046062634961603, + "grad_norm": 1.015397548675537, + "learning_rate": 1.4422195118727506e-05, + "loss": 0.7536, + "step": 235510 + }, + { + "epoch": 1.504670150645899, + "grad_norm": 0.7980430722236633, + "learning_rate": 1.4418669725426436e-05, + "loss": 1.2109, + "step": 235520 + }, + { + "epoch": 1.5047340377956377, + "grad_norm": 0.7753633856773376, + "learning_rate": 1.4415144690460153e-05, + "loss": 0.8404, + "step": 235530 + }, + { + "epoch": 1.5047979249453765, + "grad_norm": 0.9534075856208801, + "learning_rate": 1.4411620013864163e-05, + "loss": 0.6998, + "step": 235540 + }, + { + "epoch": 1.5048618120951152, + "grad_norm": 0.9780369400978088, + "learning_rate": 1.440809569567393e-05, + "loss": 0.6091, + "step": 235550 + }, + { + "epoch": 1.5049256992448539, + "grad_norm": 0.80521559715271, + "learning_rate": 1.4404571735924983e-05, + "loss": 0.87, + "step": 235560 + }, + { + "epoch": 1.5049895863945926, + "grad_norm": 1.0219656229019165, + "learning_rate": 1.4401048134652773e-05, + "loss": 1.181, + "step": 235570 + }, + { + "epoch": 1.5050534735443313, + "grad_norm": 1.8196736574172974, + "learning_rate": 1.4397524891892821e-05, + "loss": 0.9589, + "step": 235580 + }, + { + "epoch": 1.50511736069407, + "grad_norm": 0.778160810470581, + "learning_rate": 1.4394002007680585e-05, + "loss": 0.6646, + "step": 235590 + }, + { + "epoch": 1.5051812478438087, + "grad_norm": 
0.9555881023406982, + "learning_rate": 1.4390479482051561e-05, + "loss": 0.8511, + "step": 235600 + }, + { + "epoch": 1.5052451349935474, + "grad_norm": 1.0950413942337036, + "learning_rate": 1.4386957315041205e-05, + "loss": 1.0269, + "step": 235610 + }, + { + "epoch": 1.505309022143286, + "grad_norm": 1.0835603475570679, + "learning_rate": 1.4383435506685012e-05, + "loss": 0.7831, + "step": 235620 + }, + { + "epoch": 1.5053729092930248, + "grad_norm": 0.7866032123565674, + "learning_rate": 1.4379914057018417e-05, + "loss": 0.8731, + "step": 235630 + }, + { + "epoch": 1.5054367964427635, + "grad_norm": 1.1362463235855103, + "learning_rate": 1.4376392966076924e-05, + "loss": 0.8806, + "step": 235640 + }, + { + "epoch": 1.5055006835925022, + "grad_norm": 1.0019675493240356, + "learning_rate": 1.4372872233895957e-05, + "loss": 0.8582, + "step": 235650 + }, + { + "epoch": 1.505564570742241, + "grad_norm": 1.379349946975708, + "learning_rate": 1.4369351860511e-05, + "loss": 0.6788, + "step": 235660 + }, + { + "epoch": 1.5056284578919796, + "grad_norm": 0.7315894365310669, + "learning_rate": 1.4365831845957483e-05, + "loss": 0.7359, + "step": 235670 + }, + { + "epoch": 1.5056923450417183, + "grad_norm": 1.0244824886322021, + "learning_rate": 1.4362312190270877e-05, + "loss": 0.9086, + "step": 235680 + }, + { + "epoch": 1.505756232191457, + "grad_norm": 1.2244248390197754, + "learning_rate": 1.4358792893486611e-05, + "loss": 0.7557, + "step": 235690 + }, + { + "epoch": 1.5058201193411958, + "grad_norm": 1.191369652748108, + "learning_rate": 1.4355273955640141e-05, + "loss": 0.8936, + "step": 235700 + }, + { + "epoch": 1.5058840064909345, + "grad_norm": 1.3286489248275757, + "learning_rate": 1.435175537676689e-05, + "loss": 0.7653, + "step": 235710 + }, + { + "epoch": 1.5059478936406732, + "grad_norm": 0.8277062773704529, + "learning_rate": 1.4348237156902317e-05, + "loss": 0.9562, + "step": 235720 + }, + { + "epoch": 1.5060117807904119, + "grad_norm": 0.920819103717804, 
+ "learning_rate": 1.434471929608182e-05, + "loss": 0.8078, + "step": 235730 + }, + { + "epoch": 1.5060756679401504, + "grad_norm": 1.1261929273605347, + "learning_rate": 1.4341201794340852e-05, + "loss": 1.0394, + "step": 235740 + }, + { + "epoch": 1.5061395550898893, + "grad_norm": 0.8690299987792969, + "learning_rate": 1.4337684651714844e-05, + "loss": 1.2097, + "step": 235750 + }, + { + "epoch": 1.5062034422396278, + "grad_norm": 1.0420963764190674, + "learning_rate": 1.4334167868239202e-05, + "loss": 1.0992, + "step": 235760 + }, + { + "epoch": 1.5062673293893667, + "grad_norm": 0.7404338121414185, + "learning_rate": 1.433065144394935e-05, + "loss": 0.7894, + "step": 235770 + }, + { + "epoch": 1.5063312165391052, + "grad_norm": 0.958733320236206, + "learning_rate": 1.4327135378880674e-05, + "loss": 0.8995, + "step": 235780 + }, + { + "epoch": 1.506395103688844, + "grad_norm": 0.6937169432640076, + "learning_rate": 1.4323619673068628e-05, + "loss": 1.1497, + "step": 235790 + }, + { + "epoch": 1.5064589908385826, + "grad_norm": 1.0881526470184326, + "learning_rate": 1.4320104326548578e-05, + "loss": 0.7022, + "step": 235800 + }, + { + "epoch": 1.5065228779883215, + "grad_norm": 1.025079369544983, + "learning_rate": 1.4316589339355957e-05, + "loss": 0.9019, + "step": 235810 + }, + { + "epoch": 1.50658676513806, + "grad_norm": 0.9831896424293518, + "learning_rate": 1.4313074711526142e-05, + "loss": 0.9154, + "step": 235820 + }, + { + "epoch": 1.506650652287799, + "grad_norm": 0.7834585905075073, + "learning_rate": 1.4309560443094549e-05, + "loss": 0.8514, + "step": 235830 + }, + { + "epoch": 1.5067145394375374, + "grad_norm": 0.7178348302841187, + "learning_rate": 1.4306046534096546e-05, + "loss": 0.803, + "step": 235840 + }, + { + "epoch": 1.5067784265872763, + "grad_norm": 1.5619884729385376, + "learning_rate": 1.4302532984567535e-05, + "loss": 1.0766, + "step": 235850 + }, + { + "epoch": 1.5068423137370148, + "grad_norm": 0.7582671642303467, + "learning_rate": 
1.4299019794542912e-05, + "loss": 0.8792, + "step": 235860 + }, + { + "epoch": 1.5069062008867538, + "grad_norm": 0.9229720234870911, + "learning_rate": 1.4295506964058037e-05, + "loss": 0.8011, + "step": 235870 + }, + { + "epoch": 1.5069700880364922, + "grad_norm": 1.0600378513336182, + "learning_rate": 1.4291994493148303e-05, + "loss": 0.7071, + "step": 235880 + }, + { + "epoch": 1.5070339751862312, + "grad_norm": 1.0364567041397095, + "learning_rate": 1.428848238184906e-05, + "loss": 0.7985, + "step": 235890 + }, + { + "epoch": 1.5070978623359697, + "grad_norm": 0.8426311016082764, + "learning_rate": 1.4284970630195715e-05, + "loss": 0.9932, + "step": 235900 + }, + { + "epoch": 1.5071617494857086, + "grad_norm": 0.9839722514152527, + "learning_rate": 1.4281459238223598e-05, + "loss": 0.7332, + "step": 235910 + }, + { + "epoch": 1.507225636635447, + "grad_norm": 0.6878924369812012, + "learning_rate": 1.42779482059681e-05, + "loss": 1.1979, + "step": 235920 + }, + { + "epoch": 1.507289523785186, + "grad_norm": 1.0222363471984863, + "learning_rate": 1.4274437533464552e-05, + "loss": 0.752, + "step": 235930 + }, + { + "epoch": 1.5073534109349245, + "grad_norm": 0.9550470113754272, + "learning_rate": 1.4270927220748348e-05, + "loss": 0.8941, + "step": 235940 + }, + { + "epoch": 1.5074172980846634, + "grad_norm": 1.4349061250686646, + "learning_rate": 1.4267417267854793e-05, + "loss": 0.6205, + "step": 235950 + }, + { + "epoch": 1.5074811852344019, + "grad_norm": 0.6272745728492737, + "learning_rate": 1.4263907674819277e-05, + "loss": 0.598, + "step": 235960 + }, + { + "epoch": 1.5075450723841408, + "grad_norm": 1.3375252485275269, + "learning_rate": 1.4260398441677114e-05, + "loss": 0.7406, + "step": 235970 + }, + { + "epoch": 1.5076089595338793, + "grad_norm": 0.8655692338943481, + "learning_rate": 1.4256889568463671e-05, + "loss": 0.9343, + "step": 235980 + }, + { + "epoch": 1.507672846683618, + "grad_norm": 0.8092108964920044, + "learning_rate": 
1.4253381055214254e-05, + "loss": 0.8449, + "step": 235990 + }, + { + "epoch": 1.5077367338333567, + "grad_norm": 0.7435670495033264, + "learning_rate": 1.4249872901964234e-05, + "loss": 1.1352, + "step": 236000 + }, + { + "epoch": 1.5078006209830954, + "grad_norm": 1.159881591796875, + "learning_rate": 1.4246365108748904e-05, + "loss": 0.916, + "step": 236010 + }, + { + "epoch": 1.5078645081328341, + "grad_norm": 0.8930804133415222, + "learning_rate": 1.4242857675603627e-05, + "loss": 0.7678, + "step": 236020 + }, + { + "epoch": 1.5079283952825728, + "grad_norm": 0.9131158590316772, + "learning_rate": 1.4239350602563688e-05, + "loss": 0.9283, + "step": 236030 + }, + { + "epoch": 1.5079922824323115, + "grad_norm": 2.3771884441375732, + "learning_rate": 1.4235843889664447e-05, + "loss": 0.8729, + "step": 236040 + }, + { + "epoch": 1.5080561695820502, + "grad_norm": 0.689081609249115, + "learning_rate": 1.4232337536941182e-05, + "loss": 0.9161, + "step": 236050 + }, + { + "epoch": 1.508120056731789, + "grad_norm": 0.6467447876930237, + "learning_rate": 1.4228831544429222e-05, + "loss": 0.8958, + "step": 236060 + }, + { + "epoch": 1.5081839438815277, + "grad_norm": 0.8249161839485168, + "learning_rate": 1.422532591216389e-05, + "loss": 0.8604, + "step": 236070 + }, + { + "epoch": 1.5082478310312664, + "grad_norm": 1.0231643915176392, + "learning_rate": 1.4221820640180456e-05, + "loss": 1.2338, + "step": 236080 + }, + { + "epoch": 1.508311718181005, + "grad_norm": 0.7032090425491333, + "learning_rate": 1.4218315728514253e-05, + "loss": 1.2069, + "step": 236090 + }, + { + "epoch": 1.5083756053307438, + "grad_norm": 0.7626880407333374, + "learning_rate": 1.421481117720056e-05, + "loss": 0.9404, + "step": 236100 + }, + { + "epoch": 1.5084394924804825, + "grad_norm": 0.8990775346755981, + "learning_rate": 1.4211306986274686e-05, + "loss": 0.8909, + "step": 236110 + }, + { + "epoch": 1.5085033796302212, + "grad_norm": 0.8762044906616211, + "learning_rate": 
1.4207803155771898e-05, + "loss": 0.7445, + "step": 236120 + }, + { + "epoch": 1.50856726677996, + "grad_norm": 0.8042936325073242, + "learning_rate": 1.4204299685727518e-05, + "loss": 0.824, + "step": 236130 + }, + { + "epoch": 1.5086311539296986, + "grad_norm": 0.9430978298187256, + "learning_rate": 1.4200796576176788e-05, + "loss": 0.7238, + "step": 236140 + }, + { + "epoch": 1.5086950410794373, + "grad_norm": 1.2746912240982056, + "learning_rate": 1.4197293827155023e-05, + "loss": 0.914, + "step": 236150 + }, + { + "epoch": 1.508758928229176, + "grad_norm": 1.2489253282546997, + "learning_rate": 1.4193791438697467e-05, + "loss": 0.9566, + "step": 236160 + }, + { + "epoch": 1.5088228153789147, + "grad_norm": 0.8797708749771118, + "learning_rate": 1.4190289410839425e-05, + "loss": 1.0164, + "step": 236170 + }, + { + "epoch": 1.5088867025286534, + "grad_norm": 0.6316624879837036, + "learning_rate": 1.418678774361613e-05, + "loss": 0.8057, + "step": 236180 + }, + { + "epoch": 1.5089505896783921, + "grad_norm": 0.9060586094856262, + "learning_rate": 1.4183286437062882e-05, + "loss": 0.8373, + "step": 236190 + }, + { + "epoch": 1.5090144768281308, + "grad_norm": 0.741062581539154, + "learning_rate": 1.417978549121491e-05, + "loss": 0.8532, + "step": 236200 + }, + { + "epoch": 1.5090783639778695, + "grad_norm": 1.327261209487915, + "learning_rate": 1.4176284906107501e-05, + "loss": 0.7526, + "step": 236210 + }, + { + "epoch": 1.5091422511276082, + "grad_norm": 0.6677632927894592, + "learning_rate": 1.417278468177588e-05, + "loss": 0.8935, + "step": 236220 + }, + { + "epoch": 1.5092061382773467, + "grad_norm": 0.711325466632843, + "learning_rate": 1.416928481825533e-05, + "loss": 1.1103, + "step": 236230 + }, + { + "epoch": 1.5092700254270857, + "grad_norm": 1.0160562992095947, + "learning_rate": 1.4165785315581066e-05, + "loss": 0.7888, + "step": 236240 + }, + { + "epoch": 1.5093339125768241, + "grad_norm": 0.8401907682418823, + "learning_rate": 
1.4162286173788359e-05, + "loss": 0.7818, + "step": 236250 + }, + { + "epoch": 1.509397799726563, + "grad_norm": 0.573164701461792, + "learning_rate": 1.4158787392912431e-05, + "loss": 0.716, + "step": 236260 + }, + { + "epoch": 1.5094616868763016, + "grad_norm": 0.5766507983207703, + "learning_rate": 1.4155288972988507e-05, + "loss": 0.7058, + "step": 236270 + }, + { + "epoch": 1.5095255740260405, + "grad_norm": 1.2178000211715698, + "learning_rate": 1.4151790914051849e-05, + "loss": 0.8116, + "step": 236280 + }, + { + "epoch": 1.509589461175779, + "grad_norm": 0.9472524523735046, + "learning_rate": 1.4148293216137654e-05, + "loss": 0.725, + "step": 236290 + }, + { + "epoch": 1.509653348325518, + "grad_norm": 1.013202428817749, + "learning_rate": 1.414479587928118e-05, + "loss": 0.9632, + "step": 236300 + }, + { + "epoch": 1.5097172354752564, + "grad_norm": 0.6979582905769348, + "learning_rate": 1.4141298903517608e-05, + "loss": 0.9035, + "step": 236310 + }, + { + "epoch": 1.5097811226249953, + "grad_norm": 0.641642689704895, + "learning_rate": 1.4137802288882202e-05, + "loss": 0.9259, + "step": 236320 + }, + { + "epoch": 1.5098450097747338, + "grad_norm": 0.8102341294288635, + "learning_rate": 1.4134306035410134e-05, + "loss": 0.8065, + "step": 236330 + }, + { + "epoch": 1.5099088969244727, + "grad_norm": 0.8578863143920898, + "learning_rate": 1.4130810143136646e-05, + "loss": 0.847, + "step": 236340 + }, + { + "epoch": 1.5099727840742112, + "grad_norm": 1.7385120391845703, + "learning_rate": 1.412731461209692e-05, + "loss": 0.9918, + "step": 236350 + }, + { + "epoch": 1.5100366712239501, + "grad_norm": 1.2686347961425781, + "learning_rate": 1.412381944232618e-05, + "loss": 0.9261, + "step": 236360 + }, + { + "epoch": 1.5101005583736886, + "grad_norm": 0.8600685596466064, + "learning_rate": 1.4120324633859605e-05, + "loss": 0.8673, + "step": 236370 + }, + { + "epoch": 1.5101644455234275, + "grad_norm": 1.0204668045043945, + "learning_rate": 1.41168301867324e-05, 
+ "loss": 0.912, + "step": 236380 + }, + { + "epoch": 1.510228332673166, + "grad_norm": 0.8734965920448303, + "learning_rate": 1.4113336100979767e-05, + "loss": 0.6381, + "step": 236390 + }, + { + "epoch": 1.510292219822905, + "grad_norm": 0.6682913303375244, + "learning_rate": 1.4109842376636878e-05, + "loss": 0.9431, + "step": 236400 + }, + { + "epoch": 1.5103561069726434, + "grad_norm": 2.5903491973876953, + "learning_rate": 1.410634901373894e-05, + "loss": 0.8283, + "step": 236410 + }, + { + "epoch": 1.5104199941223824, + "grad_norm": 0.5716591477394104, + "learning_rate": 1.4102856012321104e-05, + "loss": 0.9301, + "step": 236420 + }, + { + "epoch": 1.5104838812721209, + "grad_norm": 0.8407396674156189, + "learning_rate": 1.4099363372418584e-05, + "loss": 1.024, + "step": 236430 + }, + { + "epoch": 1.5105477684218598, + "grad_norm": 1.072849988937378, + "learning_rate": 1.409587109406651e-05, + "loss": 1.2643, + "step": 236440 + }, + { + "epoch": 1.5106116555715983, + "grad_norm": 0.9766477346420288, + "learning_rate": 1.4092379177300091e-05, + "loss": 0.8839, + "step": 236450 + }, + { + "epoch": 1.5106755427213372, + "grad_norm": 0.8836285471916199, + "learning_rate": 1.4088887622154468e-05, + "loss": 1.0016, + "step": 236460 + }, + { + "epoch": 1.5107394298710757, + "grad_norm": 0.9911049008369446, + "learning_rate": 1.4085396428664826e-05, + "loss": 0.8989, + "step": 236470 + }, + { + "epoch": 1.5108033170208144, + "grad_norm": 1.1944478750228882, + "learning_rate": 1.4081905596866296e-05, + "loss": 0.9794, + "step": 236480 + }, + { + "epoch": 1.510867204170553, + "grad_norm": 0.7414306998252869, + "learning_rate": 1.407841512679407e-05, + "loss": 0.6418, + "step": 236490 + }, + { + "epoch": 1.5109310913202918, + "grad_norm": 1.1465985774993896, + "learning_rate": 1.4074925018483265e-05, + "loss": 0.754, + "step": 236500 + }, + { + "epoch": 1.5109949784700305, + "grad_norm": 1.3809295892715454, + "learning_rate": 1.4071435271969058e-05, + "loss": 0.7055, + 
"step": 236510 + }, + { + "epoch": 1.5110588656197692, + "grad_norm": 0.7024053931236267, + "learning_rate": 1.406794588728656e-05, + "loss": 0.8718, + "step": 236520 + }, + { + "epoch": 1.511122752769508, + "grad_norm": 0.8922861218452454, + "learning_rate": 1.4064456864470954e-05, + "loss": 0.7903, + "step": 236530 + }, + { + "epoch": 1.5111866399192466, + "grad_norm": 0.7403420805931091, + "learning_rate": 1.406096820355734e-05, + "loss": 0.8861, + "step": 236540 + }, + { + "epoch": 1.5112505270689853, + "grad_norm": 0.8601128458976746, + "learning_rate": 1.4057479904580884e-05, + "loss": 1.1446, + "step": 236550 + }, + { + "epoch": 1.511314414218724, + "grad_norm": 0.9789170622825623, + "learning_rate": 1.4053991967576684e-05, + "loss": 0.6798, + "step": 236560 + }, + { + "epoch": 1.5113783013684627, + "grad_norm": 1.0461435317993164, + "learning_rate": 1.4050504392579894e-05, + "loss": 0.7695, + "step": 236570 + }, + { + "epoch": 1.5114421885182014, + "grad_norm": 1.5159739255905151, + "learning_rate": 1.4047017179625616e-05, + "loss": 0.92, + "step": 236580 + }, + { + "epoch": 1.5115060756679402, + "grad_norm": 0.7177718877792358, + "learning_rate": 1.4043530328748976e-05, + "loss": 0.7973, + "step": 236590 + }, + { + "epoch": 1.5115699628176789, + "grad_norm": 1.0641429424285889, + "learning_rate": 1.4040043839985107e-05, + "loss": 0.9135, + "step": 236600 + }, + { + "epoch": 1.5116338499674176, + "grad_norm": 1.2808974981307983, + "learning_rate": 1.4036557713369091e-05, + "loss": 0.7015, + "step": 236610 + }, + { + "epoch": 1.5116977371171563, + "grad_norm": 0.9831389784812927, + "learning_rate": 1.4033071948936071e-05, + "loss": 0.9069, + "step": 236620 + }, + { + "epoch": 1.511761624266895, + "grad_norm": 1.1371793746948242, + "learning_rate": 1.4029586546721113e-05, + "loss": 0.8939, + "step": 236630 + }, + { + "epoch": 1.5118255114166337, + "grad_norm": 0.7853311896324158, + "learning_rate": 1.4026101506759354e-05, + "loss": 0.9674, + "step": 236640 + 
}, + { + "epoch": 1.5118893985663724, + "grad_norm": 0.6516915559768677, + "learning_rate": 1.4022616829085861e-05, + "loss": 0.8483, + "step": 236650 + }, + { + "epoch": 1.511953285716111, + "grad_norm": 0.8449056148529053, + "learning_rate": 1.401913251373575e-05, + "loss": 0.893, + "step": 236660 + }, + { + "epoch": 1.5120171728658498, + "grad_norm": 1.0003093481063843, + "learning_rate": 1.4015648560744093e-05, + "loss": 0.7645, + "step": 236670 + }, + { + "epoch": 1.5120810600155885, + "grad_norm": 1.1056313514709473, + "learning_rate": 1.4012164970146002e-05, + "loss": 0.8626, + "step": 236680 + }, + { + "epoch": 1.5121449471653272, + "grad_norm": 0.949406623840332, + "learning_rate": 1.4008681741976526e-05, + "loss": 0.9225, + "step": 236690 + }, + { + "epoch": 1.512208834315066, + "grad_norm": 0.9006887078285217, + "learning_rate": 1.4005198876270775e-05, + "loss": 0.9581, + "step": 236700 + }, + { + "epoch": 1.5122727214648046, + "grad_norm": 0.7306432127952576, + "learning_rate": 1.40017163730638e-05, + "loss": 0.9412, + "step": 236710 + }, + { + "epoch": 1.512336608614543, + "grad_norm": 1.1052353382110596, + "learning_rate": 1.3998234232390695e-05, + "loss": 1.0588, + "step": 236720 + }, + { + "epoch": 1.512400495764282, + "grad_norm": 1.119249701499939, + "learning_rate": 1.3994752454286525e-05, + "loss": 0.7952, + "step": 236730 + }, + { + "epoch": 1.5124643829140205, + "grad_norm": 0.7595828771591187, + "learning_rate": 1.3991271038786325e-05, + "loss": 0.8514, + "step": 236740 + }, + { + "epoch": 1.5125282700637595, + "grad_norm": 1.1814230680465698, + "learning_rate": 1.3987789985925193e-05, + "loss": 0.8241, + "step": 236750 + }, + { + "epoch": 1.512592157213498, + "grad_norm": 0.7136185765266418, + "learning_rate": 1.3984309295738157e-05, + "loss": 0.8798, + "step": 236760 + }, + { + "epoch": 1.5126560443632369, + "grad_norm": 1.1095455884933472, + "learning_rate": 1.3980828968260296e-05, + "loss": 0.8871, + "step": 236770 + }, + { + "epoch": 
1.5127199315129753, + "grad_norm": 1.1192913055419922, + "learning_rate": 1.397734900352664e-05, + "loss": 0.9155, + "step": 236780 + }, + { + "epoch": 1.5127838186627143, + "grad_norm": 0.9863184094429016, + "learning_rate": 1.3973869401572254e-05, + "loss": 0.907, + "step": 236790 + }, + { + "epoch": 1.5128477058124528, + "grad_norm": 0.8944969773292542, + "learning_rate": 1.3970390162432156e-05, + "loss": 1.1686, + "step": 236800 + }, + { + "epoch": 1.5129115929621917, + "grad_norm": 0.6087459325790405, + "learning_rate": 1.3966911286141416e-05, + "loss": 0.9757, + "step": 236810 + }, + { + "epoch": 1.5129754801119302, + "grad_norm": 1.1121567487716675, + "learning_rate": 1.3963432772735036e-05, + "loss": 0.8356, + "step": 236820 + }, + { + "epoch": 1.513039367261669, + "grad_norm": 2.9160475730895996, + "learning_rate": 1.3959954622248078e-05, + "loss": 0.9628, + "step": 236830 + }, + { + "epoch": 1.5131032544114076, + "grad_norm": 0.8892576694488525, + "learning_rate": 1.3956476834715544e-05, + "loss": 0.8443, + "step": 236840 + }, + { + "epoch": 1.5131671415611465, + "grad_norm": 0.9479966759681702, + "learning_rate": 1.3952999410172485e-05, + "loss": 0.7571, + "step": 236850 + }, + { + "epoch": 1.513231028710885, + "grad_norm": 1.493237853050232, + "learning_rate": 1.3949522348653887e-05, + "loss": 0.9724, + "step": 236860 + }, + { + "epoch": 1.513294915860624, + "grad_norm": 0.968614399433136, + "learning_rate": 1.3946045650194806e-05, + "loss": 0.8699, + "step": 236870 + }, + { + "epoch": 1.5133588030103624, + "grad_norm": 0.9419978857040405, + "learning_rate": 1.3942569314830218e-05, + "loss": 1.1052, + "step": 236880 + }, + { + "epoch": 1.5134226901601013, + "grad_norm": 1.1808056831359863, + "learning_rate": 1.3939093342595172e-05, + "loss": 1.2063, + "step": 236890 + }, + { + "epoch": 1.5134865773098398, + "grad_norm": 1.5204524993896484, + "learning_rate": 1.3935617733524636e-05, + "loss": 0.6257, + "step": 236900 + }, + { + "epoch": 
1.5135504644595787, + "grad_norm": 1.0782991647720337, + "learning_rate": 1.3932142487653627e-05, + "loss": 0.7599, + "step": 236910 + }, + { + "epoch": 1.5136143516093172, + "grad_norm": 1.501810073852539, + "learning_rate": 1.3928667605017165e-05, + "loss": 1.0803, + "step": 236920 + }, + { + "epoch": 1.5136782387590562, + "grad_norm": 0.753489077091217, + "learning_rate": 1.3925193085650207e-05, + "loss": 1.2847, + "step": 236930 + }, + { + "epoch": 1.5137421259087946, + "grad_norm": 0.9196399450302124, + "learning_rate": 1.3921718929587779e-05, + "loss": 0.8163, + "step": 236940 + }, + { + "epoch": 1.5138060130585336, + "grad_norm": 1.3756850957870483, + "learning_rate": 1.3918245136864844e-05, + "loss": 0.8701, + "step": 236950 + }, + { + "epoch": 1.513869900208272, + "grad_norm": 1.2189220190048218, + "learning_rate": 1.3914771707516406e-05, + "loss": 0.7241, + "step": 236960 + }, + { + "epoch": 1.5139337873580108, + "grad_norm": 0.7605637311935425, + "learning_rate": 1.3911298641577425e-05, + "loss": 0.7165, + "step": 236970 + }, + { + "epoch": 1.5139976745077495, + "grad_norm": 1.6095051765441895, + "learning_rate": 1.3907825939082897e-05, + "loss": 0.928, + "step": 236980 + }, + { + "epoch": 1.5140615616574882, + "grad_norm": 1.8448421955108643, + "learning_rate": 1.390435360006777e-05, + "loss": 0.8557, + "step": 236990 + }, + { + "epoch": 1.5141254488072269, + "grad_norm": 1.3630905151367188, + "learning_rate": 1.3900881624567053e-05, + "loss": 1.0013, + "step": 237000 + }, + { + "epoch": 1.5141893359569656, + "grad_norm": 0.9617922306060791, + "learning_rate": 1.3897410012615664e-05, + "loss": 0.8883, + "step": 237010 + }, + { + "epoch": 1.5142532231067043, + "grad_norm": 1.3461328744888306, + "learning_rate": 1.3893938764248609e-05, + "loss": 1.0668, + "step": 237020 + }, + { + "epoch": 1.514317110256443, + "grad_norm": 1.3913925886154175, + "learning_rate": 1.3890467879500813e-05, + "loss": 1.0365, + "step": 237030 + }, + { + "epoch": 
1.5143809974061817, + "grad_norm": 1.317252516746521, + "learning_rate": 1.3886997358407256e-05, + "loss": 0.8252, + "step": 237040 + }, + { + "epoch": 1.5144448845559204, + "grad_norm": 1.155108094215393, + "learning_rate": 1.388352720100286e-05, + "loss": 0.9447, + "step": 237050 + }, + { + "epoch": 1.5145087717056591, + "grad_norm": 0.9392795562744141, + "learning_rate": 1.3880057407322612e-05, + "loss": 0.8071, + "step": 237060 + }, + { + "epoch": 1.5145726588553978, + "grad_norm": 0.9975323677062988, + "learning_rate": 1.3876587977401418e-05, + "loss": 0.9965, + "step": 237070 + }, + { + "epoch": 1.5146365460051365, + "grad_norm": 0.9569724798202515, + "learning_rate": 1.3873118911274247e-05, + "loss": 0.5875, + "step": 237080 + }, + { + "epoch": 1.5147004331548752, + "grad_norm": 0.8880897164344788, + "learning_rate": 1.386965020897601e-05, + "loss": 0.9679, + "step": 237090 + }, + { + "epoch": 1.514764320304614, + "grad_norm": 1.2377578020095825, + "learning_rate": 1.3866181870541667e-05, + "loss": 1.1771, + "step": 237100 + }, + { + "epoch": 1.5148282074543526, + "grad_norm": 1.317079782485962, + "learning_rate": 1.3862713896006118e-05, + "loss": 0.6383, + "step": 237110 + }, + { + "epoch": 1.5148920946040914, + "grad_norm": 1.1456557512283325, + "learning_rate": 1.3859246285404304e-05, + "loss": 0.6171, + "step": 237120 + }, + { + "epoch": 1.51495598175383, + "grad_norm": 0.9332743883132935, + "learning_rate": 1.3855779038771156e-05, + "loss": 1.0269, + "step": 237130 + }, + { + "epoch": 1.5150198689035688, + "grad_norm": 1.2689080238342285, + "learning_rate": 1.3852312156141573e-05, + "loss": 0.7387, + "step": 237140 + }, + { + "epoch": 1.5150837560533075, + "grad_norm": 1.0080506801605225, + "learning_rate": 1.384884563755049e-05, + "loss": 0.7163, + "step": 237150 + }, + { + "epoch": 1.5151476432030462, + "grad_norm": 0.8763740062713623, + "learning_rate": 1.3845379483032794e-05, + "loss": 0.7504, + "step": 237160 + }, + { + "epoch": 1.5152115303527849, 
+ "grad_norm": 1.120248794555664, + "learning_rate": 1.3841913692623421e-05, + "loss": 0.8088, + "step": 237170 + }, + { + "epoch": 1.5152754175025236, + "grad_norm": 1.2910860776901245, + "learning_rate": 1.383844826635724e-05, + "loss": 0.9404, + "step": 237180 + }, + { + "epoch": 1.5153393046522623, + "grad_norm": 1.2270375490188599, + "learning_rate": 1.3834983204269186e-05, + "loss": 0.8614, + "step": 237190 + }, + { + "epoch": 1.515403191802001, + "grad_norm": 1.0125607252120972, + "learning_rate": 1.383151850639412e-05, + "loss": 0.9424, + "step": 237200 + }, + { + "epoch": 1.5154670789517395, + "grad_norm": 1.463198184967041, + "learning_rate": 1.3828054172766974e-05, + "loss": 0.927, + "step": 237210 + }, + { + "epoch": 1.5155309661014784, + "grad_norm": 1.2470359802246094, + "learning_rate": 1.382459020342261e-05, + "loss": 0.9859, + "step": 237220 + }, + { + "epoch": 1.515594853251217, + "grad_norm": 0.8071841597557068, + "learning_rate": 1.3821126598395906e-05, + "loss": 0.8442, + "step": 237230 + }, + { + "epoch": 1.5156587404009558, + "grad_norm": 0.9477114677429199, + "learning_rate": 1.3817663357721772e-05, + "loss": 0.9196, + "step": 237240 + }, + { + "epoch": 1.5157226275506943, + "grad_norm": 1.4145262241363525, + "learning_rate": 1.3814200481435057e-05, + "loss": 0.6977, + "step": 237250 + }, + { + "epoch": 1.5157865147004332, + "grad_norm": 0.6105630397796631, + "learning_rate": 1.3810737969570659e-05, + "loss": 0.8531, + "step": 237260 + }, + { + "epoch": 1.5158504018501717, + "grad_norm": 0.530202329158783, + "learning_rate": 1.3807275822163429e-05, + "loss": 1.0427, + "step": 237270 + }, + { + "epoch": 1.5159142889999107, + "grad_norm": 1.427032709121704, + "learning_rate": 1.3803814039248247e-05, + "loss": 0.9795, + "step": 237280 + }, + { + "epoch": 1.5159781761496491, + "grad_norm": 1.3998620510101318, + "learning_rate": 1.380035262085997e-05, + "loss": 1.2296, + "step": 237290 + }, + { + "epoch": 1.516042063299388, + "grad_norm": 
0.7057076096534729, + "learning_rate": 1.3796891567033466e-05, + "loss": 0.8736, + "step": 237300 + }, + { + "epoch": 1.5161059504491265, + "grad_norm": 0.9350077509880066, + "learning_rate": 1.379343087780357e-05, + "loss": 0.5734, + "step": 237310 + }, + { + "epoch": 1.5161698375988655, + "grad_norm": 1.0376969575881958, + "learning_rate": 1.3789970553205161e-05, + "loss": 0.6677, + "step": 237320 + }, + { + "epoch": 1.516233724748604, + "grad_norm": 0.796563982963562, + "learning_rate": 1.3786510593273067e-05, + "loss": 0.7587, + "step": 237330 + }, + { + "epoch": 1.516297611898343, + "grad_norm": 1.1124401092529297, + "learning_rate": 1.3783050998042146e-05, + "loss": 0.7973, + "step": 237340 + }, + { + "epoch": 1.5163614990480814, + "grad_norm": 0.6725632548332214, + "learning_rate": 1.3779591767547223e-05, + "loss": 0.7605, + "step": 237350 + }, + { + "epoch": 1.5164253861978203, + "grad_norm": 0.9825790524482727, + "learning_rate": 1.3776132901823163e-05, + "loss": 0.8773, + "step": 237360 + }, + { + "epoch": 1.5164892733475588, + "grad_norm": 0.533623456954956, + "learning_rate": 1.3772674400904766e-05, + "loss": 0.7198, + "step": 237370 + }, + { + "epoch": 1.5165531604972977, + "grad_norm": 0.9449812173843384, + "learning_rate": 1.3769216264826895e-05, + "loss": 0.6792, + "step": 237380 + }, + { + "epoch": 1.5166170476470362, + "grad_norm": 0.7161667346954346, + "learning_rate": 1.3765758493624348e-05, + "loss": 0.7869, + "step": 237390 + }, + { + "epoch": 1.5166809347967751, + "grad_norm": 1.3305622339248657, + "learning_rate": 1.376230108733197e-05, + "loss": 0.9165, + "step": 237400 + }, + { + "epoch": 1.5167448219465136, + "grad_norm": 0.9296539425849915, + "learning_rate": 1.3758844045984553e-05, + "loss": 0.6234, + "step": 237410 + }, + { + "epoch": 1.5168087090962525, + "grad_norm": 0.8305658102035522, + "learning_rate": 1.3755387369616952e-05, + "loss": 0.9068, + "step": 237420 + }, + { + "epoch": 1.516872596245991, + "grad_norm": 
0.8052715063095093, + "learning_rate": 1.3751931058263933e-05, + "loss": 0.8243, + "step": 237430 + }, + { + "epoch": 1.5169364833957297, + "grad_norm": 0.8373987078666687, + "learning_rate": 1.3748475111960334e-05, + "loss": 0.9154, + "step": 237440 + }, + { + "epoch": 1.5170003705454684, + "grad_norm": 1.054856538772583, + "learning_rate": 1.3745019530740965e-05, + "loss": 0.8902, + "step": 237450 + }, + { + "epoch": 1.5170642576952071, + "grad_norm": 0.9032215476036072, + "learning_rate": 1.3741564314640599e-05, + "loss": 0.8574, + "step": 237460 + }, + { + "epoch": 1.5171281448449458, + "grad_norm": 0.979597806930542, + "learning_rate": 1.3738109463694065e-05, + "loss": 1.0963, + "step": 237470 + }, + { + "epoch": 1.5171920319946846, + "grad_norm": 1.0569349527359009, + "learning_rate": 1.3734654977936123e-05, + "loss": 1.2293, + "step": 237480 + }, + { + "epoch": 1.5172559191444233, + "grad_norm": 0.7811325192451477, + "learning_rate": 1.3731200857401594e-05, + "loss": 0.8444, + "step": 237490 + }, + { + "epoch": 1.517319806294162, + "grad_norm": 0.9610899090766907, + "learning_rate": 1.3727747102125239e-05, + "loss": 1.0613, + "step": 237500 + }, + { + "epoch": 1.5173836934439007, + "grad_norm": 1.7076857089996338, + "learning_rate": 1.3724293712141862e-05, + "loss": 0.8392, + "step": 237510 + }, + { + "epoch": 1.5174475805936394, + "grad_norm": 0.8330065608024597, + "learning_rate": 1.3720840687486215e-05, + "loss": 0.7585, + "step": 237520 + }, + { + "epoch": 1.517511467743378, + "grad_norm": 1.0238244533538818, + "learning_rate": 1.3717388028193101e-05, + "loss": 0.9074, + "step": 237530 + }, + { + "epoch": 1.5175753548931168, + "grad_norm": 0.796726405620575, + "learning_rate": 1.3713935734297268e-05, + "loss": 0.8588, + "step": 237540 + }, + { + "epoch": 1.5176392420428555, + "grad_norm": 1.0146052837371826, + "learning_rate": 1.3710483805833507e-05, + "loss": 0.8888, + "step": 237550 + }, + { + "epoch": 1.5177031291925942, + "grad_norm": 
0.8111674189567566, + "learning_rate": 1.3707032242836554e-05, + "loss": 1.0352, + "step": 237560 + }, + { + "epoch": 1.517767016342333, + "grad_norm": 1.2000855207443237, + "learning_rate": 1.3703581045341196e-05, + "loss": 0.9485, + "step": 237570 + }, + { + "epoch": 1.5178309034920716, + "grad_norm": 0.7340511083602905, + "learning_rate": 1.3700130213382173e-05, + "loss": 0.9307, + "step": 237580 + }, + { + "epoch": 1.5178947906418103, + "grad_norm": 1.05705988407135, + "learning_rate": 1.3696679746994251e-05, + "loss": 0.7477, + "step": 237590 + }, + { + "epoch": 1.517958677791549, + "grad_norm": 1.2210100889205933, + "learning_rate": 1.3693229646212153e-05, + "loss": 0.8642, + "step": 237600 + }, + { + "epoch": 1.5180225649412877, + "grad_norm": 0.6993927955627441, + "learning_rate": 1.368977991107066e-05, + "loss": 0.6987, + "step": 237610 + }, + { + "epoch": 1.5180864520910264, + "grad_norm": 1.8930222988128662, + "learning_rate": 1.3686330541604481e-05, + "loss": 0.7734, + "step": 237620 + }, + { + "epoch": 1.5181503392407651, + "grad_norm": 0.5977169275283813, + "learning_rate": 1.3682881537848385e-05, + "loss": 0.8858, + "step": 237630 + }, + { + "epoch": 1.5182142263905039, + "grad_norm": 0.9742574095726013, + "learning_rate": 1.3679432899837075e-05, + "loss": 0.9346, + "step": 237640 + }, + { + "epoch": 1.5182781135402426, + "grad_norm": 0.7708993554115295, + "learning_rate": 1.3675984627605298e-05, + "loss": 0.9882, + "step": 237650 + }, + { + "epoch": 1.5183420006899813, + "grad_norm": 1.17482590675354, + "learning_rate": 1.367253672118779e-05, + "loss": 0.9899, + "step": 237660 + }, + { + "epoch": 1.51840588783972, + "grad_norm": 2.454292058944702, + "learning_rate": 1.3669089180619255e-05, + "loss": 1.1206, + "step": 237670 + }, + { + "epoch": 1.5184697749894585, + "grad_norm": 1.0289863348007202, + "learning_rate": 1.3665642005934436e-05, + "loss": 0.9492, + "step": 237680 + }, + { + "epoch": 1.5185336621391974, + "grad_norm": 1.0246806144714355, + 
"learning_rate": 1.3662195197168026e-05, + "loss": 0.7844, + "step": 237690 + }, + { + "epoch": 1.5185975492889359, + "grad_norm": 0.8510035276412964, + "learning_rate": 1.3658748754354773e-05, + "loss": 0.8301, + "step": 237700 + }, + { + "epoch": 1.5186614364386748, + "grad_norm": 1.074318289756775, + "learning_rate": 1.365530267752933e-05, + "loss": 0.573, + "step": 237710 + }, + { + "epoch": 1.5187253235884133, + "grad_norm": 1.842522382736206, + "learning_rate": 1.3651856966726445e-05, + "loss": 0.9324, + "step": 237720 + }, + { + "epoch": 1.5187892107381522, + "grad_norm": 0.6383247375488281, + "learning_rate": 1.3648411621980794e-05, + "loss": 1.0601, + "step": 237730 + }, + { + "epoch": 1.5188530978878907, + "grad_norm": 0.7060561180114746, + "learning_rate": 1.3644966643327101e-05, + "loss": 0.7332, + "step": 237740 + }, + { + "epoch": 1.5189169850376296, + "grad_norm": 1.0963681936264038, + "learning_rate": 1.3641522030800025e-05, + "loss": 0.9709, + "step": 237750 + }, + { + "epoch": 1.518980872187368, + "grad_norm": 0.8624010682106018, + "learning_rate": 1.3638077784434283e-05, + "loss": 0.8324, + "step": 237760 + }, + { + "epoch": 1.519044759337107, + "grad_norm": 0.6828634738922119, + "learning_rate": 1.3634633904264572e-05, + "loss": 0.7604, + "step": 237770 + }, + { + "epoch": 1.5191086464868455, + "grad_norm": 1.0846530199050903, + "learning_rate": 1.3631190390325539e-05, + "loss": 0.7711, + "step": 237780 + }, + { + "epoch": 1.5191725336365844, + "grad_norm": 0.6594371795654297, + "learning_rate": 1.3627747242651895e-05, + "loss": 0.7386, + "step": 237790 + }, + { + "epoch": 1.519236420786323, + "grad_norm": 1.0198477506637573, + "learning_rate": 1.3624304461278292e-05, + "loss": 0.8324, + "step": 237800 + }, + { + "epoch": 1.5193003079360619, + "grad_norm": 0.8147205710411072, + "learning_rate": 1.3620862046239425e-05, + "loss": 1.0197, + "step": 237810 + }, + { + "epoch": 1.5193641950858003, + "grad_norm": 0.9647092819213867, + "learning_rate": 
1.3617419997569936e-05, + "loss": 0.8689, + "step": 237820 + }, + { + "epoch": 1.5194280822355393, + "grad_norm": 1.2450505495071411, + "learning_rate": 1.361397831530452e-05, + "loss": 0.9359, + "step": 237830 + }, + { + "epoch": 1.5194919693852778, + "grad_norm": 1.0000580549240112, + "learning_rate": 1.3610536999477802e-05, + "loss": 1.0524, + "step": 237840 + }, + { + "epoch": 1.5195558565350167, + "grad_norm": 0.6471056342124939, + "learning_rate": 1.3607096050124474e-05, + "loss": 1.0834, + "step": 237850 + }, + { + "epoch": 1.5196197436847552, + "grad_norm": 0.9813455939292908, + "learning_rate": 1.3603655467279158e-05, + "loss": 0.8514, + "step": 237860 + }, + { + "epoch": 1.519683630834494, + "grad_norm": 0.793258011341095, + "learning_rate": 1.3600215250976533e-05, + "loss": 0.8319, + "step": 237870 + }, + { + "epoch": 1.5197475179842326, + "grad_norm": 0.9160090684890747, + "learning_rate": 1.3596775401251222e-05, + "loss": 0.9845, + "step": 237880 + }, + { + "epoch": 1.5198114051339715, + "grad_norm": 0.8141654133796692, + "learning_rate": 1.3593335918137883e-05, + "loss": 0.9324, + "step": 237890 + }, + { + "epoch": 1.51987529228371, + "grad_norm": 1.104178786277771, + "learning_rate": 1.3589896801671137e-05, + "loss": 0.9644, + "step": 237900 + }, + { + "epoch": 1.519939179433449, + "grad_norm": 0.8147308230400085, + "learning_rate": 1.3586458051885641e-05, + "loss": 0.8855, + "step": 237910 + }, + { + "epoch": 1.5200030665831874, + "grad_norm": 0.9954264163970947, + "learning_rate": 1.3583019668816004e-05, + "loss": 0.7827, + "step": 237920 + }, + { + "epoch": 1.520066953732926, + "grad_norm": 1.1882027387619019, + "learning_rate": 1.3579581652496875e-05, + "loss": 0.9866, + "step": 237930 + }, + { + "epoch": 1.5201308408826648, + "grad_norm": 0.8633627891540527, + "learning_rate": 1.3576144002962854e-05, + "loss": 0.7234, + "step": 237940 + }, + { + "epoch": 1.5201947280324035, + "grad_norm": 4.5558671951293945, + "learning_rate": 
1.3572706720248584e-05, + "loss": 0.8573, + "step": 237950 + }, + { + "epoch": 1.5202586151821422, + "grad_norm": 1.0978244543075562, + "learning_rate": 1.3569269804388663e-05, + "loss": 0.8994, + "step": 237960 + }, + { + "epoch": 1.520322502331881, + "grad_norm": 1.1287128925323486, + "learning_rate": 1.3566176893803823e-05, + "loss": 0.9077, + "step": 237970 + }, + { + "epoch": 1.5203863894816196, + "grad_norm": 1.094420313835144, + "learning_rate": 1.3562740675062546e-05, + "loss": 0.8999, + "step": 237980 + }, + { + "epoch": 1.5204502766313583, + "grad_norm": 0.8271204233169556, + "learning_rate": 1.3559304823275987e-05, + "loss": 0.8423, + "step": 237990 + }, + { + "epoch": 1.520514163781097, + "grad_norm": 1.0374590158462524, + "learning_rate": 1.355586933847876e-05, + "loss": 0.9159, + "step": 238000 + }, + { + "epoch": 1.5205780509308358, + "grad_norm": 0.7326343655586243, + "learning_rate": 1.3552434220705446e-05, + "loss": 0.7634, + "step": 238010 + }, + { + "epoch": 1.5206419380805745, + "grad_norm": 0.7431874871253967, + "learning_rate": 1.3548999469990663e-05, + "loss": 0.895, + "step": 238020 + }, + { + "epoch": 1.5207058252303132, + "grad_norm": 1.8628556728363037, + "learning_rate": 1.3545565086368977e-05, + "loss": 0.9853, + "step": 238030 + }, + { + "epoch": 1.5207697123800519, + "grad_norm": 1.555378794670105, + "learning_rate": 1.3542131069875007e-05, + "loss": 1.007, + "step": 238040 + }, + { + "epoch": 1.5208335995297906, + "grad_norm": 1.2259501218795776, + "learning_rate": 1.35386974205433e-05, + "loss": 0.7222, + "step": 238050 + }, + { + "epoch": 1.5208974866795293, + "grad_norm": 0.804809033870697, + "learning_rate": 1.3535264138408466e-05, + "loss": 0.8414, + "step": 238060 + }, + { + "epoch": 1.520961373829268, + "grad_norm": 0.8649407625198364, + "learning_rate": 1.3531831223505059e-05, + "loss": 0.6925, + "step": 238070 + }, + { + "epoch": 1.5210252609790067, + "grad_norm": 0.8720270991325378, + "learning_rate": 
1.3528398675867653e-05, + "loss": 0.7738, + "step": 238080 + }, + { + "epoch": 1.5210891481287454, + "grad_norm": 0.8932050466537476, + "learning_rate": 1.352496649553085e-05, + "loss": 0.9991, + "step": 238090 + }, + { + "epoch": 1.5211530352784841, + "grad_norm": 1.077694296836853, + "learning_rate": 1.3521534682529163e-05, + "loss": 0.9286, + "step": 238100 + }, + { + "epoch": 1.5212169224282228, + "grad_norm": 0.7399711012840271, + "learning_rate": 1.3518103236897206e-05, + "loss": 0.8307, + "step": 238110 + }, + { + "epoch": 1.5212808095779615, + "grad_norm": 0.7662681937217712, + "learning_rate": 1.351467215866949e-05, + "loss": 0.6985, + "step": 238120 + }, + { + "epoch": 1.5213446967277002, + "grad_norm": 0.9886029958724976, + "learning_rate": 1.3511241447880606e-05, + "loss": 0.9631, + "step": 238130 + }, + { + "epoch": 1.521408583877439, + "grad_norm": 0.6486461758613586, + "learning_rate": 1.3507811104565072e-05, + "loss": 1.0866, + "step": 238140 + }, + { + "epoch": 1.5214724710271776, + "grad_norm": 0.5925334095954895, + "learning_rate": 1.3504381128757465e-05, + "loss": 0.7503, + "step": 238150 + }, + { + "epoch": 1.5215363581769163, + "grad_norm": 0.9701249599456787, + "learning_rate": 1.35009515204923e-05, + "loss": 0.8577, + "step": 238160 + }, + { + "epoch": 1.5216002453266548, + "grad_norm": 1.4359204769134521, + "learning_rate": 1.3497522279804142e-05, + "loss": 0.9309, + "step": 238170 + }, + { + "epoch": 1.5216641324763938, + "grad_norm": 0.606021523475647, + "learning_rate": 1.3494093406727504e-05, + "loss": 0.8931, + "step": 238180 + }, + { + "epoch": 1.5217280196261322, + "grad_norm": 1.6420189142227173, + "learning_rate": 1.3490664901296935e-05, + "loss": 1.0485, + "step": 238190 + }, + { + "epoch": 1.5217919067758712, + "grad_norm": 0.7900764346122742, + "learning_rate": 1.3487236763546946e-05, + "loss": 0.8114, + "step": 238200 + }, + { + "epoch": 1.5218557939256097, + "grad_norm": 1.0666368007659912, + "learning_rate": 
1.3483808993512088e-05, + "loss": 0.962, + "step": 238210 + }, + { + "epoch": 1.5219196810753486, + "grad_norm": 0.7549812197685242, + "learning_rate": 1.3480381591226843e-05, + "loss": 0.9305, + "step": 238220 + }, + { + "epoch": 1.521983568225087, + "grad_norm": 1.072159767150879, + "learning_rate": 1.3476954556725768e-05, + "loss": 0.7658, + "step": 238230 + }, + { + "epoch": 1.522047455374826, + "grad_norm": 1.3268358707427979, + "learning_rate": 1.3473527890043342e-05, + "loss": 1.0262, + "step": 238240 + }, + { + "epoch": 1.5221113425245645, + "grad_norm": 0.9297296404838562, + "learning_rate": 1.3470101591214102e-05, + "loss": 0.8517, + "step": 238250 + }, + { + "epoch": 1.5221752296743034, + "grad_norm": 0.6360511183738708, + "learning_rate": 1.3466675660272532e-05, + "loss": 0.6929, + "step": 238260 + }, + { + "epoch": 1.522239116824042, + "grad_norm": 1.2996023893356323, + "learning_rate": 1.3463250097253155e-05, + "loss": 0.7778, + "step": 238270 + }, + { + "epoch": 1.5223030039737808, + "grad_norm": 0.7415247559547424, + "learning_rate": 1.3459824902190444e-05, + "loss": 0.7141, + "step": 238280 + }, + { + "epoch": 1.5223668911235193, + "grad_norm": 1.4085605144500732, + "learning_rate": 1.3456400075118907e-05, + "loss": 0.7232, + "step": 238290 + }, + { + "epoch": 1.5224307782732582, + "grad_norm": 1.129459261894226, + "learning_rate": 1.3452975616073054e-05, + "loss": 0.6778, + "step": 238300 + }, + { + "epoch": 1.5224946654229967, + "grad_norm": 1.0583432912826538, + "learning_rate": 1.3449551525087339e-05, + "loss": 1.1301, + "step": 238310 + }, + { + "epoch": 1.5225585525727356, + "grad_norm": 1.1752994060516357, + "learning_rate": 1.3446127802196273e-05, + "loss": 0.81, + "step": 238320 + }, + { + "epoch": 1.5226224397224741, + "grad_norm": 2.07598614692688, + "learning_rate": 1.3442704447434313e-05, + "loss": 0.9167, + "step": 238330 + }, + { + "epoch": 1.522686326872213, + "grad_norm": 0.9651932716369629, + "learning_rate": 
1.3439281460835973e-05, + "loss": 0.68, + "step": 238340 + }, + { + "epoch": 1.5227502140219515, + "grad_norm": 0.8868149518966675, + "learning_rate": 1.3435858842435667e-05, + "loss": 0.9032, + "step": 238350 + }, + { + "epoch": 1.5228141011716905, + "grad_norm": 0.7346360683441162, + "learning_rate": 1.343243659226791e-05, + "loss": 0.8223, + "step": 238360 + }, + { + "epoch": 1.522877988321429, + "grad_norm": 0.7869923114776611, + "learning_rate": 1.3429014710367139e-05, + "loss": 1.0532, + "step": 238370 + }, + { + "epoch": 1.5229418754711679, + "grad_norm": 1.0384732484817505, + "learning_rate": 1.3425593196767844e-05, + "loss": 0.8243, + "step": 238380 + }, + { + "epoch": 1.5230057626209064, + "grad_norm": 0.7875847816467285, + "learning_rate": 1.3422172051504445e-05, + "loss": 0.7631, + "step": 238390 + }, + { + "epoch": 1.5230696497706453, + "grad_norm": 0.836666464805603, + "learning_rate": 1.3418751274611424e-05, + "loss": 0.9747, + "step": 238400 + }, + { + "epoch": 1.5231335369203838, + "grad_norm": 1.0695717334747314, + "learning_rate": 1.3415330866123244e-05, + "loss": 1.1035, + "step": 238410 + }, + { + "epoch": 1.5231974240701225, + "grad_norm": 0.823056161403656, + "learning_rate": 1.3411910826074308e-05, + "loss": 0.9714, + "step": 238420 + }, + { + "epoch": 1.5232613112198612, + "grad_norm": 0.8672632575035095, + "learning_rate": 1.3408491154499103e-05, + "loss": 0.8351, + "step": 238430 + }, + { + "epoch": 1.5233251983696, + "grad_norm": 0.9678645133972168, + "learning_rate": 1.3405071851432034e-05, + "loss": 0.9878, + "step": 238440 + }, + { + "epoch": 1.5233890855193386, + "grad_norm": 0.7150598168373108, + "learning_rate": 1.3401652916907564e-05, + "loss": 1.0002, + "step": 238450 + }, + { + "epoch": 1.5234529726690773, + "grad_norm": 1.2159661054611206, + "learning_rate": 1.3398234350960098e-05, + "loss": 0.9008, + "step": 238460 + }, + { + "epoch": 1.523516859818816, + "grad_norm": 0.9079484939575195, + "learning_rate": 
1.339481615362409e-05, + "loss": 1.0404, + "step": 238470 + }, + { + "epoch": 1.5235807469685547, + "grad_norm": 0.8452890515327454, + "learning_rate": 1.3391398324933934e-05, + "loss": 1.0406, + "step": 238480 + }, + { + "epoch": 1.5236446341182934, + "grad_norm": 0.7699106931686401, + "learning_rate": 1.3387980864924087e-05, + "loss": 0.9912, + "step": 238490 + }, + { + "epoch": 1.5237085212680321, + "grad_norm": 1.040332555770874, + "learning_rate": 1.3384563773628932e-05, + "loss": 0.7277, + "step": 238500 + }, + { + "epoch": 1.5237724084177708, + "grad_norm": 3.4283716678619385, + "learning_rate": 1.338114705108291e-05, + "loss": 0.7815, + "step": 238510 + }, + { + "epoch": 1.5238362955675095, + "grad_norm": 0.5395203232765198, + "learning_rate": 1.33777306973204e-05, + "loss": 0.9668, + "step": 238520 + }, + { + "epoch": 1.5239001827172483, + "grad_norm": 0.8401491045951843, + "learning_rate": 1.3374314712375845e-05, + "loss": 0.6885, + "step": 238530 + }, + { + "epoch": 1.523964069866987, + "grad_norm": 1.065290927886963, + "learning_rate": 1.3370899096283607e-05, + "loss": 0.8173, + "step": 238540 + }, + { + "epoch": 1.5240279570167257, + "grad_norm": 1.0944569110870361, + "learning_rate": 1.3367483849078122e-05, + "loss": 0.7403, + "step": 238550 + }, + { + "epoch": 1.5240918441664644, + "grad_norm": 0.7220026850700378, + "learning_rate": 1.3364068970793746e-05, + "loss": 1.334, + "step": 238560 + }, + { + "epoch": 1.524155731316203, + "grad_norm": 0.9048059582710266, + "learning_rate": 1.3360654461464912e-05, + "loss": 0.7798, + "step": 238570 + }, + { + "epoch": 1.5242196184659418, + "grad_norm": 1.1797586679458618, + "learning_rate": 1.3357240321125963e-05, + "loss": 0.8042, + "step": 238580 + }, + { + "epoch": 1.5242835056156805, + "grad_norm": 0.9054034948348999, + "learning_rate": 1.335382654981132e-05, + "loss": 0.7306, + "step": 238590 + }, + { + "epoch": 1.5243473927654192, + "grad_norm": 0.8827263712882996, + "learning_rate": 
1.335041314755534e-05, + "loss": 0.9916, + "step": 238600 + }, + { + "epoch": 1.524411279915158, + "grad_norm": 1.2693238258361816, + "learning_rate": 1.3347000114392399e-05, + "loss": 0.6009, + "step": 238610 + }, + { + "epoch": 1.5244751670648966, + "grad_norm": 0.9437078833580017, + "learning_rate": 1.3343587450356893e-05, + "loss": 0.9352, + "step": 238620 + }, + { + "epoch": 1.5245390542146353, + "grad_norm": 1.0291279554367065, + "learning_rate": 1.3340175155483158e-05, + "loss": 0.8882, + "step": 238630 + }, + { + "epoch": 1.524602941364374, + "grad_norm": 0.6132214665412903, + "learning_rate": 1.3336763229805593e-05, + "loss": 0.8061, + "step": 238640 + }, + { + "epoch": 1.5246668285141127, + "grad_norm": 0.7776349186897278, + "learning_rate": 1.3333351673358524e-05, + "loss": 0.8665, + "step": 238650 + }, + { + "epoch": 1.5247307156638512, + "grad_norm": 1.9750133752822876, + "learning_rate": 1.332994048617634e-05, + "loss": 0.7649, + "step": 238660 + }, + { + "epoch": 1.5247946028135901, + "grad_norm": 0.7864769697189331, + "learning_rate": 1.3326529668293364e-05, + "loss": 0.8988, + "step": 238670 + }, + { + "epoch": 1.5248584899633286, + "grad_norm": 1.1065561771392822, + "learning_rate": 1.3323119219743974e-05, + "loss": 0.936, + "step": 238680 + }, + { + "epoch": 1.5249223771130676, + "grad_norm": 0.6974082589149475, + "learning_rate": 1.3319709140562492e-05, + "loss": 0.8567, + "step": 238690 + }, + { + "epoch": 1.524986264262806, + "grad_norm": 0.8889352679252625, + "learning_rate": 1.331629943078328e-05, + "loss": 0.9785, + "step": 238700 + }, + { + "epoch": 1.525050151412545, + "grad_norm": 1.8425053358078003, + "learning_rate": 1.331289009044066e-05, + "loss": 0.8213, + "step": 238710 + }, + { + "epoch": 1.5251140385622834, + "grad_norm": 0.8023721575737, + "learning_rate": 1.330948111956899e-05, + "loss": 0.9311, + "step": 238720 + }, + { + "epoch": 1.5251779257120224, + "grad_norm": 0.7577361464500427, + "learning_rate": 1.3306072518202573e-05, 
+ "loss": 0.9213, + "step": 238730 + }, + { + "epoch": 1.5252418128617609, + "grad_norm": 0.9880037307739258, + "learning_rate": 1.330266428637576e-05, + "loss": 1.0437, + "step": 238740 + }, + { + "epoch": 1.5253057000114998, + "grad_norm": 0.6256246566772461, + "learning_rate": 1.3299256424122857e-05, + "loss": 0.6359, + "step": 238750 + }, + { + "epoch": 1.5253695871612383, + "grad_norm": 0.5575620532035828, + "learning_rate": 1.3295848931478206e-05, + "loss": 0.7845, + "step": 238760 + }, + { + "epoch": 1.5254334743109772, + "grad_norm": 0.7465316653251648, + "learning_rate": 1.3292441808476091e-05, + "loss": 0.8861, + "step": 238770 + }, + { + "epoch": 1.5254973614607157, + "grad_norm": 2.4132463932037354, + "learning_rate": 1.328903505515086e-05, + "loss": 0.7827, + "step": 238780 + }, + { + "epoch": 1.5255612486104546, + "grad_norm": 0.8057778477668762, + "learning_rate": 1.3285628671536793e-05, + "loss": 0.7437, + "step": 238790 + }, + { + "epoch": 1.525625135760193, + "grad_norm": 0.8477099537849426, + "learning_rate": 1.328222265766822e-05, + "loss": 1.0969, + "step": 238800 + }, + { + "epoch": 1.525689022909932, + "grad_norm": 1.326920986175537, + "learning_rate": 1.3278817013579414e-05, + "loss": 0.7555, + "step": 238810 + }, + { + "epoch": 1.5257529100596705, + "grad_norm": 0.48724564909935, + "learning_rate": 1.327541173930471e-05, + "loss": 0.8383, + "step": 238820 + }, + { + "epoch": 1.5258167972094094, + "grad_norm": 0.7151726484298706, + "learning_rate": 1.3272006834878376e-05, + "loss": 0.9653, + "step": 238830 + }, + { + "epoch": 1.525880684359148, + "grad_norm": 0.9274958372116089, + "learning_rate": 1.3268602300334692e-05, + "loss": 0.7029, + "step": 238840 + }, + { + "epoch": 1.5259445715088868, + "grad_norm": 0.8506492376327515, + "learning_rate": 1.3265198135707979e-05, + "loss": 0.9839, + "step": 238850 + }, + { + "epoch": 1.5260084586586253, + "grad_norm": 1.3150792121887207, + "learning_rate": 1.3261794341032486e-05, + "loss": 0.8877, + 
"step": 238860 + }, + { + "epoch": 1.5260723458083643, + "grad_norm": 0.9653270244598389, + "learning_rate": 1.3258390916342517e-05, + "loss": 0.977, + "step": 238870 + }, + { + "epoch": 1.5261362329581027, + "grad_norm": 1.3301193714141846, + "learning_rate": 1.3254987861672325e-05, + "loss": 1.0401, + "step": 238880 + }, + { + "epoch": 1.5262001201078417, + "grad_norm": 0.6298654675483704, + "learning_rate": 1.3251585177056208e-05, + "loss": 0.7647, + "step": 238890 + }, + { + "epoch": 1.5262640072575802, + "grad_norm": 0.9454180598258972, + "learning_rate": 1.3248182862528407e-05, + "loss": 0.9728, + "step": 238900 + }, + { + "epoch": 1.5263278944073189, + "grad_norm": 0.7858530879020691, + "learning_rate": 1.3244780918123217e-05, + "loss": 0.8936, + "step": 238910 + }, + { + "epoch": 1.5263917815570576, + "grad_norm": 0.6082397699356079, + "learning_rate": 1.3241379343874866e-05, + "loss": 0.7736, + "step": 238920 + }, + { + "epoch": 1.5264556687067963, + "grad_norm": 1.0330744981765747, + "learning_rate": 1.323797813981762e-05, + "loss": 0.8991, + "step": 238930 + }, + { + "epoch": 1.526519555856535, + "grad_norm": 1.9987537860870361, + "learning_rate": 1.3234577305985752e-05, + "loss": 0.8497, + "step": 238940 + }, + { + "epoch": 1.5265834430062737, + "grad_norm": 1.100146770477295, + "learning_rate": 1.323117684241349e-05, + "loss": 0.7261, + "step": 238950 + }, + { + "epoch": 1.5266473301560124, + "grad_norm": 2.2519874572753906, + "learning_rate": 1.3227776749135095e-05, + "loss": 0.7635, + "step": 238960 + }, + { + "epoch": 1.526711217305751, + "grad_norm": 0.9835031628608704, + "learning_rate": 1.3224377026184787e-05, + "loss": 0.9724, + "step": 238970 + }, + { + "epoch": 1.5267751044554898, + "grad_norm": 0.9068618416786194, + "learning_rate": 1.3220977673596835e-05, + "loss": 0.8858, + "step": 238980 + }, + { + "epoch": 1.5268389916052285, + "grad_norm": 1.3341480493545532, + "learning_rate": 1.3217578691405441e-05, + "loss": 0.993, + "step": 238990 + 
}, + { + "epoch": 1.5269028787549672, + "grad_norm": 0.763285219669342, + "learning_rate": 1.3214180079644866e-05, + "loss": 0.74, + "step": 239000 + }, + { + "epoch": 1.526966765904706, + "grad_norm": 1.4933006763458252, + "learning_rate": 1.3210781838349306e-05, + "loss": 1.0131, + "step": 239010 + }, + { + "epoch": 1.5270306530544446, + "grad_norm": 0.6231247782707214, + "learning_rate": 1.3207383967553017e-05, + "loss": 0.9825, + "step": 239020 + }, + { + "epoch": 1.5270945402041833, + "grad_norm": 0.9345896244049072, + "learning_rate": 1.320398646729018e-05, + "loss": 1.005, + "step": 239030 + }, + { + "epoch": 1.527158427353922, + "grad_norm": 1.0000743865966797, + "learning_rate": 1.3200589337595054e-05, + "loss": 0.9002, + "step": 239040 + }, + { + "epoch": 1.5272223145036607, + "grad_norm": 0.8784435391426086, + "learning_rate": 1.319719257850181e-05, + "loss": 0.8738, + "step": 239050 + }, + { + "epoch": 1.5272862016533995, + "grad_norm": 1.6685031652450562, + "learning_rate": 1.3193796190044694e-05, + "loss": 0.966, + "step": 239060 + }, + { + "epoch": 1.5273500888031382, + "grad_norm": 0.9063624739646912, + "learning_rate": 1.3190400172257877e-05, + "loss": 1.0455, + "step": 239070 + }, + { + "epoch": 1.5274139759528769, + "grad_norm": 0.9266459345817566, + "learning_rate": 1.318700452517559e-05, + "loss": 0.6222, + "step": 239080 + }, + { + "epoch": 1.5274778631026156, + "grad_norm": 0.9492422938346863, + "learning_rate": 1.3183609248832001e-05, + "loss": 0.6274, + "step": 239090 + }, + { + "epoch": 1.5275417502523543, + "grad_norm": 1.0158274173736572, + "learning_rate": 1.3180214343261333e-05, + "loss": 0.8135, + "step": 239100 + }, + { + "epoch": 1.527605637402093, + "grad_norm": 1.1831732988357544, + "learning_rate": 1.3176819808497742e-05, + "loss": 1.0633, + "step": 239110 + }, + { + "epoch": 1.5276695245518317, + "grad_norm": 1.1651058197021484, + "learning_rate": 1.3173425644575449e-05, + "loss": 1.1724, + "step": 239120 + }, + { + "epoch": 
1.5277334117015704, + "grad_norm": 0.7208083868026733, + "learning_rate": 1.3170031851528602e-05, + "loss": 0.7644, + "step": 239130 + }, + { + "epoch": 1.527797298851309, + "grad_norm": 0.7914683818817139, + "learning_rate": 1.3166638429391398e-05, + "loss": 0.7051, + "step": 239140 + }, + { + "epoch": 1.5278611860010476, + "grad_norm": 0.7424027919769287, + "learning_rate": 1.3163245378198025e-05, + "loss": 0.862, + "step": 239150 + }, + { + "epoch": 1.5279250731507865, + "grad_norm": 1.3931986093521118, + "learning_rate": 1.3159852697982627e-05, + "loss": 0.6573, + "step": 239160 + }, + { + "epoch": 1.527988960300525, + "grad_norm": 0.9321253895759583, + "learning_rate": 1.3156460388779396e-05, + "loss": 1.0719, + "step": 239170 + }, + { + "epoch": 1.528052847450264, + "grad_norm": 0.8587426543235779, + "learning_rate": 1.3153068450622468e-05, + "loss": 0.7854, + "step": 239180 + }, + { + "epoch": 1.5281167346000024, + "grad_norm": 0.9214495420455933, + "learning_rate": 1.3149676883546035e-05, + "loss": 0.9369, + "step": 239190 + }, + { + "epoch": 1.5281806217497413, + "grad_norm": 1.1665353775024414, + "learning_rate": 1.314628568758422e-05, + "loss": 0.8909, + "step": 239200 + }, + { + "epoch": 1.5282445088994798, + "grad_norm": 1.0801894664764404, + "learning_rate": 1.3142894862771205e-05, + "loss": 0.8334, + "step": 239210 + }, + { + "epoch": 1.5283083960492188, + "grad_norm": 0.8318184018135071, + "learning_rate": 1.3139504409141113e-05, + "loss": 0.8276, + "step": 239220 + }, + { + "epoch": 1.5283722831989572, + "grad_norm": 0.9736336469650269, + "learning_rate": 1.313611432672811e-05, + "loss": 0.7969, + "step": 239230 + }, + { + "epoch": 1.5284361703486962, + "grad_norm": 0.8926606178283691, + "learning_rate": 1.3132724615566317e-05, + "loss": 0.7547, + "step": 239240 + }, + { + "epoch": 1.5285000574984346, + "grad_norm": 0.926861584186554, + "learning_rate": 1.3129335275689897e-05, + "loss": 0.9206, + "step": 239250 + }, + { + "epoch": 
1.5285639446481736, + "grad_norm": 0.6626387238502502, + "learning_rate": 1.3125946307132947e-05, + "loss": 0.747, + "step": 239260 + }, + { + "epoch": 1.528627831797912, + "grad_norm": 2.2653844356536865, + "learning_rate": 1.3122557709929639e-05, + "loss": 0.9802, + "step": 239270 + }, + { + "epoch": 1.528691718947651, + "grad_norm": 0.8143163323402405, + "learning_rate": 1.3119169484114063e-05, + "loss": 0.896, + "step": 239280 + }, + { + "epoch": 1.5287556060973895, + "grad_norm": 1.0263252258300781, + "learning_rate": 1.311578162972037e-05, + "loss": 0.897, + "step": 239290 + }, + { + "epoch": 1.5288194932471284, + "grad_norm": 1.1428170204162598, + "learning_rate": 1.3112394146782653e-05, + "loss": 0.7813, + "step": 239300 + }, + { + "epoch": 1.5288833803968669, + "grad_norm": 1.001025915145874, + "learning_rate": 1.3109007035335052e-05, + "loss": 0.7202, + "step": 239310 + }, + { + "epoch": 1.5289472675466058, + "grad_norm": 1.0566990375518799, + "learning_rate": 1.310562029541167e-05, + "loss": 0.9607, + "step": 239320 + }, + { + "epoch": 1.5290111546963443, + "grad_norm": 1.396536111831665, + "learning_rate": 1.3102233927046586e-05, + "loss": 0.717, + "step": 239330 + }, + { + "epoch": 1.5290750418460832, + "grad_norm": 0.9807648062705994, + "learning_rate": 1.3098847930273949e-05, + "loss": 0.8423, + "step": 239340 + }, + { + "epoch": 1.5291389289958217, + "grad_norm": 1.382917046546936, + "learning_rate": 1.309546230512782e-05, + "loss": 0.7852, + "step": 239350 + }, + { + "epoch": 1.5292028161455606, + "grad_norm": 1.0471018552780151, + "learning_rate": 1.3092077051642332e-05, + "loss": 0.8256, + "step": 239360 + }, + { + "epoch": 1.5292667032952991, + "grad_norm": 1.5218364000320435, + "learning_rate": 1.3088692169851535e-05, + "loss": 0.8289, + "step": 239370 + }, + { + "epoch": 1.5293305904450378, + "grad_norm": 0.7558670043945312, + "learning_rate": 1.3085307659789559e-05, + "loss": 0.8752, + "step": 239380 + }, + { + "epoch": 1.5293944775947765, + 
"grad_norm": 0.6498563289642334, + "learning_rate": 1.3081923521490463e-05, + "loss": 0.9807, + "step": 239390 + }, + { + "epoch": 1.5294583647445152, + "grad_norm": 0.7214164733886719, + "learning_rate": 1.3078539754988339e-05, + "loss": 1.0477, + "step": 239400 + }, + { + "epoch": 1.529522251894254, + "grad_norm": 0.9879746437072754, + "learning_rate": 1.3075156360317253e-05, + "loss": 0.8445, + "step": 239410 + }, + { + "epoch": 1.5295861390439927, + "grad_norm": 0.7819598913192749, + "learning_rate": 1.30717733375113e-05, + "loss": 0.9972, + "step": 239420 + }, + { + "epoch": 1.5296500261937314, + "grad_norm": 2.294267177581787, + "learning_rate": 1.3068390686604525e-05, + "loss": 1.0819, + "step": 239430 + }, + { + "epoch": 1.52971391334347, + "grad_norm": 1.4084625244140625, + "learning_rate": 1.306500840763102e-05, + "loss": 0.9889, + "step": 239440 + }, + { + "epoch": 1.5297778004932088, + "grad_norm": 0.8941295146942139, + "learning_rate": 1.3061626500624819e-05, + "loss": 0.6393, + "step": 239450 + }, + { + "epoch": 1.5298416876429475, + "grad_norm": 0.7554815411567688, + "learning_rate": 1.3058244965619993e-05, + "loss": 0.7597, + "step": 239460 + }, + { + "epoch": 1.5299055747926862, + "grad_norm": 1.0731430053710938, + "learning_rate": 1.3054863802650613e-05, + "loss": 0.9949, + "step": 239470 + }, + { + "epoch": 1.529969461942425, + "grad_norm": 0.7292632460594177, + "learning_rate": 1.3051483011750704e-05, + "loss": 0.8798, + "step": 239480 + }, + { + "epoch": 1.5300333490921636, + "grad_norm": 0.9323152899742126, + "learning_rate": 1.3048102592954342e-05, + "loss": 0.6778, + "step": 239490 + }, + { + "epoch": 1.5300972362419023, + "grad_norm": 0.7616154551506042, + "learning_rate": 1.304472254629554e-05, + "loss": 0.8363, + "step": 239500 + }, + { + "epoch": 1.530161123391641, + "grad_norm": 0.960150420665741, + "learning_rate": 1.3041342871808366e-05, + "loss": 0.7406, + "step": 239510 + }, + { + "epoch": 1.5302250105413797, + "grad_norm": 
1.2647629976272583, + "learning_rate": 1.3037963569526829e-05, + "loss": 0.9212, + "step": 239520 + }, + { + "epoch": 1.5302888976911184, + "grad_norm": 0.916258692741394, + "learning_rate": 1.3034584639484986e-05, + "loss": 0.7491, + "step": 239530 + }, + { + "epoch": 1.5303527848408571, + "grad_norm": 2.347499132156372, + "learning_rate": 1.3031206081716846e-05, + "loss": 0.9045, + "step": 239540 + }, + { + "epoch": 1.5304166719905958, + "grad_norm": 1.465839147567749, + "learning_rate": 1.3027827896256456e-05, + "loss": 0.6732, + "step": 239550 + }, + { + "epoch": 1.5304805591403345, + "grad_norm": 0.8912522196769714, + "learning_rate": 1.3024450083137812e-05, + "loss": 0.8075, + "step": 239560 + }, + { + "epoch": 1.5305444462900732, + "grad_norm": 0.9872187376022339, + "learning_rate": 1.302107264239496e-05, + "loss": 0.8532, + "step": 239570 + }, + { + "epoch": 1.530608333439812, + "grad_norm": 0.9597538709640503, + "learning_rate": 1.3017695574061878e-05, + "loss": 0.7483, + "step": 239580 + }, + { + "epoch": 1.5306722205895507, + "grad_norm": 0.959831953048706, + "learning_rate": 1.3014318878172615e-05, + "loss": 0.9163, + "step": 239590 + }, + { + "epoch": 1.5307361077392894, + "grad_norm": 1.2454932928085327, + "learning_rate": 1.3010942554761141e-05, + "loss": 0.9478, + "step": 239600 + }, + { + "epoch": 1.530799994889028, + "grad_norm": 1.333516001701355, + "learning_rate": 1.3007566603861493e-05, + "loss": 1.1341, + "step": 239610 + }, + { + "epoch": 1.5308638820387668, + "grad_norm": 2.669330358505249, + "learning_rate": 1.3004191025507628e-05, + "loss": 0.8008, + "step": 239620 + }, + { + "epoch": 1.5309277691885055, + "grad_norm": 1.3271132707595825, + "learning_rate": 1.3000815819733592e-05, + "loss": 0.8742, + "step": 239630 + }, + { + "epoch": 1.530991656338244, + "grad_norm": 0.5166794657707214, + "learning_rate": 1.2997440986573327e-05, + "loss": 0.7715, + "step": 239640 + }, + { + "epoch": 1.531055543487983, + "grad_norm": 1.243624210357666, + 
"learning_rate": 1.299406652606086e-05, + "loss": 1.0447, + "step": 239650 + }, + { + "epoch": 1.5311194306377214, + "grad_norm": 0.8429069519042969, + "learning_rate": 1.2990692438230151e-05, + "loss": 0.822, + "step": 239660 + }, + { + "epoch": 1.5311833177874603, + "grad_norm": 0.8280727863311768, + "learning_rate": 1.2987318723115177e-05, + "loss": 0.7127, + "step": 239670 + }, + { + "epoch": 1.5312472049371988, + "grad_norm": 0.8618836402893066, + "learning_rate": 1.298394538074994e-05, + "loss": 0.8952, + "step": 239680 + }, + { + "epoch": 1.5313110920869377, + "grad_norm": 0.7919678092002869, + "learning_rate": 1.2980572411168384e-05, + "loss": 0.9811, + "step": 239690 + }, + { + "epoch": 1.5313749792366762, + "grad_norm": 0.9257929921150208, + "learning_rate": 1.2977199814404505e-05, + "loss": 0.9175, + "step": 239700 + }, + { + "epoch": 1.5314388663864151, + "grad_norm": 1.1609474420547485, + "learning_rate": 1.2973827590492238e-05, + "loss": 0.7815, + "step": 239710 + }, + { + "epoch": 1.5315027535361536, + "grad_norm": 1.3709371089935303, + "learning_rate": 1.297045573946557e-05, + "loss": 0.8933, + "step": 239720 + }, + { + "epoch": 1.5315666406858925, + "grad_norm": 0.5066688656806946, + "learning_rate": 1.2967084261358436e-05, + "loss": 0.6719, + "step": 239730 + }, + { + "epoch": 1.531630527835631, + "grad_norm": 0.6616301536560059, + "learning_rate": 1.296371315620482e-05, + "loss": 0.8534, + "step": 239740 + }, + { + "epoch": 1.53169441498537, + "grad_norm": 0.8841010332107544, + "learning_rate": 1.2960342424038636e-05, + "loss": 0.9464, + "step": 239750 + }, + { + "epoch": 1.5317583021351084, + "grad_norm": 0.9628351330757141, + "learning_rate": 1.2956972064893857e-05, + "loss": 0.7827, + "step": 239760 + }, + { + "epoch": 1.5318221892848474, + "grad_norm": 2.0725762844085693, + "learning_rate": 1.2953602078804411e-05, + "loss": 0.8328, + "step": 239770 + }, + { + "epoch": 1.5318860764345859, + "grad_norm": 0.7003540992736816, + "learning_rate": 
1.2950232465804252e-05, + "loss": 0.8365, + "step": 239780 + }, + { + "epoch": 1.5319499635843248, + "grad_norm": 0.5919496417045593, + "learning_rate": 1.2946863225927302e-05, + "loss": 0.9907, + "step": 239790 + }, + { + "epoch": 1.5320138507340633, + "grad_norm": 0.9051730036735535, + "learning_rate": 1.294349435920748e-05, + "loss": 0.8912, + "step": 239800 + }, + { + "epoch": 1.5320777378838022, + "grad_norm": 0.8595243692398071, + "learning_rate": 1.2940125865678748e-05, + "loss": 1.1719, + "step": 239810 + }, + { + "epoch": 1.5321416250335407, + "grad_norm": 1.0000742673873901, + "learning_rate": 1.2936757745374994e-05, + "loss": 0.9054, + "step": 239820 + }, + { + "epoch": 1.5322055121832796, + "grad_norm": 1.2009698152542114, + "learning_rate": 1.2933389998330164e-05, + "loss": 0.8474, + "step": 239830 + }, + { + "epoch": 1.532269399333018, + "grad_norm": 0.7443961501121521, + "learning_rate": 1.2930022624578153e-05, + "loss": 0.9226, + "step": 239840 + }, + { + "epoch": 1.532333286482757, + "grad_norm": 0.5982275009155273, + "learning_rate": 1.29266556241529e-05, + "loss": 0.7455, + "step": 239850 + }, + { + "epoch": 1.5323971736324955, + "grad_norm": 0.5646331906318665, + "learning_rate": 1.2923288997088284e-05, + "loss": 0.7711, + "step": 239860 + }, + { + "epoch": 1.5324610607822342, + "grad_norm": 1.0039056539535522, + "learning_rate": 1.2919922743418234e-05, + "loss": 0.8096, + "step": 239870 + }, + { + "epoch": 1.532524947931973, + "grad_norm": 1.1351054906845093, + "learning_rate": 1.2916556863176627e-05, + "loss": 0.6707, + "step": 239880 + }, + { + "epoch": 1.5325888350817116, + "grad_norm": 1.437688946723938, + "learning_rate": 1.2913191356397392e-05, + "loss": 0.7691, + "step": 239890 + }, + { + "epoch": 1.5326527222314503, + "grad_norm": 0.7194865942001343, + "learning_rate": 1.2909826223114385e-05, + "loss": 0.7505, + "step": 239900 + }, + { + "epoch": 1.532716609381189, + "grad_norm": 0.7203089594841003, + "learning_rate": 
1.2906461463361536e-05, + "loss": 0.702, + "step": 239910 + }, + { + "epoch": 1.5327804965309277, + "grad_norm": 0.7741053104400635, + "learning_rate": 1.2903097077172693e-05, + "loss": 0.973, + "step": 239920 + }, + { + "epoch": 1.5328443836806664, + "grad_norm": 0.8206771016120911, + "learning_rate": 1.2899733064581771e-05, + "loss": 0.8626, + "step": 239930 + }, + { + "epoch": 1.5329082708304052, + "grad_norm": 1.8737119436264038, + "learning_rate": 1.2896369425622617e-05, + "loss": 0.7082, + "step": 239940 + }, + { + "epoch": 1.5329721579801439, + "grad_norm": 0.880370020866394, + "learning_rate": 1.2893006160329135e-05, + "loss": 0.8168, + "step": 239950 + }, + { + "epoch": 1.5330360451298826, + "grad_norm": 0.7264848947525024, + "learning_rate": 1.2889643268735174e-05, + "loss": 0.6213, + "step": 239960 + }, + { + "epoch": 1.5330999322796213, + "grad_norm": 0.7904850244522095, + "learning_rate": 1.2886280750874624e-05, + "loss": 0.9023, + "step": 239970 + }, + { + "epoch": 1.53316381942936, + "grad_norm": 1.2321357727050781, + "learning_rate": 1.288291860678132e-05, + "loss": 0.9196, + "step": 239980 + }, + { + "epoch": 1.5332277065790987, + "grad_norm": 1.2595999240875244, + "learning_rate": 1.2879556836489131e-05, + "loss": 0.9804, + "step": 239990 + }, + { + "epoch": 1.5332915937288374, + "grad_norm": 0.8715834021568298, + "learning_rate": 1.2876195440031936e-05, + "loss": 0.9739, + "step": 240000 + }, + { + "epoch": 1.533355480878576, + "grad_norm": 0.9129816293716431, + "learning_rate": 1.2872834417443559e-05, + "loss": 0.8105, + "step": 240010 + }, + { + "epoch": 1.5334193680283148, + "grad_norm": 0.6168598532676697, + "learning_rate": 1.2869473768757867e-05, + "loss": 0.7125, + "step": 240020 + }, + { + "epoch": 1.5334832551780535, + "grad_norm": 0.7878352403640747, + "learning_rate": 1.2866113494008686e-05, + "loss": 0.7464, + "step": 240030 + }, + { + "epoch": 1.5335471423277922, + "grad_norm": 0.7556173205375671, + "learning_rate": 
1.2862753593229882e-05, + "loss": 0.7382, + "step": 240040 + }, + { + "epoch": 1.533611029477531, + "grad_norm": 1.1012219190597534, + "learning_rate": 1.2859394066455265e-05, + "loss": 0.8976, + "step": 240050 + }, + { + "epoch": 1.5336749166272696, + "grad_norm": 1.080224871635437, + "learning_rate": 1.2856034913718696e-05, + "loss": 0.8698, + "step": 240060 + }, + { + "epoch": 1.5337388037770083, + "grad_norm": 1.3037000894546509, + "learning_rate": 1.285267613505397e-05, + "loss": 0.9602, + "step": 240070 + }, + { + "epoch": 1.533802690926747, + "grad_norm": 0.8306534886360168, + "learning_rate": 1.2849317730494948e-05, + "loss": 0.8438, + "step": 240080 + }, + { + "epoch": 1.5338665780764857, + "grad_norm": 1.6114468574523926, + "learning_rate": 1.2845959700075421e-05, + "loss": 0.7366, + "step": 240090 + }, + { + "epoch": 1.5339304652262244, + "grad_norm": 0.9883949160575867, + "learning_rate": 1.2842602043829239e-05, + "loss": 0.8261, + "step": 240100 + }, + { + "epoch": 1.533994352375963, + "grad_norm": 1.1887831687927246, + "learning_rate": 1.2839244761790187e-05, + "loss": 0.9771, + "step": 240110 + }, + { + "epoch": 1.5340582395257019, + "grad_norm": 0.5754899978637695, + "learning_rate": 1.2835887853992102e-05, + "loss": 1.1814, + "step": 240120 + }, + { + "epoch": 1.5341221266754403, + "grad_norm": 1.040809988975525, + "learning_rate": 1.2832531320468765e-05, + "loss": 0.9311, + "step": 240130 + }, + { + "epoch": 1.5341860138251793, + "grad_norm": 1.1350606679916382, + "learning_rate": 1.2829175161254003e-05, + "loss": 0.9305, + "step": 240140 + }, + { + "epoch": 1.5342499009749178, + "grad_norm": 1.1955920457839966, + "learning_rate": 1.2825819376381593e-05, + "loss": 0.8709, + "step": 240150 + }, + { + "epoch": 1.5343137881246567, + "grad_norm": 0.953775942325592, + "learning_rate": 1.2822463965885356e-05, + "loss": 0.8353, + "step": 240160 + }, + { + "epoch": 1.5343776752743952, + "grad_norm": 0.9397642612457275, + "learning_rate": 
1.2819108929799051e-05, + "loss": 0.8255, + "step": 240170 + }, + { + "epoch": 1.534441562424134, + "grad_norm": 1.1109893321990967, + "learning_rate": 1.2815754268156505e-05, + "loss": 0.767, + "step": 240180 + }, + { + "epoch": 1.5345054495738726, + "grad_norm": 1.0977774858474731, + "learning_rate": 1.2812399980991469e-05, + "loss": 0.8658, + "step": 240190 + }, + { + "epoch": 1.5345693367236115, + "grad_norm": 1.2020318508148193, + "learning_rate": 1.2809046068337732e-05, + "loss": 0.8189, + "step": 240200 + }, + { + "epoch": 1.53463322387335, + "grad_norm": 2.4566478729248047, + "learning_rate": 1.2805692530229096e-05, + "loss": 0.8471, + "step": 240210 + }, + { + "epoch": 1.534697111023089, + "grad_norm": 0.9017288088798523, + "learning_rate": 1.2802339366699295e-05, + "loss": 0.8161, + "step": 240220 + }, + { + "epoch": 1.5347609981728274, + "grad_norm": 0.9971731305122375, + "learning_rate": 1.2798986577782135e-05, + "loss": 0.9237, + "step": 240230 + }, + { + "epoch": 1.5348248853225663, + "grad_norm": 0.6848049163818359, + "learning_rate": 1.2795634163511345e-05, + "loss": 0.7426, + "step": 240240 + }, + { + "epoch": 1.5348887724723048, + "grad_norm": 0.6405489444732666, + "learning_rate": 1.2792282123920719e-05, + "loss": 0.7286, + "step": 240250 + }, + { + "epoch": 1.5349526596220437, + "grad_norm": 0.6404396295547485, + "learning_rate": 1.278893045904399e-05, + "loss": 0.7565, + "step": 240260 + }, + { + "epoch": 1.5350165467717822, + "grad_norm": 0.7233564853668213, + "learning_rate": 1.278557916891494e-05, + "loss": 1.0074, + "step": 240270 + }, + { + "epoch": 1.5350804339215212, + "grad_norm": 1.2777774333953857, + "learning_rate": 1.2782228253567303e-05, + "loss": 0.8462, + "step": 240280 + }, + { + "epoch": 1.5351443210712596, + "grad_norm": 1.4462738037109375, + "learning_rate": 1.277887771303482e-05, + "loss": 0.8536, + "step": 240290 + }, + { + "epoch": 1.5352082082209986, + "grad_norm": 1.3389897346496582, + "learning_rate": 
1.2775527547351229e-05, + "loss": 0.7831, + "step": 240300 + }, + { + "epoch": 1.535272095370737, + "grad_norm": 0.7546529173851013, + "learning_rate": 1.2772177756550274e-05, + "loss": 0.9307, + "step": 240310 + }, + { + "epoch": 1.535335982520476, + "grad_norm": 1.994634747505188, + "learning_rate": 1.2768828340665717e-05, + "loss": 0.7946, + "step": 240320 + }, + { + "epoch": 1.5353998696702145, + "grad_norm": 1.2724889516830444, + "learning_rate": 1.2765479299731254e-05, + "loss": 0.7879, + "step": 240330 + }, + { + "epoch": 1.5354637568199534, + "grad_norm": 0.7900464534759521, + "learning_rate": 1.2762130633780634e-05, + "loss": 0.8841, + "step": 240340 + }, + { + "epoch": 1.5355276439696919, + "grad_norm": 3.2594213485717773, + "learning_rate": 1.2758782342847564e-05, + "loss": 0.9508, + "step": 240350 + }, + { + "epoch": 1.5355915311194306, + "grad_norm": 0.8626235127449036, + "learning_rate": 1.275543442696579e-05, + "loss": 0.6073, + "step": 240360 + }, + { + "epoch": 1.5356554182691693, + "grad_norm": 0.8508633971214294, + "learning_rate": 1.2752086886168996e-05, + "loss": 0.9052, + "step": 240370 + }, + { + "epoch": 1.535719305418908, + "grad_norm": 0.630959153175354, + "learning_rate": 1.2748739720490926e-05, + "loss": 0.6618, + "step": 240380 + }, + { + "epoch": 1.5357831925686467, + "grad_norm": 0.8615701794624329, + "learning_rate": 1.274539292996526e-05, + "loss": 0.7737, + "step": 240390 + }, + { + "epoch": 1.5358470797183854, + "grad_norm": 0.7362240552902222, + "learning_rate": 1.2742046514625728e-05, + "loss": 0.859, + "step": 240400 + }, + { + "epoch": 1.5359109668681241, + "grad_norm": 0.824510395526886, + "learning_rate": 1.273870047450601e-05, + "loss": 0.883, + "step": 240410 + }, + { + "epoch": 1.5359748540178628, + "grad_norm": 0.8618409037590027, + "learning_rate": 1.2735354809639827e-05, + "loss": 0.5913, + "step": 240420 + }, + { + "epoch": 1.5360387411676015, + "grad_norm": 1.2590079307556152, + "learning_rate": 
1.2732009520060844e-05, + "loss": 0.8846, + "step": 240430 + }, + { + "epoch": 1.5361026283173402, + "grad_norm": 0.8605117797851562, + "learning_rate": 1.272866460580278e-05, + "loss": 0.9529, + "step": 240440 + }, + { + "epoch": 1.536166515467079, + "grad_norm": 1.3561952114105225, + "learning_rate": 1.2725320066899294e-05, + "loss": 0.987, + "step": 240450 + }, + { + "epoch": 1.5362304026168176, + "grad_norm": 0.9079533815383911, + "learning_rate": 1.2721975903384097e-05, + "loss": 0.8781, + "step": 240460 + }, + { + "epoch": 1.5362942897665564, + "grad_norm": 1.1836191415786743, + "learning_rate": 1.271863211529084e-05, + "loss": 0.9204, + "step": 240470 + }, + { + "epoch": 1.536358176916295, + "grad_norm": 0.9336318373680115, + "learning_rate": 1.2715288702653228e-05, + "loss": 0.8499, + "step": 240480 + }, + { + "epoch": 1.5364220640660338, + "grad_norm": 0.957920253276825, + "learning_rate": 1.2711945665504894e-05, + "loss": 0.9255, + "step": 240490 + }, + { + "epoch": 1.5364859512157725, + "grad_norm": 0.9811254739761353, + "learning_rate": 1.2708603003879544e-05, + "loss": 0.7817, + "step": 240500 + }, + { + "epoch": 1.5365498383655112, + "grad_norm": 1.009942889213562, + "learning_rate": 1.2705260717810808e-05, + "loss": 0.8197, + "step": 240510 + }, + { + "epoch": 1.5366137255152499, + "grad_norm": 1.3132307529449463, + "learning_rate": 1.2701918807332358e-05, + "loss": 0.8946, + "step": 240520 + }, + { + "epoch": 1.5366776126649886, + "grad_norm": 0.5535053610801697, + "learning_rate": 1.2698577272477868e-05, + "loss": 1.0185, + "step": 240530 + }, + { + "epoch": 1.5367414998147273, + "grad_norm": 0.8424240946769714, + "learning_rate": 1.2695236113280967e-05, + "loss": 0.8985, + "step": 240540 + }, + { + "epoch": 1.536805386964466, + "grad_norm": 0.9007782936096191, + "learning_rate": 1.2691895329775321e-05, + "loss": 0.799, + "step": 240550 + }, + { + "epoch": 1.5368692741142047, + "grad_norm": 2.0551929473876953, + "learning_rate": 
1.268855492199455e-05, + "loss": 0.8517, + "step": 240560 + }, + { + "epoch": 1.5369331612639434, + "grad_norm": 0.8320748805999756, + "learning_rate": 1.268521488997233e-05, + "loss": 0.8527, + "step": 240570 + }, + { + "epoch": 1.5369970484136821, + "grad_norm": 0.9764295220375061, + "learning_rate": 1.2681875233742258e-05, + "loss": 0.858, + "step": 240580 + }, + { + "epoch": 1.5370609355634208, + "grad_norm": 0.9124672412872314, + "learning_rate": 1.2678535953338e-05, + "loss": 1.1554, + "step": 240590 + }, + { + "epoch": 1.5371248227131593, + "grad_norm": 0.9281206130981445, + "learning_rate": 1.2675197048793164e-05, + "loss": 0.9436, + "step": 240600 + }, + { + "epoch": 1.5371887098628982, + "grad_norm": 0.7381476759910583, + "learning_rate": 1.2671858520141394e-05, + "loss": 0.9653, + "step": 240610 + }, + { + "epoch": 1.5372525970126367, + "grad_norm": 0.7912752628326416, + "learning_rate": 1.2668520367416286e-05, + "loss": 1.0923, + "step": 240620 + }, + { + "epoch": 1.5373164841623757, + "grad_norm": 1.3308603763580322, + "learning_rate": 1.2665182590651498e-05, + "loss": 0.9441, + "step": 240630 + }, + { + "epoch": 1.5373803713121141, + "grad_norm": 1.331600546836853, + "learning_rate": 1.26618451898806e-05, + "loss": 0.7931, + "step": 240640 + }, + { + "epoch": 1.537444258461853, + "grad_norm": 0.9687751531600952, + "learning_rate": 1.2658508165137234e-05, + "loss": 0.8497, + "step": 240650 + }, + { + "epoch": 1.5375081456115915, + "grad_norm": 0.7466058731079102, + "learning_rate": 1.2655505164399511e-05, + "loss": 0.7598, + "step": 240660 + }, + { + "epoch": 1.5375720327613305, + "grad_norm": 1.5060503482818604, + "learning_rate": 1.2652168854201007e-05, + "loss": 1.0108, + "step": 240670 + }, + { + "epoch": 1.537635919911069, + "grad_norm": 1.3408730030059814, + "learning_rate": 1.264883292012749e-05, + "loss": 0.8109, + "step": 240680 + }, + { + "epoch": 1.5376998070608079, + "grad_norm": 1.010647177696228, + "learning_rate": 1.2645497362212521e-05, 
+ "loss": 1.1945, + "step": 240690 + }, + { + "epoch": 1.5377636942105464, + "grad_norm": 0.8791874647140503, + "learning_rate": 1.2642162180489724e-05, + "loss": 0.7551, + "step": 240700 + }, + { + "epoch": 1.5378275813602853, + "grad_norm": 1.3113328218460083, + "learning_rate": 1.2638827374992662e-05, + "loss": 0.885, + "step": 240710 + }, + { + "epoch": 1.5378914685100238, + "grad_norm": 0.8393133282661438, + "learning_rate": 1.263549294575494e-05, + "loss": 0.9787, + "step": 240720 + }, + { + "epoch": 1.5379553556597627, + "grad_norm": 0.7710729241371155, + "learning_rate": 1.263215889281012e-05, + "loss": 1.0568, + "step": 240730 + }, + { + "epoch": 1.5380192428095012, + "grad_norm": 0.9903393387794495, + "learning_rate": 1.2628825216191802e-05, + "loss": 0.8698, + "step": 240740 + }, + { + "epoch": 1.5380831299592401, + "grad_norm": 0.9258543252944946, + "learning_rate": 1.2625491915933524e-05, + "loss": 0.89, + "step": 240750 + }, + { + "epoch": 1.5381470171089786, + "grad_norm": 0.8964667916297913, + "learning_rate": 1.26221589920689e-05, + "loss": 0.8349, + "step": 240760 + }, + { + "epoch": 1.5382109042587175, + "grad_norm": 2.6720521450042725, + "learning_rate": 1.2618826444631448e-05, + "loss": 0.9522, + "step": 240770 + }, + { + "epoch": 1.538274791408456, + "grad_norm": 1.3586536645889282, + "learning_rate": 1.2615494273654771e-05, + "loss": 0.921, + "step": 240780 + }, + { + "epoch": 1.538338678558195, + "grad_norm": 1.9273499250411987, + "learning_rate": 1.2612162479172395e-05, + "loss": 0.7124, + "step": 240790 + }, + { + "epoch": 1.5384025657079334, + "grad_norm": 2.5090017318725586, + "learning_rate": 1.2608831061217902e-05, + "loss": 0.834, + "step": 240800 + }, + { + "epoch": 1.5384664528576724, + "grad_norm": 1.410423994064331, + "learning_rate": 1.2605500019824811e-05, + "loss": 0.775, + "step": 240810 + }, + { + "epoch": 1.5385303400074108, + "grad_norm": 1.0567747354507446, + "learning_rate": 1.2602169355026705e-05, + "loss": 0.9427, + 
"step": 240820 + }, + { + "epoch": 1.5385942271571498, + "grad_norm": 0.9426378607749939, + "learning_rate": 1.2598839066857094e-05, + "loss": 0.7259, + "step": 240830 + }, + { + "epoch": 1.5386581143068883, + "grad_norm": 0.8864075541496277, + "learning_rate": 1.2595509155349522e-05, + "loss": 0.8738, + "step": 240840 + }, + { + "epoch": 1.538722001456627, + "grad_norm": 2.0888774394989014, + "learning_rate": 1.2592179620537554e-05, + "loss": 0.7952, + "step": 240850 + }, + { + "epoch": 1.5387858886063657, + "grad_norm": 0.895809531211853, + "learning_rate": 1.2588850462454682e-05, + "loss": 0.723, + "step": 240860 + }, + { + "epoch": 1.5388497757561044, + "grad_norm": 0.8410398364067078, + "learning_rate": 1.2585521681134466e-05, + "loss": 0.9484, + "step": 240870 + }, + { + "epoch": 1.538913662905843, + "grad_norm": 1.5574727058410645, + "learning_rate": 1.2582193276610398e-05, + "loss": 1.06, + "step": 240880 + }, + { + "epoch": 1.5389775500555818, + "grad_norm": 1.247363805770874, + "learning_rate": 1.2578865248916039e-05, + "loss": 0.9964, + "step": 240890 + }, + { + "epoch": 1.5390414372053205, + "grad_norm": 0.6102619767189026, + "learning_rate": 1.2575537598084853e-05, + "loss": 0.8924, + "step": 240900 + }, + { + "epoch": 1.5391053243550592, + "grad_norm": 0.8611202239990234, + "learning_rate": 1.2572210324150419e-05, + "loss": 0.9274, + "step": 240910 + }, + { + "epoch": 1.539169211504798, + "grad_norm": 4.812923431396484, + "learning_rate": 1.2568883427146172e-05, + "loss": 0.8025, + "step": 240920 + }, + { + "epoch": 1.5392330986545366, + "grad_norm": 1.0879570245742798, + "learning_rate": 1.2565556907105669e-05, + "loss": 0.9616, + "step": 240930 + }, + { + "epoch": 1.5392969858042753, + "grad_norm": 1.8127998113632202, + "learning_rate": 1.2562230764062377e-05, + "loss": 0.8644, + "step": 240940 + }, + { + "epoch": 1.539360872954014, + "grad_norm": 0.8016753792762756, + "learning_rate": 1.2558904998049808e-05, + "loss": 0.9531, + "step": 240950 + }, 
+ { + "epoch": 1.5394247601037527, + "grad_norm": 0.5597435235977173, + "learning_rate": 1.2555579609101475e-05, + "loss": 0.6647, + "step": 240960 + }, + { + "epoch": 1.5394886472534914, + "grad_norm": 1.3119679689407349, + "learning_rate": 1.2552254597250835e-05, + "loss": 0.8009, + "step": 240970 + }, + { + "epoch": 1.5395525344032301, + "grad_norm": 1.1380845308303833, + "learning_rate": 1.2548929962531397e-05, + "loss": 0.8643, + "step": 240980 + }, + { + "epoch": 1.5396164215529689, + "grad_norm": 0.8482895493507385, + "learning_rate": 1.2545605704976626e-05, + "loss": 0.8643, + "step": 240990 + }, + { + "epoch": 1.5396803087027076, + "grad_norm": 0.9055123925209045, + "learning_rate": 1.2542281824620017e-05, + "loss": 0.6563, + "step": 241000 + }, + { + "epoch": 1.5397441958524463, + "grad_norm": 1.2197997570037842, + "learning_rate": 1.2538958321495026e-05, + "loss": 0.9265, + "step": 241010 + }, + { + "epoch": 1.539808083002185, + "grad_norm": 0.9531196355819702, + "learning_rate": 1.2535635195635147e-05, + "loss": 0.9722, + "step": 241020 + }, + { + "epoch": 1.5398719701519237, + "grad_norm": 1.799153447151184, + "learning_rate": 1.2532312447073818e-05, + "loss": 0.7653, + "step": 241030 + }, + { + "epoch": 1.5399358573016624, + "grad_norm": 0.8743974566459656, + "learning_rate": 1.2528990075844527e-05, + "loss": 0.6515, + "step": 241040 + }, + { + "epoch": 1.539999744451401, + "grad_norm": 1.757636547088623, + "learning_rate": 1.2525668081980712e-05, + "loss": 0.7872, + "step": 241050 + }, + { + "epoch": 1.5400636316011398, + "grad_norm": 1.1052943468093872, + "learning_rate": 1.2522346465515855e-05, + "loss": 0.9347, + "step": 241060 + }, + { + "epoch": 1.5401275187508785, + "grad_norm": 0.6515098810195923, + "learning_rate": 1.2519025226483378e-05, + "loss": 0.8837, + "step": 241070 + }, + { + "epoch": 1.5401914059006172, + "grad_norm": 0.7127335667610168, + "learning_rate": 1.2515704364916758e-05, + "loss": 0.8703, + "step": 241080 + }, + { + "epoch": 
1.5402552930503557, + "grad_norm": 0.633703887462616, + "learning_rate": 1.2512383880849404e-05, + "loss": 0.7829, + "step": 241090 + }, + { + "epoch": 1.5403191802000946, + "grad_norm": 1.5734481811523438, + "learning_rate": 1.2509063774314795e-05, + "loss": 0.9246, + "step": 241100 + }, + { + "epoch": 1.540383067349833, + "grad_norm": 1.2697124481201172, + "learning_rate": 1.2505744045346329e-05, + "loss": 0.7494, + "step": 241110 + }, + { + "epoch": 1.540446954499572, + "grad_norm": 0.903329610824585, + "learning_rate": 1.2502424693977476e-05, + "loss": 0.8019, + "step": 241120 + }, + { + "epoch": 1.5405108416493105, + "grad_norm": 0.7796242237091064, + "learning_rate": 1.2499105720241628e-05, + "loss": 0.9078, + "step": 241130 + }, + { + "epoch": 1.5405747287990494, + "grad_norm": 0.9284679293632507, + "learning_rate": 1.2495787124172248e-05, + "loss": 0.6733, + "step": 241140 + }, + { + "epoch": 1.540638615948788, + "grad_norm": 0.6304734945297241, + "learning_rate": 1.2492468905802717e-05, + "loss": 0.8974, + "step": 241150 + }, + { + "epoch": 1.5407025030985269, + "grad_norm": 0.8838341236114502, + "learning_rate": 1.2489151065166476e-05, + "loss": 0.8293, + "step": 241160 + }, + { + "epoch": 1.5407663902482653, + "grad_norm": 1.1389199495315552, + "learning_rate": 1.2485833602296953e-05, + "loss": 0.7646, + "step": 241170 + }, + { + "epoch": 1.5408302773980043, + "grad_norm": 0.9645891189575195, + "learning_rate": 1.2482516517227522e-05, + "loss": 0.7372, + "step": 241180 + }, + { + "epoch": 1.5408941645477428, + "grad_norm": 1.2675881385803223, + "learning_rate": 1.247919980999162e-05, + "loss": 0.8884, + "step": 241190 + }, + { + "epoch": 1.5409580516974817, + "grad_norm": 1.097831130027771, + "learning_rate": 1.2475883480622624e-05, + "loss": 1.1137, + "step": 241200 + }, + { + "epoch": 1.5410219388472202, + "grad_norm": 0.7022253274917603, + "learning_rate": 1.2472567529153955e-05, + "loss": 1.0512, + "step": 241210 + }, + { + "epoch": 
1.541085825996959, + "grad_norm": 1.3515077829360962, + "learning_rate": 1.2469251955618988e-05, + "loss": 0.9112, + "step": 241220 + }, + { + "epoch": 1.5411497131466976, + "grad_norm": 0.9095216989517212, + "learning_rate": 1.2465936760051133e-05, + "loss": 0.8354, + "step": 241230 + }, + { + "epoch": 1.5412136002964365, + "grad_norm": 0.9183611869812012, + "learning_rate": 1.2462621942483749e-05, + "loss": 0.9475, + "step": 241240 + }, + { + "epoch": 1.541277487446175, + "grad_norm": 1.1954056024551392, + "learning_rate": 1.2459307502950256e-05, + "loss": 1.0327, + "step": 241250 + }, + { + "epoch": 1.541341374595914, + "grad_norm": 1.835919737815857, + "learning_rate": 1.2455993441483999e-05, + "loss": 0.7387, + "step": 241260 + }, + { + "epoch": 1.5414052617456524, + "grad_norm": 0.7012445330619812, + "learning_rate": 1.245267975811838e-05, + "loss": 0.7472, + "step": 241270 + }, + { + "epoch": 1.5414691488953913, + "grad_norm": 0.7782658338546753, + "learning_rate": 1.2449366452886752e-05, + "loss": 0.93, + "step": 241280 + }, + { + "epoch": 1.5415330360451298, + "grad_norm": 1.1713647842407227, + "learning_rate": 1.2446053525822498e-05, + "loss": 0.8349, + "step": 241290 + }, + { + "epoch": 1.5415969231948687, + "grad_norm": 0.959614634513855, + "learning_rate": 1.2442740976958955e-05, + "loss": 0.9754, + "step": 241300 + }, + { + "epoch": 1.5416608103446072, + "grad_norm": 0.936188280582428, + "learning_rate": 1.2439428806329522e-05, + "loss": 0.8709, + "step": 241310 + }, + { + "epoch": 1.5417246974943462, + "grad_norm": 0.8712064623832703, + "learning_rate": 1.2436117013967525e-05, + "loss": 0.8923, + "step": 241320 + }, + { + "epoch": 1.5417885846440846, + "grad_norm": 0.8634532690048218, + "learning_rate": 1.2432805599906332e-05, + "loss": 0.8087, + "step": 241330 + }, + { + "epoch": 1.5418524717938233, + "grad_norm": 1.1293830871582031, + "learning_rate": 1.2429494564179278e-05, + "loss": 0.7126, + "step": 241340 + }, + { + "epoch": 1.541916358943562, 
+ "grad_norm": 1.6642500162124634, + "learning_rate": 1.2426183906819733e-05, + "loss": 1.0368, + "step": 241350 + }, + { + "epoch": 1.5419802460933008, + "grad_norm": 0.9231435060501099, + "learning_rate": 1.2422873627861004e-05, + "loss": 0.7463, + "step": 241360 + }, + { + "epoch": 1.5420441332430395, + "grad_norm": 1.0752265453338623, + "learning_rate": 1.2419563727336447e-05, + "loss": 0.8077, + "step": 241370 + }, + { + "epoch": 1.5421080203927782, + "grad_norm": 1.3547260761260986, + "learning_rate": 1.241625420527941e-05, + "loss": 0.9673, + "step": 241380 + }, + { + "epoch": 1.5421719075425169, + "grad_norm": 1.10608971118927, + "learning_rate": 1.2412945061723192e-05, + "loss": 0.6985, + "step": 241390 + }, + { + "epoch": 1.5422357946922556, + "grad_norm": 0.6865934729576111, + "learning_rate": 1.2409636296701166e-05, + "loss": 0.7879, + "step": 241400 + }, + { + "epoch": 1.5422996818419943, + "grad_norm": 0.7840709090232849, + "learning_rate": 1.2406327910246595e-05, + "loss": 0.7392, + "step": 241410 + }, + { + "epoch": 1.542363568991733, + "grad_norm": 0.904344379901886, + "learning_rate": 1.2403019902392837e-05, + "loss": 0.915, + "step": 241420 + }, + { + "epoch": 1.5424274561414717, + "grad_norm": 1.0914677381515503, + "learning_rate": 1.2399712273173181e-05, + "loss": 0.7752, + "step": 241430 + }, + { + "epoch": 1.5424913432912104, + "grad_norm": 1.2973593473434448, + "learning_rate": 1.2396405022620966e-05, + "loss": 0.8754, + "step": 241440 + }, + { + "epoch": 1.542555230440949, + "grad_norm": 0.7872340679168701, + "learning_rate": 1.2393098150769467e-05, + "loss": 0.8355, + "step": 241450 + }, + { + "epoch": 1.5426191175906878, + "grad_norm": 0.7979745864868164, + "learning_rate": 1.2389791657652023e-05, + "loss": 0.9445, + "step": 241460 + }, + { + "epoch": 1.5426830047404265, + "grad_norm": 1.740941047668457, + "learning_rate": 1.2386485543301896e-05, + "loss": 0.9143, + "step": 241470 + }, + { + "epoch": 1.5427468918901652, + "grad_norm": 
0.9509738683700562, + "learning_rate": 1.2383179807752399e-05, + "loss": 0.678, + "step": 241480 + }, + { + "epoch": 1.542810779039904, + "grad_norm": 0.8133483529090881, + "learning_rate": 1.2379874451036844e-05, + "loss": 0.9289, + "step": 241490 + }, + { + "epoch": 1.5428746661896426, + "grad_norm": 0.7095542550086975, + "learning_rate": 1.2376569473188483e-05, + "loss": 0.776, + "step": 241500 + }, + { + "epoch": 1.5429385533393813, + "grad_norm": 0.9627558588981628, + "learning_rate": 1.2373264874240625e-05, + "loss": 0.8404, + "step": 241510 + }, + { + "epoch": 1.54300244048912, + "grad_norm": 0.9276164770126343, + "learning_rate": 1.2369960654226536e-05, + "loss": 1.0322, + "step": 241520 + }, + { + "epoch": 1.5430663276388588, + "grad_norm": 0.7478491067886353, + "learning_rate": 1.2366656813179506e-05, + "loss": 0.7374, + "step": 241530 + }, + { + "epoch": 1.5431302147885975, + "grad_norm": 0.8188766241073608, + "learning_rate": 1.236335335113279e-05, + "loss": 0.7053, + "step": 241540 + }, + { + "epoch": 1.5431941019383362, + "grad_norm": 0.7239640951156616, + "learning_rate": 1.2360050268119677e-05, + "loss": 0.7981, + "step": 241550 + }, + { + "epoch": 1.5432579890880749, + "grad_norm": 1.2404205799102783, + "learning_rate": 1.2356747564173405e-05, + "loss": 1.3444, + "step": 241560 + }, + { + "epoch": 1.5433218762378136, + "grad_norm": 0.597846508026123, + "learning_rate": 1.2353445239327271e-05, + "loss": 0.9732, + "step": 241570 + }, + { + "epoch": 1.543385763387552, + "grad_norm": 0.9635471105575562, + "learning_rate": 1.2350143293614491e-05, + "loss": 0.7034, + "step": 241580 + }, + { + "epoch": 1.543449650537291, + "grad_norm": 0.9366225004196167, + "learning_rate": 1.234684172706836e-05, + "loss": 0.6786, + "step": 241590 + }, + { + "epoch": 1.5435135376870295, + "grad_norm": 0.9822403788566589, + "learning_rate": 1.2343540539722093e-05, + "loss": 0.7379, + "step": 241600 + }, + { + "epoch": 1.5435774248367684, + "grad_norm": 0.6430383920669556, 
+ "learning_rate": 1.234023973160896e-05, + "loss": 0.8726, + "step": 241610 + }, + { + "epoch": 1.543641311986507, + "grad_norm": 0.7444342374801636, + "learning_rate": 1.233693930276218e-05, + "loss": 1.0381, + "step": 241620 + }, + { + "epoch": 1.5437051991362458, + "grad_norm": 0.9430611729621887, + "learning_rate": 1.2333639253215024e-05, + "loss": 0.7164, + "step": 241630 + }, + { + "epoch": 1.5437690862859843, + "grad_norm": 1.330551266670227, + "learning_rate": 1.2330339583000688e-05, + "loss": 0.6713, + "step": 241640 + }, + { + "epoch": 1.5438329734357232, + "grad_norm": 1.3129150867462158, + "learning_rate": 1.2327040292152436e-05, + "loss": 0.8398, + "step": 241650 + }, + { + "epoch": 1.5438968605854617, + "grad_norm": 1.5121541023254395, + "learning_rate": 1.2323741380703469e-05, + "loss": 0.7578, + "step": 241660 + }, + { + "epoch": 1.5439607477352006, + "grad_norm": 1.7165089845657349, + "learning_rate": 1.2320442848687031e-05, + "loss": 0.6938, + "step": 241670 + }, + { + "epoch": 1.5440246348849391, + "grad_norm": 1.1789953708648682, + "learning_rate": 1.2317144696136318e-05, + "loss": 0.6197, + "step": 241680 + }, + { + "epoch": 1.544088522034678, + "grad_norm": 0.80938321352005, + "learning_rate": 1.2313846923084554e-05, + "loss": 1.0115, + "step": 241690 + }, + { + "epoch": 1.5441524091844165, + "grad_norm": 0.9993014931678772, + "learning_rate": 1.2310549529564974e-05, + "loss": 0.8602, + "step": 241700 + }, + { + "epoch": 1.5442162963341555, + "grad_norm": 0.8560216426849365, + "learning_rate": 1.2307252515610751e-05, + "loss": 0.7854, + "step": 241710 + }, + { + "epoch": 1.544280183483894, + "grad_norm": 0.9550018310546875, + "learning_rate": 1.230395588125512e-05, + "loss": 0.7854, + "step": 241720 + }, + { + "epoch": 1.5443440706336329, + "grad_norm": 0.46049657464027405, + "learning_rate": 1.2300659626531247e-05, + "loss": 0.7882, + "step": 241730 + }, + { + "epoch": 1.5444079577833714, + "grad_norm": 2.080028533935547, + "learning_rate": 
1.2297363751472363e-05, + "loss": 0.9231, + "step": 241740 + }, + { + "epoch": 1.5444718449331103, + "grad_norm": 1.1706985235214233, + "learning_rate": 1.2294068256111629e-05, + "loss": 0.9966, + "step": 241750 + }, + { + "epoch": 1.5445357320828488, + "grad_norm": 2.6633691787719727, + "learning_rate": 1.2290773140482265e-05, + "loss": 1.0068, + "step": 241760 + }, + { + "epoch": 1.5445996192325877, + "grad_norm": 0.6567732691764832, + "learning_rate": 1.2287478404617419e-05, + "loss": 0.839, + "step": 241770 + }, + { + "epoch": 1.5446635063823262, + "grad_norm": 0.5708428025245667, + "learning_rate": 1.2284184048550307e-05, + "loss": 0.9023, + "step": 241780 + }, + { + "epoch": 1.5447273935320651, + "grad_norm": 0.9928178787231445, + "learning_rate": 1.2280890072314078e-05, + "loss": 0.7571, + "step": 241790 + }, + { + "epoch": 1.5447912806818036, + "grad_norm": 0.8214378952980042, + "learning_rate": 1.2277596475941933e-05, + "loss": 0.8863, + "step": 241800 + }, + { + "epoch": 1.5448551678315423, + "grad_norm": 0.6824464797973633, + "learning_rate": 1.2274303259467007e-05, + "loss": 0.9429, + "step": 241810 + }, + { + "epoch": 1.544919054981281, + "grad_norm": 0.6267659664154053, + "learning_rate": 1.2271010422922503e-05, + "loss": 0.6823, + "step": 241820 + }, + { + "epoch": 1.5449829421310197, + "grad_norm": 1.0387673377990723, + "learning_rate": 1.2267717966341547e-05, + "loss": 0.8397, + "step": 241830 + }, + { + "epoch": 1.5450468292807584, + "grad_norm": 0.7781873345375061, + "learning_rate": 1.226442588975733e-05, + "loss": 0.5977, + "step": 241840 + }, + { + "epoch": 1.5451107164304971, + "grad_norm": 1.1527103185653687, + "learning_rate": 1.2261134193202977e-05, + "loss": 0.8294, + "step": 241850 + }, + { + "epoch": 1.5451746035802358, + "grad_norm": 2.6820149421691895, + "learning_rate": 1.2257842876711662e-05, + "loss": 0.998, + "step": 241860 + }, + { + "epoch": 1.5452384907299745, + "grad_norm": 0.8141218423843384, + "learning_rate": 
1.2254551940316512e-05, + "loss": 0.8745, + "step": 241870 + }, + { + "epoch": 1.5453023778797133, + "grad_norm": 1.027464509010315, + "learning_rate": 1.2251261384050689e-05, + "loss": 0.9838, + "step": 241880 + }, + { + "epoch": 1.545366265029452, + "grad_norm": 1.029893398284912, + "learning_rate": 1.2247971207947323e-05, + "loss": 1.0286, + "step": 241890 + }, + { + "epoch": 1.5454301521791907, + "grad_norm": 0.9250289797782898, + "learning_rate": 1.2244681412039532e-05, + "loss": 1.1109, + "step": 241900 + }, + { + "epoch": 1.5454940393289294, + "grad_norm": 0.6123344898223877, + "learning_rate": 1.2241391996360475e-05, + "loss": 0.9885, + "step": 241910 + }, + { + "epoch": 1.545557926478668, + "grad_norm": 0.9226810932159424, + "learning_rate": 1.2238102960943254e-05, + "loss": 1.044, + "step": 241920 + }, + { + "epoch": 1.5456218136284068, + "grad_norm": 3.4627225399017334, + "learning_rate": 1.2234814305821019e-05, + "loss": 0.7857, + "step": 241930 + }, + { + "epoch": 1.5456857007781455, + "grad_norm": 0.76270991563797, + "learning_rate": 1.2231526031026863e-05, + "loss": 0.9521, + "step": 241940 + }, + { + "epoch": 1.5457495879278842, + "grad_norm": 1.6452094316482544, + "learning_rate": 1.2228238136593922e-05, + "loss": 0.6327, + "step": 241950 + }, + { + "epoch": 1.545813475077623, + "grad_norm": 1.7455980777740479, + "learning_rate": 1.2224950622555292e-05, + "loss": 0.9259, + "step": 241960 + }, + { + "epoch": 1.5458773622273616, + "grad_norm": 1.0099796056747437, + "learning_rate": 1.2221663488944101e-05, + "loss": 0.6914, + "step": 241970 + }, + { + "epoch": 1.5459412493771003, + "grad_norm": 0.7746630907058716, + "learning_rate": 1.2218376735793424e-05, + "loss": 0.7888, + "step": 241980 + }, + { + "epoch": 1.546005136526839, + "grad_norm": 0.8316277861595154, + "learning_rate": 1.2215090363136406e-05, + "loss": 0.9571, + "step": 241990 + }, + { + "epoch": 1.5460690236765777, + "grad_norm": 1.7187628746032715, + "learning_rate": 
1.221180437100609e-05, + "loss": 0.7908, + "step": 242000 + }, + { + "epoch": 1.5461329108263164, + "grad_norm": 2.2674307823181152, + "learning_rate": 1.22085187594356e-05, + "loss": 0.8359, + "step": 242010 + }, + { + "epoch": 1.5461967979760551, + "grad_norm": 0.6435132622718811, + "learning_rate": 1.2205233528458031e-05, + "loss": 0.7904, + "step": 242020 + }, + { + "epoch": 1.5462606851257938, + "grad_norm": 1.0840686559677124, + "learning_rate": 1.2201948678106445e-05, + "loss": 0.9222, + "step": 242030 + }, + { + "epoch": 1.5463245722755325, + "grad_norm": 0.643201470375061, + "learning_rate": 1.2198664208413951e-05, + "loss": 0.7934, + "step": 242040 + }, + { + "epoch": 1.5463884594252713, + "grad_norm": 1.0409761667251587, + "learning_rate": 1.2195380119413596e-05, + "loss": 1.0341, + "step": 242050 + }, + { + "epoch": 1.54645234657501, + "grad_norm": 0.9929067492485046, + "learning_rate": 1.2192096411138487e-05, + "loss": 1.0195, + "step": 242060 + }, + { + "epoch": 1.5465162337247484, + "grad_norm": 1.201584815979004, + "learning_rate": 1.2188813083621659e-05, + "loss": 1.0365, + "step": 242070 + }, + { + "epoch": 1.5465801208744874, + "grad_norm": 0.9801458716392517, + "learning_rate": 1.2185530136896205e-05, + "loss": 0.6543, + "step": 242080 + }, + { + "epoch": 1.5466440080242259, + "grad_norm": 0.9027044177055359, + "learning_rate": 1.218224757099517e-05, + "loss": 0.7799, + "step": 242090 + }, + { + "epoch": 1.5467078951739648, + "grad_norm": 1.112123966217041, + "learning_rate": 1.2178965385951629e-05, + "loss": 0.8694, + "step": 242100 + }, + { + "epoch": 1.5467717823237033, + "grad_norm": 0.6458132863044739, + "learning_rate": 1.2175683581798613e-05, + "loss": 0.8681, + "step": 242110 + }, + { + "epoch": 1.5468356694734422, + "grad_norm": 1.030352234840393, + "learning_rate": 1.2172402158569202e-05, + "loss": 0.975, + "step": 242120 + }, + { + "epoch": 1.5468995566231807, + "grad_norm": 1.0494451522827148, + "learning_rate": 
1.2169121116296407e-05, + "loss": 0.7302, + "step": 242130 + }, + { + "epoch": 1.5469634437729196, + "grad_norm": 0.6920552849769592, + "learning_rate": 1.2165840455013305e-05, + "loss": 0.8045, + "step": 242140 + }, + { + "epoch": 1.547027330922658, + "grad_norm": 0.905730128288269, + "learning_rate": 1.2162560174752912e-05, + "loss": 0.9529, + "step": 242150 + }, + { + "epoch": 1.547091218072397, + "grad_norm": 1.1732187271118164, + "learning_rate": 1.2159280275548286e-05, + "loss": 1.0796, + "step": 242160 + }, + { + "epoch": 1.5471551052221355, + "grad_norm": 2.3803117275238037, + "learning_rate": 1.2156000757432423e-05, + "loss": 0.9862, + "step": 242170 + }, + { + "epoch": 1.5472189923718744, + "grad_norm": 0.9430058002471924, + "learning_rate": 1.2152721620438395e-05, + "loss": 0.8618, + "step": 242180 + }, + { + "epoch": 1.547282879521613, + "grad_norm": 0.8770157098770142, + "learning_rate": 1.2149442864599187e-05, + "loss": 0.8812, + "step": 242190 + }, + { + "epoch": 1.5473467666713518, + "grad_norm": 1.1416618824005127, + "learning_rate": 1.2146164489947847e-05, + "loss": 0.8192, + "step": 242200 + }, + { + "epoch": 1.5474106538210903, + "grad_norm": 1.3545187711715698, + "learning_rate": 1.2142886496517365e-05, + "loss": 0.95, + "step": 242210 + }, + { + "epoch": 1.5474745409708293, + "grad_norm": 0.8353721499443054, + "learning_rate": 1.2139608884340764e-05, + "loss": 0.7645, + "step": 242220 + }, + { + "epoch": 1.5475384281205677, + "grad_norm": 2.1088666915893555, + "learning_rate": 1.2136331653451071e-05, + "loss": 0.8966, + "step": 242230 + }, + { + "epoch": 1.5476023152703067, + "grad_norm": 0.8680897951126099, + "learning_rate": 1.2133054803881267e-05, + "loss": 0.9958, + "step": 242240 + }, + { + "epoch": 1.5476662024200452, + "grad_norm": 1.1479185819625854, + "learning_rate": 1.2129778335664366e-05, + "loss": 0.9269, + "step": 242250 + }, + { + "epoch": 1.547730089569784, + "grad_norm": 0.6384342908859253, + "learning_rate": 
1.2126502248833355e-05, + "loss": 0.7369, + "step": 242260 + }, + { + "epoch": 1.5477939767195226, + "grad_norm": 0.48517945408821106, + "learning_rate": 1.2123226543421235e-05, + "loss": 0.7885, + "step": 242270 + }, + { + "epoch": 1.5478578638692615, + "grad_norm": 0.5258692502975464, + "learning_rate": 1.2119951219460985e-05, + "loss": 1.0042, + "step": 242280 + }, + { + "epoch": 1.547921751019, + "grad_norm": 0.6555418968200684, + "learning_rate": 1.2116676276985606e-05, + "loss": 0.8657, + "step": 242290 + }, + { + "epoch": 1.5479856381687387, + "grad_norm": 0.8531206250190735, + "learning_rate": 1.2113401716028061e-05, + "loss": 0.7222, + "step": 242300 + }, + { + "epoch": 1.5480495253184774, + "grad_norm": 1.6429301500320435, + "learning_rate": 1.2110127536621352e-05, + "loss": 0.797, + "step": 242310 + }, + { + "epoch": 1.548113412468216, + "grad_norm": 0.78465735912323, + "learning_rate": 1.2106853738798419e-05, + "loss": 0.7611, + "step": 242320 + }, + { + "epoch": 1.5481772996179548, + "grad_norm": 1.2310987710952759, + "learning_rate": 1.2103580322592273e-05, + "loss": 1.1022, + "step": 242330 + }, + { + "epoch": 1.5482411867676935, + "grad_norm": 0.8379166722297668, + "learning_rate": 1.210030728803584e-05, + "loss": 0.9564, + "step": 242340 + }, + { + "epoch": 1.5483050739174322, + "grad_norm": 0.7665345668792725, + "learning_rate": 1.2097034635162108e-05, + "loss": 0.9774, + "step": 242350 + }, + { + "epoch": 1.548368961067171, + "grad_norm": 1.7157317399978638, + "learning_rate": 1.2093762364004024e-05, + "loss": 1.0717, + "step": 242360 + }, + { + "epoch": 1.5484328482169096, + "grad_norm": 1.0685333013534546, + "learning_rate": 1.2090490474594557e-05, + "loss": 0.7697, + "step": 242370 + }, + { + "epoch": 1.5484967353666483, + "grad_norm": 0.9549485445022583, + "learning_rate": 1.2087218966966645e-05, + "loss": 0.9957, + "step": 242380 + }, + { + "epoch": 1.548560622516387, + "grad_norm": 0.7715072631835938, + "learning_rate": 
1.2083947841153226e-05, + "loss": 0.956, + "step": 242390 + }, + { + "epoch": 1.5486245096661257, + "grad_norm": 0.7687417268753052, + "learning_rate": 1.2080677097187266e-05, + "loss": 0.817, + "step": 242400 + }, + { + "epoch": 1.5486883968158645, + "grad_norm": 3.5652546882629395, + "learning_rate": 1.2077406735101682e-05, + "loss": 0.6971, + "step": 242410 + }, + { + "epoch": 1.5487522839656032, + "grad_norm": 0.7793838977813721, + "learning_rate": 1.2074136754929428e-05, + "loss": 0.838, + "step": 242420 + }, + { + "epoch": 1.5488161711153419, + "grad_norm": 0.7141621708869934, + "learning_rate": 1.2070867156703419e-05, + "loss": 0.7336, + "step": 242430 + }, + { + "epoch": 1.5488800582650806, + "grad_norm": 0.9291849136352539, + "learning_rate": 1.2067597940456605e-05, + "loss": 0.9723, + "step": 242440 + }, + { + "epoch": 1.5489439454148193, + "grad_norm": 0.8107156157493591, + "learning_rate": 1.2064329106221877e-05, + "loss": 0.8796, + "step": 242450 + }, + { + "epoch": 1.549007832564558, + "grad_norm": 1.0288563966751099, + "learning_rate": 1.2061060654032192e-05, + "loss": 0.9119, + "step": 242460 + }, + { + "epoch": 1.5490717197142967, + "grad_norm": 1.1197317838668823, + "learning_rate": 1.2057792583920436e-05, + "loss": 0.7437, + "step": 242470 + }, + { + "epoch": 1.5491356068640354, + "grad_norm": 0.7756921052932739, + "learning_rate": 1.2054524895919539e-05, + "loss": 0.6483, + "step": 242480 + }, + { + "epoch": 1.549199494013774, + "grad_norm": 1.0276974439620972, + "learning_rate": 1.2051257590062397e-05, + "loss": 0.779, + "step": 242490 + }, + { + "epoch": 1.5492633811635128, + "grad_norm": 3.1762709617614746, + "learning_rate": 1.2047990666381936e-05, + "loss": 1.1483, + "step": 242500 + }, + { + "epoch": 1.5493272683132515, + "grad_norm": 0.7494045495986938, + "learning_rate": 1.2044724124911023e-05, + "loss": 1.0768, + "step": 242510 + }, + { + "epoch": 1.5493911554629902, + "grad_norm": 0.996212363243103, + "learning_rate": 
1.204145796568259e-05, + "loss": 1.0021, + "step": 242520 + }, + { + "epoch": 1.549455042612729, + "grad_norm": 1.222100019454956, + "learning_rate": 1.2038192188729502e-05, + "loss": 0.8258, + "step": 242530 + }, + { + "epoch": 1.5495189297624674, + "grad_norm": 1.158316731452942, + "learning_rate": 1.203492679408466e-05, + "loss": 0.8146, + "step": 242540 + }, + { + "epoch": 1.5495828169122063, + "grad_norm": 0.9694693684577942, + "learning_rate": 1.2031661781780962e-05, + "loss": 0.7009, + "step": 242550 + }, + { + "epoch": 1.5496467040619448, + "grad_norm": 1.084350347518921, + "learning_rate": 1.2028397151851262e-05, + "loss": 0.7308, + "step": 242560 + }, + { + "epoch": 1.5497105912116838, + "grad_norm": 1.462477207183838, + "learning_rate": 1.2025132904328474e-05, + "loss": 0.856, + "step": 242570 + }, + { + "epoch": 1.5497744783614222, + "grad_norm": 1.2192946672439575, + "learning_rate": 1.2021869039245431e-05, + "loss": 0.8128, + "step": 242580 + }, + { + "epoch": 1.5498383655111612, + "grad_norm": 0.6992841958999634, + "learning_rate": 1.2018605556635037e-05, + "loss": 0.6754, + "step": 242590 + }, + { + "epoch": 1.5499022526608996, + "grad_norm": 0.6951451897621155, + "learning_rate": 1.2015342456530126e-05, + "loss": 0.773, + "step": 242600 + }, + { + "epoch": 1.5499661398106386, + "grad_norm": 0.764622688293457, + "learning_rate": 1.20120797389636e-05, + "loss": 0.7105, + "step": 242610 + }, + { + "epoch": 1.550030026960377, + "grad_norm": 1.0126152038574219, + "learning_rate": 1.2008817403968275e-05, + "loss": 0.8862, + "step": 242620 + }, + { + "epoch": 1.550093914110116, + "grad_norm": 0.9220937490463257, + "learning_rate": 1.2005555451577038e-05, + "loss": 0.8853, + "step": 242630 + }, + { + "epoch": 1.5501578012598545, + "grad_norm": 1.593605875968933, + "learning_rate": 1.2002293881822718e-05, + "loss": 0.983, + "step": 242640 + }, + { + "epoch": 1.5502216884095934, + "grad_norm": 1.2632958889007568, + "learning_rate": 1.1999032694738188e-05, + 
"loss": 0.8475, + "step": 242650 + }, + { + "epoch": 1.5502855755593319, + "grad_norm": 1.2085556983947754, + "learning_rate": 1.1995771890356255e-05, + "loss": 0.9129, + "step": 242660 + }, + { + "epoch": 1.5503494627090708, + "grad_norm": 0.9262987375259399, + "learning_rate": 1.1992511468709794e-05, + "loss": 0.9488, + "step": 242670 + }, + { + "epoch": 1.5504133498588093, + "grad_norm": 0.9073424339294434, + "learning_rate": 1.198925142983161e-05, + "loss": 0.8018, + "step": 242680 + }, + { + "epoch": 1.5504772370085482, + "grad_norm": 0.8505311608314514, + "learning_rate": 1.1985991773754557e-05, + "loss": 0.6834, + "step": 242690 + }, + { + "epoch": 1.5505411241582867, + "grad_norm": 0.9804407954216003, + "learning_rate": 1.1982732500511445e-05, + "loss": 0.96, + "step": 242700 + }, + { + "epoch": 1.5506050113080256, + "grad_norm": 0.9178571105003357, + "learning_rate": 1.1979473610135117e-05, + "loss": 1.0759, + "step": 242710 + }, + { + "epoch": 1.5506688984577641, + "grad_norm": 0.6002690196037292, + "learning_rate": 1.1976215102658372e-05, + "loss": 0.777, + "step": 242720 + }, + { + "epoch": 1.550732785607503, + "grad_norm": 1.127046823501587, + "learning_rate": 1.197295697811403e-05, + "loss": 0.9468, + "step": 242730 + }, + { + "epoch": 1.5507966727572415, + "grad_norm": 0.6819142699241638, + "learning_rate": 1.1969699236534932e-05, + "loss": 0.7426, + "step": 242740 + }, + { + "epoch": 1.5508605599069805, + "grad_norm": 0.817094624042511, + "learning_rate": 1.1966441877953843e-05, + "loss": 0.8225, + "step": 242750 + }, + { + "epoch": 1.550924447056719, + "grad_norm": 0.8324134945869446, + "learning_rate": 1.1963184902403607e-05, + "loss": 0.7833, + "step": 242760 + }, + { + "epoch": 1.5509883342064579, + "grad_norm": 0.8241669535636902, + "learning_rate": 1.1959928309916984e-05, + "loss": 1.0963, + "step": 242770 + }, + { + "epoch": 1.5510522213561964, + "grad_norm": 0.9614746570587158, + "learning_rate": 1.195667210052681e-05, + "loss": 0.9468, + 
"step": 242780 + }, + { + "epoch": 1.551116108505935, + "grad_norm": 3.019289255142212, + "learning_rate": 1.195341627426585e-05, + "loss": 0.9503, + "step": 242790 + }, + { + "epoch": 1.5511799956556738, + "grad_norm": 1.0065377950668335, + "learning_rate": 1.1950160831166912e-05, + "loss": 0.6603, + "step": 242800 + }, + { + "epoch": 1.5512438828054125, + "grad_norm": 0.888316810131073, + "learning_rate": 1.1946905771262761e-05, + "loss": 0.9429, + "step": 242810 + }, + { + "epoch": 1.5513077699551512, + "grad_norm": 0.9668296575546265, + "learning_rate": 1.1943651094586206e-05, + "loss": 0.7907, + "step": 242820 + }, + { + "epoch": 1.55137165710489, + "grad_norm": 0.7448475360870361, + "learning_rate": 1.194039680116999e-05, + "loss": 0.9268, + "step": 242830 + }, + { + "epoch": 1.5514355442546286, + "grad_norm": 0.918478786945343, + "learning_rate": 1.1937142891046915e-05, + "loss": 0.8964, + "step": 242840 + }, + { + "epoch": 1.5514994314043673, + "grad_norm": 1.3066163063049316, + "learning_rate": 1.1933889364249733e-05, + "loss": 0.9459, + "step": 242850 + }, + { + "epoch": 1.551563318554106, + "grad_norm": 0.7327299118041992, + "learning_rate": 1.1930636220811226e-05, + "loss": 0.9063, + "step": 242860 + }, + { + "epoch": 1.5516272057038447, + "grad_norm": 0.7058196067810059, + "learning_rate": 1.1927383460764152e-05, + "loss": 0.9326, + "step": 242870 + }, + { + "epoch": 1.5516910928535834, + "grad_norm": 0.5669124722480774, + "learning_rate": 1.1924131084141244e-05, + "loss": 1.0724, + "step": 242880 + }, + { + "epoch": 1.5517549800033221, + "grad_norm": 1.1147737503051758, + "learning_rate": 1.1920879090975295e-05, + "loss": 0.8509, + "step": 242890 + }, + { + "epoch": 1.5518188671530608, + "grad_norm": 0.9954694509506226, + "learning_rate": 1.1917627481299021e-05, + "loss": 1.228, + "step": 242900 + }, + { + "epoch": 1.5518827543027995, + "grad_norm": 1.6905938386917114, + "learning_rate": 1.1914376255145199e-05, + "loss": 0.713, + "step": 242910 + }, + 
{ + "epoch": 1.5519466414525382, + "grad_norm": 1.1534686088562012, + "learning_rate": 1.1911125412546542e-05, + "loss": 0.7959, + "step": 242920 + }, + { + "epoch": 1.552010528602277, + "grad_norm": 1.1367498636245728, + "learning_rate": 1.190787495353582e-05, + "loss": 0.7514, + "step": 242930 + }, + { + "epoch": 1.5520744157520157, + "grad_norm": 1.3090966939926147, + "learning_rate": 1.1904624878145731e-05, + "loss": 0.8619, + "step": 242940 + }, + { + "epoch": 1.5521383029017544, + "grad_norm": 2.116231918334961, + "learning_rate": 1.1901375186409047e-05, + "loss": 0.7968, + "step": 242950 + }, + { + "epoch": 1.552202190051493, + "grad_norm": 2.4607231616973877, + "learning_rate": 1.1898125878358457e-05, + "loss": 0.8341, + "step": 242960 + }, + { + "epoch": 1.5522660772012318, + "grad_norm": 1.982274055480957, + "learning_rate": 1.1894876954026718e-05, + "loss": 0.8466, + "step": 242970 + }, + { + "epoch": 1.5523299643509705, + "grad_norm": 0.8793577551841736, + "learning_rate": 1.189162841344652e-05, + "loss": 0.8501, + "step": 242980 + }, + { + "epoch": 1.5523938515007092, + "grad_norm": 3.320389986038208, + "learning_rate": 1.188838025665061e-05, + "loss": 0.9446, + "step": 242990 + }, + { + "epoch": 1.552457738650448, + "grad_norm": 0.8653967976570129, + "learning_rate": 1.1885132483671663e-05, + "loss": 0.6972, + "step": 243000 + }, + { + "epoch": 1.5525216258001866, + "grad_norm": 1.2445247173309326, + "learning_rate": 1.1881885094542422e-05, + "loss": 0.679, + "step": 243010 + }, + { + "epoch": 1.5525855129499253, + "grad_norm": 0.9507594704627991, + "learning_rate": 1.1878638089295562e-05, + "loss": 0.7246, + "step": 243020 + }, + { + "epoch": 1.5526494000996638, + "grad_norm": 0.9456453919410706, + "learning_rate": 1.1875391467963803e-05, + "loss": 0.8444, + "step": 243030 + }, + { + "epoch": 1.5527132872494027, + "grad_norm": 0.9404791593551636, + "learning_rate": 1.1872145230579828e-05, + "loss": 0.9284, + "step": 243040 + }, + { + "epoch": 
1.5527771743991412, + "grad_norm": 0.8506212830543518, + "learning_rate": 1.1868899377176346e-05, + "loss": 0.839, + "step": 243050 + }, + { + "epoch": 1.5528410615488801, + "grad_norm": 1.2514028549194336, + "learning_rate": 1.1865653907786023e-05, + "loss": 0.6611, + "step": 243060 + }, + { + "epoch": 1.5529049486986186, + "grad_norm": 0.8052425980567932, + "learning_rate": 1.1862408822441557e-05, + "loss": 0.7826, + "step": 243070 + }, + { + "epoch": 1.5529688358483575, + "grad_norm": 0.6969268321990967, + "learning_rate": 1.1859164121175642e-05, + "loss": 0.8792, + "step": 243080 + }, + { + "epoch": 1.553032722998096, + "grad_norm": 0.908186137676239, + "learning_rate": 1.1855919804020926e-05, + "loss": 1.0984, + "step": 243090 + }, + { + "epoch": 1.553096610147835, + "grad_norm": 0.9428640604019165, + "learning_rate": 1.185267587101011e-05, + "loss": 0.6629, + "step": 243100 + }, + { + "epoch": 1.5531604972975734, + "grad_norm": 1.1042400598526, + "learning_rate": 1.1849432322175835e-05, + "loss": 0.7213, + "step": 243110 + }, + { + "epoch": 1.5532243844473124, + "grad_norm": 0.9871320128440857, + "learning_rate": 1.1846189157550796e-05, + "loss": 0.7859, + "step": 243120 + }, + { + "epoch": 1.5532882715970509, + "grad_norm": 1.2303754091262817, + "learning_rate": 1.184294637716763e-05, + "loss": 0.9326, + "step": 243130 + }, + { + "epoch": 1.5533521587467898, + "grad_norm": 1.3488849401474, + "learning_rate": 1.1839703981059014e-05, + "loss": 0.8483, + "step": 243140 + }, + { + "epoch": 1.5534160458965283, + "grad_norm": 1.0050297975540161, + "learning_rate": 1.1836461969257578e-05, + "loss": 1.1711, + "step": 243150 + }, + { + "epoch": 1.5534799330462672, + "grad_norm": 0.8590402603149414, + "learning_rate": 1.1833220341796002e-05, + "loss": 0.7949, + "step": 243160 + }, + { + "epoch": 1.5535438201960057, + "grad_norm": 1.1629736423492432, + "learning_rate": 1.1829979098706905e-05, + "loss": 0.8913, + "step": 243170 + }, + { + "epoch": 1.5536077073457446, + 
"grad_norm": 0.979515552520752, + "learning_rate": 1.1826738240022949e-05, + "loss": 0.893, + "step": 243180 + }, + { + "epoch": 1.553671594495483, + "grad_norm": 0.6931970715522766, + "learning_rate": 1.1823497765776753e-05, + "loss": 0.9594, + "step": 243190 + }, + { + "epoch": 1.553735481645222, + "grad_norm": 0.8464453816413879, + "learning_rate": 1.1820257676000978e-05, + "loss": 0.8218, + "step": 243200 + }, + { + "epoch": 1.5537993687949605, + "grad_norm": 1.2369046211242676, + "learning_rate": 1.1817017970728223e-05, + "loss": 0.646, + "step": 243210 + }, + { + "epoch": 1.5538632559446994, + "grad_norm": 0.5345267653465271, + "learning_rate": 1.1813778649991136e-05, + "loss": 0.7116, + "step": 243220 + }, + { + "epoch": 1.553927143094438, + "grad_norm": 1.2022993564605713, + "learning_rate": 1.1810539713822327e-05, + "loss": 1.1143, + "step": 243230 + }, + { + "epoch": 1.5539910302441768, + "grad_norm": 0.5171950459480286, + "learning_rate": 1.1807301162254435e-05, + "loss": 0.7505, + "step": 243240 + }, + { + "epoch": 1.5540549173939153, + "grad_norm": NaN, + "learning_rate": 1.1804386794704053e-05, + "loss": 0.8014, + "step": 243250 + }, + { + "epoch": 1.5541188045436543, + "grad_norm": 0.7231611013412476, + "learning_rate": 1.1801148973967718e-05, + "loss": 1.0568, + "step": 243260 + }, + { + "epoch": 1.5541826916933927, + "grad_norm": 0.8570977449417114, + "learning_rate": 1.179791153792687e-05, + "loss": 0.642, + "step": 243270 + }, + { + "epoch": 1.5542465788431314, + "grad_norm": 1.048039197921753, + "learning_rate": 1.1794674486614089e-05, + "loss": 0.8153, + "step": 243280 + }, + { + "epoch": 1.5543104659928701, + "grad_norm": 0.8269684314727783, + "learning_rate": 1.1791437820062002e-05, + "loss": 0.7687, + "step": 243290 + }, + { + "epoch": 1.5543743531426089, + "grad_norm": 0.8667225241661072, + "learning_rate": 1.1788201538303173e-05, + "loss": 0.8972, + "step": 243300 + }, + { + "epoch": 1.5544382402923476, + "grad_norm": 0.8198990225791931, + 
"learning_rate": 1.1784965641370233e-05, + "loss": 1.0236, + "step": 243310 + }, + { + "epoch": 1.5545021274420863, + "grad_norm": 0.7782520055770874, + "learning_rate": 1.1781730129295732e-05, + "loss": 0.9793, + "step": 243320 + }, + { + "epoch": 1.554566014591825, + "grad_norm": 0.7397206425666809, + "learning_rate": 1.1778495002112289e-05, + "loss": 1.0227, + "step": 243330 + }, + { + "epoch": 1.5546299017415637, + "grad_norm": 0.8276669383049011, + "learning_rate": 1.177526025985245e-05, + "loss": 0.7356, + "step": 243340 + }, + { + "epoch": 1.5546937888913024, + "grad_norm": 1.3340283632278442, + "learning_rate": 1.1772025902548828e-05, + "loss": 0.9137, + "step": 243350 + }, + { + "epoch": 1.554757676041041, + "grad_norm": 1.0188891887664795, + "learning_rate": 1.1768791930233958e-05, + "loss": 0.9519, + "step": 243360 + }, + { + "epoch": 1.5548215631907798, + "grad_norm": 0.9149680137634277, + "learning_rate": 1.1765558342940442e-05, + "loss": 0.8527, + "step": 243370 + }, + { + "epoch": 1.5548854503405185, + "grad_norm": 1.1764163970947266, + "learning_rate": 1.176232514070082e-05, + "loss": 0.8642, + "step": 243380 + }, + { + "epoch": 1.5549493374902572, + "grad_norm": 1.2475415468215942, + "learning_rate": 1.1759092323547665e-05, + "loss": 1.0427, + "step": 243390 + }, + { + "epoch": 1.555013224639996, + "grad_norm": 1.0803353786468506, + "learning_rate": 1.1755859891513549e-05, + "loss": 0.913, + "step": 243400 + }, + { + "epoch": 1.5550771117897346, + "grad_norm": 0.6774383783340454, + "learning_rate": 1.1752627844630988e-05, + "loss": 0.9511, + "step": 243410 + }, + { + "epoch": 1.5551409989394733, + "grad_norm": 1.2802356481552124, + "learning_rate": 1.1749396182932571e-05, + "loss": 1.1907, + "step": 243420 + }, + { + "epoch": 1.555204886089212, + "grad_norm": 0.8571861982345581, + "learning_rate": 1.1746164906450814e-05, + "loss": 0.9953, + "step": 243430 + }, + { + "epoch": 1.5552687732389507, + "grad_norm": 0.6251296997070312, + "learning_rate": 
1.1742934015218282e-05, + "loss": 0.8489, + "step": 243440 + }, + { + "epoch": 1.5553326603886894, + "grad_norm": 1.214426875114441, + "learning_rate": 1.173970350926749e-05, + "loss": 0.832, + "step": 243450 + }, + { + "epoch": 1.5553965475384282, + "grad_norm": 1.2595208883285522, + "learning_rate": 1.1736473388630998e-05, + "loss": 0.7714, + "step": 243460 + }, + { + "epoch": 1.5554604346881669, + "grad_norm": 0.8752058148384094, + "learning_rate": 1.1733243653341309e-05, + "loss": 0.936, + "step": 243470 + }, + { + "epoch": 1.5555243218379056, + "grad_norm": 1.6139124631881714, + "learning_rate": 1.1730014303430969e-05, + "loss": 0.9531, + "step": 243480 + }, + { + "epoch": 1.5555882089876443, + "grad_norm": 1.2276432514190674, + "learning_rate": 1.17267853389325e-05, + "loss": 0.7412, + "step": 243490 + }, + { + "epoch": 1.555652096137383, + "grad_norm": 0.7098108530044556, + "learning_rate": 1.1723556759878395e-05, + "loss": 0.8033, + "step": 243500 + }, + { + "epoch": 1.5557159832871217, + "grad_norm": 0.7928702235221863, + "learning_rate": 1.1720328566301202e-05, + "loss": 0.8587, + "step": 243510 + }, + { + "epoch": 1.5557798704368602, + "grad_norm": 1.4220958948135376, + "learning_rate": 1.1717100758233406e-05, + "loss": 1.0561, + "step": 243520 + }, + { + "epoch": 1.555843757586599, + "grad_norm": 0.8648812770843506, + "learning_rate": 1.1713873335707537e-05, + "loss": 0.8772, + "step": 243530 + }, + { + "epoch": 1.5559076447363376, + "grad_norm": 1.4728702306747437, + "learning_rate": 1.1710646298756073e-05, + "loss": 0.823, + "step": 243540 + }, + { + "epoch": 1.5559715318860765, + "grad_norm": 0.9253225922584534, + "learning_rate": 1.1707419647411538e-05, + "loss": 0.8703, + "step": 243550 + }, + { + "epoch": 1.556035419035815, + "grad_norm": 0.9636625051498413, + "learning_rate": 1.1704193381706397e-05, + "loss": 0.7297, + "step": 243560 + }, + { + "epoch": 1.556099306185554, + "grad_norm": 0.6991531252861023, + "learning_rate": 
1.1700967501673176e-05, + "loss": 0.6324, + "step": 243570 + }, + { + "epoch": 1.5561631933352924, + "grad_norm": 0.8342954516410828, + "learning_rate": 1.1697742007344336e-05, + "loss": 0.881, + "step": 243580 + }, + { + "epoch": 1.5562270804850313, + "grad_norm": 1.172669768333435, + "learning_rate": 1.1694516898752383e-05, + "loss": 1.1548, + "step": 243590 + }, + { + "epoch": 1.5562909676347698, + "grad_norm": 1.1467583179473877, + "learning_rate": 1.1691292175929769e-05, + "loss": 1.2036, + "step": 243600 + }, + { + "epoch": 1.5563548547845087, + "grad_norm": 1.2399263381958008, + "learning_rate": 1.1688067838908995e-05, + "loss": 0.8152, + "step": 243610 + }, + { + "epoch": 1.5564187419342472, + "grad_norm": 0.9543413519859314, + "learning_rate": 1.1684843887722512e-05, + "loss": 0.991, + "step": 243620 + }, + { + "epoch": 1.5564826290839862, + "grad_norm": 1.735193133354187, + "learning_rate": 1.1681620322402808e-05, + "loss": 0.8614, + "step": 243630 + }, + { + "epoch": 1.5565465162337246, + "grad_norm": 0.979796826839447, + "learning_rate": 1.1678397142982333e-05, + "loss": 0.6309, + "step": 243640 + }, + { + "epoch": 1.5566104033834636, + "grad_norm": 0.7954897880554199, + "learning_rate": 1.1675174349493556e-05, + "loss": 0.8495, + "step": 243650 + }, + { + "epoch": 1.556674290533202, + "grad_norm": 0.7541021704673767, + "learning_rate": 1.1671951941968922e-05, + "loss": 0.8902, + "step": 243660 + }, + { + "epoch": 1.556738177682941, + "grad_norm": 0.7214952111244202, + "learning_rate": 1.1668729920440897e-05, + "loss": 0.7945, + "step": 243670 + }, + { + "epoch": 1.5568020648326795, + "grad_norm": 0.5442585349082947, + "learning_rate": 1.1665508284941918e-05, + "loss": 0.6824, + "step": 243680 + }, + { + "epoch": 1.5568659519824184, + "grad_norm": 1.2295407056808472, + "learning_rate": 1.1662287035504438e-05, + "loss": 0.894, + "step": 243690 + }, + { + "epoch": 1.5569298391321569, + "grad_norm": 1.6326357126235962, + "learning_rate": 
1.1659066172160887e-05, + "loss": 1.0403, + "step": 243700 + }, + { + "epoch": 1.5569937262818958, + "grad_norm": 0.9609403610229492, + "learning_rate": 1.1655845694943712e-05, + "loss": 0.6769, + "step": 243710 + }, + { + "epoch": 1.5570576134316343, + "grad_norm": 0.969074547290802, + "learning_rate": 1.1652625603885353e-05, + "loss": 0.9733, + "step": 243720 + }, + { + "epoch": 1.5571215005813732, + "grad_norm": 0.5936760902404785, + "learning_rate": 1.1649405899018211e-05, + "loss": 0.8327, + "step": 243730 + }, + { + "epoch": 1.5571853877311117, + "grad_norm": 0.8490630388259888, + "learning_rate": 1.1646186580374752e-05, + "loss": 0.8271, + "step": 243740 + }, + { + "epoch": 1.5572492748808506, + "grad_norm": 1.3601961135864258, + "learning_rate": 1.1642967647987357e-05, + "loss": 0.8539, + "step": 243750 + }, + { + "epoch": 1.5573131620305891, + "grad_norm": 0.7356747984886169, + "learning_rate": 1.1639749101888476e-05, + "loss": 1.0955, + "step": 243760 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 1.0827041864395142, + "learning_rate": 1.1636530942110496e-05, + "loss": 0.7306, + "step": 243770 + }, + { + "epoch": 1.5574409363300665, + "grad_norm": 1.0377275943756104, + "learning_rate": 1.163331316868585e-05, + "loss": 0.8907, + "step": 243780 + }, + { + "epoch": 1.5575048234798052, + "grad_norm": 0.9135669469833374, + "learning_rate": 1.1630095781646915e-05, + "loss": 1.1076, + "step": 243790 + }, + { + "epoch": 1.557568710629544, + "grad_norm": 0.9091414213180542, + "learning_rate": 1.162687878102613e-05, + "loss": 0.8852, + "step": 243800 + }, + { + "epoch": 1.5576325977792826, + "grad_norm": 0.7446238994598389, + "learning_rate": 1.1623662166855853e-05, + "loss": 1.0378, + "step": 243810 + }, + { + "epoch": 1.5576964849290214, + "grad_norm": 1.005259394645691, + "learning_rate": 1.1620445939168517e-05, + "loss": 1.3612, + "step": 243820 + }, + { + "epoch": 1.55776037207876, + "grad_norm": 0.6413303017616272, + "learning_rate": 
1.1617230097996479e-05, + "loss": 0.804, + "step": 243830 + }, + { + "epoch": 1.5578242592284988, + "grad_norm": 0.8589564561843872, + "learning_rate": 1.1614014643372157e-05, + "loss": 0.8413, + "step": 243840 + }, + { + "epoch": 1.5578881463782375, + "grad_norm": 1.4905831813812256, + "learning_rate": 1.1610799575327896e-05, + "loss": 0.9538, + "step": 243850 + }, + { + "epoch": 1.5579520335279762, + "grad_norm": 1.0603193044662476, + "learning_rate": 1.1607584893896112e-05, + "loss": 0.9069, + "step": 243860 + }, + { + "epoch": 1.5580159206777149, + "grad_norm": 1.3208253383636475, + "learning_rate": 1.1604370599109143e-05, + "loss": 1.0391, + "step": 243870 + }, + { + "epoch": 1.5580798078274536, + "grad_norm": 0.860833466053009, + "learning_rate": 1.1601156690999398e-05, + "loss": 1.1647, + "step": 243880 + }, + { + "epoch": 1.5581436949771923, + "grad_norm": 0.8165043592453003, + "learning_rate": 1.1597943169599212e-05, + "loss": 0.7295, + "step": 243890 + }, + { + "epoch": 1.558207582126931, + "grad_norm": 1.4571269750595093, + "learning_rate": 1.1594730034940976e-05, + "loss": 0.9935, + "step": 243900 + }, + { + "epoch": 1.5582714692766697, + "grad_norm": 0.9263190627098083, + "learning_rate": 1.1591517287057013e-05, + "loss": 0.8241, + "step": 243910 + }, + { + "epoch": 1.5583353564264084, + "grad_norm": 0.7845439314842224, + "learning_rate": 1.1588304925979704e-05, + "loss": 0.7681, + "step": 243920 + }, + { + "epoch": 1.5583992435761471, + "grad_norm": 1.0294877290725708, + "learning_rate": 1.1585092951741405e-05, + "loss": 0.9277, + "step": 243930 + }, + { + "epoch": 1.5584631307258858, + "grad_norm": 0.9200586676597595, + "learning_rate": 1.1581881364374448e-05, + "loss": 0.7354, + "step": 243940 + }, + { + "epoch": 1.5585270178756245, + "grad_norm": 0.8697454333305359, + "learning_rate": 1.1578670163911186e-05, + "loss": 0.7926, + "step": 243950 + }, + { + "epoch": 1.5585909050253632, + "grad_norm": 1.2176082134246826, + "learning_rate": 
1.157545935038395e-05, + "loss": 0.6585, + "step": 243960 + }, + { + "epoch": 1.558654792175102, + "grad_norm": 0.5793275833129883, + "learning_rate": 1.1572248923825102e-05, + "loss": 0.7817, + "step": 243970 + }, + { + "epoch": 1.5587186793248407, + "grad_norm": 1.0738435983657837, + "learning_rate": 1.1569038884266924e-05, + "loss": 1.0253, + "step": 243980 + }, + { + "epoch": 1.5587825664745794, + "grad_norm": 0.824019730091095, + "learning_rate": 1.1565829231741787e-05, + "loss": 0.7129, + "step": 243990 + }, + { + "epoch": 1.558846453624318, + "grad_norm": 0.8560379147529602, + "learning_rate": 1.1562619966281985e-05, + "loss": 0.8505, + "step": 244000 + }, + { + "epoch": 1.5589103407740565, + "grad_norm": 1.3552207946777344, + "learning_rate": 1.1559411087919868e-05, + "loss": 0.5609, + "step": 244010 + }, + { + "epoch": 1.5589742279237955, + "grad_norm": 1.1426405906677246, + "learning_rate": 1.1556202596687726e-05, + "loss": 0.7931, + "step": 244020 + }, + { + "epoch": 1.559038115073534, + "grad_norm": 2.6822657585144043, + "learning_rate": 1.155299449261788e-05, + "loss": 1.1084, + "step": 244030 + }, + { + "epoch": 1.5591020022232729, + "grad_norm": 0.5422342419624329, + "learning_rate": 1.1549786775742656e-05, + "loss": 1.004, + "step": 244040 + }, + { + "epoch": 1.5591658893730114, + "grad_norm": 2.055058479309082, + "learning_rate": 1.154657944609433e-05, + "loss": 0.6937, + "step": 244050 + }, + { + "epoch": 1.5592297765227503, + "grad_norm": 0.8282245993614197, + "learning_rate": 1.1543372503705224e-05, + "loss": 0.8837, + "step": 244060 + }, + { + "epoch": 1.5592936636724888, + "grad_norm": 0.898158609867096, + "learning_rate": 1.1540165948607618e-05, + "loss": 0.7305, + "step": 244070 + }, + { + "epoch": 1.5593575508222277, + "grad_norm": 1.2153574228286743, + "learning_rate": 1.1536959780833829e-05, + "loss": 0.8754, + "step": 244080 + }, + { + "epoch": 1.5594214379719662, + "grad_norm": 0.8690845370292664, + "learning_rate": 
1.1533754000416114e-05, + "loss": 0.7418, + "step": 244090 + }, + { + "epoch": 1.5594853251217051, + "grad_norm": 0.9315603375434875, + "learning_rate": 1.1530548607386788e-05, + "loss": 0.7841, + "step": 244100 + }, + { + "epoch": 1.5595492122714436, + "grad_norm": 1.0761457681655884, + "learning_rate": 1.1527343601778101e-05, + "loss": 0.7509, + "step": 244110 + }, + { + "epoch": 1.5596130994211825, + "grad_norm": 1.8089863061904907, + "learning_rate": 1.1524138983622368e-05, + "loss": 0.9987, + "step": 244120 + }, + { + "epoch": 1.559676986570921, + "grad_norm": 1.311298131942749, + "learning_rate": 1.1520934752951824e-05, + "loss": 1.0197, + "step": 244130 + }, + { + "epoch": 1.55974087372066, + "grad_norm": 0.5967944264411926, + "learning_rate": 1.1517730909798768e-05, + "loss": 0.7139, + "step": 244140 + }, + { + "epoch": 1.5598047608703984, + "grad_norm": 1.037063479423523, + "learning_rate": 1.1514527454195445e-05, + "loss": 0.8653, + "step": 244150 + }, + { + "epoch": 1.5598686480201374, + "grad_norm": 0.7818079590797424, + "learning_rate": 1.1511324386174138e-05, + "loss": 0.7673, + "step": 244160 + }, + { + "epoch": 1.5599325351698758, + "grad_norm": 0.9035627245903015, + "learning_rate": 1.1508121705767072e-05, + "loss": 0.8202, + "step": 244170 + }, + { + "epoch": 1.5599964223196148, + "grad_norm": 0.9490880966186523, + "learning_rate": 1.1504919413006542e-05, + "loss": 0.769, + "step": 244180 + }, + { + "epoch": 1.5600603094693533, + "grad_norm": 0.8521928191184998, + "learning_rate": 1.1501717507924759e-05, + "loss": 0.8679, + "step": 244190 + }, + { + "epoch": 1.5601241966190922, + "grad_norm": 0.8884910941123962, + "learning_rate": 1.1498515990554e-05, + "loss": 0.9999, + "step": 244200 + }, + { + "epoch": 1.5601880837688307, + "grad_norm": 0.7485408186912537, + "learning_rate": 1.1495314860926481e-05, + "loss": 0.9666, + "step": 244210 + }, + { + "epoch": 1.5602519709185696, + "grad_norm": 1.0756878852844238, + "learning_rate": 
1.1492114119074465e-05, + "loss": 1.007, + "step": 244220 + }, + { + "epoch": 1.560315858068308, + "grad_norm": 0.7426232099533081, + "learning_rate": 1.1488913765030163e-05, + "loss": 0.9013, + "step": 244230 + }, + { + "epoch": 1.5603797452180468, + "grad_norm": 0.5656971335411072, + "learning_rate": 1.1485713798825815e-05, + "loss": 0.8492, + "step": 244240 + }, + { + "epoch": 1.5604436323677855, + "grad_norm": 1.036117434501648, + "learning_rate": 1.1482514220493663e-05, + "loss": 0.9966, + "step": 244250 + }, + { + "epoch": 1.5605075195175242, + "grad_norm": 1.0510843992233276, + "learning_rate": 1.1479315030065897e-05, + "loss": 0.9037, + "step": 244260 + }, + { + "epoch": 1.560571406667263, + "grad_norm": 6.631254196166992, + "learning_rate": 1.1476116227574768e-05, + "loss": 0.9423, + "step": 244270 + }, + { + "epoch": 1.5606352938170016, + "grad_norm": 0.85828697681427, + "learning_rate": 1.147291781305247e-05, + "loss": 0.9699, + "step": 244280 + }, + { + "epoch": 1.5606991809667403, + "grad_norm": 1.2128971815109253, + "learning_rate": 1.146971978653123e-05, + "loss": 0.8753, + "step": 244290 + }, + { + "epoch": 1.560763068116479, + "grad_norm": 1.1858617067337036, + "learning_rate": 1.1466522148043229e-05, + "loss": 0.7687, + "step": 244300 + }, + { + "epoch": 1.5608269552662177, + "grad_norm": 0.7385855317115784, + "learning_rate": 1.1463324897620702e-05, + "loss": 0.7099, + "step": 244310 + }, + { + "epoch": 1.5608908424159564, + "grad_norm": 1.2785903215408325, + "learning_rate": 1.146012803529582e-05, + "loss": 0.9598, + "step": 244320 + }, + { + "epoch": 1.5609547295656951, + "grad_norm": 0.951858639717102, + "learning_rate": 1.1456931561100798e-05, + "loss": 0.765, + "step": 244330 + }, + { + "epoch": 1.5610186167154338, + "grad_norm": 0.9530162811279297, + "learning_rate": 1.1453735475067811e-05, + "loss": 0.7981, + "step": 244340 + }, + { + "epoch": 1.5610825038651726, + "grad_norm": 1.2772146463394165, + "learning_rate": 1.1450539777229069e-05, 
+ "loss": 0.7661, + "step": 244350 + }, + { + "epoch": 1.5611463910149113, + "grad_norm": 1.0854233503341675, + "learning_rate": 1.1447344467616727e-05, + "loss": 0.7692, + "step": 244360 + }, + { + "epoch": 1.56121027816465, + "grad_norm": 1.4239158630371094, + "learning_rate": 1.1444149546262995e-05, + "loss": 0.865, + "step": 244370 + }, + { + "epoch": 1.5612741653143887, + "grad_norm": 1.141292691230774, + "learning_rate": 1.1440955013200017e-05, + "loss": 0.7969, + "step": 244380 + }, + { + "epoch": 1.5613380524641274, + "grad_norm": 0.7779687643051147, + "learning_rate": 1.143776086845999e-05, + "loss": 0.7294, + "step": 244390 + }, + { + "epoch": 1.561401939613866, + "grad_norm": 1.1889150142669678, + "learning_rate": 1.1434567112075061e-05, + "loss": 0.9621, + "step": 244400 + }, + { + "epoch": 1.5614658267636048, + "grad_norm": 0.5841159224510193, + "learning_rate": 1.1431373744077422e-05, + "loss": 0.6453, + "step": 244410 + }, + { + "epoch": 1.5615297139133435, + "grad_norm": 0.8326659202575684, + "learning_rate": 1.1428180764499202e-05, + "loss": 0.8479, + "step": 244420 + }, + { + "epoch": 1.5615936010630822, + "grad_norm": 0.6364220976829529, + "learning_rate": 1.142498817337257e-05, + "loss": 0.8159, + "step": 244430 + }, + { + "epoch": 1.561657488212821, + "grad_norm": 1.0597093105316162, + "learning_rate": 1.1421795970729688e-05, + "loss": 0.7059, + "step": 244440 + }, + { + "epoch": 1.5617213753625596, + "grad_norm": 0.6557106375694275, + "learning_rate": 1.1418604156602686e-05, + "loss": 0.9927, + "step": 244450 + }, + { + "epoch": 1.5617852625122983, + "grad_norm": 1.2477772235870361, + "learning_rate": 1.1415412731023745e-05, + "loss": 1.1245, + "step": 244460 + }, + { + "epoch": 1.561849149662037, + "grad_norm": 0.8714918494224548, + "learning_rate": 1.1412221694024954e-05, + "loss": 0.7583, + "step": 244470 + }, + { + "epoch": 1.5619130368117757, + "grad_norm": 0.969723105430603, + "learning_rate": 1.140903104563848e-05, + "loss": 1.0076, + 
"step": 244480 + }, + { + "epoch": 1.5619769239615144, + "grad_norm": 1.1985634565353394, + "learning_rate": 1.1405840785896443e-05, + "loss": 0.7064, + "step": 244490 + }, + { + "epoch": 1.562040811111253, + "grad_norm": 2.019909381866455, + "learning_rate": 1.1402650914830987e-05, + "loss": 0.8153, + "step": 244500 + }, + { + "epoch": 1.5621046982609919, + "grad_norm": 0.9854116439819336, + "learning_rate": 1.1399461432474218e-05, + "loss": 0.9243, + "step": 244510 + }, + { + "epoch": 1.5621685854107303, + "grad_norm": 0.7602574229240417, + "learning_rate": 1.1396272338858276e-05, + "loss": 0.6641, + "step": 244520 + }, + { + "epoch": 1.5622324725604693, + "grad_norm": 0.8997085690498352, + "learning_rate": 1.1393083634015255e-05, + "loss": 0.9308, + "step": 244530 + }, + { + "epoch": 1.5622963597102077, + "grad_norm": 1.1706210374832153, + "learning_rate": 1.13898953179773e-05, + "loss": 0.9875, + "step": 244540 + }, + { + "epoch": 1.5623602468599467, + "grad_norm": 1.6842466592788696, + "learning_rate": 1.1386707390776485e-05, + "loss": 0.9852, + "step": 244550 + }, + { + "epoch": 1.5624241340096852, + "grad_norm": 2.359833002090454, + "learning_rate": 1.138351985244493e-05, + "loss": 0.8689, + "step": 244560 + }, + { + "epoch": 1.562488021159424, + "grad_norm": 0.835586667060852, + "learning_rate": 1.1380332703014757e-05, + "loss": 1.0087, + "step": 244570 + }, + { + "epoch": 1.5625519083091626, + "grad_norm": 1.5438088178634644, + "learning_rate": 1.1377145942518024e-05, + "loss": 0.6784, + "step": 244580 + }, + { + "epoch": 1.5626157954589015, + "grad_norm": 1.3177486658096313, + "learning_rate": 1.1373959570986864e-05, + "loss": 0.8362, + "step": 244590 + }, + { + "epoch": 1.56267968260864, + "grad_norm": 1.20131254196167, + "learning_rate": 1.1370773588453332e-05, + "loss": 1.1488, + "step": 244600 + }, + { + "epoch": 1.562743569758379, + "grad_norm": 0.9831515550613403, + "learning_rate": 1.136758799494954e-05, + "loss": 0.8171, + "step": 244610 + }, + { 
+ "epoch": 1.5628074569081174, + "grad_norm": 0.7109906673431396, + "learning_rate": 1.136440279050755e-05, + "loss": 1.0254, + "step": 244620 + }, + { + "epoch": 1.5628713440578563, + "grad_norm": 0.9102615714073181, + "learning_rate": 1.1361217975159454e-05, + "loss": 0.6895, + "step": 244630 + }, + { + "epoch": 1.5629352312075948, + "grad_norm": 1.2789576053619385, + "learning_rate": 1.1358033548937314e-05, + "loss": 0.7483, + "step": 244640 + }, + { + "epoch": 1.5629991183573337, + "grad_norm": 1.4456409215927124, + "learning_rate": 1.1354849511873222e-05, + "loss": 0.7125, + "step": 244650 + }, + { + "epoch": 1.5630630055070722, + "grad_norm": 1.380568027496338, + "learning_rate": 1.1351665863999206e-05, + "loss": 0.8184, + "step": 244660 + }, + { + "epoch": 1.5631268926568112, + "grad_norm": 1.104844093322754, + "learning_rate": 1.1348482605347372e-05, + "loss": 0.7848, + "step": 244670 + }, + { + "epoch": 1.5631907798065496, + "grad_norm": 1.1302462816238403, + "learning_rate": 1.1345299735949738e-05, + "loss": 0.813, + "step": 244680 + }, + { + "epoch": 1.5632546669562886, + "grad_norm": 0.9731299877166748, + "learning_rate": 1.1342117255838391e-05, + "loss": 0.887, + "step": 244690 + }, + { + "epoch": 1.563318554106027, + "grad_norm": 1.2548291683197021, + "learning_rate": 1.1338935165045356e-05, + "loss": 0.9777, + "step": 244700 + }, + { + "epoch": 1.563382441255766, + "grad_norm": 0.9935744404792786, + "learning_rate": 1.13357534636027e-05, + "loss": 0.927, + "step": 244710 + }, + { + "epoch": 1.5634463284055045, + "grad_norm": 1.089245319366455, + "learning_rate": 1.1332572151542448e-05, + "loss": 0.904, + "step": 244720 + }, + { + "epoch": 1.5635102155552432, + "grad_norm": 0.7848595380783081, + "learning_rate": 1.1329391228896652e-05, + "loss": 0.8947, + "step": 244730 + }, + { + "epoch": 1.5635741027049819, + "grad_norm": 0.7474117875099182, + "learning_rate": 1.132621069569733e-05, + "loss": 0.8568, + "step": 244740 + }, + { + "epoch": 
1.5636379898547206, + "grad_norm": 1.012871503829956, + "learning_rate": 1.1323030551976544e-05, + "loss": 0.7365, + "step": 244750 + }, + { + "epoch": 1.5637018770044593, + "grad_norm": 0.6147935390472412, + "learning_rate": 1.1319850797766285e-05, + "loss": 0.6997, + "step": 244760 + }, + { + "epoch": 1.563765764154198, + "grad_norm": 0.7995234727859497, + "learning_rate": 1.1316671433098585e-05, + "loss": 0.787, + "step": 244770 + }, + { + "epoch": 1.5638296513039367, + "grad_norm": 1.2112396955490112, + "learning_rate": 1.1313492458005488e-05, + "loss": 0.7857, + "step": 244780 + }, + { + "epoch": 1.5638935384536754, + "grad_norm": 1.4421223402023315, + "learning_rate": 1.1310313872518979e-05, + "loss": 0.9153, + "step": 244790 + }, + { + "epoch": 1.563957425603414, + "grad_norm": 1.025989055633545, + "learning_rate": 1.1307135676671087e-05, + "loss": 0.8745, + "step": 244800 + }, + { + "epoch": 1.5640213127531528, + "grad_norm": 1.7144854068756104, + "learning_rate": 1.1303957870493808e-05, + "loss": 0.8442, + "step": 244810 + }, + { + "epoch": 1.5640851999028915, + "grad_norm": 0.9952170252799988, + "learning_rate": 1.130078045401916e-05, + "loss": 1.0449, + "step": 244820 + }, + { + "epoch": 1.5641490870526302, + "grad_norm": 0.763575553894043, + "learning_rate": 1.129760342727912e-05, + "loss": 0.8549, + "step": 244830 + }, + { + "epoch": 1.564212974202369, + "grad_norm": 0.5484232306480408, + "learning_rate": 1.1294426790305707e-05, + "loss": 0.8661, + "step": 244840 + }, + { + "epoch": 1.5642768613521076, + "grad_norm": 1.4526423215866089, + "learning_rate": 1.1291250543130888e-05, + "loss": 0.7648, + "step": 244850 + }, + { + "epoch": 1.5643407485018463, + "grad_norm": 6.604135990142822, + "learning_rate": 1.1288074685786677e-05, + "loss": 0.9358, + "step": 244860 + }, + { + "epoch": 1.564404635651585, + "grad_norm": 0.8231043815612793, + "learning_rate": 1.1284899218305034e-05, + "loss": 0.6355, + "step": 244870 + }, + { + "epoch": 1.5644685228013238, + 
"grad_norm": 0.8967733979225159, + "learning_rate": 1.128172414071796e-05, + "loss": 0.7986, + "step": 244880 + }, + { + "epoch": 1.5645324099510625, + "grad_norm": 0.6413818597793579, + "learning_rate": 1.1278549453057408e-05, + "loss": 1.1034, + "step": 244890 + }, + { + "epoch": 1.5645962971008012, + "grad_norm": 0.7567101716995239, + "learning_rate": 1.1275375155355372e-05, + "loss": 0.661, + "step": 244900 + }, + { + "epoch": 1.5646601842505399, + "grad_norm": 1.2307841777801514, + "learning_rate": 1.1272201247643799e-05, + "loss": 0.7908, + "step": 244910 + }, + { + "epoch": 1.5647240714002786, + "grad_norm": 0.8741402626037598, + "learning_rate": 1.1269027729954678e-05, + "loss": 0.7941, + "step": 244920 + }, + { + "epoch": 1.5647879585500173, + "grad_norm": 0.5362954139709473, + "learning_rate": 1.1265854602319936e-05, + "loss": 0.706, + "step": 244930 + }, + { + "epoch": 1.564851845699756, + "grad_norm": 1.1786456108093262, + "learning_rate": 1.1262681864771568e-05, + "loss": 0.94, + "step": 244940 + }, + { + "epoch": 1.5649157328494947, + "grad_norm": 1.1140693426132202, + "learning_rate": 1.1259509517341504e-05, + "loss": 0.7359, + "step": 244950 + }, + { + "epoch": 1.5649796199992334, + "grad_norm": 1.1226584911346436, + "learning_rate": 1.125633756006168e-05, + "loss": 0.8718, + "step": 244960 + }, + { + "epoch": 1.565043507148972, + "grad_norm": 1.0554084777832031, + "learning_rate": 1.1253165992964071e-05, + "loss": 0.811, + "step": 244970 + }, + { + "epoch": 1.5651073942987108, + "grad_norm": 0.6143940091133118, + "learning_rate": 1.1249994816080584e-05, + "loss": 0.9069, + "step": 244980 + }, + { + "epoch": 1.5651712814484493, + "grad_norm": 1.6363439559936523, + "learning_rate": 1.1246824029443187e-05, + "loss": 0.7897, + "step": 244990 + }, + { + "epoch": 1.5652351685981882, + "grad_norm": 0.8065378069877625, + "learning_rate": 1.1243653633083789e-05, + "loss": 0.8535, + "step": 245000 + }, + { + "epoch": 1.5652990557479267, + "grad_norm": 
1.6017779111862183, + "learning_rate": 1.1240483627034337e-05, + "loss": 0.9318, + "step": 245010 + }, + { + "epoch": 1.5653629428976656, + "grad_norm": 1.2830206155776978, + "learning_rate": 1.1237314011326733e-05, + "loss": 0.9921, + "step": 245020 + }, + { + "epoch": 1.5654268300474041, + "grad_norm": 1.024481177330017, + "learning_rate": 1.1234144785992927e-05, + "loss": 0.7257, + "step": 245030 + }, + { + "epoch": 1.565490717197143, + "grad_norm": 0.7877789735794067, + "learning_rate": 1.123097595106481e-05, + "loss": 0.8077, + "step": 245040 + }, + { + "epoch": 1.5655546043468815, + "grad_norm": 0.6652231216430664, + "learning_rate": 1.1227807506574312e-05, + "loss": 0.6924, + "step": 245050 + }, + { + "epoch": 1.5656184914966205, + "grad_norm": 1.2229474782943726, + "learning_rate": 1.1224639452553326e-05, + "loss": 0.746, + "step": 245060 + }, + { + "epoch": 1.565682378646359, + "grad_norm": 0.6137751340866089, + "learning_rate": 1.1221471789033777e-05, + "loss": 0.8736, + "step": 245070 + }, + { + "epoch": 1.5657462657960979, + "grad_norm": 1.5874028205871582, + "learning_rate": 1.121830451604754e-05, + "loss": 0.9268, + "step": 245080 + }, + { + "epoch": 1.5658101529458364, + "grad_norm": 0.9740021228790283, + "learning_rate": 1.1215137633626532e-05, + "loss": 1.2189, + "step": 245090 + }, + { + "epoch": 1.5658740400955753, + "grad_norm": 0.8691504597663879, + "learning_rate": 1.1211971141802658e-05, + "loss": 0.8799, + "step": 245100 + }, + { + "epoch": 1.5659379272453138, + "grad_norm": 0.9141390919685364, + "learning_rate": 1.1208805040607768e-05, + "loss": 1.2535, + "step": 245110 + }, + { + "epoch": 1.5660018143950527, + "grad_norm": 0.9398859739303589, + "learning_rate": 1.1205639330073791e-05, + "loss": 0.9259, + "step": 245120 + }, + { + "epoch": 1.5660657015447912, + "grad_norm": 1.1347993612289429, + "learning_rate": 1.1202474010232572e-05, + "loss": 0.9103, + "step": 245130 + }, + { + "epoch": 1.5661295886945301, + "grad_norm": 
0.5754815340042114, + "learning_rate": 1.1199309081116016e-05, + "loss": 0.8963, + "step": 245140 + }, + { + "epoch": 1.5661934758442686, + "grad_norm": 0.8746938109397888, + "learning_rate": 1.1196144542755976e-05, + "loss": 0.6805, + "step": 245150 + }, + { + "epoch": 1.5662573629940075, + "grad_norm": 0.8321001529693604, + "learning_rate": 1.1192980395184344e-05, + "loss": 1.0746, + "step": 245160 + }, + { + "epoch": 1.566321250143746, + "grad_norm": 0.5793132781982422, + "learning_rate": 1.1189816638432954e-05, + "loss": 0.7767, + "step": 245170 + }, + { + "epoch": 1.566385137293485, + "grad_norm": 1.0133846998214722, + "learning_rate": 1.1186653272533698e-05, + "loss": 0.7942, + "step": 245180 + }, + { + "epoch": 1.5664490244432234, + "grad_norm": 1.2261652946472168, + "learning_rate": 1.1183490297518417e-05, + "loss": 1.0238, + "step": 245190 + }, + { + "epoch": 1.5665129115929624, + "grad_norm": 1.3811265230178833, + "learning_rate": 1.1180327713418976e-05, + "loss": 0.8442, + "step": 245200 + }, + { + "epoch": 1.5665767987427008, + "grad_norm": 0.6937904953956604, + "learning_rate": 1.1177165520267207e-05, + "loss": 1.0061, + "step": 245210 + }, + { + "epoch": 1.5666406858924395, + "grad_norm": 1.304955244064331, + "learning_rate": 1.1174003718094983e-05, + "loss": 1.1358, + "step": 245220 + }, + { + "epoch": 1.5667045730421783, + "grad_norm": 1.1086231470108032, + "learning_rate": 1.1170842306934114e-05, + "loss": 0.944, + "step": 245230 + }, + { + "epoch": 1.566768460191917, + "grad_norm": 0.8318544030189514, + "learning_rate": 1.1167681286816472e-05, + "loss": 0.7889, + "step": 245240 + }, + { + "epoch": 1.5668323473416557, + "grad_norm": 0.7987926602363586, + "learning_rate": 1.1164520657773863e-05, + "loss": 0.8404, + "step": 245250 + }, + { + "epoch": 1.5668962344913944, + "grad_norm": 0.6072881817817688, + "learning_rate": 1.116136041983814e-05, + "loss": 0.9524, + "step": 245260 + }, + { + "epoch": 1.566960121641133, + "grad_norm": 
0.8089340329170227, + "learning_rate": 1.11582005730411e-05, + "loss": 0.5764, + "step": 245270 + }, + { + "epoch": 1.5670240087908718, + "grad_norm": 1.2315974235534668, + "learning_rate": 1.1155041117414584e-05, + "loss": 0.854, + "step": 245280 + }, + { + "epoch": 1.5670878959406105, + "grad_norm": 0.7801371216773987, + "learning_rate": 1.1151882052990425e-05, + "loss": 0.5417, + "step": 245290 + }, + { + "epoch": 1.5671517830903492, + "grad_norm": 0.9974910616874695, + "learning_rate": 1.1148723379800407e-05, + "loss": 0.9432, + "step": 245300 + }, + { + "epoch": 1.567215670240088, + "grad_norm": 3.204942464828491, + "learning_rate": 1.1145565097876376e-05, + "loss": 0.925, + "step": 245310 + }, + { + "epoch": 1.5672795573898266, + "grad_norm": 0.731713056564331, + "learning_rate": 1.11424072072501e-05, + "loss": 0.691, + "step": 245320 + }, + { + "epoch": 1.5673434445395653, + "grad_norm": 1.2439576387405396, + "learning_rate": 1.113924970795341e-05, + "loss": 0.9036, + "step": 245330 + }, + { + "epoch": 1.567407331689304, + "grad_norm": 1.1690343618392944, + "learning_rate": 1.1136092600018084e-05, + "loss": 0.8694, + "step": 245340 + }, + { + "epoch": 1.5674712188390427, + "grad_norm": 1.4979528188705444, + "learning_rate": 1.1132935883475942e-05, + "loss": 0.9888, + "step": 245350 + }, + { + "epoch": 1.5675351059887814, + "grad_norm": 1.1090213060379028, + "learning_rate": 1.112977955835875e-05, + "loss": 0.9002, + "step": 245360 + }, + { + "epoch": 1.5675989931385201, + "grad_norm": 1.3513494729995728, + "learning_rate": 1.1126623624698312e-05, + "loss": 0.7067, + "step": 245370 + }, + { + "epoch": 1.5676628802882588, + "grad_norm": 0.8277401924133301, + "learning_rate": 1.1123468082526395e-05, + "loss": 1.0927, + "step": 245380 + }, + { + "epoch": 1.5677267674379975, + "grad_norm": 0.7830334901809692, + "learning_rate": 1.1120312931874798e-05, + "loss": 0.775, + "step": 245390 + }, + { + "epoch": 1.5677906545877363, + "grad_norm": 0.8738146424293518, + 
"learning_rate": 1.1117158172775278e-05, + "loss": 0.9157, + "step": 245400 + }, + { + "epoch": 1.567854541737475, + "grad_norm": 1.0072499513626099, + "learning_rate": 1.111400380525962e-05, + "loss": 0.7875, + "step": 245410 + }, + { + "epoch": 1.5679184288872137, + "grad_norm": 1.1970336437225342, + "learning_rate": 1.1110849829359577e-05, + "loss": 0.8484, + "step": 245420 + }, + { + "epoch": 1.5679823160369524, + "grad_norm": 0.8071025609970093, + "learning_rate": 1.110769624510693e-05, + "loss": 0.7452, + "step": 245430 + }, + { + "epoch": 1.568046203186691, + "grad_norm": 0.8323325514793396, + "learning_rate": 1.1104543052533433e-05, + "loss": 0.8922, + "step": 245440 + }, + { + "epoch": 1.5681100903364298, + "grad_norm": 0.7894075512886047, + "learning_rate": 1.1101390251670818e-05, + "loss": 0.8484, + "step": 245450 + }, + { + "epoch": 1.5681739774861683, + "grad_norm": 0.9994876980781555, + "learning_rate": 1.1098237842550874e-05, + "loss": 0.6992, + "step": 245460 + }, + { + "epoch": 1.5682378646359072, + "grad_norm": 1.2985382080078125, + "learning_rate": 1.1095085825205309e-05, + "loss": 0.8643, + "step": 245470 + }, + { + "epoch": 1.5683017517856457, + "grad_norm": 1.2723342180252075, + "learning_rate": 1.1091934199665904e-05, + "loss": 0.9908, + "step": 245480 + }, + { + "epoch": 1.5683656389353846, + "grad_norm": 0.8650884032249451, + "learning_rate": 1.1088782965964373e-05, + "loss": 0.9908, + "step": 245490 + }, + { + "epoch": 1.568429526085123, + "grad_norm": 1.132795810699463, + "learning_rate": 1.1085632124132467e-05, + "loss": 0.8583, + "step": 245500 + }, + { + "epoch": 1.568493413234862, + "grad_norm": 0.6821199655532837, + "learning_rate": 1.10824816742019e-05, + "loss": 0.8731, + "step": 245510 + }, + { + "epoch": 1.5685573003846005, + "grad_norm": 1.0070725679397583, + "learning_rate": 1.107933161620443e-05, + "loss": 0.8267, + "step": 245520 + }, + { + "epoch": 1.5686211875343394, + "grad_norm": 1.131516695022583, + "learning_rate": 
1.1076181950171743e-05, + "loss": 0.8921, + "step": 245530 + }, + { + "epoch": 1.568685074684078, + "grad_norm": 1.031501293182373, + "learning_rate": 1.1073032676135591e-05, + "loss": 1.1427, + "step": 245540 + }, + { + "epoch": 1.5687489618338168, + "grad_norm": 1.1449588537216187, + "learning_rate": 1.1069883794127661e-05, + "loss": 0.9412, + "step": 245550 + }, + { + "epoch": 1.5688128489835553, + "grad_norm": 1.0295383930206299, + "learning_rate": 1.1066735304179698e-05, + "loss": 0.609, + "step": 245560 + }, + { + "epoch": 1.5688767361332943, + "grad_norm": 1.0607703924179077, + "learning_rate": 1.1063587206323378e-05, + "loss": 0.7468, + "step": 245570 + }, + { + "epoch": 1.5689406232830327, + "grad_norm": 0.988595724105835, + "learning_rate": 1.1060439500590436e-05, + "loss": 0.8743, + "step": 245580 + }, + { + "epoch": 1.5690045104327717, + "grad_norm": 1.4843655824661255, + "learning_rate": 1.1057292187012535e-05, + "loss": 1.0648, + "step": 245590 + }, + { + "epoch": 1.5690683975825102, + "grad_norm": 0.9656323790550232, + "learning_rate": 1.1054145265621412e-05, + "loss": 0.9058, + "step": 245600 + }, + { + "epoch": 1.569132284732249, + "grad_norm": 1.3070482015609741, + "learning_rate": 1.1050998736448726e-05, + "loss": 0.8845, + "step": 245610 + }, + { + "epoch": 1.5691961718819876, + "grad_norm": 1.023574948310852, + "learning_rate": 1.1047852599526176e-05, + "loss": 0.6545, + "step": 245620 + }, + { + "epoch": 1.5692600590317265, + "grad_norm": 0.8293267488479614, + "learning_rate": 1.1044706854885462e-05, + "loss": 0.8326, + "step": 245630 + }, + { + "epoch": 1.569323946181465, + "grad_norm": 0.9412983655929565, + "learning_rate": 1.1041561502558233e-05, + "loss": 1.18, + "step": 245640 + }, + { + "epoch": 1.569387833331204, + "grad_norm": 1.4803204536437988, + "learning_rate": 1.1038416542576202e-05, + "loss": 0.891, + "step": 245650 + }, + { + "epoch": 1.5694517204809424, + "grad_norm": 0.9529423713684082, + "learning_rate": 
1.1035271974971013e-05, + "loss": 0.9386, + "step": 245660 + }, + { + "epoch": 1.5695156076306813, + "grad_norm": 1.0522215366363525, + "learning_rate": 1.1032127799774356e-05, + "loss": 1.0287, + "step": 245670 + }, + { + "epoch": 1.5695794947804198, + "grad_norm": 0.979540228843689, + "learning_rate": 1.1028984017017863e-05, + "loss": 0.6719, + "step": 245680 + }, + { + "epoch": 1.5696433819301587, + "grad_norm": 0.9477531909942627, + "learning_rate": 1.1025840626733237e-05, + "loss": 1.0261, + "step": 245690 + }, + { + "epoch": 1.5697072690798972, + "grad_norm": 0.8826400637626648, + "learning_rate": 1.10226976289521e-05, + "loss": 1.0628, + "step": 245700 + }, + { + "epoch": 1.569771156229636, + "grad_norm": 0.8096141815185547, + "learning_rate": 1.101955502370613e-05, + "loss": 0.8966, + "step": 245710 + }, + { + "epoch": 1.5698350433793746, + "grad_norm": 3.1200544834136963, + "learning_rate": 1.1016412811026943e-05, + "loss": 0.8998, + "step": 245720 + }, + { + "epoch": 1.5698989305291133, + "grad_norm": 0.9650887250900269, + "learning_rate": 1.1013270990946228e-05, + "loss": 0.7894, + "step": 245730 + }, + { + "epoch": 1.569962817678852, + "grad_norm": 1.0511590242385864, + "learning_rate": 1.1010129563495586e-05, + "loss": 0.7479, + "step": 245740 + }, + { + "epoch": 1.5700267048285907, + "grad_norm": 0.8517940044403076, + "learning_rate": 1.1006988528706685e-05, + "loss": 0.6017, + "step": 245750 + }, + { + "epoch": 1.5700905919783295, + "grad_norm": 1.2697738409042358, + "learning_rate": 1.1003847886611129e-05, + "loss": 0.9392, + "step": 245760 + }, + { + "epoch": 1.5701544791280682, + "grad_norm": 0.6032277941703796, + "learning_rate": 1.1000707637240571e-05, + "loss": 0.7957, + "step": 245770 + }, + { + "epoch": 1.5702183662778069, + "grad_norm": 0.5549653768539429, + "learning_rate": 1.0997567780626617e-05, + "loss": 0.5034, + "step": 245780 + }, + { + "epoch": 1.5702822534275456, + "grad_norm": 0.7875387072563171, + "learning_rate": 
1.099442831680091e-05, + "loss": 0.8163, + "step": 245790 + }, + { + "epoch": 1.5703461405772843, + "grad_norm": 0.9425227046012878, + "learning_rate": 1.0991289245795039e-05, + "loss": 0.8701, + "step": 245800 + }, + { + "epoch": 1.570410027727023, + "grad_norm": 0.7016029357910156, + "learning_rate": 1.0988150567640636e-05, + "loss": 0.8692, + "step": 245810 + }, + { + "epoch": 1.5704739148767617, + "grad_norm": 0.8856372237205505, + "learning_rate": 1.0985012282369318e-05, + "loss": 0.6919, + "step": 245820 + }, + { + "epoch": 1.5705378020265004, + "grad_norm": 1.202332854270935, + "learning_rate": 1.0981874390012664e-05, + "loss": 0.7305, + "step": 245830 + }, + { + "epoch": 1.570601689176239, + "grad_norm": 1.0130597352981567, + "learning_rate": 1.0978736890602308e-05, + "loss": 0.7973, + "step": 245840 + }, + { + "epoch": 1.5706655763259778, + "grad_norm": 1.3987774848937988, + "learning_rate": 1.0975599784169815e-05, + "loss": 0.8538, + "step": 245850 + }, + { + "epoch": 1.5707294634757165, + "grad_norm": 0.9398975372314453, + "learning_rate": 1.0972463070746803e-05, + "loss": 0.9216, + "step": 245860 + }, + { + "epoch": 1.5707933506254552, + "grad_norm": 1.7127617597579956, + "learning_rate": 1.0969326750364844e-05, + "loss": 0.8187, + "step": 245870 + }, + { + "epoch": 1.570857237775194, + "grad_norm": 1.2938936948776245, + "learning_rate": 1.096619082305554e-05, + "loss": 0.7339, + "step": 245880 + }, + { + "epoch": 1.5709211249249326, + "grad_norm": 0.8230463266372681, + "learning_rate": 1.0963055288850455e-05, + "loss": 0.772, + "step": 245890 + }, + { + "epoch": 1.5709850120746713, + "grad_norm": 1.022827386856079, + "learning_rate": 1.0959920147781182e-05, + "loss": 0.9621, + "step": 245900 + }, + { + "epoch": 1.57104889922441, + "grad_norm": 1.102954387664795, + "learning_rate": 1.0956785399879276e-05, + "loss": 0.8469, + "step": 245910 + }, + { + "epoch": 1.5711127863741488, + "grad_norm": 1.8467339277267456, + "learning_rate": 
1.0953651045176349e-05, + "loss": 1.0997, + "step": 245920 + }, + { + "epoch": 1.5711766735238875, + "grad_norm": 2.318774461746216, + "learning_rate": 1.0950517083703904e-05, + "loss": 0.7981, + "step": 245930 + }, + { + "epoch": 1.5712405606736262, + "grad_norm": 0.8628405928611755, + "learning_rate": 1.0947383515493536e-05, + "loss": 0.8306, + "step": 245940 + }, + { + "epoch": 1.5713044478233646, + "grad_norm": 2.304323196411133, + "learning_rate": 1.0944250340576818e-05, + "loss": 0.9758, + "step": 245950 + }, + { + "epoch": 1.5713683349731036, + "grad_norm": 0.9406927824020386, + "learning_rate": 1.094111755898527e-05, + "loss": 0.6177, + "step": 245960 + }, + { + "epoch": 1.571432222122842, + "grad_norm": 0.7597897052764893, + "learning_rate": 1.093798517075047e-05, + "loss": 0.7199, + "step": 245970 + }, + { + "epoch": 1.571496109272581, + "grad_norm": 0.8691983819007874, + "learning_rate": 1.0934853175903948e-05, + "loss": 0.9129, + "step": 245980 + }, + { + "epoch": 1.5715599964223195, + "grad_norm": 1.026710033416748, + "learning_rate": 1.0931721574477255e-05, + "loss": 1.0643, + "step": 245990 + }, + { + "epoch": 1.5716238835720584, + "grad_norm": 1.5185635089874268, + "learning_rate": 1.0928590366501917e-05, + "loss": 1.0488, + "step": 246000 + }, + { + "epoch": 1.5716877707217969, + "grad_norm": 0.7404221892356873, + "learning_rate": 1.0925459552009486e-05, + "loss": 0.8142, + "step": 246010 + }, + { + "epoch": 1.5717516578715358, + "grad_norm": 0.7018494606018066, + "learning_rate": 1.0922329131031467e-05, + "loss": 1.1856, + "step": 246020 + }, + { + "epoch": 1.5718155450212743, + "grad_norm": 1.0612128973007202, + "learning_rate": 1.091919910359942e-05, + "loss": 0.5845, + "step": 246030 + }, + { + "epoch": 1.5718794321710132, + "grad_norm": 2.2911901473999023, + "learning_rate": 1.0916069469744827e-05, + "loss": 0.8647, + "step": 246040 + }, + { + "epoch": 1.5719433193207517, + "grad_norm": 1.1287060976028442, + "learning_rate": 
1.0912940229499247e-05, + "loss": 0.9265, + "step": 246050 + }, + { + "epoch": 1.5720072064704906, + "grad_norm": 0.6138035655021667, + "learning_rate": 1.0909811382894158e-05, + "loss": 0.8942, + "step": 246060 + }, + { + "epoch": 1.5720710936202291, + "grad_norm": 1.3819774389266968, + "learning_rate": 1.0906682929961099e-05, + "loss": 1.0637, + "step": 246070 + }, + { + "epoch": 1.572134980769968, + "grad_norm": 1.287611484527588, + "learning_rate": 1.0903554870731548e-05, + "loss": 0.849, + "step": 246080 + }, + { + "epoch": 1.5721988679197065, + "grad_norm": 0.7293438911437988, + "learning_rate": 1.090042720523704e-05, + "loss": 0.7931, + "step": 246090 + }, + { + "epoch": 1.5722627550694455, + "grad_norm": 1.9633628129959106, + "learning_rate": 1.0897299933509037e-05, + "loss": 0.9321, + "step": 246100 + }, + { + "epoch": 1.572326642219184, + "grad_norm": 0.5741513967514038, + "learning_rate": 1.089417305557907e-05, + "loss": 0.6795, + "step": 246110 + }, + { + "epoch": 1.5723905293689229, + "grad_norm": 1.1001640558242798, + "learning_rate": 1.0891046571478597e-05, + "loss": 1.11, + "step": 246120 + }, + { + "epoch": 1.5724544165186614, + "grad_norm": 1.022058129310608, + "learning_rate": 1.0887920481239122e-05, + "loss": 1.0803, + "step": 246130 + }, + { + "epoch": 1.5725183036684003, + "grad_norm": 1.7643972635269165, + "learning_rate": 1.0884794784892133e-05, + "loss": 0.8927, + "step": 246140 + }, + { + "epoch": 1.5725821908181388, + "grad_norm": 0.7340256571769714, + "learning_rate": 1.0881669482469092e-05, + "loss": 1.0397, + "step": 246150 + }, + { + "epoch": 1.5726460779678777, + "grad_norm": 1.4430502653121948, + "learning_rate": 1.0878544574001492e-05, + "loss": 0.8335, + "step": 246160 + }, + { + "epoch": 1.5727099651176162, + "grad_norm": 1.156911015510559, + "learning_rate": 1.087542005952078e-05, + "loss": 0.8916, + "step": 246170 + }, + { + "epoch": 1.572773852267355, + "grad_norm": 4.588479995727539, + "learning_rate": 1.0872295939058446e-05, 
+ "loss": 0.8711, + "step": 246180 + }, + { + "epoch": 1.5728377394170936, + "grad_norm": 1.0178959369659424, + "learning_rate": 1.0869172212645933e-05, + "loss": 0.8555, + "step": 246190 + }, + { + "epoch": 1.5729016265668323, + "grad_norm": 0.626876175403595, + "learning_rate": 1.0866048880314722e-05, + "loss": 0.8164, + "step": 246200 + }, + { + "epoch": 1.572965513716571, + "grad_norm": 1.4186381101608276, + "learning_rate": 1.0863238218182126e-05, + "loss": 0.7887, + "step": 246210 + }, + { + "epoch": 1.5730294008663097, + "grad_norm": 1.20530366897583, + "learning_rate": 1.0860115634692013e-05, + "loss": 1.002, + "step": 246220 + }, + { + "epoch": 1.5730932880160484, + "grad_norm": 1.250042200088501, + "learning_rate": 1.0856993445374386e-05, + "loss": 0.738, + "step": 246230 + }, + { + "epoch": 1.5731571751657871, + "grad_norm": 1.899093508720398, + "learning_rate": 1.0853871650260716e-05, + "loss": 0.9449, + "step": 246240 + }, + { + "epoch": 1.5732210623155258, + "grad_norm": 1.0699888467788696, + "learning_rate": 1.0850750249382408e-05, + "loss": 0.7348, + "step": 246250 + }, + { + "epoch": 1.5732849494652645, + "grad_norm": 0.7223033308982849, + "learning_rate": 1.0847629242770912e-05, + "loss": 0.9035, + "step": 246260 + }, + { + "epoch": 1.5733488366150032, + "grad_norm": 0.8356568813323975, + "learning_rate": 1.0844508630457678e-05, + "loss": 1.0011, + "step": 246270 + }, + { + "epoch": 1.573412723764742, + "grad_norm": 0.8588039875030518, + "learning_rate": 1.0841388412474101e-05, + "loss": 0.6855, + "step": 246280 + }, + { + "epoch": 1.5734766109144807, + "grad_norm": 0.5876787304878235, + "learning_rate": 1.083826858885163e-05, + "loss": 0.8988, + "step": 246290 + }, + { + "epoch": 1.5735404980642194, + "grad_norm": 0.9825795292854309, + "learning_rate": 1.0835149159621666e-05, + "loss": 0.8273, + "step": 246300 + }, + { + "epoch": 1.573604385213958, + "grad_norm": 1.5006439685821533, + "learning_rate": 1.0832030124815646e-05, + "loss": 0.7658, + 
"step": 246310 + }, + { + "epoch": 1.5736682723636968, + "grad_norm": 1.415667176246643, + "learning_rate": 1.0828911484464954e-05, + "loss": 1.0479, + "step": 246320 + }, + { + "epoch": 1.5737321595134355, + "grad_norm": 1.1721471548080444, + "learning_rate": 1.0825793238601017e-05, + "loss": 0.8538, + "step": 246330 + }, + { + "epoch": 1.5737960466631742, + "grad_norm": 1.0684322118759155, + "learning_rate": 1.0822675387255227e-05, + "loss": 0.7542, + "step": 246340 + }, + { + "epoch": 1.573859933812913, + "grad_norm": 0.8883429169654846, + "learning_rate": 1.0819557930459e-05, + "loss": 1.0311, + "step": 246350 + }, + { + "epoch": 1.5739238209626516, + "grad_norm": 0.8817760944366455, + "learning_rate": 1.081644086824371e-05, + "loss": 1.0911, + "step": 246360 + }, + { + "epoch": 1.5739877081123903, + "grad_norm": 0.7657302021980286, + "learning_rate": 1.0813324200640768e-05, + "loss": 0.9857, + "step": 246370 + }, + { + "epoch": 1.574051595262129, + "grad_norm": 1.0451817512512207, + "learning_rate": 1.0810207927681542e-05, + "loss": 0.8334, + "step": 246380 + }, + { + "epoch": 1.5741154824118677, + "grad_norm": 1.2856969833374023, + "learning_rate": 1.0807092049397439e-05, + "loss": 0.8962, + "step": 246390 + }, + { + "epoch": 1.5741793695616064, + "grad_norm": 0.7358013987541199, + "learning_rate": 1.0803976565819813e-05, + "loss": 0.8894, + "step": 246400 + }, + { + "epoch": 1.5742432567113451, + "grad_norm": 1.019336462020874, + "learning_rate": 1.0800861476980067e-05, + "loss": 1.1504, + "step": 246410 + }, + { + "epoch": 1.5743071438610838, + "grad_norm": 0.748558521270752, + "learning_rate": 1.079774678290954e-05, + "loss": 0.7269, + "step": 246420 + }, + { + "epoch": 1.5743710310108225, + "grad_norm": 0.6980547308921814, + "learning_rate": 1.0794632483639634e-05, + "loss": 0.8777, + "step": 246430 + }, + { + "epoch": 1.574434918160561, + "grad_norm": 1.1652023792266846, + "learning_rate": 1.0791518579201688e-05, + "loss": 0.8815, + "step": 246440 + }, + 
{ + "epoch": 1.5744988053103, + "grad_norm": 0.9284915328025818, + "learning_rate": 1.0788405069627072e-05, + "loss": 0.717, + "step": 246450 + }, + { + "epoch": 1.5745626924600384, + "grad_norm": 1.090552806854248, + "learning_rate": 1.0785291954947135e-05, + "loss": 0.8092, + "step": 246460 + }, + { + "epoch": 1.5746265796097774, + "grad_norm": 1.166859745979309, + "learning_rate": 1.0782179235193229e-05, + "loss": 0.6876, + "step": 246470 + }, + { + "epoch": 1.5746904667595159, + "grad_norm": 0.8611056804656982, + "learning_rate": 1.0779066910396724e-05, + "loss": 0.9592, + "step": 246480 + }, + { + "epoch": 1.5747543539092548, + "grad_norm": 0.9985960721969604, + "learning_rate": 1.0775954980588932e-05, + "loss": 0.7917, + "step": 246490 + }, + { + "epoch": 1.5748182410589933, + "grad_norm": 0.5808977484703064, + "learning_rate": 1.0772843445801216e-05, + "loss": 0.7458, + "step": 246500 + }, + { + "epoch": 1.5748821282087322, + "grad_norm": 1.3468122482299805, + "learning_rate": 1.0769732306064895e-05, + "loss": 0.8343, + "step": 246510 + }, + { + "epoch": 1.5749460153584707, + "grad_norm": 0.7967391610145569, + "learning_rate": 1.0766621561411317e-05, + "loss": 1.0195, + "step": 246520 + }, + { + "epoch": 1.5750099025082096, + "grad_norm": 1.3037304878234863, + "learning_rate": 1.0763511211871791e-05, + "loss": 0.8428, + "step": 246530 + }, + { + "epoch": 1.575073789657948, + "grad_norm": 0.7973846197128296, + "learning_rate": 1.0760401257477664e-05, + "loss": 0.8917, + "step": 246540 + }, + { + "epoch": 1.575137676807687, + "grad_norm": 0.8173840641975403, + "learning_rate": 1.0757291698260246e-05, + "loss": 0.822, + "step": 246550 + }, + { + "epoch": 1.5752015639574255, + "grad_norm": 0.8736957311630249, + "learning_rate": 1.0754182534250851e-05, + "loss": 0.9213, + "step": 246560 + }, + { + "epoch": 1.5752654511071644, + "grad_norm": 0.9506877064704895, + "learning_rate": 1.0751073765480773e-05, + "loss": 0.9554, + "step": 246570 + }, + { + "epoch": 
1.575329338256903, + "grad_norm": 1.5564134120941162, + "learning_rate": 1.074796539198134e-05, + "loss": 0.8712, + "step": 246580 + }, + { + "epoch": 1.5753932254066418, + "grad_norm": 0.5361325144767761, + "learning_rate": 1.0744857413783865e-05, + "loss": 0.5999, + "step": 246590 + }, + { + "epoch": 1.5754571125563803, + "grad_norm": 0.8528962731361389, + "learning_rate": 1.0741749830919623e-05, + "loss": 0.8792, + "step": 246600 + }, + { + "epoch": 1.5755209997061193, + "grad_norm": 0.9711998701095581, + "learning_rate": 1.0738642643419938e-05, + "loss": 0.9086, + "step": 246610 + }, + { + "epoch": 1.5755848868558577, + "grad_norm": 0.8326975107192993, + "learning_rate": 1.073553585131607e-05, + "loss": 0.7687, + "step": 246620 + }, + { + "epoch": 1.5756487740055967, + "grad_norm": 0.7956464290618896, + "learning_rate": 1.0732429454639342e-05, + "loss": 0.6987, + "step": 246630 + }, + { + "epoch": 1.5757126611553351, + "grad_norm": 1.4720463752746582, + "learning_rate": 1.0729323453421008e-05, + "loss": 0.7809, + "step": 246640 + }, + { + "epoch": 1.575776548305074, + "grad_norm": 1.258072018623352, + "learning_rate": 1.0726217847692371e-05, + "loss": 0.9506, + "step": 246650 + }, + { + "epoch": 1.5758404354548126, + "grad_norm": 0.9584420323371887, + "learning_rate": 1.0723112637484684e-05, + "loss": 0.7599, + "step": 246660 + }, + { + "epoch": 1.5759043226045513, + "grad_norm": 1.1683968305587769, + "learning_rate": 1.0720007822829248e-05, + "loss": 0.828, + "step": 246670 + }, + { + "epoch": 1.57596820975429, + "grad_norm": 0.9970899820327759, + "learning_rate": 1.07169034037573e-05, + "loss": 0.8159, + "step": 246680 + }, + { + "epoch": 1.5760320969040287, + "grad_norm": 0.9691714644432068, + "learning_rate": 1.0713799380300132e-05, + "loss": 0.8154, + "step": 246690 + }, + { + "epoch": 1.5760959840537674, + "grad_norm": 1.2461732625961304, + "learning_rate": 1.071069575248898e-05, + "loss": 0.9882, + "step": 246700 + }, + { + "epoch": 1.576159871203506, + 
"grad_norm": 0.8372601270675659, + "learning_rate": 1.0707592520355125e-05, + "loss": 0.84, + "step": 246710 + }, + { + "epoch": 1.5762237583532448, + "grad_norm": 0.7840808033943176, + "learning_rate": 1.0704489683929796e-05, + "loss": 0.8733, + "step": 246720 + }, + { + "epoch": 1.5762876455029835, + "grad_norm": 1.0950987339019775, + "learning_rate": 1.0701387243244259e-05, + "loss": 0.9205, + "step": 246730 + }, + { + "epoch": 1.5763515326527222, + "grad_norm": 1.576825737953186, + "learning_rate": 1.0698285198329744e-05, + "loss": 0.8162, + "step": 246740 + }, + { + "epoch": 1.576415419802461, + "grad_norm": 1.297703742980957, + "learning_rate": 1.0695183549217502e-05, + "loss": 0.7828, + "step": 246750 + }, + { + "epoch": 1.5764793069521996, + "grad_norm": 0.9645804762840271, + "learning_rate": 1.069208229593876e-05, + "loss": 0.9203, + "step": 246760 + }, + { + "epoch": 1.5765431941019383, + "grad_norm": 0.9418362975120544, + "learning_rate": 1.0688981438524764e-05, + "loss": 0.9594, + "step": 246770 + }, + { + "epoch": 1.576607081251677, + "grad_norm": 1.2878425121307373, + "learning_rate": 1.0685880977006723e-05, + "loss": 0.8661, + "step": 246780 + }, + { + "epoch": 1.5766709684014157, + "grad_norm": 1.1463112831115723, + "learning_rate": 1.0682780911415868e-05, + "loss": 0.8811, + "step": 246790 + }, + { + "epoch": 1.5767348555511544, + "grad_norm": 0.8709837794303894, + "learning_rate": 1.067968124178344e-05, + "loss": 0.9157, + "step": 246800 + }, + { + "epoch": 1.5767987427008932, + "grad_norm": 2.789950132369995, + "learning_rate": 1.0676581968140625e-05, + "loss": 0.8866, + "step": 246810 + }, + { + "epoch": 1.5768626298506319, + "grad_norm": 0.8597791790962219, + "learning_rate": 1.0673483090518666e-05, + "loss": 0.7851, + "step": 246820 + }, + { + "epoch": 1.5769265170003706, + "grad_norm": 1.0854740142822266, + "learning_rate": 1.0670384608948737e-05, + "loss": 1.0849, + "step": 246830 + }, + { + "epoch": 1.5769904041501093, + "grad_norm": 
1.0474879741668701, + "learning_rate": 1.0667286523462072e-05, + "loss": 0.826, + "step": 246840 + }, + { + "epoch": 1.577054291299848, + "grad_norm": 0.9019538164138794, + "learning_rate": 1.066418883408985e-05, + "loss": 0.8243, + "step": 246850 + }, + { + "epoch": 1.5771181784495867, + "grad_norm": 0.8639311790466309, + "learning_rate": 1.0661091540863289e-05, + "loss": 0.7818, + "step": 246860 + }, + { + "epoch": 1.5771820655993254, + "grad_norm": 1.1420975923538208, + "learning_rate": 1.0657994643813557e-05, + "loss": 1.1253, + "step": 246870 + }, + { + "epoch": 1.577245952749064, + "grad_norm": 1.1920766830444336, + "learning_rate": 1.0654898142971865e-05, + "loss": 0.9389, + "step": 246880 + }, + { + "epoch": 1.5773098398988028, + "grad_norm": 1.6129494905471802, + "learning_rate": 1.0651802038369373e-05, + "loss": 0.8958, + "step": 246890 + }, + { + "epoch": 1.5773737270485415, + "grad_norm": 0.7555667757987976, + "learning_rate": 1.0648706330037289e-05, + "loss": 0.8478, + "step": 246900 + }, + { + "epoch": 1.57743761419828, + "grad_norm": 0.8667502403259277, + "learning_rate": 1.0645611018006769e-05, + "loss": 0.7366, + "step": 246910 + }, + { + "epoch": 1.577501501348019, + "grad_norm": 1.4878448247909546, + "learning_rate": 1.0642516102309002e-05, + "loss": 0.8595, + "step": 246920 + }, + { + "epoch": 1.5775653884977574, + "grad_norm": 0.6253286600112915, + "learning_rate": 1.0639421582975128e-05, + "loss": 0.8944, + "step": 246930 + }, + { + "epoch": 1.5776292756474963, + "grad_norm": 0.9675326943397522, + "learning_rate": 1.0636327460036349e-05, + "loss": 0.681, + "step": 246940 + }, + { + "epoch": 1.5776931627972348, + "grad_norm": 0.7513461112976074, + "learning_rate": 1.0633233733523795e-05, + "loss": 1.0764, + "step": 246950 + }, + { + "epoch": 1.5777570499469737, + "grad_norm": 0.71452796459198, + "learning_rate": 1.0630140403468647e-05, + "loss": 0.8061, + "step": 246960 + }, + { + "epoch": 1.5778209370967122, + "grad_norm": 0.8605654835700989, 
+ "learning_rate": 1.062704746990203e-05, + "loss": 0.7689, + "step": 246970 + }, + { + "epoch": 1.5778848242464512, + "grad_norm": 0.9058616757392883, + "learning_rate": 1.0623954932855108e-05, + "loss": 0.8225, + "step": 246980 + }, + { + "epoch": 1.5779487113961896, + "grad_norm": 0.860543429851532, + "learning_rate": 1.0620862792359037e-05, + "loss": 0.8649, + "step": 246990 + }, + { + "epoch": 1.5780125985459286, + "grad_norm": 0.8707665205001831, + "learning_rate": 1.0617771048444936e-05, + "loss": 0.7123, + "step": 247000 + }, + { + "epoch": 1.578076485695667, + "grad_norm": 1.5746833086013794, + "learning_rate": 1.061467970114396e-05, + "loss": 0.8932, + "step": 247010 + }, + { + "epoch": 1.578140372845406, + "grad_norm": 0.792719304561615, + "learning_rate": 1.0611588750487223e-05, + "loss": 0.8443, + "step": 247020 + }, + { + "epoch": 1.5782042599951445, + "grad_norm": 1.7057514190673828, + "learning_rate": 1.0608498196505873e-05, + "loss": 0.9104, + "step": 247030 + }, + { + "epoch": 1.5782681471448834, + "grad_norm": 1.1587259769439697, + "learning_rate": 1.0605408039231024e-05, + "loss": 0.7765, + "step": 247040 + }, + { + "epoch": 1.5783320342946219, + "grad_norm": 0.7321895360946655, + "learning_rate": 1.0602318278693802e-05, + "loss": 0.9205, + "step": 247050 + }, + { + "epoch": 1.5783959214443608, + "grad_norm": 0.8459962010383606, + "learning_rate": 1.0599228914925303e-05, + "loss": 0.9146, + "step": 247060 + }, + { + "epoch": 1.5784598085940993, + "grad_norm": 0.8274263143539429, + "learning_rate": 1.0596139947956669e-05, + "loss": 0.8276, + "step": 247070 + }, + { + "epoch": 1.5785236957438382, + "grad_norm": 0.9039373397827148, + "learning_rate": 1.059305137781898e-05, + "loss": 0.9089, + "step": 247080 + }, + { + "epoch": 1.5785875828935767, + "grad_norm": 1.0288807153701782, + "learning_rate": 1.0589963204543368e-05, + "loss": 0.7714, + "step": 247090 + }, + { + "epoch": 1.5786514700433156, + "grad_norm": 0.9122907519340515, + 
"learning_rate": 1.0586875428160908e-05, + "loss": 0.5767, + "step": 247100 + }, + { + "epoch": 1.5787153571930541, + "grad_norm": 1.0185425281524658, + "learning_rate": 1.0583788048702715e-05, + "loss": 0.9802, + "step": 247110 + }, + { + "epoch": 1.578779244342793, + "grad_norm": 0.9830597043037415, + "learning_rate": 1.0580701066199883e-05, + "loss": 0.9354, + "step": 247120 + }, + { + "epoch": 1.5788431314925315, + "grad_norm": 0.9498816132545471, + "learning_rate": 1.0577614480683485e-05, + "loss": 0.9159, + "step": 247130 + }, + { + "epoch": 1.5789070186422705, + "grad_norm": 0.9181402921676636, + "learning_rate": 1.0574528292184622e-05, + "loss": 0.7249, + "step": 247140 + }, + { + "epoch": 1.578970905792009, + "grad_norm": 1.1932886838912964, + "learning_rate": 1.0571442500734357e-05, + "loss": 0.8719, + "step": 247150 + }, + { + "epoch": 1.5790347929417476, + "grad_norm": 0.7471179962158203, + "learning_rate": 1.0568357106363792e-05, + "loss": 0.8601, + "step": 247160 + }, + { + "epoch": 1.5790986800914864, + "grad_norm": 0.9473945498466492, + "learning_rate": 1.0565272109103968e-05, + "loss": 1.0133, + "step": 247170 + }, + { + "epoch": 1.579162567241225, + "grad_norm": 1.2394994497299194, + "learning_rate": 1.0562187508985987e-05, + "loss": 0.8834, + "step": 247180 + }, + { + "epoch": 1.5792264543909638, + "grad_norm": 1.355468988418579, + "learning_rate": 1.055910330604088e-05, + "loss": 0.8369, + "step": 247190 + }, + { + "epoch": 1.5792903415407025, + "grad_norm": 0.8456324338912964, + "learning_rate": 1.0556019500299735e-05, + "loss": 0.7767, + "step": 247200 + }, + { + "epoch": 1.5793542286904412, + "grad_norm": 0.6504706740379333, + "learning_rate": 1.0552936091793591e-05, + "loss": 0.8129, + "step": 247210 + }, + { + "epoch": 1.5794181158401799, + "grad_norm": 1.120896339416504, + "learning_rate": 1.0549853080553513e-05, + "loss": 0.8561, + "step": 247220 + }, + { + "epoch": 1.5794820029899186, + "grad_norm": 0.928348958492279, + "learning_rate": 
1.0546770466610533e-05, + "loss": 1.0094, + "step": 247230 + }, + { + "epoch": 1.5795458901396573, + "grad_norm": 1.2079272270202637, + "learning_rate": 1.054368824999572e-05, + "loss": 0.8225, + "step": 247240 + }, + { + "epoch": 1.579609777289396, + "grad_norm": 1.0350985527038574, + "learning_rate": 1.0540606430740091e-05, + "loss": 1.0844, + "step": 247250 + }, + { + "epoch": 1.5796736644391347, + "grad_norm": 0.8831034302711487, + "learning_rate": 1.0537525008874705e-05, + "loss": 0.9471, + "step": 247260 + }, + { + "epoch": 1.5797375515888734, + "grad_norm": 0.7610118389129639, + "learning_rate": 1.0534443984430564e-05, + "loss": 1.0177, + "step": 247270 + }, + { + "epoch": 1.5798014387386121, + "grad_norm": 1.1378825902938843, + "learning_rate": 1.0531363357438728e-05, + "loss": 1.037, + "step": 247280 + }, + { + "epoch": 1.5798653258883508, + "grad_norm": 1.1919612884521484, + "learning_rate": 1.0528283127930199e-05, + "loss": 0.7967, + "step": 247290 + }, + { + "epoch": 1.5799292130380895, + "grad_norm": 0.8108116388320923, + "learning_rate": 1.0525203295936004e-05, + "loss": 0.86, + "step": 247300 + }, + { + "epoch": 1.5799931001878282, + "grad_norm": 1.2976237535476685, + "learning_rate": 1.0522123861487177e-05, + "loss": 0.7617, + "step": 247310 + }, + { + "epoch": 1.580056987337567, + "grad_norm": 1.0517657995224, + "learning_rate": 1.0519044824614705e-05, + "loss": 0.9928, + "step": 247320 + }, + { + "epoch": 1.5801208744873056, + "grad_norm": 1.2619192600250244, + "learning_rate": 1.0515966185349612e-05, + "loss": 0.9209, + "step": 247330 + }, + { + "epoch": 1.5801847616370444, + "grad_norm": 0.7939496040344238, + "learning_rate": 1.0512887943722893e-05, + "loss": 0.7354, + "step": 247340 + }, + { + "epoch": 1.580248648786783, + "grad_norm": 1.7504665851593018, + "learning_rate": 1.0509810099765566e-05, + "loss": 0.9486, + "step": 247350 + }, + { + "epoch": 1.5803125359365218, + "grad_norm": 0.9001903533935547, + "learning_rate": 
1.05067326535086e-05, + "loss": 1.0617, + "step": 247360 + }, + { + "epoch": 1.5803764230862605, + "grad_norm": 1.1803821325302124, + "learning_rate": 1.0503655604983021e-05, + "loss": 1.0478, + "step": 247370 + }, + { + "epoch": 1.5804403102359992, + "grad_norm": 0.8175578117370605, + "learning_rate": 1.050057895421978e-05, + "loss": 0.9052, + "step": 247380 + }, + { + "epoch": 1.5805041973857379, + "grad_norm": 0.7418546080589294, + "learning_rate": 1.0497502701249894e-05, + "loss": 0.7204, + "step": 247390 + }, + { + "epoch": 1.5805680845354764, + "grad_norm": 0.9057771563529968, + "learning_rate": 1.0494426846104321e-05, + "loss": 0.6815, + "step": 247400 + }, + { + "epoch": 1.5806319716852153, + "grad_norm": 1.755724549293518, + "learning_rate": 1.0491351388814057e-05, + "loss": 0.8857, + "step": 247410 + }, + { + "epoch": 1.5806958588349538, + "grad_norm": 1.293442964553833, + "learning_rate": 1.0488276329410051e-05, + "loss": 0.7786, + "step": 247420 + }, + { + "epoch": 1.5807597459846927, + "grad_norm": 1.0106276273727417, + "learning_rate": 1.0485201667923301e-05, + "loss": 1.021, + "step": 247430 + }, + { + "epoch": 1.5808236331344312, + "grad_norm": 1.364980936050415, + "learning_rate": 1.048212740438474e-05, + "loss": 0.8366, + "step": 247440 + }, + { + "epoch": 1.5808875202841701, + "grad_norm": 1.1519886255264282, + "learning_rate": 1.0479053538825357e-05, + "loss": 0.6659, + "step": 247450 + }, + { + "epoch": 1.5809514074339086, + "grad_norm": 0.8733057379722595, + "learning_rate": 1.047598007127608e-05, + "loss": 0.8836, + "step": 247460 + }, + { + "epoch": 1.5810152945836475, + "grad_norm": 0.9156480431556702, + "learning_rate": 1.0472907001767895e-05, + "loss": 1.1012, + "step": 247470 + }, + { + "epoch": 1.581079181733386, + "grad_norm": 0.7186042070388794, + "learning_rate": 1.0469834330331718e-05, + "loss": 0.6984, + "step": 247480 + }, + { + "epoch": 1.581143068883125, + "grad_norm": 1.4364821910858154, + "learning_rate": 
1.046676205699852e-05, + "loss": 0.8082, + "step": 247490 + }, + { + "epoch": 1.5812069560328634, + "grad_norm": 1.1890616416931152, + "learning_rate": 1.0463690181799212e-05, + "loss": 0.8385, + "step": 247500 + }, + { + "epoch": 1.5812708431826024, + "grad_norm": 1.5968211889266968, + "learning_rate": 1.0460618704764752e-05, + "loss": 1.0133, + "step": 247510 + }, + { + "epoch": 1.5813347303323408, + "grad_norm": 0.9693719744682312, + "learning_rate": 1.0457547625926096e-05, + "loss": 0.7946, + "step": 247520 + }, + { + "epoch": 1.5813986174820798, + "grad_norm": 1.1753880977630615, + "learning_rate": 1.0454476945314113e-05, + "loss": 0.878, + "step": 247530 + }, + { + "epoch": 1.5814625046318183, + "grad_norm": 1.0377428531646729, + "learning_rate": 1.0451406662959778e-05, + "loss": 0.8611, + "step": 247540 + }, + { + "epoch": 1.5815263917815572, + "grad_norm": 1.8484477996826172, + "learning_rate": 1.0448336778893974e-05, + "loss": 0.9474, + "step": 247550 + }, + { + "epoch": 1.5815902789312957, + "grad_norm": 0.8086001873016357, + "learning_rate": 1.0445267293147654e-05, + "loss": 0.7801, + "step": 247560 + }, + { + "epoch": 1.5816541660810346, + "grad_norm": 0.9309337139129639, + "learning_rate": 1.0442198205751692e-05, + "loss": 0.9321, + "step": 247570 + }, + { + "epoch": 1.581718053230773, + "grad_norm": 0.783740758895874, + "learning_rate": 1.0439129516737034e-05, + "loss": 0.8634, + "step": 247580 + }, + { + "epoch": 1.581781940380512, + "grad_norm": 0.7251185178756714, + "learning_rate": 1.0436061226134553e-05, + "loss": 0.8588, + "step": 247590 + }, + { + "epoch": 1.5818458275302505, + "grad_norm": 0.8295360803604126, + "learning_rate": 1.0432993333975176e-05, + "loss": 0.9542, + "step": 247600 + }, + { + "epoch": 1.5819097146799894, + "grad_norm": 0.7831049561500549, + "learning_rate": 1.0429925840289772e-05, + "loss": 0.7736, + "step": 247610 + }, + { + "epoch": 1.581973601829728, + "grad_norm": 0.6984795928001404, + "learning_rate": 
1.0426858745109263e-05, + "loss": 1.1484, + "step": 247620 + }, + { + "epoch": 1.5820374889794668, + "grad_norm": 1.2676637172698975, + "learning_rate": 1.042379204846451e-05, + "loss": 1.0692, + "step": 247630 + }, + { + "epoch": 1.5821013761292053, + "grad_norm": 0.7221893072128296, + "learning_rate": 1.0420725750386407e-05, + "loss": 0.6549, + "step": 247640 + }, + { + "epoch": 1.582165263278944, + "grad_norm": 1.0046474933624268, + "learning_rate": 1.0417659850905853e-05, + "loss": 0.8093, + "step": 247650 + }, + { + "epoch": 1.5822291504286827, + "grad_norm": 0.6917647123336792, + "learning_rate": 1.0414594350053691e-05, + "loss": 0.8005, + "step": 247660 + }, + { + "epoch": 1.5822930375784214, + "grad_norm": 1.201385259628296, + "learning_rate": 1.0411529247860824e-05, + "loss": 0.98, + "step": 247670 + }, + { + "epoch": 1.5823569247281601, + "grad_norm": 0.5854590535163879, + "learning_rate": 1.0408464544358094e-05, + "loss": 1.0863, + "step": 247680 + }, + { + "epoch": 1.5824208118778988, + "grad_norm": 0.9084241986274719, + "learning_rate": 1.040540023957639e-05, + "loss": 0.8678, + "step": 247690 + }, + { + "epoch": 1.5824846990276376, + "grad_norm": 0.6541767716407776, + "learning_rate": 1.0402336333546547e-05, + "loss": 0.9901, + "step": 247700 + }, + { + "epoch": 1.5825485861773763, + "grad_norm": 1.1705743074417114, + "learning_rate": 1.039927282629945e-05, + "loss": 1.1037, + "step": 247710 + }, + { + "epoch": 1.582612473327115, + "grad_norm": 0.9867904782295227, + "learning_rate": 1.0396209717865918e-05, + "loss": 1.0031, + "step": 247720 + }, + { + "epoch": 1.5826763604768537, + "grad_norm": 0.9883776903152466, + "learning_rate": 1.0393147008276832e-05, + "loss": 0.9994, + "step": 247730 + }, + { + "epoch": 1.5827402476265924, + "grad_norm": 1.223418116569519, + "learning_rate": 1.0390084697563008e-05, + "loss": 0.9183, + "step": 247740 + }, + { + "epoch": 1.582804134776331, + "grad_norm": 1.9032589197158813, + "learning_rate": 
1.0387022785755307e-05, + "loss": 0.8064, + "step": 247750 + }, + { + "epoch": 1.5828680219260698, + "grad_norm": 0.8207881450653076, + "learning_rate": 1.0383961272884546e-05, + "loss": 0.8457, + "step": 247760 + }, + { + "epoch": 1.5829319090758085, + "grad_norm": 1.2639472484588623, + "learning_rate": 1.0380900158981583e-05, + "loss": 0.8783, + "step": 247770 + }, + { + "epoch": 1.5829957962255472, + "grad_norm": 0.9134830832481384, + "learning_rate": 1.0377839444077215e-05, + "loss": 0.7592, + "step": 247780 + }, + { + "epoch": 1.583059683375286, + "grad_norm": 1.3179031610488892, + "learning_rate": 1.0374779128202294e-05, + "loss": 0.8649, + "step": 247790 + }, + { + "epoch": 1.5831235705250246, + "grad_norm": 0.8579550981521606, + "learning_rate": 1.037171921138762e-05, + "loss": 1.1334, + "step": 247800 + }, + { + "epoch": 1.5831874576747633, + "grad_norm": 0.6448858976364136, + "learning_rate": 1.0368659693664023e-05, + "loss": 0.9309, + "step": 247810 + }, + { + "epoch": 1.583251344824502, + "grad_norm": 0.9539541602134705, + "learning_rate": 1.03656005750623e-05, + "loss": 0.8983, + "step": 247820 + }, + { + "epoch": 1.5833152319742407, + "grad_norm": 1.5543828010559082, + "learning_rate": 1.0362541855613267e-05, + "loss": 0.9587, + "step": 247830 + }, + { + "epoch": 1.5833791191239794, + "grad_norm": 1.0303435325622559, + "learning_rate": 1.0359483535347742e-05, + "loss": 1.2866, + "step": 247840 + }, + { + "epoch": 1.5834430062737181, + "grad_norm": 1.1230798959732056, + "learning_rate": 1.03564256142965e-05, + "loss": 0.8112, + "step": 247850 + }, + { + "epoch": 1.5835068934234569, + "grad_norm": 1.0403016805648804, + "learning_rate": 1.0353368092490362e-05, + "loss": 0.7441, + "step": 247860 + }, + { + "epoch": 1.5835707805731956, + "grad_norm": 1.1610591411590576, + "learning_rate": 1.0350310969960092e-05, + "loss": 0.8292, + "step": 247870 + }, + { + "epoch": 1.5836346677229343, + "grad_norm": 0.8638874292373657, + "learning_rate": 
1.0347254246736504e-05, + "loss": 1.0327, + "step": 247880 + }, + { + "epoch": 1.5836985548726727, + "grad_norm": 0.9477463364601135, + "learning_rate": 1.0344197922850357e-05, + "loss": 0.9738, + "step": 247890 + }, + { + "epoch": 1.5837624420224117, + "grad_norm": 0.8979694247245789, + "learning_rate": 1.034114199833246e-05, + "loss": 0.8999, + "step": 247900 + }, + { + "epoch": 1.5838263291721502, + "grad_norm": 3.237966299057007, + "learning_rate": 1.033808647321356e-05, + "loss": 0.7072, + "step": 247910 + }, + { + "epoch": 1.583890216321889, + "grad_norm": 1.0422779321670532, + "learning_rate": 1.0335031347524454e-05, + "loss": 0.6422, + "step": 247920 + }, + { + "epoch": 1.5839541034716276, + "grad_norm": 0.9260254502296448, + "learning_rate": 1.033197662129588e-05, + "loss": 0.9527, + "step": 247930 + }, + { + "epoch": 1.5840179906213665, + "grad_norm": 3.0579593181610107, + "learning_rate": 1.032892229455864e-05, + "loss": 1.086, + "step": 247940 + }, + { + "epoch": 1.584081877771105, + "grad_norm": 1.0561027526855469, + "learning_rate": 1.0325868367343456e-05, + "loss": 0.8016, + "step": 247950 + }, + { + "epoch": 1.584145764920844, + "grad_norm": 0.6812159419059753, + "learning_rate": 1.032281483968111e-05, + "loss": 0.71, + "step": 247960 + }, + { + "epoch": 1.5842096520705824, + "grad_norm": 1.056764841079712, + "learning_rate": 1.0319761711602332e-05, + "loss": 1.2764, + "step": 247970 + }, + { + "epoch": 1.5842735392203213, + "grad_norm": 0.7221099734306335, + "learning_rate": 1.0316708983137902e-05, + "loss": 0.8135, + "step": 247980 + }, + { + "epoch": 1.5843374263700598, + "grad_norm": 0.838124692440033, + "learning_rate": 1.031365665431852e-05, + "loss": 1.0928, + "step": 247990 + }, + { + "epoch": 1.5844013135197987, + "grad_norm": 0.7449299693107605, + "learning_rate": 1.0310604725174971e-05, + "loss": 0.9119, + "step": 248000 + }, + { + "epoch": 1.5844652006695372, + "grad_norm": 1.1632795333862305, + "learning_rate": 1.0307553195737968e-05, + 
"loss": 0.841, + "step": 248010 + }, + { + "epoch": 1.5845290878192761, + "grad_norm": 1.0129088163375854, + "learning_rate": 1.0304502066038224e-05, + "loss": 0.9286, + "step": 248020 + }, + { + "epoch": 1.5845929749690146, + "grad_norm": 0.9895704388618469, + "learning_rate": 1.0301451336106504e-05, + "loss": 1.1631, + "step": 248030 + }, + { + "epoch": 1.5846568621187536, + "grad_norm": 1.057637095451355, + "learning_rate": 1.0298401005973502e-05, + "loss": 0.79, + "step": 248040 + }, + { + "epoch": 1.584720749268492, + "grad_norm": 1.1140066385269165, + "learning_rate": 1.0295351075669957e-05, + "loss": 0.5707, + "step": 248050 + }, + { + "epoch": 1.584784636418231, + "grad_norm": 1.073408842086792, + "learning_rate": 1.0292301545226562e-05, + "loss": 0.8806, + "step": 248060 + }, + { + "epoch": 1.5848485235679695, + "grad_norm": 0.9552643299102783, + "learning_rate": 1.0289252414674055e-05, + "loss": 0.9322, + "step": 248070 + }, + { + "epoch": 1.5849124107177084, + "grad_norm": 0.6368883848190308, + "learning_rate": 1.0286203684043122e-05, + "loss": 0.9138, + "step": 248080 + }, + { + "epoch": 1.5849762978674469, + "grad_norm": 1.0097981691360474, + "learning_rate": 1.0283155353364487e-05, + "loss": 0.8904, + "step": 248090 + }, + { + "epoch": 1.5850401850171858, + "grad_norm": 0.7692947387695312, + "learning_rate": 1.0280107422668822e-05, + "loss": 0.97, + "step": 248100 + }, + { + "epoch": 1.5851040721669243, + "grad_norm": 0.7867854237556458, + "learning_rate": 1.0277059891986856e-05, + "loss": 0.778, + "step": 248110 + }, + { + "epoch": 1.5851679593166632, + "grad_norm": 0.7965070009231567, + "learning_rate": 1.0274012761349244e-05, + "loss": 0.8268, + "step": 248120 + }, + { + "epoch": 1.5852318464664017, + "grad_norm": 1.3041713237762451, + "learning_rate": 1.0270966030786711e-05, + "loss": 1.1806, + "step": 248130 + }, + { + "epoch": 1.5852957336161404, + "grad_norm": 0.7487338781356812, + "learning_rate": 1.0267919700329903e-05, + "loss": 0.8568, + 
"step": 248140 + }, + { + "epoch": 1.585359620765879, + "grad_norm": 5.159639358520508, + "learning_rate": 1.0264873770009514e-05, + "loss": 0.8124, + "step": 248150 + }, + { + "epoch": 1.5854235079156178, + "grad_norm": 0.80680251121521, + "learning_rate": 1.0261828239856241e-05, + "loss": 0.8401, + "step": 248160 + }, + { + "epoch": 1.5854873950653565, + "grad_norm": 1.3771222829818726, + "learning_rate": 1.0258783109900717e-05, + "loss": 0.857, + "step": 248170 + }, + { + "epoch": 1.5855512822150952, + "grad_norm": 0.7607705593109131, + "learning_rate": 1.0255738380173647e-05, + "loss": 1.0143, + "step": 248180 + }, + { + "epoch": 1.585615169364834, + "grad_norm": 0.9139487147331238, + "learning_rate": 1.0252694050705658e-05, + "loss": 0.888, + "step": 248190 + }, + { + "epoch": 1.5856790565145726, + "grad_norm": 1.1925636529922485, + "learning_rate": 1.0249650121527443e-05, + "loss": 0.8567, + "step": 248200 + }, + { + "epoch": 1.5857429436643113, + "grad_norm": 1.0441328287124634, + "learning_rate": 1.0246606592669622e-05, + "loss": 0.9031, + "step": 248210 + }, + { + "epoch": 1.58580683081405, + "grad_norm": 0.8782601356506348, + "learning_rate": 1.0243563464162881e-05, + "loss": 1.0137, + "step": 248220 + }, + { + "epoch": 1.5858707179637888, + "grad_norm": 1.2816804647445679, + "learning_rate": 1.0240520736037834e-05, + "loss": 0.8509, + "step": 248230 + }, + { + "epoch": 1.5859346051135275, + "grad_norm": 0.9288935661315918, + "learning_rate": 1.0237478408325158e-05, + "loss": 0.9061, + "step": 248240 + }, + { + "epoch": 1.5859984922632662, + "grad_norm": 0.8910688757896423, + "learning_rate": 1.0234436481055454e-05, + "loss": 0.9472, + "step": 248250 + }, + { + "epoch": 1.5860623794130049, + "grad_norm": 0.9356805682182312, + "learning_rate": 1.0231394954259394e-05, + "loss": 0.9874, + "step": 248260 + }, + { + "epoch": 1.5861262665627436, + "grad_norm": 1.12856924533844, + "learning_rate": 1.0228353827967579e-05, + "loss": 0.9618, + "step": 248270 + }, + 
{ + "epoch": 1.5861901537124823, + "grad_norm": 1.6523456573486328, + "learning_rate": 1.022531310221066e-05, + "loss": 0.881, + "step": 248280 + }, + { + "epoch": 1.586254040862221, + "grad_norm": 1.1004304885864258, + "learning_rate": 1.0222272777019237e-05, + "loss": 0.8419, + "step": 248290 + }, + { + "epoch": 1.5863179280119597, + "grad_norm": 0.7944563031196594, + "learning_rate": 1.0219232852423949e-05, + "loss": 0.7764, + "step": 248300 + }, + { + "epoch": 1.5863818151616984, + "grad_norm": 1.085784673690796, + "learning_rate": 1.0216193328455392e-05, + "loss": 0.9209, + "step": 248310 + }, + { + "epoch": 1.5864457023114371, + "grad_norm": 0.8792476654052734, + "learning_rate": 1.0213154205144199e-05, + "loss": 0.8064, + "step": 248320 + }, + { + "epoch": 1.5865095894611758, + "grad_norm": 0.6366348266601562, + "learning_rate": 1.0210115482520948e-05, + "loss": 0.7487, + "step": 248330 + }, + { + "epoch": 1.5865734766109145, + "grad_norm": 0.8250417113304138, + "learning_rate": 1.0207077160616274e-05, + "loss": 0.8573, + "step": 248340 + }, + { + "epoch": 1.5866373637606532, + "grad_norm": 0.9031588435173035, + "learning_rate": 1.0204039239460745e-05, + "loss": 0.9037, + "step": 248350 + }, + { + "epoch": 1.586701250910392, + "grad_norm": 0.7464722394943237, + "learning_rate": 1.0201001719084969e-05, + "loss": 0.8245, + "step": 248360 + }, + { + "epoch": 1.5867651380601306, + "grad_norm": 1.6804680824279785, + "learning_rate": 1.0197964599519543e-05, + "loss": 0.838, + "step": 248370 + }, + { + "epoch": 1.5868290252098691, + "grad_norm": 0.8236187100410461, + "learning_rate": 1.019492788079504e-05, + "loss": 1.0602, + "step": 248380 + }, + { + "epoch": 1.586892912359608, + "grad_norm": 1.974440574645996, + "learning_rate": 1.0191891562942063e-05, + "loss": 0.7904, + "step": 248390 + }, + { + "epoch": 1.5869567995093465, + "grad_norm": 1.6416873931884766, + "learning_rate": 1.0188855645991163e-05, + "loss": 1.0055, + "step": 248400 + }, + { + "epoch": 
1.5870206866590855, + "grad_norm": 0.9831100702285767, + "learning_rate": 1.0185820129972945e-05, + "loss": 0.7863, + "step": 248410 + }, + { + "epoch": 1.587084573808824, + "grad_norm": 0.891497790813446, + "learning_rate": 1.0182785014917945e-05, + "loss": 0.7261, + "step": 248420 + }, + { + "epoch": 1.5871484609585629, + "grad_norm": 0.8720300793647766, + "learning_rate": 1.017975030085676e-05, + "loss": 0.8513, + "step": 248430 + }, + { + "epoch": 1.5872123481083014, + "grad_norm": 0.6937757730484009, + "learning_rate": 1.0176715987819923e-05, + "loss": 0.7849, + "step": 248440 + }, + { + "epoch": 1.5872762352580403, + "grad_norm": 1.342638611793518, + "learning_rate": 1.0173682075838026e-05, + "loss": 0.7524, + "step": 248450 + }, + { + "epoch": 1.5873401224077788, + "grad_norm": 0.6457897424697876, + "learning_rate": 1.0170648564941592e-05, + "loss": 0.9541, + "step": 248460 + }, + { + "epoch": 1.5874040095575177, + "grad_norm": 0.9868115782737732, + "learning_rate": 1.0167615455161194e-05, + "loss": 1.0407, + "step": 248470 + }, + { + "epoch": 1.5874678967072562, + "grad_norm": 1.2257436513900757, + "learning_rate": 1.0164582746527357e-05, + "loss": 0.944, + "step": 248480 + }, + { + "epoch": 1.5875317838569951, + "grad_norm": 1.2771724462509155, + "learning_rate": 1.0161550439070649e-05, + "loss": 0.9378, + "step": 248490 + }, + { + "epoch": 1.5875956710067336, + "grad_norm": 0.47110313177108765, + "learning_rate": 1.0158518532821593e-05, + "loss": 0.9029, + "step": 248500 + }, + { + "epoch": 1.5876595581564725, + "grad_norm": 1.252586841583252, + "learning_rate": 1.0155487027810706e-05, + "loss": 0.8562, + "step": 248510 + }, + { + "epoch": 1.587723445306211, + "grad_norm": 0.7602173089981079, + "learning_rate": 1.0152455924068549e-05, + "loss": 0.7778, + "step": 248520 + }, + { + "epoch": 1.58778733245595, + "grad_norm": 0.8817898631095886, + "learning_rate": 1.0149425221625625e-05, + "loss": 0.982, + "step": 248530 + }, + { + "epoch": 1.5878512196056884, 
+ "grad_norm": 3.27030873298645, + "learning_rate": 1.0146394920512475e-05, + "loss": 0.6923, + "step": 248540 + }, + { + "epoch": 1.5879151067554274, + "grad_norm": 1.6652390956878662, + "learning_rate": 1.0143365020759593e-05, + "loss": 0.84, + "step": 248550 + }, + { + "epoch": 1.5879789939051658, + "grad_norm": 0.7446771860122681, + "learning_rate": 1.014033552239752e-05, + "loss": 1.0318, + "step": 248560 + }, + { + "epoch": 1.5880428810549048, + "grad_norm": 0.9068264961242676, + "learning_rate": 1.0137306425456738e-05, + "loss": 0.9516, + "step": 248570 + }, + { + "epoch": 1.5881067682046432, + "grad_norm": 1.3777496814727783, + "learning_rate": 1.0134277729967784e-05, + "loss": 0.7882, + "step": 248580 + }, + { + "epoch": 1.5881706553543822, + "grad_norm": 1.0046823024749756, + "learning_rate": 1.013124943596112e-05, + "loss": 0.981, + "step": 248590 + }, + { + "epoch": 1.5882345425041207, + "grad_norm": 1.1702842712402344, + "learning_rate": 1.0128221543467288e-05, + "loss": 0.6711, + "step": 248600 + }, + { + "epoch": 1.5882984296538596, + "grad_norm": 0.9184240102767944, + "learning_rate": 1.012519405251674e-05, + "loss": 0.6594, + "step": 248610 + }, + { + "epoch": 1.588362316803598, + "grad_norm": 1.1504337787628174, + "learning_rate": 1.012216696314e-05, + "loss": 0.8985, + "step": 248620 + }, + { + "epoch": 1.5884262039533368, + "grad_norm": 1.0141348838806152, + "learning_rate": 1.0119140275367522e-05, + "loss": 1.0221, + "step": 248630 + }, + { + "epoch": 1.5884900911030755, + "grad_norm": 0.9321434497833252, + "learning_rate": 1.011611398922982e-05, + "loss": 0.9214, + "step": 248640 + }, + { + "epoch": 1.5885539782528142, + "grad_norm": 0.8669467568397522, + "learning_rate": 1.011308810475734e-05, + "loss": 1.0403, + "step": 248650 + }, + { + "epoch": 1.588617865402553, + "grad_norm": 0.9638845920562744, + "learning_rate": 1.0110062621980587e-05, + "loss": 0.7519, + "step": 248660 + }, + { + "epoch": 1.5886817525522916, + "grad_norm": 
0.9903292655944824, + "learning_rate": 1.0107037540929992e-05, + "loss": 0.7911, + "step": 248670 + }, + { + "epoch": 1.5887456397020303, + "grad_norm": 0.8706985116004944, + "learning_rate": 1.0104012861636047e-05, + "loss": 0.7986, + "step": 248680 + }, + { + "epoch": 1.588809526851769, + "grad_norm": 0.6027273535728455, + "learning_rate": 1.0100988584129217e-05, + "loss": 0.7547, + "step": 248690 + }, + { + "epoch": 1.5888734140015077, + "grad_norm": 0.8384061455726624, + "learning_rate": 1.0097964708439938e-05, + "loss": 0.9898, + "step": 248700 + }, + { + "epoch": 1.5889373011512464, + "grad_norm": 0.747555136680603, + "learning_rate": 1.0094941234598693e-05, + "loss": 0.813, + "step": 248710 + }, + { + "epoch": 1.5890011883009851, + "grad_norm": 0.9358962774276733, + "learning_rate": 1.0091918162635894e-05, + "loss": 0.6808, + "step": 248720 + }, + { + "epoch": 1.5890650754507238, + "grad_norm": 1.0704421997070312, + "learning_rate": 1.0088895492582013e-05, + "loss": 0.9423, + "step": 248730 + }, + { + "epoch": 1.5891289626004625, + "grad_norm": 0.8794128894805908, + "learning_rate": 1.0085873224467479e-05, + "loss": 0.7343, + "step": 248740 + }, + { + "epoch": 1.5891928497502013, + "grad_norm": 1.1814675331115723, + "learning_rate": 1.0082851358322737e-05, + "loss": 0.933, + "step": 248750 + }, + { + "epoch": 1.58925673689994, + "grad_norm": 0.7328624725341797, + "learning_rate": 1.0079829894178205e-05, + "loss": 0.8094, + "step": 248760 + }, + { + "epoch": 1.5893206240496787, + "grad_norm": 0.7021011114120483, + "learning_rate": 1.0076808832064339e-05, + "loss": 0.8035, + "step": 248770 + }, + { + "epoch": 1.5893845111994174, + "grad_norm": 0.8993996381759644, + "learning_rate": 1.0073788172011528e-05, + "loss": 0.6404, + "step": 248780 + }, + { + "epoch": 1.589448398349156, + "grad_norm": 0.9888026714324951, + "learning_rate": 1.0070767914050228e-05, + "loss": 0.6814, + "step": 248790 + }, + { + "epoch": 1.5895122854988948, + "grad_norm": 
1.5480871200561523, + "learning_rate": 1.0067748058210825e-05, + "loss": 0.9038, + "step": 248800 + }, + { + "epoch": 1.5895761726486335, + "grad_norm": 0.9566226005554199, + "learning_rate": 1.0064728604523754e-05, + "loss": 0.7915, + "step": 248810 + }, + { + "epoch": 1.5896400597983722, + "grad_norm": 1.3926703929901123, + "learning_rate": 1.0061709553019406e-05, + "loss": 0.9904, + "step": 248820 + }, + { + "epoch": 1.589703946948111, + "grad_norm": 1.0358089208602905, + "learning_rate": 1.0058690903728207e-05, + "loss": 0.8774, + "step": 248830 + }, + { + "epoch": 1.5897678340978496, + "grad_norm": 1.1436970233917236, + "learning_rate": 1.0055672656680532e-05, + "loss": 0.8088, + "step": 248840 + }, + { + "epoch": 1.5898317212475883, + "grad_norm": 2.777813673019409, + "learning_rate": 1.0052654811906803e-05, + "loss": 1.1499, + "step": 248850 + }, + { + "epoch": 1.589895608397327, + "grad_norm": 1.3283435106277466, + "learning_rate": 1.0049637369437386e-05, + "loss": 0.9473, + "step": 248860 + }, + { + "epoch": 1.5899594955470655, + "grad_norm": 1.2573764324188232, + "learning_rate": 1.0046620329302692e-05, + "loss": 0.7271, + "step": 248870 + }, + { + "epoch": 1.5900233826968044, + "grad_norm": 1.3681310415267944, + "learning_rate": 1.0043603691533088e-05, + "loss": 0.9251, + "step": 248880 + }, + { + "epoch": 1.590087269846543, + "grad_norm": 0.8003191351890564, + "learning_rate": 1.0040587456158961e-05, + "loss": 0.7029, + "step": 248890 + }, + { + "epoch": 1.5901511569962818, + "grad_norm": 0.7895790934562683, + "learning_rate": 1.00375716232107e-05, + "loss": 0.7636, + "step": 248900 + }, + { + "epoch": 1.5902150441460203, + "grad_norm": 0.6591528058052063, + "learning_rate": 1.0034556192718652e-05, + "loss": 0.6429, + "step": 248910 + }, + { + "epoch": 1.5902789312957593, + "grad_norm": 0.8874780535697937, + "learning_rate": 1.0031541164713215e-05, + "loss": 0.9603, + "step": 248920 + }, + { + "epoch": 1.5903428184454977, + "grad_norm": 
1.5267889499664307, + "learning_rate": 1.002852653922472e-05, + "loss": 0.6381, + "step": 248930 + }, + { + "epoch": 1.5904067055952367, + "grad_norm": 1.887830138206482, + "learning_rate": 1.0025512316283553e-05, + "loss": 0.825, + "step": 248940 + }, + { + "epoch": 1.5904705927449752, + "grad_norm": 0.8215576410293579, + "learning_rate": 1.0022498495920053e-05, + "loss": 1.2069, + "step": 248950 + }, + { + "epoch": 1.590534479894714, + "grad_norm": 1.7051416635513306, + "learning_rate": 1.0019485078164587e-05, + "loss": 0.9801, + "step": 248960 + }, + { + "epoch": 1.5905983670444526, + "grad_norm": 1.316051721572876, + "learning_rate": 1.0016472063047483e-05, + "loss": 0.7481, + "step": 248970 + }, + { + "epoch": 1.5906622541941915, + "grad_norm": 0.9726854562759399, + "learning_rate": 1.0013459450599122e-05, + "loss": 0.9108, + "step": 248980 + }, + { + "epoch": 1.59072614134393, + "grad_norm": 1.443015217781067, + "learning_rate": 1.001044724084979e-05, + "loss": 0.8269, + "step": 248990 + }, + { + "epoch": 1.590790028493669, + "grad_norm": 1.5187175273895264, + "learning_rate": 1.0007435433829854e-05, + "loss": 0.8788, + "step": 249000 + }, + { + "epoch": 1.5908539156434074, + "grad_norm": 0.8051071763038635, + "learning_rate": 1.0004424029569653e-05, + "loss": 0.899, + "step": 249010 + }, + { + "epoch": 1.5909178027931463, + "grad_norm": 1.0638450384140015, + "learning_rate": 1.0001413028099487e-05, + "loss": 0.9527, + "step": 249020 + }, + { + "epoch": 1.5909816899428848, + "grad_norm": 1.3071459531784058, + "learning_rate": 9.99840242944971e-06, + "loss": 0.8981, + "step": 249030 + }, + { + "epoch": 1.5910455770926237, + "grad_norm": 0.8064150214195251, + "learning_rate": 9.995392233650614e-06, + "loss": 0.8127, + "step": 249040 + }, + { + "epoch": 1.5911094642423622, + "grad_norm": 1.0664138793945312, + "learning_rate": 9.992382440732546e-06, + "loss": 0.7089, + "step": 249050 + }, + { + "epoch": 1.5911733513921011, + "grad_norm": 0.6624587178230286, + 
"learning_rate": 9.989373050725775e-06, + "loss": 0.7468, + "step": 249060 + }, + { + "epoch": 1.5912372385418396, + "grad_norm": 1.153746485710144, + "learning_rate": 9.986364063660653e-06, + "loss": 0.7196, + "step": 249070 + }, + { + "epoch": 1.5913011256915786, + "grad_norm": 1.2669557332992554, + "learning_rate": 9.983355479567446e-06, + "loss": 0.8815, + "step": 249080 + }, + { + "epoch": 1.591365012841317, + "grad_norm": 0.9664118885993958, + "learning_rate": 9.980347298476477e-06, + "loss": 1.0083, + "step": 249090 + }, + { + "epoch": 1.5914288999910557, + "grad_norm": 1.1675530672073364, + "learning_rate": 9.977339520418027e-06, + "loss": 0.7832, + "step": 249100 + }, + { + "epoch": 1.5914927871407945, + "grad_norm": 0.9785093069076538, + "learning_rate": 9.974332145422399e-06, + "loss": 0.9357, + "step": 249110 + }, + { + "epoch": 1.5915566742905332, + "grad_norm": 1.8464292287826538, + "learning_rate": 9.971325173519863e-06, + "loss": 1.0924, + "step": 249120 + }, + { + "epoch": 1.5916205614402719, + "grad_norm": 1.0116223096847534, + "learning_rate": 9.968318604740728e-06, + "loss": 0.7755, + "step": 249130 + }, + { + "epoch": 1.5916844485900106, + "grad_norm": 0.5784453749656677, + "learning_rate": 9.965312439115243e-06, + "loss": 0.7531, + "step": 249140 + }, + { + "epoch": 1.5917483357397493, + "grad_norm": 0.9767338633537292, + "learning_rate": 9.962306676673711e-06, + "loss": 0.9978, + "step": 249150 + }, + { + "epoch": 1.591812222889488, + "grad_norm": 0.7933934926986694, + "learning_rate": 9.959301317446369e-06, + "loss": 0.927, + "step": 249160 + }, + { + "epoch": 1.5918761100392267, + "grad_norm": 2.1215498447418213, + "learning_rate": 9.956296361463518e-06, + "loss": 0.8457, + "step": 249170 + }, + { + "epoch": 1.5919399971889654, + "grad_norm": 0.5488957166671753, + "learning_rate": 9.953291808755393e-06, + "loss": 0.8871, + "step": 249180 + }, + { + "epoch": 1.592003884338704, + "grad_norm": 1.6344760656356812, + "learning_rate": 
9.95028765935228e-06, + "loss": 0.8525, + "step": 249190 + }, + { + "epoch": 1.5920677714884428, + "grad_norm": 1.200919508934021, + "learning_rate": 9.947584269740252e-06, + "loss": 0.7876, + "step": 249200 + }, + { + "epoch": 1.5921316586381815, + "grad_norm": 1.2127763032913208, + "learning_rate": 9.944580886699966e-06, + "loss": 0.8877, + "step": 249210 + }, + { + "epoch": 1.5921955457879202, + "grad_norm": 0.837215781211853, + "learning_rate": 9.941577907052413e-06, + "loss": 1.0373, + "step": 249220 + }, + { + "epoch": 1.592259432937659, + "grad_norm": 1.1384263038635254, + "learning_rate": 9.938575330827821e-06, + "loss": 0.864, + "step": 249230 + }, + { + "epoch": 1.5923233200873976, + "grad_norm": 0.8473384380340576, + "learning_rate": 9.935573158056437e-06, + "loss": 0.898, + "step": 249240 + }, + { + "epoch": 1.5923872072371363, + "grad_norm": 2.536351442337036, + "learning_rate": 9.932571388768486e-06, + "loss": 0.8477, + "step": 249250 + }, + { + "epoch": 1.592451094386875, + "grad_norm": 0.7730850577354431, + "learning_rate": 9.929570022994217e-06, + "loss": 0.9414, + "step": 249260 + }, + { + "epoch": 1.5925149815366137, + "grad_norm": 0.5718420743942261, + "learning_rate": 9.926569060763829e-06, + "loss": 0.8113, + "step": 249270 + }, + { + "epoch": 1.5925788686863525, + "grad_norm": 1.1947507858276367, + "learning_rate": 9.923568502107572e-06, + "loss": 0.8983, + "step": 249280 + }, + { + "epoch": 1.5926427558360912, + "grad_norm": 1.237534999847412, + "learning_rate": 9.920568347055637e-06, + "loss": 0.9349, + "step": 249290 + }, + { + "epoch": 1.5927066429858299, + "grad_norm": 0.8614002466201782, + "learning_rate": 9.917568595638265e-06, + "loss": 0.9377, + "step": 249300 + }, + { + "epoch": 1.5927705301355686, + "grad_norm": 0.7836909890174866, + "learning_rate": 9.914569247885647e-06, + "loss": 0.9604, + "step": 249310 + }, + { + "epoch": 1.5928344172853073, + "grad_norm": 0.7826888561248779, + "learning_rate": 9.911570303827999e-06, + "loss": 
0.8028, + "step": 249320 + }, + { + "epoch": 1.592898304435046, + "grad_norm": 1.3262845277786255, + "learning_rate": 9.908571763495516e-06, + "loss": 0.8967, + "step": 249330 + }, + { + "epoch": 1.5929621915847845, + "grad_norm": 1.4745041131973267, + "learning_rate": 9.905573626918392e-06, + "loss": 0.9859, + "step": 249340 + }, + { + "epoch": 1.5930260787345234, + "grad_norm": 1.885468602180481, + "learning_rate": 9.902575894126841e-06, + "loss": 0.8534, + "step": 249350 + }, + { + "epoch": 1.5930899658842619, + "grad_norm": 0.608864426612854, + "learning_rate": 9.899578565151035e-06, + "loss": 0.7751, + "step": 249360 + }, + { + "epoch": 1.5931538530340008, + "grad_norm": 0.8991163372993469, + "learning_rate": 9.896581640021169e-06, + "loss": 0.9384, + "step": 249370 + }, + { + "epoch": 1.5932177401837393, + "grad_norm": 1.1069456338882446, + "learning_rate": 9.893585118767413e-06, + "loss": 0.7718, + "step": 249380 + }, + { + "epoch": 1.5932816273334782, + "grad_norm": 0.7635006904602051, + "learning_rate": 9.890589001419959e-06, + "loss": 0.8445, + "step": 249390 + }, + { + "epoch": 1.5933455144832167, + "grad_norm": 0.6687338948249817, + "learning_rate": 9.887593288008967e-06, + "loss": 0.9449, + "step": 249400 + }, + { + "epoch": 1.5934094016329556, + "grad_norm": 1.5973081588745117, + "learning_rate": 9.884597978564625e-06, + "loss": 0.8369, + "step": 249410 + }, + { + "epoch": 1.5934732887826941, + "grad_norm": 1.0921237468719482, + "learning_rate": 9.881603073117068e-06, + "loss": 0.8445, + "step": 249420 + }, + { + "epoch": 1.593537175932433, + "grad_norm": 0.813981831073761, + "learning_rate": 9.878608571696491e-06, + "loss": 0.7445, + "step": 249430 + }, + { + "epoch": 1.5936010630821715, + "grad_norm": 1.3870654106140137, + "learning_rate": 9.875614474333022e-06, + "loss": 0.8893, + "step": 249440 + }, + { + "epoch": 1.5936649502319105, + "grad_norm": 1.150707721710205, + "learning_rate": 9.87262078105684e-06, + "loss": 1.0009, + "step": 249450 + }, 
+ { + "epoch": 1.593728837381649, + "grad_norm": 1.7357616424560547, + "learning_rate": 9.86962749189807e-06, + "loss": 0.9953, + "step": 249460 + }, + { + "epoch": 1.5937927245313879, + "grad_norm": 0.9563319087028503, + "learning_rate": 9.866634606886882e-06, + "loss": 1.0655, + "step": 249470 + }, + { + "epoch": 1.5938566116811264, + "grad_norm": 1.100013256072998, + "learning_rate": 9.863642126053391e-06, + "loss": 1.1086, + "step": 249480 + }, + { + "epoch": 1.5939204988308653, + "grad_norm": 0.9695361852645874, + "learning_rate": 9.86065004942776e-06, + "loss": 1.0619, + "step": 249490 + }, + { + "epoch": 1.5939843859806038, + "grad_norm": 0.9638354182243347, + "learning_rate": 9.857658377040097e-06, + "loss": 0.8276, + "step": 249500 + }, + { + "epoch": 1.5940482731303427, + "grad_norm": 1.1016769409179688, + "learning_rate": 9.854667108920552e-06, + "loss": 0.8694, + "step": 249510 + }, + { + "epoch": 1.5941121602800812, + "grad_norm": 1.1219736337661743, + "learning_rate": 9.851676245099228e-06, + "loss": 0.8565, + "step": 249520 + }, + { + "epoch": 1.59417604742982, + "grad_norm": 1.7792798280715942, + "learning_rate": 9.84868578560626e-06, + "loss": 1.3292, + "step": 249530 + }, + { + "epoch": 1.5942399345795586, + "grad_norm": 1.0533701181411743, + "learning_rate": 9.845695730471772e-06, + "loss": 0.7286, + "step": 249540 + }, + { + "epoch": 1.5943038217292975, + "grad_norm": 1.4720954895019531, + "learning_rate": 9.842706079725856e-06, + "loss": 1.1434, + "step": 249550 + }, + { + "epoch": 1.594367708879036, + "grad_norm": 0.9887880086898804, + "learning_rate": 9.839716833398638e-06, + "loss": 0.7406, + "step": 249560 + }, + { + "epoch": 1.594431596028775, + "grad_norm": 0.8414777517318726, + "learning_rate": 9.836727991520206e-06, + "loss": 0.9619, + "step": 249570 + }, + { + "epoch": 1.5944954831785134, + "grad_norm": 0.9462972283363342, + "learning_rate": 9.833739554120686e-06, + "loss": 0.7826, + "step": 249580 + }, + { + "epoch": 
1.5945593703282521, + "grad_norm": 0.9703009128570557, + "learning_rate": 9.830751521230142e-06, + "loss": 1.0046, + "step": 249590 + }, + { + "epoch": 1.5946232574779908, + "grad_norm": 1.0508860349655151, + "learning_rate": 9.827763892878688e-06, + "loss": 1.3851, + "step": 249600 + }, + { + "epoch": 1.5946871446277295, + "grad_norm": 1.112051248550415, + "learning_rate": 9.824776669096413e-06, + "loss": 1.0773, + "step": 249610 + }, + { + "epoch": 1.5947510317774682, + "grad_norm": 1.7548609972000122, + "learning_rate": 9.821789849913393e-06, + "loss": 0.773, + "step": 249620 + }, + { + "epoch": 1.594814918927207, + "grad_norm": 0.8418256044387817, + "learning_rate": 9.818803435359691e-06, + "loss": 1.038, + "step": 249630 + }, + { + "epoch": 1.5948788060769457, + "grad_norm": 0.551695704460144, + "learning_rate": 9.815817425465418e-06, + "loss": 0.8379, + "step": 249640 + }, + { + "epoch": 1.5949426932266844, + "grad_norm": 0.9964414834976196, + "learning_rate": 9.81283182026061e-06, + "loss": 0.8553, + "step": 249650 + }, + { + "epoch": 1.595006580376423, + "grad_norm": 0.7424940466880798, + "learning_rate": 9.809846619775354e-06, + "loss": 0.9025, + "step": 249660 + }, + { + "epoch": 1.5950704675261618, + "grad_norm": 2.2108371257781982, + "learning_rate": 9.806861824039726e-06, + "loss": 0.7527, + "step": 249670 + }, + { + "epoch": 1.5951343546759005, + "grad_norm": 1.1251683235168457, + "learning_rate": 9.803877433083758e-06, + "loss": 1.0794, + "step": 249680 + }, + { + "epoch": 1.5951982418256392, + "grad_norm": 1.1081650257110596, + "learning_rate": 9.800893446937537e-06, + "loss": 0.7922, + "step": 249690 + }, + { + "epoch": 1.595262128975378, + "grad_norm": 1.0226283073425293, + "learning_rate": 9.797909865631078e-06, + "loss": 0.6803, + "step": 249700 + }, + { + "epoch": 1.5953260161251166, + "grad_norm": 0.6846336722373962, + "learning_rate": 9.794926689194456e-06, + "loss": 0.812, + "step": 249710 + }, + { + "epoch": 1.5953899032748553, + 
"grad_norm": 1.3356788158416748, + "learning_rate": 9.791943917657698e-06, + "loss": 0.8233, + "step": 249720 + }, + { + "epoch": 1.595453790424594, + "grad_norm": 0.6795898675918579, + "learning_rate": 9.788961551050857e-06, + "loss": 1.0373, + "step": 249730 + }, + { + "epoch": 1.5955176775743327, + "grad_norm": 0.9310694932937622, + "learning_rate": 9.785979589403948e-06, + "loss": 0.6179, + "step": 249740 + }, + { + "epoch": 1.5955815647240714, + "grad_norm": 1.051107406616211, + "learning_rate": 9.782998032747031e-06, + "loss": 0.8966, + "step": 249750 + }, + { + "epoch": 1.5956454518738101, + "grad_norm": 0.7246778011322021, + "learning_rate": 9.780016881110104e-06, + "loss": 0.7684, + "step": 249760 + }, + { + "epoch": 1.5957093390235488, + "grad_norm": 0.9064025282859802, + "learning_rate": 9.777036134523215e-06, + "loss": 0.8458, + "step": 249770 + }, + { + "epoch": 1.5957732261732875, + "grad_norm": 0.9094128608703613, + "learning_rate": 9.774055793016357e-06, + "loss": 0.6985, + "step": 249780 + }, + { + "epoch": 1.5958371133230262, + "grad_norm": 1.2230727672576904, + "learning_rate": 9.771075856619566e-06, + "loss": 0.9097, + "step": 249790 + }, + { + "epoch": 1.595901000472765, + "grad_norm": 1.1106340885162354, + "learning_rate": 9.768096325362836e-06, + "loss": 0.9081, + "step": 249800 + }, + { + "epoch": 1.5959648876225037, + "grad_norm": 1.3577688932418823, + "learning_rate": 9.765117199276192e-06, + "loss": 0.8087, + "step": 249810 + }, + { + "epoch": 1.5960287747722424, + "grad_norm": 1.1891157627105713, + "learning_rate": 9.762138478389616e-06, + "loss": 0.8516, + "step": 249820 + }, + { + "epoch": 1.5960926619219808, + "grad_norm": 1.4682430028915405, + "learning_rate": 9.759160162733127e-06, + "loss": 0.8946, + "step": 249830 + }, + { + "epoch": 1.5961565490717198, + "grad_norm": 0.7288236021995544, + "learning_rate": 9.7561822523367e-06, + "loss": 0.7522, + "step": 249840 + }, + { + "epoch": 1.5962204362214583, + "grad_norm": 
1.9862715005874634, + "learning_rate": 9.753204747230327e-06, + "loss": 0.8168, + "step": 249850 + }, + { + "epoch": 1.5962843233711972, + "grad_norm": 1.0854727029800415, + "learning_rate": 9.750227647444015e-06, + "loss": 0.8119, + "step": 249860 + }, + { + "epoch": 1.5963482105209357, + "grad_norm": 1.2034173011779785, + "learning_rate": 9.74725095300772e-06, + "loss": 0.9405, + "step": 249870 + }, + { + "epoch": 1.5964120976706746, + "grad_norm": 1.0065717697143555, + "learning_rate": 9.744274663951441e-06, + "loss": 0.9919, + "step": 249880 + }, + { + "epoch": 1.596475984820413, + "grad_norm": 1.0933870077133179, + "learning_rate": 9.741298780305136e-06, + "loss": 0.8674, + "step": 249890 + }, + { + "epoch": 1.596539871970152, + "grad_norm": 0.6897011399269104, + "learning_rate": 9.738323302098789e-06, + "loss": 1.101, + "step": 249900 + }, + { + "epoch": 1.5966037591198905, + "grad_norm": 1.1092685461044312, + "learning_rate": 9.735348229362346e-06, + "loss": 1.0444, + "step": 249910 + }, + { + "epoch": 1.5966676462696294, + "grad_norm": 1.103498935699463, + "learning_rate": 9.73237356212579e-06, + "loss": 0.6172, + "step": 249920 + }, + { + "epoch": 1.596731533419368, + "grad_norm": 2.056804895401001, + "learning_rate": 9.729399300419062e-06, + "loss": 0.7891, + "step": 249930 + }, + { + "epoch": 1.5967954205691068, + "grad_norm": 1.0614315271377563, + "learning_rate": 9.72642544427213e-06, + "loss": 0.9441, + "step": 249940 + }, + { + "epoch": 1.5968593077188453, + "grad_norm": 1.092970609664917, + "learning_rate": 9.723451993714927e-06, + "loss": 1.0506, + "step": 249950 + }, + { + "epoch": 1.5969231948685843, + "grad_norm": 3.405836343765259, + "learning_rate": 9.720478948777418e-06, + "loss": 1.0595, + "step": 249960 + }, + { + "epoch": 1.5969870820183227, + "grad_norm": 0.8358373045921326, + "learning_rate": 9.717506309489516e-06, + "loss": 0.8651, + "step": 249970 + }, + { + "epoch": 1.5970509691680617, + "grad_norm": 1.3352793455123901, + 
"learning_rate": 9.714534075881187e-06, + "loss": 0.8881, + "step": 249980 + }, + { + "epoch": 1.5971148563178001, + "grad_norm": 1.7608089447021484, + "learning_rate": 9.711562247982343e-06, + "loss": 1.2395, + "step": 249990 + }, + { + "epoch": 1.597178743467539, + "grad_norm": 0.7808798551559448, + "learning_rate": 9.708590825822939e-06, + "loss": 0.9053, + "step": 250000 + }, + { + "epoch": 1.5972426306172776, + "grad_norm": 0.683994472026825, + "learning_rate": 9.70561980943286e-06, + "loss": 0.916, + "step": 250010 + }, + { + "epoch": 1.5973065177670165, + "grad_norm": 2.5665359497070312, + "learning_rate": 9.702649198842067e-06, + "loss": 0.7694, + "step": 250020 + }, + { + "epoch": 1.597370404916755, + "grad_norm": 0.9608335494995117, + "learning_rate": 9.699678994080446e-06, + "loss": 0.8268, + "step": 250030 + }, + { + "epoch": 1.597434292066494, + "grad_norm": 0.7314376831054688, + "learning_rate": 9.696709195177934e-06, + "loss": 1.0663, + "step": 250040 + }, + { + "epoch": 1.5974981792162324, + "grad_norm": 0.9006572365760803, + "learning_rate": 9.693739802164414e-06, + "loss": 0.8218, + "step": 250050 + }, + { + "epoch": 1.5975620663659713, + "grad_norm": 0.7238269448280334, + "learning_rate": 9.690770815069805e-06, + "loss": 1.0089, + "step": 250060 + }, + { + "epoch": 1.5976259535157098, + "grad_norm": 1.1664044857025146, + "learning_rate": 9.687802233924021e-06, + "loss": 1.0815, + "step": 250070 + }, + { + "epoch": 1.5976898406654485, + "grad_norm": 0.8418641090393066, + "learning_rate": 9.684834058756931e-06, + "loss": 0.7089, + "step": 250080 + }, + { + "epoch": 1.5977537278151872, + "grad_norm": 1.1800457239151, + "learning_rate": 9.681866289598445e-06, + "loss": 0.7196, + "step": 250090 + }, + { + "epoch": 1.597817614964926, + "grad_norm": 0.6666972637176514, + "learning_rate": 9.678898926478452e-06, + "loss": 0.7729, + "step": 250100 + }, + { + "epoch": 1.5978815021146646, + "grad_norm": 1.1304889917373657, + "learning_rate": 
9.675931969426833e-06, + "loss": 0.912, + "step": 250110 + }, + { + "epoch": 1.5979453892644033, + "grad_norm": 0.9430323243141174, + "learning_rate": 9.672965418473446e-06, + "loss": 0.872, + "step": 250120 + }, + { + "epoch": 1.598009276414142, + "grad_norm": 0.9690349102020264, + "learning_rate": 9.6699992736482e-06, + "loss": 0.7998, + "step": 250130 + }, + { + "epoch": 1.5980731635638807, + "grad_norm": 0.9274241328239441, + "learning_rate": 9.667033534980935e-06, + "loss": 0.8911, + "step": 250140 + }, + { + "epoch": 1.5981370507136194, + "grad_norm": 0.5790615677833557, + "learning_rate": 9.664068202501553e-06, + "loss": 0.8692, + "step": 250150 + }, + { + "epoch": 1.5982009378633582, + "grad_norm": 1.0739883184432983, + "learning_rate": 9.66110327623988e-06, + "loss": 1.0901, + "step": 250160 + }, + { + "epoch": 1.5982648250130969, + "grad_norm": 0.8284801840782166, + "learning_rate": 9.658138756225805e-06, + "loss": 0.6699, + "step": 250170 + }, + { + "epoch": 1.5983287121628356, + "grad_norm": 0.8745191693305969, + "learning_rate": 9.655174642489179e-06, + "loss": 0.6788, + "step": 250180 + }, + { + "epoch": 1.5983925993125743, + "grad_norm": 1.3312053680419922, + "learning_rate": 9.652210935059836e-06, + "loss": 0.9959, + "step": 250190 + }, + { + "epoch": 1.598456486462313, + "grad_norm": 1.0378258228302002, + "learning_rate": 9.649247633967651e-06, + "loss": 0.9172, + "step": 250200 + }, + { + "epoch": 1.5985203736120517, + "grad_norm": 0.8793679475784302, + "learning_rate": 9.646284739242434e-06, + "loss": 0.9155, + "step": 250210 + }, + { + "epoch": 1.5985842607617904, + "grad_norm": 0.8637825846672058, + "learning_rate": 9.643322250914056e-06, + "loss": 0.9205, + "step": 250220 + }, + { + "epoch": 1.598648147911529, + "grad_norm": 1.0453790426254272, + "learning_rate": 9.640360169012325e-06, + "loss": 1.1401, + "step": 250230 + }, + { + "epoch": 1.5987120350612678, + "grad_norm": 0.9813113212585449, + "learning_rate": 9.637398493567091e-06, + 
"loss": 0.9221, + "step": 250240 + }, + { + "epoch": 1.5987759222110065, + "grad_norm": 0.6848676204681396, + "learning_rate": 9.634437224608162e-06, + "loss": 0.8406, + "step": 250250 + }, + { + "epoch": 1.5988398093607452, + "grad_norm": 0.9610247611999512, + "learning_rate": 9.631476362165386e-06, + "loss": 0.8424, + "step": 250260 + }, + { + "epoch": 1.598903696510484, + "grad_norm": 0.7780939340591431, + "learning_rate": 9.628515906268554e-06, + "loss": 0.5489, + "step": 250270 + }, + { + "epoch": 1.5989675836602226, + "grad_norm": 1.0890668630599976, + "learning_rate": 9.625555856947505e-06, + "loss": 0.8914, + "step": 250280 + }, + { + "epoch": 1.5990314708099613, + "grad_norm": 0.7628886103630066, + "learning_rate": 9.622596214232021e-06, + "loss": 1.1567, + "step": 250290 + }, + { + "epoch": 1.5990953579597, + "grad_norm": 0.8857354521751404, + "learning_rate": 9.619636978151942e-06, + "loss": 0.9771, + "step": 250300 + }, + { + "epoch": 1.5991592451094387, + "grad_norm": 1.5625184774398804, + "learning_rate": 9.616678148737034e-06, + "loss": 1.0714, + "step": 250310 + }, + { + "epoch": 1.5992231322591772, + "grad_norm": 0.9194492101669312, + "learning_rate": 9.613719726017134e-06, + "loss": 0.7574, + "step": 250320 + }, + { + "epoch": 1.5992870194089162, + "grad_norm": 0.6353196501731873, + "learning_rate": 9.610761710021998e-06, + "loss": 0.894, + "step": 250330 + }, + { + "epoch": 1.5993509065586546, + "grad_norm": 1.1404699087142944, + "learning_rate": 9.607804100781448e-06, + "loss": 0.8643, + "step": 250340 + }, + { + "epoch": 1.5994147937083936, + "grad_norm": 0.8246428370475769, + "learning_rate": 9.604846898325243e-06, + "loss": 0.7766, + "step": 250350 + }, + { + "epoch": 1.599478680858132, + "grad_norm": 1.066979169845581, + "learning_rate": 9.601890102683187e-06, + "loss": 1.002, + "step": 250360 + }, + { + "epoch": 1.599542568007871, + "grad_norm": 1.0399022102355957, + "learning_rate": 9.598933713885034e-06, + "loss": 0.8962, + "step": 250370 
+ }, + { + "epoch": 1.5996064551576095, + "grad_norm": 0.4756830036640167, + "learning_rate": 9.59597773196057e-06, + "loss": 0.8602, + "step": 250380 + }, + { + "epoch": 1.5996703423073484, + "grad_norm": 2.020519971847534, + "learning_rate": 9.593022156939579e-06, + "loss": 0.8967, + "step": 250390 + }, + { + "epoch": 1.5997342294570869, + "grad_norm": 1.331128478050232, + "learning_rate": 9.590066988851797e-06, + "loss": 0.825, + "step": 250400 + }, + { + "epoch": 1.5997981166068258, + "grad_norm": 0.7770277261734009, + "learning_rate": 9.587112227727018e-06, + "loss": 1.0992, + "step": 250410 + }, + { + "epoch": 1.5998620037565643, + "grad_norm": 1.1182270050048828, + "learning_rate": 9.584157873594961e-06, + "loss": 0.7391, + "step": 250420 + }, + { + "epoch": 1.5999258909063032, + "grad_norm": 0.7087160348892212, + "learning_rate": 9.581203926485421e-06, + "loss": 0.662, + "step": 250430 + }, + { + "epoch": 1.5999897780560417, + "grad_norm": 0.8244704008102417, + "learning_rate": 9.578250386428105e-06, + "loss": 0.9107, + "step": 250440 + }, + { + "epoch": 1.6000536652057806, + "grad_norm": 1.832184076309204, + "learning_rate": 9.575297253452791e-06, + "loss": 0.9488, + "step": 250450 + }, + { + "epoch": 1.6001175523555191, + "grad_norm": 0.7238314151763916, + "learning_rate": 9.572344527589195e-06, + "loss": 1.0217, + "step": 250460 + }, + { + "epoch": 1.600181439505258, + "grad_norm": 1.169054627418518, + "learning_rate": 9.569392208867078e-06, + "loss": 0.6509, + "step": 250470 + }, + { + "epoch": 1.6002453266549965, + "grad_norm": 0.9178882837295532, + "learning_rate": 9.566440297316142e-06, + "loss": 0.8303, + "step": 250480 + }, + { + "epoch": 1.6003092138047355, + "grad_norm": 0.6997204422950745, + "learning_rate": 9.563488792966146e-06, + "loss": 0.7921, + "step": 250490 + }, + { + "epoch": 1.600373100954474, + "grad_norm": 1.2562320232391357, + "learning_rate": 9.56053769584679e-06, + "loss": 0.9274, + "step": 250500 + }, + { + "epoch": 
1.6004369881042129, + "grad_norm": 0.970008373260498, + "learning_rate": 9.557587005987817e-06, + "loss": 0.8867, + "step": 250510 + }, + { + "epoch": 1.6005008752539513, + "grad_norm": 1.1938084363937378, + "learning_rate": 9.554636723418919e-06, + "loss": 0.9233, + "step": 250520 + }, + { + "epoch": 1.6005647624036903, + "grad_norm": 1.1808513402938843, + "learning_rate": 9.551686848169827e-06, + "loss": 1.1654, + "step": 250530 + }, + { + "epoch": 1.6006286495534288, + "grad_norm": 1.0170284509658813, + "learning_rate": 9.548737380270234e-06, + "loss": 0.8142, + "step": 250540 + }, + { + "epoch": 1.6006925367031677, + "grad_norm": 0.6526452302932739, + "learning_rate": 9.54578831974986e-06, + "loss": 0.729, + "step": 250550 + }, + { + "epoch": 1.6007564238529062, + "grad_norm": 0.8424413800239563, + "learning_rate": 9.542839666638387e-06, + "loss": 0.6507, + "step": 250560 + }, + { + "epoch": 1.6008203110026449, + "grad_norm": 0.8636687994003296, + "learning_rate": 9.539891420965524e-06, + "loss": 0.8822, + "step": 250570 + }, + { + "epoch": 1.6008841981523836, + "grad_norm": 1.0570945739746094, + "learning_rate": 9.536943582760966e-06, + "loss": 0.8312, + "step": 250580 + }, + { + "epoch": 1.6009480853021223, + "grad_norm": 1.320425271987915, + "learning_rate": 9.533996152054375e-06, + "loss": 0.7736, + "step": 250590 + }, + { + "epoch": 1.601011972451861, + "grad_norm": 0.8588604927062988, + "learning_rate": 9.531049128875463e-06, + "loss": 0.9693, + "step": 250600 + }, + { + "epoch": 1.6010758596015997, + "grad_norm": 1.496766209602356, + "learning_rate": 9.528102513253883e-06, + "loss": 0.8687, + "step": 250610 + }, + { + "epoch": 1.6011397467513384, + "grad_norm": 0.958341658115387, + "learning_rate": 9.52515630521934e-06, + "loss": 0.6905, + "step": 250620 + }, + { + "epoch": 1.6012036339010771, + "grad_norm": 1.0842797756195068, + "learning_rate": 9.522210504801471e-06, + "loss": 0.787, + "step": 250630 + }, + { + "epoch": 1.6012675210508158, + 
"grad_norm": 0.8197239637374878, + "learning_rate": 9.519265112029978e-06, + "loss": 0.8905, + "step": 250640 + }, + { + "epoch": 1.6013314082005545, + "grad_norm": 1.5352048873901367, + "learning_rate": 9.516320126934491e-06, + "loss": 1.1262, + "step": 250650 + }, + { + "epoch": 1.6013952953502932, + "grad_norm": 1.2177964448928833, + "learning_rate": 9.513375549544701e-06, + "loss": 0.7407, + "step": 250660 + }, + { + "epoch": 1.601459182500032, + "grad_norm": 0.968707263469696, + "learning_rate": 9.510431379890227e-06, + "loss": 0.8163, + "step": 250670 + }, + { + "epoch": 1.6015230696497706, + "grad_norm": 1.3217895030975342, + "learning_rate": 9.507487618000754e-06, + "loss": 0.9355, + "step": 250680 + }, + { + "epoch": 1.6015869567995094, + "grad_norm": 1.0752805471420288, + "learning_rate": 9.504544263905895e-06, + "loss": 0.9473, + "step": 250690 + }, + { + "epoch": 1.601650843949248, + "grad_norm": 1.2310861349105835, + "learning_rate": 9.50160131763531e-06, + "loss": 0.8163, + "step": 250700 + }, + { + "epoch": 1.6017147310989868, + "grad_norm": 0.782647967338562, + "learning_rate": 9.49865877921865e-06, + "loss": 0.8367, + "step": 250710 + }, + { + "epoch": 1.6017786182487255, + "grad_norm": 0.5226444005966187, + "learning_rate": 9.49571664868552e-06, + "loss": 0.8799, + "step": 250720 + }, + { + "epoch": 1.6018425053984642, + "grad_norm": 4.664597511291504, + "learning_rate": 9.492774926065579e-06, + "loss": 1.1183, + "step": 250730 + }, + { + "epoch": 1.6019063925482029, + "grad_norm": 1.443206787109375, + "learning_rate": 9.489833611388428e-06, + "loss": 0.8891, + "step": 250740 + }, + { + "epoch": 1.6019702796979416, + "grad_norm": 0.7619695663452148, + "learning_rate": 9.48689270468371e-06, + "loss": 0.6385, + "step": 250750 + }, + { + "epoch": 1.6020341668476803, + "grad_norm": 2.3720896244049072, + "learning_rate": 9.483952205981018e-06, + "loss": 0.9371, + "step": 250760 + }, + { + "epoch": 1.602098053997419, + "grad_norm": 2.431199312210083, + 
"learning_rate": 9.481012115309989e-06, + "loss": 0.764, + "step": 250770 + }, + { + "epoch": 1.6021619411471577, + "grad_norm": 1.5024423599243164, + "learning_rate": 9.478072432700208e-06, + "loss": 0.6849, + "step": 250780 + }, + { + "epoch": 1.6022258282968964, + "grad_norm": 1.117741584777832, + "learning_rate": 9.47513315818131e-06, + "loss": 0.7878, + "step": 250790 + }, + { + "epoch": 1.6022897154466351, + "grad_norm": 1.0036662817001343, + "learning_rate": 9.472194291782871e-06, + "loss": 0.803, + "step": 250800 + }, + { + "epoch": 1.6023536025963736, + "grad_norm": 0.7195963263511658, + "learning_rate": 9.469255833534501e-06, + "loss": 0.9983, + "step": 250810 + }, + { + "epoch": 1.6024174897461125, + "grad_norm": 1.221651554107666, + "learning_rate": 9.466317783465784e-06, + "loss": 0.9196, + "step": 250820 + }, + { + "epoch": 1.602481376895851, + "grad_norm": 1.0164096355438232, + "learning_rate": 9.463380141606321e-06, + "loss": 1.0127, + "step": 250830 + }, + { + "epoch": 1.60254526404559, + "grad_norm": 1.8335058689117432, + "learning_rate": 9.460442907985679e-06, + "loss": 0.8599, + "step": 250840 + }, + { + "epoch": 1.6026091511953284, + "grad_norm": 1.6345186233520508, + "learning_rate": 9.45750608263346e-06, + "loss": 0.8847, + "step": 250850 + }, + { + "epoch": 1.6026730383450674, + "grad_norm": 1.392560362815857, + "learning_rate": 9.454569665579221e-06, + "loss": 0.9483, + "step": 250860 + }, + { + "epoch": 1.6027369254948058, + "grad_norm": 2.1398732662200928, + "learning_rate": 9.45163365685255e-06, + "loss": 0.7336, + "step": 250870 + }, + { + "epoch": 1.6028008126445448, + "grad_norm": 1.1836103200912476, + "learning_rate": 9.448698056482996e-06, + "loss": 0.9999, + "step": 250880 + }, + { + "epoch": 1.6028646997942833, + "grad_norm": 0.7985469102859497, + "learning_rate": 9.445762864500147e-06, + "loss": 1.2659, + "step": 250890 + }, + { + "epoch": 1.6029285869440222, + "grad_norm": 1.1316717863082886, + "learning_rate": 
9.442828080933536e-06, + "loss": 0.8242, + "step": 250900 + }, + { + "epoch": 1.6029924740937607, + "grad_norm": 1.4258652925491333, + "learning_rate": 9.439893705812736e-06, + "loss": 0.8395, + "step": 250910 + }, + { + "epoch": 1.6030563612434996, + "grad_norm": 1.2066962718963623, + "learning_rate": 9.436959739167305e-06, + "loss": 0.9081, + "step": 250920 + }, + { + "epoch": 1.603120248393238, + "grad_norm": 1.2025872468948364, + "learning_rate": 9.434026181026773e-06, + "loss": 0.7896, + "step": 250930 + }, + { + "epoch": 1.603184135542977, + "grad_norm": 1.226680874824524, + "learning_rate": 9.431093031420702e-06, + "loss": 0.7066, + "step": 250940 + }, + { + "epoch": 1.6032480226927155, + "grad_norm": 0.9295899271965027, + "learning_rate": 9.428160290378606e-06, + "loss": 0.7791, + "step": 250950 + }, + { + "epoch": 1.6033119098424544, + "grad_norm": 0.9173290133476257, + "learning_rate": 9.425227957930055e-06, + "loss": 0.7236, + "step": 250960 + }, + { + "epoch": 1.603375796992193, + "grad_norm": 1.2388783693313599, + "learning_rate": 9.422296034104544e-06, + "loss": 0.8935, + "step": 250970 + }, + { + "epoch": 1.6034396841419318, + "grad_norm": 1.0881764888763428, + "learning_rate": 9.419364518931633e-06, + "loss": 0.9492, + "step": 250980 + }, + { + "epoch": 1.6035035712916703, + "grad_norm": 1.2922098636627197, + "learning_rate": 9.416433412440812e-06, + "loss": 1.0869, + "step": 250990 + }, + { + "epoch": 1.6035674584414092, + "grad_norm": 1.176303744316101, + "learning_rate": 9.413502714661626e-06, + "loss": 0.9365, + "step": 251000 + }, + { + "epoch": 1.6036313455911477, + "grad_norm": 1.007675290107727, + "learning_rate": 9.41057242562357e-06, + "loss": 1.02, + "step": 251010 + }, + { + "epoch": 1.6036952327408867, + "grad_norm": 0.736621618270874, + "learning_rate": 9.407642545356182e-06, + "loss": 0.8256, + "step": 251020 + }, + { + "epoch": 1.6037591198906251, + "grad_norm": 0.718731164932251, + "learning_rate": 9.404713073888933e-06, + "loss": 
0.9013, + "step": 251030 + }, + { + "epoch": 1.6038230070403638, + "grad_norm": 0.7369511127471924, + "learning_rate": 9.401784011251357e-06, + "loss": 0.8759, + "step": 251040 + }, + { + "epoch": 1.6038868941901026, + "grad_norm": 1.6651430130004883, + "learning_rate": 9.398855357472924e-06, + "loss": 0.5682, + "step": 251050 + }, + { + "epoch": 1.6039507813398413, + "grad_norm": 1.296785831451416, + "learning_rate": 9.395927112583159e-06, + "loss": 1.0505, + "step": 251060 + }, + { + "epoch": 1.60401466848958, + "grad_norm": 0.8208275437355042, + "learning_rate": 9.392999276611537e-06, + "loss": 0.9532, + "step": 251070 + }, + { + "epoch": 1.6040785556393187, + "grad_norm": 0.9491279721260071, + "learning_rate": 9.390071849587523e-06, + "loss": 0.615, + "step": 251080 + }, + { + "epoch": 1.6041424427890574, + "grad_norm": 1.2566066980361938, + "learning_rate": 9.387144831540634e-06, + "loss": 0.7125, + "step": 251090 + }, + { + "epoch": 1.604206329938796, + "grad_norm": 1.3850597143173218, + "learning_rate": 9.384218222500324e-06, + "loss": 1.1517, + "step": 251100 + }, + { + "epoch": 1.6042702170885348, + "grad_norm": 0.8714288473129272, + "learning_rate": 9.38129202249608e-06, + "loss": 0.9129, + "step": 251110 + }, + { + "epoch": 1.6043341042382735, + "grad_norm": 1.3241082429885864, + "learning_rate": 9.378366231557356e-06, + "loss": 0.75, + "step": 251120 + }, + { + "epoch": 1.6043979913880122, + "grad_norm": 0.9380171895027161, + "learning_rate": 9.375440849713635e-06, + "loss": 0.8114, + "step": 251130 + }, + { + "epoch": 1.604461878537751, + "grad_norm": 1.197385549545288, + "learning_rate": 9.372515876994364e-06, + "loss": 0.7724, + "step": 251140 + }, + { + "epoch": 1.6045257656874896, + "grad_norm": 1.5692874193191528, + "learning_rate": 9.369591313429016e-06, + "loss": 0.6484, + "step": 251150 + }, + { + "epoch": 1.6045896528372283, + "grad_norm": 1.0515121221542358, + "learning_rate": 9.366667159047022e-06, + "loss": 1.0717, + "step": 251160 + }, + { 
+ "epoch": 1.604653539986967, + "grad_norm": 1.019773244857788, + "learning_rate": 9.363743413877851e-06, + "loss": 0.7979, + "step": 251170 + }, + { + "epoch": 1.6047174271367057, + "grad_norm": 0.8589492440223694, + "learning_rate": 9.360820077950928e-06, + "loss": 0.9352, + "step": 251180 + }, + { + "epoch": 1.6047813142864444, + "grad_norm": 0.8133339881896973, + "learning_rate": 9.35789715129572e-06, + "loss": 0.7797, + "step": 251190 + }, + { + "epoch": 1.6048452014361831, + "grad_norm": 0.7571247816085815, + "learning_rate": 9.354974633941633e-06, + "loss": 0.7098, + "step": 251200 + }, + { + "epoch": 1.6049090885859219, + "grad_norm": 0.987298309803009, + "learning_rate": 9.352052525918126e-06, + "loss": 0.8463, + "step": 251210 + }, + { + "epoch": 1.6049729757356606, + "grad_norm": 0.6550473570823669, + "learning_rate": 9.349130827254605e-06, + "loss": 0.7963, + "step": 251220 + }, + { + "epoch": 1.6050368628853993, + "grad_norm": 0.8027055263519287, + "learning_rate": 9.346209537980505e-06, + "loss": 0.8266, + "step": 251230 + }, + { + "epoch": 1.605100750035138, + "grad_norm": 1.0942720174789429, + "learning_rate": 9.34328865812526e-06, + "loss": 0.8502, + "step": 251240 + }, + { + "epoch": 1.6051646371848767, + "grad_norm": 1.5014644861221313, + "learning_rate": 9.340368187718256e-06, + "loss": 0.8505, + "step": 251250 + }, + { + "epoch": 1.6052285243346154, + "grad_norm": 0.6291337609291077, + "learning_rate": 9.337448126788927e-06, + "loss": 0.8417, + "step": 251260 + }, + { + "epoch": 1.605292411484354, + "grad_norm": 0.4529359042644501, + "learning_rate": 9.334528475366672e-06, + "loss": 1.0095, + "step": 251270 + }, + { + "epoch": 1.6053562986340928, + "grad_norm": 1.283196210861206, + "learning_rate": 9.331609233480898e-06, + "loss": 1.0089, + "step": 251280 + }, + { + "epoch": 1.6054201857838315, + "grad_norm": 1.5335444211959839, + "learning_rate": 9.328690401161e-06, + "loss": 0.9119, + "step": 251290 + }, + { + "epoch": 1.60548407293357, + 
"grad_norm": 1.4574180841445923, + "learning_rate": 9.325771978436382e-06, + "loss": 0.6515, + "step": 251300 + }, + { + "epoch": 1.605547960083309, + "grad_norm": 1.1264441013336182, + "learning_rate": 9.322853965336414e-06, + "loss": 1.1423, + "step": 251310 + }, + { + "epoch": 1.6056118472330474, + "grad_norm": 1.130892038345337, + "learning_rate": 9.319936361890514e-06, + "loss": 1.0827, + "step": 251320 + }, + { + "epoch": 1.6056757343827863, + "grad_norm": 1.047214388847351, + "learning_rate": 9.317019168128033e-06, + "loss": 0.7339, + "step": 251330 + }, + { + "epoch": 1.6057396215325248, + "grad_norm": 1.3307965993881226, + "learning_rate": 9.314102384078377e-06, + "loss": 0.6577, + "step": 251340 + }, + { + "epoch": 1.6058035086822637, + "grad_norm": 1.187129020690918, + "learning_rate": 9.311186009770901e-06, + "loss": 0.8338, + "step": 251350 + }, + { + "epoch": 1.6058673958320022, + "grad_norm": 2.011436700820923, + "learning_rate": 9.30827004523499e-06, + "loss": 1.0097, + "step": 251360 + }, + { + "epoch": 1.6059312829817411, + "grad_norm": 0.9869546294212341, + "learning_rate": 9.30535449049999e-06, + "loss": 0.9975, + "step": 251370 + }, + { + "epoch": 1.6059951701314796, + "grad_norm": 0.7743905186653137, + "learning_rate": 9.302439345595293e-06, + "loss": 0.8767, + "step": 251380 + }, + { + "epoch": 1.6060590572812186, + "grad_norm": 1.3220921754837036, + "learning_rate": 9.29952461055022e-06, + "loss": 0.8059, + "step": 251390 + }, + { + "epoch": 1.606122944430957, + "grad_norm": 1.1547019481658936, + "learning_rate": 9.296610285394164e-06, + "loss": 0.6196, + "step": 251400 + }, + { + "epoch": 1.606186831580696, + "grad_norm": 1.0041553974151611, + "learning_rate": 9.29369637015644e-06, + "loss": 0.8884, + "step": 251410 + }, + { + "epoch": 1.6062507187304345, + "grad_norm": 0.7838250994682312, + "learning_rate": 9.290782864866426e-06, + "loss": 1.0101, + "step": 251420 + }, + { + "epoch": 1.6063146058801734, + "grad_norm": 1.1419521570205688, + 
"learning_rate": 9.28786976955343e-06, + "loss": 0.8721, + "step": 251430 + }, + { + "epoch": 1.6063784930299119, + "grad_norm": 0.8693103194236755, + "learning_rate": 9.284957084246814e-06, + "loss": 1.0617, + "step": 251440 + }, + { + "epoch": 1.6064423801796508, + "grad_norm": 1.3104223012924194, + "learning_rate": 9.28204480897591e-06, + "loss": 1.0841, + "step": 251450 + }, + { + "epoch": 1.6065062673293893, + "grad_norm": 1.1727479696273804, + "learning_rate": 9.27913294377003e-06, + "loss": 1.0063, + "step": 251460 + }, + { + "epoch": 1.6065701544791282, + "grad_norm": 0.9226076602935791, + "learning_rate": 9.27622148865852e-06, + "loss": 0.9742, + "step": 251470 + }, + { + "epoch": 1.6066340416288667, + "grad_norm": 0.9547157883644104, + "learning_rate": 9.27331044367068e-06, + "loss": 0.8409, + "step": 251480 + }, + { + "epoch": 1.6066979287786056, + "grad_norm": 1.7603591680526733, + "learning_rate": 9.27039980883585e-06, + "loss": 1.0406, + "step": 251490 + }, + { + "epoch": 1.606761815928344, + "grad_norm": 0.7404820322990417, + "learning_rate": 9.26748958418332e-06, + "loss": 0.7947, + "step": 251500 + }, + { + "epoch": 1.606825703078083, + "grad_norm": 1.1479718685150146, + "learning_rate": 9.264579769742416e-06, + "loss": 0.705, + "step": 251510 + }, + { + "epoch": 1.6068895902278215, + "grad_norm": 0.8657917380332947, + "learning_rate": 9.261670365542424e-06, + "loss": 0.927, + "step": 251520 + }, + { + "epoch": 1.6069534773775602, + "grad_norm": 1.3002824783325195, + "learning_rate": 9.258761371612668e-06, + "loss": 0.9777, + "step": 251530 + }, + { + "epoch": 1.607017364527299, + "grad_norm": 0.9150441884994507, + "learning_rate": 9.255852787982422e-06, + "loss": 0.8219, + "step": 251540 + }, + { + "epoch": 1.6070812516770376, + "grad_norm": 0.8327670693397522, + "learning_rate": 9.252944614680998e-06, + "loss": 1.3298, + "step": 251550 + }, + { + "epoch": 1.6071451388267763, + "grad_norm": 0.8424513936042786, + "learning_rate": 
9.25003685173767e-06, + "loss": 0.6958, + "step": 251560 + }, + { + "epoch": 1.607209025976515, + "grad_norm": 0.8646684288978577, + "learning_rate": 9.247129499181711e-06, + "loss": 1.1269, + "step": 251570 + }, + { + "epoch": 1.6072729131262538, + "grad_norm": 0.9485357999801636, + "learning_rate": 9.244222557042431e-06, + "loss": 0.8929, + "step": 251580 + }, + { + "epoch": 1.6073368002759925, + "grad_norm": 0.7912691235542297, + "learning_rate": 9.241316025349073e-06, + "loss": 1.036, + "step": 251590 + }, + { + "epoch": 1.6074006874257312, + "grad_norm": 1.385108232498169, + "learning_rate": 9.238409904130941e-06, + "loss": 0.744, + "step": 251600 + }, + { + "epoch": 1.6074645745754699, + "grad_norm": 1.1460243463516235, + "learning_rate": 9.235504193417266e-06, + "loss": 1.1109, + "step": 251610 + }, + { + "epoch": 1.6075284617252086, + "grad_norm": 0.646547794342041, + "learning_rate": 9.23259889323735e-06, + "loss": 1.2331, + "step": 251620 + }, + { + "epoch": 1.6075923488749473, + "grad_norm": 1.1643790006637573, + "learning_rate": 9.229694003620415e-06, + "loss": 0.9083, + "step": 251630 + }, + { + "epoch": 1.607656236024686, + "grad_norm": 0.8957083821296692, + "learning_rate": 9.226789524595747e-06, + "loss": 0.6786, + "step": 251640 + }, + { + "epoch": 1.6077201231744247, + "grad_norm": 1.0356974601745605, + "learning_rate": 9.223885456192566e-06, + "loss": 0.9857, + "step": 251650 + }, + { + "epoch": 1.6077840103241634, + "grad_norm": 1.0247944593429565, + "learning_rate": 9.220981798440148e-06, + "loss": 0.7348, + "step": 251660 + }, + { + "epoch": 1.607847897473902, + "grad_norm": 0.7964886426925659, + "learning_rate": 9.218078551367715e-06, + "loss": 0.809, + "step": 251670 + }, + { + "epoch": 1.6079117846236408, + "grad_norm": 0.8133732676506042, + "learning_rate": 9.21517571500452e-06, + "loss": 0.7862, + "step": 251680 + }, + { + "epoch": 1.6079756717733795, + "grad_norm": 1.3863354921340942, + "learning_rate": 9.212563513458184e-06, + "loss": 
0.8938, + "step": 251690 + }, + { + "epoch": 1.6080395589231182, + "grad_norm": 0.682494580745697, + "learning_rate": 9.209661457523062e-06, + "loss": 0.709, + "step": 251700 + }, + { + "epoch": 1.608103446072857, + "grad_norm": 0.5610239505767822, + "learning_rate": 9.206759812381938e-06, + "loss": 0.6273, + "step": 251710 + }, + { + "epoch": 1.6081673332225956, + "grad_norm": 0.6274298429489136, + "learning_rate": 9.203858578064028e-06, + "loss": 0.7385, + "step": 251720 + }, + { + "epoch": 1.6082312203723343, + "grad_norm": 1.206041932106018, + "learning_rate": 9.20095775459856e-06, + "loss": 0.9551, + "step": 251730 + }, + { + "epoch": 1.608295107522073, + "grad_norm": 1.270914077758789, + "learning_rate": 9.198057342014738e-06, + "loss": 0.9709, + "step": 251740 + }, + { + "epoch": 1.6083589946718118, + "grad_norm": 0.9437939524650574, + "learning_rate": 9.195157340341787e-06, + "loss": 0.8078, + "step": 251750 + }, + { + "epoch": 1.6084228818215505, + "grad_norm": 1.5694162845611572, + "learning_rate": 9.19225774960889e-06, + "loss": 0.9904, + "step": 251760 + }, + { + "epoch": 1.608486768971289, + "grad_norm": 1.7031606435775757, + "learning_rate": 9.189358569845275e-06, + "loss": 0.972, + "step": 251770 + }, + { + "epoch": 1.6085506561210279, + "grad_norm": 2.132373094558716, + "learning_rate": 9.186459801080111e-06, + "loss": 0.8312, + "step": 251780 + }, + { + "epoch": 1.6086145432707664, + "grad_norm": 1.3549762964248657, + "learning_rate": 9.18356144334262e-06, + "loss": 1.0376, + "step": 251790 + }, + { + "epoch": 1.6086784304205053, + "grad_norm": 0.9008809328079224, + "learning_rate": 9.180663496661962e-06, + "loss": 0.8303, + "step": 251800 + }, + { + "epoch": 1.6087423175702438, + "grad_norm": 0.8437301516532898, + "learning_rate": 9.177765961067348e-06, + "loss": 0.7478, + "step": 251810 + }, + { + "epoch": 1.6088062047199827, + "grad_norm": 1.0403225421905518, + "learning_rate": 9.174868836587942e-06, + "loss": 0.9228, + "step": 251820 + }, + { + 
"epoch": 1.6088700918697212, + "grad_norm": 1.1173384189605713, + "learning_rate": 9.171972123252931e-06, + "loss": 0.7614, + "step": 251830 + }, + { + "epoch": 1.6089339790194601, + "grad_norm": 0.7167786955833435, + "learning_rate": 9.169075821091472e-06, + "loss": 0.951, + "step": 251840 + }, + { + "epoch": 1.6089978661691986, + "grad_norm": 0.8369988203048706, + "learning_rate": 9.16617993013275e-06, + "loss": 1.0164, + "step": 251850 + }, + { + "epoch": 1.6090617533189375, + "grad_norm": 1.0024378299713135, + "learning_rate": 9.163284450405918e-06, + "loss": 0.7347, + "step": 251860 + }, + { + "epoch": 1.609125640468676, + "grad_norm": 0.9501249194145203, + "learning_rate": 9.160389381940137e-06, + "loss": 0.6731, + "step": 251870 + }, + { + "epoch": 1.609189527618415, + "grad_norm": 0.7685287594795227, + "learning_rate": 9.157494724764577e-06, + "loss": 1.06, + "step": 251880 + }, + { + "epoch": 1.6092534147681534, + "grad_norm": 0.8647333979606628, + "learning_rate": 9.154600478908365e-06, + "loss": 0.803, + "step": 251890 + }, + { + "epoch": 1.6093173019178924, + "grad_norm": 0.7192227244377136, + "learning_rate": 9.151706644400681e-06, + "loss": 0.949, + "step": 251900 + }, + { + "epoch": 1.6093811890676308, + "grad_norm": 0.7441559433937073, + "learning_rate": 9.148813221270635e-06, + "loss": 0.9322, + "step": 251910 + }, + { + "epoch": 1.6094450762173698, + "grad_norm": 2.0405662059783936, + "learning_rate": 9.145920209547392e-06, + "loss": 1.0214, + "step": 251920 + }, + { + "epoch": 1.6095089633671082, + "grad_norm": 1.1048287153244019, + "learning_rate": 9.143027609260063e-06, + "loss": 0.7865, + "step": 251930 + }, + { + "epoch": 1.6095728505168472, + "grad_norm": 0.8086538910865784, + "learning_rate": 9.140135420437812e-06, + "loss": 0.7849, + "step": 251940 + }, + { + "epoch": 1.6096367376665857, + "grad_norm": 1.294543743133545, + "learning_rate": 9.137243643109722e-06, + "loss": 0.9705, + "step": 251950 + }, + { + "epoch": 1.6097006248163246, + 
"grad_norm": 1.7661151885986328, + "learning_rate": 9.134352277304964e-06, + "loss": 0.744, + "step": 251960 + }, + { + "epoch": 1.609764511966063, + "grad_norm": 0.9390109777450562, + "learning_rate": 9.131461323052615e-06, + "loss": 0.7945, + "step": 251970 + }, + { + "epoch": 1.609828399115802, + "grad_norm": 0.8471163511276245, + "learning_rate": 9.128570780381817e-06, + "loss": 0.7913, + "step": 251980 + }, + { + "epoch": 1.6098922862655405, + "grad_norm": 0.7456984519958496, + "learning_rate": 9.125680649321661e-06, + "loss": 0.8629, + "step": 251990 + }, + { + "epoch": 1.6099561734152794, + "grad_norm": 0.7650848627090454, + "learning_rate": 9.12279092990127e-06, + "loss": 0.8887, + "step": 252000 + }, + { + "epoch": 1.610020060565018, + "grad_norm": 0.687843382358551, + "learning_rate": 9.11990162214973e-06, + "loss": 0.6964, + "step": 252010 + }, + { + "epoch": 1.6100839477147566, + "grad_norm": 1.4888943433761597, + "learning_rate": 9.117012726096158e-06, + "loss": 1.0631, + "step": 252020 + }, + { + "epoch": 1.6101478348644953, + "grad_norm": 0.8628354072570801, + "learning_rate": 9.114124241769622e-06, + "loss": 0.8178, + "step": 252030 + }, + { + "epoch": 1.610211722014234, + "grad_norm": 0.6701124906539917, + "learning_rate": 9.111236169199245e-06, + "loss": 0.6124, + "step": 252040 + }, + { + "epoch": 1.6102756091639727, + "grad_norm": 0.9853718280792236, + "learning_rate": 9.108348508414078e-06, + "loss": 0.7542, + "step": 252050 + }, + { + "epoch": 1.6103394963137114, + "grad_norm": 0.8621380925178528, + "learning_rate": 9.105461259443227e-06, + "loss": 0.6891, + "step": 252060 + }, + { + "epoch": 1.6104033834634501, + "grad_norm": 1.3660041093826294, + "learning_rate": 9.102574422315752e-06, + "loss": 0.7933, + "step": 252070 + }, + { + "epoch": 1.6104672706131888, + "grad_norm": 1.1363906860351562, + "learning_rate": 9.09968799706073e-06, + "loss": 0.9381, + "step": 252080 + }, + { + "epoch": 1.6105311577629275, + "grad_norm": 1.0103809833526611, 
+ "learning_rate": 9.096801983707248e-06, + "loss": 0.9705, + "step": 252090 + }, + { + "epoch": 1.6105950449126663, + "grad_norm": 1.2093793153762817, + "learning_rate": 9.093916382284346e-06, + "loss": 0.7687, + "step": 252100 + }, + { + "epoch": 1.610658932062405, + "grad_norm": 0.9786253571510315, + "learning_rate": 9.091031192821104e-06, + "loss": 1.1084, + "step": 252110 + }, + { + "epoch": 1.6107228192121437, + "grad_norm": 0.9100025296211243, + "learning_rate": 9.088146415346554e-06, + "loss": 0.804, + "step": 252120 + }, + { + "epoch": 1.6107867063618824, + "grad_norm": 4.617587089538574, + "learning_rate": 9.085262049889782e-06, + "loss": 0.8656, + "step": 252130 + }, + { + "epoch": 1.610850593511621, + "grad_norm": 1.6649965047836304, + "learning_rate": 9.082378096479805e-06, + "loss": 1.04, + "step": 252140 + }, + { + "epoch": 1.6109144806613598, + "grad_norm": 3.090346097946167, + "learning_rate": 9.079494555145684e-06, + "loss": 1.0039, + "step": 252150 + }, + { + "epoch": 1.6109783678110985, + "grad_norm": 0.8761329650878906, + "learning_rate": 9.076611425916449e-06, + "loss": 0.7212, + "step": 252160 + }, + { + "epoch": 1.6110422549608372, + "grad_norm": 0.6641421318054199, + "learning_rate": 9.07372870882115e-06, + "loss": 0.8803, + "step": 252170 + }, + { + "epoch": 1.611106142110576, + "grad_norm": 1.0108219385147095, + "learning_rate": 9.0708464038888e-06, + "loss": 0.883, + "step": 252180 + }, + { + "epoch": 1.6111700292603146, + "grad_norm": 0.725486695766449, + "learning_rate": 9.067964511148458e-06, + "loss": 0.9893, + "step": 252190 + }, + { + "epoch": 1.6112339164100533, + "grad_norm": 0.5850199460983276, + "learning_rate": 9.065083030629102e-06, + "loss": 0.7496, + "step": 252200 + }, + { + "epoch": 1.611297803559792, + "grad_norm": 0.5066903829574585, + "learning_rate": 9.062201962359773e-06, + "loss": 0.6822, + "step": 252210 + }, + { + "epoch": 1.6113616907095307, + "grad_norm": 1.204042673110962, + "learning_rate": 
9.059321306369495e-06, + "loss": 0.8696, + "step": 252220 + }, + { + "epoch": 1.6114255778592694, + "grad_norm": 0.8453181385993958, + "learning_rate": 9.056441062687259e-06, + "loss": 0.7135, + "step": 252230 + }, + { + "epoch": 1.6114894650090081, + "grad_norm": 0.9459882378578186, + "learning_rate": 9.0535612313421e-06, + "loss": 0.645, + "step": 252240 + }, + { + "epoch": 1.6115533521587468, + "grad_norm": 0.8971336483955383, + "learning_rate": 9.050681812362982e-06, + "loss": 0.9969, + "step": 252250 + }, + { + "epoch": 1.6116172393084853, + "grad_norm": 0.8886403441429138, + "learning_rate": 9.047802805778948e-06, + "loss": 0.8814, + "step": 252260 + }, + { + "epoch": 1.6116811264582243, + "grad_norm": 1.397530436515808, + "learning_rate": 9.044924211618948e-06, + "loss": 0.8373, + "step": 252270 + }, + { + "epoch": 1.6117450136079627, + "grad_norm": 0.8331180214881897, + "learning_rate": 9.042046029912005e-06, + "loss": 0.7237, + "step": 252280 + }, + { + "epoch": 1.6118089007577017, + "grad_norm": 0.912804365158081, + "learning_rate": 9.03916826068708e-06, + "loss": 0.9643, + "step": 252290 + }, + { + "epoch": 1.6118727879074402, + "grad_norm": 0.8552295565605164, + "learning_rate": 9.036290903973183e-06, + "loss": 1.1771, + "step": 252300 + }, + { + "epoch": 1.611936675057179, + "grad_norm": 1.0347743034362793, + "learning_rate": 9.033413959799258e-06, + "loss": 0.842, + "step": 252310 + }, + { + "epoch": 1.6120005622069176, + "grad_norm": 0.9951327443122864, + "learning_rate": 9.030537428194314e-06, + "loss": 0.7409, + "step": 252320 + }, + { + "epoch": 1.6120644493566565, + "grad_norm": 0.7869225144386292, + "learning_rate": 9.027661309187285e-06, + "loss": 0.8458, + "step": 252330 + }, + { + "epoch": 1.612128336506395, + "grad_norm": 0.9086364507675171, + "learning_rate": 9.024785602807168e-06, + "loss": 0.7975, + "step": 252340 + }, + { + "epoch": 1.612192223656134, + "grad_norm": 0.7900458574295044, + "learning_rate": 9.021910309082893e-06, + "loss": 
0.7442, + "step": 252350 + }, + { + "epoch": 1.6122561108058724, + "grad_norm": 0.7535526752471924, + "learning_rate": 9.019035428043443e-06, + "loss": 0.8918, + "step": 252360 + }, + { + "epoch": 1.6123199979556113, + "grad_norm": 0.651496410369873, + "learning_rate": 9.01616095971775e-06, + "loss": 1.081, + "step": 252370 + }, + { + "epoch": 1.6123838851053498, + "grad_norm": 1.4786691665649414, + "learning_rate": 9.013286904134788e-06, + "loss": 0.8141, + "step": 252380 + }, + { + "epoch": 1.6124477722550887, + "grad_norm": 0.8722392320632935, + "learning_rate": 9.010413261323469e-06, + "loss": 0.7453, + "step": 252390 + }, + { + "epoch": 1.6125116594048272, + "grad_norm": 0.6708277463912964, + "learning_rate": 9.00754003131275e-06, + "loss": 1.1094, + "step": 252400 + }, + { + "epoch": 1.6125755465545661, + "grad_norm": 1.2941380739212036, + "learning_rate": 9.00466721413158e-06, + "loss": 0.8299, + "step": 252410 + }, + { + "epoch": 1.6126394337043046, + "grad_norm": 1.2671501636505127, + "learning_rate": 9.00179480980886e-06, + "loss": 0.7799, + "step": 252420 + }, + { + "epoch": 1.6127033208540436, + "grad_norm": 0.8946260213851929, + "learning_rate": 8.998922818373557e-06, + "loss": 0.971, + "step": 252430 + }, + { + "epoch": 1.612767208003782, + "grad_norm": 1.0225296020507812, + "learning_rate": 8.996051239854553e-06, + "loss": 1.3356, + "step": 252440 + }, + { + "epoch": 1.612831095153521, + "grad_norm": 1.6290887594223022, + "learning_rate": 8.993180074280799e-06, + "loss": 0.6761, + "step": 252450 + }, + { + "epoch": 1.6128949823032595, + "grad_norm": 1.9371238946914673, + "learning_rate": 8.990309321681184e-06, + "loss": 1.1749, + "step": 252460 + }, + { + "epoch": 1.6129588694529984, + "grad_norm": 0.8040488362312317, + "learning_rate": 8.98743898208465e-06, + "loss": 0.9965, + "step": 252470 + }, + { + "epoch": 1.6130227566027369, + "grad_norm": 0.9329996705055237, + "learning_rate": 8.98456905552007e-06, + "loss": 0.7557, + "step": 252480 + }, + { 
+ "epoch": 1.6130866437524758, + "grad_norm": 0.7548239231109619, + "learning_rate": 8.981699542016376e-06, + "loss": 0.9107, + "step": 252490 + }, + { + "epoch": 1.6131505309022143, + "grad_norm": 1.1711938381195068, + "learning_rate": 8.978830441602443e-06, + "loss": 0.9201, + "step": 252500 + }, + { + "epoch": 1.613214418051953, + "grad_norm": 1.1052647829055786, + "learning_rate": 8.975961754307183e-06, + "loss": 0.7716, + "step": 252510 + }, + { + "epoch": 1.6132783052016917, + "grad_norm": 0.6817499399185181, + "learning_rate": 8.973093480159472e-06, + "loss": 0.8824, + "step": 252520 + }, + { + "epoch": 1.6133421923514304, + "grad_norm": 1.1585290431976318, + "learning_rate": 8.970225619188216e-06, + "loss": 0.9834, + "step": 252530 + }, + { + "epoch": 1.613406079501169, + "grad_norm": 0.9899934530258179, + "learning_rate": 8.967358171422269e-06, + "loss": 0.8478, + "step": 252540 + }, + { + "epoch": 1.6134699666509078, + "grad_norm": 0.9996746778488159, + "learning_rate": 8.964491136890535e-06, + "loss": 0.6712, + "step": 252550 + }, + { + "epoch": 1.6135338538006465, + "grad_norm": 1.0687540769577026, + "learning_rate": 8.961624515621864e-06, + "loss": 0.8051, + "step": 252560 + }, + { + "epoch": 1.6135977409503852, + "grad_norm": 0.8011711239814758, + "learning_rate": 8.958758307645148e-06, + "loss": 0.7865, + "step": 252570 + }, + { + "epoch": 1.613661628100124, + "grad_norm": 0.9394097924232483, + "learning_rate": 8.955892512989233e-06, + "loss": 0.6621, + "step": 252580 + }, + { + "epoch": 1.6137255152498626, + "grad_norm": 1.0367693901062012, + "learning_rate": 8.953027131683e-06, + "loss": 0.9656, + "step": 252590 + }, + { + "epoch": 1.6137894023996013, + "grad_norm": 1.1489052772521973, + "learning_rate": 8.950162163755283e-06, + "loss": 0.8708, + "step": 252600 + }, + { + "epoch": 1.61385328954934, + "grad_norm": 2.150038003921509, + "learning_rate": 8.947297609234944e-06, + "loss": 0.8722, + "step": 252610 + }, + { + "epoch": 1.6139171766990787, + 
"grad_norm": 0.9468705058097839, + "learning_rate": 8.944433468150848e-06, + "loss": 1.1167, + "step": 252620 + }, + { + "epoch": 1.6139810638488175, + "grad_norm": 0.9067215323448181, + "learning_rate": 8.941569740531808e-06, + "loss": 0.876, + "step": 252630 + }, + { + "epoch": 1.6140449509985562, + "grad_norm": 0.7629149556159973, + "learning_rate": 8.938706426406702e-06, + "loss": 0.8063, + "step": 252640 + }, + { + "epoch": 1.6141088381482949, + "grad_norm": 1.2172737121582031, + "learning_rate": 8.935843525804328e-06, + "loss": 0.8134, + "step": 252650 + }, + { + "epoch": 1.6141727252980336, + "grad_norm": 1.2737256288528442, + "learning_rate": 8.932981038753547e-06, + "loss": 0.714, + "step": 252660 + }, + { + "epoch": 1.6142366124477723, + "grad_norm": 0.5625755190849304, + "learning_rate": 8.930118965283174e-06, + "loss": 0.6967, + "step": 252670 + }, + { + "epoch": 1.614300499597511, + "grad_norm": 1.1674059629440308, + "learning_rate": 8.927257305422038e-06, + "loss": 0.9341, + "step": 252680 + }, + { + "epoch": 1.6143643867472497, + "grad_norm": 1.0258337259292603, + "learning_rate": 8.924396059198936e-06, + "loss": 1.0119, + "step": 252690 + }, + { + "epoch": 1.6144282738969884, + "grad_norm": 1.1566420793533325, + "learning_rate": 8.921535226642718e-06, + "loss": 0.9388, + "step": 252700 + }, + { + "epoch": 1.614492161046727, + "grad_norm": 0.6528016328811646, + "learning_rate": 8.918674807782163e-06, + "loss": 0.9135, + "step": 252710 + }, + { + "epoch": 1.6145560481964658, + "grad_norm": 1.0167995691299438, + "learning_rate": 8.915814802646093e-06, + "loss": 0.9045, + "step": 252720 + }, + { + "epoch": 1.6146199353462045, + "grad_norm": 1.4415538311004639, + "learning_rate": 8.912955211263323e-06, + "loss": 0.971, + "step": 252730 + }, + { + "epoch": 1.6146838224959432, + "grad_norm": 1.1365299224853516, + "learning_rate": 8.91009603366263e-06, + "loss": 1.3696, + "step": 252740 + }, + { + "epoch": 1.6147477096456817, + "grad_norm": 
0.9816776514053345, + "learning_rate": 8.907237269872831e-06, + "loss": 0.7537, + "step": 252750 + }, + { + "epoch": 1.6148115967954206, + "grad_norm": 0.7047487497329712, + "learning_rate": 8.904378919922684e-06, + "loss": 1.2247, + "step": 252760 + }, + { + "epoch": 1.6148754839451591, + "grad_norm": 1.297702670097351, + "learning_rate": 8.901520983841017e-06, + "loss": 0.9188, + "step": 252770 + }, + { + "epoch": 1.614939371094898, + "grad_norm": 0.8933472037315369, + "learning_rate": 8.89866346165657e-06, + "loss": 0.8977, + "step": 252780 + }, + { + "epoch": 1.6150032582446365, + "grad_norm": 0.910576343536377, + "learning_rate": 8.895806353398151e-06, + "loss": 0.8402, + "step": 252790 + }, + { + "epoch": 1.6150671453943755, + "grad_norm": 0.6891723871231079, + "learning_rate": 8.892949659094513e-06, + "loss": 1.0497, + "step": 252800 + }, + { + "epoch": 1.615131032544114, + "grad_norm": 0.7324727177619934, + "learning_rate": 8.890093378774439e-06, + "loss": 0.8316, + "step": 252810 + }, + { + "epoch": 1.6151949196938529, + "grad_norm": 1.0393435955047607, + "learning_rate": 8.887237512466685e-06, + "loss": 1.2601, + "step": 252820 + }, + { + "epoch": 1.6152588068435914, + "grad_norm": 0.8941654562950134, + "learning_rate": 8.88438206020003e-06, + "loss": 1.1747, + "step": 252830 + }, + { + "epoch": 1.6153226939933303, + "grad_norm": 0.872191846370697, + "learning_rate": 8.8815270220032e-06, + "loss": 0.6791, + "step": 252840 + }, + { + "epoch": 1.6153865811430688, + "grad_norm": 0.8596194386482239, + "learning_rate": 8.878672397904986e-06, + "loss": 0.7369, + "step": 252850 + }, + { + "epoch": 1.6154504682928077, + "grad_norm": 1.0066167116165161, + "learning_rate": 8.875818187934098e-06, + "loss": 0.6986, + "step": 252860 + }, + { + "epoch": 1.6155143554425462, + "grad_norm": 0.8400135040283203, + "learning_rate": 8.87296439211931e-06, + "loss": 0.8633, + "step": 252870 + }, + { + "epoch": 1.615578242592285, + "grad_norm": 1.0368555784225464, + 
"learning_rate": 8.870111010489341e-06, + "loss": 1.041, + "step": 252880 + }, + { + "epoch": 1.6156421297420236, + "grad_norm": 0.6796692609786987, + "learning_rate": 8.867258043072946e-06, + "loss": 0.9529, + "step": 252890 + }, + { + "epoch": 1.6157060168917625, + "grad_norm": 0.8098213076591492, + "learning_rate": 8.864405489898837e-06, + "loss": 0.9878, + "step": 252900 + }, + { + "epoch": 1.615769904041501, + "grad_norm": 1.901564121246338, + "learning_rate": 8.86155335099576e-06, + "loss": 0.7924, + "step": 252910 + }, + { + "epoch": 1.61583379119124, + "grad_norm": 0.9435287117958069, + "learning_rate": 8.858701626392425e-06, + "loss": 1.1102, + "step": 252920 + }, + { + "epoch": 1.6158976783409784, + "grad_norm": 1.3926451206207275, + "learning_rate": 8.855850316117554e-06, + "loss": 1.0823, + "step": 252930 + }, + { + "epoch": 1.6159615654907173, + "grad_norm": 0.9484646916389465, + "learning_rate": 8.852999420199876e-06, + "loss": 1.1486, + "step": 252940 + }, + { + "epoch": 1.6160254526404558, + "grad_norm": 1.0296976566314697, + "learning_rate": 8.85014893866808e-06, + "loss": 1.0762, + "step": 252950 + }, + { + "epoch": 1.6160893397901948, + "grad_norm": 0.8114399909973145, + "learning_rate": 8.847298871550896e-06, + "loss": 1.0435, + "step": 252960 + }, + { + "epoch": 1.6161532269399332, + "grad_norm": 1.1355493068695068, + "learning_rate": 8.844449218877005e-06, + "loss": 0.9153, + "step": 252970 + }, + { + "epoch": 1.6162171140896722, + "grad_norm": 0.7851929068565369, + "learning_rate": 8.841599980675125e-06, + "loss": 0.6855, + "step": 252980 + }, + { + "epoch": 1.6162810012394107, + "grad_norm": 1.2407293319702148, + "learning_rate": 8.838751156973923e-06, + "loss": 0.9023, + "step": 252990 + }, + { + "epoch": 1.6163448883891494, + "grad_norm": 1.2262955904006958, + "learning_rate": 8.835902747802128e-06, + "loss": 1.0251, + "step": 253000 + }, + { + "epoch": 1.616408775538888, + "grad_norm": 1.3506430387496948, + "learning_rate": 
8.833054753188386e-06, + "loss": 0.8255, + "step": 253010 + }, + { + "epoch": 1.6164726626886268, + "grad_norm": 0.7670350670814514, + "learning_rate": 8.830207173161408e-06, + "loss": 0.7006, + "step": 253020 + }, + { + "epoch": 1.6165365498383655, + "grad_norm": 0.9348157644271851, + "learning_rate": 8.827360007749852e-06, + "loss": 0.8525, + "step": 253030 + }, + { + "epoch": 1.6166004369881042, + "grad_norm": 1.010741949081421, + "learning_rate": 8.824513256982414e-06, + "loss": 0.9768, + "step": 253040 + }, + { + "epoch": 1.616664324137843, + "grad_norm": 0.9472290277481079, + "learning_rate": 8.821666920887733e-06, + "loss": 0.846, + "step": 253050 + }, + { + "epoch": 1.6167282112875816, + "grad_norm": 1.4080737829208374, + "learning_rate": 8.818820999494504e-06, + "loss": 1.032, + "step": 253060 + }, + { + "epoch": 1.6167920984373203, + "grad_norm": 0.8792275190353394, + "learning_rate": 8.815975492831363e-06, + "loss": 0.8813, + "step": 253070 + }, + { + "epoch": 1.616855985587059, + "grad_norm": 2.1530280113220215, + "learning_rate": 8.813130400926988e-06, + "loss": 0.8588, + "step": 253080 + }, + { + "epoch": 1.6169198727367977, + "grad_norm": 0.9053194522857666, + "learning_rate": 8.810285723810013e-06, + "loss": 0.7985, + "step": 253090 + }, + { + "epoch": 1.6169837598865364, + "grad_norm": 0.6816472411155701, + "learning_rate": 8.807441461509108e-06, + "loss": 1.087, + "step": 253100 + }, + { + "epoch": 1.6170476470362751, + "grad_norm": 1.0651800632476807, + "learning_rate": 8.804597614052885e-06, + "loss": 0.908, + "step": 253110 + }, + { + "epoch": 1.6171115341860138, + "grad_norm": 1.0918779373168945, + "learning_rate": 8.801754181470023e-06, + "loss": 1.0176, + "step": 253120 + }, + { + "epoch": 1.6171754213357525, + "grad_norm": 2.096322774887085, + "learning_rate": 8.79891116378912e-06, + "loss": 0.938, + "step": 253130 + }, + { + "epoch": 1.6172393084854912, + "grad_norm": 1.3487486839294434, + "learning_rate": 8.796068561038828e-06, + "loss": 
0.9764, + "step": 253140 + }, + { + "epoch": 1.61730319563523, + "grad_norm": 0.8517560362815857, + "learning_rate": 8.793226373247787e-06, + "loss": 0.828, + "step": 253150 + }, + { + "epoch": 1.6173670827849687, + "grad_norm": 0.764380931854248, + "learning_rate": 8.790384600444601e-06, + "loss": 0.8548, + "step": 253160 + }, + { + "epoch": 1.6174309699347074, + "grad_norm": 1.2008692026138306, + "learning_rate": 8.787543242657891e-06, + "loss": 0.9342, + "step": 253170 + }, + { + "epoch": 1.617494857084446, + "grad_norm": 0.9357091188430786, + "learning_rate": 8.78470229991627e-06, + "loss": 0.6565, + "step": 253180 + }, + { + "epoch": 1.6175587442341848, + "grad_norm": 1.0450602769851685, + "learning_rate": 8.781861772248362e-06, + "loss": 0.8956, + "step": 253190 + }, + { + "epoch": 1.6176226313839235, + "grad_norm": 0.9648509621620178, + "learning_rate": 8.779021659682752e-06, + "loss": 1.0464, + "step": 253200 + }, + { + "epoch": 1.6176865185336622, + "grad_norm": 0.9442372918128967, + "learning_rate": 8.776181962248065e-06, + "loss": 0.9629, + "step": 253210 + }, + { + "epoch": 1.617750405683401, + "grad_norm": 1.1622222661972046, + "learning_rate": 8.773342679972879e-06, + "loss": 1.0001, + "step": 253220 + }, + { + "epoch": 1.6178142928331396, + "grad_norm": 0.7686667442321777, + "learning_rate": 8.77050381288581e-06, + "loss": 0.9153, + "step": 253230 + }, + { + "epoch": 1.617878179982878, + "grad_norm": 0.7704346179962158, + "learning_rate": 8.767665361015425e-06, + "loss": 0.7974, + "step": 253240 + }, + { + "epoch": 1.617942067132617, + "grad_norm": 0.8347066640853882, + "learning_rate": 8.764827324390324e-06, + "loss": 1.2218, + "step": 253250 + }, + { + "epoch": 1.6180059542823555, + "grad_norm": 3.1967313289642334, + "learning_rate": 8.761989703039091e-06, + "loss": 0.9429, + "step": 253260 + }, + { + "epoch": 1.6180698414320944, + "grad_norm": 0.7391064763069153, + "learning_rate": 8.759152496990291e-06, + "loss": 0.8333, + "step": 253270 + }, + { 
+ "epoch": 1.618133728581833, + "grad_norm": 1.1434670686721802, + "learning_rate": 8.756315706272516e-06, + "loss": 0.885, + "step": 253280 + }, + { + "epoch": 1.6181976157315718, + "grad_norm": 1.1375341415405273, + "learning_rate": 8.753479330914305e-06, + "loss": 0.9347, + "step": 253290 + }, + { + "epoch": 1.6182615028813103, + "grad_norm": 0.8434621691703796, + "learning_rate": 8.750643370944256e-06, + "loss": 0.7543, + "step": 253300 + }, + { + "epoch": 1.6183253900310492, + "grad_norm": 1.2263952493667603, + "learning_rate": 8.747807826390902e-06, + "loss": 0.7379, + "step": 253310 + }, + { + "epoch": 1.6183892771807877, + "grad_norm": 0.9305620789527893, + "learning_rate": 8.744972697282821e-06, + "loss": 0.8767, + "step": 253320 + }, + { + "epoch": 1.6184531643305267, + "grad_norm": 0.9405209422111511, + "learning_rate": 8.742137983648552e-06, + "loss": 0.9911, + "step": 253330 + }, + { + "epoch": 1.6185170514802651, + "grad_norm": 0.964889407157898, + "learning_rate": 8.739303685516647e-06, + "loss": 0.9723, + "step": 253340 + }, + { + "epoch": 1.618580938630004, + "grad_norm": 1.0919055938720703, + "learning_rate": 8.736469802915648e-06, + "loss": 0.8663, + "step": 253350 + }, + { + "epoch": 1.6186448257797426, + "grad_norm": 0.9384715557098389, + "learning_rate": 8.7336363358741e-06, + "loss": 0.9946, + "step": 253360 + }, + { + "epoch": 1.6187087129294815, + "grad_norm": 0.8242424726486206, + "learning_rate": 8.730803284420524e-06, + "loss": 1.0228, + "step": 253370 + }, + { + "epoch": 1.61877260007922, + "grad_norm": 1.8711493015289307, + "learning_rate": 8.727970648583478e-06, + "loss": 0.943, + "step": 253380 + }, + { + "epoch": 1.618836487228959, + "grad_norm": 1.430071234703064, + "learning_rate": 8.725138428391461e-06, + "loss": 0.8167, + "step": 253390 + }, + { + "epoch": 1.6189003743786974, + "grad_norm": 0.7001949548721313, + "learning_rate": 8.722306623873016e-06, + "loss": 0.7138, + "step": 253400 + }, + { + "epoch": 1.6189642615284363, + 
"grad_norm": 0.9516452550888062, + "learning_rate": 8.71947523505664e-06, + "loss": 0.8375, + "step": 253410 + }, + { + "epoch": 1.6190281486781748, + "grad_norm": 0.8716029524803162, + "learning_rate": 8.71664426197088e-06, + "loss": 0.7683, + "step": 253420 + }, + { + "epoch": 1.6190920358279137, + "grad_norm": 1.038964033126831, + "learning_rate": 8.713813704644208e-06, + "loss": 1.0598, + "step": 253430 + }, + { + "epoch": 1.6191559229776522, + "grad_norm": 0.9941467046737671, + "learning_rate": 8.71098356310517e-06, + "loss": 0.7832, + "step": 253440 + }, + { + "epoch": 1.6192198101273911, + "grad_norm": 1.290126085281372, + "learning_rate": 8.708153837382227e-06, + "loss": 0.9266, + "step": 253450 + }, + { + "epoch": 1.6192836972771296, + "grad_norm": 0.6855711936950684, + "learning_rate": 8.7053245275039e-06, + "loss": 1.1036, + "step": 253460 + }, + { + "epoch": 1.6193475844268683, + "grad_norm": 0.8235642910003662, + "learning_rate": 8.702495633498697e-06, + "loss": 0.8896, + "step": 253470 + }, + { + "epoch": 1.619411471576607, + "grad_norm": 0.8104490637779236, + "learning_rate": 8.699667155395074e-06, + "loss": 0.7131, + "step": 253480 + }, + { + "epoch": 1.6194753587263457, + "grad_norm": 0.9141734838485718, + "learning_rate": 8.696839093221542e-06, + "loss": 0.6927, + "step": 253490 + }, + { + "epoch": 1.6195392458760844, + "grad_norm": 0.9874205589294434, + "learning_rate": 8.694011447006568e-06, + "loss": 0.8266, + "step": 253500 + }, + { + "epoch": 1.6196031330258231, + "grad_norm": 2.374424457550049, + "learning_rate": 8.691184216778642e-06, + "loss": 0.7966, + "step": 253510 + }, + { + "epoch": 1.6196670201755619, + "grad_norm": 1.5083518028259277, + "learning_rate": 8.68835740256621e-06, + "loss": 1.0701, + "step": 253520 + }, + { + "epoch": 1.6197309073253006, + "grad_norm": 0.8272247314453125, + "learning_rate": 8.68553100439778e-06, + "loss": 0.7651, + "step": 253530 + }, + { + "epoch": 1.6197947944750393, + "grad_norm": 1.3498271703720093, + 
"learning_rate": 8.682705022301779e-06, + "loss": 0.8375, + "step": 253540 + }, + { + "epoch": 1.619858681624778, + "grad_norm": 0.7562444806098938, + "learning_rate": 8.679879456306695e-06, + "loss": 0.952, + "step": 253550 + }, + { + "epoch": 1.6199225687745167, + "grad_norm": 0.9159499406814575, + "learning_rate": 8.677054306440956e-06, + "loss": 0.9591, + "step": 253560 + }, + { + "epoch": 1.6199864559242554, + "grad_norm": 0.5267454385757446, + "learning_rate": 8.67422957273305e-06, + "loss": 0.7717, + "step": 253570 + }, + { + "epoch": 1.620050343073994, + "grad_norm": 1.2697906494140625, + "learning_rate": 8.671405255211384e-06, + "loss": 1.0003, + "step": 253580 + }, + { + "epoch": 1.6201142302237328, + "grad_norm": 0.8089349865913391, + "learning_rate": 8.668581353904436e-06, + "loss": 0.6609, + "step": 253590 + }, + { + "epoch": 1.6201781173734715, + "grad_norm": 0.5568481087684631, + "learning_rate": 8.665757868840624e-06, + "loss": 0.6861, + "step": 253600 + }, + { + "epoch": 1.6202420045232102, + "grad_norm": 1.1049556732177734, + "learning_rate": 8.662934800048395e-06, + "loss": 0.7879, + "step": 253610 + }, + { + "epoch": 1.620305891672949, + "grad_norm": 1.5867961645126343, + "learning_rate": 8.660112147556165e-06, + "loss": 0.5971, + "step": 253620 + }, + { + "epoch": 1.6203697788226876, + "grad_norm": 0.8453254103660583, + "learning_rate": 8.65728991139238e-06, + "loss": 0.7813, + "step": 253630 + }, + { + "epoch": 1.6204336659724263, + "grad_norm": 0.6466057896614075, + "learning_rate": 8.654468091585455e-06, + "loss": 0.6963, + "step": 253640 + }, + { + "epoch": 1.620497553122165, + "grad_norm": 0.8569933176040649, + "learning_rate": 8.65164668816379e-06, + "loss": 0.9115, + "step": 253650 + }, + { + "epoch": 1.6205614402719037, + "grad_norm": 1.1944561004638672, + "learning_rate": 8.648825701155828e-06, + "loss": 0.8411, + "step": 253660 + }, + { + "epoch": 1.6206253274216424, + "grad_norm": 1.0112055540084839, + "learning_rate": 
8.646005130589951e-06, + "loss": 0.7752, + "step": 253670 + }, + { + "epoch": 1.6206892145713812, + "grad_norm": 0.755969762802124, + "learning_rate": 8.643184976494595e-06, + "loss": 0.918, + "step": 253680 + }, + { + "epoch": 1.6207531017211199, + "grad_norm": 1.1352064609527588, + "learning_rate": 8.64036523889813e-06, + "loss": 0.7852, + "step": 253690 + }, + { + "epoch": 1.6208169888708586, + "grad_norm": 1.1978332996368408, + "learning_rate": 8.637545917828977e-06, + "loss": 0.9255, + "step": 253700 + }, + { + "epoch": 1.6208808760205973, + "grad_norm": 0.9351130127906799, + "learning_rate": 8.634727013315513e-06, + "loss": 1.0426, + "step": 253710 + }, + { + "epoch": 1.620944763170336, + "grad_norm": 1.1238881349563599, + "learning_rate": 8.631908525386146e-06, + "loss": 1.0592, + "step": 253720 + }, + { + "epoch": 1.6210086503200745, + "grad_norm": 0.7413579821586609, + "learning_rate": 8.629090454069233e-06, + "loss": 0.9098, + "step": 253730 + }, + { + "epoch": 1.6210725374698134, + "grad_norm": 1.8136285543441772, + "learning_rate": 8.626272799393188e-06, + "loss": 0.807, + "step": 253740 + }, + { + "epoch": 1.6211364246195519, + "grad_norm": 1.440377950668335, + "learning_rate": 8.623455561386351e-06, + "loss": 0.8329, + "step": 253750 + }, + { + "epoch": 1.6212003117692908, + "grad_norm": 1.3558531999588013, + "learning_rate": 8.620638740077125e-06, + "loss": 1.0271, + "step": 253760 + }, + { + "epoch": 1.6212641989190293, + "grad_norm": 1.0401147603988647, + "learning_rate": 8.617822335493858e-06, + "loss": 0.7169, + "step": 253770 + }, + { + "epoch": 1.6213280860687682, + "grad_norm": 1.2938861846923828, + "learning_rate": 8.615006347664917e-06, + "loss": 1.0175, + "step": 253780 + }, + { + "epoch": 1.6213919732185067, + "grad_norm": 1.347000241279602, + "learning_rate": 8.612190776618678e-06, + "loss": 1.2466, + "step": 253790 + }, + { + "epoch": 1.6214558603682456, + "grad_norm": 1.1044390201568604, + "learning_rate": 8.60937562238347e-06, + 
"loss": 0.7113, + "step": 253800 + }, + { + "epoch": 1.6215197475179841, + "grad_norm": 0.8815792202949524, + "learning_rate": 8.606560884987674e-06, + "loss": 0.9855, + "step": 253810 + }, + { + "epoch": 1.621583634667723, + "grad_norm": 0.9015599489212036, + "learning_rate": 8.603746564459603e-06, + "loss": 0.8028, + "step": 253820 + }, + { + "epoch": 1.6216475218174615, + "grad_norm": 0.8410263061523438, + "learning_rate": 8.600932660827631e-06, + "loss": 0.6916, + "step": 253830 + }, + { + "epoch": 1.6217114089672005, + "grad_norm": 0.9172081351280212, + "learning_rate": 8.598119174120072e-06, + "loss": 0.8827, + "step": 253840 + }, + { + "epoch": 1.621775296116939, + "grad_norm": 0.9500551819801331, + "learning_rate": 8.595306104365281e-06, + "loss": 0.8207, + "step": 253850 + }, + { + "epoch": 1.6218391832666779, + "grad_norm": 1.037736177444458, + "learning_rate": 8.592493451591566e-06, + "loss": 0.6628, + "step": 253860 + }, + { + "epoch": 1.6219030704164163, + "grad_norm": 0.6005664467811584, + "learning_rate": 8.589681215827278e-06, + "loss": 1.1371, + "step": 253870 + }, + { + "epoch": 1.6219669575661553, + "grad_norm": 2.2810683250427246, + "learning_rate": 8.58686939710071e-06, + "loss": 0.8308, + "step": 253880 + }, + { + "epoch": 1.6220308447158938, + "grad_norm": 0.8344201445579529, + "learning_rate": 8.58405799544021e-06, + "loss": 0.818, + "step": 253890 + }, + { + "epoch": 1.6220947318656327, + "grad_norm": 1.553041696548462, + "learning_rate": 8.58124701087406e-06, + "loss": 0.7736, + "step": 253900 + }, + { + "epoch": 1.6221586190153712, + "grad_norm": 0.7343198657035828, + "learning_rate": 8.578436443430599e-06, + "loss": 0.8195, + "step": 253910 + }, + { + "epoch": 1.62222250616511, + "grad_norm": 0.99444979429245, + "learning_rate": 8.575626293138105e-06, + "loss": 0.7379, + "step": 253920 + }, + { + "epoch": 1.6222863933148486, + "grad_norm": 1.0453051328659058, + "learning_rate": 8.572816560024904e-06, + "loss": 1.2449, + "step": 253930 + 
}, + { + "epoch": 1.6223502804645875, + "grad_norm": 1.0699635744094849, + "learning_rate": 8.570007244119271e-06, + "loss": 0.7713, + "step": 253940 + }, + { + "epoch": 1.622414167614326, + "grad_norm": 0.913071870803833, + "learning_rate": 8.567198345449517e-06, + "loss": 0.7638, + "step": 253950 + }, + { + "epoch": 1.6224780547640647, + "grad_norm": 0.2786030173301697, + "learning_rate": 8.564389864043909e-06, + "loss": 0.684, + "step": 253960 + }, + { + "epoch": 1.6225419419138034, + "grad_norm": 0.9149852395057678, + "learning_rate": 8.561581799930752e-06, + "loss": 0.7624, + "step": 253970 + }, + { + "epoch": 1.6226058290635421, + "grad_norm": 0.7636930346488953, + "learning_rate": 8.558774153138304e-06, + "loss": 0.7989, + "step": 253980 + }, + { + "epoch": 1.6226697162132808, + "grad_norm": 1.0884915590286255, + "learning_rate": 8.555966923694848e-06, + "loss": 0.9634, + "step": 253990 + }, + { + "epoch": 1.6227336033630195, + "grad_norm": 0.7749305963516235, + "learning_rate": 8.553160111628677e-06, + "loss": 0.7077, + "step": 254000 + }, + { + "epoch": 1.6227974905127582, + "grad_norm": 0.9463643431663513, + "learning_rate": 8.55035371696803e-06, + "loss": 0.8166, + "step": 254010 + }, + { + "epoch": 1.622861377662497, + "grad_norm": 1.131610631942749, + "learning_rate": 8.547547739741186e-06, + "loss": 1.0242, + "step": 254020 + }, + { + "epoch": 1.6229252648122356, + "grad_norm": 0.9619300961494446, + "learning_rate": 8.54474217997639e-06, + "loss": 0.9237, + "step": 254030 + }, + { + "epoch": 1.6229891519619744, + "grad_norm": 1.0643653869628906, + "learning_rate": 8.541937037701914e-06, + "loss": 0.7191, + "step": 254040 + }, + { + "epoch": 1.623053039111713, + "grad_norm": 0.9629873633384705, + "learning_rate": 8.539132312945985e-06, + "loss": 0.6617, + "step": 254050 + }, + { + "epoch": 1.6231169262614518, + "grad_norm": 0.9147536754608154, + "learning_rate": 8.536328005736876e-06, + "loss": 0.7178, + "step": 254060 + }, + { + "epoch": 
1.6231808134111905, + "grad_norm": 1.4298889636993408, + "learning_rate": 8.533804486274533e-06, + "loss": 1.0717, + "step": 254070 + }, + { + "epoch": 1.6232447005609292, + "grad_norm": 1.483441948890686, + "learning_rate": 8.531000972482162e-06, + "loss": 0.6687, + "step": 254080 + }, + { + "epoch": 1.6233085877106679, + "grad_norm": 0.825210690498352, + "learning_rate": 8.528197876318472e-06, + "loss": 1.025, + "step": 254090 + }, + { + "epoch": 1.6233724748604066, + "grad_norm": 1.3113638162612915, + "learning_rate": 8.525395197811703e-06, + "loss": 0.8856, + "step": 254100 + }, + { + "epoch": 1.6234363620101453, + "grad_norm": 1.4372659921646118, + "learning_rate": 8.522592936990103e-06, + "loss": 0.8735, + "step": 254110 + }, + { + "epoch": 1.623500249159884, + "grad_norm": 0.9195582270622253, + "learning_rate": 8.519791093881862e-06, + "loss": 0.7764, + "step": 254120 + }, + { + "epoch": 1.6235641363096227, + "grad_norm": 0.6057882308959961, + "learning_rate": 8.516989668515224e-06, + "loss": 0.7851, + "step": 254130 + }, + { + "epoch": 1.6236280234593614, + "grad_norm": 0.7561864256858826, + "learning_rate": 8.514188660918377e-06, + "loss": 0.8902, + "step": 254140 + }, + { + "epoch": 1.6236919106091001, + "grad_norm": 1.9797289371490479, + "learning_rate": 8.511388071119548e-06, + "loss": 0.9164, + "step": 254150 + }, + { + "epoch": 1.6237557977588388, + "grad_norm": 0.8142422437667847, + "learning_rate": 8.50858789914692e-06, + "loss": 0.7603, + "step": 254160 + }, + { + "epoch": 1.6238196849085775, + "grad_norm": 1.075189232826233, + "learning_rate": 8.505788145028725e-06, + "loss": 0.8073, + "step": 254170 + }, + { + "epoch": 1.6238835720583162, + "grad_norm": 0.8327805399894714, + "learning_rate": 8.502988808793127e-06, + "loss": 1.0442, + "step": 254180 + }, + { + "epoch": 1.623947459208055, + "grad_norm": 0.9066655039787292, + "learning_rate": 8.500189890468341e-06, + "loss": 1.0807, + "step": 254190 + }, + { + "epoch": 1.6240113463577934, + 
"grad_norm": 0.7944930791854858, + "learning_rate": 8.497391390082538e-06, + "loss": 1.1007, + "step": 254200 + }, + { + "epoch": 1.6240752335075324, + "grad_norm": 1.1014735698699951, + "learning_rate": 8.494593307663917e-06, + "loss": 0.9846, + "step": 254210 + }, + { + "epoch": 1.6241391206572708, + "grad_norm": 1.0590271949768066, + "learning_rate": 8.491795643240635e-06, + "loss": 1.0084, + "step": 254220 + }, + { + "epoch": 1.6242030078070098, + "grad_norm": 1.487004041671753, + "learning_rate": 8.488998396840896e-06, + "loss": 0.8395, + "step": 254230 + }, + { + "epoch": 1.6242668949567483, + "grad_norm": 0.5390102863311768, + "learning_rate": 8.48620156849284e-06, + "loss": 0.8143, + "step": 254240 + }, + { + "epoch": 1.6243307821064872, + "grad_norm": 1.1597110033035278, + "learning_rate": 8.483405158224666e-06, + "loss": 0.8888, + "step": 254250 + }, + { + "epoch": 1.6243946692562257, + "grad_norm": 1.2700620889663696, + "learning_rate": 8.480609166064502e-06, + "loss": 0.7846, + "step": 254260 + }, + { + "epoch": 1.6244585564059646, + "grad_norm": 1.114956021308899, + "learning_rate": 8.47781359204054e-06, + "loss": 0.8657, + "step": 254270 + }, + { + "epoch": 1.624522443555703, + "grad_norm": 1.4616273641586304, + "learning_rate": 8.475018436180914e-06, + "loss": 0.8768, + "step": 254280 + }, + { + "epoch": 1.624586330705442, + "grad_norm": 1.03916597366333, + "learning_rate": 8.472223698513765e-06, + "loss": 1.1322, + "step": 254290 + }, + { + "epoch": 1.6246502178551805, + "grad_norm": 0.8728145360946655, + "learning_rate": 8.469429379067263e-06, + "loss": 1.0256, + "step": 254300 + }, + { + "epoch": 1.6247141050049194, + "grad_norm": 0.6270018815994263, + "learning_rate": 8.466635477869523e-06, + "loss": 0.9038, + "step": 254310 + }, + { + "epoch": 1.624777992154658, + "grad_norm": 1.0088562965393066, + "learning_rate": 8.463841994948707e-06, + "loss": 1.1522, + "step": 254320 + }, + { + "epoch": 1.6248418793043968, + "grad_norm": 1.4455088376998901, 
+ "learning_rate": 8.461048930332927e-06, + "loss": 0.7958, + "step": 254330 + }, + { + "epoch": 1.6249057664541353, + "grad_norm": 1.4557523727416992, + "learning_rate": 8.458256284050325e-06, + "loss": 0.7086, + "step": 254340 + }, + { + "epoch": 1.6249696536038742, + "grad_norm": 1.1543890237808228, + "learning_rate": 8.455464056129015e-06, + "loss": 0.7853, + "step": 254350 + }, + { + "epoch": 1.6250335407536127, + "grad_norm": 0.8756847977638245, + "learning_rate": 8.452672246597132e-06, + "loss": 0.9691, + "step": 254360 + }, + { + "epoch": 1.6250974279033517, + "grad_norm": 0.8379682898521423, + "learning_rate": 8.449880855482772e-06, + "loss": 0.9012, + "step": 254370 + }, + { + "epoch": 1.6251613150530901, + "grad_norm": 0.8428141474723816, + "learning_rate": 8.447089882814074e-06, + "loss": 0.7197, + "step": 254380 + }, + { + "epoch": 1.625225202202829, + "grad_norm": 1.1772140264511108, + "learning_rate": 8.444299328619116e-06, + "loss": 1.1891, + "step": 254390 + }, + { + "epoch": 1.6252890893525676, + "grad_norm": 1.0148288011550903, + "learning_rate": 8.441509192926023e-06, + "loss": 1.0552, + "step": 254400 + }, + { + "epoch": 1.6253529765023065, + "grad_norm": 1.1200981140136719, + "learning_rate": 8.438719475762873e-06, + "loss": 0.9241, + "step": 254410 + }, + { + "epoch": 1.625416863652045, + "grad_norm": 1.1166496276855469, + "learning_rate": 8.435930177157775e-06, + "loss": 0.6754, + "step": 254420 + }, + { + "epoch": 1.625480750801784, + "grad_norm": 1.1475327014923096, + "learning_rate": 8.43314129713883e-06, + "loss": 0.8859, + "step": 254430 + }, + { + "epoch": 1.6255446379515224, + "grad_norm": 1.413521647453308, + "learning_rate": 8.4303528357341e-06, + "loss": 0.942, + "step": 254440 + }, + { + "epoch": 1.625608525101261, + "grad_norm": 1.0335079431533813, + "learning_rate": 8.427564792971698e-06, + "loss": 1.0567, + "step": 254450 + }, + { + "epoch": 1.6256724122509998, + "grad_norm": 0.9358431696891785, + "learning_rate": 
8.424777168879667e-06, + "loss": 0.9045, + "step": 254460 + }, + { + "epoch": 1.6257362994007385, + "grad_norm": 1.4438246488571167, + "learning_rate": 8.42198996348611e-06, + "loss": 0.8445, + "step": 254470 + }, + { + "epoch": 1.6258001865504772, + "grad_norm": 0.9652209877967834, + "learning_rate": 8.419203176819068e-06, + "loss": 0.868, + "step": 254480 + }, + { + "epoch": 1.625864073700216, + "grad_norm": 0.8575952053070068, + "learning_rate": 8.41641680890664e-06, + "loss": 0.8968, + "step": 254490 + }, + { + "epoch": 1.6259279608499546, + "grad_norm": 1.0472744703292847, + "learning_rate": 8.413630859776855e-06, + "loss": 0.6678, + "step": 254500 + }, + { + "epoch": 1.6259918479996933, + "grad_norm": 0.7651621103286743, + "learning_rate": 8.4108453294578e-06, + "loss": 0.8726, + "step": 254510 + }, + { + "epoch": 1.626055735149432, + "grad_norm": 0.8643214106559753, + "learning_rate": 8.408060217977499e-06, + "loss": 0.8884, + "step": 254520 + }, + { + "epoch": 1.6261196222991707, + "grad_norm": 0.9992806911468506, + "learning_rate": 8.40527552536402e-06, + "loss": 0.8474, + "step": 254530 + }, + { + "epoch": 1.6261835094489094, + "grad_norm": 0.8906753659248352, + "learning_rate": 8.402491251645394e-06, + "loss": 0.8043, + "step": 254540 + }, + { + "epoch": 1.6262473965986481, + "grad_norm": 1.187233567237854, + "learning_rate": 8.399707396849682e-06, + "loss": 1.0656, + "step": 254550 + }, + { + "epoch": 1.6263112837483868, + "grad_norm": 1.2150403261184692, + "learning_rate": 8.396923961004888e-06, + "loss": 0.76, + "step": 254560 + }, + { + "epoch": 1.6263751708981256, + "grad_norm": 1.0672495365142822, + "learning_rate": 8.394140944139079e-06, + "loss": 0.9527, + "step": 254570 + }, + { + "epoch": 1.6264390580478643, + "grad_norm": 0.9496098160743713, + "learning_rate": 8.391358346280253e-06, + "loss": 0.924, + "step": 254580 + }, + { + "epoch": 1.626502945197603, + "grad_norm": 0.8727953433990479, + "learning_rate": 8.388576167456453e-06, + "loss": 
1.0533, + "step": 254590 + }, + { + "epoch": 1.6265668323473417, + "grad_norm": 1.1882212162017822, + "learning_rate": 8.385794407695679e-06, + "loss": 0.8036, + "step": 254600 + }, + { + "epoch": 1.6266307194970804, + "grad_norm": 0.7046560645103455, + "learning_rate": 8.38301306702597e-06, + "loss": 0.6862, + "step": 254610 + }, + { + "epoch": 1.626694606646819, + "grad_norm": 0.7783632874488831, + "learning_rate": 8.380232145475314e-06, + "loss": 0.923, + "step": 254620 + }, + { + "epoch": 1.6267584937965578, + "grad_norm": 1.1140450239181519, + "learning_rate": 8.377451643071722e-06, + "loss": 0.8692, + "step": 254630 + }, + { + "epoch": 1.6268223809462965, + "grad_norm": 0.5317877531051636, + "learning_rate": 8.374671559843211e-06, + "loss": 0.8846, + "step": 254640 + }, + { + "epoch": 1.6268862680960352, + "grad_norm": 1.6173325777053833, + "learning_rate": 8.371891895817763e-06, + "loss": 1.2716, + "step": 254650 + }, + { + "epoch": 1.626950155245774, + "grad_norm": 0.8460519909858704, + "learning_rate": 8.369112651023386e-06, + "loss": 0.8159, + "step": 254660 + }, + { + "epoch": 1.6270140423955126, + "grad_norm": 0.9344705939292908, + "learning_rate": 8.366333825488048e-06, + "loss": 0.6481, + "step": 254670 + }, + { + "epoch": 1.6270779295452513, + "grad_norm": 1.0719621181488037, + "learning_rate": 8.363555419239754e-06, + "loss": 0.9804, + "step": 254680 + }, + { + "epoch": 1.6271418166949898, + "grad_norm": 0.5989803671836853, + "learning_rate": 8.360777432306472e-06, + "loss": 0.7973, + "step": 254690 + }, + { + "epoch": 1.6272057038447287, + "grad_norm": 0.5963758826255798, + "learning_rate": 8.357999864716192e-06, + "loss": 0.9673, + "step": 254700 + }, + { + "epoch": 1.6272695909944672, + "grad_norm": 1.047027349472046, + "learning_rate": 8.355222716496874e-06, + "loss": 0.8392, + "step": 254710 + }, + { + "epoch": 1.6273334781442061, + "grad_norm": 1.8303179740905762, + "learning_rate": 8.352445987676493e-06, + "loss": 1.0897, + "step": 254720 + 
}, + { + "epoch": 1.6273973652939446, + "grad_norm": 1.0254693031311035, + "learning_rate": 8.349669678283006e-06, + "loss": 1.1549, + "step": 254730 + }, + { + "epoch": 1.6274612524436836, + "grad_norm": 0.9034429788589478, + "learning_rate": 8.34689378834439e-06, + "loss": 1.0297, + "step": 254740 + }, + { + "epoch": 1.627525139593422, + "grad_norm": 0.9026135206222534, + "learning_rate": 8.344118317888578e-06, + "loss": 0.8806, + "step": 254750 + }, + { + "epoch": 1.627589026743161, + "grad_norm": 0.9114247560501099, + "learning_rate": 8.341343266943541e-06, + "loss": 1.0349, + "step": 254760 + }, + { + "epoch": 1.6276529138928995, + "grad_norm": 1.8976101875305176, + "learning_rate": 8.338568635537214e-06, + "loss": 1.2793, + "step": 254770 + }, + { + "epoch": 1.6277168010426384, + "grad_norm": 0.6016636490821838, + "learning_rate": 8.335794423697535e-06, + "loss": 0.8066, + "step": 254780 + }, + { + "epoch": 1.6277806881923769, + "grad_norm": 0.5638062357902527, + "learning_rate": 8.333020631452465e-06, + "loss": 0.6787, + "step": 254790 + }, + { + "epoch": 1.6278445753421158, + "grad_norm": 0.8801025152206421, + "learning_rate": 8.33024725882991e-06, + "loss": 0.8708, + "step": 254800 + }, + { + "epoch": 1.6279084624918543, + "grad_norm": 0.8659884333610535, + "learning_rate": 8.327474305857824e-06, + "loss": 0.9343, + "step": 254810 + }, + { + "epoch": 1.6279723496415932, + "grad_norm": 1.0816649198532104, + "learning_rate": 8.324701772564114e-06, + "loss": 0.8127, + "step": 254820 + }, + { + "epoch": 1.6280362367913317, + "grad_norm": 0.8168279528617859, + "learning_rate": 8.321929658976724e-06, + "loss": 0.8289, + "step": 254830 + }, + { + "epoch": 1.6281001239410706, + "grad_norm": 1.38945734500885, + "learning_rate": 8.319157965123542e-06, + "loss": 0.6386, + "step": 254840 + }, + { + "epoch": 1.628164011090809, + "grad_norm": 2.2664241790771484, + "learning_rate": 8.316386691032518e-06, + "loss": 0.9428, + "step": 254850 + }, + { + "epoch": 
1.628227898240548, + "grad_norm": 0.9218431115150452, + "learning_rate": 8.313615836731525e-06, + "loss": 0.7923, + "step": 254860 + }, + { + "epoch": 1.6282917853902865, + "grad_norm": 1.1074163913726807, + "learning_rate": 8.310845402248496e-06, + "loss": 0.8064, + "step": 254870 + }, + { + "epoch": 1.6283556725400254, + "grad_norm": 0.8705098628997803, + "learning_rate": 8.308075387611309e-06, + "loss": 0.6641, + "step": 254880 + }, + { + "epoch": 1.628419559689764, + "grad_norm": 0.6592504382133484, + "learning_rate": 8.305305792847884e-06, + "loss": 0.8826, + "step": 254890 + }, + { + "epoch": 1.6284834468395029, + "grad_norm": 0.715520441532135, + "learning_rate": 8.30253661798609e-06, + "loss": 0.7623, + "step": 254900 + }, + { + "epoch": 1.6285473339892413, + "grad_norm": 1.0568970441818237, + "learning_rate": 8.299767863053836e-06, + "loss": 0.7528, + "step": 254910 + }, + { + "epoch": 1.6286112211389803, + "grad_norm": 0.5976938009262085, + "learning_rate": 8.296999528078985e-06, + "loss": 0.8017, + "step": 254920 + }, + { + "epoch": 1.6286751082887188, + "grad_norm": 1.312427282333374, + "learning_rate": 8.294231613089438e-06, + "loss": 1.0423, + "step": 254930 + }, + { + "epoch": 1.6287389954384575, + "grad_norm": 0.9450312256813049, + "learning_rate": 8.291464118113046e-06, + "loss": 0.7212, + "step": 254940 + }, + { + "epoch": 1.6288028825881962, + "grad_norm": 0.9719014763832092, + "learning_rate": 8.288697043177695e-06, + "loss": 0.8867, + "step": 254950 + }, + { + "epoch": 1.6288667697379349, + "grad_norm": 2.3406286239624023, + "learning_rate": 8.285930388311259e-06, + "loss": 0.8943, + "step": 254960 + }, + { + "epoch": 1.6289306568876736, + "grad_norm": 1.1877886056900024, + "learning_rate": 8.283164153541583e-06, + "loss": 0.8398, + "step": 254970 + }, + { + "epoch": 1.6289945440374123, + "grad_norm": 1.5753027200698853, + "learning_rate": 8.280398338896545e-06, + "loss": 0.7947, + "step": 254980 + }, + { + "epoch": 1.629058431187151, + 
"grad_norm": 1.7172194719314575, + "learning_rate": 8.27763294440398e-06, + "loss": 0.81, + "step": 254990 + }, + { + "epoch": 1.6291223183368897, + "grad_norm": 1.424993872642517, + "learning_rate": 8.274867970091755e-06, + "loss": 1.0839, + "step": 255000 + }, + { + "epoch": 1.6291862054866284, + "grad_norm": 1.0607999563217163, + "learning_rate": 8.272103415987692e-06, + "loss": 0.8039, + "step": 255010 + }, + { + "epoch": 1.629250092636367, + "grad_norm": 1.1941611766815186, + "learning_rate": 8.269339282119665e-06, + "loss": 0.9033, + "step": 255020 + }, + { + "epoch": 1.6293139797861058, + "grad_norm": 0.8026540875434875, + "learning_rate": 8.266575568515478e-06, + "loss": 0.7958, + "step": 255030 + }, + { + "epoch": 1.6293778669358445, + "grad_norm": 1.5615814924240112, + "learning_rate": 8.263812275202992e-06, + "loss": 0.9509, + "step": 255040 + }, + { + "epoch": 1.6294417540855832, + "grad_norm": 1.3261250257492065, + "learning_rate": 8.261049402210014e-06, + "loss": 0.8111, + "step": 255050 + }, + { + "epoch": 1.629505641235322, + "grad_norm": 2.028870105743408, + "learning_rate": 8.258286949564387e-06, + "loss": 1.0351, + "step": 255060 + }, + { + "epoch": 1.6295695283850606, + "grad_norm": 0.7149475812911987, + "learning_rate": 8.255524917293912e-06, + "loss": 0.8323, + "step": 255070 + }, + { + "epoch": 1.6296334155347993, + "grad_norm": 0.812483549118042, + "learning_rate": 8.252763305426425e-06, + "loss": 0.7807, + "step": 255080 + }, + { + "epoch": 1.629697302684538, + "grad_norm": 3.0065112113952637, + "learning_rate": 8.250002113989712e-06, + "loss": 0.9642, + "step": 255090 + }, + { + "epoch": 1.6297611898342768, + "grad_norm": 1.015610933303833, + "learning_rate": 8.24724134301162e-06, + "loss": 0.8268, + "step": 255100 + }, + { + "epoch": 1.6298250769840155, + "grad_norm": 0.6669619083404541, + "learning_rate": 8.24448099251991e-06, + "loss": 0.8502, + "step": 255110 + }, + { + "epoch": 1.6298889641337542, + "grad_norm": 1.0304489135742188, + 
"learning_rate": 8.241721062542413e-06, + "loss": 0.7631, + "step": 255120 + }, + { + "epoch": 1.6299528512834929, + "grad_norm": 0.6376091837882996, + "learning_rate": 8.238961553106894e-06, + "loss": 0.793, + "step": 255130 + }, + { + "epoch": 1.6300167384332316, + "grad_norm": 1.7186837196350098, + "learning_rate": 8.236202464241177e-06, + "loss": 0.9544, + "step": 255140 + }, + { + "epoch": 1.6300806255829703, + "grad_norm": 0.6518387794494629, + "learning_rate": 8.233443795973023e-06, + "loss": 0.7596, + "step": 255150 + }, + { + "epoch": 1.630144512732709, + "grad_norm": 0.9873846769332886, + "learning_rate": 8.230685548330219e-06, + "loss": 1.0806, + "step": 255160 + }, + { + "epoch": 1.6302083998824477, + "grad_norm": 0.8769903779029846, + "learning_rate": 8.227927721340561e-06, + "loss": 0.8805, + "step": 255170 + }, + { + "epoch": 1.6302722870321862, + "grad_norm": 1.1980665922164917, + "learning_rate": 8.225170315031794e-06, + "loss": 0.7595, + "step": 255180 + }, + { + "epoch": 1.6303361741819251, + "grad_norm": 0.5990476608276367, + "learning_rate": 8.222413329431721e-06, + "loss": 0.8834, + "step": 255190 + }, + { + "epoch": 1.6304000613316636, + "grad_norm": 0.7708885669708252, + "learning_rate": 8.219656764568067e-06, + "loss": 0.6584, + "step": 255200 + }, + { + "epoch": 1.6304639484814025, + "grad_norm": 1.4800968170166016, + "learning_rate": 8.216900620468636e-06, + "loss": 0.7687, + "step": 255210 + }, + { + "epoch": 1.630527835631141, + "grad_norm": 1.000148057937622, + "learning_rate": 8.214144897161147e-06, + "loss": 0.8093, + "step": 255220 + }, + { + "epoch": 1.63059172278088, + "grad_norm": 1.8174539804458618, + "learning_rate": 8.21138959467338e-06, + "loss": 0.7309, + "step": 255230 + }, + { + "epoch": 1.6306556099306184, + "grad_norm": 0.9420324563980103, + "learning_rate": 8.20863471303306e-06, + "loss": 0.8949, + "step": 255240 + }, + { + "epoch": 1.6307194970803573, + "grad_norm": 1.1732988357543945, + "learning_rate": 
8.205880252267966e-06, + "loss": 0.7893, + "step": 255250 + }, + { + "epoch": 1.6307833842300958, + "grad_norm": 1.243893027305603, + "learning_rate": 8.2031262124058e-06, + "loss": 0.8272, + "step": 255260 + }, + { + "epoch": 1.6308472713798348, + "grad_norm": 0.9962188005447388, + "learning_rate": 8.200372593474304e-06, + "loss": 0.8322, + "step": 255270 + }, + { + "epoch": 1.6309111585295732, + "grad_norm": 1.157586693763733, + "learning_rate": 8.197619395501228e-06, + "loss": 0.9957, + "step": 255280 + }, + { + "epoch": 1.6309750456793122, + "grad_norm": 0.7285836338996887, + "learning_rate": 8.19486661851428e-06, + "loss": 0.7998, + "step": 255290 + }, + { + "epoch": 1.6310389328290507, + "grad_norm": 1.6524395942687988, + "learning_rate": 8.192114262541207e-06, + "loss": 0.8012, + "step": 255300 + }, + { + "epoch": 1.6311028199787896, + "grad_norm": 1.0645508766174316, + "learning_rate": 8.189362327609695e-06, + "loss": 0.8827, + "step": 255310 + }, + { + "epoch": 1.631166707128528, + "grad_norm": 0.747715950012207, + "learning_rate": 8.186610813747491e-06, + "loss": 0.819, + "step": 255320 + }, + { + "epoch": 1.631230594278267, + "grad_norm": 0.8673967123031616, + "learning_rate": 8.183859720982274e-06, + "loss": 0.9304, + "step": 255330 + }, + { + "epoch": 1.6312944814280055, + "grad_norm": 1.3902219533920288, + "learning_rate": 8.181109049341783e-06, + "loss": 0.9032, + "step": 255340 + }, + { + "epoch": 1.6313583685777444, + "grad_norm": 1.0869730710983276, + "learning_rate": 8.178358798853686e-06, + "loss": 1.0941, + "step": 255350 + }, + { + "epoch": 1.631422255727483, + "grad_norm": 0.6030356287956238, + "learning_rate": 8.175608969545711e-06, + "loss": 0.9028, + "step": 255360 + }, + { + "epoch": 1.6314861428772218, + "grad_norm": 0.7987911105155945, + "learning_rate": 8.172859561445524e-06, + "loss": 0.8724, + "step": 255370 + }, + { + "epoch": 1.6315500300269603, + "grad_norm": 1.0909885168075562, + "learning_rate": 8.170110574580841e-06, + "loss": 
0.9267, + "step": 255380 + }, + { + "epoch": 1.6316139171766992, + "grad_norm": 1.3036298751831055, + "learning_rate": 8.167362008979319e-06, + "loss": 0.7328, + "step": 255390 + }, + { + "epoch": 1.6316778043264377, + "grad_norm": 0.6408089995384216, + "learning_rate": 8.164613864668663e-06, + "loss": 0.6808, + "step": 255400 + }, + { + "epoch": 1.6317416914761766, + "grad_norm": 1.4903781414031982, + "learning_rate": 8.161866141676527e-06, + "loss": 1.0693, + "step": 255410 + }, + { + "epoch": 1.6318055786259151, + "grad_norm": 0.9320021867752075, + "learning_rate": 8.159118840030606e-06, + "loss": 0.8584, + "step": 255420 + }, + { + "epoch": 1.6318694657756538, + "grad_norm": 0.4931786358356476, + "learning_rate": 8.156371959758546e-06, + "loss": 0.8167, + "step": 255430 + }, + { + "epoch": 1.6319333529253925, + "grad_norm": 0.9139940738677979, + "learning_rate": 8.153625500888028e-06, + "loss": 1.234, + "step": 255440 + }, + { + "epoch": 1.6319972400751313, + "grad_norm": 0.8138971924781799, + "learning_rate": 8.150879463446692e-06, + "loss": 0.8459, + "step": 255450 + }, + { + "epoch": 1.63206112722487, + "grad_norm": 1.0304619073867798, + "learning_rate": 8.14813384746222e-06, + "loss": 0.7986, + "step": 255460 + }, + { + "epoch": 1.6321250143746087, + "grad_norm": 1.0934109687805176, + "learning_rate": 8.145388652962233e-06, + "loss": 1.1236, + "step": 255470 + }, + { + "epoch": 1.6321889015243474, + "grad_norm": 0.9255013465881348, + "learning_rate": 8.142643879974394e-06, + "loss": 0.9883, + "step": 255480 + }, + { + "epoch": 1.632252788674086, + "grad_norm": 0.954413890838623, + "learning_rate": 8.139899528526352e-06, + "loss": 0.9778, + "step": 255490 + }, + { + "epoch": 1.6323166758238248, + "grad_norm": 1.3113750219345093, + "learning_rate": 8.137155598645724e-06, + "loss": 0.8375, + "step": 255500 + }, + { + "epoch": 1.6323805629735635, + "grad_norm": 1.5685514211654663, + "learning_rate": 8.134412090360166e-06, + "loss": 0.8376, + "step": 255510 + }, 
+ { + "epoch": 1.6324444501233022, + "grad_norm": 0.8365868330001831, + "learning_rate": 8.13166900369729e-06, + "loss": 0.6379, + "step": 255520 + }, + { + "epoch": 1.632508337273041, + "grad_norm": 1.8244489431381226, + "learning_rate": 8.128926338684734e-06, + "loss": 1.0256, + "step": 255530 + }, + { + "epoch": 1.6325722244227796, + "grad_norm": 1.310700535774231, + "learning_rate": 8.126184095350109e-06, + "loss": 1.0009, + "step": 255540 + }, + { + "epoch": 1.6326361115725183, + "grad_norm": 0.7904416918754578, + "learning_rate": 8.123442273721044e-06, + "loss": 0.9701, + "step": 255550 + }, + { + "epoch": 1.632699998722257, + "grad_norm": 0.6500728130340576, + "learning_rate": 8.120700873825133e-06, + "loss": 0.7332, + "step": 255560 + }, + { + "epoch": 1.6327638858719957, + "grad_norm": 0.8715521693229675, + "learning_rate": 8.117959895690003e-06, + "loss": 0.8498, + "step": 255570 + }, + { + "epoch": 1.6328277730217344, + "grad_norm": 0.9831506609916687, + "learning_rate": 8.115219339343244e-06, + "loss": 0.9128, + "step": 255580 + }, + { + "epoch": 1.6328916601714731, + "grad_norm": 0.7139176726341248, + "learning_rate": 8.112479204812468e-06, + "loss": 0.9402, + "step": 255590 + }, + { + "epoch": 1.6329555473212118, + "grad_norm": 0.9015781879425049, + "learning_rate": 8.109739492125256e-06, + "loss": 1.0385, + "step": 255600 + }, + { + "epoch": 1.6330194344709505, + "grad_norm": 0.904238224029541, + "learning_rate": 8.107000201309217e-06, + "loss": 0.6729, + "step": 255610 + }, + { + "epoch": 1.6330833216206893, + "grad_norm": 0.9031368494033813, + "learning_rate": 8.104261332391922e-06, + "loss": 1.0791, + "step": 255620 + }, + { + "epoch": 1.633147208770428, + "grad_norm": 1.1262279748916626, + "learning_rate": 8.101522885400964e-06, + "loss": 1.0155, + "step": 255630 + }, + { + "epoch": 1.6332110959201667, + "grad_norm": 1.1558489799499512, + "learning_rate": 8.098784860363912e-06, + "loss": 0.6596, + "step": 255640 + }, + { + "epoch": 
1.6332749830699054, + "grad_norm": 1.087398886680603, + "learning_rate": 8.096047257308354e-06, + "loss": 0.7462, + "step": 255650 + }, + { + "epoch": 1.633338870219644, + "grad_norm": 1.2112863063812256, + "learning_rate": 8.093310076261846e-06, + "loss": 1.0399, + "step": 255660 + }, + { + "epoch": 1.6334027573693826, + "grad_norm": 2.303938150405884, + "learning_rate": 8.090573317251965e-06, + "loss": 0.9191, + "step": 255670 + }, + { + "epoch": 1.6334666445191215, + "grad_norm": 0.8003772497177124, + "learning_rate": 8.087836980306262e-06, + "loss": 0.6613, + "step": 255680 + }, + { + "epoch": 1.63353053166886, + "grad_norm": 1.2863084077835083, + "learning_rate": 8.085101065452293e-06, + "loss": 0.9107, + "step": 255690 + }, + { + "epoch": 1.633594418818599, + "grad_norm": 1.1990734338760376, + "learning_rate": 8.082365572717638e-06, + "loss": 0.6576, + "step": 255700 + }, + { + "epoch": 1.6336583059683374, + "grad_norm": 1.2158446311950684, + "learning_rate": 8.07963050212981e-06, + "loss": 0.6629, + "step": 255710 + }, + { + "epoch": 1.6337221931180763, + "grad_norm": 1.5338866710662842, + "learning_rate": 8.076895853716381e-06, + "loss": 0.9001, + "step": 255720 + }, + { + "epoch": 1.6337860802678148, + "grad_norm": 1.4597996473312378, + "learning_rate": 8.074161627504879e-06, + "loss": 0.9187, + "step": 255730 + }, + { + "epoch": 1.6338499674175537, + "grad_norm": 0.7120456099510193, + "learning_rate": 8.071427823522837e-06, + "loss": 0.7856, + "step": 255740 + }, + { + "epoch": 1.6339138545672922, + "grad_norm": 0.7377516627311707, + "learning_rate": 8.06869444179778e-06, + "loss": 0.9781, + "step": 255750 + }, + { + "epoch": 1.6339777417170311, + "grad_norm": 1.3690109252929688, + "learning_rate": 8.065961482357264e-06, + "loss": 0.8921, + "step": 255760 + }, + { + "epoch": 1.6340416288667696, + "grad_norm": 0.9299448728561401, + "learning_rate": 8.063228945228773e-06, + "loss": 0.8308, + "step": 255770 + }, + { + "epoch": 1.6341055160165086, + 
"grad_norm": 0.7954294085502625, + "learning_rate": 8.060496830439867e-06, + "loss": 0.8905, + "step": 255780 + }, + { + "epoch": 1.634169403166247, + "grad_norm": 1.089705228805542, + "learning_rate": 8.057765138018025e-06, + "loss": 0.8922, + "step": 255790 + }, + { + "epoch": 1.634233290315986, + "grad_norm": 1.4746631383895874, + "learning_rate": 8.055033867990774e-06, + "loss": 0.8673, + "step": 255800 + }, + { + "epoch": 1.6342971774657244, + "grad_norm": 0.9421790242195129, + "learning_rate": 8.052303020385632e-06, + "loss": 0.7981, + "step": 255810 + }, + { + "epoch": 1.6343610646154634, + "grad_norm": 0.873547375202179, + "learning_rate": 8.049572595230071e-06, + "loss": 0.5999, + "step": 255820 + }, + { + "epoch": 1.6344249517652019, + "grad_norm": 0.8159425258636475, + "learning_rate": 8.046842592551623e-06, + "loss": 0.7421, + "step": 255830 + }, + { + "epoch": 1.6344888389149408, + "grad_norm": 0.6584420800209045, + "learning_rate": 8.044113012377752e-06, + "loss": 0.7277, + "step": 255840 + }, + { + "epoch": 1.6345527260646793, + "grad_norm": 1.2918460369110107, + "learning_rate": 8.041383854735972e-06, + "loss": 0.876, + "step": 255850 + }, + { + "epoch": 1.6346166132144182, + "grad_norm": 1.1654072999954224, + "learning_rate": 8.038655119653748e-06, + "loss": 1.0023, + "step": 255860 + }, + { + "epoch": 1.6346805003641567, + "grad_norm": 0.7087494134902954, + "learning_rate": 8.035926807158573e-06, + "loss": 0.9122, + "step": 255870 + }, + { + "epoch": 1.6347443875138956, + "grad_norm": 0.6799043416976929, + "learning_rate": 8.033198917277912e-06, + "loss": 0.7897, + "step": 255880 + }, + { + "epoch": 1.634808274663634, + "grad_norm": 1.0854226350784302, + "learning_rate": 8.030471450039257e-06, + "loss": 0.6717, + "step": 255890 + }, + { + "epoch": 1.6348721618133728, + "grad_norm": 0.6955246329307556, + "learning_rate": 8.02774440547005e-06, + "loss": 0.9762, + "step": 255900 + }, + { + "epoch": 1.6349360489631115, + "grad_norm": 
0.7711451053619385, + "learning_rate": 8.025017783597777e-06, + "loss": 0.7392, + "step": 255910 + }, + { + "epoch": 1.6349999361128502, + "grad_norm": 0.8822253942489624, + "learning_rate": 8.02229158444988e-06, + "loss": 0.8686, + "step": 255920 + }, + { + "epoch": 1.635063823262589, + "grad_norm": 0.7996523976325989, + "learning_rate": 8.019565808053836e-06, + "loss": 0.8066, + "step": 255930 + }, + { + "epoch": 1.6351277104123276, + "grad_norm": 1.005338191986084, + "learning_rate": 8.016840454437063e-06, + "loss": 1.337, + "step": 255940 + }, + { + "epoch": 1.6351915975620663, + "grad_norm": 0.9166159629821777, + "learning_rate": 8.014115523627046e-06, + "loss": 1.0206, + "step": 255950 + }, + { + "epoch": 1.635255484711805, + "grad_norm": 0.864742636680603, + "learning_rate": 8.011391015651198e-06, + "loss": 0.8222, + "step": 255960 + }, + { + "epoch": 1.6353193718615437, + "grad_norm": 1.0380345582962036, + "learning_rate": 8.008666930536972e-06, + "loss": 0.8661, + "step": 255970 + }, + { + "epoch": 1.6353832590112825, + "grad_norm": 0.8901452422142029, + "learning_rate": 8.005943268311794e-06, + "loss": 1.0178, + "step": 255980 + }, + { + "epoch": 1.6354471461610212, + "grad_norm": 1.2338999509811401, + "learning_rate": 8.003220029003106e-06, + "loss": 0.7705, + "step": 255990 + }, + { + "epoch": 1.6355110333107599, + "grad_norm": 0.8710522651672363, + "learning_rate": 8.000497212638313e-06, + "loss": 0.9058, + "step": 256000 + }, + { + "epoch": 1.6355749204604986, + "grad_norm": 1.0031273365020752, + "learning_rate": 7.997774819244846e-06, + "loss": 0.9033, + "step": 256010 + }, + { + "epoch": 1.6356388076102373, + "grad_norm": 1.399172067642212, + "learning_rate": 7.995052848850137e-06, + "loss": 0.8117, + "step": 256020 + }, + { + "epoch": 1.635702694759976, + "grad_norm": 0.9795224070549011, + "learning_rate": 7.992331301481575e-06, + "loss": 0.9511, + "step": 256030 + }, + { + "epoch": 1.6357665819097147, + "grad_norm": 0.6004918813705444, + 
"learning_rate": 7.98961017716659e-06, + "loss": 0.6345, + "step": 256040 + }, + { + "epoch": 1.6358304690594534, + "grad_norm": 0.7056857943534851, + "learning_rate": 7.986889475932558e-06, + "loss": 0.9637, + "step": 256050 + }, + { + "epoch": 1.635894356209192, + "grad_norm": 0.8062849640846252, + "learning_rate": 7.984169197806912e-06, + "loss": 1.091, + "step": 256060 + }, + { + "epoch": 1.6359582433589308, + "grad_norm": 0.780800998210907, + "learning_rate": 7.981449342817021e-06, + "loss": 0.9206, + "step": 256070 + }, + { + "epoch": 1.6360221305086695, + "grad_norm": 1.4013501405715942, + "learning_rate": 7.979001835129851e-06, + "loss": 0.9148, + "step": 256080 + }, + { + "epoch": 1.6360860176584082, + "grad_norm": 1.1285284757614136, + "learning_rate": 7.976282784173366e-06, + "loss": 0.6931, + "step": 256090 + }, + { + "epoch": 1.636149904808147, + "grad_norm": 0.704509437084198, + "learning_rate": 7.97356415643209e-06, + "loss": 0.8667, + "step": 256100 + }, + { + "epoch": 1.6362137919578856, + "grad_norm": 1.3260267972946167, + "learning_rate": 7.970845951933365e-06, + "loss": 0.9538, + "step": 256110 + }, + { + "epoch": 1.6362776791076243, + "grad_norm": 0.758438766002655, + "learning_rate": 7.96812817070458e-06, + "loss": 0.86, + "step": 256120 + }, + { + "epoch": 1.636341566257363, + "grad_norm": 0.8738319873809814, + "learning_rate": 7.965410812773122e-06, + "loss": 0.7856, + "step": 256130 + }, + { + "epoch": 1.6364054534071018, + "grad_norm": 0.633561372756958, + "learning_rate": 7.962693878166328e-06, + "loss": 0.8174, + "step": 256140 + }, + { + "epoch": 1.6364693405568405, + "grad_norm": 0.9667708277702332, + "learning_rate": 7.959977366911586e-06, + "loss": 0.812, + "step": 256150 + }, + { + "epoch": 1.636533227706579, + "grad_norm": 0.9830121994018555, + "learning_rate": 7.957261279036227e-06, + "loss": 1.0059, + "step": 256160 + }, + { + "epoch": 1.6365971148563179, + "grad_norm": 0.9154394268989563, + "learning_rate": 
7.954545614567633e-06, + "loss": 1.0548, + "step": 256170 + }, + { + "epoch": 1.6366610020060564, + "grad_norm": 0.8336869478225708, + "learning_rate": 7.951830373533132e-06, + "loss": 0.7661, + "step": 256180 + }, + { + "epoch": 1.6367248891557953, + "grad_norm": 0.7483879327774048, + "learning_rate": 7.949115555960084e-06, + "loss": 0.9512, + "step": 256190 + }, + { + "epoch": 1.6367887763055338, + "grad_norm": 1.3023123741149902, + "learning_rate": 7.946401161875811e-06, + "loss": 1.2298, + "step": 256200 + }, + { + "epoch": 1.6368526634552727, + "grad_norm": 0.709023654460907, + "learning_rate": 7.943687191307669e-06, + "loss": 1.3336, + "step": 256210 + }, + { + "epoch": 1.6369165506050112, + "grad_norm": 2.153777837753296, + "learning_rate": 7.940973644282967e-06, + "loss": 0.85, + "step": 256220 + }, + { + "epoch": 1.63698043775475, + "grad_norm": 0.7116622924804688, + "learning_rate": 7.938260520829065e-06, + "loss": 0.834, + "step": 256230 + }, + { + "epoch": 1.6370443249044886, + "grad_norm": 2.8475935459136963, + "learning_rate": 7.935547820973254e-06, + "loss": 0.9491, + "step": 256240 + }, + { + "epoch": 1.6371082120542275, + "grad_norm": 0.6584477424621582, + "learning_rate": 7.932835544742877e-06, + "loss": 0.9856, + "step": 256250 + }, + { + "epoch": 1.637172099203966, + "grad_norm": 1.0373663902282715, + "learning_rate": 7.930123692165231e-06, + "loss": 0.7778, + "step": 256260 + }, + { + "epoch": 1.637235986353705, + "grad_norm": 0.5286368727684021, + "learning_rate": 7.927412263267641e-06, + "loss": 0.8536, + "step": 256270 + }, + { + "epoch": 1.6372998735034434, + "grad_norm": 1.3653309345245361, + "learning_rate": 7.9247012580774e-06, + "loss": 0.7506, + "step": 256280 + }, + { + "epoch": 1.6373637606531823, + "grad_norm": 0.7942734956741333, + "learning_rate": 7.921990676621832e-06, + "loss": 1.1046, + "step": 256290 + }, + { + "epoch": 1.6374276478029208, + "grad_norm": 0.7212092876434326, + "learning_rate": 7.919280518928207e-06, + "loss": 
0.8769, + "step": 256300 + }, + { + "epoch": 1.6374915349526598, + "grad_norm": 0.9169106483459473, + "learning_rate": 7.916570785023841e-06, + "loss": 0.6814, + "step": 256310 + }, + { + "epoch": 1.6375554221023982, + "grad_norm": 0.7443297505378723, + "learning_rate": 7.913861474936002e-06, + "loss": 0.827, + "step": 256320 + }, + { + "epoch": 1.6376193092521372, + "grad_norm": 1.0695321559906006, + "learning_rate": 7.911152588691995e-06, + "loss": 0.8919, + "step": 256330 + }, + { + "epoch": 1.6376831964018757, + "grad_norm": 1.0362414121627808, + "learning_rate": 7.908444126319098e-06, + "loss": 0.8526, + "step": 256340 + }, + { + "epoch": 1.6377470835516146, + "grad_norm": 0.7101940512657166, + "learning_rate": 7.905736087844574e-06, + "loss": 1.0854, + "step": 256350 + }, + { + "epoch": 1.637810970701353, + "grad_norm": 1.1458754539489746, + "learning_rate": 7.903028473295714e-06, + "loss": 1.0137, + "step": 256360 + }, + { + "epoch": 1.637874857851092, + "grad_norm": 0.9790529012680054, + "learning_rate": 7.900321282699779e-06, + "loss": 0.6549, + "step": 256370 + }, + { + "epoch": 1.6379387450008305, + "grad_norm": 0.736199140548706, + "learning_rate": 7.89761451608403e-06, + "loss": 0.8473, + "step": 256380 + }, + { + "epoch": 1.6380026321505692, + "grad_norm": 0.7104125618934631, + "learning_rate": 7.894908173475712e-06, + "loss": 0.9423, + "step": 256390 + }, + { + "epoch": 1.6380665193003079, + "grad_norm": 1.0521243810653687, + "learning_rate": 7.892202254902108e-06, + "loss": 1.0501, + "step": 256400 + }, + { + "epoch": 1.6381304064500466, + "grad_norm": 1.024814486503601, + "learning_rate": 7.889496760390447e-06, + "loss": 0.9319, + "step": 256410 + }, + { + "epoch": 1.6381942935997853, + "grad_norm": 0.6387311816215515, + "learning_rate": 7.886791689967993e-06, + "loss": 0.8884, + "step": 256420 + }, + { + "epoch": 1.638258180749524, + "grad_norm": 0.835491955280304, + "learning_rate": 7.884087043661969e-06, + "loss": 0.9697, + "step": 256430 + }, + 
{ + "epoch": 1.6383220678992627, + "grad_norm": 0.8031365871429443, + "learning_rate": 7.881382821499621e-06, + "loss": 0.811, + "step": 256440 + }, + { + "epoch": 1.6383859550490014, + "grad_norm": 0.6929246187210083, + "learning_rate": 7.8786790235082e-06, + "loss": 0.9724, + "step": 256450 + }, + { + "epoch": 1.6384498421987401, + "grad_norm": 0.7216389775276184, + "learning_rate": 7.875975649714912e-06, + "loss": 0.9006, + "step": 256460 + }, + { + "epoch": 1.6385137293484788, + "grad_norm": 1.058194875717163, + "learning_rate": 7.873272700147e-06, + "loss": 0.7515, + "step": 256470 + }, + { + "epoch": 1.6385776164982175, + "grad_norm": 0.9971916079521179, + "learning_rate": 7.870570174831664e-06, + "loss": 0.9915, + "step": 256480 + }, + { + "epoch": 1.6386415036479562, + "grad_norm": 0.5907204747200012, + "learning_rate": 7.867868073796154e-06, + "loss": 0.8927, + "step": 256490 + }, + { + "epoch": 1.638705390797695, + "grad_norm": 0.6885647177696228, + "learning_rate": 7.865166397067642e-06, + "loss": 0.722, + "step": 256500 + }, + { + "epoch": 1.6387692779474337, + "grad_norm": 0.8568047285079956, + "learning_rate": 7.86246514467337e-06, + "loss": 0.8133, + "step": 256510 + }, + { + "epoch": 1.6388331650971724, + "grad_norm": 1.0300815105438232, + "learning_rate": 7.859764316640516e-06, + "loss": 1.0747, + "step": 256520 + }, + { + "epoch": 1.638897052246911, + "grad_norm": 0.8222336769104004, + "learning_rate": 7.857063912996304e-06, + "loss": 0.7396, + "step": 256530 + }, + { + "epoch": 1.6389609393966498, + "grad_norm": 1.097979187965393, + "learning_rate": 7.854363933767906e-06, + "loss": 0.8752, + "step": 256540 + }, + { + "epoch": 1.6390248265463885, + "grad_norm": 1.208530068397522, + "learning_rate": 7.851664378982532e-06, + "loss": 1.0752, + "step": 256550 + }, + { + "epoch": 1.6390887136961272, + "grad_norm": 0.7187122702598572, + "learning_rate": 7.84896524866735e-06, + "loss": 0.7706, + "step": 256560 + }, + { + "epoch": 1.639152600845866, + 
"grad_norm": 0.8158242106437683, + "learning_rate": 7.846266542849574e-06, + "loss": 0.994, + "step": 256570 + }, + { + "epoch": 1.6392164879956046, + "grad_norm": 1.7641271352767944, + "learning_rate": 7.843568261556339e-06, + "loss": 0.8622, + "step": 256580 + }, + { + "epoch": 1.6392803751453433, + "grad_norm": 1.0310348272323608, + "learning_rate": 7.84087040481486e-06, + "loss": 0.8935, + "step": 256590 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 1.2308247089385986, + "learning_rate": 7.83817297265228e-06, + "loss": 0.9992, + "step": 256600 + }, + { + "epoch": 1.6394081494448207, + "grad_norm": 2.2752044200897217, + "learning_rate": 7.835475965095779e-06, + "loss": 0.9188, + "step": 256610 + }, + { + "epoch": 1.6394720365945594, + "grad_norm": 0.6786547899246216, + "learning_rate": 7.832779382172506e-06, + "loss": 0.9933, + "step": 256620 + }, + { + "epoch": 1.639535923744298, + "grad_norm": 1.5297602415084839, + "learning_rate": 7.830083223909629e-06, + "loss": 0.9466, + "step": 256630 + }, + { + "epoch": 1.6395998108940368, + "grad_norm": 3.73483943939209, + "learning_rate": 7.827387490334293e-06, + "loss": 1.0265, + "step": 256640 + }, + { + "epoch": 1.6396636980437753, + "grad_norm": 0.8407526016235352, + "learning_rate": 7.824692181473642e-06, + "loss": 1.0092, + "step": 256650 + }, + { + "epoch": 1.6397275851935142, + "grad_norm": 1.1145086288452148, + "learning_rate": 7.821997297354844e-06, + "loss": 1.0603, + "step": 256660 + }, + { + "epoch": 1.6397914723432527, + "grad_norm": 0.9100845456123352, + "learning_rate": 7.819302838005011e-06, + "loss": 1.0177, + "step": 256670 + }, + { + "epoch": 1.6398553594929917, + "grad_norm": 1.0881621837615967, + "learning_rate": 7.816608803451297e-06, + "loss": 0.856, + "step": 256680 + }, + { + "epoch": 1.6399192466427301, + "grad_norm": 1.6409400701522827, + "learning_rate": 7.813915193720817e-06, + "loss": 0.7758, + "step": 256690 + }, + { + "epoch": 1.639983133792469, + "grad_norm": 0.9504512548446655, 
+ "learning_rate": 7.81122200884072e-06, + "loss": 1.2167, + "step": 256700 + }, + { + "epoch": 1.6400470209422076, + "grad_norm": 1.1367648839950562, + "learning_rate": 7.808529248838103e-06, + "loss": 0.7544, + "step": 256710 + }, + { + "epoch": 1.6401109080919465, + "grad_norm": 1.4266903400421143, + "learning_rate": 7.805836913740111e-06, + "loss": 0.8999, + "step": 256720 + }, + { + "epoch": 1.640174795241685, + "grad_norm": 0.9106956720352173, + "learning_rate": 7.803145003573832e-06, + "loss": 0.8518, + "step": 256730 + }, + { + "epoch": 1.640238682391424, + "grad_norm": 0.6638666391372681, + "learning_rate": 7.800453518366397e-06, + "loss": 0.6891, + "step": 256740 + }, + { + "epoch": 1.6403025695411624, + "grad_norm": 1.59250807762146, + "learning_rate": 7.797762458144891e-06, + "loss": 0.8653, + "step": 256750 + }, + { + "epoch": 1.6403664566909013, + "grad_norm": 0.8077670931816101, + "learning_rate": 7.795071822936446e-06, + "loss": 0.8013, + "step": 256760 + }, + { + "epoch": 1.6404303438406398, + "grad_norm": 1.235260009765625, + "learning_rate": 7.792381612768123e-06, + "loss": 0.955, + "step": 256770 + }, + { + "epoch": 1.6404942309903787, + "grad_norm": 0.8226930499076843, + "learning_rate": 7.789691827667045e-06, + "loss": 0.945, + "step": 256780 + }, + { + "epoch": 1.6405581181401172, + "grad_norm": 1.6601190567016602, + "learning_rate": 7.78700246766027e-06, + "loss": 0.6711, + "step": 256790 + }, + { + "epoch": 1.6406220052898561, + "grad_norm": 1.1035475730895996, + "learning_rate": 7.784313532774918e-06, + "loss": 0.9102, + "step": 256800 + }, + { + "epoch": 1.6406858924395946, + "grad_norm": 1.2023992538452148, + "learning_rate": 7.781625023038036e-06, + "loss": 0.7553, + "step": 256810 + }, + { + "epoch": 1.6407497795893335, + "grad_norm": 0.8770223259925842, + "learning_rate": 7.778936938476728e-06, + "loss": 0.8356, + "step": 256820 + }, + { + "epoch": 1.640813666739072, + "grad_norm": 1.4096797704696655, + "learning_rate": 
7.776249279118042e-06, + "loss": 0.908, + "step": 256830 + }, + { + "epoch": 1.640877553888811, + "grad_norm": 2.869856595993042, + "learning_rate": 7.773562044989063e-06, + "loss": 1.0478, + "step": 256840 + }, + { + "epoch": 1.6409414410385494, + "grad_norm": 1.1328037977218628, + "learning_rate": 7.770875236116843e-06, + "loss": 0.7729, + "step": 256850 + }, + { + "epoch": 1.6410053281882884, + "grad_norm": 0.991386890411377, + "learning_rate": 7.768188852528434e-06, + "loss": 0.8065, + "step": 256860 + }, + { + "epoch": 1.6410692153380269, + "grad_norm": 0.6845223307609558, + "learning_rate": 7.765502894250908e-06, + "loss": 0.9615, + "step": 256870 + }, + { + "epoch": 1.6411331024877656, + "grad_norm": 1.3230713605880737, + "learning_rate": 7.762817361311298e-06, + "loss": 0.9169, + "step": 256880 + }, + { + "epoch": 1.6411969896375043, + "grad_norm": 0.6882030963897705, + "learning_rate": 7.760132253736668e-06, + "loss": 0.6916, + "step": 256890 + }, + { + "epoch": 1.641260876787243, + "grad_norm": 1.0747628211975098, + "learning_rate": 7.757447571554033e-06, + "loss": 0.867, + "step": 256900 + }, + { + "epoch": 1.6413247639369817, + "grad_norm": 1.1972609758377075, + "learning_rate": 7.754763314790463e-06, + "loss": 1.1478, + "step": 256910 + }, + { + "epoch": 1.6413886510867204, + "grad_norm": 0.8115911483764648, + "learning_rate": 7.752079483472958e-06, + "loss": 0.8082, + "step": 256920 + }, + { + "epoch": 1.641452538236459, + "grad_norm": 0.9861070513725281, + "learning_rate": 7.749396077628579e-06, + "loss": 0.7608, + "step": 256930 + }, + { + "epoch": 1.6415164253861978, + "grad_norm": 1.068723440170288, + "learning_rate": 7.74671309728432e-06, + "loss": 0.8506, + "step": 256940 + }, + { + "epoch": 1.6415803125359365, + "grad_norm": 7.141610145568848, + "learning_rate": 7.744030542467222e-06, + "loss": 0.7723, + "step": 256950 + }, + { + "epoch": 1.6416441996856752, + "grad_norm": 1.6269599199295044, + "learning_rate": 7.741348413204286e-06, + "loss": 
0.8175, + "step": 256960 + }, + { + "epoch": 1.641708086835414, + "grad_norm": 1.107198715209961, + "learning_rate": 7.73866670952253e-06, + "loss": 0.968, + "step": 256970 + }, + { + "epoch": 1.6417719739851526, + "grad_norm": 1.0261207818984985, + "learning_rate": 7.735985431448972e-06, + "loss": 0.9184, + "step": 256980 + }, + { + "epoch": 1.6418358611348913, + "grad_norm": 0.8513238430023193, + "learning_rate": 7.733304579010591e-06, + "loss": 0.7687, + "step": 256990 + }, + { + "epoch": 1.64189974828463, + "grad_norm": 1.1189079284667969, + "learning_rate": 7.730624152234412e-06, + "loss": 0.8642, + "step": 257000 + }, + { + "epoch": 1.6419636354343687, + "grad_norm": 1.1294742822647095, + "learning_rate": 7.727944151147403e-06, + "loss": 0.9542, + "step": 257010 + }, + { + "epoch": 1.6420275225841074, + "grad_norm": 1.1503225564956665, + "learning_rate": 7.72526457577658e-06, + "loss": 0.9426, + "step": 257020 + }, + { + "epoch": 1.6420914097338462, + "grad_norm": 2.0296530723571777, + "learning_rate": 7.722585426148903e-06, + "loss": 0.8177, + "step": 257030 + }, + { + "epoch": 1.6421552968835849, + "grad_norm": 0.9002004265785217, + "learning_rate": 7.71990670229138e-06, + "loss": 0.8094, + "step": 257040 + }, + { + "epoch": 1.6422191840333236, + "grad_norm": 1.0578171014785767, + "learning_rate": 7.717228404230964e-06, + "loss": 0.7348, + "step": 257050 + }, + { + "epoch": 1.6422830711830623, + "grad_norm": 1.2272627353668213, + "learning_rate": 7.71455053199464e-06, + "loss": 0.8838, + "step": 257060 + }, + { + "epoch": 1.642346958332801, + "grad_norm": 0.7876251935958862, + "learning_rate": 7.71214081108384e-06, + "loss": 0.9401, + "step": 257070 + }, + { + "epoch": 1.6424108454825397, + "grad_norm": 1.100836157798767, + "learning_rate": 7.709463747987582e-06, + "loss": 0.7681, + "step": 257080 + }, + { + "epoch": 1.6424747326322784, + "grad_norm": 1.7621210813522339, + "learning_rate": 7.706787110793607e-06, + "loss": 1.1007, + "step": 257090 + }, + { + 
"epoch": 1.642538619782017, + "grad_norm": 0.7386590242385864, + "learning_rate": 7.704110899528872e-06, + "loss": 0.8787, + "step": 257100 + }, + { + "epoch": 1.6426025069317558, + "grad_norm": 0.6226412057876587, + "learning_rate": 7.701435114220346e-06, + "loss": 0.8882, + "step": 257110 + }, + { + "epoch": 1.6426663940814943, + "grad_norm": 1.0297595262527466, + "learning_rate": 7.698759754894946e-06, + "loss": 0.9987, + "step": 257120 + }, + { + "epoch": 1.6427302812312332, + "grad_norm": 0.6230430603027344, + "learning_rate": 7.696084821579641e-06, + "loss": 0.8812, + "step": 257130 + }, + { + "epoch": 1.6427941683809717, + "grad_norm": 0.5937321186065674, + "learning_rate": 7.693410314301352e-06, + "loss": 1.0652, + "step": 257140 + }, + { + "epoch": 1.6428580555307106, + "grad_norm": 0.8128228783607483, + "learning_rate": 7.690736233087032e-06, + "loss": 0.9578, + "step": 257150 + }, + { + "epoch": 1.642921942680449, + "grad_norm": 0.7910133004188538, + "learning_rate": 7.68806257796359e-06, + "loss": 0.9108, + "step": 257160 + }, + { + "epoch": 1.642985829830188, + "grad_norm": 1.0798237323760986, + "learning_rate": 7.685389348957978e-06, + "loss": 0.8957, + "step": 257170 + }, + { + "epoch": 1.6430497169799265, + "grad_norm": 1.373158574104309, + "learning_rate": 7.682716546097085e-06, + "loss": 0.7722, + "step": 257180 + }, + { + "epoch": 1.6431136041296655, + "grad_norm": 0.7850404977798462, + "learning_rate": 7.680044169407858e-06, + "loss": 0.7864, + "step": 257190 + }, + { + "epoch": 1.643177491279404, + "grad_norm": 1.0319653749465942, + "learning_rate": 7.677372218917189e-06, + "loss": 0.7347, + "step": 257200 + }, + { + "epoch": 1.6432413784291429, + "grad_norm": 1.0518642663955688, + "learning_rate": 7.674700694652004e-06, + "loss": 0.9759, + "step": 257210 + }, + { + "epoch": 1.6433052655788813, + "grad_norm": 1.550307035446167, + "learning_rate": 7.672029596639191e-06, + "loss": 0.9258, + "step": 257220 + }, + { + "epoch": 1.6433691527286203, + 
"grad_norm": 1.0691750049591064, + "learning_rate": 7.669358924905673e-06, + "loss": 0.8578, + "step": 257230 + }, + { + "epoch": 1.6434330398783588, + "grad_norm": 1.184849739074707, + "learning_rate": 7.666688679478317e-06, + "loss": 0.8363, + "step": 257240 + }, + { + "epoch": 1.6434969270280977, + "grad_norm": 0.7209656834602356, + "learning_rate": 7.664018860384042e-06, + "loss": 0.7998, + "step": 257250 + }, + { + "epoch": 1.6435608141778362, + "grad_norm": 0.9371286630630493, + "learning_rate": 7.661349467649714e-06, + "loss": 0.7666, + "step": 257260 + }, + { + "epoch": 1.643624701327575, + "grad_norm": 0.8568039536476135, + "learning_rate": 7.658680501302235e-06, + "loss": 1.1363, + "step": 257270 + }, + { + "epoch": 1.6436885884773136, + "grad_norm": 0.6117787957191467, + "learning_rate": 7.656011961368459e-06, + "loss": 0.837, + "step": 257280 + }, + { + "epoch": 1.6437524756270525, + "grad_norm": 0.845210611820221, + "learning_rate": 7.653343847875277e-06, + "loss": 1.0475, + "step": 257290 + }, + { + "epoch": 1.643816362776791, + "grad_norm": 0.9108684659004211, + "learning_rate": 7.650676160849568e-06, + "loss": 0.9906, + "step": 257300 + }, + { + "epoch": 1.64388024992653, + "grad_norm": 0.7374324202537537, + "learning_rate": 7.648008900318176e-06, + "loss": 0.9109, + "step": 257310 + }, + { + "epoch": 1.6439441370762684, + "grad_norm": 1.5589121580123901, + "learning_rate": 7.645342066307986e-06, + "loss": 0.8385, + "step": 257320 + }, + { + "epoch": 1.6440080242260073, + "grad_norm": 1.1741704940795898, + "learning_rate": 7.642675658845839e-06, + "loss": 0.7737, + "step": 257330 + }, + { + "epoch": 1.6440719113757458, + "grad_norm": 1.1971560716629028, + "learning_rate": 7.640009677958592e-06, + "loss": 1.0186, + "step": 257340 + }, + { + "epoch": 1.6441357985254847, + "grad_norm": 0.8373765349388123, + "learning_rate": 7.63734412367309e-06, + "loss": 0.9073, + "step": 257350 + }, + { + "epoch": 1.6441996856752232, + "grad_norm": 
0.7890423536300659, + "learning_rate": 7.634678996016193e-06, + "loss": 0.718, + "step": 257360 + }, + { + "epoch": 1.644263572824962, + "grad_norm": 1.0626322031021118, + "learning_rate": 7.632014295014717e-06, + "loss": 1.2763, + "step": 257370 + }, + { + "epoch": 1.6443274599747006, + "grad_norm": 0.793804407119751, + "learning_rate": 7.62935002069552e-06, + "loss": 0.8366, + "step": 257380 + }, + { + "epoch": 1.6443913471244394, + "grad_norm": 1.9477283954620361, + "learning_rate": 7.626686173085412e-06, + "loss": 0.8928, + "step": 257390 + }, + { + "epoch": 1.644455234274178, + "grad_norm": 0.6621059775352478, + "learning_rate": 7.624022752211246e-06, + "loss": 0.9907, + "step": 257400 + }, + { + "epoch": 1.6445191214239168, + "grad_norm": 1.115936517715454, + "learning_rate": 7.621359758099822e-06, + "loss": 0.7577, + "step": 257410 + }, + { + "epoch": 1.6445830085736555, + "grad_norm": 1.2295455932617188, + "learning_rate": 7.618697190777979e-06, + "loss": 0.7741, + "step": 257420 + }, + { + "epoch": 1.6446468957233942, + "grad_norm": 0.9931457042694092, + "learning_rate": 7.616035050272508e-06, + "loss": 0.8288, + "step": 257430 + }, + { + "epoch": 1.6447107828731329, + "grad_norm": 0.5664223432540894, + "learning_rate": 7.613373336610241e-06, + "loss": 0.6466, + "step": 257440 + }, + { + "epoch": 1.6447746700228716, + "grad_norm": 0.9523608684539795, + "learning_rate": 7.610712049817964e-06, + "loss": 0.7106, + "step": 257450 + }, + { + "epoch": 1.6448385571726103, + "grad_norm": 0.7553440928459167, + "learning_rate": 7.608051189922499e-06, + "loss": 0.8361, + "step": 257460 + }, + { + "epoch": 1.644902444322349, + "grad_norm": 1.1481449604034424, + "learning_rate": 7.60539075695062e-06, + "loss": 1.0126, + "step": 257470 + }, + { + "epoch": 1.6449663314720877, + "grad_norm": 0.599646270275116, + "learning_rate": 7.6027307509291486e-06, + "loss": 0.9073, + "step": 257480 + }, + { + "epoch": 1.6450302186218264, + "grad_norm": 1.2230756282806396, + 
"learning_rate": 7.60007117188486e-06, + "loss": 0.9898, + "step": 257490 + }, + { + "epoch": 1.6450941057715651, + "grad_norm": 0.9773632287979126, + "learning_rate": 7.597412019844513e-06, + "loss": 1.0886, + "step": 257500 + }, + { + "epoch": 1.6451579929213038, + "grad_norm": 1.1554090976715088, + "learning_rate": 7.594753294834933e-06, + "loss": 0.8617, + "step": 257510 + }, + { + "epoch": 1.6452218800710425, + "grad_norm": 0.9558780193328857, + "learning_rate": 7.592094996882854e-06, + "loss": 0.8919, + "step": 257520 + }, + { + "epoch": 1.6452857672207812, + "grad_norm": 0.7920116186141968, + "learning_rate": 7.589437126015081e-06, + "loss": 0.8181, + "step": 257530 + }, + { + "epoch": 1.64534965437052, + "grad_norm": 0.9481412172317505, + "learning_rate": 7.586779682258355e-06, + "loss": 0.9225, + "step": 257540 + }, + { + "epoch": 1.6454135415202586, + "grad_norm": 1.214705228805542, + "learning_rate": 7.5841226656394645e-06, + "loss": 0.8409, + "step": 257550 + }, + { + "epoch": 1.6454774286699974, + "grad_norm": 0.7767091989517212, + "learning_rate": 7.58146607618514e-06, + "loss": 0.9165, + "step": 257560 + }, + { + "epoch": 1.645541315819736, + "grad_norm": 0.6727650761604309, + "learning_rate": 7.578809913922158e-06, + "loss": 0.7626, + "step": 257570 + }, + { + "epoch": 1.6456052029694748, + "grad_norm": 0.9648169279098511, + "learning_rate": 7.576154178877248e-06, + "loss": 0.9397, + "step": 257580 + }, + { + "epoch": 1.6456690901192135, + "grad_norm": 1.433490514755249, + "learning_rate": 7.5734988710771836e-06, + "loss": 0.8868, + "step": 257590 + }, + { + "epoch": 1.6457329772689522, + "grad_norm": 1.0735435485839844, + "learning_rate": 7.570843990548676e-06, + "loss": 0.9415, + "step": 257600 + }, + { + "epoch": 1.6457968644186907, + "grad_norm": 0.9692497849464417, + "learning_rate": 7.568189537318487e-06, + "loss": 0.9294, + "step": 257610 + }, + { + "epoch": 1.6458607515684296, + "grad_norm": 1.3035361766815186, + "learning_rate": 
7.5655355114133245e-06, + "loss": 0.8981, + "step": 257620 + }, + { + "epoch": 1.645924638718168, + "grad_norm": 1.2619551420211792, + "learning_rate": 7.562881912859937e-06, + "loss": 0.9399, + "step": 257630 + }, + { + "epoch": 1.645988525867907, + "grad_norm": 3.850501775741577, + "learning_rate": 7.560228741685049e-06, + "loss": 0.7043, + "step": 257640 + }, + { + "epoch": 1.6460524130176455, + "grad_norm": 0.9732416272163391, + "learning_rate": 7.557575997915362e-06, + "loss": 0.7152, + "step": 257650 + }, + { + "epoch": 1.6461163001673844, + "grad_norm": 1.1814053058624268, + "learning_rate": 7.554923681577614e-06, + "loss": 0.8326, + "step": 257660 + }, + { + "epoch": 1.646180187317123, + "grad_norm": 2.915731430053711, + "learning_rate": 7.5522717926984895e-06, + "loss": 0.8365, + "step": 257670 + }, + { + "epoch": 1.6462440744668618, + "grad_norm": 0.9403936266899109, + "learning_rate": 7.54962033130473e-06, + "loss": 0.8559, + "step": 257680 + }, + { + "epoch": 1.6463079616166003, + "grad_norm": 0.9900381565093994, + "learning_rate": 7.546969297423001e-06, + "loss": 0.9193, + "step": 257690 + }, + { + "epoch": 1.6463718487663392, + "grad_norm": 1.1494392156600952, + "learning_rate": 7.544318691080032e-06, + "loss": 1.0322, + "step": 257700 + }, + { + "epoch": 1.6464357359160777, + "grad_norm": 0.9025363326072693, + "learning_rate": 7.541668512302491e-06, + "loss": 0.8881, + "step": 257710 + }, + { + "epoch": 1.6464996230658167, + "grad_norm": 1.2116295099258423, + "learning_rate": 7.539018761117095e-06, + "loss": 0.9982, + "step": 257720 + }, + { + "epoch": 1.6465635102155551, + "grad_norm": 1.2075427770614624, + "learning_rate": 7.5363694375505e-06, + "loss": 0.8318, + "step": 257730 + }, + { + "epoch": 1.646627397365294, + "grad_norm": 0.9334720969200134, + "learning_rate": 7.5337205416294076e-06, + "loss": 0.8711, + "step": 257740 + }, + { + "epoch": 1.6466912845150325, + "grad_norm": 1.1920677423477173, + "learning_rate": 7.53107207338048e-06, + 
"loss": 0.9515, + "step": 257750 + }, + { + "epoch": 1.6467551716647715, + "grad_norm": 0.9365494847297668, + "learning_rate": 7.528424032830411e-06, + "loss": 1.0222, + "step": 257760 + }, + { + "epoch": 1.64681905881451, + "grad_norm": 1.1577626466751099, + "learning_rate": 7.5257764200058424e-06, + "loss": 0.9483, + "step": 257770 + }, + { + "epoch": 1.646882945964249, + "grad_norm": 4.188989162445068, + "learning_rate": 7.5231292349334625e-06, + "loss": 0.8343, + "step": 257780 + }, + { + "epoch": 1.6469468331139874, + "grad_norm": 1.5819982290267944, + "learning_rate": 7.520482477639906e-06, + "loss": 0.901, + "step": 257790 + }, + { + "epoch": 1.6470107202637263, + "grad_norm": 0.8653442859649658, + "learning_rate": 7.517836148151852e-06, + "loss": 0.8159, + "step": 257800 + }, + { + "epoch": 1.6470746074134648, + "grad_norm": 1.4531724452972412, + "learning_rate": 7.5151902464959344e-06, + "loss": 1.0052, + "step": 257810 + }, + { + "epoch": 1.6471384945632037, + "grad_norm": 0.883104145526886, + "learning_rate": 7.512544772698799e-06, + "loss": 0.868, + "step": 257820 + }, + { + "epoch": 1.6472023817129422, + "grad_norm": 0.8851460814476013, + "learning_rate": 7.509899726787106e-06, + "loss": 0.6053, + "step": 257830 + }, + { + "epoch": 1.6472662688626811, + "grad_norm": 0.77079176902771, + "learning_rate": 7.507255108787475e-06, + "loss": 1.041, + "step": 257840 + }, + { + "epoch": 1.6473301560124196, + "grad_norm": 0.7836570739746094, + "learning_rate": 7.504610918726557e-06, + "loss": 0.9428, + "step": 257850 + }, + { + "epoch": 1.6473940431621583, + "grad_norm": 1.032909631729126, + "learning_rate": 7.501967156630957e-06, + "loss": 0.7965, + "step": 257860 + }, + { + "epoch": 1.647457930311897, + "grad_norm": 0.612842321395874, + "learning_rate": 7.499323822527327e-06, + "loss": 0.9013, + "step": 257870 + }, + { + "epoch": 1.6475218174616357, + "grad_norm": 1.1342487335205078, + "learning_rate": 7.496680916442262e-06, + "loss": 0.8627, + "step": 257880 
+ }, + { + "epoch": 1.6475857046113744, + "grad_norm": 0.634815514087677, + "learning_rate": 7.494038438402401e-06, + "loss": 0.7458, + "step": 257890 + }, + { + "epoch": 1.6476495917611131, + "grad_norm": 1.1310441493988037, + "learning_rate": 7.491396388434336e-06, + "loss": 0.8711, + "step": 257900 + }, + { + "epoch": 1.6477134789108518, + "grad_norm": 1.735602855682373, + "learning_rate": 7.488754766564698e-06, + "loss": 0.7482, + "step": 257910 + }, + { + "epoch": 1.6477773660605906, + "grad_norm": 1.2173813581466675, + "learning_rate": 7.486113572820064e-06, + "loss": 0.7555, + "step": 257920 + }, + { + "epoch": 1.6478412532103293, + "grad_norm": 0.9211714267730713, + "learning_rate": 7.48347280722706e-06, + "loss": 0.9156, + "step": 257930 + }, + { + "epoch": 1.647905140360068, + "grad_norm": 0.8173043131828308, + "learning_rate": 7.480832469812249e-06, + "loss": 0.8714, + "step": 257940 + }, + { + "epoch": 1.6479690275098067, + "grad_norm": 1.0287995338439941, + "learning_rate": 7.478192560602254e-06, + "loss": 0.9726, + "step": 257950 + }, + { + "epoch": 1.6480329146595454, + "grad_norm": 1.5631654262542725, + "learning_rate": 7.475553079623637e-06, + "loss": 1.0207, + "step": 257960 + }, + { + "epoch": 1.648096801809284, + "grad_norm": 0.9244776964187622, + "learning_rate": 7.472914026903e-06, + "loss": 0.7689, + "step": 257970 + }, + { + "epoch": 1.6481606889590228, + "grad_norm": 1.429133653640747, + "learning_rate": 7.470275402466909e-06, + "loss": 0.8569, + "step": 257980 + }, + { + "epoch": 1.6482245761087615, + "grad_norm": 0.9924103021621704, + "learning_rate": 7.467637206341927e-06, + "loss": 0.7181, + "step": 257990 + }, + { + "epoch": 1.6482884632585002, + "grad_norm": 1.2423604726791382, + "learning_rate": 7.464999438554643e-06, + "loss": 1.1156, + "step": 258000 + }, + { + "epoch": 1.648352350408239, + "grad_norm": 1.2210172414779663, + "learning_rate": 7.462362099131603e-06, + "loss": 0.7794, + "step": 258010 + }, + { + "epoch": 
1.6484162375579776, + "grad_norm": 0.8788898587226868, + "learning_rate": 7.459725188099387e-06, + "loss": 0.6866, + "step": 258020 + }, + { + "epoch": 1.6484801247077163, + "grad_norm": 1.631868839263916, + "learning_rate": 7.457088705484533e-06, + "loss": 1.0881, + "step": 258030 + }, + { + "epoch": 1.648544011857455, + "grad_norm": 0.6207049489021301, + "learning_rate": 7.4544526513136095e-06, + "loss": 0.6963, + "step": 258040 + }, + { + "epoch": 1.6486078990071937, + "grad_norm": 0.7716155648231506, + "learning_rate": 7.451817025613145e-06, + "loss": 0.7368, + "step": 258050 + }, + { + "epoch": 1.6486717861569324, + "grad_norm": 0.9051589965820312, + "learning_rate": 7.449181828409702e-06, + "loss": 0.9487, + "step": 258060 + }, + { + "epoch": 1.6487356733066711, + "grad_norm": 0.8805955648422241, + "learning_rate": 7.446547059729802e-06, + "loss": 0.759, + "step": 258070 + }, + { + "epoch": 1.6487995604564099, + "grad_norm": 0.9993048906326294, + "learning_rate": 7.443912719599993e-06, + "loss": 0.8764, + "step": 258080 + }, + { + "epoch": 1.6488634476061486, + "grad_norm": 0.953590989112854, + "learning_rate": 7.441278808046792e-06, + "loss": 0.885, + "step": 258090 + }, + { + "epoch": 1.648927334755887, + "grad_norm": 0.9039053320884705, + "learning_rate": 7.438645325096743e-06, + "loss": 1.1734, + "step": 258100 + }, + { + "epoch": 1.648991221905626, + "grad_norm": 0.8109670877456665, + "learning_rate": 7.436012270776343e-06, + "loss": 0.7155, + "step": 258110 + }, + { + "epoch": 1.6490551090553645, + "grad_norm": 1.853824496269226, + "learning_rate": 7.4333796451121374e-06, + "loss": 0.8389, + "step": 258120 + }, + { + "epoch": 1.6491189962051034, + "grad_norm": 1.172304630279541, + "learning_rate": 7.430747448130609e-06, + "loss": 0.9715, + "step": 258130 + }, + { + "epoch": 1.6491828833548419, + "grad_norm": 0.904864490032196, + "learning_rate": 7.428115679858283e-06, + "loss": 0.8458, + "step": 258140 + }, + { + "epoch": 1.6492467705045808, + 
"grad_norm": 1.2634391784667969, + "learning_rate": 7.425484340321676e-06, + "loss": 0.8386, + "step": 258150 + }, + { + "epoch": 1.6493106576543193, + "grad_norm": 1.1437947750091553, + "learning_rate": 7.422853429547255e-06, + "loss": 0.8137, + "step": 258160 + }, + { + "epoch": 1.6493745448040582, + "grad_norm": 1.268520474433899, + "learning_rate": 7.420222947561556e-06, + "loss": 1.2592, + "step": 258170 + }, + { + "epoch": 1.6494384319537967, + "grad_norm": 0.8500333428382874, + "learning_rate": 7.417592894391029e-06, + "loss": 0.9249, + "step": 258180 + }, + { + "epoch": 1.6495023191035356, + "grad_norm": 0.9268811345100403, + "learning_rate": 7.414963270062198e-06, + "loss": 0.8247, + "step": 258190 + }, + { + "epoch": 1.649566206253274, + "grad_norm": 0.7710387110710144, + "learning_rate": 7.412334074601513e-06, + "loss": 0.8659, + "step": 258200 + }, + { + "epoch": 1.649630093403013, + "grad_norm": 0.9998272061347961, + "learning_rate": 7.409705308035475e-06, + "loss": 0.9299, + "step": 258210 + }, + { + "epoch": 1.6496939805527515, + "grad_norm": 1.1036978960037231, + "learning_rate": 7.407076970390547e-06, + "loss": 0.8578, + "step": 258220 + }, + { + "epoch": 1.6497578677024904, + "grad_norm": 0.7306403517723083, + "learning_rate": 7.404449061693208e-06, + "loss": 0.738, + "step": 258230 + }, + { + "epoch": 1.649821754852229, + "grad_norm": 0.7348700761795044, + "learning_rate": 7.401821581969909e-06, + "loss": 0.8739, + "step": 258240 + }, + { + "epoch": 1.6498856420019679, + "grad_norm": 0.921229898929596, + "learning_rate": 7.399194531247128e-06, + "loss": 0.8938, + "step": 258250 + }, + { + "epoch": 1.6499495291517063, + "grad_norm": 1.5429316759109497, + "learning_rate": 7.396567909551305e-06, + "loss": 0.8227, + "step": 258260 + }, + { + "epoch": 1.6500134163014453, + "grad_norm": 0.7975358366966248, + "learning_rate": 7.3939417169089085e-06, + "loss": 0.7359, + "step": 258270 + }, + { + "epoch": 1.6500773034511838, + "grad_norm": 
0.6854422092437744, + "learning_rate": 7.3913159533463675e-06, + "loss": 0.7961, + "step": 258280 + }, + { + "epoch": 1.6501411906009227, + "grad_norm": 1.200284719467163, + "learning_rate": 7.388690618890148e-06, + "loss": 0.8829, + "step": 258290 + }, + { + "epoch": 1.6502050777506612, + "grad_norm": 0.825060248374939, + "learning_rate": 7.38606571356667e-06, + "loss": 1.0262, + "step": 258300 + }, + { + "epoch": 1.6502689649004, + "grad_norm": 0.9735358357429504, + "learning_rate": 7.383441237402383e-06, + "loss": 0.9383, + "step": 258310 + }, + { + "epoch": 1.6503328520501386, + "grad_norm": 1.1090470552444458, + "learning_rate": 7.380817190423706e-06, + "loss": 1.0839, + "step": 258320 + }, + { + "epoch": 1.6503967391998773, + "grad_norm": 1.1171561479568481, + "learning_rate": 7.378193572657077e-06, + "loss": 0.8753, + "step": 258330 + }, + { + "epoch": 1.650460626349616, + "grad_norm": 0.9593927264213562, + "learning_rate": 7.375570384128905e-06, + "loss": 0.8434, + "step": 258340 + }, + { + "epoch": 1.6505245134993547, + "grad_norm": 0.9938549995422363, + "learning_rate": 7.372947624865612e-06, + "loss": 0.761, + "step": 258350 + }, + { + "epoch": 1.6505884006490934, + "grad_norm": 0.8092223405838013, + "learning_rate": 7.370325294893621e-06, + "loss": 0.9039, + "step": 258360 + }, + { + "epoch": 1.650652287798832, + "grad_norm": 2.7092342376708984, + "learning_rate": 7.367703394239328e-06, + "loss": 0.8425, + "step": 258370 + }, + { + "epoch": 1.6507161749485708, + "grad_norm": 1.0019803047180176, + "learning_rate": 7.3650819229291565e-06, + "loss": 0.8753, + "step": 258380 + }, + { + "epoch": 1.6507800620983095, + "grad_norm": 1.1598658561706543, + "learning_rate": 7.36246088098948e-06, + "loss": 0.9008, + "step": 258390 + }, + { + "epoch": 1.6508439492480482, + "grad_norm": 1.3171591758728027, + "learning_rate": 7.359840268446722e-06, + "loss": 0.9998, + "step": 258400 + }, + { + "epoch": 1.650907836397787, + "grad_norm": 0.9848918914794922, + 
"learning_rate": 7.357220085327249e-06, + "loss": 0.7861, + "step": 258410 + }, + { + "epoch": 1.6509717235475256, + "grad_norm": 0.9984748363494873, + "learning_rate": 7.354600331657468e-06, + "loss": 0.8645, + "step": 258420 + }, + { + "epoch": 1.6510356106972643, + "grad_norm": 1.6876024007797241, + "learning_rate": 7.351981007463754e-06, + "loss": 1.0606, + "step": 258430 + }, + { + "epoch": 1.651099497847003, + "grad_norm": 0.5669913291931152, + "learning_rate": 7.34936211277249e-06, + "loss": 0.9474, + "step": 258440 + }, + { + "epoch": 1.6511633849967418, + "grad_norm": 1.1290777921676636, + "learning_rate": 7.346743647610038e-06, + "loss": 0.9748, + "step": 258450 + }, + { + "epoch": 1.6512272721464805, + "grad_norm": 0.7757238149642944, + "learning_rate": 7.344125612002794e-06, + "loss": 1.1126, + "step": 258460 + }, + { + "epoch": 1.6512911592962192, + "grad_norm": 2.1162233352661133, + "learning_rate": 7.341508005977104e-06, + "loss": 0.787, + "step": 258470 + }, + { + "epoch": 1.6513550464459579, + "grad_norm": 0.9174578785896301, + "learning_rate": 7.338890829559319e-06, + "loss": 0.6748, + "step": 258480 + }, + { + "epoch": 1.6514189335956966, + "grad_norm": 1.0966631174087524, + "learning_rate": 7.336274082775829e-06, + "loss": 1.0263, + "step": 258490 + }, + { + "epoch": 1.6514828207454353, + "grad_norm": 0.8177943229675293, + "learning_rate": 7.333657765652951e-06, + "loss": 1.035, + "step": 258500 + }, + { + "epoch": 1.651546707895174, + "grad_norm": 0.8279902338981628, + "learning_rate": 7.331041878217065e-06, + "loss": 1.0372, + "step": 258510 + }, + { + "epoch": 1.6516105950449127, + "grad_norm": 0.7911148071289062, + "learning_rate": 7.328426420494488e-06, + "loss": 1.1565, + "step": 258520 + }, + { + "epoch": 1.6516744821946514, + "grad_norm": 0.9187765717506409, + "learning_rate": 7.325811392511583e-06, + "loss": 0.7033, + "step": 258530 + }, + { + "epoch": 1.6517383693443901, + "grad_norm": 1.4142966270446777, + "learning_rate": 
7.323196794294668e-06, + "loss": 0.9938, + "step": 258540 + }, + { + "epoch": 1.6518022564941288, + "grad_norm": 2.9758827686309814, + "learning_rate": 7.320582625870092e-06, + "loss": 1.0167, + "step": 258550 + }, + { + "epoch": 1.6518661436438675, + "grad_norm": 0.7503261566162109, + "learning_rate": 7.3179688872641595e-06, + "loss": 0.7175, + "step": 258560 + }, + { + "epoch": 1.651930030793606, + "grad_norm": 0.5419626235961914, + "learning_rate": 7.315355578503219e-06, + "loss": 0.7395, + "step": 258570 + }, + { + "epoch": 1.651993917943345, + "grad_norm": 0.9171911478042603, + "learning_rate": 7.312742699613561e-06, + "loss": 0.6511, + "step": 258580 + }, + { + "epoch": 1.6520578050930834, + "grad_norm": 3.700895071029663, + "learning_rate": 7.310130250621533e-06, + "loss": 0.8161, + "step": 258590 + }, + { + "epoch": 1.6521216922428223, + "grad_norm": 0.655987560749054, + "learning_rate": 7.307518231553406e-06, + "loss": 0.7787, + "step": 258600 + }, + { + "epoch": 1.6521855793925608, + "grad_norm": 1.6462640762329102, + "learning_rate": 7.304906642435516e-06, + "loss": 1.0206, + "step": 258610 + }, + { + "epoch": 1.6522494665422998, + "grad_norm": 0.9825352430343628, + "learning_rate": 7.302295483294147e-06, + "loss": 0.9565, + "step": 258620 + }, + { + "epoch": 1.6523133536920382, + "grad_norm": 1.0422539710998535, + "learning_rate": 7.299684754155606e-06, + "loss": 1.143, + "step": 258630 + }, + { + "epoch": 1.6523772408417772, + "grad_norm": 1.152791142463684, + "learning_rate": 7.297074455046171e-06, + "loss": 1.1539, + "step": 258640 + }, + { + "epoch": 1.6524411279915157, + "grad_norm": 0.7904389500617981, + "learning_rate": 7.294464585992156e-06, + "loss": 0.8516, + "step": 258650 + }, + { + "epoch": 1.6525050151412546, + "grad_norm": 1.0914205312728882, + "learning_rate": 7.29185514701981e-06, + "loss": 0.6568, + "step": 258660 + }, + { + "epoch": 1.652568902290993, + "grad_norm": 0.9571302533149719, + "learning_rate": 7.289246138155436e-06, + 
"loss": 1.0887, + "step": 258670 + }, + { + "epoch": 1.652632789440732, + "grad_norm": 0.8392852544784546, + "learning_rate": 7.286637559425313e-06, + "loss": 0.9472, + "step": 258680 + }, + { + "epoch": 1.6526966765904705, + "grad_norm": 1.3355880975723267, + "learning_rate": 7.284029410855686e-06, + "loss": 0.7675, + "step": 258690 + }, + { + "epoch": 1.6527605637402094, + "grad_norm": 0.7684211134910583, + "learning_rate": 7.281421692472856e-06, + "loss": 0.879, + "step": 258700 + }, + { + "epoch": 1.652824450889948, + "grad_norm": 0.7331956028938293, + "learning_rate": 7.278814404303047e-06, + "loss": 0.7309, + "step": 258710 + }, + { + "epoch": 1.6528883380396868, + "grad_norm": 0.8413065671920776, + "learning_rate": 7.276207546372555e-06, + "loss": 0.8249, + "step": 258720 + }, + { + "epoch": 1.6529522251894253, + "grad_norm": 0.8433711528778076, + "learning_rate": 7.273601118707601e-06, + "loss": 1.0195, + "step": 258730 + }, + { + "epoch": 1.6530161123391642, + "grad_norm": 1.394230842590332, + "learning_rate": 7.270995121334456e-06, + "loss": 0.882, + "step": 258740 + }, + { + "epoch": 1.6530799994889027, + "grad_norm": 1.0308012962341309, + "learning_rate": 7.268389554279348e-06, + "loss": 0.7528, + "step": 258750 + }, + { + "epoch": 1.6531438866386416, + "grad_norm": 0.8747785091400146, + "learning_rate": 7.265784417568533e-06, + "loss": 1.0698, + "step": 258760 + }, + { + "epoch": 1.6532077737883801, + "grad_norm": 4.951910495758057, + "learning_rate": 7.26317971122823e-06, + "loss": 1.1276, + "step": 258770 + }, + { + "epoch": 1.653271660938119, + "grad_norm": 1.2269554138183594, + "learning_rate": 7.260575435284694e-06, + "loss": 1.0913, + "step": 258780 + }, + { + "epoch": 1.6533355480878575, + "grad_norm": 1.137904405593872, + "learning_rate": 7.257971589764123e-06, + "loss": 1.0261, + "step": 258790 + }, + { + "epoch": 1.6533994352375965, + "grad_norm": 1.1804286241531372, + "learning_rate": 7.255368174692767e-06, + "loss": 0.7504, + "step": 258800 
+ }, + { + "epoch": 1.653463322387335, + "grad_norm": 0.7149761915206909, + "learning_rate": 7.252765190096822e-06, + "loss": 0.8265, + "step": 258810 + }, + { + "epoch": 1.6535272095370737, + "grad_norm": 1.0930120944976807, + "learning_rate": 7.250162636002528e-06, + "loss": 1.0783, + "step": 258820 + }, + { + "epoch": 1.6535910966868124, + "grad_norm": 0.8766050934791565, + "learning_rate": 7.247560512436063e-06, + "loss": 0.8981, + "step": 258830 + }, + { + "epoch": 1.653654983836551, + "grad_norm": 0.5357649326324463, + "learning_rate": 7.244958819423664e-06, + "loss": 0.7771, + "step": 258840 + }, + { + "epoch": 1.6537188709862898, + "grad_norm": 0.979239284992218, + "learning_rate": 7.242357556991508e-06, + "loss": 0.8438, + "step": 258850 + }, + { + "epoch": 1.6537827581360285, + "grad_norm": 0.8525903224945068, + "learning_rate": 7.2397567251658086e-06, + "loss": 0.8439, + "step": 258860 + }, + { + "epoch": 1.6538466452857672, + "grad_norm": 0.9817124605178833, + "learning_rate": 7.237156323972744e-06, + "loss": 0.6514, + "step": 258870 + }, + { + "epoch": 1.653910532435506, + "grad_norm": 1.5313221216201782, + "learning_rate": 7.2345563534385095e-06, + "loss": 0.6729, + "step": 258880 + }, + { + "epoch": 1.6539744195852446, + "grad_norm": 0.5732386112213135, + "learning_rate": 7.231956813589302e-06, + "loss": 0.9224, + "step": 258890 + }, + { + "epoch": 1.6540383067349833, + "grad_norm": 1.3414524793624878, + "learning_rate": 7.2293577044512785e-06, + "loss": 0.8311, + "step": 258900 + }, + { + "epoch": 1.654102193884722, + "grad_norm": 0.8082197308540344, + "learning_rate": 7.226759026050633e-06, + "loss": 0.9462, + "step": 258910 + }, + { + "epoch": 1.6541660810344607, + "grad_norm": 0.9954521656036377, + "learning_rate": 7.224160778413519e-06, + "loss": 0.9759, + "step": 258920 + }, + { + "epoch": 1.6542299681841994, + "grad_norm": 1.4728593826293945, + "learning_rate": 7.221562961566125e-06, + "loss": 0.7694, + "step": 258930 + }, + { + "epoch": 
1.6542938553339381, + "grad_norm": 1.2984557151794434, + "learning_rate": 7.218965575534597e-06, + "loss": 0.9735, + "step": 258940 + }, + { + "epoch": 1.6543577424836768, + "grad_norm": 1.1309173107147217, + "learning_rate": 7.216368620345098e-06, + "loss": 0.8834, + "step": 258950 + }, + { + "epoch": 1.6544216296334155, + "grad_norm": 1.143385648727417, + "learning_rate": 7.2137720960237685e-06, + "loss": 0.8156, + "step": 258960 + }, + { + "epoch": 1.6544855167831543, + "grad_norm": 0.8312119245529175, + "learning_rate": 7.211176002596787e-06, + "loss": 0.8482, + "step": 258970 + }, + { + "epoch": 1.654549403932893, + "grad_norm": 1.2088533639907837, + "learning_rate": 7.20858034009026e-06, + "loss": 0.8226, + "step": 258980 + }, + { + "epoch": 1.6546132910826317, + "grad_norm": 0.682985246181488, + "learning_rate": 7.205985108530355e-06, + "loss": 0.8264, + "step": 258990 + }, + { + "epoch": 1.6546771782323704, + "grad_norm": 0.8336872458457947, + "learning_rate": 7.2033903079432085e-06, + "loss": 1.1374, + "step": 259000 + }, + { + "epoch": 1.654741065382109, + "grad_norm": 1.0589526891708374, + "learning_rate": 7.200795938354937e-06, + "loss": 0.9523, + "step": 259010 + }, + { + "epoch": 1.6548049525318478, + "grad_norm": 1.032314658164978, + "learning_rate": 7.198201999791687e-06, + "loss": 1.2553, + "step": 259020 + }, + { + "epoch": 1.6548688396815865, + "grad_norm": 1.1617461442947388, + "learning_rate": 7.195608492279565e-06, + "loss": 1.1661, + "step": 259030 + }, + { + "epoch": 1.6549327268313252, + "grad_norm": 0.8610371947288513, + "learning_rate": 7.193015415844706e-06, + "loss": 0.8, + "step": 259040 + }, + { + "epoch": 1.654996613981064, + "grad_norm": 0.7629457712173462, + "learning_rate": 7.1904227705131995e-06, + "loss": 0.9533, + "step": 259050 + }, + { + "epoch": 1.6550605011308024, + "grad_norm": 0.7997171878814697, + "learning_rate": 7.187830556311187e-06, + "loss": 0.844, + "step": 259060 + }, + { + "epoch": 1.6551243882805413, + 
"grad_norm": 0.6464492678642273, + "learning_rate": 7.185238773264746e-06, + "loss": 0.8245, + "step": 259070 + }, + { + "epoch": 1.6551882754302798, + "grad_norm": 1.1188616752624512, + "learning_rate": 7.182647421400002e-06, + "loss": 0.9905, + "step": 259080 + }, + { + "epoch": 1.6552521625800187, + "grad_norm": 0.8652756214141846, + "learning_rate": 7.180056500743032e-06, + "loss": 0.9505, + "step": 259090 + }, + { + "epoch": 1.6553160497297572, + "grad_norm": 0.8543888330459595, + "learning_rate": 7.177466011319945e-06, + "loss": 1.05, + "step": 259100 + }, + { + "epoch": 1.6553799368794961, + "grad_norm": 1.304616093635559, + "learning_rate": 7.174875953156812e-06, + "loss": 0.9993, + "step": 259110 + }, + { + "epoch": 1.6554438240292346, + "grad_norm": 0.7675788998603821, + "learning_rate": 7.1722863262797365e-06, + "loss": 0.9806, + "step": 259120 + }, + { + "epoch": 1.6555077111789736, + "grad_norm": 1.1626311540603638, + "learning_rate": 7.169697130714781e-06, + "loss": 0.8677, + "step": 259130 + }, + { + "epoch": 1.655571598328712, + "grad_norm": 3.422379970550537, + "learning_rate": 7.16710836648804e-06, + "loss": 0.7833, + "step": 259140 + }, + { + "epoch": 1.655635485478451, + "grad_norm": 1.048319935798645, + "learning_rate": 7.16452003362556e-06, + "loss": 0.8893, + "step": 259150 + }, + { + "epoch": 1.6556993726281894, + "grad_norm": 0.8422994613647461, + "learning_rate": 7.161932132153432e-06, + "loss": 0.9099, + "step": 259160 + }, + { + "epoch": 1.6557632597779284, + "grad_norm": 0.6454704403877258, + "learning_rate": 7.1593446620977e-06, + "loss": 0.7903, + "step": 259170 + }, + { + "epoch": 1.6558271469276669, + "grad_norm": 0.8068992495536804, + "learning_rate": 7.156757623484433e-06, + "loss": 0.8284, + "step": 259180 + }, + { + "epoch": 1.6558910340774058, + "grad_norm": 1.7042864561080933, + "learning_rate": 7.154171016339678e-06, + "loss": 1.1194, + "step": 259190 + }, + { + "epoch": 1.6559549212271443, + "grad_norm": 0.6754870414733887, 
+ "learning_rate": 7.151584840689485e-06, + "loss": 0.8066, + "step": 259200 + }, + { + "epoch": 1.6560188083768832, + "grad_norm": 0.5614364147186279, + "learning_rate": 7.1489990965599165e-06, + "loss": 0.6847, + "step": 259210 + }, + { + "epoch": 1.6560826955266217, + "grad_norm": 0.9404881000518799, + "learning_rate": 7.146413783976979e-06, + "loss": 1.162, + "step": 259220 + }, + { + "epoch": 1.6561465826763606, + "grad_norm": 0.7669598460197449, + "learning_rate": 7.1438289029667485e-06, + "loss": 0.7352, + "step": 259230 + }, + { + "epoch": 1.656210469826099, + "grad_norm": 0.7307997345924377, + "learning_rate": 7.141244453555224e-06, + "loss": 0.8912, + "step": 259240 + }, + { + "epoch": 1.656274356975838, + "grad_norm": 0.7206220626831055, + "learning_rate": 7.138660435768452e-06, + "loss": 1.2435, + "step": 259250 + }, + { + "epoch": 1.6563382441255765, + "grad_norm": 0.9848304986953735, + "learning_rate": 7.136076849632445e-06, + "loss": 0.9284, + "step": 259260 + }, + { + "epoch": 1.6564021312753154, + "grad_norm": 0.5905624628067017, + "learning_rate": 7.133493695173233e-06, + "loss": 0.8219, + "step": 259270 + }, + { + "epoch": 1.656466018425054, + "grad_norm": 0.7588204145431519, + "learning_rate": 7.130910972416816e-06, + "loss": 0.7849, + "step": 259280 + }, + { + "epoch": 1.6565299055747928, + "grad_norm": 0.9828800559043884, + "learning_rate": 7.128328681389224e-06, + "loss": 0.5819, + "step": 259290 + }, + { + "epoch": 1.6565937927245313, + "grad_norm": 1.427322268486023, + "learning_rate": 7.125746822116436e-06, + "loss": 0.7216, + "step": 259300 + }, + { + "epoch": 1.65665767987427, + "grad_norm": 0.6109748482704163, + "learning_rate": 7.123165394624481e-06, + "loss": 0.9879, + "step": 259310 + }, + { + "epoch": 1.6567215670240087, + "grad_norm": 1.3023579120635986, + "learning_rate": 7.12058439893934e-06, + "loss": 0.9592, + "step": 259320 + }, + { + "epoch": 1.6567854541737475, + "grad_norm": 1.2052748203277588, + "learning_rate": 
7.118003835087011e-06, + "loss": 1.0689, + "step": 259330 + }, + { + "epoch": 1.6568493413234862, + "grad_norm": 2.0568909645080566, + "learning_rate": 7.115423703093477e-06, + "loss": 0.8332, + "step": 259340 + }, + { + "epoch": 1.6569132284732249, + "grad_norm": 0.9474199414253235, + "learning_rate": 7.112844002984737e-06, + "loss": 0.5583, + "step": 259350 + }, + { + "epoch": 1.6569771156229636, + "grad_norm": 0.8970088958740234, + "learning_rate": 7.110264734786748e-06, + "loss": 0.8069, + "step": 259360 + }, + { + "epoch": 1.6570410027727023, + "grad_norm": 0.5539265871047974, + "learning_rate": 7.107685898525512e-06, + "loss": 0.8496, + "step": 259370 + }, + { + "epoch": 1.657104889922441, + "grad_norm": 0.9140658378601074, + "learning_rate": 7.10510749422697e-06, + "loss": 0.9984, + "step": 259380 + }, + { + "epoch": 1.6571687770721797, + "grad_norm": 2.285615921020508, + "learning_rate": 7.102529521917123e-06, + "loss": 0.8285, + "step": 259390 + }, + { + "epoch": 1.6572326642219184, + "grad_norm": 0.8295195698738098, + "learning_rate": 7.099951981621899e-06, + "loss": 0.6997, + "step": 259400 + }, + { + "epoch": 1.657296551371657, + "grad_norm": 0.6087706685066223, + "learning_rate": 7.097374873367274e-06, + "loss": 0.7951, + "step": 259410 + }, + { + "epoch": 1.6573604385213958, + "grad_norm": 1.149000644683838, + "learning_rate": 7.094798197179209e-06, + "loss": 1.0521, + "step": 259420 + }, + { + "epoch": 1.6574243256711345, + "grad_norm": 0.9276943206787109, + "learning_rate": 7.0922219530836495e-06, + "loss": 1.0712, + "step": 259430 + }, + { + "epoch": 1.6574882128208732, + "grad_norm": 0.8571450114250183, + "learning_rate": 7.089646141106532e-06, + "loss": 1.1182, + "step": 259440 + }, + { + "epoch": 1.657552099970612, + "grad_norm": 0.8675596117973328, + "learning_rate": 7.0870707612737894e-06, + "loss": 0.8654, + "step": 259450 + }, + { + "epoch": 1.6576159871203506, + "grad_norm": 0.7856946587562561, + "learning_rate": 7.08449581361138e-06, + 
"loss": 0.8388, + "step": 259460 + }, + { + "epoch": 1.6576798742700893, + "grad_norm": 1.1640080213546753, + "learning_rate": 7.081921298145217e-06, + "loss": 0.7823, + "step": 259470 + }, + { + "epoch": 1.657743761419828, + "grad_norm": 1.505099892616272, + "learning_rate": 7.079347214901244e-06, + "loss": 0.8802, + "step": 259480 + }, + { + "epoch": 1.6578076485695667, + "grad_norm": 1.1108448505401611, + "learning_rate": 7.076773563905364e-06, + "loss": 0.9317, + "step": 259490 + }, + { + "epoch": 1.6578715357193055, + "grad_norm": 0.66489177942276, + "learning_rate": 7.074200345183518e-06, + "loss": 1.0849, + "step": 259500 + }, + { + "epoch": 1.6579354228690442, + "grad_norm": 2.0368494987487793, + "learning_rate": 7.071627558761607e-06, + "loss": 1.0492, + "step": 259510 + }, + { + "epoch": 1.6579993100187829, + "grad_norm": 0.9006597399711609, + "learning_rate": 7.069055204665537e-06, + "loss": 1.1099, + "step": 259520 + }, + { + "epoch": 1.6580631971685216, + "grad_norm": 1.1404061317443848, + "learning_rate": 7.066483282921238e-06, + "loss": 1.2387, + "step": 259530 + }, + { + "epoch": 1.6581270843182603, + "grad_norm": 0.8864837884902954, + "learning_rate": 7.063911793554584e-06, + "loss": 0.9816, + "step": 259540 + }, + { + "epoch": 1.6581909714679988, + "grad_norm": 1.1338231563568115, + "learning_rate": 7.061340736591487e-06, + "loss": 0.8627, + "step": 259550 + }, + { + "epoch": 1.6582548586177377, + "grad_norm": 1.0094668865203857, + "learning_rate": 7.058770112057833e-06, + "loss": 0.8275, + "step": 259560 + }, + { + "epoch": 1.6583187457674762, + "grad_norm": 0.8408057689666748, + "learning_rate": 7.056199919979522e-06, + "loss": 1.094, + "step": 259570 + }, + { + "epoch": 1.658382632917215, + "grad_norm": 0.8108777403831482, + "learning_rate": 7.053630160382418e-06, + "loss": 0.9879, + "step": 259580 + }, + { + "epoch": 1.6584465200669536, + "grad_norm": 0.8305297493934631, + "learning_rate": 7.051060833292422e-06, + "loss": 0.7306, + "step": 
259590 + }, + { + "epoch": 1.6585104072166925, + "grad_norm": 1.6006922721862793, + "learning_rate": 7.048491938735391e-06, + "loss": 0.7892, + "step": 259600 + }, + { + "epoch": 1.658574294366431, + "grad_norm": 0.9414778351783752, + "learning_rate": 7.045923476737215e-06, + "loss": 0.8955, + "step": 259610 + }, + { + "epoch": 1.65863818151617, + "grad_norm": 0.5770961046218872, + "learning_rate": 7.043355447323735e-06, + "loss": 0.9527, + "step": 259620 + }, + { + "epoch": 1.6587020686659084, + "grad_norm": 1.4286727905273438, + "learning_rate": 7.040787850520847e-06, + "loss": 0.837, + "step": 259630 + }, + { + "epoch": 1.6587659558156473, + "grad_norm": 0.8571046590805054, + "learning_rate": 7.0382206863543745e-06, + "loss": 1.1439, + "step": 259640 + }, + { + "epoch": 1.6588298429653858, + "grad_norm": 0.9641390442848206, + "learning_rate": 7.0356539548501965e-06, + "loss": 0.7242, + "step": 259650 + }, + { + "epoch": 1.6588937301151248, + "grad_norm": 0.7776396870613098, + "learning_rate": 7.033087656034143e-06, + "loss": 0.9499, + "step": 259660 + }, + { + "epoch": 1.6589576172648632, + "grad_norm": 1.0999819040298462, + "learning_rate": 7.030521789932082e-06, + "loss": 0.9728, + "step": 259670 + }, + { + "epoch": 1.6590215044146022, + "grad_norm": 0.7827914357185364, + "learning_rate": 7.027956356569831e-06, + "loss": 0.8949, + "step": 259680 + }, + { + "epoch": 1.6590853915643407, + "grad_norm": 1.1689743995666504, + "learning_rate": 7.025391355973243e-06, + "loss": 0.8803, + "step": 259690 + }, + { + "epoch": 1.6591492787140796, + "grad_norm": 0.9321467876434326, + "learning_rate": 7.022826788168129e-06, + "loss": 1.0456, + "step": 259700 + }, + { + "epoch": 1.659213165863818, + "grad_norm": 2.064208984375, + "learning_rate": 7.020262653180343e-06, + "loss": 0.8112, + "step": 259710 + }, + { + "epoch": 1.659277053013557, + "grad_norm": 1.093505620956421, + "learning_rate": 7.017698951035684e-06, + "loss": 0.7237, + "step": 259720 + }, + { + "epoch": 
1.6593409401632955, + "grad_norm": 0.7893190383911133, + "learning_rate": 7.015135681759983e-06, + "loss": 0.7623, + "step": 259730 + }, + { + "epoch": 1.6594048273130344, + "grad_norm": 0.39939016103744507, + "learning_rate": 7.012572845379062e-06, + "loss": 1.0163, + "step": 259740 + }, + { + "epoch": 1.6594687144627729, + "grad_norm": 0.834723711013794, + "learning_rate": 7.010010441918713e-06, + "loss": 0.8614, + "step": 259750 + }, + { + "epoch": 1.6595326016125118, + "grad_norm": 1.1181209087371826, + "learning_rate": 7.007448471404759e-06, + "loss": 0.8843, + "step": 259760 + }, + { + "epoch": 1.6595964887622503, + "grad_norm": 2.0455477237701416, + "learning_rate": 7.004886933862986e-06, + "loss": 1.0729, + "step": 259770 + }, + { + "epoch": 1.6596603759119892, + "grad_norm": 1.9647599458694458, + "learning_rate": 7.002325829319206e-06, + "loss": 0.8421, + "step": 259780 + }, + { + "epoch": 1.6597242630617277, + "grad_norm": 0.7878594994544983, + "learning_rate": 6.9997651577991875e-06, + "loss": 0.7208, + "step": 259790 + }, + { + "epoch": 1.6597881502114664, + "grad_norm": 1.202373743057251, + "learning_rate": 6.997204919328753e-06, + "loss": 0.7398, + "step": 259800 + }, + { + "epoch": 1.6598520373612051, + "grad_norm": 1.152909517288208, + "learning_rate": 6.994645113933651e-06, + "loss": 0.7517, + "step": 259810 + }, + { + "epoch": 1.6599159245109438, + "grad_norm": 0.6811971664428711, + "learning_rate": 6.992085741639692e-06, + "loss": 0.8073, + "step": 259820 + }, + { + "epoch": 1.6599798116606825, + "grad_norm": 1.4438114166259766, + "learning_rate": 6.989526802472623e-06, + "loss": 1.0148, + "step": 259830 + }, + { + "epoch": 1.6600436988104212, + "grad_norm": 0.7701313495635986, + "learning_rate": 6.986968296458241e-06, + "loss": 0.886, + "step": 259840 + }, + { + "epoch": 1.66010758596016, + "grad_norm": 0.7953954935073853, + "learning_rate": 6.98441022362229e-06, + "loss": 0.7475, + "step": 259850 + }, + { + "epoch": 1.6601714731098987, + 
"grad_norm": 0.9012263417243958, + "learning_rate": 6.9818525839905555e-06, + "loss": 0.749, + "step": 259860 + }, + { + "epoch": 1.6602353602596374, + "grad_norm": 0.8417377471923828, + "learning_rate": 6.979295377588762e-06, + "loss": 0.6071, + "step": 259870 + }, + { + "epoch": 1.660299247409376, + "grad_norm": 0.8795028924942017, + "learning_rate": 6.9767386044427e-06, + "loss": 0.7728, + "step": 259880 + }, + { + "epoch": 1.6603631345591148, + "grad_norm": 0.8333396911621094, + "learning_rate": 6.974182264578088e-06, + "loss": 0.9647, + "step": 259890 + }, + { + "epoch": 1.6604270217088535, + "grad_norm": 0.7249652147293091, + "learning_rate": 6.971626358020694e-06, + "loss": 0.8005, + "step": 259900 + }, + { + "epoch": 1.6604909088585922, + "grad_norm": 1.110545039176941, + "learning_rate": 6.969070884796247e-06, + "loss": 0.9663, + "step": 259910 + }, + { + "epoch": 1.660554796008331, + "grad_norm": 0.9945274591445923, + "learning_rate": 6.966515844930471e-06, + "loss": 0.9558, + "step": 259920 + }, + { + "epoch": 1.6606186831580696, + "grad_norm": 0.9578294157981873, + "learning_rate": 6.963961238449124e-06, + "loss": 0.7574, + "step": 259930 + }, + { + "epoch": 1.6606825703078083, + "grad_norm": 1.060130000114441, + "learning_rate": 6.961407065377906e-06, + "loss": 1.1417, + "step": 259940 + }, + { + "epoch": 1.660746457457547, + "grad_norm": 0.7615020275115967, + "learning_rate": 6.9588533257425645e-06, + "loss": 0.8906, + "step": 259950 + }, + { + "epoch": 1.6608103446072857, + "grad_norm": 0.857650876045227, + "learning_rate": 6.956300019568795e-06, + "loss": 0.9114, + "step": 259960 + }, + { + "epoch": 1.6608742317570244, + "grad_norm": 1.2820098400115967, + "learning_rate": 6.953747146882328e-06, + "loss": 0.9001, + "step": 259970 + }, + { + "epoch": 1.6609381189067631, + "grad_norm": 1.1419647932052612, + "learning_rate": 6.951194707708863e-06, + "loss": 0.9179, + "step": 259980 + }, + { + "epoch": 1.6610020060565018, + "grad_norm": 
1.4320467710494995, + "learning_rate": 6.94864270207412e-06, + "loss": 0.9435, + "step": 259990 + }, + { + "epoch": 1.6610658932062405, + "grad_norm": 1.4276831150054932, + "learning_rate": 6.946091130003779e-06, + "loss": 0.6951, + "step": 260000 + }, + { + "epoch": 1.6611297803559792, + "grad_norm": 1.3144681453704834, + "learning_rate": 6.9435399915235566e-06, + "loss": 0.8405, + "step": 260010 + }, + { + "epoch": 1.661193667505718, + "grad_norm": 0.8671199083328247, + "learning_rate": 6.940989286659122e-06, + "loss": 0.8414, + "step": 260020 + }, + { + "epoch": 1.6612575546554567, + "grad_norm": 0.8773459792137146, + "learning_rate": 6.938439015436193e-06, + "loss": 0.9126, + "step": 260030 + }, + { + "epoch": 1.6613214418051951, + "grad_norm": 1.2610617876052856, + "learning_rate": 6.935889177880422e-06, + "loss": 1.253, + "step": 260040 + }, + { + "epoch": 1.661385328954934, + "grad_norm": 0.6040871143341064, + "learning_rate": 6.9333397740175055e-06, + "loss": 0.8628, + "step": 260050 + }, + { + "epoch": 1.6614492161046726, + "grad_norm": 0.6715024709701538, + "learning_rate": 6.930790803873122e-06, + "loss": 0.6639, + "step": 260060 + }, + { + "epoch": 1.6615131032544115, + "grad_norm": 0.5819202065467834, + "learning_rate": 6.928242267472928e-06, + "loss": 0.9687, + "step": 260070 + }, + { + "epoch": 1.66157699040415, + "grad_norm": 1.6079118251800537, + "learning_rate": 6.9256941648426055e-06, + "loss": 0.8493, + "step": 260080 + }, + { + "epoch": 1.661640877553889, + "grad_norm": 0.8289951086044312, + "learning_rate": 6.9231464960077986e-06, + "loss": 1.1034, + "step": 260090 + }, + { + "epoch": 1.6617047647036274, + "grad_norm": 0.7515999674797058, + "learning_rate": 6.920599260994182e-06, + "loss": 1.0362, + "step": 260100 + }, + { + "epoch": 1.6617686518533663, + "grad_norm": 0.8849010467529297, + "learning_rate": 6.91805245982739e-06, + "loss": 0.6695, + "step": 260110 + }, + { + "epoch": 1.6618325390031048, + "grad_norm": 1.1140575408935547, + 
"learning_rate": 6.915506092533092e-06, + "loss": 0.8597, + "step": 260120 + }, + { + "epoch": 1.6618964261528437, + "grad_norm": 1.2413811683654785, + "learning_rate": 6.912960159136911e-06, + "loss": 0.9088, + "step": 260130 + }, + { + "epoch": 1.6619603133025822, + "grad_norm": 0.6170461177825928, + "learning_rate": 6.910414659664505e-06, + "loss": 1.1817, + "step": 260140 + }, + { + "epoch": 1.6620242004523211, + "grad_norm": 0.8800461888313293, + "learning_rate": 6.907869594141492e-06, + "loss": 0.7515, + "step": 260150 + }, + { + "epoch": 1.6620880876020596, + "grad_norm": 2.8232409954071045, + "learning_rate": 6.905324962593523e-06, + "loss": 0.6154, + "step": 260160 + }, + { + "epoch": 1.6621519747517985, + "grad_norm": 1.2982561588287354, + "learning_rate": 6.902780765046202e-06, + "loss": 0.8484, + "step": 260170 + }, + { + "epoch": 1.662215861901537, + "grad_norm": 1.3580363988876343, + "learning_rate": 6.9002370015251785e-06, + "loss": 0.7412, + "step": 260180 + }, + { + "epoch": 1.662279749051276, + "grad_norm": 1.23239004611969, + "learning_rate": 6.897693672056038e-06, + "loss": 0.8696, + "step": 260190 + }, + { + "epoch": 1.6623436362010144, + "grad_norm": 1.3502616882324219, + "learning_rate": 6.895150776664428e-06, + "loss": 0.8322, + "step": 260200 + }, + { + "epoch": 1.6624075233507534, + "grad_norm": 2.1571125984191895, + "learning_rate": 6.8926083153759256e-06, + "loss": 0.8226, + "step": 260210 + }, + { + "epoch": 1.6624714105004919, + "grad_norm": 1.4189776182174683, + "learning_rate": 6.890066288216163e-06, + "loss": 0.7431, + "step": 260220 + }, + { + "epoch": 1.6625352976502308, + "grad_norm": 0.8198552131652832, + "learning_rate": 6.887524695210712e-06, + "loss": 1.0104, + "step": 260230 + }, + { + "epoch": 1.6625991847999693, + "grad_norm": 2.3322837352752686, + "learning_rate": 6.884983536385203e-06, + "loss": 1.0087, + "step": 260240 + }, + { + "epoch": 1.6626630719497082, + "grad_norm": 1.4671247005462646, + "learning_rate": 
6.882442811765194e-06, + "loss": 0.9418, + "step": 260250 + }, + { + "epoch": 1.6627269590994467, + "grad_norm": 1.3341039419174194, + "learning_rate": 6.879902521376291e-06, + "loss": 0.8353, + "step": 260260 + }, + { + "epoch": 1.6627908462491854, + "grad_norm": 0.7063724994659424, + "learning_rate": 6.877362665244086e-06, + "loss": 1.0032, + "step": 260270 + }, + { + "epoch": 1.662854733398924, + "grad_norm": 0.9785807132720947, + "learning_rate": 6.87482324339413e-06, + "loss": 0.7307, + "step": 260280 + }, + { + "epoch": 1.6629186205486628, + "grad_norm": 0.6390813589096069, + "learning_rate": 6.872284255852024e-06, + "loss": 0.7543, + "step": 260290 + }, + { + "epoch": 1.6629825076984015, + "grad_norm": 0.8422411680221558, + "learning_rate": 6.869745702643321e-06, + "loss": 0.9207, + "step": 260300 + }, + { + "epoch": 1.6630463948481402, + "grad_norm": 0.996957540512085, + "learning_rate": 6.867207583793595e-06, + "loss": 0.7347, + "step": 260310 + }, + { + "epoch": 1.663110281997879, + "grad_norm": 1.3174203634262085, + "learning_rate": 6.864669899328396e-06, + "loss": 0.8377, + "step": 260320 + }, + { + "epoch": 1.6631741691476176, + "grad_norm": 1.2684632539749146, + "learning_rate": 6.8621326492733e-06, + "loss": 0.8836, + "step": 260330 + }, + { + "epoch": 1.6632380562973563, + "grad_norm": 1.1714234352111816, + "learning_rate": 6.8595958336538366e-06, + "loss": 1.0765, + "step": 260340 + }, + { + "epoch": 1.663301943447095, + "grad_norm": 0.5843120813369751, + "learning_rate": 6.857059452495579e-06, + "loss": 0.8196, + "step": 260350 + }, + { + "epoch": 1.6633658305968337, + "grad_norm": 2.4564054012298584, + "learning_rate": 6.8545235058240455e-06, + "loss": 0.8696, + "step": 260360 + }, + { + "epoch": 1.6634297177465724, + "grad_norm": 1.0437301397323608, + "learning_rate": 6.8519879936647936e-06, + "loss": 1.1825, + "step": 260370 + }, + { + "epoch": 1.6634936048963112, + "grad_norm": 1.1650587320327759, + "learning_rate": 6.849452916043342e-06, + 
"loss": 0.9162, + "step": 260380 + }, + { + "epoch": 1.6635574920460499, + "grad_norm": 0.844201922416687, + "learning_rate": 6.846918272985236e-06, + "loss": 0.811, + "step": 260390 + }, + { + "epoch": 1.6636213791957886, + "grad_norm": 0.9441670179367065, + "learning_rate": 6.844384064516002e-06, + "loss": 0.835, + "step": 260400 + }, + { + "epoch": 1.6636852663455273, + "grad_norm": 0.881705641746521, + "learning_rate": 6.841850290661145e-06, + "loss": 0.8547, + "step": 260410 + }, + { + "epoch": 1.663749153495266, + "grad_norm": 1.6114307641983032, + "learning_rate": 6.839316951446201e-06, + "loss": 0.9447, + "step": 260420 + }, + { + "epoch": 1.6638130406450047, + "grad_norm": 1.4205198287963867, + "learning_rate": 6.83678404689666e-06, + "loss": 1.0585, + "step": 260430 + }, + { + "epoch": 1.6638769277947434, + "grad_norm": 1.1777595281600952, + "learning_rate": 6.834251577038059e-06, + "loss": 0.9045, + "step": 260440 + }, + { + "epoch": 1.663940814944482, + "grad_norm": 0.6923563480377197, + "learning_rate": 6.831719541895881e-06, + "loss": 0.5997, + "step": 260450 + }, + { + "epoch": 1.6640047020942208, + "grad_norm": 1.00764000415802, + "learning_rate": 6.8291879414956404e-06, + "loss": 0.8307, + "step": 260460 + }, + { + "epoch": 1.6640685892439595, + "grad_norm": 0.7519059777259827, + "learning_rate": 6.826656775862816e-06, + "loss": 0.8285, + "step": 260470 + }, + { + "epoch": 1.6641324763936982, + "grad_norm": 1.0380022525787354, + "learning_rate": 6.8241260450229164e-06, + "loss": 0.999, + "step": 260480 + }, + { + "epoch": 1.664196363543437, + "grad_norm": 1.5414055585861206, + "learning_rate": 6.8215957490014135e-06, + "loss": 1.0058, + "step": 260490 + }, + { + "epoch": 1.6642602506931756, + "grad_norm": 1.0284233093261719, + "learning_rate": 6.8190658878238e-06, + "loss": 0.7669, + "step": 260500 + }, + { + "epoch": 1.6643241378429143, + "grad_norm": 0.8977694511413574, + "learning_rate": 6.816536461515544e-06, + "loss": 0.891, + "step": 260510 + 
}, + { + "epoch": 1.664388024992653, + "grad_norm": 0.6301488280296326, + "learning_rate": 6.814007470102135e-06, + "loss": 0.8527, + "step": 260520 + }, + { + "epoch": 1.6644519121423915, + "grad_norm": 0.8851107358932495, + "learning_rate": 6.811478913609021e-06, + "loss": 0.9336, + "step": 260530 + }, + { + "epoch": 1.6645157992921304, + "grad_norm": 2.0833637714385986, + "learning_rate": 6.808950792061692e-06, + "loss": 0.9544, + "step": 260540 + }, + { + "epoch": 1.664579686441869, + "grad_norm": 1.2819724082946777, + "learning_rate": 6.806423105485577e-06, + "loss": 0.9766, + "step": 260550 + }, + { + "epoch": 1.6646435735916079, + "grad_norm": 0.8446447253227234, + "learning_rate": 6.803895853906161e-06, + "loss": 0.7945, + "step": 260560 + }, + { + "epoch": 1.6647074607413463, + "grad_norm": 1.1862691640853882, + "learning_rate": 6.801369037348876e-06, + "loss": 0.8252, + "step": 260570 + }, + { + "epoch": 1.6647713478910853, + "grad_norm": 0.799518883228302, + "learning_rate": 6.798842655839177e-06, + "loss": 0.7502, + "step": 260580 + }, + { + "epoch": 1.6648352350408238, + "grad_norm": 0.642816424369812, + "learning_rate": 6.796316709402517e-06, + "loss": 0.6272, + "step": 260590 + }, + { + "epoch": 1.6648991221905627, + "grad_norm": 0.8209549188613892, + "learning_rate": 6.793791198064309e-06, + "loss": 0.5957, + "step": 260600 + }, + { + "epoch": 1.6649630093403012, + "grad_norm": 0.8683721423149109, + "learning_rate": 6.79126612185002e-06, + "loss": 0.7731, + "step": 260610 + }, + { + "epoch": 1.66502689649004, + "grad_norm": 0.5981495380401611, + "learning_rate": 6.788741480785049e-06, + "loss": 0.9643, + "step": 260620 + }, + { + "epoch": 1.6650907836397786, + "grad_norm": 0.8800461292266846, + "learning_rate": 6.7862172748948405e-06, + "loss": 0.9372, + "step": 260630 + }, + { + "epoch": 1.6651546707895175, + "grad_norm": 2.0129380226135254, + "learning_rate": 6.7836935042048025e-06, + "loss": 0.8178, + "step": 260640 + }, + { + "epoch": 
1.665218557939256, + "grad_norm": 2.3390965461730957, + "learning_rate": 6.781170168740369e-06, + "loss": 0.826, + "step": 260650 + }, + { + "epoch": 1.665282445088995, + "grad_norm": 0.812234103679657, + "learning_rate": 6.778647268526933e-06, + "loss": 0.8003, + "step": 260660 + }, + { + "epoch": 1.6653463322387334, + "grad_norm": 1.0095510482788086, + "learning_rate": 6.7761248035899175e-06, + "loss": 0.9685, + "step": 260670 + }, + { + "epoch": 1.6654102193884723, + "grad_norm": 0.8833625316619873, + "learning_rate": 6.7736027739547125e-06, + "loss": 0.8384, + "step": 260680 + }, + { + "epoch": 1.6654741065382108, + "grad_norm": 1.3582264184951782, + "learning_rate": 6.7710811796467265e-06, + "loss": 1.0906, + "step": 260690 + }, + { + "epoch": 1.6655379936879497, + "grad_norm": 0.8271639347076416, + "learning_rate": 6.76856002069135e-06, + "loss": 0.6488, + "step": 260700 + }, + { + "epoch": 1.6656018808376882, + "grad_norm": 0.9180144667625427, + "learning_rate": 6.76603929711398e-06, + "loss": 1.0439, + "step": 260710 + }, + { + "epoch": 1.6656657679874272, + "grad_norm": 1.2316609621047974, + "learning_rate": 6.763519008939989e-06, + "loss": 0.8008, + "step": 260720 + }, + { + "epoch": 1.6657296551371656, + "grad_norm": 0.8312288522720337, + "learning_rate": 6.760999156194775e-06, + "loss": 0.6873, + "step": 260730 + }, + { + "epoch": 1.6657935422869046, + "grad_norm": 0.6729956269264221, + "learning_rate": 6.758479738903695e-06, + "loss": 0.7072, + "step": 260740 + }, + { + "epoch": 1.665857429436643, + "grad_norm": 0.7326833605766296, + "learning_rate": 6.7559607570921445e-06, + "loss": 1.0599, + "step": 260750 + }, + { + "epoch": 1.6659213165863818, + "grad_norm": 0.9062512516975403, + "learning_rate": 6.7534422107854715e-06, + "loss": 0.8788, + "step": 260760 + }, + { + "epoch": 1.6659852037361205, + "grad_norm": 1.1865432262420654, + "learning_rate": 6.75092410000906e-06, + "loss": 1.0603, + "step": 260770 + }, + { + "epoch": 1.6660490908858592, + 
"grad_norm": 1.0734955072402954, + "learning_rate": 6.748406424788251e-06, + "loss": 0.8722, + "step": 260780 + }, + { + "epoch": 1.6661129780355979, + "grad_norm": 1.5629147291183472, + "learning_rate": 6.745889185148402e-06, + "loss": 0.7382, + "step": 260790 + }, + { + "epoch": 1.6661768651853366, + "grad_norm": 0.8913010954856873, + "learning_rate": 6.7433723811148816e-06, + "loss": 0.7195, + "step": 260800 + }, + { + "epoch": 1.6662407523350753, + "grad_norm": 0.9528048038482666, + "learning_rate": 6.74085601271301e-06, + "loss": 0.8426, + "step": 260810 + }, + { + "epoch": 1.666304639484814, + "grad_norm": 0.8141299486160278, + "learning_rate": 6.738340079968158e-06, + "loss": 1.0155, + "step": 260820 + }, + { + "epoch": 1.6663685266345527, + "grad_norm": 0.9365982413291931, + "learning_rate": 6.735824582905636e-06, + "loss": 0.9692, + "step": 260830 + }, + { + "epoch": 1.6664324137842914, + "grad_norm": 1.3122490644454956, + "learning_rate": 6.7333095215507956e-06, + "loss": 0.9352, + "step": 260840 + }, + { + "epoch": 1.6664963009340301, + "grad_norm": 0.897006094455719, + "learning_rate": 6.730794895928954e-06, + "loss": 0.9062, + "step": 260850 + }, + { + "epoch": 1.6665601880837688, + "grad_norm": 1.195637822151184, + "learning_rate": 6.728280706065448e-06, + "loss": 0.6769, + "step": 260860 + }, + { + "epoch": 1.6666240752335075, + "grad_norm": 0.8795990943908691, + "learning_rate": 6.725766951985585e-06, + "loss": 0.8848, + "step": 260870 + }, + { + "epoch": 1.6666879623832462, + "grad_norm": 1.1648136377334595, + "learning_rate": 6.723253633714694e-06, + "loss": 0.907, + "step": 260880 + }, + { + "epoch": 1.666751849532985, + "grad_norm": 1.356160283088684, + "learning_rate": 6.720740751278082e-06, + "loss": 0.8372, + "step": 260890 + }, + { + "epoch": 1.6668157366827236, + "grad_norm": 1.5002801418304443, + "learning_rate": 6.7182283047010355e-06, + "loss": 0.9494, + "step": 260900 + }, + { + "epoch": 1.6668796238324624, + "grad_norm": 
0.8991245627403259, + "learning_rate": 6.715716294008889e-06, + "loss": 0.7429, + "step": 260910 + }, + { + "epoch": 1.666943510982201, + "grad_norm": 0.7147846221923828, + "learning_rate": 6.713204719226912e-06, + "loss": 1.0831, + "step": 260920 + }, + { + "epoch": 1.6670073981319398, + "grad_norm": 1.274672508239746, + "learning_rate": 6.710693580380423e-06, + "loss": 1.1, + "step": 260930 + }, + { + "epoch": 1.6670712852816785, + "grad_norm": 1.1722091436386108, + "learning_rate": 6.70818287749469e-06, + "loss": 1.0835, + "step": 260940 + }, + { + "epoch": 1.6671351724314172, + "grad_norm": 0.9067403674125671, + "learning_rate": 6.705672610595021e-06, + "loss": 0.7682, + "step": 260950 + }, + { + "epoch": 1.6671990595811559, + "grad_norm": 1.082120656967163, + "learning_rate": 6.703162779706673e-06, + "loss": 1.2327, + "step": 260960 + }, + { + "epoch": 1.6672629467308946, + "grad_norm": 1.0921111106872559, + "learning_rate": 6.700653384854944e-06, + "loss": 0.8915, + "step": 260970 + }, + { + "epoch": 1.6673268338806333, + "grad_norm": 1.0145679712295532, + "learning_rate": 6.698144426065079e-06, + "loss": 1.1276, + "step": 260980 + }, + { + "epoch": 1.667390721030372, + "grad_norm": 0.8104844689369202, + "learning_rate": 6.6956359033623795e-06, + "loss": 0.7862, + "step": 260990 + }, + { + "epoch": 1.6674546081801105, + "grad_norm": 1.0032187700271606, + "learning_rate": 6.693127816772071e-06, + "loss": 0.9296, + "step": 261000 + }, + { + "epoch": 1.6675184953298494, + "grad_norm": 1.3268076181411743, + "learning_rate": 6.6906201663194505e-06, + "loss": 0.6579, + "step": 261010 + }, + { + "epoch": 1.667582382479588, + "grad_norm": 0.8994929194450378, + "learning_rate": 6.688112952029735e-06, + "loss": 0.8517, + "step": 261020 + }, + { + "epoch": 1.6676462696293268, + "grad_norm": 1.6113344430923462, + "learning_rate": 6.685606173928205e-06, + "loss": 0.742, + "step": 261030 + }, + { + "epoch": 1.6677101567790653, + "grad_norm": 1.1202119588851929, + 
"learning_rate": 6.68309983204008e-06, + "loss": 0.6721, + "step": 261040 + }, + { + "epoch": 1.6677740439288042, + "grad_norm": 1.158851146697998, + "learning_rate": 6.680593926390627e-06, + "loss": 0.9163, + "step": 261050 + }, + { + "epoch": 1.6678379310785427, + "grad_norm": 0.9848284125328064, + "learning_rate": 6.678088457005061e-06, + "loss": 0.8478, + "step": 261060 + }, + { + "epoch": 1.6679018182282817, + "grad_norm": 2.442725419998169, + "learning_rate": 6.675583423908632e-06, + "loss": 0.9834, + "step": 261070 + }, + { + "epoch": 1.6679657053780201, + "grad_norm": 0.6988223195075989, + "learning_rate": 6.67307882712655e-06, + "loss": 0.7203, + "step": 261080 + }, + { + "epoch": 1.668029592527759, + "grad_norm": 1.1047319173812866, + "learning_rate": 6.670574666684054e-06, + "loss": 0.8124, + "step": 261090 + }, + { + "epoch": 1.6680934796774975, + "grad_norm": 0.8708413243293762, + "learning_rate": 6.668070942606352e-06, + "loss": 0.9876, + "step": 261100 + }, + { + "epoch": 1.6681573668272365, + "grad_norm": 1.0053987503051758, + "learning_rate": 6.665567654918659e-06, + "loss": 0.8756, + "step": 261110 + }, + { + "epoch": 1.668221253976975, + "grad_norm": 0.6905286908149719, + "learning_rate": 6.6630648036462015e-06, + "loss": 0.9977, + "step": 261120 + }, + { + "epoch": 1.6682851411267139, + "grad_norm": 0.7959697246551514, + "learning_rate": 6.660562388814162e-06, + "loss": 0.8796, + "step": 261130 + }, + { + "epoch": 1.6683490282764524, + "grad_norm": 0.7797834277153015, + "learning_rate": 6.658060410447764e-06, + "loss": 0.825, + "step": 261140 + }, + { + "epoch": 1.6684129154261913, + "grad_norm": 0.5888381600379944, + "learning_rate": 6.6555588685721815e-06, + "loss": 0.6742, + "step": 261150 + }, + { + "epoch": 1.6684768025759298, + "grad_norm": 1.089735984802246, + "learning_rate": 6.653057763212628e-06, + "loss": 1.0176, + "step": 261160 + }, + { + "epoch": 1.6685406897256687, + "grad_norm": 0.662281334400177, + "learning_rate": 
6.650557094394278e-06, + "loss": 0.7775, + "step": 261170 + }, + { + "epoch": 1.6686045768754072, + "grad_norm": 0.7887861132621765, + "learning_rate": 6.6480568621423265e-06, + "loss": 0.6818, + "step": 261180 + }, + { + "epoch": 1.6686684640251461, + "grad_norm": 1.110487937927246, + "learning_rate": 6.645557066481939e-06, + "loss": 0.7015, + "step": 261190 + }, + { + "epoch": 1.6687323511748846, + "grad_norm": 0.8117868304252625, + "learning_rate": 6.643057707438311e-06, + "loss": 0.9294, + "step": 261200 + }, + { + "epoch": 1.6687962383246235, + "grad_norm": 1.027706503868103, + "learning_rate": 6.640558785036588e-06, + "loss": 0.8158, + "step": 261210 + }, + { + "epoch": 1.668860125474362, + "grad_norm": 1.0780733823776245, + "learning_rate": 6.638060299301962e-06, + "loss": 0.9441, + "step": 261220 + }, + { + "epoch": 1.668924012624101, + "grad_norm": 0.6005793213844299, + "learning_rate": 6.6355622502595696e-06, + "loss": 0.7146, + "step": 261230 + }, + { + "epoch": 1.6689878997738394, + "grad_norm": 1.7903164625167847, + "learning_rate": 6.633064637934594e-06, + "loss": 0.7896, + "step": 261240 + }, + { + "epoch": 1.6690517869235781, + "grad_norm": 0.8301639556884766, + "learning_rate": 6.630567462352161e-06, + "loss": 0.9835, + "step": 261250 + }, + { + "epoch": 1.6691156740733168, + "grad_norm": 0.9477317333221436, + "learning_rate": 6.628070723537444e-06, + "loss": 0.7894, + "step": 261260 + }, + { + "epoch": 1.6691795612230556, + "grad_norm": 0.8038954734802246, + "learning_rate": 6.625574421515568e-06, + "loss": 1.0712, + "step": 261270 + }, + { + "epoch": 1.6692434483727943, + "grad_norm": 0.5543741583824158, + "learning_rate": 6.623078556311696e-06, + "loss": 1.0573, + "step": 261280 + }, + { + "epoch": 1.669307335522533, + "grad_norm": 0.5797288417816162, + "learning_rate": 6.620583127950936e-06, + "loss": 0.7646, + "step": 261290 + }, + { + "epoch": 1.6693712226722717, + "grad_norm": 0.6953587532043457, + "learning_rate": 6.618088136458428e-06, + 
"loss": 0.8131, + "step": 261300 + }, + { + "epoch": 1.6694351098220104, + "grad_norm": 0.9206896424293518, + "learning_rate": 6.615593581859319e-06, + "loss": 0.6902, + "step": 261310 + }, + { + "epoch": 1.669498996971749, + "grad_norm": 0.7176907062530518, + "learning_rate": 6.613348856284723e-06, + "loss": 0.9896, + "step": 261320 + }, + { + "epoch": 1.6695628841214878, + "grad_norm": 0.7699728012084961, + "learning_rate": 6.61085513185225e-06, + "loss": 0.9096, + "step": 261330 + }, + { + "epoch": 1.6696267712712265, + "grad_norm": 1.0003939867019653, + "learning_rate": 6.6083618443859906e-06, + "loss": 1.0432, + "step": 261340 + }, + { + "epoch": 1.6696906584209652, + "grad_norm": 0.6458075642585754, + "learning_rate": 6.605868993911074e-06, + "loss": 0.8929, + "step": 261350 + }, + { + "epoch": 1.669754545570704, + "grad_norm": 0.8971933722496033, + "learning_rate": 6.603376580452591e-06, + "loss": 0.9198, + "step": 261360 + }, + { + "epoch": 1.6698184327204426, + "grad_norm": 0.899749219417572, + "learning_rate": 6.600884604035662e-06, + "loss": 1.0874, + "step": 261370 + }, + { + "epoch": 1.6698823198701813, + "grad_norm": 0.8684645295143127, + "learning_rate": 6.598393064685354e-06, + "loss": 0.9654, + "step": 261380 + }, + { + "epoch": 1.66994620701992, + "grad_norm": 1.1220735311508179, + "learning_rate": 6.595901962426793e-06, + "loss": 0.9396, + "step": 261390 + }, + { + "epoch": 1.6700100941696587, + "grad_norm": 0.7436129450798035, + "learning_rate": 6.593411297285035e-06, + "loss": 0.8479, + "step": 261400 + }, + { + "epoch": 1.6700739813193974, + "grad_norm": 0.8839870095252991, + "learning_rate": 6.590921069285188e-06, + "loss": 0.8318, + "step": 261410 + }, + { + "epoch": 1.6701378684691361, + "grad_norm": 1.568926453590393, + "learning_rate": 6.588431278452312e-06, + "loss": 0.6912, + "step": 261420 + }, + { + "epoch": 1.6702017556188749, + "grad_norm": 0.7818752527236938, + "learning_rate": 6.585941924811484e-06, + "loss": 0.8117, + "step": 
261430 + }, + { + "epoch": 1.6702656427686136, + "grad_norm": 0.7253796458244324, + "learning_rate": 6.583453008387797e-06, + "loss": 0.8255, + "step": 261440 + }, + { + "epoch": 1.6703295299183523, + "grad_norm": 0.8075445890426636, + "learning_rate": 6.580964529206285e-06, + "loss": 0.8221, + "step": 261450 + }, + { + "epoch": 1.670393417068091, + "grad_norm": 1.1005500555038452, + "learning_rate": 6.578476487292029e-06, + "loss": 0.7249, + "step": 261460 + }, + { + "epoch": 1.6704573042178297, + "grad_norm": 0.7692040205001831, + "learning_rate": 6.575988882670075e-06, + "loss": 0.815, + "step": 261470 + }, + { + "epoch": 1.6705211913675684, + "grad_norm": 1.0020064115524292, + "learning_rate": 6.573501715365487e-06, + "loss": 0.9109, + "step": 261480 + }, + { + "epoch": 1.6705850785173069, + "grad_norm": 0.8735017776489258, + "learning_rate": 6.571014985403301e-06, + "loss": 0.8558, + "step": 261490 + }, + { + "epoch": 1.6706489656670458, + "grad_norm": 1.1462574005126953, + "learning_rate": 6.568528692808568e-06, + "loss": 1.2546, + "step": 261500 + }, + { + "epoch": 1.6707128528167843, + "grad_norm": 0.6212501525878906, + "learning_rate": 6.566042837606323e-06, + "loss": 0.8025, + "step": 261510 + }, + { + "epoch": 1.6707767399665232, + "grad_norm": 0.7799587249755859, + "learning_rate": 6.563557419821614e-06, + "loss": 0.8916, + "step": 261520 + }, + { + "epoch": 1.6708406271162617, + "grad_norm": 1.1324352025985718, + "learning_rate": 6.561072439479443e-06, + "loss": 0.7127, + "step": 261530 + }, + { + "epoch": 1.6709045142660006, + "grad_norm": 0.8433434367179871, + "learning_rate": 6.558587896604856e-06, + "loss": 0.7443, + "step": 261540 + }, + { + "epoch": 1.670968401415739, + "grad_norm": 0.7592739462852478, + "learning_rate": 6.556103791222879e-06, + "loss": 0.7583, + "step": 261550 + }, + { + "epoch": 1.671032288565478, + "grad_norm": 1.1327989101409912, + "learning_rate": 6.553620123358506e-06, + "loss": 0.9683, + "step": 261560 + }, + { + "epoch": 
1.6710961757152165, + "grad_norm": 0.8317406177520752, + "learning_rate": 6.5511368930367855e-06, + "loss": 0.6488, + "step": 261570 + }, + { + "epoch": 1.6711600628649554, + "grad_norm": 1.7189500331878662, + "learning_rate": 6.548654100282686e-06, + "loss": 0.9956, + "step": 261580 + }, + { + "epoch": 1.671223950014694, + "grad_norm": 0.8068856000900269, + "learning_rate": 6.546171745121243e-06, + "loss": 1.0227, + "step": 261590 + }, + { + "epoch": 1.6712878371644329, + "grad_norm": 1.6268513202667236, + "learning_rate": 6.543689827577432e-06, + "loss": 0.8547, + "step": 261600 + }, + { + "epoch": 1.6713517243141713, + "grad_norm": 1.141613483428955, + "learning_rate": 6.541208347676275e-06, + "loss": 0.7498, + "step": 261610 + }, + { + "epoch": 1.6714156114639103, + "grad_norm": 0.9748889207839966, + "learning_rate": 6.538727305442732e-06, + "loss": 0.9626, + "step": 261620 + }, + { + "epoch": 1.6714794986136488, + "grad_norm": 1.0086246728897095, + "learning_rate": 6.5362467009018154e-06, + "loss": 0.9017, + "step": 261630 + }, + { + "epoch": 1.6715433857633877, + "grad_norm": 1.1621673107147217, + "learning_rate": 6.5337665340784835e-06, + "loss": 0.6427, + "step": 261640 + }, + { + "epoch": 1.6716072729131262, + "grad_norm": 0.7008302807807922, + "learning_rate": 6.5312868049977434e-06, + "loss": 0.7493, + "step": 261650 + }, + { + "epoch": 1.671671160062865, + "grad_norm": 1.4598950147628784, + "learning_rate": 6.528807513684537e-06, + "loss": 1.1208, + "step": 261660 + }, + { + "epoch": 1.6717350472126036, + "grad_norm": 0.8973243832588196, + "learning_rate": 6.526328660163855e-06, + "loss": 0.9835, + "step": 261670 + }, + { + "epoch": 1.6717989343623425, + "grad_norm": 1.616610050201416, + "learning_rate": 6.523850244460644e-06, + "loss": 0.9121, + "step": 261680 + }, + { + "epoch": 1.671862821512081, + "grad_norm": 0.8376414179801941, + "learning_rate": 6.521372266599885e-06, + "loss": 0.8662, + "step": 261690 + }, + { + "epoch": 1.67192670866182, + 
"grad_norm": 1.0430593490600586, + "learning_rate": 6.518894726606511e-06, + "loss": 0.813, + "step": 261700 + }, + { + "epoch": 1.6719905958115584, + "grad_norm": 0.6665481328964233, + "learning_rate": 6.516417624505494e-06, + "loss": 0.7176, + "step": 261710 + }, + { + "epoch": 1.6720544829612973, + "grad_norm": 0.7445017695426941, + "learning_rate": 6.513940960321757e-06, + "loss": 0.8957, + "step": 261720 + }, + { + "epoch": 1.6721183701110358, + "grad_norm": 1.3266037702560425, + "learning_rate": 6.5114647340802695e-06, + "loss": 0.823, + "step": 261730 + }, + { + "epoch": 1.6721822572607745, + "grad_norm": 0.736445963382721, + "learning_rate": 6.508988945805944e-06, + "loss": 0.9823, + "step": 261740 + }, + { + "epoch": 1.6722461444105132, + "grad_norm": 0.9925602674484253, + "learning_rate": 6.506513595523722e-06, + "loss": 1.0593, + "step": 261750 + }, + { + "epoch": 1.672310031560252, + "grad_norm": 0.9995372295379639, + "learning_rate": 6.504038683258551e-06, + "loss": 1.0638, + "step": 261760 + }, + { + "epoch": 1.6723739187099906, + "grad_norm": 0.6971041560173035, + "learning_rate": 6.501564209035327e-06, + "loss": 1.1433, + "step": 261770 + }, + { + "epoch": 1.6724378058597293, + "grad_norm": 0.8027698397636414, + "learning_rate": 6.499090172878991e-06, + "loss": 0.8928, + "step": 261780 + }, + { + "epoch": 1.672501693009468, + "grad_norm": 0.8169634342193604, + "learning_rate": 6.49661657481444e-06, + "loss": 0.746, + "step": 261790 + }, + { + "epoch": 1.6725655801592068, + "grad_norm": 0.7390976548194885, + "learning_rate": 6.49414341486661e-06, + "loss": 0.9419, + "step": 261800 + }, + { + "epoch": 1.6726294673089455, + "grad_norm": 1.2797377109527588, + "learning_rate": 6.491670693060381e-06, + "loss": 1.0843, + "step": 261810 + }, + { + "epoch": 1.6726933544586842, + "grad_norm": 0.6688075065612793, + "learning_rate": 6.489198409420682e-06, + "loss": 0.954, + "step": 261820 + }, + { + "epoch": 1.6727572416084229, + "grad_norm": 
0.9320153594017029, + "learning_rate": 6.486726563972384e-06, + "loss": 0.9742, + "step": 261830 + }, + { + "epoch": 1.6728211287581616, + "grad_norm": 1.3801181316375732, + "learning_rate": 6.4842551567404045e-06, + "loss": 0.9538, + "step": 261840 + }, + { + "epoch": 1.6728850159079003, + "grad_norm": 0.7845503091812134, + "learning_rate": 6.481784187749612e-06, + "loss": 0.7776, + "step": 261850 + }, + { + "epoch": 1.672948903057639, + "grad_norm": 1.392819881439209, + "learning_rate": 6.47931365702491e-06, + "loss": 0.7982, + "step": 261860 + }, + { + "epoch": 1.6730127902073777, + "grad_norm": 1.0314170122146606, + "learning_rate": 6.476843564591162e-06, + "loss": 0.8069, + "step": 261870 + }, + { + "epoch": 1.6730766773571164, + "grad_norm": 0.861655592918396, + "learning_rate": 6.474373910473258e-06, + "loss": 0.59, + "step": 261880 + }, + { + "epoch": 1.673140564506855, + "grad_norm": 0.8280917406082153, + "learning_rate": 6.471904694696057e-06, + "loss": 0.6813, + "step": 261890 + }, + { + "epoch": 1.6732044516565938, + "grad_norm": 1.4525123834609985, + "learning_rate": 6.469435917284445e-06, + "loss": 0.6511, + "step": 261900 + }, + { + "epoch": 1.6732683388063325, + "grad_norm": 1.097639799118042, + "learning_rate": 6.466967578263261e-06, + "loss": 0.8087, + "step": 261910 + }, + { + "epoch": 1.6733322259560712, + "grad_norm": 1.2644404172897339, + "learning_rate": 6.4644996776573815e-06, + "loss": 0.8811, + "step": 261920 + }, + { + "epoch": 1.67339611310581, + "grad_norm": 0.8408343195915222, + "learning_rate": 6.462032215491648e-06, + "loss": 0.8309, + "step": 261930 + }, + { + "epoch": 1.6734600002555486, + "grad_norm": 1.4719483852386475, + "learning_rate": 6.4595651917909225e-06, + "loss": 0.7323, + "step": 261940 + }, + { + "epoch": 1.6735238874052873, + "grad_norm": 1.1225919723510742, + "learning_rate": 6.457098606580036e-06, + "loss": 0.9404, + "step": 261950 + }, + { + "epoch": 1.673587774555026, + "grad_norm": 1.05169677734375, + 
"learning_rate": 6.454632459883836e-06, + "loss": 0.7675, + "step": 261960 + }, + { + "epoch": 1.6736516617047648, + "grad_norm": 1.0684877634048462, + "learning_rate": 6.4521667517271736e-06, + "loss": 0.9407, + "step": 261970 + }, + { + "epoch": 1.6737155488545032, + "grad_norm": 1.4952465295791626, + "learning_rate": 6.449701482134851e-06, + "loss": 0.6827, + "step": 261980 + }, + { + "epoch": 1.6737794360042422, + "grad_norm": 2.874864101409912, + "learning_rate": 6.447236651131722e-06, + "loss": 0.8218, + "step": 261990 + }, + { + "epoch": 1.6738433231539807, + "grad_norm": 1.047302484512329, + "learning_rate": 6.4447722587425985e-06, + "loss": 0.7981, + "step": 262000 + }, + { + "epoch": 1.6739072103037196, + "grad_norm": 1.5692522525787354, + "learning_rate": 6.442308304992295e-06, + "loss": 1.2054, + "step": 262010 + }, + { + "epoch": 1.673971097453458, + "grad_norm": 1.0707038640975952, + "learning_rate": 6.439844789905625e-06, + "loss": 0.8025, + "step": 262020 + }, + { + "epoch": 1.674034984603197, + "grad_norm": 0.7095916867256165, + "learning_rate": 6.437381713507412e-06, + "loss": 0.6055, + "step": 262030 + }, + { + "epoch": 1.6740988717529355, + "grad_norm": 1.031847357749939, + "learning_rate": 6.4349190758224375e-06, + "loss": 0.8475, + "step": 262040 + }, + { + "epoch": 1.6741627589026744, + "grad_norm": 0.6809356808662415, + "learning_rate": 6.432456876875537e-06, + "loss": 0.8638, + "step": 262050 + }, + { + "epoch": 1.674226646052413, + "grad_norm": 1.3535555601119995, + "learning_rate": 6.429995116691468e-06, + "loss": 0.7976, + "step": 262060 + }, + { + "epoch": 1.6742905332021518, + "grad_norm": 1.0233763456344604, + "learning_rate": 6.427533795295043e-06, + "loss": 1.1708, + "step": 262070 + }, + { + "epoch": 1.6743544203518903, + "grad_norm": 1.219503402709961, + "learning_rate": 6.425072912711061e-06, + "loss": 0.8271, + "step": 262080 + }, + { + "epoch": 1.6744183075016292, + "grad_norm": 1.8418071269989014, + "learning_rate": 
6.4226124689642784e-06, + "loss": 0.9065, + "step": 262090 + }, + { + "epoch": 1.6744821946513677, + "grad_norm": 1.0959925651550293, + "learning_rate": 6.4201524640795045e-06, + "loss": 1.0587, + "step": 262100 + }, + { + "epoch": 1.6745460818011066, + "grad_norm": 0.8647409081459045, + "learning_rate": 6.4176928980814785e-06, + "loss": 0.9943, + "step": 262110 + }, + { + "epoch": 1.6746099689508451, + "grad_norm": 1.121370553970337, + "learning_rate": 6.415233770995005e-06, + "loss": 1.0122, + "step": 262120 + }, + { + "epoch": 1.674673856100584, + "grad_norm": 0.7462054491043091, + "learning_rate": 6.412775082844824e-06, + "loss": 0.8022, + "step": 262130 + }, + { + "epoch": 1.6747377432503225, + "grad_norm": 1.5839956998825073, + "learning_rate": 6.410316833655716e-06, + "loss": 0.7529, + "step": 262140 + }, + { + "epoch": 1.6748016304000615, + "grad_norm": 0.8902942538261414, + "learning_rate": 6.4078590234524174e-06, + "loss": 0.9492, + "step": 262150 + }, + { + "epoch": 1.6748655175498, + "grad_norm": 1.2881721258163452, + "learning_rate": 6.405401652259701e-06, + "loss": 0.9617, + "step": 262160 + }, + { + "epoch": 1.6749294046995389, + "grad_norm": 1.0738145112991333, + "learning_rate": 6.402944720102294e-06, + "loss": 0.869, + "step": 262170 + }, + { + "epoch": 1.6749932918492774, + "grad_norm": 1.1131645441055298, + "learning_rate": 6.400488227004958e-06, + "loss": 0.7081, + "step": 262180 + }, + { + "epoch": 1.6750571789990163, + "grad_norm": 0.7208460569381714, + "learning_rate": 6.398032172992418e-06, + "loss": 1.1058, + "step": 262190 + }, + { + "epoch": 1.6751210661487548, + "grad_norm": 0.9222758412361145, + "learning_rate": 6.395576558089428e-06, + "loss": 0.8924, + "step": 262200 + }, + { + "epoch": 1.6751849532984937, + "grad_norm": 1.1875065565109253, + "learning_rate": 6.393121382320688e-06, + "loss": 0.6993, + "step": 262210 + }, + { + "epoch": 1.6752488404482322, + "grad_norm": 1.047171950340271, + "learning_rate": 6.390666645710958e-06, + 
"loss": 0.7424, + "step": 262220 + }, + { + "epoch": 1.675312727597971, + "grad_norm": 0.8368127346038818, + "learning_rate": 6.388212348284928e-06, + "loss": 0.7994, + "step": 262230 + }, + { + "epoch": 1.6753766147477096, + "grad_norm": 0.6694656014442444, + "learning_rate": 6.38575849006734e-06, + "loss": 0.8353, + "step": 262240 + }, + { + "epoch": 1.6754405018974483, + "grad_norm": 1.1006664037704468, + "learning_rate": 6.3833050710828845e-06, + "loss": 0.7687, + "step": 262250 + }, + { + "epoch": 1.675504389047187, + "grad_norm": 1.3615388870239258, + "learning_rate": 6.380852091356293e-06, + "loss": 0.7974, + "step": 262260 + }, + { + "epoch": 1.6755682761969257, + "grad_norm": 1.3918992280960083, + "learning_rate": 6.378399550912245e-06, + "loss": 0.8418, + "step": 262270 + }, + { + "epoch": 1.6756321633466644, + "grad_norm": 0.9409014582633972, + "learning_rate": 6.375947449775449e-06, + "loss": 0.7166, + "step": 262280 + }, + { + "epoch": 1.6756960504964031, + "grad_norm": 0.5877164006233215, + "learning_rate": 6.373495787970618e-06, + "loss": 0.8951, + "step": 262290 + }, + { + "epoch": 1.6757599376461418, + "grad_norm": 1.2059690952301025, + "learning_rate": 6.371044565522416e-06, + "loss": 1.0055, + "step": 262300 + }, + { + "epoch": 1.6758238247958805, + "grad_norm": 1.808434009552002, + "learning_rate": 6.368593782455545e-06, + "loss": 0.9485, + "step": 262310 + }, + { + "epoch": 1.6758877119456193, + "grad_norm": 0.8852072954177856, + "learning_rate": 6.366143438794675e-06, + "loss": 0.9596, + "step": 262320 + }, + { + "epoch": 1.675951599095358, + "grad_norm": 1.7130441665649414, + "learning_rate": 6.363693534564497e-06, + "loss": 0.8242, + "step": 262330 + }, + { + "epoch": 1.6760154862450967, + "grad_norm": 0.662137508392334, + "learning_rate": 6.361244069789668e-06, + "loss": 0.6081, + "step": 262340 + }, + { + "epoch": 1.6760793733948354, + "grad_norm": 1.2962076663970947, + "learning_rate": 6.358795044494875e-06, + "loss": 0.8961, + "step": 
262350 + }, + { + "epoch": 1.676143260544574, + "grad_norm": 1.1145683526992798, + "learning_rate": 6.3563464587047596e-06, + "loss": 0.9144, + "step": 262360 + }, + { + "epoch": 1.6762071476943128, + "grad_norm": 0.998020350933075, + "learning_rate": 6.353898312444001e-06, + "loss": 0.9555, + "step": 262370 + }, + { + "epoch": 1.6762710348440515, + "grad_norm": 1.1781702041625977, + "learning_rate": 6.351450605737236e-06, + "loss": 1.0516, + "step": 262380 + }, + { + "epoch": 1.6763349219937902, + "grad_norm": 0.9183396697044373, + "learning_rate": 6.349003338609138e-06, + "loss": 0.791, + "step": 262390 + }, + { + "epoch": 1.676398809143529, + "grad_norm": 1.1460124254226685, + "learning_rate": 6.346556511084329e-06, + "loss": 0.8932, + "step": 262400 + }, + { + "epoch": 1.6764626962932676, + "grad_norm": 0.7981510758399963, + "learning_rate": 6.344110123187469e-06, + "loss": 0.94, + "step": 262410 + }, + { + "epoch": 1.6765265834430063, + "grad_norm": 0.9553250074386597, + "learning_rate": 6.341664174943179e-06, + "loss": 1.0921, + "step": 262420 + }, + { + "epoch": 1.676590470592745, + "grad_norm": 0.48150333762168884, + "learning_rate": 6.339218666376106e-06, + "loss": 0.5992, + "step": 262430 + }, + { + "epoch": 1.6766543577424837, + "grad_norm": 1.119012475013733, + "learning_rate": 6.33677359751087e-06, + "loss": 1.0035, + "step": 262440 + }, + { + "epoch": 1.6767182448922224, + "grad_norm": 1.050771951675415, + "learning_rate": 6.3343289683721055e-06, + "loss": 0.9006, + "step": 262450 + }, + { + "epoch": 1.6767821320419611, + "grad_norm": 0.6806208491325378, + "learning_rate": 6.331884778984415e-06, + "loss": 0.9743, + "step": 262460 + }, + { + "epoch": 1.6768460191916996, + "grad_norm": 0.9381636381149292, + "learning_rate": 6.329441029372435e-06, + "loss": 0.861, + "step": 262470 + }, + { + "epoch": 1.6769099063414385, + "grad_norm": 1.048666000366211, + "learning_rate": 6.326997719560751e-06, + "loss": 0.9453, + "step": 262480 + }, + { + "epoch": 
1.676973793491177, + "grad_norm": 0.7955247163772583, + "learning_rate": 6.324554849573994e-06, + "loss": 0.8905, + "step": 262490 + }, + { + "epoch": 1.677037680640916, + "grad_norm": 0.6810150742530823, + "learning_rate": 6.322112419436754e-06, + "loss": 0.6696, + "step": 262500 + }, + { + "epoch": 1.6771015677906544, + "grad_norm": 1.108661413192749, + "learning_rate": 6.319670429173613e-06, + "loss": 0.7696, + "step": 262510 + }, + { + "epoch": 1.6771654549403934, + "grad_norm": 0.9814617037773132, + "learning_rate": 6.3172288788092e-06, + "loss": 0.8261, + "step": 262520 + }, + { + "epoch": 1.6772293420901319, + "grad_norm": 1.04248046875, + "learning_rate": 6.3147877683680635e-06, + "loss": 0.8631, + "step": 262530 + }, + { + "epoch": 1.6772932292398708, + "grad_norm": 1.0217331647872925, + "learning_rate": 6.312347097874821e-06, + "loss": 0.8714, + "step": 262540 + }, + { + "epoch": 1.6773571163896093, + "grad_norm": 1.6767487525939941, + "learning_rate": 6.309906867354032e-06, + "loss": 1.0093, + "step": 262550 + }, + { + "epoch": 1.6774210035393482, + "grad_norm": 1.2772984504699707, + "learning_rate": 6.307467076830287e-06, + "loss": 0.9749, + "step": 262560 + }, + { + "epoch": 1.6774848906890867, + "grad_norm": 1.5580352544784546, + "learning_rate": 6.3050277263281336e-06, + "loss": 0.9933, + "step": 262570 + }, + { + "epoch": 1.6775487778388256, + "grad_norm": 0.8894296884536743, + "learning_rate": 6.302588815872168e-06, + "loss": 0.6818, + "step": 262580 + }, + { + "epoch": 1.677612664988564, + "grad_norm": 0.7048232555389404, + "learning_rate": 6.300150345486921e-06, + "loss": 0.8798, + "step": 262590 + }, + { + "epoch": 1.677676552138303, + "grad_norm": 0.7692033052444458, + "learning_rate": 6.297712315196969e-06, + "loss": 0.6859, + "step": 262600 + }, + { + "epoch": 1.6777404392880415, + "grad_norm": 0.8606083989143372, + "learning_rate": 6.295274725026873e-06, + "loss": 1.0156, + "step": 262610 + }, + { + "epoch": 1.6778043264377804, + 
"grad_norm": 1.0161243677139282, + "learning_rate": 6.292837575001159e-06, + "loss": 0.8682, + "step": 262620 + }, + { + "epoch": 1.677868213587519, + "grad_norm": 1.0023568868637085, + "learning_rate": 6.290400865144391e-06, + "loss": 1.0135, + "step": 262630 + }, + { + "epoch": 1.6779321007372578, + "grad_norm": 1.5799769163131714, + "learning_rate": 6.287964595481094e-06, + "loss": 0.8239, + "step": 262640 + }, + { + "epoch": 1.6779959878869963, + "grad_norm": 1.00106942653656, + "learning_rate": 6.285528766035814e-06, + "loss": 0.83, + "step": 262650 + }, + { + "epoch": 1.6780598750367353, + "grad_norm": 0.7219306230545044, + "learning_rate": 6.283093376833071e-06, + "loss": 0.8355, + "step": 262660 + }, + { + "epoch": 1.6781237621864737, + "grad_norm": 0.874301552772522, + "learning_rate": 6.280658427897413e-06, + "loss": 0.7951, + "step": 262670 + }, + { + "epoch": 1.6781876493362127, + "grad_norm": 0.7081124782562256, + "learning_rate": 6.278223919253334e-06, + "loss": 0.8836, + "step": 262680 + }, + { + "epoch": 1.6782515364859512, + "grad_norm": 0.8233230113983154, + "learning_rate": 6.275789850925373e-06, + "loss": 0.7395, + "step": 262690 + }, + { + "epoch": 1.6783154236356899, + "grad_norm": 0.8121306896209717, + "learning_rate": 6.273356222938026e-06, + "loss": 0.9084, + "step": 262700 + }, + { + "epoch": 1.6783793107854286, + "grad_norm": 1.0645307302474976, + "learning_rate": 6.270923035315818e-06, + "loss": 1.0377, + "step": 262710 + }, + { + "epoch": 1.6784431979351673, + "grad_norm": 1.1339856386184692, + "learning_rate": 6.268490288083239e-06, + "loss": 0.7891, + "step": 262720 + }, + { + "epoch": 1.678507085084906, + "grad_norm": 0.8035052418708801, + "learning_rate": 6.266057981264805e-06, + "loss": 0.7972, + "step": 262730 + }, + { + "epoch": 1.6785709722346447, + "grad_norm": 0.7048985362052917, + "learning_rate": 6.263626114884996e-06, + "loss": 0.7198, + "step": 262740 + }, + { + "epoch": 1.6786348593843834, + "grad_norm": 
1.036059021949768, + "learning_rate": 6.261194688968313e-06, + "loss": 0.9324, + "step": 262750 + }, + { + "epoch": 1.678698746534122, + "grad_norm": 0.9852795004844666, + "learning_rate": 6.258763703539233e-06, + "loss": 0.7835, + "step": 262760 + }, + { + "epoch": 1.6787626336838608, + "grad_norm": 0.8867654204368591, + "learning_rate": 6.2563331586222574e-06, + "loss": 0.8751, + "step": 262770 + }, + { + "epoch": 1.6788265208335995, + "grad_norm": 1.5037943124771118, + "learning_rate": 6.253903054241833e-06, + "loss": 0.7739, + "step": 262780 + }, + { + "epoch": 1.6788904079833382, + "grad_norm": 1.1166731119155884, + "learning_rate": 6.251473390422468e-06, + "loss": 0.9149, + "step": 262790 + }, + { + "epoch": 1.678954295133077, + "grad_norm": 1.3753933906555176, + "learning_rate": 6.2490441671886e-06, + "loss": 0.727, + "step": 262800 + }, + { + "epoch": 1.6790181822828156, + "grad_norm": 1.3363189697265625, + "learning_rate": 6.246615384564702e-06, + "loss": 0.7728, + "step": 262810 + }, + { + "epoch": 1.6790820694325543, + "grad_norm": 0.8456787467002869, + "learning_rate": 6.244187042575256e-06, + "loss": 0.8522, + "step": 262820 + }, + { + "epoch": 1.679145956582293, + "grad_norm": 2.5984883308410645, + "learning_rate": 6.241759141244691e-06, + "loss": 0.9173, + "step": 262830 + }, + { + "epoch": 1.6792098437320317, + "grad_norm": 0.870244026184082, + "learning_rate": 6.239331680597477e-06, + "loss": 0.6767, + "step": 262840 + }, + { + "epoch": 1.6792737308817705, + "grad_norm": 0.7208239436149597, + "learning_rate": 6.236904660658039e-06, + "loss": 0.9085, + "step": 262850 + }, + { + "epoch": 1.6793376180315092, + "grad_norm": 3.9711785316467285, + "learning_rate": 6.234478081450845e-06, + "loss": 0.9046, + "step": 262860 + }, + { + "epoch": 1.6794015051812479, + "grad_norm": 0.9699562191963196, + "learning_rate": 6.232051943000306e-06, + "loss": 0.8882, + "step": 262870 + }, + { + "epoch": 1.6794653923309866, + "grad_norm": 1.0530452728271484, + 
"learning_rate": 6.229626245330877e-06, + "loss": 0.9335, + "step": 262880 + }, + { + "epoch": 1.6795292794807253, + "grad_norm": 1.5956165790557861, + "learning_rate": 6.227200988466974e-06, + "loss": 1.0207, + "step": 262890 + }, + { + "epoch": 1.679593166630464, + "grad_norm": 0.7701212763786316, + "learning_rate": 6.224776172433033e-06, + "loss": 0.9123, + "step": 262900 + }, + { + "epoch": 1.6796570537802027, + "grad_norm": 1.047654390335083, + "learning_rate": 6.222351797253456e-06, + "loss": 0.9776, + "step": 262910 + }, + { + "epoch": 1.6797209409299414, + "grad_norm": 0.7686278820037842, + "learning_rate": 6.219927862952679e-06, + "loss": 0.797, + "step": 262920 + }, + { + "epoch": 1.67978482807968, + "grad_norm": 1.1466424465179443, + "learning_rate": 6.217504369555094e-06, + "loss": 0.7509, + "step": 262930 + }, + { + "epoch": 1.6798487152294188, + "grad_norm": 0.7692174911499023, + "learning_rate": 6.215081317085131e-06, + "loss": 0.9404, + "step": 262940 + }, + { + "epoch": 1.6799126023791575, + "grad_norm": 1.1606929302215576, + "learning_rate": 6.212658705567165e-06, + "loss": 0.7926, + "step": 262950 + }, + { + "epoch": 1.679976489528896, + "grad_norm": 0.854450523853302, + "learning_rate": 6.2102365350256155e-06, + "loss": 0.6672, + "step": 262960 + }, + { + "epoch": 1.680040376678635, + "grad_norm": 0.9151691198348999, + "learning_rate": 6.207814805484863e-06, + "loss": 0.7784, + "step": 262970 + }, + { + "epoch": 1.6801042638283734, + "grad_norm": 1.528312087059021, + "learning_rate": 6.205393516969304e-06, + "loss": 1.0193, + "step": 262980 + }, + { + "epoch": 1.6801681509781123, + "grad_norm": 1.977976679801941, + "learning_rate": 6.202972669503326e-06, + "loss": 1.1171, + "step": 262990 + }, + { + "epoch": 1.6802320381278508, + "grad_norm": 1.2395837306976318, + "learning_rate": 6.200552263111292e-06, + "loss": 0.8609, + "step": 263000 + }, + { + "epoch": 1.6802959252775898, + "grad_norm": 1.0361305475234985, + "learning_rate": 
6.198132297817599e-06, + "loss": 0.8699, + "step": 263010 + }, + { + "epoch": 1.6803598124273282, + "grad_norm": 0.8011088371276855, + "learning_rate": 6.195712773646595e-06, + "loss": 0.9936, + "step": 263020 + }, + { + "epoch": 1.6804236995770672, + "grad_norm": 0.918425440788269, + "learning_rate": 6.19329369062267e-06, + "loss": 0.9927, + "step": 263030 + }, + { + "epoch": 1.6804875867268056, + "grad_norm": 1.8617467880249023, + "learning_rate": 6.190875048770167e-06, + "loss": 0.8369, + "step": 263040 + }, + { + "epoch": 1.6805514738765446, + "grad_norm": 1.9793410301208496, + "learning_rate": 6.188456848113461e-06, + "loss": 1.0095, + "step": 263050 + }, + { + "epoch": 1.680615361026283, + "grad_norm": 1.0629090070724487, + "learning_rate": 6.186039088676887e-06, + "loss": 1.0583, + "step": 263060 + }, + { + "epoch": 1.680679248176022, + "grad_norm": 1.4704509973526, + "learning_rate": 6.183621770484816e-06, + "loss": 0.8265, + "step": 263070 + }, + { + "epoch": 1.6807431353257605, + "grad_norm": 1.697485327720642, + "learning_rate": 6.181204893561571e-06, + "loss": 0.7345, + "step": 263080 + }, + { + "epoch": 1.6808070224754994, + "grad_norm": 0.795141339302063, + "learning_rate": 6.178788457931512e-06, + "loss": 0.8883, + "step": 263090 + }, + { + "epoch": 1.6808709096252379, + "grad_norm": 0.849793553352356, + "learning_rate": 6.176372463618951e-06, + "loss": 1.1035, + "step": 263100 + }, + { + "epoch": 1.6809347967749768, + "grad_norm": 0.7248162031173706, + "learning_rate": 6.173956910648243e-06, + "loss": 0.7855, + "step": 263110 + }, + { + "epoch": 1.6809986839247153, + "grad_norm": 1.4386005401611328, + "learning_rate": 6.17154179904369e-06, + "loss": 0.6941, + "step": 263120 + }, + { + "epoch": 1.6810625710744542, + "grad_norm": 0.7645585536956787, + "learning_rate": 6.1691271288296324e-06, + "loss": 0.9207, + "step": 263130 + }, + { + "epoch": 1.6811264582241927, + "grad_norm": 0.9256292581558228, + "learning_rate": 6.166712900030397e-06, + "loss": 
0.9551, + "step": 263140 + }, + { + "epoch": 1.6811903453739316, + "grad_norm": 1.810821294784546, + "learning_rate": 6.164299112670269e-06, + "loss": 0.9704, + "step": 263150 + }, + { + "epoch": 1.6812542325236701, + "grad_norm": 1.022916555404663, + "learning_rate": 6.161885766773584e-06, + "loss": 0.7118, + "step": 263160 + }, + { + "epoch": 1.681318119673409, + "grad_norm": 4.4502668380737305, + "learning_rate": 6.159472862364618e-06, + "loss": 0.8046, + "step": 263170 + }, + { + "epoch": 1.6813820068231475, + "grad_norm": 0.5222861170768738, + "learning_rate": 6.157060399467707e-06, + "loss": 0.9441, + "step": 263180 + }, + { + "epoch": 1.6814458939728862, + "grad_norm": 0.9526025056838989, + "learning_rate": 6.154648378107114e-06, + "loss": 0.8466, + "step": 263190 + }, + { + "epoch": 1.681509781122625, + "grad_norm": 0.8658684492111206, + "learning_rate": 6.152236798307154e-06, + "loss": 0.9679, + "step": 263200 + }, + { + "epoch": 1.6815736682723637, + "grad_norm": 0.6978765726089478, + "learning_rate": 6.14982566009209e-06, + "loss": 0.7612, + "step": 263210 + }, + { + "epoch": 1.6816375554221024, + "grad_norm": 0.8792979121208191, + "learning_rate": 6.147414963486231e-06, + "loss": 0.8342, + "step": 263220 + }, + { + "epoch": 1.681701442571841, + "grad_norm": 0.8527263402938843, + "learning_rate": 6.14500470851383e-06, + "loss": 0.9531, + "step": 263230 + }, + { + "epoch": 1.6817653297215798, + "grad_norm": 1.1380363702774048, + "learning_rate": 6.142594895199183e-06, + "loss": 0.7972, + "step": 263240 + }, + { + "epoch": 1.6818292168713185, + "grad_norm": 0.73790043592453, + "learning_rate": 6.140185523566533e-06, + "loss": 0.8702, + "step": 263250 + }, + { + "epoch": 1.6818931040210572, + "grad_norm": 0.873717725276947, + "learning_rate": 6.1377765936401765e-06, + "loss": 1.0954, + "step": 263260 + }, + { + "epoch": 1.681956991170796, + "grad_norm": 1.1089555025100708, + "learning_rate": 6.135368105444339e-06, + "loss": 0.9219, + "step": 263270 + }, + { 
+ "epoch": 1.6820208783205346, + "grad_norm": 2.0165822505950928, + "learning_rate": 6.1329600590033064e-06, + "loss": 1.0447, + "step": 263280 + }, + { + "epoch": 1.6820847654702733, + "grad_norm": 0.8352879881858826, + "learning_rate": 6.130552454341304e-06, + "loss": 1.0405, + "step": 263290 + }, + { + "epoch": 1.682148652620012, + "grad_norm": 2.2425537109375, + "learning_rate": 6.128145291482601e-06, + "loss": 0.7285, + "step": 263300 + }, + { + "epoch": 1.6822125397697507, + "grad_norm": 0.6852495670318604, + "learning_rate": 6.125738570451422e-06, + "loss": 0.8092, + "step": 263310 + }, + { + "epoch": 1.6822764269194894, + "grad_norm": 0.9322900772094727, + "learning_rate": 6.123332291272021e-06, + "loss": 0.8753, + "step": 263320 + }, + { + "epoch": 1.6823403140692281, + "grad_norm": 1.0719099044799805, + "learning_rate": 6.120926453968612e-06, + "loss": 0.9331, + "step": 263330 + }, + { + "epoch": 1.6824042012189668, + "grad_norm": 1.2141951322555542, + "learning_rate": 6.118521058565435e-06, + "loss": 0.9405, + "step": 263340 + }, + { + "epoch": 1.6824680883687055, + "grad_norm": 0.8354449272155762, + "learning_rate": 6.116116105086728e-06, + "loss": 0.9036, + "step": 263350 + }, + { + "epoch": 1.6825319755184442, + "grad_norm": 0.6558452248573303, + "learning_rate": 6.1137115935566815e-06, + "loss": 0.8491, + "step": 263360 + }, + { + "epoch": 1.682595862668183, + "grad_norm": 1.2620372772216797, + "learning_rate": 6.1115479110657845e-06, + "loss": 0.8819, + "step": 263370 + }, + { + "epoch": 1.6826597498179217, + "grad_norm": 0.8358511328697205, + "learning_rate": 6.109144239304932e-06, + "loss": 0.8308, + "step": 263380 + }, + { + "epoch": 1.6827236369676604, + "grad_norm": 0.8426769375801086, + "learning_rate": 6.1067410095629825e-06, + "loss": 0.8139, + "step": 263390 + }, + { + "epoch": 1.682787524117399, + "grad_norm": 0.7442500591278076, + "learning_rate": 6.104338221864109e-06, + "loss": 0.684, + "step": 263400 + }, + { + "epoch": 
1.6828514112671378, + "grad_norm": 0.9698424935340881, + "learning_rate": 6.101935876232534e-06, + "loss": 0.9235, + "step": 263410 + }, + { + "epoch": 1.6829152984168765, + "grad_norm": 0.6997389793395996, + "learning_rate": 6.099533972692439e-06, + "loss": 0.8268, + "step": 263420 + }, + { + "epoch": 1.682979185566615, + "grad_norm": 0.8244001269340515, + "learning_rate": 6.097132511268022e-06, + "loss": 0.949, + "step": 263430 + }, + { + "epoch": 1.683043072716354, + "grad_norm": 0.6640798449516296, + "learning_rate": 6.094731491983446e-06, + "loss": 1.0296, + "step": 263440 + }, + { + "epoch": 1.6831069598660924, + "grad_norm": 2.9415547847747803, + "learning_rate": 6.092330914862915e-06, + "loss": 1.0044, + "step": 263450 + }, + { + "epoch": 1.6831708470158313, + "grad_norm": 1.5144875049591064, + "learning_rate": 6.089930779930608e-06, + "loss": 0.8793, + "step": 263460 + }, + { + "epoch": 1.6832347341655698, + "grad_norm": 1.1107733249664307, + "learning_rate": 6.0875310872106736e-06, + "loss": 1.1556, + "step": 263470 + }, + { + "epoch": 1.6832986213153087, + "grad_norm": 0.9494339227676392, + "learning_rate": 6.085131836727298e-06, + "loss": 0.7324, + "step": 263480 + }, + { + "epoch": 1.6833625084650472, + "grad_norm": 1.08641517162323, + "learning_rate": 6.082733028504628e-06, + "loss": 0.6353, + "step": 263490 + }, + { + "epoch": 1.6834263956147861, + "grad_norm": 1.4187917709350586, + "learning_rate": 6.080334662566839e-06, + "loss": 0.8269, + "step": 263500 + }, + { + "epoch": 1.6834902827645246, + "grad_norm": 0.7869080901145935, + "learning_rate": 6.077936738938073e-06, + "loss": 0.7547, + "step": 263510 + }, + { + "epoch": 1.6835541699142635, + "grad_norm": 0.8862486481666565, + "learning_rate": 6.075539257642482e-06, + "loss": 0.8882, + "step": 263520 + }, + { + "epoch": 1.683618057064002, + "grad_norm": 0.8505191802978516, + "learning_rate": 6.073142218704209e-06, + "loss": 0.725, + "step": 263530 + }, + { + "epoch": 1.683681944213741, + 
"grad_norm": 1.587677240371704, + "learning_rate": 6.0707456221474005e-06, + "loss": 0.5713, + "step": 263540 + }, + { + "epoch": 1.6837458313634794, + "grad_norm": 1.114354133605957, + "learning_rate": 6.068349467996182e-06, + "loss": 0.8215, + "step": 263550 + }, + { + "epoch": 1.6838097185132184, + "grad_norm": 0.7740732431411743, + "learning_rate": 6.065953756274695e-06, + "loss": 0.8324, + "step": 263560 + }, + { + "epoch": 1.6838736056629569, + "grad_norm": 2.6367008686065674, + "learning_rate": 6.06355848700706e-06, + "loss": 0.995, + "step": 263570 + }, + { + "epoch": 1.6839374928126958, + "grad_norm": 0.7278146147727966, + "learning_rate": 6.061163660217406e-06, + "loss": 0.6742, + "step": 263580 + }, + { + "epoch": 1.6840013799624343, + "grad_norm": 1.1852619647979736, + "learning_rate": 6.058769275929837e-06, + "loss": 0.8899, + "step": 263590 + }, + { + "epoch": 1.6840652671121732, + "grad_norm": 1.1105660200119019, + "learning_rate": 6.056375334168485e-06, + "loss": 0.9496, + "step": 263600 + }, + { + "epoch": 1.6841291542619117, + "grad_norm": 0.8752860426902771, + "learning_rate": 6.05398183495745e-06, + "loss": 0.6881, + "step": 263610 + }, + { + "epoch": 1.6841930414116506, + "grad_norm": 0.7431759238243103, + "learning_rate": 6.051588778320833e-06, + "loss": 0.6131, + "step": 263620 + }, + { + "epoch": 1.684256928561389, + "grad_norm": 1.1750792264938354, + "learning_rate": 6.0491961642827384e-06, + "loss": 0.7242, + "step": 263630 + }, + { + "epoch": 1.684320815711128, + "grad_norm": 1.0727992057800293, + "learning_rate": 6.046803992867256e-06, + "loss": 0.8961, + "step": 263640 + }, + { + "epoch": 1.6843847028608665, + "grad_norm": 0.9775357842445374, + "learning_rate": 6.044412264098493e-06, + "loss": 0.7858, + "step": 263650 + }, + { + "epoch": 1.6844485900106054, + "grad_norm": 1.4500555992126465, + "learning_rate": 6.042020978000518e-06, + "loss": 0.8259, + "step": 263660 + }, + { + "epoch": 1.684512477160344, + "grad_norm": 
1.6051480770111084, + "learning_rate": 6.039630134597424e-06, + "loss": 0.9529, + "step": 263670 + }, + { + "epoch": 1.6845763643100826, + "grad_norm": 1.0749129056930542, + "learning_rate": 6.03723973391328e-06, + "loss": 0.7638, + "step": 263680 + }, + { + "epoch": 1.6846402514598213, + "grad_norm": 1.262317180633545, + "learning_rate": 6.034849775972174e-06, + "loss": 0.7457, + "step": 263690 + }, + { + "epoch": 1.68470413860956, + "grad_norm": 1.0481128692626953, + "learning_rate": 6.032460260798156e-06, + "loss": 0.9739, + "step": 263700 + }, + { + "epoch": 1.6847680257592987, + "grad_norm": 0.7693667411804199, + "learning_rate": 6.03007118841531e-06, + "loss": 0.92, + "step": 263710 + }, + { + "epoch": 1.6848319129090374, + "grad_norm": 0.777385413646698, + "learning_rate": 6.0276825588476805e-06, + "loss": 1.0272, + "step": 263720 + }, + { + "epoch": 1.6848958000587761, + "grad_norm": 1.0397416353225708, + "learning_rate": 6.0252943721193334e-06, + "loss": 0.8549, + "step": 263730 + }, + { + "epoch": 1.6849596872085149, + "grad_norm": 0.6522778272628784, + "learning_rate": 6.02290662825431e-06, + "loss": 0.8055, + "step": 263740 + }, + { + "epoch": 1.6850235743582536, + "grad_norm": 0.704421877861023, + "learning_rate": 6.0205193272766695e-06, + "loss": 0.7099, + "step": 263750 + }, + { + "epoch": 1.6850874615079923, + "grad_norm": 0.9648008942604065, + "learning_rate": 6.018132469210436e-06, + "loss": 0.9972, + "step": 263760 + }, + { + "epoch": 1.685151348657731, + "grad_norm": 0.9081729054450989, + "learning_rate": 6.015746054079663e-06, + "loss": 0.6949, + "step": 263770 + }, + { + "epoch": 1.6852152358074697, + "grad_norm": 1.4176228046417236, + "learning_rate": 6.013360081908387e-06, + "loss": 0.8783, + "step": 263780 + }, + { + "epoch": 1.6852791229572084, + "grad_norm": 1.1324151754379272, + "learning_rate": 6.010974552720616e-06, + "loss": 0.8746, + "step": 263790 + }, + { + "epoch": 1.685343010106947, + "grad_norm": 1.1132959127426147, + 
"learning_rate": 6.008589466540399e-06, + "loss": 0.9005, + "step": 263800 + }, + { + "epoch": 1.6854068972566858, + "grad_norm": 0.9364607930183411, + "learning_rate": 6.006204823391731e-06, + "loss": 0.7132, + "step": 263810 + }, + { + "epoch": 1.6854707844064245, + "grad_norm": 1.3173259496688843, + "learning_rate": 6.003820623298656e-06, + "loss": 1.0, + "step": 263820 + }, + { + "epoch": 1.6855346715561632, + "grad_norm": 1.278419017791748, + "learning_rate": 6.001436866285159e-06, + "loss": 0.8025, + "step": 263830 + }, + { + "epoch": 1.685598558705902, + "grad_norm": 0.7469710111618042, + "learning_rate": 5.999053552375267e-06, + "loss": 0.7053, + "step": 263840 + }, + { + "epoch": 1.6856624458556406, + "grad_norm": 1.4082916975021362, + "learning_rate": 5.996670681592958e-06, + "loss": 1.2888, + "step": 263850 + }, + { + "epoch": 1.6857263330053793, + "grad_norm": 0.9647468328475952, + "learning_rate": 5.994288253962255e-06, + "loss": 0.8287, + "step": 263860 + }, + { + "epoch": 1.685790220155118, + "grad_norm": 0.743918240070343, + "learning_rate": 5.9919062695071304e-06, + "loss": 0.7825, + "step": 263870 + }, + { + "epoch": 1.6858541073048567, + "grad_norm": 0.9181013703346252, + "learning_rate": 5.989524728251594e-06, + "loss": 0.8352, + "step": 263880 + }, + { + "epoch": 1.6859179944545954, + "grad_norm": 0.8533596992492676, + "learning_rate": 5.987143630219605e-06, + "loss": 1.0479, + "step": 263890 + }, + { + "epoch": 1.6859818816043342, + "grad_norm": 1.0133002996444702, + "learning_rate": 5.984762975435166e-06, + "loss": 0.8272, + "step": 263900 + }, + { + "epoch": 1.6860457687540729, + "grad_norm": 0.8661429286003113, + "learning_rate": 5.982382763922234e-06, + "loss": 1.0064, + "step": 263910 + }, + { + "epoch": 1.6861096559038113, + "grad_norm": 0.8174116611480713, + "learning_rate": 5.980002995704798e-06, + "loss": 1.0821, + "step": 263920 + }, + { + "epoch": 1.6861735430535503, + "grad_norm": 0.9533768892288208, + "learning_rate": 
5.977623670806804e-06, + "loss": 0.7653, + "step": 263930 + }, + { + "epoch": 1.6862374302032888, + "grad_norm": 1.140524983406067, + "learning_rate": 5.975244789252238e-06, + "loss": 0.7114, + "step": 263940 + }, + { + "epoch": 1.6863013173530277, + "grad_norm": 1.5895030498504639, + "learning_rate": 5.972866351065026e-06, + "loss": 0.8785, + "step": 263950 + }, + { + "epoch": 1.6863652045027662, + "grad_norm": 0.9356734752655029, + "learning_rate": 5.970488356269155e-06, + "loss": 1.0909, + "step": 263960 + }, + { + "epoch": 1.686429091652505, + "grad_norm": 0.9342418909072876, + "learning_rate": 5.968110804888544e-06, + "loss": 0.7535, + "step": 263970 + }, + { + "epoch": 1.6864929788022436, + "grad_norm": 0.8574849963188171, + "learning_rate": 5.965733696947151e-06, + "loss": 0.7482, + "step": 263980 + }, + { + "epoch": 1.6865568659519825, + "grad_norm": 0.6715033650398254, + "learning_rate": 5.9633570324689246e-06, + "loss": 0.7679, + "step": 263990 + }, + { + "epoch": 1.686620753101721, + "grad_norm": 0.6406641006469727, + "learning_rate": 5.960980811477784e-06, + "loss": 0.793, + "step": 264000 + }, + { + "epoch": 1.68668464025146, + "grad_norm": 1.4869370460510254, + "learning_rate": 5.958605033997672e-06, + "loss": 0.912, + "step": 264010 + }, + { + "epoch": 1.6867485274011984, + "grad_norm": 1.958756923675537, + "learning_rate": 5.956229700052501e-06, + "loss": 0.9161, + "step": 264020 + }, + { + "epoch": 1.6868124145509373, + "grad_norm": 0.8519002795219421, + "learning_rate": 5.95385480966621e-06, + "loss": 0.8742, + "step": 264030 + }, + { + "epoch": 1.6868763017006758, + "grad_norm": 1.1269562244415283, + "learning_rate": 5.951480362862694e-06, + "loss": 0.7672, + "step": 264040 + }, + { + "epoch": 1.6869401888504147, + "grad_norm": 3.209676504135132, + "learning_rate": 5.949106359665885e-06, + "loss": 0.9413, + "step": 264050 + }, + { + "epoch": 1.6870040760001532, + "grad_norm": 1.4073954820632935, + "learning_rate": 5.94673280009968e-06, + "loss": 
1.1358, + "step": 264060 + }, + { + "epoch": 1.6870679631498922, + "grad_norm": 1.4022773504257202, + "learning_rate": 5.944359684187995e-06, + "loss": 1.0455, + "step": 264070 + }, + { + "epoch": 1.6871318502996306, + "grad_norm": 0.8840337991714478, + "learning_rate": 5.941987011954714e-06, + "loss": 0.7272, + "step": 264080 + }, + { + "epoch": 1.6871957374493696, + "grad_norm": 0.6392529010772705, + "learning_rate": 5.939614783423747e-06, + "loss": 0.7665, + "step": 264090 + }, + { + "epoch": 1.687259624599108, + "grad_norm": 2.5018985271453857, + "learning_rate": 5.937242998618974e-06, + "loss": 0.7378, + "step": 264100 + }, + { + "epoch": 1.687323511748847, + "grad_norm": 3.1915085315704346, + "learning_rate": 5.934871657564278e-06, + "loss": 1.0742, + "step": 264110 + }, + { + "epoch": 1.6873873988985855, + "grad_norm": 1.1561402082443237, + "learning_rate": 5.93250076028356e-06, + "loss": 0.9284, + "step": 264120 + }, + { + "epoch": 1.6874512860483244, + "grad_norm": 0.9742777943611145, + "learning_rate": 5.930130306800669e-06, + "loss": 1.1178, + "step": 264130 + }, + { + "epoch": 1.6875151731980629, + "grad_norm": 1.0389258861541748, + "learning_rate": 5.927760297139501e-06, + "loss": 0.8429, + "step": 264140 + }, + { + "epoch": 1.6875790603478018, + "grad_norm": 1.2347203493118286, + "learning_rate": 5.925390731323904e-06, + "loss": 0.9766, + "step": 264150 + }, + { + "epoch": 1.6876429474975403, + "grad_norm": 1.1833796501159668, + "learning_rate": 5.92302160937776e-06, + "loss": 0.8574, + "step": 264160 + }, + { + "epoch": 1.687706834647279, + "grad_norm": 1.1623376607894897, + "learning_rate": 5.920652931324916e-06, + "loss": 0.8113, + "step": 264170 + }, + { + "epoch": 1.6877707217970177, + "grad_norm": 0.9164713621139526, + "learning_rate": 5.918284697189236e-06, + "loss": 0.8357, + "step": 264180 + }, + { + "epoch": 1.6878346089467564, + "grad_norm": 0.7683905959129333, + "learning_rate": 5.915916906994556e-06, + "loss": 0.8485, + "step": 264190 + 
}, + { + "epoch": 1.6878984960964951, + "grad_norm": 0.6126852631568909, + "learning_rate": 5.9135495607647475e-06, + "loss": 0.9963, + "step": 264200 + }, + { + "epoch": 1.6879623832462338, + "grad_norm": 3.0341567993164062, + "learning_rate": 5.911182658523618e-06, + "loss": 0.9031, + "step": 264210 + }, + { + "epoch": 1.6880262703959725, + "grad_norm": 1.1758440732955933, + "learning_rate": 5.908816200295036e-06, + "loss": 0.8179, + "step": 264220 + }, + { + "epoch": 1.6880901575457112, + "grad_norm": 2.492748975753784, + "learning_rate": 5.906450186102802e-06, + "loss": 0.8721, + "step": 264230 + }, + { + "epoch": 1.68815404469545, + "grad_norm": 0.7346659302711487, + "learning_rate": 5.904084615970778e-06, + "loss": 0.7518, + "step": 264240 + }, + { + "epoch": 1.6882179318451886, + "grad_norm": 0.5643122792243958, + "learning_rate": 5.901719489922758e-06, + "loss": 0.8097, + "step": 264250 + }, + { + "epoch": 1.6882818189949274, + "grad_norm": 1.1059579849243164, + "learning_rate": 5.899354807982582e-06, + "loss": 1.0127, + "step": 264260 + }, + { + "epoch": 1.688345706144666, + "grad_norm": 0.9043902158737183, + "learning_rate": 5.896990570174049e-06, + "loss": 0.9114, + "step": 264270 + }, + { + "epoch": 1.6884095932944048, + "grad_norm": 1.016475796699524, + "learning_rate": 5.894626776520984e-06, + "loss": 0.8536, + "step": 264280 + }, + { + "epoch": 1.6884734804441435, + "grad_norm": 1.6651692390441895, + "learning_rate": 5.892263427047173e-06, + "loss": 0.7038, + "step": 264290 + }, + { + "epoch": 1.6885373675938822, + "grad_norm": 0.9237732887268066, + "learning_rate": 5.889900521776426e-06, + "loss": 0.9019, + "step": 264300 + }, + { + "epoch": 1.6886012547436209, + "grad_norm": 0.8759250640869141, + "learning_rate": 5.887538060732556e-06, + "loss": 0.7557, + "step": 264310 + }, + { + "epoch": 1.6886651418933596, + "grad_norm": 1.015563726425171, + "learning_rate": 5.885176043939328e-06, + "loss": 0.8072, + "step": 264320 + }, + { + "epoch": 
1.6887290290430983, + "grad_norm": 1.165315866470337, + "learning_rate": 5.8828144714205505e-06, + "loss": 1.0102, + "step": 264330 + }, + { + "epoch": 1.688792916192837, + "grad_norm": 1.189011812210083, + "learning_rate": 5.8804533431999935e-06, + "loss": 0.9371, + "step": 264340 + }, + { + "epoch": 1.6888568033425757, + "grad_norm": 1.0117441415786743, + "learning_rate": 5.878092659301443e-06, + "loss": 1.2065, + "step": 264350 + }, + { + "epoch": 1.6889206904923144, + "grad_norm": 0.6729750633239746, + "learning_rate": 5.875732419748664e-06, + "loss": 0.7066, + "step": 264360 + }, + { + "epoch": 1.6889845776420531, + "grad_norm": 2.093863010406494, + "learning_rate": 5.873372624565443e-06, + "loss": 0.7922, + "step": 264370 + }, + { + "epoch": 1.6890484647917918, + "grad_norm": 0.9474301338195801, + "learning_rate": 5.871013273775522e-06, + "loss": 1.0108, + "step": 264380 + }, + { + "epoch": 1.6891123519415305, + "grad_norm": 0.9864105582237244, + "learning_rate": 5.8686543674026875e-06, + "loss": 0.9154, + "step": 264390 + }, + { + "epoch": 1.6891762390912692, + "grad_norm": 0.9760420918464661, + "learning_rate": 5.86629590547067e-06, + "loss": 1.0111, + "step": 264400 + }, + { + "epoch": 1.6892401262410077, + "grad_norm": 1.5423452854156494, + "learning_rate": 5.863937888003246e-06, + "loss": 1.1758, + "step": 264410 + }, + { + "epoch": 1.6893040133907467, + "grad_norm": 0.9814441800117493, + "learning_rate": 5.86158031502414e-06, + "loss": 0.7018, + "step": 264420 + }, + { + "epoch": 1.6893679005404851, + "grad_norm": 0.909810483455658, + "learning_rate": 5.859223186557111e-06, + "loss": 0.9151, + "step": 264430 + }, + { + "epoch": 1.689431787690224, + "grad_norm": 0.6530123949050903, + "learning_rate": 5.856866502625891e-06, + "loss": 0.8051, + "step": 264440 + }, + { + "epoch": 1.6894956748399625, + "grad_norm": 0.843220591545105, + "learning_rate": 5.854510263254215e-06, + "loss": 0.9839, + "step": 264450 + }, + { + "epoch": 1.6895595619897015, + 
"grad_norm": 1.1965805292129517, + "learning_rate": 5.852154468465809e-06, + "loss": 1.0437, + "step": 264460 + }, + { + "epoch": 1.68962344913944, + "grad_norm": 1.018641471862793, + "learning_rate": 5.849799118284405e-06, + "loss": 0.9625, + "step": 264470 + }, + { + "epoch": 1.6896873362891789, + "grad_norm": 1.1121151447296143, + "learning_rate": 5.847444212733716e-06, + "loss": 0.9465, + "step": 264480 + }, + { + "epoch": 1.6897512234389174, + "grad_norm": 1.031753659248352, + "learning_rate": 5.845089751837462e-06, + "loss": 0.7985, + "step": 264490 + }, + { + "epoch": 1.6898151105886563, + "grad_norm": 0.7224723100662231, + "learning_rate": 5.8427357356193536e-06, + "loss": 0.8471, + "step": 264500 + }, + { + "epoch": 1.6898789977383948, + "grad_norm": 2.845839738845825, + "learning_rate": 5.840382164103092e-06, + "loss": 1.1273, + "step": 264510 + }, + { + "epoch": 1.6899428848881337, + "grad_norm": 0.8431565761566162, + "learning_rate": 5.838029037312398e-06, + "loss": 0.7059, + "step": 264520 + }, + { + "epoch": 1.6900067720378722, + "grad_norm": 1.1503245830535889, + "learning_rate": 5.835676355270942e-06, + "loss": 0.9751, + "step": 264530 + }, + { + "epoch": 1.6900706591876111, + "grad_norm": 1.0872777700424194, + "learning_rate": 5.833324118002448e-06, + "loss": 1.5552, + "step": 264540 + }, + { + "epoch": 1.6901345463373496, + "grad_norm": 1.0153592824935913, + "learning_rate": 5.8309723255305815e-06, + "loss": 1.005, + "step": 264550 + }, + { + "epoch": 1.6901984334870885, + "grad_norm": 1.078887939453125, + "learning_rate": 5.82862097787904e-06, + "loss": 0.9291, + "step": 264560 + }, + { + "epoch": 1.690262320636827, + "grad_norm": 1.0092707872390747, + "learning_rate": 5.826270075071488e-06, + "loss": 0.8592, + "step": 264570 + }, + { + "epoch": 1.690326207786566, + "grad_norm": 3.004528045654297, + "learning_rate": 5.8239196171316405e-06, + "loss": 0.9122, + "step": 264580 + }, + { + "epoch": 1.6903900949363044, + "grad_norm": 
0.9294689297676086, + "learning_rate": 5.821569604083111e-06, + "loss": 0.7162, + "step": 264590 + }, + { + "epoch": 1.6904539820860434, + "grad_norm": 0.7767539024353027, + "learning_rate": 5.819220035949613e-06, + "loss": 0.7823, + "step": 264600 + }, + { + "epoch": 1.6905178692357818, + "grad_norm": 1.2645741701126099, + "learning_rate": 5.816870912754774e-06, + "loss": 0.9638, + "step": 264610 + }, + { + "epoch": 1.6905817563855208, + "grad_norm": 1.0132741928100586, + "learning_rate": 5.814522234522274e-06, + "loss": 0.7176, + "step": 264620 + }, + { + "epoch": 1.6906456435352593, + "grad_norm": 0.9678375720977783, + "learning_rate": 5.812174001275766e-06, + "loss": 1.2492, + "step": 264630 + }, + { + "epoch": 1.6907095306849982, + "grad_norm": 1.0396130084991455, + "learning_rate": 5.809826213038888e-06, + "loss": 0.8539, + "step": 264640 + }, + { + "epoch": 1.6907734178347367, + "grad_norm": 0.7152589559555054, + "learning_rate": 5.8074788698352975e-06, + "loss": 0.8661, + "step": 264650 + }, + { + "epoch": 1.6908373049844754, + "grad_norm": 1.435221791267395, + "learning_rate": 5.805131971688621e-06, + "loss": 0.9256, + "step": 264660 + }, + { + "epoch": 1.690901192134214, + "grad_norm": 0.7107662558555603, + "learning_rate": 5.802785518622506e-06, + "loss": 1.0261, + "step": 264670 + }, + { + "epoch": 1.6909650792839528, + "grad_norm": 0.7626693844795227, + "learning_rate": 5.800439510660566e-06, + "loss": 0.9455, + "step": 264680 + }, + { + "epoch": 1.6910289664336915, + "grad_norm": 0.8588293194770813, + "learning_rate": 5.79809394782645e-06, + "loss": 0.7354, + "step": 264690 + }, + { + "epoch": 1.6910928535834302, + "grad_norm": 0.9404697418212891, + "learning_rate": 5.795748830143755e-06, + "loss": 1.0988, + "step": 264700 + }, + { + "epoch": 1.691156740733169, + "grad_norm": 0.4883400797843933, + "learning_rate": 5.7934041576361285e-06, + "loss": 1.0692, + "step": 264710 + }, + { + "epoch": 1.6912206278829076, + "grad_norm": 1.085710883140564, + 
"learning_rate": 5.7910599303271475e-06, + "loss": 1.007, + "step": 264720 + }, + { + "epoch": 1.6912845150326463, + "grad_norm": 0.68525630235672, + "learning_rate": 5.788716148240458e-06, + "loss": 0.6872, + "step": 264730 + }, + { + "epoch": 1.691348402182385, + "grad_norm": 1.4903061389923096, + "learning_rate": 5.786372811399627e-06, + "loss": 0.7983, + "step": 264740 + }, + { + "epoch": 1.6914122893321237, + "grad_norm": 1.2236237525939941, + "learning_rate": 5.784029919828288e-06, + "loss": 1.0272, + "step": 264750 + }, + { + "epoch": 1.6914761764818624, + "grad_norm": 1.0498038530349731, + "learning_rate": 5.781687473550007e-06, + "loss": 0.8841, + "step": 264760 + }, + { + "epoch": 1.6915400636316011, + "grad_norm": 1.0530762672424316, + "learning_rate": 5.779345472588399e-06, + "loss": 0.8159, + "step": 264770 + }, + { + "epoch": 1.6916039507813398, + "grad_norm": 1.0849366188049316, + "learning_rate": 5.777003916967027e-06, + "loss": 0.924, + "step": 264780 + }, + { + "epoch": 1.6916678379310786, + "grad_norm": 1.0517401695251465, + "learning_rate": 5.774662806709491e-06, + "loss": 1.0696, + "step": 264790 + }, + { + "epoch": 1.6917317250808173, + "grad_norm": 1.0167468786239624, + "learning_rate": 5.772322141839353e-06, + "loss": 0.8677, + "step": 264800 + }, + { + "epoch": 1.691795612230556, + "grad_norm": 1.034781813621521, + "learning_rate": 5.769981922380208e-06, + "loss": 0.7875, + "step": 264810 + }, + { + "epoch": 1.6918594993802947, + "grad_norm": 0.9005605578422546, + "learning_rate": 5.767642148355595e-06, + "loss": 0.7586, + "step": 264820 + }, + { + "epoch": 1.6919233865300334, + "grad_norm": 0.707309901714325, + "learning_rate": 5.765302819789092e-06, + "loss": 0.8886, + "step": 264830 + }, + { + "epoch": 1.691987273679772, + "grad_norm": 0.6730127334594727, + "learning_rate": 5.762963936704269e-06, + "loss": 0.9539, + "step": 264840 + }, + { + "epoch": 1.6920511608295108, + "grad_norm": 0.8145616054534912, + "learning_rate": 
5.760625499124661e-06, + "loss": 0.813, + "step": 264850 + }, + { + "epoch": 1.6921150479792495, + "grad_norm": 0.7684074640274048, + "learning_rate": 5.758287507073834e-06, + "loss": 0.7269, + "step": 264860 + }, + { + "epoch": 1.6921789351289882, + "grad_norm": 2.0174615383148193, + "learning_rate": 5.7559499605753185e-06, + "loss": 0.877, + "step": 264870 + }, + { + "epoch": 1.692242822278727, + "grad_norm": 0.8217144012451172, + "learning_rate": 5.753612859652674e-06, + "loss": 0.706, + "step": 264880 + }, + { + "epoch": 1.6923067094284656, + "grad_norm": 1.2375004291534424, + "learning_rate": 5.7512762043294145e-06, + "loss": 1.0963, + "step": 264890 + }, + { + "epoch": 1.692370596578204, + "grad_norm": 2.328045129776001, + "learning_rate": 5.748939994629093e-06, + "loss": 0.7973, + "step": 264900 + }, + { + "epoch": 1.692434483727943, + "grad_norm": 1.1463247537612915, + "learning_rate": 5.74660423057522e-06, + "loss": 0.7571, + "step": 264910 + }, + { + "epoch": 1.6924983708776815, + "grad_norm": 1.045371413230896, + "learning_rate": 5.7442689121913415e-06, + "loss": 0.871, + "step": 264920 + }, + { + "epoch": 1.6925622580274204, + "grad_norm": 0.9297313690185547, + "learning_rate": 5.741934039500946e-06, + "loss": 0.79, + "step": 264930 + }, + { + "epoch": 1.692626145177159, + "grad_norm": 0.9317049980163574, + "learning_rate": 5.739599612527574e-06, + "loss": 0.8123, + "step": 264940 + }, + { + "epoch": 1.6926900323268979, + "grad_norm": 0.9831722378730774, + "learning_rate": 5.737265631294714e-06, + "loss": 1.0598, + "step": 264950 + }, + { + "epoch": 1.6927539194766363, + "grad_norm": 2.586610794067383, + "learning_rate": 5.734932095825895e-06, + "loss": 0.7519, + "step": 264960 + }, + { + "epoch": 1.6928178066263753, + "grad_norm": 1.0624761581420898, + "learning_rate": 5.732599006144595e-06, + "loss": 1.0614, + "step": 264970 + }, + { + "epoch": 1.6928816937761137, + "grad_norm": 0.8045127391815186, + "learning_rate": 5.730266362274328e-06, + "loss": 
0.8182, + "step": 264980 + }, + { + "epoch": 1.6929455809258527, + "grad_norm": 0.8507513403892517, + "learning_rate": 5.727934164238563e-06, + "loss": 1.1099, + "step": 264990 + }, + { + "epoch": 1.6930094680755912, + "grad_norm": 1.320726990699768, + "learning_rate": 5.725602412060821e-06, + "loss": 0.9069, + "step": 265000 + }, + { + "epoch": 1.69307335522533, + "grad_norm": 0.8424960374832153, + "learning_rate": 5.723271105764549e-06, + "loss": 0.6733, + "step": 265010 + }, + { + "epoch": 1.6931372423750686, + "grad_norm": 0.7862983345985413, + "learning_rate": 5.720940245373252e-06, + "loss": 0.8787, + "step": 265020 + }, + { + "epoch": 1.6932011295248075, + "grad_norm": 1.225223183631897, + "learning_rate": 5.718609830910388e-06, + "loss": 0.9061, + "step": 265030 + }, + { + "epoch": 1.693265016674546, + "grad_norm": 1.9154473543167114, + "learning_rate": 5.716279862399427e-06, + "loss": 1.0028, + "step": 265040 + }, + { + "epoch": 1.693328903824285, + "grad_norm": 1.0388178825378418, + "learning_rate": 5.713950339863849e-06, + "loss": 0.9553, + "step": 265050 + }, + { + "epoch": 1.6933927909740234, + "grad_norm": 1.0727187395095825, + "learning_rate": 5.711621263327094e-06, + "loss": 0.7793, + "step": 265060 + }, + { + "epoch": 1.6934566781237623, + "grad_norm": 0.8072197437286377, + "learning_rate": 5.709292632812652e-06, + "loss": 0.711, + "step": 265070 + }, + { + "epoch": 1.6935205652735008, + "grad_norm": 0.7687093019485474, + "learning_rate": 5.706964448343926e-06, + "loss": 0.8484, + "step": 265080 + }, + { + "epoch": 1.6935844524232397, + "grad_norm": 0.8324954509735107, + "learning_rate": 5.7046367099444e-06, + "loss": 0.9244, + "step": 265090 + }, + { + "epoch": 1.6936483395729782, + "grad_norm": 0.781612753868103, + "learning_rate": 5.702309417637492e-06, + "loss": 1.0521, + "step": 265100 + }, + { + "epoch": 1.6937122267227172, + "grad_norm": 1.3374513387680054, + "learning_rate": 5.699982571446655e-06, + "loss": 1.1758, + "step": 265110 + }, + { 
+ "epoch": 1.6937761138724556, + "grad_norm": 1.0523484945297241, + "learning_rate": 5.697656171395316e-06, + "loss": 0.8768, + "step": 265120 + }, + { + "epoch": 1.6938400010221943, + "grad_norm": 0.8894367218017578, + "learning_rate": 5.695330217506916e-06, + "loss": 0.6643, + "step": 265130 + }, + { + "epoch": 1.693903888171933, + "grad_norm": 2.262582778930664, + "learning_rate": 5.693004709804855e-06, + "loss": 0.9333, + "step": 265140 + }, + { + "epoch": 1.6939677753216718, + "grad_norm": 0.9840685725212097, + "learning_rate": 5.690679648312575e-06, + "loss": 0.839, + "step": 265150 + }, + { + "epoch": 1.6940316624714105, + "grad_norm": 0.9524529576301575, + "learning_rate": 5.688355033053489e-06, + "loss": 0.8996, + "step": 265160 + }, + { + "epoch": 1.6940955496211492, + "grad_norm": 1.0004558563232422, + "learning_rate": 5.686030864050989e-06, + "loss": 0.7685, + "step": 265170 + }, + { + "epoch": 1.6941594367708879, + "grad_norm": 1.045327067375183, + "learning_rate": 5.683707141328515e-06, + "loss": 0.7907, + "step": 265180 + }, + { + "epoch": 1.6942233239206266, + "grad_norm": 1.2737689018249512, + "learning_rate": 5.681383864909429e-06, + "loss": 0.6774, + "step": 265190 + }, + { + "epoch": 1.6942872110703653, + "grad_norm": 0.997908890247345, + "learning_rate": 5.679061034817168e-06, + "loss": 1.0813, + "step": 265200 + }, + { + "epoch": 1.694351098220104, + "grad_norm": 1.0616806745529175, + "learning_rate": 5.676738651075086e-06, + "loss": 0.9101, + "step": 265210 + }, + { + "epoch": 1.6944149853698427, + "grad_norm": 1.175321102142334, + "learning_rate": 5.6744167137066095e-06, + "loss": 0.6915, + "step": 265220 + }, + { + "epoch": 1.6944788725195814, + "grad_norm": 0.9594330191612244, + "learning_rate": 5.672095222735086e-06, + "loss": 0.8794, + "step": 265230 + }, + { + "epoch": 1.69454275966932, + "grad_norm": 1.3879942893981934, + "learning_rate": 5.669774178183929e-06, + "loss": 0.7531, + "step": 265240 + }, + { + "epoch": 1.6946066468190588, 
+ "grad_norm": 0.6630569696426392, + "learning_rate": 5.667453580076487e-06, + "loss": 0.8141, + "step": 265250 + }, + { + "epoch": 1.6946705339687975, + "grad_norm": 0.8384342789649963, + "learning_rate": 5.665133428436148e-06, + "loss": 1.0865, + "step": 265260 + }, + { + "epoch": 1.6947344211185362, + "grad_norm": 0.8924210071563721, + "learning_rate": 5.662813723286259e-06, + "loss": 0.7966, + "step": 265270 + }, + { + "epoch": 1.694798308268275, + "grad_norm": 1.0226231813430786, + "learning_rate": 5.660494464650207e-06, + "loss": 0.9807, + "step": 265280 + }, + { + "epoch": 1.6948621954180136, + "grad_norm": 1.0482666492462158, + "learning_rate": 5.658175652551317e-06, + "loss": 0.7901, + "step": 265290 + }, + { + "epoch": 1.6949260825677523, + "grad_norm": 0.8355343341827393, + "learning_rate": 5.6558572870129775e-06, + "loss": 0.9319, + "step": 265300 + }, + { + "epoch": 1.694989969717491, + "grad_norm": 1.1238371133804321, + "learning_rate": 5.653539368058508e-06, + "loss": 0.8083, + "step": 265310 + }, + { + "epoch": 1.6950538568672298, + "grad_norm": 1.0383145809173584, + "learning_rate": 5.651221895711268e-06, + "loss": 0.6148, + "step": 265320 + }, + { + "epoch": 1.6951177440169685, + "grad_norm": 1.1226203441619873, + "learning_rate": 5.648904869994581e-06, + "loss": 1.0053, + "step": 265330 + }, + { + "epoch": 1.6951816311667072, + "grad_norm": 0.8662714958190918, + "learning_rate": 5.646588290931804e-06, + "loss": 0.9971, + "step": 265340 + }, + { + "epoch": 1.6952455183164459, + "grad_norm": 0.6560549139976501, + "learning_rate": 5.644272158546243e-06, + "loss": 0.9397, + "step": 265350 + }, + { + "epoch": 1.6953094054661846, + "grad_norm": 0.9533705115318298, + "learning_rate": 5.641956472861232e-06, + "loss": 0.7692, + "step": 265360 + }, + { + "epoch": 1.6953732926159233, + "grad_norm": 3.250364065170288, + "learning_rate": 5.6396412339001116e-06, + "loss": 0.9262, + "step": 265370 + }, + { + "epoch": 1.695437179765662, + "grad_norm": 
1.1853001117706299, + "learning_rate": 5.6373264416861635e-06, + "loss": 0.746, + "step": 265380 + }, + { + "epoch": 1.6955010669154005, + "grad_norm": 1.0173426866531372, + "learning_rate": 5.635012096242731e-06, + "loss": 0.7435, + "step": 265390 + }, + { + "epoch": 1.6955649540651394, + "grad_norm": 1.1551456451416016, + "learning_rate": 5.632698197593095e-06, + "loss": 0.8924, + "step": 265400 + }, + { + "epoch": 1.695628841214878, + "grad_norm": 0.8069493770599365, + "learning_rate": 5.630384745760586e-06, + "loss": 0.8283, + "step": 265410 + }, + { + "epoch": 1.6956927283646168, + "grad_norm": 0.8276717662811279, + "learning_rate": 5.628303021159209e-06, + "loss": 0.8786, + "step": 265420 + }, + { + "epoch": 1.6957566155143553, + "grad_norm": 0.9626677632331848, + "learning_rate": 5.625990418343391e-06, + "loss": 0.9655, + "step": 265430 + }, + { + "epoch": 1.6958205026640942, + "grad_norm": 1.4208831787109375, + "learning_rate": 5.62367826241223e-06, + "loss": 0.8715, + "step": 265440 + }, + { + "epoch": 1.6958843898138327, + "grad_norm": 0.8281898498535156, + "learning_rate": 5.621366553389035e-06, + "loss": 1.3512, + "step": 265450 + }, + { + "epoch": 1.6959482769635716, + "grad_norm": 1.0248709917068481, + "learning_rate": 5.619055291297059e-06, + "loss": 0.8385, + "step": 265460 + }, + { + "epoch": 1.6960121641133101, + "grad_norm": 1.3580818176269531, + "learning_rate": 5.616744476159591e-06, + "loss": 0.8225, + "step": 265470 + }, + { + "epoch": 1.696076051263049, + "grad_norm": 0.7210913300514221, + "learning_rate": 5.614434107999911e-06, + "loss": 0.7541, + "step": 265480 + }, + { + "epoch": 1.6961399384127875, + "grad_norm": 1.2958444356918335, + "learning_rate": 5.6121241868412726e-06, + "loss": 0.7449, + "step": 265490 + }, + { + "epoch": 1.6962038255625265, + "grad_norm": 1.102543830871582, + "learning_rate": 5.6098147127069515e-06, + "loss": 1.1964, + "step": 265500 + }, + { + "epoch": 1.696267712712265, + "grad_norm": 1.0483332872390747, + 
"learning_rate": 5.60750568562019e-06, + "loss": 0.8113, + "step": 265510 + }, + { + "epoch": 1.6963315998620039, + "grad_norm": 0.9912049770355225, + "learning_rate": 5.60519710560426e-06, + "loss": 0.6473, + "step": 265520 + }, + { + "epoch": 1.6963954870117424, + "grad_norm": 0.705596387386322, + "learning_rate": 5.6028889726823905e-06, + "loss": 0.6631, + "step": 265530 + }, + { + "epoch": 1.6964593741614813, + "grad_norm": 0.6908966302871704, + "learning_rate": 5.600581286877854e-06, + "loss": 0.7591, + "step": 265540 + }, + { + "epoch": 1.6965232613112198, + "grad_norm": 0.9993156790733337, + "learning_rate": 5.598274048213858e-06, + "loss": 1.0082, + "step": 265550 + }, + { + "epoch": 1.6965871484609587, + "grad_norm": 0.7562658786773682, + "learning_rate": 5.5959672567136745e-06, + "loss": 0.9683, + "step": 265560 + }, + { + "epoch": 1.6966510356106972, + "grad_norm": 1.1996532678604126, + "learning_rate": 5.5936609124005e-06, + "loss": 1.0089, + "step": 265570 + }, + { + "epoch": 1.6967149227604361, + "grad_norm": 0.7490407228469849, + "learning_rate": 5.591355015297583e-06, + "loss": 0.7151, + "step": 265580 + }, + { + "epoch": 1.6967788099101746, + "grad_norm": 0.9578597545623779, + "learning_rate": 5.589049565428134e-06, + "loss": 0.7907, + "step": 265590 + }, + { + "epoch": 1.6968426970599135, + "grad_norm": 1.636884331703186, + "learning_rate": 5.586744562815388e-06, + "loss": 1.1151, + "step": 265600 + }, + { + "epoch": 1.696906584209652, + "grad_norm": 0.9623717069625854, + "learning_rate": 5.584440007482539e-06, + "loss": 1.0491, + "step": 265610 + }, + { + "epoch": 1.6969704713593907, + "grad_norm": 0.6296240091323853, + "learning_rate": 5.582135899452811e-06, + "loss": 0.979, + "step": 265620 + }, + { + "epoch": 1.6970343585091294, + "grad_norm": 0.869094729423523, + "learning_rate": 5.5798322387493884e-06, + "loss": 1.0489, + "step": 265630 + }, + { + "epoch": 1.6970982456588681, + "grad_norm": 0.6812463402748108, + "learning_rate": 
5.5775290253955e-06, + "loss": 0.7392, + "step": 265640 + }, + { + "epoch": 1.6971621328086068, + "grad_norm": 1.0079282522201538, + "learning_rate": 5.575226259414313e-06, + "loss": 0.8945, + "step": 265650 + }, + { + "epoch": 1.6972260199583455, + "grad_norm": 4.871733665466309, + "learning_rate": 5.572923940829039e-06, + "loss": 1.029, + "step": 265660 + }, + { + "epoch": 1.6972899071080843, + "grad_norm": 0.808641791343689, + "learning_rate": 5.570622069662846e-06, + "loss": 0.9738, + "step": 265670 + }, + { + "epoch": 1.697353794257823, + "grad_norm": 0.7088136076927185, + "learning_rate": 5.568320645938929e-06, + "loss": 0.8631, + "step": 265680 + }, + { + "epoch": 1.6974176814075617, + "grad_norm": 0.9034357070922852, + "learning_rate": 5.566019669680467e-06, + "loss": 0.9508, + "step": 265690 + }, + { + "epoch": 1.6974815685573004, + "grad_norm": 1.4830907583236694, + "learning_rate": 5.563719140910628e-06, + "loss": 1.0575, + "step": 265700 + }, + { + "epoch": 1.697545455707039, + "grad_norm": 0.7605845332145691, + "learning_rate": 5.561419059652584e-06, + "loss": 0.7593, + "step": 265710 + }, + { + "epoch": 1.6976093428567778, + "grad_norm": 0.7586846351623535, + "learning_rate": 5.559119425929482e-06, + "loss": 0.7799, + "step": 265720 + }, + { + "epoch": 1.6976732300065165, + "grad_norm": 0.8755923509597778, + "learning_rate": 5.55682023976451e-06, + "loss": 0.8196, + "step": 265730 + }, + { + "epoch": 1.6977371171562552, + "grad_norm": 0.7772778868675232, + "learning_rate": 5.554521501180793e-06, + "loss": 0.7469, + "step": 265740 + }, + { + "epoch": 1.697801004305994, + "grad_norm": 1.2870303392410278, + "learning_rate": 5.552223210201502e-06, + "loss": 1.0656, + "step": 265750 + }, + { + "epoch": 1.6978648914557326, + "grad_norm": 0.8444591164588928, + "learning_rate": 5.549925366849767e-06, + "loss": 1.065, + "step": 265760 + }, + { + "epoch": 1.6979287786054713, + "grad_norm": 1.0897783041000366, + "learning_rate": 5.54762797114875e-06, + "loss": 
0.8331, + "step": 265770 + }, + { + "epoch": 1.69799266575521, + "grad_norm": 0.7298462986946106, + "learning_rate": 5.545331023121569e-06, + "loss": 0.6569, + "step": 265780 + }, + { + "epoch": 1.6980565529049487, + "grad_norm": 0.9896948933601379, + "learning_rate": 5.543034522791362e-06, + "loss": 1.0471, + "step": 265790 + }, + { + "epoch": 1.6981204400546874, + "grad_norm": 2.4735770225524902, + "learning_rate": 5.540738470181267e-06, + "loss": 0.9096, + "step": 265800 + }, + { + "epoch": 1.6981843272044261, + "grad_norm": 0.7311177253723145, + "learning_rate": 5.538442865314386e-06, + "loss": 1.044, + "step": 265810 + }, + { + "epoch": 1.6982482143541648, + "grad_norm": 0.8349324464797974, + "learning_rate": 5.536147708213862e-06, + "loss": 0.8147, + "step": 265820 + }, + { + "epoch": 1.6983121015039035, + "grad_norm": 1.0947216749191284, + "learning_rate": 5.5338529989027885e-06, + "loss": 0.7788, + "step": 265830 + }, + { + "epoch": 1.6983759886536423, + "grad_norm": 1.4398168325424194, + "learning_rate": 5.531558737404291e-06, + "loss": 0.9125, + "step": 265840 + }, + { + "epoch": 1.698439875803381, + "grad_norm": 0.9863196611404419, + "learning_rate": 5.529264923741462e-06, + "loss": 1.1985, + "step": 265850 + }, + { + "epoch": 1.6985037629531194, + "grad_norm": 1.338010549545288, + "learning_rate": 5.527200874363519e-06, + "loss": 0.8456, + "step": 265860 + }, + { + "epoch": 1.6985676501028584, + "grad_norm": 1.2365350723266602, + "learning_rate": 5.524907911652111e-06, + "loss": 0.7249, + "step": 265870 + }, + { + "epoch": 1.6986315372525969, + "grad_norm": 0.6513351202011108, + "learning_rate": 5.522615396843362e-06, + "loss": 0.9132, + "step": 265880 + }, + { + "epoch": 1.6986954244023358, + "grad_norm": 1.3514388799667358, + "learning_rate": 5.520323329960347e-06, + "loss": 0.9574, + "step": 265890 + }, + { + "epoch": 1.6987593115520743, + "grad_norm": 1.5558171272277832, + "learning_rate": 5.518031711026161e-06, + "loss": 0.7067, + "step": 265900 + 
}, + { + "epoch": 1.6988231987018132, + "grad_norm": 1.4183495044708252, + "learning_rate": 5.5157405400638736e-06, + "loss": 0.834, + "step": 265910 + }, + { + "epoch": 1.6988870858515517, + "grad_norm": 1.1658267974853516, + "learning_rate": 5.513449817096561e-06, + "loss": 0.609, + "step": 265920 + }, + { + "epoch": 1.6989509730012906, + "grad_norm": 0.8308910131454468, + "learning_rate": 5.511159542147304e-06, + "loss": 0.6832, + "step": 265930 + }, + { + "epoch": 1.699014860151029, + "grad_norm": 1.268141508102417, + "learning_rate": 5.508869715239151e-06, + "loss": 0.8501, + "step": 265940 + }, + { + "epoch": 1.699078747300768, + "grad_norm": 0.9764382243156433, + "learning_rate": 5.506580336395179e-06, + "loss": 0.9117, + "step": 265950 + }, + { + "epoch": 1.6991426344505065, + "grad_norm": 0.9711419939994812, + "learning_rate": 5.504291405638429e-06, + "loss": 0.8632, + "step": 265960 + }, + { + "epoch": 1.6992065216002454, + "grad_norm": 0.8464389443397522, + "learning_rate": 5.5020029229919664e-06, + "loss": 0.9455, + "step": 265970 + }, + { + "epoch": 1.699270408749984, + "grad_norm": 1.472284197807312, + "learning_rate": 5.499714888478818e-06, + "loss": 0.8437, + "step": 265980 + }, + { + "epoch": 1.6993342958997228, + "grad_norm": 0.63400799036026, + "learning_rate": 5.497427302122054e-06, + "loss": 0.6856, + "step": 265990 + }, + { + "epoch": 1.6993981830494613, + "grad_norm": 0.9483168125152588, + "learning_rate": 5.495140163944684e-06, + "loss": 0.9194, + "step": 266000 + }, + { + "epoch": 1.6994620701992003, + "grad_norm": 1.7425835132598877, + "learning_rate": 5.492853473969761e-06, + "loss": 0.9406, + "step": 266010 + }, + { + "epoch": 1.6995259573489387, + "grad_norm": 1.738411545753479, + "learning_rate": 5.490567232220306e-06, + "loss": 0.941, + "step": 266020 + }, + { + "epoch": 1.6995898444986777, + "grad_norm": 1.1537902355194092, + "learning_rate": 5.488281438719351e-06, + "loss": 0.7692, + "step": 266030 + }, + { + "epoch": 
1.6996537316484162, + "grad_norm": 0.5871503353118896, + "learning_rate": 5.485996093489898e-06, + "loss": 0.8314, + "step": 266040 + }, + { + "epoch": 1.699717618798155, + "grad_norm": 1.1925203800201416, + "learning_rate": 5.483711196554986e-06, + "loss": 0.8419, + "step": 266050 + }, + { + "epoch": 1.6997815059478936, + "grad_norm": 1.2385332584381104, + "learning_rate": 5.481426747937601e-06, + "loss": 1.017, + "step": 266060 + }, + { + "epoch": 1.6998453930976325, + "grad_norm": 1.0228863954544067, + "learning_rate": 5.479142747660781e-06, + "loss": 0.963, + "step": 266070 + }, + { + "epoch": 1.699909280247371, + "grad_norm": 0.7386691570281982, + "learning_rate": 5.476859195747492e-06, + "loss": 0.8636, + "step": 266080 + }, + { + "epoch": 1.69997316739711, + "grad_norm": 0.9049159288406372, + "learning_rate": 5.474576092220762e-06, + "loss": 1.1555, + "step": 266090 + }, + { + "epoch": 1.7000370545468484, + "grad_norm": 0.757064700126648, + "learning_rate": 5.47229343710356e-06, + "loss": 0.8304, + "step": 266100 + }, + { + "epoch": 1.700100941696587, + "grad_norm": 0.7645605206489563, + "learning_rate": 5.470011230418887e-06, + "loss": 1.001, + "step": 266110 + }, + { + "epoch": 1.7001648288463258, + "grad_norm": 1.0574803352355957, + "learning_rate": 5.467729472189731e-06, + "loss": 0.9994, + "step": 266120 + }, + { + "epoch": 1.7002287159960645, + "grad_norm": 1.198251485824585, + "learning_rate": 5.465448162439057e-06, + "loss": 0.9769, + "step": 266130 + }, + { + "epoch": 1.7002926031458032, + "grad_norm": 1.3360443115234375, + "learning_rate": 5.4631673011898585e-06, + "loss": 1.074, + "step": 266140 + }, + { + "epoch": 1.700356490295542, + "grad_norm": 0.8937990069389343, + "learning_rate": 5.460886888465088e-06, + "loss": 1.0213, + "step": 266150 + }, + { + "epoch": 1.7004203774452806, + "grad_norm": 0.8572983145713806, + "learning_rate": 5.458606924287723e-06, + "loss": 0.8882, + "step": 266160 + }, + { + "epoch": 1.7004842645950193, + "grad_norm": 
0.6053618788719177, + "learning_rate": 5.456327408680711e-06, + "loss": 0.6702, + "step": 266170 + }, + { + "epoch": 1.700548151744758, + "grad_norm": 1.9757890701293945, + "learning_rate": 5.454048341667034e-06, + "loss": 0.6938, + "step": 266180 + }, + { + "epoch": 1.7006120388944967, + "grad_norm": 0.7852265238761902, + "learning_rate": 5.451769723269612e-06, + "loss": 1.011, + "step": 266190 + }, + { + "epoch": 1.7006759260442355, + "grad_norm": 0.7186400890350342, + "learning_rate": 5.449491553511416e-06, + "loss": 0.6203, + "step": 266200 + }, + { + "epoch": 1.7007398131939742, + "grad_norm": 0.8351746201515198, + "learning_rate": 5.447213832415377e-06, + "loss": 0.8689, + "step": 266210 + }, + { + "epoch": 1.7008037003437129, + "grad_norm": 0.8336975574493408, + "learning_rate": 5.444936560004449e-06, + "loss": 0.8293, + "step": 266220 + }, + { + "epoch": 1.7008675874934516, + "grad_norm": 0.5873013734817505, + "learning_rate": 5.442659736301542e-06, + "loss": 0.7018, + "step": 266230 + }, + { + "epoch": 1.7009314746431903, + "grad_norm": 6.3691582679748535, + "learning_rate": 5.44038336132961e-06, + "loss": 1.0403, + "step": 266240 + }, + { + "epoch": 1.700995361792929, + "grad_norm": 0.9431154131889343, + "learning_rate": 5.4381074351115556e-06, + "loss": 0.8367, + "step": 266250 + }, + { + "epoch": 1.7010592489426677, + "grad_norm": 3.6684696674346924, + "learning_rate": 5.4358319576703236e-06, + "loss": 0.8574, + "step": 266260 + }, + { + "epoch": 1.7011231360924064, + "grad_norm": 0.8904080986976624, + "learning_rate": 5.433556929028805e-06, + "loss": 1.121, + "step": 266270 + }, + { + "epoch": 1.701187023242145, + "grad_norm": 0.8094809651374817, + "learning_rate": 5.431282349209937e-06, + "loss": 0.8136, + "step": 266280 + }, + { + "epoch": 1.7012509103918838, + "grad_norm": 1.1437190771102905, + "learning_rate": 5.4290082182365975e-06, + "loss": 0.8366, + "step": 266290 + }, + { + "epoch": 1.7013147975416225, + "grad_norm": 1.4045140743255615, + 
"learning_rate": 5.426734536131722e-06, + "loss": 0.695, + "step": 266300 + }, + { + "epoch": 1.7013786846913612, + "grad_norm": 0.8198695182800293, + "learning_rate": 5.424461302918177e-06, + "loss": 0.76, + "step": 266310 + }, + { + "epoch": 1.7014425718411, + "grad_norm": 0.7523419260978699, + "learning_rate": 5.422188518618871e-06, + "loss": 0.5761, + "step": 266320 + }, + { + "epoch": 1.7015064589908386, + "grad_norm": 0.9797813296318054, + "learning_rate": 5.419916183256707e-06, + "loss": 0.995, + "step": 266330 + }, + { + "epoch": 1.7015703461405773, + "grad_norm": 1.0118427276611328, + "learning_rate": 5.41764429685454e-06, + "loss": 1.1202, + "step": 266340 + }, + { + "epoch": 1.7016342332903158, + "grad_norm": 0.7771697640419006, + "learning_rate": 5.415372859435275e-06, + "loss": 0.8985, + "step": 266350 + }, + { + "epoch": 1.7016981204400548, + "grad_norm": 1.2752560377120972, + "learning_rate": 5.413101871021764e-06, + "loss": 1.101, + "step": 266360 + }, + { + "epoch": 1.7017620075897932, + "grad_norm": 0.8424556255340576, + "learning_rate": 5.410831331636895e-06, + "loss": 0.8552, + "step": 266370 + }, + { + "epoch": 1.7018258947395322, + "grad_norm": 0.8436341881752014, + "learning_rate": 5.408561241303528e-06, + "loss": 0.9248, + "step": 266380 + }, + { + "epoch": 1.7018897818892706, + "grad_norm": 0.9995906949043274, + "learning_rate": 5.406291600044533e-06, + "loss": 0.8248, + "step": 266390 + }, + { + "epoch": 1.7019536690390096, + "grad_norm": 0.8756603598594666, + "learning_rate": 5.404022407882753e-06, + "loss": 1.2169, + "step": 266400 + }, + { + "epoch": 1.702017556188748, + "grad_norm": 0.9120706915855408, + "learning_rate": 5.401753664841053e-06, + "loss": 0.8411, + "step": 266410 + }, + { + "epoch": 1.702081443338487, + "grad_norm": 0.8383538126945496, + "learning_rate": 5.39948537094227e-06, + "loss": 0.8805, + "step": 266420 + }, + { + "epoch": 1.7021453304882255, + "grad_norm": 1.04457688331604, + "learning_rate": 
5.397217526209253e-06, + "loss": 0.9677, + "step": 266430 + }, + { + "epoch": 1.7022092176379644, + "grad_norm": 1.101974368095398, + "learning_rate": 5.394950130664855e-06, + "loss": 0.6521, + "step": 266440 + }, + { + "epoch": 1.7022731047877029, + "grad_norm": 0.7874002456665039, + "learning_rate": 5.392683184331887e-06, + "loss": 1.0725, + "step": 266450 + }, + { + "epoch": 1.7023369919374418, + "grad_norm": 0.8857718110084534, + "learning_rate": 5.390416687233202e-06, + "loss": 0.9898, + "step": 266460 + }, + { + "epoch": 1.7024008790871803, + "grad_norm": 1.0780762434005737, + "learning_rate": 5.388150639391598e-06, + "loss": 0.7795, + "step": 266470 + }, + { + "epoch": 1.7024647662369192, + "grad_norm": 0.5900570154190063, + "learning_rate": 5.38588504082993e-06, + "loss": 0.8662, + "step": 266480 + }, + { + "epoch": 1.7025286533866577, + "grad_norm": 0.7792630195617676, + "learning_rate": 5.383619891570979e-06, + "loss": 1.0849, + "step": 266490 + }, + { + "epoch": 1.7025925405363966, + "grad_norm": 0.9480690360069275, + "learning_rate": 5.381355191637588e-06, + "loss": 0.8941, + "step": 266500 + }, + { + "epoch": 1.7026564276861351, + "grad_norm": 1.1644970178604126, + "learning_rate": 5.379090941052539e-06, + "loss": 0.9368, + "step": 266510 + }, + { + "epoch": 1.702720314835874, + "grad_norm": 1.4844380617141724, + "learning_rate": 5.3768271398386585e-06, + "loss": 0.8222, + "step": 266520 + }, + { + "epoch": 1.7027842019856125, + "grad_norm": 1.1840111017227173, + "learning_rate": 5.374563788018722e-06, + "loss": 0.7043, + "step": 266530 + }, + { + "epoch": 1.7028480891353515, + "grad_norm": 0.8633972406387329, + "learning_rate": 5.372300885615545e-06, + "loss": 0.8175, + "step": 266540 + }, + { + "epoch": 1.70291197628509, + "grad_norm": 1.0917376279830933, + "learning_rate": 5.370038432651897e-06, + "loss": 1.0007, + "step": 266550 + }, + { + "epoch": 1.7029758634348289, + "grad_norm": 0.8144697546958923, + "learning_rate": 5.367776429150584e-06, + 
"loss": 0.8722, + "step": 266560 + }, + { + "epoch": 1.7030397505845674, + "grad_norm": 0.6899762153625488, + "learning_rate": 5.3655148751343585e-06, + "loss": 0.7992, + "step": 266570 + }, + { + "epoch": 1.7031036377343063, + "grad_norm": 1.0007309913635254, + "learning_rate": 5.363253770626026e-06, + "loss": 0.8943, + "step": 266580 + }, + { + "epoch": 1.7031675248840448, + "grad_norm": 0.7084468007087708, + "learning_rate": 5.360993115648338e-06, + "loss": 1.023, + "step": 266590 + }, + { + "epoch": 1.7032314120337835, + "grad_norm": 1.3472901582717896, + "learning_rate": 5.3587329102240735e-06, + "loss": 1.3554, + "step": 266600 + }, + { + "epoch": 1.7032952991835222, + "grad_norm": 1.2952237129211426, + "learning_rate": 5.356473154375979e-06, + "loss": 1.0382, + "step": 266610 + }, + { + "epoch": 1.7033591863332609, + "grad_norm": 0.8648388385772705, + "learning_rate": 5.354213848126832e-06, + "loss": 0.7916, + "step": 266620 + }, + { + "epoch": 1.7034230734829996, + "grad_norm": 0.8645228147506714, + "learning_rate": 5.35195499149937e-06, + "loss": 0.8126, + "step": 266630 + }, + { + "epoch": 1.7034869606327383, + "grad_norm": 0.6788387894630432, + "learning_rate": 5.349696584516345e-06, + "loss": 0.8482, + "step": 266640 + }, + { + "epoch": 1.703550847782477, + "grad_norm": 0.8210328817367554, + "learning_rate": 5.3474386272005125e-06, + "loss": 0.8949, + "step": 266650 + }, + { + "epoch": 1.7036147349322157, + "grad_norm": 0.9088141918182373, + "learning_rate": 5.345181119574588e-06, + "loss": 0.8878, + "step": 266660 + }, + { + "epoch": 1.7036786220819544, + "grad_norm": 1.4037779569625854, + "learning_rate": 5.342924061661336e-06, + "loss": 0.9856, + "step": 266670 + }, + { + "epoch": 1.7037425092316931, + "grad_norm": 1.1558412313461304, + "learning_rate": 5.340667453483467e-06, + "loss": 0.7853, + "step": 266680 + }, + { + "epoch": 1.7038063963814318, + "grad_norm": 0.6464347243309021, + "learning_rate": 5.338411295063717e-06, + "loss": 0.7154, + 
"step": 266690 + }, + { + "epoch": 1.7038702835311705, + "grad_norm": 0.820220947265625, + "learning_rate": 5.336155586424796e-06, + "loss": 1.1589, + "step": 266700 + }, + { + "epoch": 1.7039341706809092, + "grad_norm": 0.7956279516220093, + "learning_rate": 5.333900327589436e-06, + "loss": 1.0678, + "step": 266710 + }, + { + "epoch": 1.703998057830648, + "grad_norm": 1.414975881576538, + "learning_rate": 5.33164551858033e-06, + "loss": 0.8709, + "step": 266720 + }, + { + "epoch": 1.7040619449803867, + "grad_norm": 1.05629301071167, + "learning_rate": 5.3293911594202105e-06, + "loss": 0.6103, + "step": 266730 + }, + { + "epoch": 1.7041258321301254, + "grad_norm": 1.6264910697937012, + "learning_rate": 5.327137250131753e-06, + "loss": 0.8797, + "step": 266740 + }, + { + "epoch": 1.704189719279864, + "grad_norm": 1.1576440334320068, + "learning_rate": 5.324883790737684e-06, + "loss": 0.9157, + "step": 266750 + }, + { + "epoch": 1.7042536064296028, + "grad_norm": 0.9160033464431763, + "learning_rate": 5.322630781260679e-06, + "loss": 0.7215, + "step": 266760 + }, + { + "epoch": 1.7043174935793415, + "grad_norm": 1.0382755994796753, + "learning_rate": 5.320378221723438e-06, + "loss": 1.0271, + "step": 266770 + }, + { + "epoch": 1.7043813807290802, + "grad_norm": 0.8948391675949097, + "learning_rate": 5.31812611214863e-06, + "loss": 0.7832, + "step": 266780 + }, + { + "epoch": 1.704445267878819, + "grad_norm": 0.992671549320221, + "learning_rate": 5.315874452558961e-06, + "loss": 0.7864, + "step": 266790 + }, + { + "epoch": 1.7045091550285576, + "grad_norm": 0.9801939725875854, + "learning_rate": 5.3136232429770835e-06, + "loss": 1.0619, + "step": 266800 + }, + { + "epoch": 1.7045730421782963, + "grad_norm": 0.9736009240150452, + "learning_rate": 5.3113724834256916e-06, + "loss": 0.747, + "step": 266810 + }, + { + "epoch": 1.704636929328035, + "grad_norm": 1.2591882944107056, + "learning_rate": 5.309122173927433e-06, + "loss": 0.9616, + "step": 266820 + }, + { + 
"epoch": 1.7047008164777737, + "grad_norm": 0.9540112614631653, + "learning_rate": 5.306872314504974e-06, + "loss": 0.8396, + "step": 266830 + }, + { + "epoch": 1.7047647036275122, + "grad_norm": 1.0584015846252441, + "learning_rate": 5.304622905180983e-06, + "loss": 1.0238, + "step": 266840 + }, + { + "epoch": 1.7048285907772511, + "grad_norm": 1.0345137119293213, + "learning_rate": 5.302373945978095e-06, + "loss": 0.884, + "step": 266850 + }, + { + "epoch": 1.7048924779269896, + "grad_norm": 0.7487316727638245, + "learning_rate": 5.300125436918979e-06, + "loss": 1.0031, + "step": 266860 + }, + { + "epoch": 1.7049563650767285, + "grad_norm": 1.27576744556427, + "learning_rate": 5.297877378026267e-06, + "loss": 0.6429, + "step": 266870 + }, + { + "epoch": 1.705020252226467, + "grad_norm": 1.08945631980896, + "learning_rate": 5.295629769322607e-06, + "loss": 0.8816, + "step": 266880 + }, + { + "epoch": 1.705084139376206, + "grad_norm": 1.1191450357437134, + "learning_rate": 5.293382610830622e-06, + "loss": 0.8933, + "step": 266890 + }, + { + "epoch": 1.7051480265259444, + "grad_norm": 0.895653247833252, + "learning_rate": 5.291135902572964e-06, + "loss": 0.8744, + "step": 266900 + }, + { + "epoch": 1.7052119136756834, + "grad_norm": 0.926418662071228, + "learning_rate": 5.288889644572231e-06, + "loss": 0.8063, + "step": 266910 + }, + { + "epoch": 1.7052758008254219, + "grad_norm": 0.8096438050270081, + "learning_rate": 5.286643836851069e-06, + "loss": 0.7903, + "step": 266920 + }, + { + "epoch": 1.7053396879751608, + "grad_norm": 1.0355230569839478, + "learning_rate": 5.284398479432079e-06, + "loss": 0.9991, + "step": 266930 + }, + { + "epoch": 1.7054035751248993, + "grad_norm": 0.9099029302597046, + "learning_rate": 5.282153572337895e-06, + "loss": 0.9656, + "step": 266940 + }, + { + "epoch": 1.7054674622746382, + "grad_norm": 0.8696787357330322, + "learning_rate": 5.279909115591092e-06, + "loss": 0.9073, + "step": 266950 + }, + { + "epoch": 1.7055313494243767, + 
"grad_norm": 0.9176293611526489, + "learning_rate": 5.277665109214297e-06, + "loss": 0.8014, + "step": 266960 + }, + { + "epoch": 1.7055952365741156, + "grad_norm": 1.0077580213546753, + "learning_rate": 5.27542155323012e-06, + "loss": 1.0027, + "step": 266970 + }, + { + "epoch": 1.705659123723854, + "grad_norm": 2.419217824935913, + "learning_rate": 5.273178447661125e-06, + "loss": 0.8749, + "step": 266980 + }, + { + "epoch": 1.705723010873593, + "grad_norm": 1.0237098932266235, + "learning_rate": 5.270935792529924e-06, + "loss": 0.8011, + "step": 266990 + }, + { + "epoch": 1.7057868980233315, + "grad_norm": 0.925464391708374, + "learning_rate": 5.268693587859092e-06, + "loss": 0.8459, + "step": 267000 + }, + { + "epoch": 1.7058507851730704, + "grad_norm": 0.8839012980461121, + "learning_rate": 5.266451833671221e-06, + "loss": 0.8634, + "step": 267010 + }, + { + "epoch": 1.705914672322809, + "grad_norm": 0.9144042134284973, + "learning_rate": 5.264210529988867e-06, + "loss": 1.0476, + "step": 267020 + }, + { + "epoch": 1.7059785594725478, + "grad_norm": 0.9054514765739441, + "learning_rate": 5.261969676834627e-06, + "loss": 0.79, + "step": 267030 + }, + { + "epoch": 1.7060424466222863, + "grad_norm": 1.2751846313476562, + "learning_rate": 5.259729274231051e-06, + "loss": 0.943, + "step": 267040 + }, + { + "epoch": 1.7061063337720253, + "grad_norm": 1.316709041595459, + "learning_rate": 5.25748932220071e-06, + "loss": 0.8292, + "step": 267050 + }, + { + "epoch": 1.7061702209217637, + "grad_norm": 1.0444445610046387, + "learning_rate": 5.255249820766156e-06, + "loss": 0.9404, + "step": 267060 + }, + { + "epoch": 1.7062341080715027, + "grad_norm": 0.821792483329773, + "learning_rate": 5.253010769949951e-06, + "loss": 0.8681, + "step": 267070 + }, + { + "epoch": 1.7062979952212411, + "grad_norm": 1.1909120082855225, + "learning_rate": 5.250772169774632e-06, + "loss": 1.0925, + "step": 267080 + }, + { + "epoch": 1.7063618823709799, + "grad_norm": 1.0737441778182983, + 
"learning_rate": 5.248534020262757e-06, + "loss": 0.868, + "step": 267090 + }, + { + "epoch": 1.7064257695207186, + "grad_norm": 0.8983291387557983, + "learning_rate": 5.246296321436855e-06, + "loss": 0.8025, + "step": 267100 + }, + { + "epoch": 1.7064896566704573, + "grad_norm": 2.5274810791015625, + "learning_rate": 5.244059073319474e-06, + "loss": 1.0225, + "step": 267110 + }, + { + "epoch": 1.706553543820196, + "grad_norm": 0.5547528266906738, + "learning_rate": 5.241822275933123e-06, + "loss": 1.078, + "step": 267120 + }, + { + "epoch": 1.7066174309699347, + "grad_norm": 1.2405059337615967, + "learning_rate": 5.239585929300361e-06, + "loss": 0.9678, + "step": 267130 + }, + { + "epoch": 1.7066813181196734, + "grad_norm": 0.9723836183547974, + "learning_rate": 5.237350033443678e-06, + "loss": 0.8917, + "step": 267140 + }, + { + "epoch": 1.706745205269412, + "grad_norm": 0.9931864738464355, + "learning_rate": 5.235114588385614e-06, + "loss": 0.4923, + "step": 267150 + }, + { + "epoch": 1.7068090924191508, + "grad_norm": 1.0213977098464966, + "learning_rate": 5.232879594148665e-06, + "loss": 0.9429, + "step": 267160 + }, + { + "epoch": 1.7068729795688895, + "grad_norm": 1.1262112855911255, + "learning_rate": 5.23064505075535e-06, + "loss": 0.9091, + "step": 267170 + }, + { + "epoch": 1.7069368667186282, + "grad_norm": 1.1662230491638184, + "learning_rate": 5.2284109582281745e-06, + "loss": 0.9259, + "step": 267180 + }, + { + "epoch": 1.707000753868367, + "grad_norm": 1.5848547220230103, + "learning_rate": 5.226177316589631e-06, + "loss": 0.7578, + "step": 267190 + }, + { + "epoch": 1.7070646410181056, + "grad_norm": 1.2645552158355713, + "learning_rate": 5.223944125862224e-06, + "loss": 0.927, + "step": 267200 + }, + { + "epoch": 1.7071285281678443, + "grad_norm": 0.8651253581047058, + "learning_rate": 5.221711386068423e-06, + "loss": 1.0176, + "step": 267210 + }, + { + "epoch": 1.707192415317583, + "grad_norm": 1.1839468479156494, + "learning_rate": 
5.21947909723074e-06, + "loss": 0.6918, + "step": 267220 + }, + { + "epoch": 1.7072563024673217, + "grad_norm": 1.1756478548049927, + "learning_rate": 5.2172472593716325e-06, + "loss": 0.7856, + "step": 267230 + }, + { + "epoch": 1.7073201896170604, + "grad_norm": 1.2259622812271118, + "learning_rate": 5.215015872513596e-06, + "loss": 1.0075, + "step": 267240 + }, + { + "epoch": 1.7073840767667992, + "grad_norm": 0.8537092208862305, + "learning_rate": 5.212784936679088e-06, + "loss": 0.8574, + "step": 267250 + }, + { + "epoch": 1.7074479639165379, + "grad_norm": 0.8326482772827148, + "learning_rate": 5.210554451890587e-06, + "loss": 0.9249, + "step": 267260 + }, + { + "epoch": 1.7075118510662766, + "grad_norm": 1.3121106624603271, + "learning_rate": 5.208324418170546e-06, + "loss": 0.7788, + "step": 267270 + }, + { + "epoch": 1.7075757382160153, + "grad_norm": 0.8690970540046692, + "learning_rate": 5.206094835541436e-06, + "loss": 0.9285, + "step": 267280 + }, + { + "epoch": 1.707639625365754, + "grad_norm": 1.135063886642456, + "learning_rate": 5.203865704025695e-06, + "loss": 0.9247, + "step": 267290 + }, + { + "epoch": 1.7077035125154927, + "grad_norm": 0.7573182582855225, + "learning_rate": 5.201637023645789e-06, + "loss": 0.7598, + "step": 267300 + }, + { + "epoch": 1.7077673996652314, + "grad_norm": 1.081274390220642, + "learning_rate": 5.199408794424154e-06, + "loss": 0.8923, + "step": 267310 + }, + { + "epoch": 1.70783128681497, + "grad_norm": 0.9698079228401184, + "learning_rate": 5.197181016383224e-06, + "loss": 0.898, + "step": 267320 + }, + { + "epoch": 1.7078951739647086, + "grad_norm": 1.0417249202728271, + "learning_rate": 5.1949536895454454e-06, + "loss": 0.731, + "step": 267330 + }, + { + "epoch": 1.7079590611144475, + "grad_norm": 0.9164946675300598, + "learning_rate": 5.1927268139332355e-06, + "loss": 1.1054, + "step": 267340 + }, + { + "epoch": 1.708022948264186, + "grad_norm": 0.632905125617981, + "learning_rate": 5.190500389569047e-06, + 
"loss": 0.7001, + "step": 267350 + }, + { + "epoch": 1.708086835413925, + "grad_norm": 1.301917314529419, + "learning_rate": 5.18827441647527e-06, + "loss": 0.8591, + "step": 267360 + }, + { + "epoch": 1.7081507225636634, + "grad_norm": 0.6161066293716431, + "learning_rate": 5.186048894674345e-06, + "loss": 0.9312, + "step": 267370 + }, + { + "epoch": 1.7082146097134023, + "grad_norm": 1.6107707023620605, + "learning_rate": 5.183823824188672e-06, + "loss": 0.7691, + "step": 267380 + }, + { + "epoch": 1.7082784968631408, + "grad_norm": 0.7146519422531128, + "learning_rate": 5.181599205040671e-06, + "loss": 0.6657, + "step": 267390 + }, + { + "epoch": 1.7083423840128797, + "grad_norm": 0.8204352855682373, + "learning_rate": 5.1793750372527376e-06, + "loss": 1.0253, + "step": 267400 + }, + { + "epoch": 1.7084062711626182, + "grad_norm": 0.8192850351333618, + "learning_rate": 5.177151320847273e-06, + "loss": 0.7452, + "step": 267410 + }, + { + "epoch": 1.7084701583123572, + "grad_norm": 1.2230883836746216, + "learning_rate": 5.174928055846667e-06, + "loss": 0.8296, + "step": 267420 + }, + { + "epoch": 1.7085340454620956, + "grad_norm": 0.8661963939666748, + "learning_rate": 5.172705242273324e-06, + "loss": 0.8515, + "step": 267430 + }, + { + "epoch": 1.7085979326118346, + "grad_norm": 0.7626175880432129, + "learning_rate": 5.170482880149608e-06, + "loss": 0.7143, + "step": 267440 + }, + { + "epoch": 1.708661819761573, + "grad_norm": 0.7892603874206543, + "learning_rate": 5.1682609694979236e-06, + "loss": 0.8103, + "step": 267450 + }, + { + "epoch": 1.708725706911312, + "grad_norm": 0.9617498517036438, + "learning_rate": 5.1660395103406255e-06, + "loss": 0.8888, + "step": 267460 + }, + { + "epoch": 1.7087895940610505, + "grad_norm": 1.0265048742294312, + "learning_rate": 5.1638185027001125e-06, + "loss": 0.9709, + "step": 267470 + }, + { + "epoch": 1.7088534812107894, + "grad_norm": 1.4228737354278564, + "learning_rate": 5.161597946598717e-06, + "loss": 0.8084, + 
"step": 267480 + }, + { + "epoch": 1.7089173683605279, + "grad_norm": 1.0275636911392212, + "learning_rate": 5.159377842058826e-06, + "loss": 0.7827, + "step": 267490 + }, + { + "epoch": 1.7089812555102668, + "grad_norm": 0.8710563778877258, + "learning_rate": 5.157158189102801e-06, + "loss": 0.9638, + "step": 267500 + }, + { + "epoch": 1.7090451426600053, + "grad_norm": 1.2731144428253174, + "learning_rate": 5.154938987752983e-06, + "loss": 0.8887, + "step": 267510 + }, + { + "epoch": 1.7091090298097442, + "grad_norm": 0.8427534103393555, + "learning_rate": 5.152720238031727e-06, + "loss": 0.8941, + "step": 267520 + }, + { + "epoch": 1.7091729169594827, + "grad_norm": 1.2673020362854004, + "learning_rate": 5.150501939961372e-06, + "loss": 0.7282, + "step": 267530 + }, + { + "epoch": 1.7092368041092216, + "grad_norm": 0.9547025561332703, + "learning_rate": 5.1482840935642765e-06, + "loss": 0.813, + "step": 267540 + }, + { + "epoch": 1.7093006912589601, + "grad_norm": 0.8429440855979919, + "learning_rate": 5.146066698862745e-06, + "loss": 0.7106, + "step": 267550 + }, + { + "epoch": 1.7093645784086988, + "grad_norm": 0.9609322547912598, + "learning_rate": 5.143849755879138e-06, + "loss": 1.0052, + "step": 267560 + }, + { + "epoch": 1.7094284655584375, + "grad_norm": 0.7594335079193115, + "learning_rate": 5.141633264635765e-06, + "loss": 1.0813, + "step": 267570 + }, + { + "epoch": 1.7094923527081762, + "grad_norm": 0.7355329990386963, + "learning_rate": 5.1394172251549575e-06, + "loss": 0.7209, + "step": 267580 + }, + { + "epoch": 1.709556239857915, + "grad_norm": 1.0001919269561768, + "learning_rate": 5.137201637459016e-06, + "loss": 0.7211, + "step": 267590 + }, + { + "epoch": 1.7096201270076536, + "grad_norm": 1.5241398811340332, + "learning_rate": 5.134986501570283e-06, + "loss": 0.8565, + "step": 267600 + }, + { + "epoch": 1.7096840141573924, + "grad_norm": 1.0557806491851807, + "learning_rate": 5.1327718175110336e-06, + "loss": 0.8543, + "step": 267610 + }, + 
{ + "epoch": 1.709747901307131, + "grad_norm": 0.9856024980545044, + "learning_rate": 5.130557585303602e-06, + "loss": 0.8925, + "step": 267620 + }, + { + "epoch": 1.7098117884568698, + "grad_norm": 1.1883968114852905, + "learning_rate": 5.128343804970265e-06, + "loss": 0.8502, + "step": 267630 + }, + { + "epoch": 1.7098756756066085, + "grad_norm": 0.9160344004631042, + "learning_rate": 5.126130476533331e-06, + "loss": 0.761, + "step": 267640 + }, + { + "epoch": 1.7099395627563472, + "grad_norm": 0.901768147945404, + "learning_rate": 5.123917600015071e-06, + "loss": 1.0517, + "step": 267650 + }, + { + "epoch": 1.7100034499060859, + "grad_norm": 0.9727572202682495, + "learning_rate": 5.121705175437802e-06, + "loss": 0.8152, + "step": 267660 + }, + { + "epoch": 1.7100673370558246, + "grad_norm": 0.9629524350166321, + "learning_rate": 5.119493202823772e-06, + "loss": 0.7479, + "step": 267670 + }, + { + "epoch": 1.7101312242055633, + "grad_norm": 0.6062182188034058, + "learning_rate": 5.1172816821952855e-06, + "loss": 0.7897, + "step": 267680 + }, + { + "epoch": 1.710195111355302, + "grad_norm": 1.1127554178237915, + "learning_rate": 5.115070613574591e-06, + "loss": 0.9108, + "step": 267690 + }, + { + "epoch": 1.7102589985050407, + "grad_norm": 1.275471806526184, + "learning_rate": 5.112859996983965e-06, + "loss": 0.7668, + "step": 267700 + }, + { + "epoch": 1.7103228856547794, + "grad_norm": 1.0158495903015137, + "learning_rate": 5.110649832445685e-06, + "loss": 1.028, + "step": 267710 + }, + { + "epoch": 1.7103867728045181, + "grad_norm": 0.7695732116699219, + "learning_rate": 5.1084401199819835e-06, + "loss": 0.8964, + "step": 267720 + }, + { + "epoch": 1.7104506599542568, + "grad_norm": 1.0707507133483887, + "learning_rate": 5.106230859615135e-06, + "loss": 0.7492, + "step": 267730 + }, + { + "epoch": 1.7105145471039955, + "grad_norm": 0.9138712286949158, + "learning_rate": 5.1040220513673745e-06, + "loss": 0.8334, + "step": 267740 + }, + { + "epoch": 
1.7105784342537342, + "grad_norm": 0.6058676242828369, + "learning_rate": 5.101813695260959e-06, + "loss": 0.6874, + "step": 267750 + }, + { + "epoch": 1.710642321403473, + "grad_norm": 0.9319490790367126, + "learning_rate": 5.099605791318113e-06, + "loss": 0.6997, + "step": 267760 + }, + { + "epoch": 1.7107062085532116, + "grad_norm": 1.1302191019058228, + "learning_rate": 5.097398339561088e-06, + "loss": 1.0276, + "step": 267770 + }, + { + "epoch": 1.7107700957029504, + "grad_norm": 0.9411876201629639, + "learning_rate": 5.095191340012101e-06, + "loss": 0.9568, + "step": 267780 + }, + { + "epoch": 1.710833982852689, + "grad_norm": 1.1420869827270508, + "learning_rate": 5.092984792693406e-06, + "loss": 1.1395, + "step": 267790 + }, + { + "epoch": 1.7108978700024275, + "grad_norm": 0.8047879338264465, + "learning_rate": 5.090778697627185e-06, + "loss": 0.912, + "step": 267800 + }, + { + "epoch": 1.7109617571521665, + "grad_norm": 4.064144611358643, + "learning_rate": 5.0885730548356765e-06, + "loss": 0.7981, + "step": 267810 + }, + { + "epoch": 1.711025644301905, + "grad_norm": 0.9391950964927673, + "learning_rate": 5.086367864341096e-06, + "loss": 0.9571, + "step": 267820 + }, + { + "epoch": 1.7110895314516439, + "grad_norm": 0.7875942587852478, + "learning_rate": 5.084163126165642e-06, + "loss": 0.96, + "step": 267830 + }, + { + "epoch": 1.7111534186013824, + "grad_norm": 0.9419967532157898, + "learning_rate": 5.08195884033153e-06, + "loss": 0.6922, + "step": 267840 + }, + { + "epoch": 1.7112173057511213, + "grad_norm": 0.8862299919128418, + "learning_rate": 5.079755006860943e-06, + "loss": 0.6511, + "step": 267850 + }, + { + "epoch": 1.7112811929008598, + "grad_norm": 0.6669632792472839, + "learning_rate": 5.077551625776089e-06, + "loss": 1.0208, + "step": 267860 + }, + { + "epoch": 1.7113450800505987, + "grad_norm": 0.6567518711090088, + "learning_rate": 5.075348697099152e-06, + "loss": 0.8282, + "step": 267870 + }, + { + "epoch": 1.7114089672003372, + 
"grad_norm": 0.8719609379768372, + "learning_rate": 5.073146220852326e-06, + "loss": 0.7571, + "step": 267880 + }, + { + "epoch": 1.7114728543500761, + "grad_norm": 1.4976168870925903, + "learning_rate": 5.0709441970577695e-06, + "loss": 0.897, + "step": 267890 + }, + { + "epoch": 1.7115367414998146, + "grad_norm": 4.700047016143799, + "learning_rate": 5.068742625737694e-06, + "loss": 1.1389, + "step": 267900 + }, + { + "epoch": 1.7116006286495535, + "grad_norm": 0.707741916179657, + "learning_rate": 5.066541506914235e-06, + "loss": 0.7893, + "step": 267910 + }, + { + "epoch": 1.711664515799292, + "grad_norm": 0.6900407075881958, + "learning_rate": 5.064340840609588e-06, + "loss": 0.7887, + "step": 267920 + }, + { + "epoch": 1.711728402949031, + "grad_norm": 1.4416577816009521, + "learning_rate": 5.06214062684589e-06, + "loss": 0.7995, + "step": 267930 + }, + { + "epoch": 1.7117922900987694, + "grad_norm": 0.8847475647926331, + "learning_rate": 5.059940865645324e-06, + "loss": 0.6996, + "step": 267940 + }, + { + "epoch": 1.7118561772485084, + "grad_norm": 1.253787636756897, + "learning_rate": 5.057741557030027e-06, + "loss": 0.9299, + "step": 267950 + }, + { + "epoch": 1.7119200643982468, + "grad_norm": 0.7740673422813416, + "learning_rate": 5.055542701022159e-06, + "loss": 0.7667, + "step": 267960 + }, + { + "epoch": 1.7119839515479858, + "grad_norm": 1.0420721769332886, + "learning_rate": 5.053344297643847e-06, + "loss": 0.9228, + "step": 267970 + }, + { + "epoch": 1.7120478386977243, + "grad_norm": 0.8889881372451782, + "learning_rate": 5.051146346917257e-06, + "loss": 1.0123, + "step": 267980 + }, + { + "epoch": 1.7121117258474632, + "grad_norm": 1.4052612781524658, + "learning_rate": 5.048948848864499e-06, + "loss": 0.9129, + "step": 267990 + }, + { + "epoch": 1.7121756129972017, + "grad_norm": 1.0282213687896729, + "learning_rate": 5.046751803507721e-06, + "loss": 0.8117, + "step": 268000 + }, + { + "epoch": 1.7122395001469406, + "grad_norm": 
1.0799697637557983, + "learning_rate": 5.0445552108690345e-06, + "loss": 0.8317, + "step": 268010 + }, + { + "epoch": 1.712303387296679, + "grad_norm": 0.5775130987167358, + "learning_rate": 5.042359070970564e-06, + "loss": 0.7675, + "step": 268020 + }, + { + "epoch": 1.712367274446418, + "grad_norm": 0.9590504765510559, + "learning_rate": 5.0401633838344506e-06, + "loss": 0.9477, + "step": 268030 + }, + { + "epoch": 1.7124311615961565, + "grad_norm": 0.8965784907341003, + "learning_rate": 5.037968149482769e-06, + "loss": 0.8345, + "step": 268040 + }, + { + "epoch": 1.7124950487458952, + "grad_norm": 0.86485755443573, + "learning_rate": 5.035773367937663e-06, + "loss": 0.9179, + "step": 268050 + }, + { + "epoch": 1.712558935895634, + "grad_norm": 1.9492807388305664, + "learning_rate": 5.033579039221204e-06, + "loss": 0.7574, + "step": 268060 + }, + { + "epoch": 1.7126228230453726, + "grad_norm": 0.7462204694747925, + "learning_rate": 5.031385163355518e-06, + "loss": 0.9084, + "step": 268070 + }, + { + "epoch": 1.7126867101951113, + "grad_norm": 0.8131275177001953, + "learning_rate": 5.029191740362677e-06, + "loss": 0.8372, + "step": 268080 + }, + { + "epoch": 1.71275059734485, + "grad_norm": 1.006062388420105, + "learning_rate": 5.02699877026479e-06, + "loss": 0.9251, + "step": 268090 + }, + { + "epoch": 1.7128144844945887, + "grad_norm": 0.8451302647590637, + "learning_rate": 5.024806253083919e-06, + "loss": 0.9558, + "step": 268100 + }, + { + "epoch": 1.7128783716443274, + "grad_norm": 1.225652813911438, + "learning_rate": 5.022614188842173e-06, + "loss": 0.7124, + "step": 268110 + }, + { + "epoch": 1.7129422587940661, + "grad_norm": 0.7671294212341309, + "learning_rate": 5.020422577561601e-06, + "loss": 0.8769, + "step": 268120 + }, + { + "epoch": 1.7130061459438048, + "grad_norm": 2.2277419567108154, + "learning_rate": 5.0182314192642965e-06, + "loss": 0.8576, + "step": 268130 + }, + { + "epoch": 1.7130700330935436, + "grad_norm": 1.1126054525375366, + 
"learning_rate": 5.016040713972309e-06, + "loss": 0.9303, + "step": 268140 + }, + { + "epoch": 1.7131339202432823, + "grad_norm": 1.0343739986419678, + "learning_rate": 5.013850461707714e-06, + "loss": 0.9315, + "step": 268150 + }, + { + "epoch": 1.713197807393021, + "grad_norm": 1.4181034564971924, + "learning_rate": 5.011660662492557e-06, + "loss": 0.8053, + "step": 268160 + }, + { + "epoch": 1.7132616945427597, + "grad_norm": 0.5171089768409729, + "learning_rate": 5.009471316348902e-06, + "loss": 0.9135, + "step": 268170 + }, + { + "epoch": 1.7133255816924984, + "grad_norm": 0.8013474941253662, + "learning_rate": 5.007282423298787e-06, + "loss": 0.8237, + "step": 268180 + }, + { + "epoch": 1.713389468842237, + "grad_norm": 2.410977602005005, + "learning_rate": 5.005093983364273e-06, + "loss": 0.8935, + "step": 268190 + }, + { + "epoch": 1.7134533559919758, + "grad_norm": 1.1010009050369263, + "learning_rate": 5.002905996567381e-06, + "loss": 0.7858, + "step": 268200 + }, + { + "epoch": 1.7135172431417145, + "grad_norm": 1.1124086380004883, + "learning_rate": 5.00071846293016e-06, + "loss": 0.685, + "step": 268210 + }, + { + "epoch": 1.7135811302914532, + "grad_norm": 0.676537036895752, + "learning_rate": 4.998531382474625e-06, + "loss": 0.789, + "step": 268220 + }, + { + "epoch": 1.713645017441192, + "grad_norm": 0.8748432397842407, + "learning_rate": 4.996344755222809e-06, + "loss": 0.715, + "step": 268230 + }, + { + "epoch": 1.7137089045909306, + "grad_norm": 1.8939337730407715, + "learning_rate": 4.994158581196745e-06, + "loss": 0.7261, + "step": 268240 + }, + { + "epoch": 1.7137727917406693, + "grad_norm": 0.611475944519043, + "learning_rate": 4.991972860418431e-06, + "loss": 1.2012, + "step": 268250 + }, + { + "epoch": 1.713836678890408, + "grad_norm": 0.9227023720741272, + "learning_rate": 4.9897875929099005e-06, + "loss": 1.025, + "step": 268260 + }, + { + "epoch": 1.7139005660401467, + "grad_norm": 1.0684404373168945, + "learning_rate": 
4.987602778693146e-06, + "loss": 0.8479, + "step": 268270 + }, + { + "epoch": 1.7139644531898854, + "grad_norm": 0.8674401044845581, + "learning_rate": 4.9854184177901716e-06, + "loss": 0.8763, + "step": 268280 + }, + { + "epoch": 1.714028340339624, + "grad_norm": 0.8571915626525879, + "learning_rate": 4.983234510222967e-06, + "loss": 0.7797, + "step": 268290 + }, + { + "epoch": 1.7140922274893629, + "grad_norm": 0.41056373715400696, + "learning_rate": 4.981051056013547e-06, + "loss": 0.7954, + "step": 268300 + }, + { + "epoch": 1.7141561146391013, + "grad_norm": 1.1498006582260132, + "learning_rate": 4.978868055183877e-06, + "loss": 0.9702, + "step": 268310 + }, + { + "epoch": 1.7142200017888403, + "grad_norm": 2.003131628036499, + "learning_rate": 4.976685507755969e-06, + "loss": 0.7933, + "step": 268320 + }, + { + "epoch": 1.7142838889385787, + "grad_norm": 0.7004699110984802, + "learning_rate": 4.974503413751774e-06, + "loss": 0.7621, + "step": 268330 + }, + { + "epoch": 1.7143477760883177, + "grad_norm": 2.447214365005493, + "learning_rate": 4.9723217731932894e-06, + "loss": 1.0202, + "step": 268340 + }, + { + "epoch": 1.7144116632380562, + "grad_norm": 1.0208044052124023, + "learning_rate": 4.970140586102484e-06, + "loss": 0.9783, + "step": 268350 + }, + { + "epoch": 1.714475550387795, + "grad_norm": 0.8741488456726074, + "learning_rate": 4.967959852501308e-06, + "loss": 0.9445, + "step": 268360 + }, + { + "epoch": 1.7145394375375336, + "grad_norm": 1.2652167081832886, + "learning_rate": 4.965779572411744e-06, + "loss": 0.8427, + "step": 268370 + }, + { + "epoch": 1.7146033246872725, + "grad_norm": 0.9176733493804932, + "learning_rate": 4.963817708101692e-06, + "loss": 1.1657, + "step": 268380 + }, + { + "epoch": 1.714667211837011, + "grad_norm": 1.2837026119232178, + "learning_rate": 4.9616382897446634e-06, + "loss": 0.8361, + "step": 268390 + }, + { + "epoch": 1.71473109898675, + "grad_norm": 1.0069950819015503, + "learning_rate": 4.959459324962895e-06, + 
"loss": 0.8575, + "step": 268400 + }, + { + "epoch": 1.7147949861364884, + "grad_norm": 1.258731722831726, + "learning_rate": 4.9572808137783425e-06, + "loss": 0.8353, + "step": 268410 + }, + { + "epoch": 1.7148588732862273, + "grad_norm": 0.8530985116958618, + "learning_rate": 4.955102756212937e-06, + "loss": 0.9056, + "step": 268420 + }, + { + "epoch": 1.7149227604359658, + "grad_norm": 1.0610921382904053, + "learning_rate": 4.952925152288623e-06, + "loss": 0.9901, + "step": 268430 + }, + { + "epoch": 1.7149866475857047, + "grad_norm": 1.0828012228012085, + "learning_rate": 4.950748002027311e-06, + "loss": 0.7618, + "step": 268440 + }, + { + "epoch": 1.7150505347354432, + "grad_norm": 0.7953183650970459, + "learning_rate": 4.948571305450938e-06, + "loss": 0.9028, + "step": 268450 + }, + { + "epoch": 1.7151144218851821, + "grad_norm": 1.4064747095108032, + "learning_rate": 4.946395062581438e-06, + "loss": 0.9099, + "step": 268460 + }, + { + "epoch": 1.7151783090349206, + "grad_norm": 1.049475073814392, + "learning_rate": 4.944219273440709e-06, + "loss": 0.7286, + "step": 268470 + }, + { + "epoch": 1.7152421961846596, + "grad_norm": 0.9824262261390686, + "learning_rate": 4.942043938050678e-06, + "loss": 1.0499, + "step": 268480 + }, + { + "epoch": 1.715306083334398, + "grad_norm": 0.9291768074035645, + "learning_rate": 4.939869056433233e-06, + "loss": 0.8185, + "step": 268490 + }, + { + "epoch": 1.715369970484137, + "grad_norm": 2.494187831878662, + "learning_rate": 4.937694628610301e-06, + "loss": 0.9579, + "step": 268500 + }, + { + "epoch": 1.7154338576338755, + "grad_norm": 0.866105318069458, + "learning_rate": 4.935520654603759e-06, + "loss": 0.7383, + "step": 268510 + }, + { + "epoch": 1.7154977447836144, + "grad_norm": 0.681208610534668, + "learning_rate": 4.933347134435523e-06, + "loss": 0.7087, + "step": 268520 + }, + { + "epoch": 1.7155616319333529, + "grad_norm": 0.7366973757743835, + "learning_rate": 4.931174068127459e-06, + "loss": 0.7085, + "step": 
268530 + }, + { + "epoch": 1.7156255190830916, + "grad_norm": 0.9923181533813477, + "learning_rate": 4.929001455701471e-06, + "loss": 0.6017, + "step": 268540 + }, + { + "epoch": 1.7156894062328303, + "grad_norm": 0.7506087422370911, + "learning_rate": 4.926829297179419e-06, + "loss": 0.7351, + "step": 268550 + }, + { + "epoch": 1.715753293382569, + "grad_norm": 0.7987860441207886, + "learning_rate": 4.924657592583198e-06, + "loss": 0.9795, + "step": 268560 + }, + { + "epoch": 1.7158171805323077, + "grad_norm": 0.9374541640281677, + "learning_rate": 4.922486341934662e-06, + "loss": 0.9364, + "step": 268570 + }, + { + "epoch": 1.7158810676820464, + "grad_norm": 1.0733833312988281, + "learning_rate": 4.920315545255699e-06, + "loss": 0.768, + "step": 268580 + }, + { + "epoch": 1.715944954831785, + "grad_norm": 0.8189911842346191, + "learning_rate": 4.918145202568147e-06, + "loss": 0.9444, + "step": 268590 + }, + { + "epoch": 1.7160088419815238, + "grad_norm": 1.0195735692977905, + "learning_rate": 4.915975313893884e-06, + "loss": 0.8306, + "step": 268600 + }, + { + "epoch": 1.7160727291312625, + "grad_norm": 0.9979621171951294, + "learning_rate": 4.913805879254746e-06, + "loss": 1.1678, + "step": 268610 + }, + { + "epoch": 1.7161366162810012, + "grad_norm": 0.8876195549964905, + "learning_rate": 4.911636898672589e-06, + "loss": 0.8957, + "step": 268620 + }, + { + "epoch": 1.71620050343074, + "grad_norm": 5.677492618560791, + "learning_rate": 4.909468372169251e-06, + "loss": 0.8721, + "step": 268630 + }, + { + "epoch": 1.7162643905804786, + "grad_norm": 0.6202548742294312, + "learning_rate": 4.907300299766588e-06, + "loss": 0.7232, + "step": 268640 + }, + { + "epoch": 1.7163282777302173, + "grad_norm": 0.7246150374412537, + "learning_rate": 4.905132681486407e-06, + "loss": 0.7844, + "step": 268650 + }, + { + "epoch": 1.716392164879956, + "grad_norm": 1.0750226974487305, + "learning_rate": 4.902965517350555e-06, + "loss": 1.0428, + "step": 268660 + }, + { + "epoch": 
1.7164560520296948, + "grad_norm": 0.8959199786186218, + "learning_rate": 4.9007988073808635e-06, + "loss": 0.9407, + "step": 268670 + }, + { + "epoch": 1.7165199391794335, + "grad_norm": 0.5611230134963989, + "learning_rate": 4.898632551599136e-06, + "loss": 0.8953, + "step": 268680 + }, + { + "epoch": 1.7165838263291722, + "grad_norm": 1.1511446237564087, + "learning_rate": 4.896466750027206e-06, + "loss": 0.8638, + "step": 268690 + }, + { + "epoch": 1.7166477134789109, + "grad_norm": 1.0932697057724, + "learning_rate": 4.894301402686868e-06, + "loss": 0.8832, + "step": 268700 + }, + { + "epoch": 1.7167116006286496, + "grad_norm": 1.048201322555542, + "learning_rate": 4.892136509599943e-06, + "loss": 0.824, + "step": 268710 + }, + { + "epoch": 1.7167754877783883, + "grad_norm": 1.5334866046905518, + "learning_rate": 4.889972070788218e-06, + "loss": 0.8154, + "step": 268720 + }, + { + "epoch": 1.716839374928127, + "grad_norm": 0.6228106617927551, + "learning_rate": 4.88780808627351e-06, + "loss": 0.7755, + "step": 268730 + }, + { + "epoch": 1.7169032620778657, + "grad_norm": 1.0742732286453247, + "learning_rate": 4.8856445560775955e-06, + "loss": 0.7916, + "step": 268740 + }, + { + "epoch": 1.7169671492276044, + "grad_norm": 0.5827720165252686, + "learning_rate": 4.88348148022228e-06, + "loss": 0.7855, + "step": 268750 + }, + { + "epoch": 1.7170310363773431, + "grad_norm": 0.817838191986084, + "learning_rate": 4.881318858729322e-06, + "loss": 0.8654, + "step": 268760 + }, + { + "epoch": 1.7170949235270818, + "grad_norm": 1.219152569770813, + "learning_rate": 4.879156691620529e-06, + "loss": 1.1188, + "step": 268770 + }, + { + "epoch": 1.7171588106768203, + "grad_norm": 0.955420732498169, + "learning_rate": 4.876994978917654e-06, + "loss": 0.8037, + "step": 268780 + }, + { + "epoch": 1.7172226978265592, + "grad_norm": 1.0800939798355103, + "learning_rate": 4.874833720642485e-06, + "loss": 0.9102, + "step": 268790 + }, + { + "epoch": 1.7172865849762977, + 
"grad_norm": 1.5440417528152466, + "learning_rate": 4.872672916816767e-06, + "loss": 0.8647, + "step": 268800 + }, + { + "epoch": 1.7173504721260366, + "grad_norm": 1.195446252822876, + "learning_rate": 4.8705125674622875e-06, + "loss": 0.8166, + "step": 268810 + }, + { + "epoch": 1.7174143592757751, + "grad_norm": 1.2400975227355957, + "learning_rate": 4.8683526726007786e-06, + "loss": 0.9157, + "step": 268820 + }, + { + "epoch": 1.717478246425514, + "grad_norm": 1.1519098281860352, + "learning_rate": 4.866193232254013e-06, + "loss": 0.8856, + "step": 268830 + }, + { + "epoch": 1.7175421335752525, + "grad_norm": 1.1491273641586304, + "learning_rate": 4.864034246443716e-06, + "loss": 0.8573, + "step": 268840 + }, + { + "epoch": 1.7176060207249915, + "grad_norm": 1.2674658298492432, + "learning_rate": 4.861875715191655e-06, + "loss": 1.0786, + "step": 268850 + }, + { + "epoch": 1.71766990787473, + "grad_norm": 0.8837629556655884, + "learning_rate": 4.8597176385195455e-06, + "loss": 0.8156, + "step": 268860 + }, + { + "epoch": 1.7177337950244689, + "grad_norm": 1.1899993419647217, + "learning_rate": 4.857560016449125e-06, + "loss": 0.7566, + "step": 268870 + }, + { + "epoch": 1.7177976821742074, + "grad_norm": 0.659336268901825, + "learning_rate": 4.8554028490021445e-06, + "loss": 0.8262, + "step": 268880 + }, + { + "epoch": 1.7178615693239463, + "grad_norm": 0.8648936152458191, + "learning_rate": 4.853246136200301e-06, + "loss": 0.8686, + "step": 268890 + }, + { + "epoch": 1.7179254564736848, + "grad_norm": 1.1850214004516602, + "learning_rate": 4.85108987806534e-06, + "loss": 0.7712, + "step": 268900 + }, + { + "epoch": 1.7179893436234237, + "grad_norm": 1.073351502418518, + "learning_rate": 4.84893407461896e-06, + "loss": 1.0364, + "step": 268910 + }, + { + "epoch": 1.7180532307731622, + "grad_norm": 0.8370411992073059, + "learning_rate": 4.846778725882878e-06, + "loss": 0.7372, + "step": 268920 + }, + { + "epoch": 1.7181171179229011, + "grad_norm": 
0.6169006824493408, + "learning_rate": 4.8446238318787805e-06, + "loss": 0.7435, + "step": 268930 + }, + { + "epoch": 1.7181810050726396, + "grad_norm": 1.106313705444336, + "learning_rate": 4.842469392628402e-06, + "loss": 0.9299, + "step": 268940 + }, + { + "epoch": 1.7182448922223785, + "grad_norm": 1.41398024559021, + "learning_rate": 4.840315408153412e-06, + "loss": 0.9412, + "step": 268950 + }, + { + "epoch": 1.718308779372117, + "grad_norm": 1.2122138738632202, + "learning_rate": 4.838161878475528e-06, + "loss": 0.9142, + "step": 268960 + }, + { + "epoch": 1.718372666521856, + "grad_norm": 1.6231672763824463, + "learning_rate": 4.836008803616409e-06, + "loss": 0.7854, + "step": 268970 + }, + { + "epoch": 1.7184365536715944, + "grad_norm": 0.95048987865448, + "learning_rate": 4.833856183597757e-06, + "loss": 0.7186, + "step": 268980 + }, + { + "epoch": 1.7185004408213334, + "grad_norm": 1.2387018203735352, + "learning_rate": 4.831704018441252e-06, + "loss": 0.8779, + "step": 268990 + }, + { + "epoch": 1.7185643279710718, + "grad_norm": 1.0356347560882568, + "learning_rate": 4.829552308168561e-06, + "loss": 0.706, + "step": 269000 + }, + { + "epoch": 1.7186282151208108, + "grad_norm": 1.0229053497314453, + "learning_rate": 4.8274010528013615e-06, + "loss": 0.782, + "step": 269010 + }, + { + "epoch": 1.7186921022705492, + "grad_norm": 0.9773396849632263, + "learning_rate": 4.8252502523613076e-06, + "loss": 0.785, + "step": 269020 + }, + { + "epoch": 1.718755989420288, + "grad_norm": 0.7615492939949036, + "learning_rate": 4.823099906870071e-06, + "loss": 1.1049, + "step": 269030 + }, + { + "epoch": 1.7188198765700267, + "grad_norm": 0.9052096605300903, + "learning_rate": 4.820950016349296e-06, + "loss": 0.7574, + "step": 269040 + }, + { + "epoch": 1.7188837637197654, + "grad_norm": 0.8551297187805176, + "learning_rate": 4.818800580820642e-06, + "loss": 1.1223, + "step": 269050 + }, + { + "epoch": 1.718947650869504, + "grad_norm": 0.8088269829750061, + 
"learning_rate": 4.816651600305755e-06, + "loss": 0.7293, + "step": 269060 + }, + { + "epoch": 1.7190115380192428, + "grad_norm": 0.7828858494758606, + "learning_rate": 4.8145030748262766e-06, + "loss": 0.7911, + "step": 269070 + }, + { + "epoch": 1.7190754251689815, + "grad_norm": 0.9467841386795044, + "learning_rate": 4.8123550044038345e-06, + "loss": 0.859, + "step": 269080 + }, + { + "epoch": 1.7191393123187202, + "grad_norm": 1.5727406740188599, + "learning_rate": 4.810207389060084e-06, + "loss": 1.0195, + "step": 269090 + }, + { + "epoch": 1.719203199468459, + "grad_norm": 1.2755558490753174, + "learning_rate": 4.80806022881663e-06, + "loss": 1.0695, + "step": 269100 + }, + { + "epoch": 1.7192670866181976, + "grad_norm": 1.217269778251648, + "learning_rate": 4.80591352369511e-06, + "loss": 0.7818, + "step": 269110 + }, + { + "epoch": 1.7193309737679363, + "grad_norm": 0.8727430701255798, + "learning_rate": 4.803767273717136e-06, + "loss": 0.5781, + "step": 269120 + }, + { + "epoch": 1.719394860917675, + "grad_norm": 1.093943476676941, + "learning_rate": 4.801621478904333e-06, + "loss": 0.9036, + "step": 269130 + }, + { + "epoch": 1.7194587480674137, + "grad_norm": 0.843802809715271, + "learning_rate": 4.799476139278291e-06, + "loss": 0.859, + "step": 269140 + }, + { + "epoch": 1.7195226352171524, + "grad_norm": 1.0293015241622925, + "learning_rate": 4.797331254860643e-06, + "loss": 0.9742, + "step": 269150 + }, + { + "epoch": 1.7195865223668911, + "grad_norm": 1.540500283241272, + "learning_rate": 4.7951868256729645e-06, + "loss": 1.136, + "step": 269160 + }, + { + "epoch": 1.7196504095166298, + "grad_norm": 1.239723801612854, + "learning_rate": 4.793042851736867e-06, + "loss": 0.6302, + "step": 269170 + }, + { + "epoch": 1.7197142966663685, + "grad_norm": 0.7998931407928467, + "learning_rate": 4.7908993330739334e-06, + "loss": 0.9544, + "step": 269180 + }, + { + "epoch": 1.7197781838161073, + "grad_norm": 0.8161479234695435, + "learning_rate": 
4.7887562697057575e-06, + "loss": 0.7468, + "step": 269190 + }, + { + "epoch": 1.719842070965846, + "grad_norm": 1.1992876529693604, + "learning_rate": 4.786613661653922e-06, + "loss": 0.8672, + "step": 269200 + }, + { + "epoch": 1.7199059581155847, + "grad_norm": 1.1844300031661987, + "learning_rate": 4.7844715089399974e-06, + "loss": 0.9564, + "step": 269210 + }, + { + "epoch": 1.7199698452653234, + "grad_norm": 0.971801221370697, + "learning_rate": 4.782329811585567e-06, + "loss": 0.7125, + "step": 269220 + }, + { + "epoch": 1.720033732415062, + "grad_norm": 0.8483443260192871, + "learning_rate": 4.780188569612187e-06, + "loss": 0.9674, + "step": 269230 + }, + { + "epoch": 1.7200976195648008, + "grad_norm": 1.1602776050567627, + "learning_rate": 4.7780477830414394e-06, + "loss": 1.4011, + "step": 269240 + }, + { + "epoch": 1.7201615067145395, + "grad_norm": 1.1706547737121582, + "learning_rate": 4.775907451894862e-06, + "loss": 0.9959, + "step": 269250 + }, + { + "epoch": 1.7202253938642782, + "grad_norm": 1.2372910976409912, + "learning_rate": 4.7737675761940324e-06, + "loss": 0.8608, + "step": 269260 + }, + { + "epoch": 1.7202892810140167, + "grad_norm": 1.0723450183868408, + "learning_rate": 4.771628155960478e-06, + "loss": 0.9406, + "step": 269270 + }, + { + "epoch": 1.7203531681637556, + "grad_norm": 1.5556002855300903, + "learning_rate": 4.769489191215765e-06, + "loss": 0.7714, + "step": 269280 + }, + { + "epoch": 1.720417055313494, + "grad_norm": 2.422309160232544, + "learning_rate": 4.767350681981419e-06, + "loss": 0.8202, + "step": 269290 + }, + { + "epoch": 1.720480942463233, + "grad_norm": 0.886701762676239, + "learning_rate": 4.7652126282789925e-06, + "loss": 0.9533, + "step": 269300 + }, + { + "epoch": 1.7205448296129715, + "grad_norm": 0.765505850315094, + "learning_rate": 4.763075030129999e-06, + "loss": 0.8945, + "step": 269310 + }, + { + "epoch": 1.7206087167627104, + "grad_norm": 1.1220277547836304, + "learning_rate": 4.760937887555983e-06, + 
"loss": 0.8573, + "step": 269320 + }, + { + "epoch": 1.720672603912449, + "grad_norm": 0.831519365310669, + "learning_rate": 4.75880120057845e-06, + "loss": 0.9408, + "step": 269330 + }, + { + "epoch": 1.7207364910621878, + "grad_norm": 1.3372504711151123, + "learning_rate": 4.756664969218938e-06, + "loss": 0.7414, + "step": 269340 + }, + { + "epoch": 1.7208003782119263, + "grad_norm": 1.2090286016464233, + "learning_rate": 4.754529193498947e-06, + "loss": 0.8006, + "step": 269350 + }, + { + "epoch": 1.7208642653616653, + "grad_norm": 0.7840335965156555, + "learning_rate": 4.752393873439992e-06, + "loss": 1.1769, + "step": 269360 + }, + { + "epoch": 1.7209281525114037, + "grad_norm": 0.844284176826477, + "learning_rate": 4.750259009063568e-06, + "loss": 0.7698, + "step": 269370 + }, + { + "epoch": 1.7209920396611427, + "grad_norm": 1.0220428705215454, + "learning_rate": 4.748124600391191e-06, + "loss": 0.9851, + "step": 269380 + }, + { + "epoch": 1.7210559268108812, + "grad_norm": 0.8599318265914917, + "learning_rate": 4.745990647444354e-06, + "loss": 0.7958, + "step": 269390 + }, + { + "epoch": 1.72111981396062, + "grad_norm": 0.9162972569465637, + "learning_rate": 4.743857150244524e-06, + "loss": 0.8317, + "step": 269400 + }, + { + "epoch": 1.7211837011103586, + "grad_norm": 0.9127917289733887, + "learning_rate": 4.741724108813217e-06, + "loss": 1.0748, + "step": 269410 + }, + { + "epoch": 1.7212475882600975, + "grad_norm": 0.7067804932594299, + "learning_rate": 4.739591523171894e-06, + "loss": 0.7738, + "step": 269420 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 1.1091209650039673, + "learning_rate": 4.7374593933420475e-06, + "loss": 0.6752, + "step": 269430 + }, + { + "epoch": 1.721375362559575, + "grad_norm": 1.3698811531066895, + "learning_rate": 4.735327719345134e-06, + "loss": 0.6906, + "step": 269440 + }, + { + "epoch": 1.7214392497093134, + "grad_norm": 0.8478043079376221, + "learning_rate": 4.733196501202641e-06, + "loss": 0.8665, + "step": 
269450 + }, + { + "epoch": 1.7215031368590523, + "grad_norm": 1.7327885627746582, + "learning_rate": 4.731065738936008e-06, + "loss": 1.2341, + "step": 269460 + }, + { + "epoch": 1.7215670240087908, + "grad_norm": 0.9931337833404541, + "learning_rate": 4.728935432566722e-06, + "loss": 1.0581, + "step": 269470 + }, + { + "epoch": 1.7216309111585297, + "grad_norm": 1.094643473625183, + "learning_rate": 4.726805582116206e-06, + "loss": 1.039, + "step": 269480 + }, + { + "epoch": 1.7216947983082682, + "grad_norm": 0.7432576417922974, + "learning_rate": 4.724676187605937e-06, + "loss": 0.75, + "step": 269490 + }, + { + "epoch": 1.721758685458007, + "grad_norm": 1.0011460781097412, + "learning_rate": 4.7225472490573355e-06, + "loss": 0.8445, + "step": 269500 + }, + { + "epoch": 1.7218225726077456, + "grad_norm": 1.158536434173584, + "learning_rate": 4.720418766491852e-06, + "loss": 1.1112, + "step": 269510 + }, + { + "epoch": 1.7218864597574843, + "grad_norm": 0.8979527354240417, + "learning_rate": 4.718290739930936e-06, + "loss": 1.0536, + "step": 269520 + }, + { + "epoch": 1.721950346907223, + "grad_norm": 1.2593528032302856, + "learning_rate": 4.7161631693959985e-06, + "loss": 0.9534, + "step": 269530 + }, + { + "epoch": 1.7220142340569617, + "grad_norm": 0.8251490592956543, + "learning_rate": 4.7140360549084825e-06, + "loss": 0.637, + "step": 269540 + }, + { + "epoch": 1.7220781212067005, + "grad_norm": 1.0493764877319336, + "learning_rate": 4.711909396489795e-06, + "loss": 0.7499, + "step": 269550 + }, + { + "epoch": 1.7221420083564392, + "grad_norm": 1.0737330913543701, + "learning_rate": 4.70978319416136e-06, + "loss": 0.8281, + "step": 269560 + }, + { + "epoch": 1.7222058955061779, + "grad_norm": 1.0529547929763794, + "learning_rate": 4.707657447944591e-06, + "loss": 0.6255, + "step": 269570 + }, + { + "epoch": 1.7222697826559166, + "grad_norm": 1.3771990537643433, + "learning_rate": 4.705532157860898e-06, + "loss": 1.1111, + "step": 269580 + }, + { + "epoch": 
1.7223336698056553, + "grad_norm": 0.7908219695091248, + "learning_rate": 4.7034073239316745e-06, + "loss": 0.8958, + "step": 269590 + }, + { + "epoch": 1.722397556955394, + "grad_norm": 0.8063262701034546, + "learning_rate": 4.701282946178332e-06, + "loss": 0.8439, + "step": 269600 + }, + { + "epoch": 1.7224614441051327, + "grad_norm": 0.8116989135742188, + "learning_rate": 4.699159024622252e-06, + "loss": 0.8012, + "step": 269610 + }, + { + "epoch": 1.7225253312548714, + "grad_norm": 1.1216683387756348, + "learning_rate": 4.69703555928484e-06, + "loss": 0.8178, + "step": 269620 + }, + { + "epoch": 1.72258921840461, + "grad_norm": 1.1037665605545044, + "learning_rate": 4.694912550187469e-06, + "loss": 1.1767, + "step": 269630 + }, + { + "epoch": 1.7226531055543488, + "grad_norm": 0.8387174606323242, + "learning_rate": 4.6927899973515265e-06, + "loss": 0.9968, + "step": 269640 + }, + { + "epoch": 1.7227169927040875, + "grad_norm": 0.756056010723114, + "learning_rate": 4.690667900798379e-06, + "loss": 0.7008, + "step": 269650 + }, + { + "epoch": 1.7227808798538262, + "grad_norm": 1.05203378200531, + "learning_rate": 4.688546260549409e-06, + "loss": 1.0279, + "step": 269660 + }, + { + "epoch": 1.722844767003565, + "grad_norm": 0.9116624593734741, + "learning_rate": 4.686425076625972e-06, + "loss": 0.7539, + "step": 269670 + }, + { + "epoch": 1.7229086541533036, + "grad_norm": 2.241201639175415, + "learning_rate": 4.684304349049445e-06, + "loss": 0.7772, + "step": 269680 + }, + { + "epoch": 1.7229725413030423, + "grad_norm": 0.8135285377502441, + "learning_rate": 4.682184077841168e-06, + "loss": 0.8999, + "step": 269690 + }, + { + "epoch": 1.723036428452781, + "grad_norm": 1.494868278503418, + "learning_rate": 4.68006426302251e-06, + "loss": 1.0202, + "step": 269700 + }, + { + "epoch": 1.7231003156025197, + "grad_norm": 2.18159556388855, + "learning_rate": 4.677944904614806e-06, + "loss": 0.9514, + "step": 269710 + }, + { + "epoch": 1.7231642027522585, + "grad_norm": 
1.3357326984405518, + "learning_rate": 4.675826002639405e-06, + "loss": 0.8269, + "step": 269720 + }, + { + "epoch": 1.7232280899019972, + "grad_norm": 1.1173053979873657, + "learning_rate": 4.673707557117657e-06, + "loss": 0.9594, + "step": 269730 + }, + { + "epoch": 1.7232919770517359, + "grad_norm": 1.1313519477844238, + "learning_rate": 4.671589568070872e-06, + "loss": 0.8786, + "step": 269740 + }, + { + "epoch": 1.7233558642014746, + "grad_norm": 0.8571352958679199, + "learning_rate": 4.669472035520411e-06, + "loss": 0.9914, + "step": 269750 + }, + { + "epoch": 1.723419751351213, + "grad_norm": 0.6213200688362122, + "learning_rate": 4.6673549594875684e-06, + "loss": 1.0952, + "step": 269760 + }, + { + "epoch": 1.723483638500952, + "grad_norm": 1.1289409399032593, + "learning_rate": 4.665238339993694e-06, + "loss": 0.7568, + "step": 269770 + }, + { + "epoch": 1.7235475256506905, + "grad_norm": 1.6800681352615356, + "learning_rate": 4.663122177060081e-06, + "loss": 0.8188, + "step": 269780 + }, + { + "epoch": 1.7236114128004294, + "grad_norm": 1.429936170578003, + "learning_rate": 4.661006470708052e-06, + "loss": 0.9426, + "step": 269790 + }, + { + "epoch": 1.7236752999501679, + "grad_norm": 1.010558843612671, + "learning_rate": 4.658891220958905e-06, + "loss": 1.11, + "step": 269800 + }, + { + "epoch": 1.7237391870999068, + "grad_norm": 0.7443026304244995, + "learning_rate": 4.656776427833964e-06, + "loss": 1.108, + "step": 269810 + }, + { + "epoch": 1.7238030742496453, + "grad_norm": 1.2368794679641724, + "learning_rate": 4.654662091354495e-06, + "loss": 0.8759, + "step": 269820 + }, + { + "epoch": 1.7238669613993842, + "grad_norm": 1.9988175630569458, + "learning_rate": 4.652548211541824e-06, + "loss": 0.8806, + "step": 269830 + }, + { + "epoch": 1.7239308485491227, + "grad_norm": 0.7888319492340088, + "learning_rate": 4.650434788417207e-06, + "loss": 0.8124, + "step": 269840 + }, + { + "epoch": 1.7239947356988616, + "grad_norm": 0.6178026795387268, + 
"learning_rate": 4.648321822001961e-06, + "loss": 0.8807, + "step": 269850 + }, + { + "epoch": 1.7240586228486001, + "grad_norm": 1.3354192972183228, + "learning_rate": 4.6462093123173345e-06, + "loss": 0.9558, + "step": 269860 + }, + { + "epoch": 1.724122509998339, + "grad_norm": 1.0010740756988525, + "learning_rate": 4.644097259384628e-06, + "loss": 0.6324, + "step": 269870 + }, + { + "epoch": 1.7241863971480775, + "grad_norm": 1.5502755641937256, + "learning_rate": 4.6419856632251015e-06, + "loss": 0.8098, + "step": 269880 + }, + { + "epoch": 1.7242502842978165, + "grad_norm": 1.1586580276489258, + "learning_rate": 4.639874523860011e-06, + "loss": 0.7353, + "step": 269890 + }, + { + "epoch": 1.724314171447555, + "grad_norm": 0.6563423275947571, + "learning_rate": 4.637763841310633e-06, + "loss": 0.7076, + "step": 269900 + }, + { + "epoch": 1.7243780585972939, + "grad_norm": 2.1584129333496094, + "learning_rate": 4.635653615598206e-06, + "loss": 0.8794, + "step": 269910 + }, + { + "epoch": 1.7244419457470324, + "grad_norm": 1.012452244758606, + "learning_rate": 4.63354384674401e-06, + "loss": 0.9036, + "step": 269920 + }, + { + "epoch": 1.7245058328967713, + "grad_norm": 1.451828122138977, + "learning_rate": 4.631434534769258e-06, + "loss": 1.0659, + "step": 269930 + }, + { + "epoch": 1.7245697200465098, + "grad_norm": 0.7395955920219421, + "learning_rate": 4.629325679695223e-06, + "loss": 0.7284, + "step": 269940 + }, + { + "epoch": 1.7246336071962487, + "grad_norm": 1.9066523313522339, + "learning_rate": 4.627217281543117e-06, + "loss": 0.7908, + "step": 269950 + }, + { + "epoch": 1.7246974943459872, + "grad_norm": 0.6626289486885071, + "learning_rate": 4.6251093403342e-06, + "loss": 0.8238, + "step": 269960 + }, + { + "epoch": 1.724761381495726, + "grad_norm": 1.87825345993042, + "learning_rate": 4.623001856089676e-06, + "loss": 1.1427, + "step": 269970 + }, + { + "epoch": 1.7248252686454646, + "grad_norm": 0.829743504524231, + "learning_rate": 
4.6208948288307855e-06, + "loss": 0.9348, + "step": 269980 + }, + { + "epoch": 1.7248891557952033, + "grad_norm": 0.6813519597053528, + "learning_rate": 4.618788258578738e-06, + "loss": 0.934, + "step": 269990 + }, + { + "epoch": 1.724953042944942, + "grad_norm": 0.6842412948608398, + "learning_rate": 4.616682145354756e-06, + "loss": 0.9531, + "step": 270000 + }, + { + "epoch": 1.7250169300946807, + "grad_norm": 1.5858936309814453, + "learning_rate": 4.614576489180045e-06, + "loss": 0.882, + "step": 270010 + }, + { + "epoch": 1.7250808172444194, + "grad_norm": 1.5098166465759277, + "learning_rate": 4.612471290075821e-06, + "loss": 0.8706, + "step": 270020 + }, + { + "epoch": 1.7251447043941581, + "grad_norm": 0.6777145266532898, + "learning_rate": 4.610366548063261e-06, + "loss": 0.7599, + "step": 270030 + }, + { + "epoch": 1.7252085915438968, + "grad_norm": 1.5332057476043701, + "learning_rate": 4.608262263163582e-06, + "loss": 1.1588, + "step": 270040 + }, + { + "epoch": 1.7252724786936355, + "grad_norm": 2.3782873153686523, + "learning_rate": 4.606158435397984e-06, + "loss": 0.7293, + "step": 270050 + }, + { + "epoch": 1.7253363658433742, + "grad_norm": 0.5773844718933105, + "learning_rate": 4.6040550647876264e-06, + "loss": 0.8931, + "step": 270060 + }, + { + "epoch": 1.725400252993113, + "grad_norm": 1.5350877046585083, + "learning_rate": 4.6019521513537226e-06, + "loss": 1.0427, + "step": 270070 + }, + { + "epoch": 1.7254641401428517, + "grad_norm": 1.2986094951629639, + "learning_rate": 4.59984969511742e-06, + "loss": 1.1083, + "step": 270080 + }, + { + "epoch": 1.7255280272925904, + "grad_norm": 1.510398268699646, + "learning_rate": 4.597747696099919e-06, + "loss": 1.1534, + "step": 270090 + }, + { + "epoch": 1.725591914442329, + "grad_norm": 1.6239967346191406, + "learning_rate": 4.59564615432237e-06, + "loss": 0.9943, + "step": 270100 + }, + { + "epoch": 1.7256558015920678, + "grad_norm": 1.4070252180099487, + "learning_rate": 4.5935450698059544e-06, + 
"loss": 0.927, + "step": 270110 + }, + { + "epoch": 1.7257196887418065, + "grad_norm": 1.0951735973358154, + "learning_rate": 4.591444442571813e-06, + "loss": 0.8948, + "step": 270120 + }, + { + "epoch": 1.7257835758915452, + "grad_norm": 1.776873230934143, + "learning_rate": 4.589344272641117e-06, + "loss": 0.8, + "step": 270130 + }, + { + "epoch": 1.725847463041284, + "grad_norm": 0.7132924795150757, + "learning_rate": 4.587244560035003e-06, + "loss": 0.8365, + "step": 270140 + }, + { + "epoch": 1.7259113501910226, + "grad_norm": 1.0497385263442993, + "learning_rate": 4.58514530477463e-06, + "loss": 1.0442, + "step": 270150 + }, + { + "epoch": 1.7259752373407613, + "grad_norm": 1.2083861827850342, + "learning_rate": 4.583046506881128e-06, + "loss": 1.1931, + "step": 270160 + }, + { + "epoch": 1.7260391244905, + "grad_norm": 1.1868774890899658, + "learning_rate": 4.580948166375642e-06, + "loss": 0.8016, + "step": 270170 + }, + { + "epoch": 1.7261030116402387, + "grad_norm": 0.8388951420783997, + "learning_rate": 4.578850283279296e-06, + "loss": 0.6478, + "step": 270180 + }, + { + "epoch": 1.7261668987899774, + "grad_norm": 1.142733097076416, + "learning_rate": 4.576752857613231e-06, + "loss": 1.1856, + "step": 270190 + }, + { + "epoch": 1.7262307859397161, + "grad_norm": 5.006933689117432, + "learning_rate": 4.574655889398554e-06, + "loss": 0.7749, + "step": 270200 + }, + { + "epoch": 1.7262946730894548, + "grad_norm": 1.5445818901062012, + "learning_rate": 4.572559378656399e-06, + "loss": 0.8089, + "step": 270210 + }, + { + "epoch": 1.7263585602391935, + "grad_norm": 0.9058724641799927, + "learning_rate": 4.570463325407859e-06, + "loss": 0.8403, + "step": 270220 + }, + { + "epoch": 1.726422447388932, + "grad_norm": 0.807124674320221, + "learning_rate": 4.568367729674067e-06, + "loss": 1.0011, + "step": 270230 + }, + { + "epoch": 1.726486334538671, + "grad_norm": 0.8807920217514038, + "learning_rate": 4.566272591476112e-06, + "loss": 0.9613, + "step": 270240 + }, 
+ { + "epoch": 1.7265502216884094, + "grad_norm": 1.1456222534179688, + "learning_rate": 4.564177910835088e-06, + "loss": 1.14, + "step": 270250 + }, + { + "epoch": 1.7266141088381484, + "grad_norm": 0.9585066437721252, + "learning_rate": 4.562083687772118e-06, + "loss": 0.9113, + "step": 270260 + }, + { + "epoch": 1.7266779959878868, + "grad_norm": 0.6454988718032837, + "learning_rate": 4.559989922308261e-06, + "loss": 0.8286, + "step": 270270 + }, + { + "epoch": 1.7267418831376258, + "grad_norm": 1.003688931465149, + "learning_rate": 4.557896614464624e-06, + "loss": 0.9706, + "step": 270280 + }, + { + "epoch": 1.7268057702873643, + "grad_norm": 0.7934787273406982, + "learning_rate": 4.555803764262273e-06, + "loss": 0.9545, + "step": 270290 + }, + { + "epoch": 1.7268696574371032, + "grad_norm": 1.3392540216445923, + "learning_rate": 4.553711371722308e-06, + "loss": 0.7657, + "step": 270300 + }, + { + "epoch": 1.7269335445868417, + "grad_norm": 0.7471747994422913, + "learning_rate": 4.551619436865767e-06, + "loss": 0.7288, + "step": 270310 + }, + { + "epoch": 1.7269974317365806, + "grad_norm": 2.051301956176758, + "learning_rate": 4.549527959713756e-06, + "loss": 0.6624, + "step": 270320 + }, + { + "epoch": 1.727061318886319, + "grad_norm": 1.2567614316940308, + "learning_rate": 4.547436940287303e-06, + "loss": 0.7613, + "step": 270330 + }, + { + "epoch": 1.727125206036058, + "grad_norm": 0.9878590106964111, + "learning_rate": 4.54534637860749e-06, + "loss": 1.0794, + "step": 270340 + }, + { + "epoch": 1.7271890931857965, + "grad_norm": 0.7812796831130981, + "learning_rate": 4.543256274695362e-06, + "loss": 0.6185, + "step": 270350 + }, + { + "epoch": 1.7272529803355354, + "grad_norm": 0.7838647365570068, + "learning_rate": 4.541166628571969e-06, + "loss": 1.0459, + "step": 270360 + }, + { + "epoch": 1.727316867485274, + "grad_norm": 0.5343107581138611, + "learning_rate": 4.53907744025836e-06, + "loss": 0.5513, + "step": 270370 + }, + { + "epoch": 
1.7273807546350128, + "grad_norm": 0.9320610761642456, + "learning_rate": 4.536988709775564e-06, + "loss": 0.9895, + "step": 270380 + }, + { + "epoch": 1.7274446417847513, + "grad_norm": 1.0904432535171509, + "learning_rate": 4.5349004371446234e-06, + "loss": 0.7556, + "step": 270390 + }, + { + "epoch": 1.7275085289344903, + "grad_norm": 1.0576446056365967, + "learning_rate": 4.5328126223865675e-06, + "loss": 0.7109, + "step": 270400 + }, + { + "epoch": 1.7275724160842287, + "grad_norm": 1.0743281841278076, + "learning_rate": 4.530725265522429e-06, + "loss": 0.7066, + "step": 270410 + }, + { + "epoch": 1.7276363032339677, + "grad_norm": 1.1610684394836426, + "learning_rate": 4.528638366573212e-06, + "loss": 0.6315, + "step": 270420 + }, + { + "epoch": 1.7277001903837061, + "grad_norm": 0.5903398990631104, + "learning_rate": 4.526551925559957e-06, + "loss": 0.9977, + "step": 270430 + }, + { + "epoch": 1.727764077533445, + "grad_norm": 1.0846974849700928, + "learning_rate": 4.5244659425036585e-06, + "loss": 0.7972, + "step": 270440 + }, + { + "epoch": 1.7278279646831836, + "grad_norm": 0.6714950203895569, + "learning_rate": 4.522380417425331e-06, + "loss": 0.9027, + "step": 270450 + }, + { + "epoch": 1.7278918518329225, + "grad_norm": 1.0459481477737427, + "learning_rate": 4.5202953503459766e-06, + "loss": 0.9928, + "step": 270460 + }, + { + "epoch": 1.727955738982661, + "grad_norm": 0.9128748178482056, + "learning_rate": 4.518210741286594e-06, + "loss": 0.9621, + "step": 270470 + }, + { + "epoch": 1.7280196261323997, + "grad_norm": 0.890533983707428, + "learning_rate": 4.5161265902681714e-06, + "loss": 0.8388, + "step": 270480 + }, + { + "epoch": 1.7280835132821384, + "grad_norm": 0.6935803890228271, + "learning_rate": 4.514042897311715e-06, + "loss": 1.0362, + "step": 270490 + }, + { + "epoch": 1.728147400431877, + "grad_norm": 0.8639132380485535, + "learning_rate": 4.511959662438187e-06, + "loss": 0.6503, + "step": 270500 + }, + { + "epoch": 1.7282112875816158, + 
"grad_norm": 1.1737576723098755, + "learning_rate": 4.509876885668585e-06, + "loss": 0.8988, + "step": 270510 + }, + { + "epoch": 1.7282751747313545, + "grad_norm": 1.220122218132019, + "learning_rate": 4.507794567023865e-06, + "loss": 1.0354, + "step": 270520 + }, + { + "epoch": 1.7283390618810932, + "grad_norm": 0.9614644646644592, + "learning_rate": 4.505712706525028e-06, + "loss": 0.8705, + "step": 270530 + }, + { + "epoch": 1.728402949030832, + "grad_norm": 0.4436861276626587, + "learning_rate": 4.503631304193007e-06, + "loss": 0.7628, + "step": 270540 + }, + { + "epoch": 1.7284668361805706, + "grad_norm": 0.8033667802810669, + "learning_rate": 4.50155036004879e-06, + "loss": 0.941, + "step": 270550 + }, + { + "epoch": 1.7285307233303093, + "grad_norm": 0.8721498250961304, + "learning_rate": 4.49946987411331e-06, + "loss": 0.8562, + "step": 270560 + }, + { + "epoch": 1.728594610480048, + "grad_norm": 0.6049971580505371, + "learning_rate": 4.497389846407535e-06, + "loss": 0.7328, + "step": 270570 + }, + { + "epoch": 1.7286584976297867, + "grad_norm": 0.9980120658874512, + "learning_rate": 4.4953102769524195e-06, + "loss": 0.9632, + "step": 270580 + }, + { + "epoch": 1.7287223847795254, + "grad_norm": 1.7371565103530884, + "learning_rate": 4.493231165768886e-06, + "loss": 1.2044, + "step": 270590 + }, + { + "epoch": 1.7287862719292642, + "grad_norm": 1.1560472249984741, + "learning_rate": 4.491152512877895e-06, + "loss": 0.7284, + "step": 270600 + }, + { + "epoch": 1.7288501590790029, + "grad_norm": 0.8397563099861145, + "learning_rate": 4.489074318300357e-06, + "loss": 0.7156, + "step": 270610 + }, + { + "epoch": 1.7289140462287416, + "grad_norm": 1.4910930395126343, + "learning_rate": 4.486996582057224e-06, + "loss": 0.9487, + "step": 270620 + }, + { + "epoch": 1.7289779333784803, + "grad_norm": 0.9516547322273254, + "learning_rate": 4.4849193041693996e-06, + "loss": 0.8044, + "step": 270630 + }, + { + "epoch": 1.729041820528219, + "grad_norm": 
1.6026740074157715, + "learning_rate": 4.482842484657824e-06, + "loss": 0.7698, + "step": 270640 + }, + { + "epoch": 1.7291057076779577, + "grad_norm": 0.6650766134262085, + "learning_rate": 4.48076612354339e-06, + "loss": 0.8774, + "step": 270650 + }, + { + "epoch": 1.7291695948276964, + "grad_norm": 0.9393697381019592, + "learning_rate": 4.478690220847032e-06, + "loss": 0.9469, + "step": 270660 + }, + { + "epoch": 1.729233481977435, + "grad_norm": 1.0290288925170898, + "learning_rate": 4.476614776589638e-06, + "loss": 0.9257, + "step": 270670 + }, + { + "epoch": 1.7292973691271738, + "grad_norm": 1.147005319595337, + "learning_rate": 4.4745397907921205e-06, + "loss": 0.7659, + "step": 270680 + }, + { + "epoch": 1.7293612562769125, + "grad_norm": 1.7700022459030151, + "learning_rate": 4.4724652634753674e-06, + "loss": 0.7812, + "step": 270690 + }, + { + "epoch": 1.7294251434266512, + "grad_norm": 1.0767414569854736, + "learning_rate": 4.47039119466029e-06, + "loss": 0.8533, + "step": 270700 + }, + { + "epoch": 1.72948903057639, + "grad_norm": 1.2778061628341675, + "learning_rate": 4.468317584367743e-06, + "loss": 0.7759, + "step": 270710 + }, + { + "epoch": 1.7295529177261284, + "grad_norm": 1.1343021392822266, + "learning_rate": 4.466244432618644e-06, + "loss": 0.8308, + "step": 270720 + }, + { + "epoch": 1.7296168048758673, + "grad_norm": 0.6189510226249695, + "learning_rate": 4.464171739433842e-06, + "loss": 0.7996, + "step": 270730 + }, + { + "epoch": 1.7296806920256058, + "grad_norm": 0.9053893089294434, + "learning_rate": 4.4620995048342384e-06, + "loss": 1.0211, + "step": 270740 + }, + { + "epoch": 1.7297445791753447, + "grad_norm": 0.9649641513824463, + "learning_rate": 4.460027728840676e-06, + "loss": 0.8243, + "step": 270750 + }, + { + "epoch": 1.7298084663250832, + "grad_norm": 0.9888417720794678, + "learning_rate": 4.457956411474046e-06, + "loss": 0.818, + "step": 270760 + }, + { + "epoch": 1.7298723534748222, + "grad_norm": 2.9465692043304443, + 
"learning_rate": 4.455885552755185e-06, + "loss": 0.988, + "step": 270770 + }, + { + "epoch": 1.7299362406245606, + "grad_norm": 0.9890803694725037, + "learning_rate": 4.4538151527049545e-06, + "loss": 0.9858, + "step": 270780 + }, + { + "epoch": 1.7300001277742996, + "grad_norm": 1.079901933670044, + "learning_rate": 4.451745211344216e-06, + "loss": 0.8648, + "step": 270790 + }, + { + "epoch": 1.730064014924038, + "grad_norm": 0.6345334053039551, + "learning_rate": 4.449675728693803e-06, + "loss": 0.7422, + "step": 270800 + }, + { + "epoch": 1.730127902073777, + "grad_norm": 0.6401709318161011, + "learning_rate": 4.447606704774565e-06, + "loss": 0.67, + "step": 270810 + }, + { + "epoch": 1.7301917892235155, + "grad_norm": 1.0097521543502808, + "learning_rate": 4.445538139607336e-06, + "loss": 0.943, + "step": 270820 + }, + { + "epoch": 1.7302556763732544, + "grad_norm": 0.9358034133911133, + "learning_rate": 4.443470033212948e-06, + "loss": 0.8079, + "step": 270830 + }, + { + "epoch": 1.7303195635229929, + "grad_norm": 0.8332350254058838, + "learning_rate": 4.441402385612225e-06, + "loss": 0.9739, + "step": 270840 + }, + { + "epoch": 1.7303834506727318, + "grad_norm": 0.3689040243625641, + "learning_rate": 4.4393351968260115e-06, + "loss": 0.8645, + "step": 270850 + }, + { + "epoch": 1.7304473378224703, + "grad_norm": 0.7520756125450134, + "learning_rate": 4.437268466875089e-06, + "loss": 0.6531, + "step": 270860 + }, + { + "epoch": 1.7305112249722092, + "grad_norm": 0.8130874633789062, + "learning_rate": 4.435202195780303e-06, + "loss": 0.8444, + "step": 270870 + }, + { + "epoch": 1.7305751121219477, + "grad_norm": 0.807141900062561, + "learning_rate": 4.433136383562436e-06, + "loss": 1.1926, + "step": 270880 + }, + { + "epoch": 1.7306389992716866, + "grad_norm": 0.6432427167892456, + "learning_rate": 4.4310710302423064e-06, + "loss": 0.7599, + "step": 270890 + }, + { + "epoch": 1.7307028864214251, + "grad_norm": 1.7837029695510864, + "learning_rate": 
4.4290061358407295e-06, + "loss": 0.8501, + "step": 270900 + }, + { + "epoch": 1.730766773571164, + "grad_norm": 1.2138731479644775, + "learning_rate": 4.426941700378473e-06, + "loss": 0.9177, + "step": 270910 + }, + { + "epoch": 1.7308306607209025, + "grad_norm": 1.1648060083389282, + "learning_rate": 4.4248777238763474e-06, + "loss": 0.9268, + "step": 270920 + }, + { + "epoch": 1.7308945478706415, + "grad_norm": 1.0720033645629883, + "learning_rate": 4.422814206355119e-06, + "loss": 0.8604, + "step": 270930 + }, + { + "epoch": 1.73095843502038, + "grad_norm": 1.0889008045196533, + "learning_rate": 4.420751147835595e-06, + "loss": 0.9616, + "step": 270940 + }, + { + "epoch": 1.7310223221701189, + "grad_norm": 1.2953075170516968, + "learning_rate": 4.418688548338529e-06, + "loss": 0.8462, + "step": 270950 + }, + { + "epoch": 1.7310862093198573, + "grad_norm": 0.790887713432312, + "learning_rate": 4.416626407884711e-06, + "loss": 0.8467, + "step": 270960 + }, + { + "epoch": 1.731150096469596, + "grad_norm": 0.937939465045929, + "learning_rate": 4.414564726494896e-06, + "loss": 0.9561, + "step": 270970 + }, + { + "epoch": 1.7312139836193348, + "grad_norm": 0.899032711982727, + "learning_rate": 4.412503504189852e-06, + "loss": 0.9822, + "step": 270980 + }, + { + "epoch": 1.7312778707690735, + "grad_norm": 1.0886012315750122, + "learning_rate": 4.410442740990334e-06, + "loss": 0.8803, + "step": 270990 + }, + { + "epoch": 1.7313417579188122, + "grad_norm": 1.7397100925445557, + "learning_rate": 4.408382436917108e-06, + "loss": 0.6794, + "step": 271000 + }, + { + "epoch": 1.7314056450685509, + "grad_norm": 0.8017283082008362, + "learning_rate": 4.406322591990897e-06, + "loss": 1.1767, + "step": 271010 + }, + { + "epoch": 1.7314695322182896, + "grad_norm": 1.061044454574585, + "learning_rate": 4.404263206232478e-06, + "loss": 0.8546, + "step": 271020 + }, + { + "epoch": 1.7315334193680283, + "grad_norm": 0.7479444742202759, + "learning_rate": 4.402204279662558e-06, + 
"loss": 1.0758, + "step": 271030 + }, + { + "epoch": 1.731597306517767, + "grad_norm": 1.4993268251419067, + "learning_rate": 4.400145812301904e-06, + "loss": 0.6991, + "step": 271040 + }, + { + "epoch": 1.7316611936675057, + "grad_norm": 2.6139063835144043, + "learning_rate": 4.398293584318353e-06, + "loss": 1.0884, + "step": 271050 + }, + { + "epoch": 1.7317250808172444, + "grad_norm": 0.9762659668922424, + "learning_rate": 4.396235989512371e-06, + "loss": 1.2984, + "step": 271060 + }, + { + "epoch": 1.7317889679669831, + "grad_norm": 0.7708228230476379, + "learning_rate": 4.394178853975755e-06, + "loss": 1.0642, + "step": 271070 + }, + { + "epoch": 1.7318528551167218, + "grad_norm": 2.804189920425415, + "learning_rate": 4.392122177729207e-06, + "loss": 1.2598, + "step": 271080 + }, + { + "epoch": 1.7319167422664605, + "grad_norm": 1.322363018989563, + "learning_rate": 4.390065960793455e-06, + "loss": 1.2285, + "step": 271090 + }, + { + "epoch": 1.7319806294161992, + "grad_norm": 1.2776496410369873, + "learning_rate": 4.38801020318918e-06, + "loss": 0.8559, + "step": 271100 + }, + { + "epoch": 1.732044516565938, + "grad_norm": 0.9014029502868652, + "learning_rate": 4.385954904937117e-06, + "loss": 0.8942, + "step": 271110 + }, + { + "epoch": 1.7321084037156766, + "grad_norm": 0.7205672860145569, + "learning_rate": 4.383900066057944e-06, + "loss": 0.7632, + "step": 271120 + }, + { + "epoch": 1.7321722908654154, + "grad_norm": 2.3227195739746094, + "learning_rate": 4.3818456865723665e-06, + "loss": 0.9524, + "step": 271130 + }, + { + "epoch": 1.732236178015154, + "grad_norm": 0.7535308003425598, + "learning_rate": 4.379791766501057e-06, + "loss": 0.7515, + "step": 271140 + }, + { + "epoch": 1.7323000651648928, + "grad_norm": 1.2400829792022705, + "learning_rate": 4.377738305864721e-06, + "loss": 0.9976, + "step": 271150 + }, + { + "epoch": 1.7323639523146315, + "grad_norm": 0.676798939704895, + "learning_rate": 4.375685304684024e-06, + "loss": 0.834, + "step": 
271160 + }, + { + "epoch": 1.7324278394643702, + "grad_norm": 1.0050621032714844, + "learning_rate": 4.373632762979651e-06, + "loss": 0.9509, + "step": 271170 + }, + { + "epoch": 1.7324917266141089, + "grad_norm": 0.9960595369338989, + "learning_rate": 4.371580680772264e-06, + "loss": 0.8235, + "step": 271180 + }, + { + "epoch": 1.7325556137638476, + "grad_norm": 1.1460243463516235, + "learning_rate": 4.369529058082539e-06, + "loss": 0.8149, + "step": 271190 + }, + { + "epoch": 1.7326195009135863, + "grad_norm": 0.9614496827125549, + "learning_rate": 4.367477894931121e-06, + "loss": 0.9482, + "step": 271200 + }, + { + "epoch": 1.7326833880633248, + "grad_norm": 0.5980064272880554, + "learning_rate": 4.365427191338683e-06, + "loss": 0.8493, + "step": 271210 + }, + { + "epoch": 1.7327472752130637, + "grad_norm": 0.5648938417434692, + "learning_rate": 4.3633769473258815e-06, + "loss": 0.8194, + "step": 271220 + }, + { + "epoch": 1.7328111623628022, + "grad_norm": 0.7305797338485718, + "learning_rate": 4.361327162913348e-06, + "loss": 0.8932, + "step": 271230 + }, + { + "epoch": 1.7328750495125411, + "grad_norm": 1.1702698469161987, + "learning_rate": 4.359277838121739e-06, + "loss": 0.9584, + "step": 271240 + }, + { + "epoch": 1.7329389366622796, + "grad_norm": 1.350797414779663, + "learning_rate": 4.357228972971677e-06, + "loss": 0.9944, + "step": 271250 + }, + { + "epoch": 1.7330028238120185, + "grad_norm": 0.7506915926933289, + "learning_rate": 4.355180567483819e-06, + "loss": 0.7929, + "step": 271260 + }, + { + "epoch": 1.733066710961757, + "grad_norm": 0.7021801471710205, + "learning_rate": 4.3531326216787685e-06, + "loss": 0.9577, + "step": 271270 + }, + { + "epoch": 1.733130598111496, + "grad_norm": 1.2569390535354614, + "learning_rate": 4.3510851355771705e-06, + "loss": 0.9093, + "step": 271280 + }, + { + "epoch": 1.7331944852612344, + "grad_norm": 0.826291024684906, + "learning_rate": 4.349038109199632e-06, + "loss": 0.8654, + "step": 271290 + }, + { + 
"epoch": 1.7332583724109734, + "grad_norm": 0.7624689340591431, + "learning_rate": 4.34699154256678e-06, + "loss": 0.9783, + "step": 271300 + }, + { + "epoch": 1.7333222595607118, + "grad_norm": 1.0965466499328613, + "learning_rate": 4.344945435699211e-06, + "loss": 0.8737, + "step": 271310 + }, + { + "epoch": 1.7333861467104508, + "grad_norm": 0.9986264705657959, + "learning_rate": 4.34289978861755e-06, + "loss": 0.916, + "step": 271320 + }, + { + "epoch": 1.7334500338601893, + "grad_norm": 0.7352056503295898, + "learning_rate": 4.34085460134237e-06, + "loss": 0.9011, + "step": 271330 + }, + { + "epoch": 1.7335139210099282, + "grad_norm": 2.520620584487915, + "learning_rate": 4.3388098738943e-06, + "loss": 0.7457, + "step": 271340 + }, + { + "epoch": 1.7335778081596667, + "grad_norm": 0.8847737908363342, + "learning_rate": 4.336765606293907e-06, + "loss": 1.1116, + "step": 271350 + }, + { + "epoch": 1.7336416953094056, + "grad_norm": 1.127189040184021, + "learning_rate": 4.334721798561803e-06, + "loss": 0.9713, + "step": 271360 + }, + { + "epoch": 1.733705582459144, + "grad_norm": 0.7761907577514648, + "learning_rate": 4.332678450718541e-06, + "loss": 0.9633, + "step": 271370 + }, + { + "epoch": 1.733769469608883, + "grad_norm": 1.4676905870437622, + "learning_rate": 4.330635562784724e-06, + "loss": 0.9541, + "step": 271380 + }, + { + "epoch": 1.7338333567586215, + "grad_norm": 0.9573052525520325, + "learning_rate": 4.328593134780911e-06, + "loss": 0.7551, + "step": 271390 + }, + { + "epoch": 1.7338972439083604, + "grad_norm": 1.7084014415740967, + "learning_rate": 4.326551166727683e-06, + "loss": 1.0257, + "step": 271400 + }, + { + "epoch": 1.733961131058099, + "grad_norm": 1.5131990909576416, + "learning_rate": 4.324509658645587e-06, + "loss": 0.8531, + "step": 271410 + }, + { + "epoch": 1.7340250182078378, + "grad_norm": 0.8419858813285828, + "learning_rate": 4.322468610555197e-06, + "loss": 1.057, + "step": 271420 + }, + { + "epoch": 1.7340889053575763, + 
"grad_norm": 0.9144232273101807, + "learning_rate": 4.320428022477075e-06, + "loss": 1.0028, + "step": 271430 + }, + { + "epoch": 1.7341527925073152, + "grad_norm": 1.0411046743392944, + "learning_rate": 4.318387894431747e-06, + "loss": 1.1744, + "step": 271440 + }, + { + "epoch": 1.7342166796570537, + "grad_norm": 0.8902087807655334, + "learning_rate": 4.316348226439787e-06, + "loss": 0.8054, + "step": 271450 + }, + { + "epoch": 1.7342805668067924, + "grad_norm": 0.8370457291603088, + "learning_rate": 4.314309018521712e-06, + "loss": 0.9544, + "step": 271460 + }, + { + "epoch": 1.7343444539565311, + "grad_norm": 1.351797103881836, + "learning_rate": 4.312270270698076e-06, + "loss": 0.7003, + "step": 271470 + }, + { + "epoch": 1.7344083411062698, + "grad_norm": 0.8992185592651367, + "learning_rate": 4.310231982989405e-06, + "loss": 0.6515, + "step": 271480 + }, + { + "epoch": 1.7344722282560086, + "grad_norm": 1.824882984161377, + "learning_rate": 4.308194155416228e-06, + "loss": 0.845, + "step": 271490 + }, + { + "epoch": 1.7345361154057473, + "grad_norm": 0.7211819291114807, + "learning_rate": 4.3061567879990495e-06, + "loss": 0.716, + "step": 271500 + }, + { + "epoch": 1.734600002555486, + "grad_norm": 1.2011065483093262, + "learning_rate": 4.304119880758417e-06, + "loss": 0.9571, + "step": 271510 + }, + { + "epoch": 1.7346638897052247, + "grad_norm": 1.1488710641860962, + "learning_rate": 4.302083433714821e-06, + "loss": 1.1657, + "step": 271520 + }, + { + "epoch": 1.7347277768549634, + "grad_norm": 0.9128865599632263, + "learning_rate": 4.300047446888777e-06, + "loss": 0.9392, + "step": 271530 + }, + { + "epoch": 1.734791664004702, + "grad_norm": 0.9169224500656128, + "learning_rate": 4.298011920300804e-06, + "loss": 0.8583, + "step": 271540 + }, + { + "epoch": 1.7348555511544408, + "grad_norm": 0.6351904273033142, + "learning_rate": 4.295976853971373e-06, + "loss": 0.6869, + "step": 271550 + }, + { + "epoch": 1.7349194383041795, + "grad_norm": 
0.9080760478973389, + "learning_rate": 4.293942247921012e-06, + "loss": 0.8738, + "step": 271560 + }, + { + "epoch": 1.7349833254539182, + "grad_norm": 4.88934850692749, + "learning_rate": 4.291908102170178e-06, + "loss": 1.0908, + "step": 271570 + }, + { + "epoch": 1.735047212603657, + "grad_norm": 1.1222317218780518, + "learning_rate": 4.289874416739387e-06, + "loss": 0.8656, + "step": 271580 + }, + { + "epoch": 1.7351110997533956, + "grad_norm": 0.6947523951530457, + "learning_rate": 4.287841191649095e-06, + "loss": 0.9174, + "step": 271590 + }, + { + "epoch": 1.7351749869031343, + "grad_norm": 1.1124070882797241, + "learning_rate": 4.285808426919791e-06, + "loss": 0.855, + "step": 271600 + }, + { + "epoch": 1.735238874052873, + "grad_norm": 1.0762344598770142, + "learning_rate": 4.283776122571942e-06, + "loss": 0.8326, + "step": 271610 + }, + { + "epoch": 1.7353027612026117, + "grad_norm": 0.9816240072250366, + "learning_rate": 4.281744278626027e-06, + "loss": 0.7708, + "step": 271620 + }, + { + "epoch": 1.7353666483523504, + "grad_norm": 0.8895612955093384, + "learning_rate": 4.279712895102489e-06, + "loss": 0.7371, + "step": 271630 + }, + { + "epoch": 1.7354305355020891, + "grad_norm": 0.6587034463882446, + "learning_rate": 4.277681972021808e-06, + "loss": 1.0135, + "step": 271640 + }, + { + "epoch": 1.7354944226518279, + "grad_norm": 0.7945663332939148, + "learning_rate": 4.2756515094044105e-06, + "loss": 1.0117, + "step": 271650 + }, + { + "epoch": 1.7355583098015666, + "grad_norm": 1.7672518491744995, + "learning_rate": 4.2736215072707696e-06, + "loss": 1.2048, + "step": 271660 + }, + { + "epoch": 1.7356221969513053, + "grad_norm": 1.1510045528411865, + "learning_rate": 4.271591965641308e-06, + "loss": 0.7788, + "step": 271670 + }, + { + "epoch": 1.735686084101044, + "grad_norm": 0.8241963386535645, + "learning_rate": 4.269562884536493e-06, + "loss": 1.0226, + "step": 271680 + }, + { + "epoch": 1.7357499712507827, + "grad_norm": 0.9838940501213074, + 
"learning_rate": 4.267534263976724e-06, + "loss": 1.2249, + "step": 271690 + }, + { + "epoch": 1.7358138584005212, + "grad_norm": 3.085310459136963, + "learning_rate": 4.265506103982464e-06, + "loss": 1.2433, + "step": 271700 + }, + { + "epoch": 1.73587774555026, + "grad_norm": 0.7970023155212402, + "learning_rate": 4.263478404574111e-06, + "loss": 0.8654, + "step": 271710 + }, + { + "epoch": 1.7359416326999986, + "grad_norm": 1.1733765602111816, + "learning_rate": 4.2614511657721124e-06, + "loss": 0.8471, + "step": 271720 + }, + { + "epoch": 1.7360055198497375, + "grad_norm": 0.8512540459632874, + "learning_rate": 4.259424387596856e-06, + "loss": 1.1134, + "step": 271730 + }, + { + "epoch": 1.736069406999476, + "grad_norm": 0.8959993124008179, + "learning_rate": 4.257398070068769e-06, + "loss": 0.7911, + "step": 271740 + }, + { + "epoch": 1.736133294149215, + "grad_norm": 1.955090045928955, + "learning_rate": 4.255372213208264e-06, + "loss": 1.0413, + "step": 271750 + }, + { + "epoch": 1.7361971812989534, + "grad_norm": 0.7032087445259094, + "learning_rate": 4.253346817035731e-06, + "loss": 0.8343, + "step": 271760 + }, + { + "epoch": 1.7362610684486923, + "grad_norm": 1.0131257772445679, + "learning_rate": 4.251321881571579e-06, + "loss": 0.8446, + "step": 271770 + }, + { + "epoch": 1.7363249555984308, + "grad_norm": 0.958465039730072, + "learning_rate": 4.2492974068361835e-06, + "loss": 1.0861, + "step": 271780 + }, + { + "epoch": 1.7363888427481697, + "grad_norm": 1.5234622955322266, + "learning_rate": 4.2472733928499485e-06, + "loss": 0.7778, + "step": 271790 + }, + { + "epoch": 1.7364527298979082, + "grad_norm": 1.2108267545700073, + "learning_rate": 4.245249839633247e-06, + "loss": 0.9079, + "step": 271800 + }, + { + "epoch": 1.7365166170476471, + "grad_norm": 1.0668816566467285, + "learning_rate": 4.2432267472064745e-06, + "loss": 1.101, + "step": 271810 + }, + { + "epoch": 1.7365805041973856, + "grad_norm": 0.7074256539344788, + "learning_rate": 
4.241204115589986e-06, + "loss": 1.0892, + "step": 271820 + }, + { + "epoch": 1.7366443913471246, + "grad_norm": 1.3085336685180664, + "learning_rate": 4.23918194480416e-06, + "loss": 0.977, + "step": 271830 + }, + { + "epoch": 1.736708278496863, + "grad_norm": 1.2348912954330444, + "learning_rate": 4.2371602348693584e-06, + "loss": 0.6361, + "step": 271840 + }, + { + "epoch": 1.736772165646602, + "grad_norm": 0.6151230931282043, + "learning_rate": 4.235138985805953e-06, + "loss": 0.8245, + "step": 271850 + }, + { + "epoch": 1.7368360527963405, + "grad_norm": 1.233837366104126, + "learning_rate": 4.233118197634279e-06, + "loss": 0.7507, + "step": 271860 + }, + { + "epoch": 1.7368999399460794, + "grad_norm": 1.0764507055282593, + "learning_rate": 4.231097870374706e-06, + "loss": 1.1995, + "step": 271870 + }, + { + "epoch": 1.7369638270958179, + "grad_norm": 1.1469213962554932, + "learning_rate": 4.229078004047565e-06, + "loss": 0.7324, + "step": 271880 + }, + { + "epoch": 1.7370277142455568, + "grad_norm": 0.8744142651557922, + "learning_rate": 4.227058598673217e-06, + "loss": 0.8402, + "step": 271890 + }, + { + "epoch": 1.7370916013952953, + "grad_norm": 0.9857259392738342, + "learning_rate": 4.225039654271978e-06, + "loss": 1.0731, + "step": 271900 + }, + { + "epoch": 1.7371554885450342, + "grad_norm": 1.1654436588287354, + "learning_rate": 4.2230211708642e-06, + "loss": 0.6894, + "step": 271910 + }, + { + "epoch": 1.7372193756947727, + "grad_norm": 0.9708123803138733, + "learning_rate": 4.221003148470193e-06, + "loss": 0.8785, + "step": 271920 + }, + { + "epoch": 1.7372832628445114, + "grad_norm": 0.9018045663833618, + "learning_rate": 4.2189855871102965e-06, + "loss": 0.75, + "step": 271930 + }, + { + "epoch": 1.73734714999425, + "grad_norm": 0.7483968138694763, + "learning_rate": 4.2169684868048124e-06, + "loss": 0.8853, + "step": 271940 + }, + { + "epoch": 1.7374110371439888, + "grad_norm": 1.216666579246521, + "learning_rate": 4.214951847574061e-06, + "loss": 
0.7667, + "step": 271950 + }, + { + "epoch": 1.7374749242937275, + "grad_norm": 0.7332319021224976, + "learning_rate": 4.212935669438367e-06, + "loss": 0.9833, + "step": 271960 + }, + { + "epoch": 1.7375388114434662, + "grad_norm": 1.1881091594696045, + "learning_rate": 4.2109199524180245e-06, + "loss": 0.7004, + "step": 271970 + }, + { + "epoch": 1.737602698593205, + "grad_norm": 0.7997522950172424, + "learning_rate": 4.208904696533322e-06, + "loss": 0.6841, + "step": 271980 + }, + { + "epoch": 1.7376665857429436, + "grad_norm": 0.9761389493942261, + "learning_rate": 4.206889901804567e-06, + "loss": 0.7738, + "step": 271990 + }, + { + "epoch": 1.7377304728926823, + "grad_norm": 0.8929351568222046, + "learning_rate": 4.204875568252048e-06, + "loss": 0.9213, + "step": 272000 + }, + { + "epoch": 1.737794360042421, + "grad_norm": 1.209773302078247, + "learning_rate": 4.202861695896043e-06, + "loss": 0.9985, + "step": 272010 + }, + { + "epoch": 1.7378582471921598, + "grad_norm": 0.8349350690841675, + "learning_rate": 4.2008482847568535e-06, + "loss": 0.9204, + "step": 272020 + }, + { + "epoch": 1.7379221343418985, + "grad_norm": 0.8182839751243591, + "learning_rate": 4.198835334854734e-06, + "loss": 0.727, + "step": 272030 + }, + { + "epoch": 1.7379860214916372, + "grad_norm": 1.3342599868774414, + "learning_rate": 4.196822846209969e-06, + "loss": 0.76, + "step": 272040 + }, + { + "epoch": 1.7380499086413759, + "grad_norm": 0.8246582746505737, + "learning_rate": 4.19481081884282e-06, + "loss": 0.932, + "step": 272050 + }, + { + "epoch": 1.7381137957911146, + "grad_norm": 1.2628209590911865, + "learning_rate": 4.19279925277355e-06, + "loss": 1.127, + "step": 272060 + }, + { + "epoch": 1.7381776829408533, + "grad_norm": 1.0423669815063477, + "learning_rate": 4.190788148022434e-06, + "loss": 1.0765, + "step": 272070 + }, + { + "epoch": 1.738241570090592, + "grad_norm": 2.2734713554382324, + "learning_rate": 4.188777504609698e-06, + "loss": 0.7287, + "step": 272080 + }, + 
{ + "epoch": 1.7383054572403307, + "grad_norm": 2.1262595653533936, + "learning_rate": 4.186767322555618e-06, + "loss": 0.8729, + "step": 272090 + }, + { + "epoch": 1.7383693443900694, + "grad_norm": 0.8176138401031494, + "learning_rate": 4.184757601880413e-06, + "loss": 1.0373, + "step": 272100 + }, + { + "epoch": 1.738433231539808, + "grad_norm": 0.9398905634880066, + "learning_rate": 4.182748342604348e-06, + "loss": 0.7781, + "step": 272110 + }, + { + "epoch": 1.7384971186895468, + "grad_norm": 0.8366762399673462, + "learning_rate": 4.18073954474763e-06, + "loss": 1.0162, + "step": 272120 + }, + { + "epoch": 1.7385610058392855, + "grad_norm": 1.1980094909667969, + "learning_rate": 4.1787312083305165e-06, + "loss": 0.9061, + "step": 272130 + }, + { + "epoch": 1.7386248929890242, + "grad_norm": 1.1438990831375122, + "learning_rate": 4.176723333373211e-06, + "loss": 0.8736, + "step": 272140 + }, + { + "epoch": 1.738688780138763, + "grad_norm": 1.3131656646728516, + "learning_rate": 4.174715919895955e-06, + "loss": 0.7236, + "step": 272150 + }, + { + "epoch": 1.7387526672885016, + "grad_norm": 1.1731828451156616, + "learning_rate": 4.1727089679189426e-06, + "loss": 1.1201, + "step": 272160 + }, + { + "epoch": 1.7388165544382403, + "grad_norm": 0.84318608045578, + "learning_rate": 4.170702477462407e-06, + "loss": 0.8461, + "step": 272170 + }, + { + "epoch": 1.738880441587979, + "grad_norm": 0.7444975972175598, + "learning_rate": 4.168696448546539e-06, + "loss": 0.8019, + "step": 272180 + }, + { + "epoch": 1.7389443287377175, + "grad_norm": 1.2526865005493164, + "learning_rate": 4.166690881191554e-06, + "loss": 0.9074, + "step": 272190 + }, + { + "epoch": 1.7390082158874565, + "grad_norm": 1.6187922954559326, + "learning_rate": 4.164685775417632e-06, + "loss": 1.0094, + "step": 272200 + }, + { + "epoch": 1.739072103037195, + "grad_norm": 1.1000633239746094, + "learning_rate": 4.1626811312449884e-06, + "loss": 0.9165, + "step": 272210 + }, + { + "epoch": 
1.7391359901869339, + "grad_norm": 0.9860941767692566, + "learning_rate": 4.1606769486937925e-06, + "loss": 1.0631, + "step": 272220 + }, + { + "epoch": 1.7391998773366724, + "grad_norm": 1.1407016515731812, + "learning_rate": 4.158673227784249e-06, + "loss": 0.5973, + "step": 272230 + }, + { + "epoch": 1.7392637644864113, + "grad_norm": 1.0110845565795898, + "learning_rate": 4.156669968536509e-06, + "loss": 0.879, + "step": 272240 + }, + { + "epoch": 1.7393276516361498, + "grad_norm": 1.1520662307739258, + "learning_rate": 4.154667170970777e-06, + "loss": 1.0506, + "step": 272250 + }, + { + "epoch": 1.7393915387858887, + "grad_norm": 0.8411242961883545, + "learning_rate": 4.152664835107195e-06, + "loss": 0.9874, + "step": 272260 + }, + { + "epoch": 1.7394554259356272, + "grad_norm": 1.4038716554641724, + "learning_rate": 4.150662960965946e-06, + "loss": 0.9303, + "step": 272270 + }, + { + "epoch": 1.7395193130853661, + "grad_norm": 0.9342250823974609, + "learning_rate": 4.14866154856719e-06, + "loss": 0.9133, + "step": 272280 + }, + { + "epoch": 1.7395832002351046, + "grad_norm": 1.3906513452529907, + "learning_rate": 4.146660597931074e-06, + "loss": 0.956, + "step": 272290 + }, + { + "epoch": 1.7396470873848435, + "grad_norm": 0.841761589050293, + "learning_rate": 4.144660109077764e-06, + "loss": 0.9594, + "step": 272300 + }, + { + "epoch": 1.739710974534582, + "grad_norm": 0.8125125169754028, + "learning_rate": 4.142660082027383e-06, + "loss": 0.9382, + "step": 272310 + }, + { + "epoch": 1.739774861684321, + "grad_norm": 0.9159964919090271, + "learning_rate": 4.140660516800099e-06, + "loss": 0.6513, + "step": 272320 + }, + { + "epoch": 1.7398387488340594, + "grad_norm": 0.679341197013855, + "learning_rate": 4.138661413416034e-06, + "loss": 0.8616, + "step": 272330 + }, + { + "epoch": 1.7399026359837984, + "grad_norm": 1.6589670181274414, + "learning_rate": 4.136662771895328e-06, + "loss": 1.0038, + "step": 272340 + }, + { + "epoch": 1.7399665231335368, + 
"grad_norm": 0.7846057415008545, + "learning_rate": 4.134664592258098e-06, + "loss": 0.796, + "step": 272350 + }, + { + "epoch": 1.7400304102832758, + "grad_norm": 1.1777966022491455, + "learning_rate": 4.132666874524482e-06, + "loss": 0.8401, + "step": 272360 + }, + { + "epoch": 1.7400942974330142, + "grad_norm": 0.8257417678833008, + "learning_rate": 4.130669618714583e-06, + "loss": 0.902, + "step": 272370 + }, + { + "epoch": 1.7401581845827532, + "grad_norm": 0.881574273109436, + "learning_rate": 4.1286728248485284e-06, + "loss": 0.7615, + "step": 272380 + }, + { + "epoch": 1.7402220717324917, + "grad_norm": 1.0708754062652588, + "learning_rate": 4.126676492946418e-06, + "loss": 0.7624, + "step": 272390 + }, + { + "epoch": 1.7402859588822306, + "grad_norm": 0.7476529479026794, + "learning_rate": 4.124680623028371e-06, + "loss": 0.7909, + "step": 272400 + }, + { + "epoch": 1.740349846031969, + "grad_norm": 1.2250369787216187, + "learning_rate": 4.122685215114469e-06, + "loss": 0.8418, + "step": 272410 + }, + { + "epoch": 1.7404137331817078, + "grad_norm": 0.7783737778663635, + "learning_rate": 4.120690269224825e-06, + "loss": 0.7274, + "step": 272420 + }, + { + "epoch": 1.7404776203314465, + "grad_norm": 1.6127846240997314, + "learning_rate": 4.118695785379512e-06, + "loss": 0.7691, + "step": 272430 + }, + { + "epoch": 1.7405415074811852, + "grad_norm": 0.8426513075828552, + "learning_rate": 4.116701763598635e-06, + "loss": 0.7788, + "step": 272440 + }, + { + "epoch": 1.740605394630924, + "grad_norm": 1.2168272733688354, + "learning_rate": 4.114708203902268e-06, + "loss": 0.7771, + "step": 272450 + }, + { + "epoch": 1.7406692817806626, + "grad_norm": 1.7537342309951782, + "learning_rate": 4.1127151063104714e-06, + "loss": 0.9725, + "step": 272460 + }, + { + "epoch": 1.7407331689304013, + "grad_norm": 3.2746481895446777, + "learning_rate": 4.110722470843348e-06, + "loss": 1.1022, + "step": 272470 + }, + { + "epoch": 1.74079705608014, + "grad_norm": 
1.014939546585083, + "learning_rate": 4.108730297520935e-06, + "loss": 0.725, + "step": 272480 + }, + { + "epoch": 1.7408609432298787, + "grad_norm": 0.6446115374565125, + "learning_rate": 4.106738586363318e-06, + "loss": 0.8543, + "step": 272490 + }, + { + "epoch": 1.7409248303796174, + "grad_norm": 1.045475721359253, + "learning_rate": 4.1047473373905406e-06, + "loss": 0.7015, + "step": 272500 + }, + { + "epoch": 1.7409887175293561, + "grad_norm": 0.8331463932991028, + "learning_rate": 4.102756550622672e-06, + "loss": 0.6955, + "step": 272510 + }, + { + "epoch": 1.7410526046790948, + "grad_norm": 0.7141439914703369, + "learning_rate": 4.100766226079739e-06, + "loss": 0.9134, + "step": 272520 + }, + { + "epoch": 1.7411164918288335, + "grad_norm": 1.0723060369491577, + "learning_rate": 4.098776363781809e-06, + "loss": 0.6845, + "step": 272530 + }, + { + "epoch": 1.7411803789785723, + "grad_norm": 0.7683533430099487, + "learning_rate": 4.0967869637489e-06, + "loss": 0.7682, + "step": 272540 + }, + { + "epoch": 1.741244266128311, + "grad_norm": 0.8382824063301086, + "learning_rate": 4.094798026001074e-06, + "loss": 0.9829, + "step": 272550 + }, + { + "epoch": 1.7413081532780497, + "grad_norm": 0.8560100197792053, + "learning_rate": 4.0928095505583295e-06, + "loss": 0.7741, + "step": 272560 + }, + { + "epoch": 1.7413720404277884, + "grad_norm": 0.55511873960495, + "learning_rate": 4.09082153744072e-06, + "loss": 0.7897, + "step": 272570 + }, + { + "epoch": 1.741435927577527, + "grad_norm": 0.6832825541496277, + "learning_rate": 4.088833986668245e-06, + "loss": 1.0163, + "step": 272580 + }, + { + "epoch": 1.7414998147272658, + "grad_norm": 1.0762792825698853, + "learning_rate": 4.086846898260932e-06, + "loss": 0.8245, + "step": 272590 + }, + { + "epoch": 1.7415637018770045, + "grad_norm": 1.5624908208847046, + "learning_rate": 4.084860272238799e-06, + "loss": 0.7916, + "step": 272600 + }, + { + "epoch": 1.7416275890267432, + "grad_norm": 2.772987127304077, + 
"learning_rate": 4.0828741086218365e-06, + "loss": 1.0442, + "step": 272610 + }, + { + "epoch": 1.741691476176482, + "grad_norm": 0.8396859169006348, + "learning_rate": 4.080888407430067e-06, + "loss": 0.6276, + "step": 272620 + }, + { + "epoch": 1.7417553633262206, + "grad_norm": 1.0782091617584229, + "learning_rate": 4.078903168683468e-06, + "loss": 0.9455, + "step": 272630 + }, + { + "epoch": 1.7418192504759593, + "grad_norm": 1.256385087966919, + "learning_rate": 4.0769183924020456e-06, + "loss": 0.5904, + "step": 272640 + }, + { + "epoch": 1.741883137625698, + "grad_norm": 0.8513548970222473, + "learning_rate": 4.07493407860578e-06, + "loss": 0.9067, + "step": 272650 + }, + { + "epoch": 1.7419470247754365, + "grad_norm": 1.0283018350601196, + "learning_rate": 4.072950227314665e-06, + "loss": 1.066, + "step": 272660 + }, + { + "epoch": 1.7420109119251754, + "grad_norm": 0.6767253875732422, + "learning_rate": 4.070966838548668e-06, + "loss": 0.8141, + "step": 272670 + }, + { + "epoch": 1.742074799074914, + "grad_norm": 1.193358302116394, + "learning_rate": 4.0689839123277725e-06, + "loss": 0.8988, + "step": 272680 + }, + { + "epoch": 1.7421386862246528, + "grad_norm": 0.9188578128814697, + "learning_rate": 4.067001448671942e-06, + "loss": 0.6723, + "step": 272690 + }, + { + "epoch": 1.7422025733743913, + "grad_norm": 1.4902043342590332, + "learning_rate": 4.065019447601154e-06, + "loss": 0.7255, + "step": 272700 + }, + { + "epoch": 1.7422664605241303, + "grad_norm": 0.8948713541030884, + "learning_rate": 4.0630379091353475e-06, + "loss": 0.84, + "step": 272710 + }, + { + "epoch": 1.7423303476738687, + "grad_norm": 0.7600857019424438, + "learning_rate": 4.061056833294497e-06, + "loss": 0.8922, + "step": 272720 + }, + { + "epoch": 1.7423942348236077, + "grad_norm": 0.9479913711547852, + "learning_rate": 4.05907622009854e-06, + "loss": 0.7225, + "step": 272730 + }, + { + "epoch": 1.7424581219733462, + "grad_norm": 0.9155439734458923, + "learning_rate": 
4.05709606956744e-06, + "loss": 0.9039, + "step": 272740 + }, + { + "epoch": 1.742522009123085, + "grad_norm": 0.7955976128578186, + "learning_rate": 4.055116381721119e-06, + "loss": 0.7655, + "step": 272750 + }, + { + "epoch": 1.7425858962728236, + "grad_norm": 1.2719805240631104, + "learning_rate": 4.053137156579534e-06, + "loss": 0.8107, + "step": 272760 + }, + { + "epoch": 1.7426497834225625, + "grad_norm": 1.0513930320739746, + "learning_rate": 4.051158394162602e-06, + "loss": 0.8821, + "step": 272770 + }, + { + "epoch": 1.742713670572301, + "grad_norm": 0.9332901835441589, + "learning_rate": 4.049180094490257e-06, + "loss": 0.655, + "step": 272780 + }, + { + "epoch": 1.74277755772204, + "grad_norm": 1.1140823364257812, + "learning_rate": 4.047202257582422e-06, + "loss": 1.0025, + "step": 272790 + }, + { + "epoch": 1.7428414448717784, + "grad_norm": 3.2180252075195312, + "learning_rate": 4.045224883459009e-06, + "loss": 1.0532, + "step": 272800 + }, + { + "epoch": 1.7429053320215173, + "grad_norm": 0.7645334005355835, + "learning_rate": 4.043247972139946e-06, + "loss": 0.8575, + "step": 272810 + }, + { + "epoch": 1.7429692191712558, + "grad_norm": 1.2418649196624756, + "learning_rate": 4.041271523645129e-06, + "loss": 0.7803, + "step": 272820 + }, + { + "epoch": 1.7430331063209947, + "grad_norm": 1.6020556688308716, + "learning_rate": 4.039295537994475e-06, + "loss": 0.8428, + "step": 272830 + }, + { + "epoch": 1.7430969934707332, + "grad_norm": 0.8513848185539246, + "learning_rate": 4.037320015207868e-06, + "loss": 0.758, + "step": 272840 + }, + { + "epoch": 1.7431608806204721, + "grad_norm": 1.0215321779251099, + "learning_rate": 4.0353449553052205e-06, + "loss": 0.7453, + "step": 272850 + }, + { + "epoch": 1.7432247677702106, + "grad_norm": 0.789835512638092, + "learning_rate": 4.0333703583064045e-06, + "loss": 0.7061, + "step": 272860 + }, + { + "epoch": 1.7432886549199496, + "grad_norm": 1.0114730596542358, + "learning_rate": 4.031396224231326e-06, + 
"loss": 0.8441, + "step": 272870 + }, + { + "epoch": 1.743352542069688, + "grad_norm": 0.5644866228103638, + "learning_rate": 4.029422553099843e-06, + "loss": 0.8071, + "step": 272880 + }, + { + "epoch": 1.743416429219427, + "grad_norm": 0.9736144542694092, + "learning_rate": 4.027449344931861e-06, + "loss": 0.9542, + "step": 272890 + }, + { + "epoch": 1.7434803163691655, + "grad_norm": 1.1532905101776123, + "learning_rate": 4.025476599747225e-06, + "loss": 0.5613, + "step": 272900 + }, + { + "epoch": 1.7435442035189042, + "grad_norm": 1.4434239864349365, + "learning_rate": 4.023504317565818e-06, + "loss": 0.6643, + "step": 272910 + }, + { + "epoch": 1.7436080906686429, + "grad_norm": 1.9808419942855835, + "learning_rate": 4.0215324984074935e-06, + "loss": 0.7913, + "step": 272920 + }, + { + "epoch": 1.7436719778183816, + "grad_norm": 0.6124402284622192, + "learning_rate": 4.019561142292122e-06, + "loss": 0.8837, + "step": 272930 + }, + { + "epoch": 1.7437358649681203, + "grad_norm": 1.0098079442977905, + "learning_rate": 4.017590249239544e-06, + "loss": 0.8274, + "step": 272940 + }, + { + "epoch": 1.743799752117859, + "grad_norm": 1.0427857637405396, + "learning_rate": 4.015619819269612e-06, + "loss": 0.9455, + "step": 272950 + }, + { + "epoch": 1.7438636392675977, + "grad_norm": 1.9890084266662598, + "learning_rate": 4.013649852402174e-06, + "loss": 0.631, + "step": 272960 + }, + { + "epoch": 1.7439275264173364, + "grad_norm": 0.7183147668838501, + "learning_rate": 4.011680348657054e-06, + "loss": 0.9754, + "step": 272970 + }, + { + "epoch": 1.743991413567075, + "grad_norm": 1.2257187366485596, + "learning_rate": 4.009711308054115e-06, + "loss": 1.1979, + "step": 272980 + }, + { + "epoch": 1.7440553007168138, + "grad_norm": 0.5666488409042358, + "learning_rate": 4.0077427306131565e-06, + "loss": 0.6182, + "step": 272990 + }, + { + "epoch": 1.7441191878665525, + "grad_norm": 1.052765130996704, + "learning_rate": 4.00577461635403e-06, + "loss": 0.9402, + "step": 
273000 + }, + { + "epoch": 1.7441830750162912, + "grad_norm": 0.763070821762085, + "learning_rate": 4.00380696529653e-06, + "loss": 0.6844, + "step": 273010 + }, + { + "epoch": 1.74424696216603, + "grad_norm": 1.1995108127593994, + "learning_rate": 4.001839777460498e-06, + "loss": 1.0478, + "step": 273020 + }, + { + "epoch": 1.7443108493157686, + "grad_norm": 0.8796985745429993, + "learning_rate": 3.999873052865727e-06, + "loss": 0.818, + "step": 273030 + }, + { + "epoch": 1.7443747364655073, + "grad_norm": 0.7032323479652405, + "learning_rate": 3.997906791532036e-06, + "loss": 0.8984, + "step": 273040 + }, + { + "epoch": 1.744438623615246, + "grad_norm": 1.2603150606155396, + "learning_rate": 3.995940993479208e-06, + "loss": 0.7287, + "step": 273050 + }, + { + "epoch": 1.7445025107649847, + "grad_norm": 0.902034342288971, + "learning_rate": 3.9939756587270675e-06, + "loss": 1.0164, + "step": 273060 + }, + { + "epoch": 1.7445663979147235, + "grad_norm": 1.0966947078704834, + "learning_rate": 3.992010787295386e-06, + "loss": 0.916, + "step": 273070 + }, + { + "epoch": 1.7446302850644622, + "grad_norm": 1.2284424304962158, + "learning_rate": 3.990046379203965e-06, + "loss": 0.9632, + "step": 273080 + }, + { + "epoch": 1.7446941722142009, + "grad_norm": 1.0536028146743774, + "learning_rate": 3.988082434472573e-06, + "loss": 0.8907, + "step": 273090 + }, + { + "epoch": 1.7447580593639396, + "grad_norm": 1.0629215240478516, + "learning_rate": 3.986118953121004e-06, + "loss": 0.855, + "step": 273100 + }, + { + "epoch": 1.7448219465136783, + "grad_norm": 1.213989496231079, + "learning_rate": 3.984155935169015e-06, + "loss": 0.9258, + "step": 273110 + }, + { + "epoch": 1.744885833663417, + "grad_norm": 0.9842742681503296, + "learning_rate": 3.98219338063639e-06, + "loss": 1.0247, + "step": 273120 + }, + { + "epoch": 1.7449497208131557, + "grad_norm": 1.209633469581604, + "learning_rate": 3.9802312895428904e-06, + "loss": 0.9219, + "step": 273130 + }, + { + "epoch": 
1.7450136079628944, + "grad_norm": 1.1460846662521362, + "learning_rate": 3.978269661908268e-06, + "loss": 0.9312, + "step": 273140 + }, + { + "epoch": 1.7450774951126329, + "grad_norm": 0.75315260887146, + "learning_rate": 3.976308497752296e-06, + "loss": 0.9829, + "step": 273150 + }, + { + "epoch": 1.7451413822623718, + "grad_norm": 1.920297622680664, + "learning_rate": 3.9743477970947075e-06, + "loss": 1.0939, + "step": 273160 + }, + { + "epoch": 1.7452052694121103, + "grad_norm": 1.0004301071166992, + "learning_rate": 3.97238755995526e-06, + "loss": 0.8376, + "step": 273170 + }, + { + "epoch": 1.7452691565618492, + "grad_norm": 1.2894623279571533, + "learning_rate": 3.970427786353681e-06, + "loss": 0.8547, + "step": 273180 + }, + { + "epoch": 1.7453330437115877, + "grad_norm": 2.6042747497558594, + "learning_rate": 3.9684684763097225e-06, + "loss": 0.7822, + "step": 273190 + }, + { + "epoch": 1.7453969308613266, + "grad_norm": 0.6897473931312561, + "learning_rate": 3.966509629843101e-06, + "loss": 0.8511, + "step": 273200 + }, + { + "epoch": 1.7454608180110651, + "grad_norm": 0.7066933512687683, + "learning_rate": 3.964551246973558e-06, + "loss": 1.0343, + "step": 273210 + }, + { + "epoch": 1.745524705160804, + "grad_norm": 1.0931143760681152, + "learning_rate": 3.962593327720809e-06, + "loss": 0.9285, + "step": 273220 + }, + { + "epoch": 1.7455885923105425, + "grad_norm": 0.878592312335968, + "learning_rate": 3.960635872104573e-06, + "loss": 1.004, + "step": 273230 + }, + { + "epoch": 1.7456524794602815, + "grad_norm": 3.0393524169921875, + "learning_rate": 3.958678880144562e-06, + "loss": 0.9157, + "step": 273240 + }, + { + "epoch": 1.74571636661002, + "grad_norm": 0.7381773591041565, + "learning_rate": 3.956722351860492e-06, + "loss": 0.7609, + "step": 273250 + }, + { + "epoch": 1.7457802537597589, + "grad_norm": 1.044517993927002, + "learning_rate": 3.954766287272055e-06, + "loss": 0.9149, + "step": 273260 + }, + { + "epoch": 1.7458441409094974, + 
"grad_norm": 1.0228127241134644, + "learning_rate": 3.9528106863989615e-06, + "loss": 0.7727, + "step": 273270 + }, + { + "epoch": 1.7459080280592363, + "grad_norm": 0.7310999631881714, + "learning_rate": 3.950855549260896e-06, + "loss": 0.6429, + "step": 273280 + }, + { + "epoch": 1.7459719152089748, + "grad_norm": 0.7440299987792969, + "learning_rate": 3.9489008758775595e-06, + "loss": 0.7806, + "step": 273290 + }, + { + "epoch": 1.7460358023587137, + "grad_norm": 1.0097001791000366, + "learning_rate": 3.9469466662686195e-06, + "loss": 0.878, + "step": 273300 + }, + { + "epoch": 1.7460996895084522, + "grad_norm": 0.48892977833747864, + "learning_rate": 3.944992920453783e-06, + "loss": 0.6629, + "step": 273310 + }, + { + "epoch": 1.746163576658191, + "grad_norm": 0.8091384768486023, + "learning_rate": 3.943039638452694e-06, + "loss": 0.8756, + "step": 273320 + }, + { + "epoch": 1.7462274638079296, + "grad_norm": 1.1609203815460205, + "learning_rate": 3.941086820285051e-06, + "loss": 0.9226, + "step": 273330 + }, + { + "epoch": 1.7462913509576685, + "grad_norm": 0.7286596298217773, + "learning_rate": 3.939134465970512e-06, + "loss": 0.8502, + "step": 273340 + }, + { + "epoch": 1.746355238107407, + "grad_norm": 0.649346649646759, + "learning_rate": 3.937182575528731e-06, + "loss": 0.7657, + "step": 273350 + }, + { + "epoch": 1.746419125257146, + "grad_norm": 1.26242995262146, + "learning_rate": 3.935231148979379e-06, + "loss": 0.6829, + "step": 273360 + }, + { + "epoch": 1.7464830124068844, + "grad_norm": 0.7780110239982605, + "learning_rate": 3.933280186342092e-06, + "loss": 0.8697, + "step": 273370 + }, + { + "epoch": 1.7465468995566233, + "grad_norm": 1.1589394807815552, + "learning_rate": 3.9313296876365365e-06, + "loss": 0.9451, + "step": 273380 + }, + { + "epoch": 1.7466107867063618, + "grad_norm": 1.0762748718261719, + "learning_rate": 3.929379652882337e-06, + "loss": 1.202, + "step": 273390 + }, + { + "epoch": 1.7466746738561005, + "grad_norm": 
0.7835119962692261, + "learning_rate": 3.927430082099154e-06, + "loss": 0.818, + "step": 273400 + }, + { + "epoch": 1.7467385610058392, + "grad_norm": 0.9623402953147888, + "learning_rate": 3.925480975306595e-06, + "loss": 0.8321, + "step": 273410 + }, + { + "epoch": 1.746802448155578, + "grad_norm": 0.8765859603881836, + "learning_rate": 3.92353233252431e-06, + "loss": 1.0047, + "step": 273420 + }, + { + "epoch": 1.7468663353053167, + "grad_norm": 1.7282193899154663, + "learning_rate": 3.9215841537719175e-06, + "loss": 0.7416, + "step": 273430 + }, + { + "epoch": 1.7469302224550554, + "grad_norm": 0.9430720210075378, + "learning_rate": 3.91963643906903e-06, + "loss": 0.7672, + "step": 273440 + }, + { + "epoch": 1.746994109604794, + "grad_norm": 0.8017761707305908, + "learning_rate": 3.917689188435275e-06, + "loss": 0.8316, + "step": 273450 + }, + { + "epoch": 1.7470579967545328, + "grad_norm": 0.7006866335868835, + "learning_rate": 3.915742401890249e-06, + "loss": 0.748, + "step": 273460 + }, + { + "epoch": 1.7471218839042715, + "grad_norm": 0.8986942768096924, + "learning_rate": 3.913796079453575e-06, + "loss": 0.997, + "step": 273470 + }, + { + "epoch": 1.7471857710540102, + "grad_norm": 1.1999050378799438, + "learning_rate": 3.911850221144836e-06, + "loss": 1.0847, + "step": 273480 + }, + { + "epoch": 1.747249658203749, + "grad_norm": 1.0616730451583862, + "learning_rate": 3.909904826983646e-06, + "loss": 0.8245, + "step": 273490 + }, + { + "epoch": 1.7473135453534876, + "grad_norm": 0.9581630825996399, + "learning_rate": 3.907959896989577e-06, + "loss": 0.6649, + "step": 273500 + }, + { + "epoch": 1.7473774325032263, + "grad_norm": 0.7028853297233582, + "learning_rate": 3.906015431182242e-06, + "loss": 0.7376, + "step": 273510 + }, + { + "epoch": 1.747441319652965, + "grad_norm": 0.8302884101867676, + "learning_rate": 3.904071429581191e-06, + "loss": 1.0715, + "step": 273520 + }, + { + "epoch": 1.7475052068027037, + "grad_norm": 1.7672325372695923, + 
"learning_rate": 3.902127892206037e-06, + "loss": 1.0683, + "step": 273530 + }, + { + "epoch": 1.7475690939524424, + "grad_norm": 1.1879684925079346, + "learning_rate": 3.900184819076319e-06, + "loss": 0.7099, + "step": 273540 + }, + { + "epoch": 1.7476329811021811, + "grad_norm": 1.1202888488769531, + "learning_rate": 3.8982422102116335e-06, + "loss": 0.751, + "step": 273550 + }, + { + "epoch": 1.7476968682519198, + "grad_norm": 1.224502682685852, + "learning_rate": 3.896300065631525e-06, + "loss": 0.8411, + "step": 273560 + }, + { + "epoch": 1.7477607554016585, + "grad_norm": 1.0552843809127808, + "learning_rate": 3.894358385355568e-06, + "loss": 1.0771, + "step": 273570 + }, + { + "epoch": 1.7478246425513972, + "grad_norm": 0.961863100528717, + "learning_rate": 3.8924171694033005e-06, + "loss": 0.7817, + "step": 273580 + }, + { + "epoch": 1.747888529701136, + "grad_norm": 0.8923673629760742, + "learning_rate": 3.890476417794286e-06, + "loss": 0.9693, + "step": 273590 + }, + { + "epoch": 1.7479524168508747, + "grad_norm": 0.7879084944725037, + "learning_rate": 3.888536130548065e-06, + "loss": 0.9674, + "step": 273600 + }, + { + "epoch": 1.7480163040006134, + "grad_norm": 1.2090606689453125, + "learning_rate": 3.886596307684176e-06, + "loss": 0.8197, + "step": 273610 + }, + { + "epoch": 1.748080191150352, + "grad_norm": 1.116636037826538, + "learning_rate": 3.884656949222154e-06, + "loss": 1.0655, + "step": 273620 + }, + { + "epoch": 1.7481440783000908, + "grad_norm": 0.9606205821037292, + "learning_rate": 3.882718055181539e-06, + "loss": 0.7936, + "step": 273630 + }, + { + "epoch": 1.7482079654498293, + "grad_norm": 1.0061858892440796, + "learning_rate": 3.880779625581837e-06, + "loss": 0.9665, + "step": 273640 + }, + { + "epoch": 1.7482718525995682, + "grad_norm": 1.251664638519287, + "learning_rate": 3.878841660442589e-06, + "loss": 0.843, + "step": 273650 + }, + { + "epoch": 1.7483357397493067, + "grad_norm": 1.0400848388671875, + "learning_rate": 
3.876904159783312e-06, + "loss": 1.1301, + "step": 273660 + }, + { + "epoch": 1.7483996268990456, + "grad_norm": 0.9120842218399048, + "learning_rate": 3.874967123623502e-06, + "loss": 0.8551, + "step": 273670 + }, + { + "epoch": 1.748463514048784, + "grad_norm": 0.9632731676101685, + "learning_rate": 3.873030551982687e-06, + "loss": 0.7669, + "step": 273680 + }, + { + "epoch": 1.748527401198523, + "grad_norm": 0.9888166785240173, + "learning_rate": 3.8710944448803525e-06, + "loss": 0.7613, + "step": 273690 + }, + { + "epoch": 1.7485912883482615, + "grad_norm": 0.7286546230316162, + "learning_rate": 3.869158802336015e-06, + "loss": 0.7758, + "step": 273700 + }, + { + "epoch": 1.7486551754980004, + "grad_norm": 0.7055147886276245, + "learning_rate": 3.867223624369143e-06, + "loss": 1.0643, + "step": 273710 + }, + { + "epoch": 1.748719062647739, + "grad_norm": 1.2502223253250122, + "learning_rate": 3.865288910999254e-06, + "loss": 0.8506, + "step": 273720 + }, + { + "epoch": 1.7487829497974778, + "grad_norm": 1.2963123321533203, + "learning_rate": 3.863354662245805e-06, + "loss": 0.9619, + "step": 273730 + }, + { + "epoch": 1.7488468369472163, + "grad_norm": 1.1647197008132935, + "learning_rate": 3.861420878128296e-06, + "loss": 0.7913, + "step": 273740 + }, + { + "epoch": 1.7489107240969552, + "grad_norm": 1.0641111135482788, + "learning_rate": 3.8594875586661914e-06, + "loss": 0.7081, + "step": 273750 + }, + { + "epoch": 1.7489746112466937, + "grad_norm": 1.3449046611785889, + "learning_rate": 3.8575547038789685e-06, + "loss": 0.7491, + "step": 273760 + }, + { + "epoch": 1.7490384983964327, + "grad_norm": 0.7886676788330078, + "learning_rate": 3.855622313786078e-06, + "loss": 0.7877, + "step": 273770 + }, + { + "epoch": 1.7491023855461711, + "grad_norm": 1.1052894592285156, + "learning_rate": 3.853690388407006e-06, + "loss": 0.7696, + "step": 273780 + }, + { + "epoch": 1.74916627269591, + "grad_norm": 0.8340318202972412, + "learning_rate": 3.851758927761178e-06, + 
"loss": 0.7721, + "step": 273790 + }, + { + "epoch": 1.7492301598456486, + "grad_norm": 2.4856231212615967, + "learning_rate": 3.849827931868077e-06, + "loss": 0.9347, + "step": 273800 + }, + { + "epoch": 1.7492940469953875, + "grad_norm": 0.927199125289917, + "learning_rate": 3.8478974007471245e-06, + "loss": 1.0155, + "step": 273810 + }, + { + "epoch": 1.749357934145126, + "grad_norm": 1.527815580368042, + "learning_rate": 3.845967334417777e-06, + "loss": 0.9916, + "step": 273820 + }, + { + "epoch": 1.749421821294865, + "grad_norm": 0.7958928942680359, + "learning_rate": 3.8440377328994635e-06, + "loss": 0.8657, + "step": 273830 + }, + { + "epoch": 1.7494857084446034, + "grad_norm": 1.13398015499115, + "learning_rate": 3.842108596211619e-06, + "loss": 0.9195, + "step": 273840 + }, + { + "epoch": 1.7495495955943423, + "grad_norm": 0.7874863147735596, + "learning_rate": 3.840179924373677e-06, + "loss": 0.8805, + "step": 273850 + }, + { + "epoch": 1.7496134827440808, + "grad_norm": 1.1453686952590942, + "learning_rate": 3.838251717405056e-06, + "loss": 0.8999, + "step": 273860 + }, + { + "epoch": 1.7496773698938197, + "grad_norm": 0.9717140197753906, + "learning_rate": 3.836323975325179e-06, + "loss": 0.8969, + "step": 273870 + }, + { + "epoch": 1.7497412570435582, + "grad_norm": 0.8356024026870728, + "learning_rate": 3.834396698153453e-06, + "loss": 0.876, + "step": 273880 + }, + { + "epoch": 1.749805144193297, + "grad_norm": 0.8037473559379578, + "learning_rate": 3.832469885909296e-06, + "loss": 0.8511, + "step": 273890 + }, + { + "epoch": 1.7498690313430356, + "grad_norm": 0.9532538652420044, + "learning_rate": 3.830543538612103e-06, + "loss": 0.8685, + "step": 273900 + }, + { + "epoch": 1.7499329184927743, + "grad_norm": 1.1712671518325806, + "learning_rate": 3.828617656281292e-06, + "loss": 1.0234, + "step": 273910 + }, + { + "epoch": 1.749996805642513, + "grad_norm": 1.6563823223114014, + "learning_rate": 3.82669223893623e-06, + "loss": 0.809, + "step": 273920 
+ }, + { + "epoch": 1.7500606927922517, + "grad_norm": 1.1994155645370483, + "learning_rate": 3.824767286596331e-06, + "loss": 0.7852, + "step": 273930 + }, + { + "epoch": 1.7501245799419904, + "grad_norm": 0.8417662382125854, + "learning_rate": 3.8228427992809625e-06, + "loss": 0.8903, + "step": 273940 + }, + { + "epoch": 1.7501884670917291, + "grad_norm": 0.8921682238578796, + "learning_rate": 3.820918777009525e-06, + "loss": 1.0056, + "step": 273950 + }, + { + "epoch": 1.7502523542414679, + "grad_norm": 0.6824707984924316, + "learning_rate": 3.81899521980138e-06, + "loss": 0.8528, + "step": 273960 + }, + { + "epoch": 1.7503162413912066, + "grad_norm": 0.944797933101654, + "learning_rate": 3.817072127675902e-06, + "loss": 0.9613, + "step": 273970 + }, + { + "epoch": 1.7503801285409453, + "grad_norm": 0.9320737719535828, + "learning_rate": 3.815149500652471e-06, + "loss": 1.1001, + "step": 273980 + }, + { + "epoch": 1.750444015690684, + "grad_norm": 0.78386390209198, + "learning_rate": 3.81322733875043e-06, + "loss": 1.0468, + "step": 273990 + }, + { + "epoch": 1.7505079028404227, + "grad_norm": 1.0317583084106445, + "learning_rate": 3.8113056419891603e-06, + "loss": 0.9644, + "step": 274000 + }, + { + "epoch": 1.7505717899901614, + "grad_norm": 1.138235330581665, + "learning_rate": 3.8093844103879893e-06, + "loss": 0.8534, + "step": 274010 + }, + { + "epoch": 1.7506356771399, + "grad_norm": 0.7695803642272949, + "learning_rate": 3.8074636439662913e-06, + "loss": 0.8002, + "step": 274020 + }, + { + "epoch": 1.7506995642896388, + "grad_norm": 1.170741081237793, + "learning_rate": 3.8055433427433838e-06, + "loss": 1.1794, + "step": 274030 + }, + { + "epoch": 1.7507634514393775, + "grad_norm": 0.96340411901474, + "learning_rate": 3.8036235067386295e-06, + "loss": 0.7233, + "step": 274040 + }, + { + "epoch": 1.7508273385891162, + "grad_norm": 1.1876211166381836, + "learning_rate": 3.80170413597134e-06, + "loss": 0.8274, + "step": 274050 + }, + { + "epoch": 
1.750891225738855, + "grad_norm": 0.652173638343811, + "learning_rate": 3.7997852304608673e-06, + "loss": 0.923, + "step": 274060 + }, + { + "epoch": 1.7509551128885936, + "grad_norm": 0.9098193645477295, + "learning_rate": 3.7978667902265175e-06, + "loss": 0.813, + "step": 274070 + }, + { + "epoch": 1.7510190000383323, + "grad_norm": 2.1511783599853516, + "learning_rate": 3.795948815287631e-06, + "loss": 0.721, + "step": 274080 + }, + { + "epoch": 1.751082887188071, + "grad_norm": 1.1116671562194824, + "learning_rate": 3.794031305663498e-06, + "loss": 1.0658, + "step": 274090 + }, + { + "epoch": 1.7511467743378097, + "grad_norm": 1.0649752616882324, + "learning_rate": 3.7921142613734586e-06, + "loss": 0.8786, + "step": 274100 + }, + { + "epoch": 1.7512106614875484, + "grad_norm": 1.5795048475265503, + "learning_rate": 3.790197682436791e-06, + "loss": 0.915, + "step": 274110 + }, + { + "epoch": 1.7512745486372872, + "grad_norm": 1.1178737878799438, + "learning_rate": 3.7882815688728203e-06, + "loss": 0.6936, + "step": 274120 + }, + { + "epoch": 1.7513384357870256, + "grad_norm": 1.1777743101119995, + "learning_rate": 3.786365920700824e-06, + "loss": 0.8257, + "step": 274130 + }, + { + "epoch": 1.7514023229367646, + "grad_norm": 0.7389242649078369, + "learning_rate": 3.784450737940115e-06, + "loss": 0.9016, + "step": 274140 + }, + { + "epoch": 1.751466210086503, + "grad_norm": 0.9251654744148254, + "learning_rate": 3.7825360206099556e-06, + "loss": 0.6585, + "step": 274150 + }, + { + "epoch": 1.751530097236242, + "grad_norm": 1.0728000402450562, + "learning_rate": 3.780621768729642e-06, + "loss": 0.8943, + "step": 274160 + }, + { + "epoch": 1.7515939843859805, + "grad_norm": 1.1792503595352173, + "learning_rate": 3.7787079823184635e-06, + "loss": 0.724, + "step": 274170 + }, + { + "epoch": 1.7516578715357194, + "grad_norm": 0.9268394708633423, + "learning_rate": 3.7767946613956775e-06, + "loss": 0.8984, + "step": 274180 + }, + { + "epoch": 1.7517217586854579, + 
"grad_norm": 0.8403618931770325, + "learning_rate": 3.774881805980562e-06, + "loss": 0.9804, + "step": 274190 + }, + { + "epoch": 1.7517856458351968, + "grad_norm": 1.1892706155776978, + "learning_rate": 3.77296941609237e-06, + "loss": 0.9623, + "step": 274200 + }, + { + "epoch": 1.7518495329849353, + "grad_norm": 0.9425559639930725, + "learning_rate": 3.7710574917503736e-06, + "loss": 0.7345, + "step": 274210 + }, + { + "epoch": 1.7519134201346742, + "grad_norm": 0.986301064491272, + "learning_rate": 3.769146032973819e-06, + "loss": 0.7878, + "step": 274220 + }, + { + "epoch": 1.7519773072844127, + "grad_norm": 1.136167049407959, + "learning_rate": 3.7672350397819633e-06, + "loss": 0.7139, + "step": 274230 + }, + { + "epoch": 1.7520411944341516, + "grad_norm": 1.2224135398864746, + "learning_rate": 3.7653245121940406e-06, + "loss": 0.9626, + "step": 274240 + }, + { + "epoch": 1.7521050815838901, + "grad_norm": 1.4254239797592163, + "learning_rate": 3.763414450229308e-06, + "loss": 0.8709, + "step": 274250 + }, + { + "epoch": 1.752168968733629, + "grad_norm": 1.0309650897979736, + "learning_rate": 3.761504853906983e-06, + "loss": 0.8792, + "step": 274260 + }, + { + "epoch": 1.7522328558833675, + "grad_norm": 1.1323344707489014, + "learning_rate": 3.7595957232463174e-06, + "loss": 1.0123, + "step": 274270 + }, + { + "epoch": 1.7522967430331065, + "grad_norm": 0.822934091091156, + "learning_rate": 3.7576870582665125e-06, + "loss": 0.6937, + "step": 274280 + }, + { + "epoch": 1.752360630182845, + "grad_norm": 1.0567173957824707, + "learning_rate": 3.755778858986819e-06, + "loss": 0.741, + "step": 274290 + }, + { + "epoch": 1.7524245173325839, + "grad_norm": 0.784637451171875, + "learning_rate": 3.753871125426428e-06, + "loss": 0.7805, + "step": 274300 + }, + { + "epoch": 1.7524884044823223, + "grad_norm": 1.0519686937332153, + "learning_rate": 3.751963857604568e-06, + "loss": 0.8545, + "step": 274310 + }, + { + "epoch": 1.7525522916320613, + "grad_norm": 
0.5889614224433899, + "learning_rate": 3.75005705554044e-06, + "loss": 1.0286, + "step": 274320 + }, + { + "epoch": 1.7526161787817998, + "grad_norm": 0.8073184490203857, + "learning_rate": 3.7481507192532574e-06, + "loss": 0.8111, + "step": 274330 + }, + { + "epoch": 1.7526800659315387, + "grad_norm": 0.8798998594284058, + "learning_rate": 3.7462448487621982e-06, + "loss": 1.0912, + "step": 274340 + }, + { + "epoch": 1.7527439530812772, + "grad_norm": 0.648949384689331, + "learning_rate": 3.7443394440864755e-06, + "loss": 0.7207, + "step": 274350 + }, + { + "epoch": 1.7528078402310159, + "grad_norm": 1.0135072469711304, + "learning_rate": 3.742434505245268e-06, + "loss": 0.9531, + "step": 274360 + }, + { + "epoch": 1.7528717273807546, + "grad_norm": 0.8730331659317017, + "learning_rate": 3.7405300322577607e-06, + "loss": 0.6867, + "step": 274370 + }, + { + "epoch": 1.7529356145304933, + "grad_norm": 1.0686019659042358, + "learning_rate": 3.7386260251431494e-06, + "loss": 0.8929, + "step": 274380 + }, + { + "epoch": 1.752999501680232, + "grad_norm": 1.0393273830413818, + "learning_rate": 3.7367224839205796e-06, + "loss": 0.7427, + "step": 274390 + }, + { + "epoch": 1.7530633888299707, + "grad_norm": 0.8225563764572144, + "learning_rate": 3.734819408609258e-06, + "loss": 0.8183, + "step": 274400 + }, + { + "epoch": 1.7531272759797094, + "grad_norm": 1.018921971321106, + "learning_rate": 3.7329167992283143e-06, + "loss": 0.8351, + "step": 274410 + }, + { + "epoch": 1.7531911631294481, + "grad_norm": 0.7754051685333252, + "learning_rate": 3.731014655796933e-06, + "loss": 0.6592, + "step": 274420 + }, + { + "epoch": 1.7532550502791868, + "grad_norm": 0.7325241565704346, + "learning_rate": 3.729112978334254e-06, + "loss": 0.8715, + "step": 274430 + }, + { + "epoch": 1.7533189374289255, + "grad_norm": 0.9066896438598633, + "learning_rate": 3.727211766859445e-06, + "loss": 0.7465, + "step": 274440 + }, + { + "epoch": 1.7533828245786642, + "grad_norm": 0.7925408482551575, 
+ "learning_rate": 3.7253110213916365e-06, + "loss": 0.7522, + "step": 274450 + }, + { + "epoch": 1.753446711728403, + "grad_norm": 1.0522191524505615, + "learning_rate": 3.72341074194999e-06, + "loss": 0.9043, + "step": 274460 + }, + { + "epoch": 1.7535105988781416, + "grad_norm": 0.9948369860649109, + "learning_rate": 3.721510928553623e-06, + "loss": 0.954, + "step": 274470 + }, + { + "epoch": 1.7535744860278804, + "grad_norm": 0.7237744927406311, + "learning_rate": 3.719611581221688e-06, + "loss": 0.5447, + "step": 274480 + }, + { + "epoch": 1.753638373177619, + "grad_norm": 0.5895835161209106, + "learning_rate": 3.7177126999732913e-06, + "loss": 0.7027, + "step": 274490 + }, + { + "epoch": 1.7537022603273578, + "grad_norm": 0.9909476041793823, + "learning_rate": 3.715814284827568e-06, + "loss": 0.6463, + "step": 274500 + }, + { + "epoch": 1.7537661474770965, + "grad_norm": 0.7529990673065186, + "learning_rate": 3.7139163358036464e-06, + "loss": 0.8121, + "step": 274510 + }, + { + "epoch": 1.7538300346268352, + "grad_norm": 0.7453489899635315, + "learning_rate": 3.712018852920618e-06, + "loss": 0.6555, + "step": 274520 + }, + { + "epoch": 1.7538939217765739, + "grad_norm": 2.500941753387451, + "learning_rate": 3.7101218361976165e-06, + "loss": 0.9852, + "step": 274530 + }, + { + "epoch": 1.7539578089263126, + "grad_norm": 1.3202393054962158, + "learning_rate": 3.708225285653727e-06, + "loss": 0.829, + "step": 274540 + }, + { + "epoch": 1.7540216960760513, + "grad_norm": 1.2711588144302368, + "learning_rate": 3.706329201308062e-06, + "loss": 1.165, + "step": 274550 + }, + { + "epoch": 1.75408558322579, + "grad_norm": 1.3019556999206543, + "learning_rate": 3.704433583179706e-06, + "loss": 0.9572, + "step": 274560 + }, + { + "epoch": 1.7541494703755287, + "grad_norm": 1.163252353668213, + "learning_rate": 3.7025384312877607e-06, + "loss": 0.8862, + "step": 274570 + }, + { + "epoch": 1.7542133575252674, + "grad_norm": 1.1229562759399414, + "learning_rate": 
3.7006437456513e-06, + "loss": 0.8069, + "step": 274580 + }, + { + "epoch": 1.7542772446750061, + "grad_norm": 1.851560354232788, + "learning_rate": 3.698749526289419e-06, + "loss": 1.021, + "step": 274590 + }, + { + "epoch": 1.7543411318247448, + "grad_norm": 0.8247557282447815, + "learning_rate": 3.6968557732211752e-06, + "loss": 0.8604, + "step": 274600 + }, + { + "epoch": 1.7544050189744835, + "grad_norm": 0.7716491222381592, + "learning_rate": 3.694962486465664e-06, + "loss": 0.9591, + "step": 274610 + }, + { + "epoch": 1.754468906124222, + "grad_norm": 2.111315965652466, + "learning_rate": 3.6930696660419317e-06, + "loss": 1.0666, + "step": 274620 + }, + { + "epoch": 1.754532793273961, + "grad_norm": 2.500415086746216, + "learning_rate": 3.6911773119690572e-06, + "loss": 0.988, + "step": 274630 + }, + { + "epoch": 1.7545966804236994, + "grad_norm": 1.3053700923919678, + "learning_rate": 3.689285424266081e-06, + "loss": 0.9789, + "step": 274640 + }, + { + "epoch": 1.7546605675734384, + "grad_norm": 0.8793125152587891, + "learning_rate": 3.687394002952077e-06, + "loss": 1.0954, + "step": 274650 + }, + { + "epoch": 1.7547244547231768, + "grad_norm": 0.8418011665344238, + "learning_rate": 3.6855030480460686e-06, + "loss": 0.8198, + "step": 274660 + }, + { + "epoch": 1.7547883418729158, + "grad_norm": 1.2235418558120728, + "learning_rate": 3.6836125595671234e-06, + "loss": 0.7422, + "step": 274670 + }, + { + "epoch": 1.7548522290226543, + "grad_norm": 0.7017014026641846, + "learning_rate": 3.6817225375342603e-06, + "loss": 0.7226, + "step": 274680 + }, + { + "epoch": 1.7549161161723932, + "grad_norm": 1.1416294574737549, + "learning_rate": 3.679832981966519e-06, + "loss": 0.7806, + "step": 274690 + }, + { + "epoch": 1.7549800033221317, + "grad_norm": 1.0321959257125854, + "learning_rate": 3.6779438928829403e-06, + "loss": 1.0022, + "step": 274700 + }, + { + "epoch": 1.7550438904718706, + "grad_norm": 0.8865734338760376, + "learning_rate": 3.6760552703025364e-06, + 
"loss": 1.2421, + "step": 274710 + }, + { + "epoch": 1.755107777621609, + "grad_norm": 0.9447980523109436, + "learning_rate": 3.674167114244342e-06, + "loss": 0.9127, + "step": 274720 + }, + { + "epoch": 1.755171664771348, + "grad_norm": 0.7748607397079468, + "learning_rate": 3.6722794247273483e-06, + "loss": 0.7871, + "step": 274730 + }, + { + "epoch": 1.7552355519210865, + "grad_norm": 0.8334684371948242, + "learning_rate": 3.6703922017705895e-06, + "loss": 0.8679, + "step": 274740 + }, + { + "epoch": 1.7552994390708254, + "grad_norm": 1.1279202699661255, + "learning_rate": 3.6685054453930558e-06, + "loss": 0.7924, + "step": 274750 + }, + { + "epoch": 1.755363326220564, + "grad_norm": 0.8488116264343262, + "learning_rate": 3.666619155613765e-06, + "loss": 0.8025, + "step": 274760 + }, + { + "epoch": 1.7554272133703028, + "grad_norm": 0.7714635729789734, + "learning_rate": 3.664733332451692e-06, + "loss": 0.7927, + "step": 274770 + }, + { + "epoch": 1.7554911005200413, + "grad_norm": 0.9155358672142029, + "learning_rate": 3.6628479759258485e-06, + "loss": 0.8599, + "step": 274780 + }, + { + "epoch": 1.7555549876697802, + "grad_norm": 0.6552935838699341, + "learning_rate": 3.6609630860552023e-06, + "loss": 1.0248, + "step": 274790 + }, + { + "epoch": 1.7556188748195187, + "grad_norm": 1.1390666961669922, + "learning_rate": 3.6590786628587615e-06, + "loss": 0.9757, + "step": 274800 + }, + { + "epoch": 1.7556827619692577, + "grad_norm": 0.7530357837677002, + "learning_rate": 3.657194706355477e-06, + "loss": 0.7806, + "step": 274810 + }, + { + "epoch": 1.7557466491189961, + "grad_norm": 0.8142558932304382, + "learning_rate": 3.6553112165643387e-06, + "loss": 0.9117, + "step": 274820 + }, + { + "epoch": 1.755810536268735, + "grad_norm": 1.016108751296997, + "learning_rate": 3.65342819350431e-06, + "loss": 0.8177, + "step": 274830 + }, + { + "epoch": 1.7558744234184736, + "grad_norm": 1.1278547048568726, + "learning_rate": 3.651545637194359e-06, + "loss": 0.7878, + 
"step": 274840 + }, + { + "epoch": 1.7559383105682123, + "grad_norm": 1.5448342561721802, + "learning_rate": 3.6496635476534313e-06, + "loss": 0.7154, + "step": 274850 + }, + { + "epoch": 1.756002197717951, + "grad_norm": 1.060778260231018, + "learning_rate": 3.6477819249005007e-06, + "loss": 0.9649, + "step": 274860 + }, + { + "epoch": 1.7560660848676897, + "grad_norm": 1.027547836303711, + "learning_rate": 3.645900768954497e-06, + "loss": 1.0678, + "step": 274870 + }, + { + "epoch": 1.7561299720174284, + "grad_norm": 0.8141332268714905, + "learning_rate": 3.6440200798343815e-06, + "loss": 0.9652, + "step": 274880 + }, + { + "epoch": 1.756193859167167, + "grad_norm": 0.9573503136634827, + "learning_rate": 3.6421398575590903e-06, + "loss": 0.9573, + "step": 274890 + }, + { + "epoch": 1.7562577463169058, + "grad_norm": 1.0918097496032715, + "learning_rate": 3.640260102147547e-06, + "loss": 0.9735, + "step": 274900 + }, + { + "epoch": 1.7563216334666445, + "grad_norm": 1.0628275871276855, + "learning_rate": 3.638380813618697e-06, + "loss": 0.8058, + "step": 274910 + }, + { + "epoch": 1.7563855206163832, + "grad_norm": 1.0889729261398315, + "learning_rate": 3.6365019919914533e-06, + "loss": 1.0859, + "step": 274920 + }, + { + "epoch": 1.756449407766122, + "grad_norm": 1.5826724767684937, + "learning_rate": 3.6346236372847564e-06, + "loss": 0.7498, + "step": 274930 + }, + { + "epoch": 1.7565132949158606, + "grad_norm": 0.85593581199646, + "learning_rate": 3.632745749517502e-06, + "loss": 0.9096, + "step": 274940 + }, + { + "epoch": 1.7565771820655993, + "grad_norm": 1.0470643043518066, + "learning_rate": 3.6308683287086143e-06, + "loss": 0.8088, + "step": 274950 + }, + { + "epoch": 1.756641069215338, + "grad_norm": 0.9474973678588867, + "learning_rate": 3.6289913748769944e-06, + "loss": 0.816, + "step": 274960 + }, + { + "epoch": 1.7567049563650767, + "grad_norm": 0.6857970356941223, + "learning_rate": 3.627114888041555e-06, + "loss": 0.9826, + "step": 274970 + }, + { 
+ "epoch": 1.7567688435148154, + "grad_norm": 0.4515606760978699, + "learning_rate": 3.6252388682211815e-06, + "loss": 0.9802, + "step": 274980 + }, + { + "epoch": 1.7568327306645541, + "grad_norm": 1.347015380859375, + "learning_rate": 3.6233633154347747e-06, + "loss": 0.9361, + "step": 274990 + }, + { + "epoch": 1.7568966178142928, + "grad_norm": 1.2522832155227661, + "learning_rate": 3.62148822970122e-06, + "loss": 0.6884, + "step": 275000 + }, + { + "epoch": 1.7569605049640316, + "grad_norm": 0.6121107935905457, + "learning_rate": 3.6196136110393963e-06, + "loss": 0.57, + "step": 275010 + }, + { + "epoch": 1.7570243921137703, + "grad_norm": 0.9248584508895874, + "learning_rate": 3.6177394594682e-06, + "loss": 0.8776, + "step": 275020 + }, + { + "epoch": 1.757088279263509, + "grad_norm": 2.5324671268463135, + "learning_rate": 3.6158657750064828e-06, + "loss": 0.6613, + "step": 275030 + }, + { + "epoch": 1.7571521664132477, + "grad_norm": 2.7908589839935303, + "learning_rate": 3.613992557673135e-06, + "loss": 1.0586, + "step": 275040 + }, + { + "epoch": 1.7572160535629864, + "grad_norm": 0.8149098753929138, + "learning_rate": 3.612119807487002e-06, + "loss": 0.747, + "step": 275050 + }, + { + "epoch": 1.757279940712725, + "grad_norm": 0.5843006372451782, + "learning_rate": 3.610247524466959e-06, + "loss": 0.8862, + "step": 275060 + }, + { + "epoch": 1.7573438278624638, + "grad_norm": 1.139170527458191, + "learning_rate": 3.608375708631856e-06, + "loss": 0.7829, + "step": 275070 + }, + { + "epoch": 1.7574077150122025, + "grad_norm": 1.3390437364578247, + "learning_rate": 3.6065043600005453e-06, + "loss": 0.8041, + "step": 275080 + }, + { + "epoch": 1.757471602161941, + "grad_norm": 0.9355593919754028, + "learning_rate": 3.604633478591868e-06, + "loss": 0.6621, + "step": 275090 + }, + { + "epoch": 1.75753548931168, + "grad_norm": NaN, + "learning_rate": 3.6029500848149933e-06, + "loss": 0.9683, + "step": 275100 + }, + { + "epoch": 1.7575993764614184, + "grad_norm": 
0.6551210880279541, + "learning_rate": 3.6010800911812303e-06, + "loss": 0.92, + "step": 275110 + }, + { + "epoch": 1.7576632636111573, + "grad_norm": 0.9354518055915833, + "learning_rate": 3.59921056482474e-06, + "loss": 0.792, + "step": 275120 + }, + { + "epoch": 1.7577271507608958, + "grad_norm": 0.7017722725868225, + "learning_rate": 3.597341505764329e-06, + "loss": 0.807, + "step": 275130 + }, + { + "epoch": 1.7577910379106347, + "grad_norm": 1.1099634170532227, + "learning_rate": 3.595472914018838e-06, + "loss": 1.1256, + "step": 275140 + }, + { + "epoch": 1.7578549250603732, + "grad_norm": 0.9013046026229858, + "learning_rate": 3.5936047896070856e-06, + "loss": 0.7302, + "step": 275150 + }, + { + "epoch": 1.7579188122101121, + "grad_norm": 0.8545605540275574, + "learning_rate": 3.591737132547873e-06, + "loss": 0.7842, + "step": 275160 + }, + { + "epoch": 1.7579826993598506, + "grad_norm": 0.887650728225708, + "learning_rate": 3.5898699428600245e-06, + "loss": 1.1057, + "step": 275170 + }, + { + "epoch": 1.7580465865095896, + "grad_norm": 0.7681732177734375, + "learning_rate": 3.5880032205623304e-06, + "loss": 0.8383, + "step": 275180 + }, + { + "epoch": 1.758110473659328, + "grad_norm": 1.0179623365402222, + "learning_rate": 3.5861369656736034e-06, + "loss": 0.8732, + "step": 275190 + }, + { + "epoch": 1.758174360809067, + "grad_norm": 0.8366377353668213, + "learning_rate": 3.584271178212617e-06, + "loss": 0.7925, + "step": 275200 + }, + { + "epoch": 1.7582382479588055, + "grad_norm": 1.0721460580825806, + "learning_rate": 3.58240585819819e-06, + "loss": 0.7945, + "step": 275210 + }, + { + "epoch": 1.7583021351085444, + "grad_norm": 1.5178515911102295, + "learning_rate": 3.580541005649085e-06, + "loss": 0.6637, + "step": 275220 + }, + { + "epoch": 1.7583660222582829, + "grad_norm": 1.2210355997085571, + "learning_rate": 3.5786766205840928e-06, + "loss": 0.7637, + "step": 275230 + }, + { + "epoch": 1.7584299094080218, + "grad_norm": 0.9875472187995911, + 
"learning_rate": 3.576812703021981e-06, + "loss": 0.9334, + "step": 275240 + }, + { + "epoch": 1.7584937965577603, + "grad_norm": 1.1672228574752808, + "learning_rate": 3.5749492529815407e-06, + "loss": 0.8829, + "step": 275250 + }, + { + "epoch": 1.7585576837074992, + "grad_norm": 0.7068379521369934, + "learning_rate": 3.5730862704815125e-06, + "loss": 0.7711, + "step": 275260 + }, + { + "epoch": 1.7586215708572377, + "grad_norm": 1.5251413583755493, + "learning_rate": 3.5712237555406756e-06, + "loss": 1.1443, + "step": 275270 + }, + { + "epoch": 1.7586854580069766, + "grad_norm": 0.8467580080032349, + "learning_rate": 3.569361708177776e-06, + "loss": 0.7313, + "step": 275280 + }, + { + "epoch": 1.758749345156715, + "grad_norm": 0.9554235935211182, + "learning_rate": 3.567500128411583e-06, + "loss": 1.1889, + "step": 275290 + }, + { + "epoch": 1.758813232306454, + "grad_norm": 1.2097324132919312, + "learning_rate": 3.5656390162608246e-06, + "loss": 0.9851, + "step": 275300 + }, + { + "epoch": 1.7588771194561925, + "grad_norm": 0.9549233913421631, + "learning_rate": 3.5637783717442587e-06, + "loss": 1.0911, + "step": 275310 + }, + { + "epoch": 1.7589410066059314, + "grad_norm": 0.894060492515564, + "learning_rate": 3.561918194880609e-06, + "loss": 1.0995, + "step": 275320 + }, + { + "epoch": 1.75900489375567, + "grad_norm": 1.4603536128997803, + "learning_rate": 3.5600584856886277e-06, + "loss": 0.6686, + "step": 275330 + }, + { + "epoch": 1.7590687809054086, + "grad_norm": 0.8984410166740417, + "learning_rate": 3.5581992441870273e-06, + "loss": 0.7872, + "step": 275340 + }, + { + "epoch": 1.7591326680551473, + "grad_norm": 0.9645366668701172, + "learning_rate": 3.5563404703945367e-06, + "loss": 0.8436, + "step": 275350 + }, + { + "epoch": 1.759196555204886, + "grad_norm": 2.0763237476348877, + "learning_rate": 3.5544821643298864e-06, + "loss": 0.6756, + "step": 275360 + }, + { + "epoch": 1.7592604423546248, + "grad_norm": 1.1000155210494995, + "learning_rate": 
3.552624326011772e-06, + "loss": 0.9746, + "step": 275370 + }, + { + "epoch": 1.7593243295043635, + "grad_norm": 0.747199296951294, + "learning_rate": 3.550766955458923e-06, + "loss": 0.7532, + "step": 275380 + }, + { + "epoch": 1.7593882166541022, + "grad_norm": 1.1015061140060425, + "learning_rate": 3.5489100526900244e-06, + "loss": 1.1804, + "step": 275390 + }, + { + "epoch": 1.7594521038038409, + "grad_norm": 0.7550133466720581, + "learning_rate": 3.5470536177238e-06, + "loss": 0.9128, + "step": 275400 + }, + { + "epoch": 1.7595159909535796, + "grad_norm": 0.7676512002944946, + "learning_rate": 3.5451976505789188e-06, + "loss": 0.8771, + "step": 275410 + }, + { + "epoch": 1.7595798781033183, + "grad_norm": 0.9398499131202698, + "learning_rate": 3.5433421512740983e-06, + "loss": 0.7606, + "step": 275420 + }, + { + "epoch": 1.759643765253057, + "grad_norm": 0.7365298867225647, + "learning_rate": 3.5414871198280076e-06, + "loss": 0.7261, + "step": 275430 + }, + { + "epoch": 1.7597076524027957, + "grad_norm": 1.3296804428100586, + "learning_rate": 3.539632556259337e-06, + "loss": 0.9213, + "step": 275440 + }, + { + "epoch": 1.7597715395525344, + "grad_norm": 0.9793141484260559, + "learning_rate": 3.5377784605867546e-06, + "loss": 0.9185, + "step": 275450 + }, + { + "epoch": 1.759835426702273, + "grad_norm": 1.0745623111724854, + "learning_rate": 3.5359248328289464e-06, + "loss": 0.9798, + "step": 275460 + }, + { + "epoch": 1.7598993138520118, + "grad_norm": 0.9916425943374634, + "learning_rate": 3.534071673004563e-06, + "loss": 0.7789, + "step": 275470 + }, + { + "epoch": 1.7599632010017505, + "grad_norm": 1.2632156610488892, + "learning_rate": 3.532218981132285e-06, + "loss": 0.8559, + "step": 275480 + }, + { + "epoch": 1.7600270881514892, + "grad_norm": 0.8439663052558899, + "learning_rate": 3.5303667572307573e-06, + "loss": 0.7617, + "step": 275490 + }, + { + "epoch": 1.760090975301228, + "grad_norm": 1.366858959197998, + "learning_rate": 3.528515001318644e-06, 
+ "loss": 0.8881, + "step": 275500 + }, + { + "epoch": 1.7601548624509666, + "grad_norm": 1.0938761234283447, + "learning_rate": 3.52666371341458e-06, + "loss": 0.9536, + "step": 275510 + }, + { + "epoch": 1.7602187496007053, + "grad_norm": 1.4395384788513184, + "learning_rate": 3.524812893537227e-06, + "loss": 0.8025, + "step": 275520 + }, + { + "epoch": 1.760282636750444, + "grad_norm": 2.9542572498321533, + "learning_rate": 3.5229625417052104e-06, + "loss": 0.9451, + "step": 275530 + }, + { + "epoch": 1.7603465239001828, + "grad_norm": 0.8342198133468628, + "learning_rate": 3.5211126579371646e-06, + "loss": 0.9168, + "step": 275540 + }, + { + "epoch": 1.7604104110499215, + "grad_norm": 0.983436107635498, + "learning_rate": 3.519263242251736e-06, + "loss": 1.034, + "step": 275550 + }, + { + "epoch": 1.7604742981996602, + "grad_norm": 0.7490174770355225, + "learning_rate": 3.517414294667526e-06, + "loss": 0.7333, + "step": 275560 + }, + { + "epoch": 1.7605381853493989, + "grad_norm": 1.718109130859375, + "learning_rate": 3.5155658152031757e-06, + "loss": 0.9025, + "step": 275570 + }, + { + "epoch": 1.7606020724991374, + "grad_norm": 0.7378758788108826, + "learning_rate": 3.513717803877281e-06, + "loss": 0.8956, + "step": 275580 + }, + { + "epoch": 1.7606659596488763, + "grad_norm": 1.3051201105117798, + "learning_rate": 3.5118702607084773e-06, + "loss": 0.9403, + "step": 275590 + }, + { + "epoch": 1.7607298467986148, + "grad_norm": 0.6553983688354492, + "learning_rate": 3.51002318571535e-06, + "loss": 1.1106, + "step": 275600 + }, + { + "epoch": 1.7607937339483537, + "grad_norm": 0.7879045009613037, + "learning_rate": 3.5081765789165164e-06, + "loss": 0.6659, + "step": 275610 + }, + { + "epoch": 1.7608576210980922, + "grad_norm": 0.8307299613952637, + "learning_rate": 3.5063304403305577e-06, + "loss": 0.7752, + "step": 275620 + }, + { + "epoch": 1.7609215082478311, + "grad_norm": 1.0239109992980957, + "learning_rate": 3.5044847699760795e-06, + "loss": 0.7822, + 
"step": 275630 + }, + { + "epoch": 1.7609853953975696, + "grad_norm": 2.6831319332122803, + "learning_rate": 3.5026395678716572e-06, + "loss": 0.8854, + "step": 275640 + }, + { + "epoch": 1.7610492825473085, + "grad_norm": 0.6408683061599731, + "learning_rate": 3.5007948340358864e-06, + "loss": 1.0743, + "step": 275650 + }, + { + "epoch": 1.761113169697047, + "grad_norm": 0.7142156958580017, + "learning_rate": 3.498950568487336e-06, + "loss": 0.9547, + "step": 275660 + }, + { + "epoch": 1.761177056846786, + "grad_norm": 1.1051113605499268, + "learning_rate": 3.497106771244579e-06, + "loss": 0.8606, + "step": 275670 + }, + { + "epoch": 1.7612409439965244, + "grad_norm": 1.0908756256103516, + "learning_rate": 3.4952634423261966e-06, + "loss": 0.639, + "step": 275680 + }, + { + "epoch": 1.7613048311462633, + "grad_norm": 1.1603355407714844, + "learning_rate": 3.493420581750739e-06, + "loss": 0.7543, + "step": 275690 + }, + { + "epoch": 1.7613687182960018, + "grad_norm": 0.5926783084869385, + "learning_rate": 3.491578189536776e-06, + "loss": 1.0203, + "step": 275700 + }, + { + "epoch": 1.7614326054457408, + "grad_norm": 0.939707338809967, + "learning_rate": 3.4897362657028476e-06, + "loss": 0.8937, + "step": 275710 + }, + { + "epoch": 1.7614964925954792, + "grad_norm": 1.0953004360198975, + "learning_rate": 3.487894810267517e-06, + "loss": 0.8824, + "step": 275720 + }, + { + "epoch": 1.7615603797452182, + "grad_norm": 1.258279800415039, + "learning_rate": 3.486053823249319e-06, + "loss": 0.9268, + "step": 275730 + }, + { + "epoch": 1.7616242668949567, + "grad_norm": 0.9371123909950256, + "learning_rate": 3.4842133046668055e-06, + "loss": 0.9814, + "step": 275740 + }, + { + "epoch": 1.7616881540446956, + "grad_norm": 2.118590831756592, + "learning_rate": 3.4823732545385013e-06, + "loss": 0.9614, + "step": 275750 + }, + { + "epoch": 1.761752041194434, + "grad_norm": 0.7608202695846558, + "learning_rate": 3.4805336728829464e-06, + "loss": 1.0091, + "step": 275760 + }, + { 
+ "epoch": 1.761815928344173, + "grad_norm": 0.9030619263648987, + "learning_rate": 3.4786945597186593e-06, + "loss": 0.8409, + "step": 275770 + }, + { + "epoch": 1.7618798154939115, + "grad_norm": 1.2993314266204834, + "learning_rate": 3.47685591506417e-06, + "loss": 1.0666, + "step": 275780 + }, + { + "epoch": 1.7619437026436504, + "grad_norm": 2.566411256790161, + "learning_rate": 3.47501773893798e-06, + "loss": 0.8391, + "step": 275790 + }, + { + "epoch": 1.762007589793389, + "grad_norm": 1.1588977575302124, + "learning_rate": 3.4731800313586195e-06, + "loss": 0.7433, + "step": 275800 + }, + { + "epoch": 1.7620714769431278, + "grad_norm": 1.8010987043380737, + "learning_rate": 3.471342792344584e-06, + "loss": 1.0248, + "step": 275810 + }, + { + "epoch": 1.7621353640928663, + "grad_norm": 0.8282926082611084, + "learning_rate": 3.4695060219143815e-06, + "loss": 0.8259, + "step": 275820 + }, + { + "epoch": 1.762199251242605, + "grad_norm": 1.285724401473999, + "learning_rate": 3.4676697200865025e-06, + "loss": 0.8297, + "step": 275830 + }, + { + "epoch": 1.7622631383923437, + "grad_norm": 1.0171455144882202, + "learning_rate": 3.465833886879455e-06, + "loss": 0.8694, + "step": 275840 + }, + { + "epoch": 1.7623270255420824, + "grad_norm": 1.3438469171524048, + "learning_rate": 3.4639985223117066e-06, + "loss": 0.9486, + "step": 275850 + }, + { + "epoch": 1.7623909126918211, + "grad_norm": 0.8757327795028687, + "learning_rate": 3.4621636264017596e-06, + "loss": 1.0706, + "step": 275860 + }, + { + "epoch": 1.7624547998415598, + "grad_norm": 0.7441837787628174, + "learning_rate": 3.4603291991680885e-06, + "loss": 0.8272, + "step": 275870 + }, + { + "epoch": 1.7625186869912985, + "grad_norm": 0.8464133739471436, + "learning_rate": 3.458495240629156e-06, + "loss": 1.1859, + "step": 275880 + }, + { + "epoch": 1.7625825741410373, + "grad_norm": 0.8027544617652893, + "learning_rate": 3.456661750803447e-06, + "loss": 0.7095, + "step": 275890 + }, + { + "epoch": 
1.762646461290776, + "grad_norm": 0.8621078729629517, + "learning_rate": 3.4548287297094196e-06, + "loss": 0.8539, + "step": 275900 + }, + { + "epoch": 1.7627103484405147, + "grad_norm": 1.7479239702224731, + "learning_rate": 3.452996177365536e-06, + "loss": 0.6586, + "step": 275910 + }, + { + "epoch": 1.7627742355902534, + "grad_norm": 1.0658780336380005, + "learning_rate": 3.4511640937902434e-06, + "loss": 0.6436, + "step": 275920 + }, + { + "epoch": 1.762838122739992, + "grad_norm": 0.9121516942977905, + "learning_rate": 3.4493324790020044e-06, + "loss": 0.9354, + "step": 275930 + }, + { + "epoch": 1.7629020098897308, + "grad_norm": 0.8368337750434875, + "learning_rate": 3.447501333019254e-06, + "loss": 0.8779, + "step": 275940 + }, + { + "epoch": 1.7629658970394695, + "grad_norm": 0.9886699318885803, + "learning_rate": 3.4456706558604502e-06, + "loss": 0.8241, + "step": 275950 + }, + { + "epoch": 1.7630297841892082, + "grad_norm": 0.7883988618850708, + "learning_rate": 3.4438404475440055e-06, + "loss": 0.7167, + "step": 275960 + }, + { + "epoch": 1.763093671338947, + "grad_norm": 0.9630401134490967, + "learning_rate": 3.4420107080883723e-06, + "loss": 0.7229, + "step": 275970 + }, + { + "epoch": 1.7631575584886856, + "grad_norm": 1.144049048423767, + "learning_rate": 3.4401814375119632e-06, + "loss": 1.0576, + "step": 275980 + }, + { + "epoch": 1.7632214456384243, + "grad_norm": 0.8971433043479919, + "learning_rate": 3.4383526358332142e-06, + "loss": 0.8893, + "step": 275990 + }, + { + "epoch": 1.763285332788163, + "grad_norm": 0.8197485208511353, + "learning_rate": 3.4365243030705373e-06, + "loss": 0.9808, + "step": 276000 + }, + { + "epoch": 1.7633492199379017, + "grad_norm": 1.0844979286193848, + "learning_rate": 3.4346964392423296e-06, + "loss": 1.062, + "step": 276010 + }, + { + "epoch": 1.7634131070876404, + "grad_norm": 0.7812053561210632, + "learning_rate": 3.432869044367032e-06, + "loss": 0.8334, + "step": 276020 + }, + { + "epoch": 1.7634769942373791, 
+ "grad_norm": 0.8636895418167114, + "learning_rate": 3.4310421184630127e-06, + "loss": 0.629, + "step": 276030 + }, + { + "epoch": 1.7635408813871178, + "grad_norm": 0.8970896005630493, + "learning_rate": 3.4292156615487013e-06, + "loss": 0.8893, + "step": 276040 + }, + { + "epoch": 1.7636047685368565, + "grad_norm": 1.0095678567886353, + "learning_rate": 3.4273896736424615e-06, + "loss": 0.7679, + "step": 276050 + }, + { + "epoch": 1.7636686556865953, + "grad_norm": 1.979683518409729, + "learning_rate": 3.4255641547627172e-06, + "loss": 0.9901, + "step": 276060 + }, + { + "epoch": 1.7637325428363337, + "grad_norm": 1.5521012544631958, + "learning_rate": 3.42373910492782e-06, + "loss": 1.0343, + "step": 276070 + }, + { + "epoch": 1.7637964299860727, + "grad_norm": 1.0704764127731323, + "learning_rate": 3.4219145241561725e-06, + "loss": 0.7701, + "step": 276080 + }, + { + "epoch": 1.7638603171358112, + "grad_norm": 0.7185716032981873, + "learning_rate": 3.4200904124661316e-06, + "loss": 0.629, + "step": 276090 + }, + { + "epoch": 1.76392420428555, + "grad_norm": 1.0846500396728516, + "learning_rate": 3.4182667698760883e-06, + "loss": 1.2639, + "step": 276100 + }, + { + "epoch": 1.7639880914352886, + "grad_norm": 1.1522587537765503, + "learning_rate": 3.4164435964043894e-06, + "loss": 1.037, + "step": 276110 + }, + { + "epoch": 1.7640519785850275, + "grad_norm": 1.6337602138519287, + "learning_rate": 3.4146208920694146e-06, + "loss": 0.7858, + "step": 276120 + }, + { + "epoch": 1.764115865734766, + "grad_norm": 1.0767061710357666, + "learning_rate": 3.412798656889499e-06, + "loss": 0.8875, + "step": 276130 + }, + { + "epoch": 1.764179752884505, + "grad_norm": 1.116227388381958, + "learning_rate": 3.410976890883011e-06, + "loss": 0.9552, + "step": 276140 + }, + { + "epoch": 1.7642436400342434, + "grad_norm": 1.6324573755264282, + "learning_rate": 3.409155594068286e-06, + "loss": 0.8507, + "step": 276150 + }, + { + "epoch": 1.7643075271839823, + "grad_norm": 
1.1252837181091309, + "learning_rate": 3.407334766463677e-06, + "loss": 0.7192, + "step": 276160 + }, + { + "epoch": 1.7643714143337208, + "grad_norm": 0.8622950911521912, + "learning_rate": 3.4055144080875066e-06, + "loss": 0.7325, + "step": 276170 + }, + { + "epoch": 1.7644353014834597, + "grad_norm": 1.1538571119308472, + "learning_rate": 3.403694518958128e-06, + "loss": 1.0452, + "step": 276180 + }, + { + "epoch": 1.7644991886331982, + "grad_norm": 0.7291510701179504, + "learning_rate": 3.401875099093843e-06, + "loss": 1.0454, + "step": 276190 + }, + { + "epoch": 1.7645630757829371, + "grad_norm": 0.843101441860199, + "learning_rate": 3.4000561485129924e-06, + "loss": 1.0167, + "step": 276200 + }, + { + "epoch": 1.7646269629326756, + "grad_norm": 1.3674508333206177, + "learning_rate": 3.3982376672339e-06, + "loss": 1.0741, + "step": 276210 + }, + { + "epoch": 1.7646908500824146, + "grad_norm": 0.8573333621025085, + "learning_rate": 3.3964196552748627e-06, + "loss": 0.9575, + "step": 276220 + }, + { + "epoch": 1.764754737232153, + "grad_norm": 2.657174587249756, + "learning_rate": 3.3946021126542047e-06, + "loss": 1.0709, + "step": 276230 + }, + { + "epoch": 1.764818624381892, + "grad_norm": 0.7357352375984192, + "learning_rate": 3.3927850393902115e-06, + "loss": 0.8728, + "step": 276240 + }, + { + "epoch": 1.7648825115316304, + "grad_norm": 2.2075095176696777, + "learning_rate": 3.3909684355012074e-06, + "loss": 0.7582, + "step": 276250 + }, + { + "epoch": 1.7649463986813694, + "grad_norm": 0.9739333391189575, + "learning_rate": 3.3891523010054606e-06, + "loss": 1.1386, + "step": 276260 + }, + { + "epoch": 1.7650102858311079, + "grad_norm": 1.1989326477050781, + "learning_rate": 3.3873366359212845e-06, + "loss": 0.7894, + "step": 276270 + }, + { + "epoch": 1.7650741729808468, + "grad_norm": 1.4745514392852783, + "learning_rate": 3.385521440266948e-06, + "loss": 0.6456, + "step": 276280 + }, + { + "epoch": 1.7651380601305853, + "grad_norm": 0.7231010794639587, + 
"learning_rate": 3.383706714060747e-06, + "loss": 0.9416, + "step": 276290 + }, + { + "epoch": 1.7652019472803242, + "grad_norm": 0.5068309903144836, + "learning_rate": 3.3818924573209342e-06, + "loss": 1.1731, + "step": 276300 + }, + { + "epoch": 1.7652658344300627, + "grad_norm": 2.7875280380249023, + "learning_rate": 3.380078670065806e-06, + "loss": 0.9234, + "step": 276310 + }, + { + "epoch": 1.7653297215798014, + "grad_norm": 0.622486412525177, + "learning_rate": 3.3782653523136144e-06, + "loss": 0.5415, + "step": 276320 + }, + { + "epoch": 1.76539360872954, + "grad_norm": 0.9856690168380737, + "learning_rate": 3.3764525040826224e-06, + "loss": 0.9147, + "step": 276330 + }, + { + "epoch": 1.7654574958792788, + "grad_norm": 0.8422602415084839, + "learning_rate": 3.3746401253910874e-06, + "loss": 1.0639, + "step": 276340 + }, + { + "epoch": 1.7655213830290175, + "grad_norm": 0.8297490477561951, + "learning_rate": 3.3728282162572676e-06, + "loss": 1.2381, + "step": 276350 + }, + { + "epoch": 1.7655852701787562, + "grad_norm": 0.8892385363578796, + "learning_rate": 3.371016776699404e-06, + "loss": 0.6482, + "step": 276360 + }, + { + "epoch": 1.765649157328495, + "grad_norm": 1.1976794004440308, + "learning_rate": 3.3692058067357425e-06, + "loss": 0.924, + "step": 276370 + }, + { + "epoch": 1.7657130444782336, + "grad_norm": 1.4974721670150757, + "learning_rate": 3.367395306384513e-06, + "loss": 0.9347, + "step": 276380 + }, + { + "epoch": 1.7657769316279723, + "grad_norm": 0.9004227519035339, + "learning_rate": 3.365585275663957e-06, + "loss": 0.7606, + "step": 276390 + }, + { + "epoch": 1.765840818777711, + "grad_norm": 0.8492181897163391, + "learning_rate": 3.36377571459231e-06, + "loss": 0.8315, + "step": 276400 + }, + { + "epoch": 1.7659047059274497, + "grad_norm": 1.402463674545288, + "learning_rate": 3.3619666231877846e-06, + "loss": 0.9241, + "step": 276410 + }, + { + "epoch": 1.7659685930771885, + "grad_norm": 0.6081668138504028, + "learning_rate": 
3.3601580014686053e-06, + "loss": 0.8031, + "step": 276420 + }, + { + "epoch": 1.7660324802269272, + "grad_norm": 1.5759546756744385, + "learning_rate": 3.358349849452974e-06, + "loss": 0.64, + "step": 276430 + }, + { + "epoch": 1.7660963673766659, + "grad_norm": 0.9555615186691284, + "learning_rate": 3.356542167159121e-06, + "loss": 0.801, + "step": 276440 + }, + { + "epoch": 1.7661602545264046, + "grad_norm": 0.8987722396850586, + "learning_rate": 3.354734954605232e-06, + "loss": 0.8198, + "step": 276450 + }, + { + "epoch": 1.7662241416761433, + "grad_norm": 1.1487159729003906, + "learning_rate": 3.352928211809525e-06, + "loss": 0.801, + "step": 276460 + }, + { + "epoch": 1.766288028825882, + "grad_norm": 0.9127746820449829, + "learning_rate": 3.3511219387901803e-06, + "loss": 0.9023, + "step": 276470 + }, + { + "epoch": 1.7663519159756207, + "grad_norm": 0.9431878924369812, + "learning_rate": 3.3493161355654e-06, + "loss": 1.065, + "step": 276480 + }, + { + "epoch": 1.7664158031253594, + "grad_norm": 1.7045027017593384, + "learning_rate": 3.347510802153364e-06, + "loss": 0.9515, + "step": 276490 + }, + { + "epoch": 1.766479690275098, + "grad_norm": 0.4762714207172394, + "learning_rate": 3.3457059385722577e-06, + "loss": 0.9678, + "step": 276500 + }, + { + "epoch": 1.7665435774248368, + "grad_norm": 1.3455365896224976, + "learning_rate": 3.3439015448402444e-06, + "loss": 0.7454, + "step": 276510 + }, + { + "epoch": 1.7666074645745755, + "grad_norm": 0.713361382484436, + "learning_rate": 3.342097620975504e-06, + "loss": 0.7936, + "step": 276520 + }, + { + "epoch": 1.7666713517243142, + "grad_norm": 1.2825889587402344, + "learning_rate": 3.340294166996216e-06, + "loss": 0.9622, + "step": 276530 + }, + { + "epoch": 1.766735238874053, + "grad_norm": 0.8402884006500244, + "learning_rate": 3.3384911829205222e-06, + "loss": 1.0876, + "step": 276540 + }, + { + "epoch": 1.7667991260237916, + "grad_norm": 1.9258328676223755, + "learning_rate": 3.3366886687666022e-06, + 
"loss": 0.7881, + "step": 276550 + }, + { + "epoch": 1.7668630131735301, + "grad_norm": 1.8669530153274536, + "learning_rate": 3.3348866245525855e-06, + "loss": 1.2596, + "step": 276560 + }, + { + "epoch": 1.766926900323269, + "grad_norm": 0.6702399849891663, + "learning_rate": 3.3330850502966416e-06, + "loss": 0.8636, + "step": 276570 + }, + { + "epoch": 1.7669907874730075, + "grad_norm": 1.2956793308258057, + "learning_rate": 3.3312839460168943e-06, + "loss": 0.8951, + "step": 276580 + }, + { + "epoch": 1.7670546746227465, + "grad_norm": 0.9595689177513123, + "learning_rate": 3.329483311731496e-06, + "loss": 0.8165, + "step": 276590 + }, + { + "epoch": 1.767118561772485, + "grad_norm": 0.9628953337669373, + "learning_rate": 3.3276831474585712e-06, + "loss": 0.9032, + "step": 276600 + }, + { + "epoch": 1.7671824489222239, + "grad_norm": 1.8576788902282715, + "learning_rate": 3.3258834532162664e-06, + "loss": 0.6831, + "step": 276610 + }, + { + "epoch": 1.7672463360719624, + "grad_norm": 0.9650187492370605, + "learning_rate": 3.3240842290226835e-06, + "loss": 0.8597, + "step": 276620 + }, + { + "epoch": 1.7673102232217013, + "grad_norm": 0.8248710632324219, + "learning_rate": 3.322285474895959e-06, + "loss": 0.7399, + "step": 276630 + }, + { + "epoch": 1.7673741103714398, + "grad_norm": 1.4074620008468628, + "learning_rate": 3.3204871908541935e-06, + "loss": 0.8139, + "step": 276640 + }, + { + "epoch": 1.7674379975211787, + "grad_norm": 0.8231216669082642, + "learning_rate": 3.318689376915518e-06, + "loss": 0.7705, + "step": 276650 + }, + { + "epoch": 1.7675018846709172, + "grad_norm": 0.8074039220809937, + "learning_rate": 3.3168920330980125e-06, + "loss": 0.8506, + "step": 276660 + }, + { + "epoch": 1.767565771820656, + "grad_norm": 1.0663862228393555, + "learning_rate": 3.315095159419801e-06, + "loss": 0.8019, + "step": 276670 + }, + { + "epoch": 1.7676296589703946, + "grad_norm": 1.2024550437927246, + "learning_rate": 3.3132987558989636e-06, + "loss": 0.9289, + 
"step": 276680 + }, + { + "epoch": 1.7676935461201335, + "grad_norm": 0.8090937733650208, + "learning_rate": 3.3115028225536028e-06, + "loss": 1.0337, + "step": 276690 + }, + { + "epoch": 1.767757433269872, + "grad_norm": 1.0161302089691162, + "learning_rate": 3.3097073594017926e-06, + "loss": 0.8066, + "step": 276700 + }, + { + "epoch": 1.767821320419611, + "grad_norm": 0.6586167216300964, + "learning_rate": 3.307912366461624e-06, + "loss": 0.7344, + "step": 276710 + }, + { + "epoch": 1.7678852075693494, + "grad_norm": 1.5533359050750732, + "learning_rate": 3.3061178437511776e-06, + "loss": 1.1381, + "step": 276720 + }, + { + "epoch": 1.7679490947190883, + "grad_norm": 1.1131467819213867, + "learning_rate": 3.3043237912885107e-06, + "loss": 1.1111, + "step": 276730 + }, + { + "epoch": 1.7680129818688268, + "grad_norm": 1.5943667888641357, + "learning_rate": 3.302530209091714e-06, + "loss": 0.9598, + "step": 276740 + }, + { + "epoch": 1.7680768690185658, + "grad_norm": 0.6993541717529297, + "learning_rate": 3.300737097178824e-06, + "loss": 0.9595, + "step": 276750 + }, + { + "epoch": 1.7681407561683042, + "grad_norm": 0.9403067231178284, + "learning_rate": 3.298944455567926e-06, + "loss": 1.0233, + "step": 276760 + }, + { + "epoch": 1.7682046433180432, + "grad_norm": 1.483699917793274, + "learning_rate": 3.2971522842770442e-06, + "loss": 0.7465, + "step": 276770 + }, + { + "epoch": 1.7682685304677817, + "grad_norm": 0.983133852481842, + "learning_rate": 3.2953605833242583e-06, + "loss": 0.7698, + "step": 276780 + }, + { + "epoch": 1.7683324176175204, + "grad_norm": 1.0668721199035645, + "learning_rate": 3.2935693527275826e-06, + "loss": 0.7182, + "step": 276790 + }, + { + "epoch": 1.768396304767259, + "grad_norm": 0.9246516227722168, + "learning_rate": 3.2917785925050792e-06, + "loss": 0.7324, + "step": 276800 + }, + { + "epoch": 1.7684601919169978, + "grad_norm": 0.9839482307434082, + "learning_rate": 3.289988302674768e-06, + "loss": 0.9245, + "step": 276810 + }, 
+ { + "epoch": 1.7685240790667365, + "grad_norm": 0.8526636362075806, + "learning_rate": 3.28819848325469e-06, + "loss": 0.7506, + "step": 276820 + }, + { + "epoch": 1.7685879662164752, + "grad_norm": 1.9389556646347046, + "learning_rate": 3.286409134262858e-06, + "loss": 1.1102, + "step": 276830 + }, + { + "epoch": 1.7686518533662139, + "grad_norm": 1.0603058338165283, + "learning_rate": 3.284620255717308e-06, + "loss": 0.8008, + "step": 276840 + }, + { + "epoch": 1.7687157405159526, + "grad_norm": 1.1200687885284424, + "learning_rate": 3.2828318476360364e-06, + "loss": 0.8575, + "step": 276850 + }, + { + "epoch": 1.7687796276656913, + "grad_norm": 0.8741885423660278, + "learning_rate": 3.2810439100370736e-06, + "loss": 0.9655, + "step": 276860 + }, + { + "epoch": 1.76884351481543, + "grad_norm": 1.8504701852798462, + "learning_rate": 3.279256442938411e-06, + "loss": 0.8095, + "step": 276870 + }, + { + "epoch": 1.7689074019651687, + "grad_norm": 0.5831618309020996, + "learning_rate": 3.2774694463580615e-06, + "loss": 0.724, + "step": 276880 + }, + { + "epoch": 1.7689712891149074, + "grad_norm": 0.8052890300750732, + "learning_rate": 3.275682920314005e-06, + "loss": 0.9097, + "step": 276890 + }, + { + "epoch": 1.7690351762646461, + "grad_norm": 0.6509976983070374, + "learning_rate": 3.2738968648242496e-06, + "loss": 1.1575, + "step": 276900 + }, + { + "epoch": 1.7690990634143848, + "grad_norm": 0.7471203207969666, + "learning_rate": 3.2721112799067754e-06, + "loss": 0.7795, + "step": 276910 + }, + { + "epoch": 1.7691629505641235, + "grad_norm": 0.8894343376159668, + "learning_rate": 3.2703261655795626e-06, + "loss": 0.7585, + "step": 276920 + }, + { + "epoch": 1.7692268377138622, + "grad_norm": 1.0406216382980347, + "learning_rate": 3.268541521860602e-06, + "loss": 0.8372, + "step": 276930 + }, + { + "epoch": 1.769290724863601, + "grad_norm": 0.7905427813529968, + "learning_rate": 3.266757348767846e-06, + "loss": 0.95, + "step": 276940 + }, + { + "epoch": 
1.7693546120133397, + "grad_norm": 1.1895004510879517, + "learning_rate": 3.2649736463192806e-06, + "loss": 0.983, + "step": 276950 + }, + { + "epoch": 1.7694184991630784, + "grad_norm": 0.6956319808959961, + "learning_rate": 3.2631904145328575e-06, + "loss": 0.7909, + "step": 276960 + }, + { + "epoch": 1.769482386312817, + "grad_norm": 0.8152353167533875, + "learning_rate": 3.2614076534265513e-06, + "loss": 0.8163, + "step": 276970 + }, + { + "epoch": 1.7695462734625558, + "grad_norm": 0.6797952055931091, + "learning_rate": 3.259625363018293e-06, + "loss": 0.8145, + "step": 276980 + }, + { + "epoch": 1.7696101606122945, + "grad_norm": 1.0962252616882324, + "learning_rate": 3.25784354332605e-06, + "loss": 0.9919, + "step": 276990 + }, + { + "epoch": 1.7696740477620332, + "grad_norm": 2.2895405292510986, + "learning_rate": 3.2560621943677537e-06, + "loss": 0.8943, + "step": 277000 + }, + { + "epoch": 1.769737934911772, + "grad_norm": 1.275295615196228, + "learning_rate": 3.254281316161362e-06, + "loss": 0.9772, + "step": 277010 + }, + { + "epoch": 1.7698018220615106, + "grad_norm": 0.9024759531021118, + "learning_rate": 3.2525009087247816e-06, + "loss": 0.9631, + "step": 277020 + }, + { + "epoch": 1.769865709211249, + "grad_norm": 1.0591357946395874, + "learning_rate": 3.2507209720759656e-06, + "loss": 0.9282, + "step": 277030 + }, + { + "epoch": 1.769929596360988, + "grad_norm": 1.0445270538330078, + "learning_rate": 3.2489415062328387e-06, + "loss": 0.9007, + "step": 277040 + }, + { + "epoch": 1.7699934835107265, + "grad_norm": 0.9703136682510376, + "learning_rate": 3.247162511213309e-06, + "loss": 1.0824, + "step": 277050 + }, + { + "epoch": 1.7700573706604654, + "grad_norm": 0.7905303835868835, + "learning_rate": 3.2453839870353055e-06, + "loss": 0.9206, + "step": 277060 + }, + { + "epoch": 1.770121257810204, + "grad_norm": 1.0880717039108276, + "learning_rate": 3.2436059337167313e-06, + "loss": 0.6517, + "step": 277070 + }, + { + "epoch": 1.7701851449599428, + 
"grad_norm": 1.243598461151123, + "learning_rate": 3.241828351275494e-06, + "loss": 0.9865, + "step": 277080 + }, + { + "epoch": 1.7702490321096813, + "grad_norm": 0.9723210334777832, + "learning_rate": 3.240051239729497e-06, + "loss": 1.0514, + "step": 277090 + }, + { + "epoch": 1.7703129192594202, + "grad_norm": 0.8347112536430359, + "learning_rate": 3.2382745990966412e-06, + "loss": 1.0216, + "step": 277100 + }, + { + "epoch": 1.7703768064091587, + "grad_norm": 0.49017333984375, + "learning_rate": 3.236498429394813e-06, + "loss": 0.7933, + "step": 277110 + }, + { + "epoch": 1.7704406935588977, + "grad_norm": 0.6923054456710815, + "learning_rate": 3.234722730641904e-06, + "loss": 0.7284, + "step": 277120 + }, + { + "epoch": 1.7705045807086361, + "grad_norm": 1.0289267301559448, + "learning_rate": 3.2329475028557876e-06, + "loss": 0.7274, + "step": 277130 + }, + { + "epoch": 1.770568467858375, + "grad_norm": 1.0343220233917236, + "learning_rate": 3.2311727460543563e-06, + "loss": 0.8632, + "step": 277140 + }, + { + "epoch": 1.7706323550081136, + "grad_norm": 0.7125779986381531, + "learning_rate": 3.229398460255467e-06, + "loss": 0.7936, + "step": 277150 + }, + { + "epoch": 1.7706962421578525, + "grad_norm": 1.3944861888885498, + "learning_rate": 3.227624645477012e-06, + "loss": 0.8167, + "step": 277160 + }, + { + "epoch": 1.770760129307591, + "grad_norm": 1.0906258821487427, + "learning_rate": 3.2258513017368265e-06, + "loss": 0.9059, + "step": 277170 + }, + { + "epoch": 1.77082401645733, + "grad_norm": 1.4286187887191772, + "learning_rate": 3.2240784290527957e-06, + "loss": 0.6454, + "step": 277180 + }, + { + "epoch": 1.7708879036070684, + "grad_norm": 0.8852258324623108, + "learning_rate": 3.222306027442751e-06, + "loss": 0.6915, + "step": 277190 + }, + { + "epoch": 1.7709517907568073, + "grad_norm": 1.2390975952148438, + "learning_rate": 3.22053409692456e-06, + "loss": 0.8217, + "step": 277200 + }, + { + "epoch": 1.7710156779065458, + "grad_norm": 
1.6062967777252197, + "learning_rate": 3.2187626375160594e-06, + "loss": 1.0739, + "step": 277210 + }, + { + "epoch": 1.7710795650562847, + "grad_norm": 1.1709768772125244, + "learning_rate": 3.2169916492350906e-06, + "loss": 0.9225, + "step": 277220 + }, + { + "epoch": 1.7711434522060232, + "grad_norm": 1.0782562494277954, + "learning_rate": 3.2152211320994884e-06, + "loss": 0.8072, + "step": 277230 + }, + { + "epoch": 1.7712073393557621, + "grad_norm": 0.743114173412323, + "learning_rate": 3.2134510861270783e-06, + "loss": 0.8746, + "step": 277240 + }, + { + "epoch": 1.7712712265055006, + "grad_norm": 0.7684194445610046, + "learning_rate": 3.211858447611188e-06, + "loss": 0.9179, + "step": 277250 + }, + { + "epoch": 1.7713351136552395, + "grad_norm": 1.0073596239089966, + "learning_rate": 3.2100892968979613e-06, + "loss": 0.9356, + "step": 277260 + }, + { + "epoch": 1.771399000804978, + "grad_norm": 0.894205629825592, + "learning_rate": 3.2083206173996238e-06, + "loss": 0.9639, + "step": 277270 + }, + { + "epoch": 1.7714628879547167, + "grad_norm": 0.9179940223693848, + "learning_rate": 3.2065524091339726e-06, + "loss": 0.6965, + "step": 277280 + }, + { + "epoch": 1.7715267751044554, + "grad_norm": 1.3406168222427368, + "learning_rate": 3.2047846721188215e-06, + "loss": 1.0092, + "step": 277290 + }, + { + "epoch": 1.7715906622541941, + "grad_norm": 0.5127822160720825, + "learning_rate": 3.203017406371972e-06, + "loss": 0.7664, + "step": 277300 + }, + { + "epoch": 1.7716545494039329, + "grad_norm": 0.6956382989883423, + "learning_rate": 3.2012506119112273e-06, + "loss": 0.8804, + "step": 277310 + }, + { + "epoch": 1.7717184365536716, + "grad_norm": 1.7899528741836548, + "learning_rate": 3.1994842887543673e-06, + "loss": 0.8313, + "step": 277320 + }, + { + "epoch": 1.7717823237034103, + "grad_norm": 0.7749053239822388, + "learning_rate": 3.197718436919195e-06, + "loss": 1.0221, + "step": 277330 + }, + { + "epoch": 1.771846210853149, + "grad_norm": 
0.6026731729507446, + "learning_rate": 3.1959530564234787e-06, + "loss": 0.9889, + "step": 277340 + }, + { + "epoch": 1.7719100980028877, + "grad_norm": 1.6500853300094604, + "learning_rate": 3.1941881472850212e-06, + "loss": 0.9839, + "step": 277350 + }, + { + "epoch": 1.7719739851526264, + "grad_norm": 1.0572153329849243, + "learning_rate": 3.192423709521569e-06, + "loss": 0.9506, + "step": 277360 + }, + { + "epoch": 1.772037872302365, + "grad_norm": 1.2893636226654053, + "learning_rate": 3.190659743150903e-06, + "loss": 0.7442, + "step": 277370 + }, + { + "epoch": 1.7721017594521038, + "grad_norm": 1.0569548606872559, + "learning_rate": 3.188896248190798e-06, + "loss": 0.7426, + "step": 277380 + }, + { + "epoch": 1.7721656466018425, + "grad_norm": 0.6380995512008667, + "learning_rate": 3.1871332246590003e-06, + "loss": 0.8383, + "step": 277390 + }, + { + "epoch": 1.7722295337515812, + "grad_norm": 1.4150300025939941, + "learning_rate": 3.185370672573279e-06, + "loss": 1.0424, + "step": 277400 + }, + { + "epoch": 1.77229342090132, + "grad_norm": 0.9538283348083496, + "learning_rate": 3.1836085919513648e-06, + "loss": 0.8074, + "step": 277410 + }, + { + "epoch": 1.7723573080510586, + "grad_norm": 2.292234420776367, + "learning_rate": 3.181846982811021e-06, + "loss": 0.9066, + "step": 277420 + }, + { + "epoch": 1.7724211952007973, + "grad_norm": 0.9210676550865173, + "learning_rate": 3.180085845169972e-06, + "loss": 1.0798, + "step": 277430 + }, + { + "epoch": 1.772485082350536, + "grad_norm": 0.7483298182487488, + "learning_rate": 3.1783251790459766e-06, + "loss": 0.8801, + "step": 277440 + }, + { + "epoch": 1.7725489695002747, + "grad_norm": 0.8505380153656006, + "learning_rate": 3.1765649844567424e-06, + "loss": 0.5436, + "step": 277450 + }, + { + "epoch": 1.7726128566500134, + "grad_norm": 0.7014589905738831, + "learning_rate": 3.1748052614200106e-06, + "loss": 0.9626, + "step": 277460 + }, + { + "epoch": 1.7726767437997522, + "grad_norm": 0.8798777461051941, + 
"learning_rate": 3.173046009953495e-06, + "loss": 0.8165, + "step": 277470 + }, + { + "epoch": 1.7727406309494909, + "grad_norm": 0.9326733350753784, + "learning_rate": 3.1712872300749265e-06, + "loss": 0.6907, + "step": 277480 + }, + { + "epoch": 1.7728045180992296, + "grad_norm": 0.8117529153823853, + "learning_rate": 3.169528921802001e-06, + "loss": 0.9274, + "step": 277490 + }, + { + "epoch": 1.7728684052489683, + "grad_norm": 1.0946813821792603, + "learning_rate": 3.167771085152438e-06, + "loss": 1.1256, + "step": 277500 + }, + { + "epoch": 1.772932292398707, + "grad_norm": 0.8175672292709351, + "learning_rate": 3.1660137201439287e-06, + "loss": 0.68, + "step": 277510 + }, + { + "epoch": 1.7729961795484455, + "grad_norm": 0.4712793529033661, + "learning_rate": 3.164256826794193e-06, + "loss": 0.6028, + "step": 277520 + }, + { + "epoch": 1.7730600666981844, + "grad_norm": 0.7874718308448792, + "learning_rate": 3.162500405120894e-06, + "loss": 0.8733, + "step": 277530 + }, + { + "epoch": 1.7731239538479229, + "grad_norm": 1.705673098564148, + "learning_rate": 3.160744455141745e-06, + "loss": 0.9226, + "step": 277540 + }, + { + "epoch": 1.7731878409976618, + "grad_norm": 0.8848811388015747, + "learning_rate": 3.1589889768744162e-06, + "loss": 0.836, + "step": 277550 + }, + { + "epoch": 1.7732517281474003, + "grad_norm": 0.8991907835006714, + "learning_rate": 3.1572339703365874e-06, + "loss": 0.9667, + "step": 277560 + }, + { + "epoch": 1.7733156152971392, + "grad_norm": 1.2283343076705933, + "learning_rate": 3.1554794355459493e-06, + "loss": 0.8699, + "step": 277570 + }, + { + "epoch": 1.7733795024468777, + "grad_norm": 0.8715161085128784, + "learning_rate": 3.15372537252015e-06, + "loss": 0.6607, + "step": 277580 + }, + { + "epoch": 1.7734433895966166, + "grad_norm": 0.7413138747215271, + "learning_rate": 3.151971781276869e-06, + "loss": 0.8635, + "step": 277590 + }, + { + "epoch": 1.773507276746355, + "grad_norm": 0.7562516927719116, + "learning_rate": 
3.1502186618337593e-06, + "loss": 0.8093, + "step": 277600 + }, + { + "epoch": 1.773571163896094, + "grad_norm": 0.8659891486167908, + "learning_rate": 3.1484660142084897e-06, + "loss": 0.8792, + "step": 277610 + }, + { + "epoch": 1.7736350510458325, + "grad_norm": 0.9960166811943054, + "learning_rate": 3.1467138384186857e-06, + "loss": 0.8298, + "step": 277620 + }, + { + "epoch": 1.7736989381955715, + "grad_norm": 0.9417649507522583, + "learning_rate": 3.1449621344820156e-06, + "loss": 1.0236, + "step": 277630 + }, + { + "epoch": 1.77376282534531, + "grad_norm": 1.2807464599609375, + "learning_rate": 3.143210902416105e-06, + "loss": 0.8184, + "step": 277640 + }, + { + "epoch": 1.7738267124950489, + "grad_norm": 0.8112235069274902, + "learning_rate": 3.1414601422386002e-06, + "loss": 0.8804, + "step": 277650 + }, + { + "epoch": 1.7738905996447873, + "grad_norm": 1.1869792938232422, + "learning_rate": 3.1397098539671264e-06, + "loss": 0.5419, + "step": 277660 + }, + { + "epoch": 1.7739544867945263, + "grad_norm": 0.7551632523536682, + "learning_rate": 3.137960037619325e-06, + "loss": 0.7217, + "step": 277670 + }, + { + "epoch": 1.7740183739442648, + "grad_norm": 0.8500282764434814, + "learning_rate": 3.136210693212793e-06, + "loss": 0.6447, + "step": 277680 + }, + { + "epoch": 1.7740822610940037, + "grad_norm": 0.8427389860153198, + "learning_rate": 3.1344618207651666e-06, + "loss": 0.7903, + "step": 277690 + }, + { + "epoch": 1.7741461482437422, + "grad_norm": 0.6282423734664917, + "learning_rate": 3.1327134202940645e-06, + "loss": 0.8771, + "step": 277700 + }, + { + "epoch": 1.774210035393481, + "grad_norm": 1.025516390800476, + "learning_rate": 3.1309654918170726e-06, + "loss": 0.8486, + "step": 277710 + }, + { + "epoch": 1.7742739225432196, + "grad_norm": 0.846153736114502, + "learning_rate": 3.129218035351811e-06, + "loss": 1.0854, + "step": 277720 + }, + { + "epoch": 1.7743378096929585, + "grad_norm": 2.999692916870117, + "learning_rate": 3.12747105091587e-06, 
+ "loss": 0.923, + "step": 277730 + }, + { + "epoch": 1.774401696842697, + "grad_norm": 1.197917103767395, + "learning_rate": 3.125724538526853e-06, + "loss": 1.0771, + "step": 277740 + }, + { + "epoch": 1.774465583992436, + "grad_norm": 0.8440120220184326, + "learning_rate": 3.1239784982023344e-06, + "loss": 0.7957, + "step": 277750 + }, + { + "epoch": 1.7745294711421744, + "grad_norm": 1.0203328132629395, + "learning_rate": 3.122232929959912e-06, + "loss": 1.3361, + "step": 277760 + }, + { + "epoch": 1.7745933582919131, + "grad_norm": 0.8135339617729187, + "learning_rate": 3.120487833817148e-06, + "loss": 0.9691, + "step": 277770 + }, + { + "epoch": 1.7746572454416518, + "grad_norm": 1.0247467756271362, + "learning_rate": 3.1187432097916415e-06, + "loss": 0.8512, + "step": 277780 + }, + { + "epoch": 1.7747211325913905, + "grad_norm": 0.8693737387657166, + "learning_rate": 3.116999057900938e-06, + "loss": 1.0537, + "step": 277790 + }, + { + "epoch": 1.7747850197411292, + "grad_norm": 0.9472058415412903, + "learning_rate": 3.1152553781626237e-06, + "loss": 1.0081, + "step": 277800 + }, + { + "epoch": 1.774848906890868, + "grad_norm": 0.6859394907951355, + "learning_rate": 3.113512170594235e-06, + "loss": 0.8502, + "step": 277810 + }, + { + "epoch": 1.7749127940406066, + "grad_norm": 0.7783265709877014, + "learning_rate": 3.1117694352133576e-06, + "loss": 0.9364, + "step": 277820 + }, + { + "epoch": 1.7749766811903454, + "grad_norm": 0.9184941053390503, + "learning_rate": 3.1100271720375106e-06, + "loss": 0.9731, + "step": 277830 + }, + { + "epoch": 1.775040568340084, + "grad_norm": 0.9210687875747681, + "learning_rate": 3.1082853810842694e-06, + "loss": 0.8218, + "step": 277840 + }, + { + "epoch": 1.7751044554898228, + "grad_norm": 0.6392104029655457, + "learning_rate": 3.1065440623711527e-06, + "loss": 0.8711, + "step": 277850 + }, + { + "epoch": 1.7751683426395615, + "grad_norm": 1.07809579372406, + "learning_rate": 3.1048032159157136e-06, + "loss": 0.7592, + 
"step": 277860 + }, + { + "epoch": 1.7752322297893002, + "grad_norm": 0.7940744757652283, + "learning_rate": 3.1030628417354658e-06, + "loss": 0.9796, + "step": 277870 + }, + { + "epoch": 1.7752961169390389, + "grad_norm": 0.9738584160804749, + "learning_rate": 3.101322939847945e-06, + "loss": 0.8472, + "step": 277880 + }, + { + "epoch": 1.7753600040887776, + "grad_norm": 0.6570689082145691, + "learning_rate": 3.099583510270687e-06, + "loss": 0.8304, + "step": 277890 + }, + { + "epoch": 1.7754238912385163, + "grad_norm": 0.966304361820221, + "learning_rate": 3.0978445530211898e-06, + "loss": 0.7785, + "step": 277900 + }, + { + "epoch": 1.775487778388255, + "grad_norm": 0.8394570350646973, + "learning_rate": 3.0961060681169827e-06, + "loss": 0.9422, + "step": 277910 + }, + { + "epoch": 1.7755516655379937, + "grad_norm": 0.7498250007629395, + "learning_rate": 3.0943680555755583e-06, + "loss": 0.7033, + "step": 277920 + }, + { + "epoch": 1.7756155526877324, + "grad_norm": 1.4077019691467285, + "learning_rate": 3.0926305154144296e-06, + "loss": 1.0761, + "step": 277930 + }, + { + "epoch": 1.7756794398374711, + "grad_norm": 2.1638362407684326, + "learning_rate": 3.0908934476510888e-06, + "loss": 0.9859, + "step": 277940 + }, + { + "epoch": 1.7757433269872098, + "grad_norm": 0.9528393149375916, + "learning_rate": 3.0891568523030377e-06, + "loss": 0.8106, + "step": 277950 + }, + { + "epoch": 1.7758072141369485, + "grad_norm": 1.0252223014831543, + "learning_rate": 3.087420729387752e-06, + "loss": 1.1796, + "step": 277960 + }, + { + "epoch": 1.7758711012866872, + "grad_norm": 1.5917168855667114, + "learning_rate": 3.0856850789227397e-06, + "loss": 0.7109, + "step": 277970 + }, + { + "epoch": 1.775934988436426, + "grad_norm": 1.0380032062530518, + "learning_rate": 3.0839499009254534e-06, + "loss": 0.9216, + "step": 277980 + }, + { + "epoch": 1.7759988755861646, + "grad_norm": 1.0510272979736328, + "learning_rate": 3.0822151954133848e-06, + "loss": 0.8541, + "step": 277990 + 
}, + { + "epoch": 1.7760627627359034, + "grad_norm": 1.1530609130859375, + "learning_rate": 3.080480962403992e-06, + "loss": 0.7698, + "step": 278000 + }, + { + "epoch": 1.7761266498856418, + "grad_norm": 0.9889609813690186, + "learning_rate": 3.0787472019147556e-06, + "loss": 0.876, + "step": 278010 + }, + { + "epoch": 1.7761905370353808, + "grad_norm": 0.921779453754425, + "learning_rate": 3.0770139139631225e-06, + "loss": 0.6693, + "step": 278020 + }, + { + "epoch": 1.7762544241851193, + "grad_norm": 0.8533374667167664, + "learning_rate": 3.075281098566557e-06, + "loss": 0.8092, + "step": 278030 + }, + { + "epoch": 1.7763183113348582, + "grad_norm": 1.26589035987854, + "learning_rate": 3.0735487557425004e-06, + "loss": 1.2583, + "step": 278040 + }, + { + "epoch": 1.7763821984845967, + "grad_norm": 1.0811630487442017, + "learning_rate": 3.071816885508416e-06, + "loss": 0.6931, + "step": 278050 + }, + { + "epoch": 1.7764460856343356, + "grad_norm": 0.6349918246269226, + "learning_rate": 3.070085487881724e-06, + "loss": 1.1791, + "step": 278060 + }, + { + "epoch": 1.776509972784074, + "grad_norm": 0.718450665473938, + "learning_rate": 3.068354562879877e-06, + "loss": 0.8377, + "step": 278070 + }, + { + "epoch": 1.776573859933813, + "grad_norm": 0.8059900999069214, + "learning_rate": 3.0666241105202942e-06, + "loss": 0.8405, + "step": 278080 + }, + { + "epoch": 1.7766377470835515, + "grad_norm": 1.0362147092819214, + "learning_rate": 3.0648941308204172e-06, + "loss": 0.8976, + "step": 278090 + }, + { + "epoch": 1.7767016342332904, + "grad_norm": 0.9528014063835144, + "learning_rate": 3.063164623797665e-06, + "loss": 0.6518, + "step": 278100 + }, + { + "epoch": 1.776765521383029, + "grad_norm": 2.4749984741210938, + "learning_rate": 3.061435589469441e-06, + "loss": 0.9381, + "step": 278110 + }, + { + "epoch": 1.7768294085327678, + "grad_norm": 0.8902806043624878, + "learning_rate": 3.059707027853176e-06, + "loss": 0.8247, + "step": 278120 + }, + { + "epoch": 
1.7768932956825063, + "grad_norm": 1.6432876586914062, + "learning_rate": 3.0579789389662607e-06, + "loss": 0.6879, + "step": 278130 + }, + { + "epoch": 1.7769571828322452, + "grad_norm": 0.9981576800346375, + "learning_rate": 3.0562513228261147e-06, + "loss": 0.9707, + "step": 278140 + }, + { + "epoch": 1.7770210699819837, + "grad_norm": 0.7556832432746887, + "learning_rate": 3.0545241794501246e-06, + "loss": 0.7934, + "step": 278150 + }, + { + "epoch": 1.7770849571317227, + "grad_norm": 0.7431328892707825, + "learning_rate": 3.052797508855698e-06, + "loss": 0.6255, + "step": 278160 + }, + { + "epoch": 1.7771488442814611, + "grad_norm": 1.2007827758789062, + "learning_rate": 3.0510713110602106e-06, + "loss": 0.7706, + "step": 278170 + }, + { + "epoch": 1.7772127314312, + "grad_norm": 0.9197219014167786, + "learning_rate": 3.049345586081054e-06, + "loss": 0.7547, + "step": 278180 + }, + { + "epoch": 1.7772766185809385, + "grad_norm": 0.6609572172164917, + "learning_rate": 3.0476203339356023e-06, + "loss": 0.6398, + "step": 278190 + }, + { + "epoch": 1.7773405057306775, + "grad_norm": 0.6442859768867493, + "learning_rate": 3.045895554641237e-06, + "loss": 0.7506, + "step": 278200 + }, + { + "epoch": 1.777404392880416, + "grad_norm": 1.3745826482772827, + "learning_rate": 3.044171248215316e-06, + "loss": 0.9476, + "step": 278210 + }, + { + "epoch": 1.777468280030155, + "grad_norm": 1.1745824813842773, + "learning_rate": 3.04244741467522e-06, + "loss": 0.9133, + "step": 278220 + }, + { + "epoch": 1.7775321671798934, + "grad_norm": 1.0461777448654175, + "learning_rate": 3.040724054038302e-06, + "loss": 0.9329, + "step": 278230 + }, + { + "epoch": 1.7775960543296323, + "grad_norm": 1.0739800930023193, + "learning_rate": 3.0390011663219198e-06, + "loss": 0.7345, + "step": 278240 + }, + { + "epoch": 1.7776599414793708, + "grad_norm": 0.7248935103416443, + "learning_rate": 3.037278751543421e-06, + "loss": 0.8388, + "step": 278250 + }, + { + "epoch": 1.7777238286291095, + 
"grad_norm": 1.6017006635665894, + "learning_rate": 3.0355568097201526e-06, + "loss": 0.8607, + "step": 278260 + }, + { + "epoch": 1.7777877157788482, + "grad_norm": 0.7197296023368835, + "learning_rate": 3.033835340869462e-06, + "loss": 0.7747, + "step": 278270 + }, + { + "epoch": 1.777851602928587, + "grad_norm": 0.9725298285484314, + "learning_rate": 3.032114345008674e-06, + "loss": 1.0129, + "step": 278280 + }, + { + "epoch": 1.7779154900783256, + "grad_norm": 0.7443470358848572, + "learning_rate": 3.0303938221551363e-06, + "loss": 0.7772, + "step": 278290 + }, + { + "epoch": 1.7779793772280643, + "grad_norm": 1.0656518936157227, + "learning_rate": 3.0286737723261573e-06, + "loss": 1.0035, + "step": 278300 + }, + { + "epoch": 1.778043264377803, + "grad_norm": 0.737565815448761, + "learning_rate": 3.026954195539078e-06, + "loss": 0.8194, + "step": 278310 + }, + { + "epoch": 1.7781071515275417, + "grad_norm": 1.947318196296692, + "learning_rate": 3.0252350918112015e-06, + "loss": 1.1093, + "step": 278320 + }, + { + "epoch": 1.7781710386772804, + "grad_norm": 0.6763631105422974, + "learning_rate": 3.023516461159853e-06, + "loss": 0.7545, + "step": 278330 + }, + { + "epoch": 1.7782349258270191, + "grad_norm": 0.7558571100234985, + "learning_rate": 3.0217983036023246e-06, + "loss": 0.8406, + "step": 278340 + }, + { + "epoch": 1.7782988129767578, + "grad_norm": 0.950883150100708, + "learning_rate": 3.0200806191559407e-06, + "loss": 0.8893, + "step": 278350 + }, + { + "epoch": 1.7783627001264966, + "grad_norm": 0.9428911805152893, + "learning_rate": 3.018363407837976e-06, + "loss": 0.869, + "step": 278360 + }, + { + "epoch": 1.7784265872762353, + "grad_norm": 0.6479962468147278, + "learning_rate": 3.016646669665751e-06, + "loss": 0.8571, + "step": 278370 + }, + { + "epoch": 1.778490474425974, + "grad_norm": 1.2233079671859741, + "learning_rate": 3.0149304046565296e-06, + "loss": 0.902, + "step": 278380 + }, + { + "epoch": 1.7785543615757127, + "grad_norm": 
0.8563507199287415, + "learning_rate": 3.013214612827614e-06, + "loss": 0.7688, + "step": 278390 + }, + { + "epoch": 1.7786182487254514, + "grad_norm": 0.8968834280967712, + "learning_rate": 3.011499294196268e-06, + "loss": 0.8853, + "step": 278400 + }, + { + "epoch": 1.77868213587519, + "grad_norm": 0.4790927469730377, + "learning_rate": 3.0097844487797733e-06, + "loss": 0.7484, + "step": 278410 + }, + { + "epoch": 1.7787460230249288, + "grad_norm": 0.8275429010391235, + "learning_rate": 3.008070076595415e-06, + "loss": 1.1507, + "step": 278420 + }, + { + "epoch": 1.7788099101746675, + "grad_norm": 1.8453471660614014, + "learning_rate": 3.0063561776604298e-06, + "loss": 0.852, + "step": 278430 + }, + { + "epoch": 1.7788737973244062, + "grad_norm": 0.7870879173278809, + "learning_rate": 3.004642751992104e-06, + "loss": 0.8445, + "step": 278440 + }, + { + "epoch": 1.778937684474145, + "grad_norm": 0.8181558847427368, + "learning_rate": 3.0029297996076787e-06, + "loss": 0.993, + "step": 278450 + }, + { + "epoch": 1.7790015716238836, + "grad_norm": 1.7123284339904785, + "learning_rate": 3.0012173205244132e-06, + "loss": 0.7388, + "step": 278460 + }, + { + "epoch": 1.7790654587736223, + "grad_norm": 0.8432309031486511, + "learning_rate": 2.999505314759543e-06, + "loss": 1.0784, + "step": 278470 + }, + { + "epoch": 1.779129345923361, + "grad_norm": 1.3005728721618652, + "learning_rate": 2.9977937823303217e-06, + "loss": 1.1719, + "step": 278480 + }, + { + "epoch": 1.7791932330730997, + "grad_norm": 1.1980042457580566, + "learning_rate": 2.9960827232539677e-06, + "loss": 0.845, + "step": 278490 + }, + { + "epoch": 1.7792571202228382, + "grad_norm": 1.6383336782455444, + "learning_rate": 2.9943721375477408e-06, + "loss": 0.9195, + "step": 278500 + }, + { + "epoch": 1.7793210073725771, + "grad_norm": 0.8793671131134033, + "learning_rate": 2.9926620252288373e-06, + "loss": 0.7686, + "step": 278510 + }, + { + "epoch": 1.7793848945223156, + "grad_norm": 0.8216747045516968, + 
"learning_rate": 2.990952386314505e-06, + "loss": 0.6514, + "step": 278520 + }, + { + "epoch": 1.7794487816720546, + "grad_norm": 0.842990517616272, + "learning_rate": 2.989243220821941e-06, + "loss": 1.0126, + "step": 278530 + }, + { + "epoch": 1.779512668821793, + "grad_norm": 1.0940358638763428, + "learning_rate": 2.987534528768382e-06, + "loss": 0.7461, + "step": 278540 + }, + { + "epoch": 1.779576555971532, + "grad_norm": 1.65617036819458, + "learning_rate": 2.985826310171008e-06, + "loss": 1.0087, + "step": 278550 + }, + { + "epoch": 1.7796404431212705, + "grad_norm": 0.6357586979866028, + "learning_rate": 2.984118565047045e-06, + "loss": 0.7344, + "step": 278560 + }, + { + "epoch": 1.7797043302710094, + "grad_norm": 0.8728939890861511, + "learning_rate": 2.9824112934136784e-06, + "loss": 0.8424, + "step": 278570 + }, + { + "epoch": 1.7797682174207479, + "grad_norm": 1.3085471391677856, + "learning_rate": 2.9807044952881115e-06, + "loss": 0.8914, + "step": 278580 + }, + { + "epoch": 1.7798321045704868, + "grad_norm": 1.9346404075622559, + "learning_rate": 2.978998170687525e-06, + "loss": 0.6496, + "step": 278590 + }, + { + "epoch": 1.7798959917202253, + "grad_norm": 1.123392939567566, + "learning_rate": 2.977292319629099e-06, + "loss": 1.0704, + "step": 278600 + }, + { + "epoch": 1.7799598788699642, + "grad_norm": 1.1311415433883667, + "learning_rate": 2.975586942130032e-06, + "loss": 0.8, + "step": 278610 + }, + { + "epoch": 1.7800237660197027, + "grad_norm": 0.8052006959915161, + "learning_rate": 2.9738820382074704e-06, + "loss": 0.8831, + "step": 278620 + }, + { + "epoch": 1.7800876531694416, + "grad_norm": 1.8844988346099854, + "learning_rate": 2.972177607878618e-06, + "loss": 1.1519, + "step": 278630 + }, + { + "epoch": 1.78015154031918, + "grad_norm": 1.1015048027038574, + "learning_rate": 2.9704736511606047e-06, + "loss": 0.7323, + "step": 278640 + }, + { + "epoch": 1.780215427468919, + "grad_norm": 3.153304100036621, + "learning_rate": 
2.9687701680706227e-06, + "loss": 1.3893, + "step": 278650 + }, + { + "epoch": 1.7802793146186575, + "grad_norm": 1.0777071714401245, + "learning_rate": 2.9670671586258024e-06, + "loss": 0.8355, + "step": 278660 + }, + { + "epoch": 1.7803432017683964, + "grad_norm": 0.8020691275596619, + "learning_rate": 2.9653646228433143e-06, + "loss": 0.8968, + "step": 278670 + }, + { + "epoch": 1.780407088918135, + "grad_norm": 0.9972878098487854, + "learning_rate": 2.9636625607402825e-06, + "loss": 0.9588, + "step": 278680 + }, + { + "epoch": 1.7804709760678739, + "grad_norm": 0.5217418670654297, + "learning_rate": 2.961960972333877e-06, + "loss": 0.8306, + "step": 278690 + }, + { + "epoch": 1.7805348632176123, + "grad_norm": 0.9895110130310059, + "learning_rate": 2.960259857641201e-06, + "loss": 0.8076, + "step": 278700 + }, + { + "epoch": 1.7805987503673513, + "grad_norm": 0.6926957964897156, + "learning_rate": 2.9585592166794185e-06, + "loss": 0.7323, + "step": 278710 + }, + { + "epoch": 1.7806626375170898, + "grad_norm": 4.4439167976379395, + "learning_rate": 2.9568590494656322e-06, + "loss": 0.8698, + "step": 278720 + }, + { + "epoch": 1.7807265246668285, + "grad_norm": 1.4048861265182495, + "learning_rate": 2.9551593560169733e-06, + "loss": 0.6555, + "step": 278730 + }, + { + "epoch": 1.7807904118165672, + "grad_norm": 0.5778505802154541, + "learning_rate": 2.9534601363505666e-06, + "loss": 0.6811, + "step": 278740 + }, + { + "epoch": 1.7808542989663059, + "grad_norm": 1.2079328298568726, + "learning_rate": 2.9517613904835095e-06, + "loss": 1.0647, + "step": 278750 + }, + { + "epoch": 1.7809181861160446, + "grad_norm": 1.3561972379684448, + "learning_rate": 2.950063118432922e-06, + "loss": 0.9528, + "step": 278760 + }, + { + "epoch": 1.7809820732657833, + "grad_norm": 1.4584667682647705, + "learning_rate": 2.948365320215901e-06, + "loss": 1.1817, + "step": 278770 + }, + { + "epoch": 1.781045960415522, + "grad_norm": 0.6736270785331726, + "learning_rate": 
2.94666799584955e-06, + "loss": 0.9713, + "step": 278780 + }, + { + "epoch": 1.7811098475652607, + "grad_norm": 1.5708751678466797, + "learning_rate": 2.94497114535095e-06, + "loss": 0.7995, + "step": 278790 + }, + { + "epoch": 1.7811737347149994, + "grad_norm": 0.9714763760566711, + "learning_rate": 2.943274768737214e-06, + "loss": 1.1868, + "step": 278800 + }, + { + "epoch": 1.781237621864738, + "grad_norm": 1.4231477975845337, + "learning_rate": 2.9415788660253963e-06, + "loss": 0.9282, + "step": 278810 + }, + { + "epoch": 1.7813015090144768, + "grad_norm": 0.867780864238739, + "learning_rate": 2.939883437232599e-06, + "loss": 0.8377, + "step": 278820 + }, + { + "epoch": 1.7813653961642155, + "grad_norm": 0.6904677152633667, + "learning_rate": 2.9381884823758866e-06, + "loss": 0.8879, + "step": 278830 + }, + { + "epoch": 1.7814292833139542, + "grad_norm": 0.8885203003883362, + "learning_rate": 2.9364940014723343e-06, + "loss": 0.7462, + "step": 278840 + }, + { + "epoch": 1.781493170463693, + "grad_norm": 1.1848344802856445, + "learning_rate": 2.9347999945389946e-06, + "loss": 0.7568, + "step": 278850 + }, + { + "epoch": 1.7815570576134316, + "grad_norm": 1.0793530941009521, + "learning_rate": 2.9331064615929438e-06, + "loss": 0.7066, + "step": 278860 + }, + { + "epoch": 1.7816209447631703, + "grad_norm": 1.1547480821609497, + "learning_rate": 2.931413402651223e-06, + "loss": 0.7813, + "step": 278870 + }, + { + "epoch": 1.781684831912909, + "grad_norm": 1.114449143409729, + "learning_rate": 2.9297208177308964e-06, + "loss": 0.7996, + "step": 278880 + }, + { + "epoch": 1.7817487190626478, + "grad_norm": 1.191058874130249, + "learning_rate": 2.9280287068489954e-06, + "loss": 0.8922, + "step": 278890 + }, + { + "epoch": 1.7818126062123865, + "grad_norm": 0.906446099281311, + "learning_rate": 2.926337070022578e-06, + "loss": 0.8086, + "step": 278900 + }, + { + "epoch": 1.7818764933621252, + "grad_norm": 1.750216007232666, + "learning_rate": 2.924645907268658e-06, + 
"loss": 0.8135, + "step": 278910 + }, + { + "epoch": 1.7819403805118639, + "grad_norm": 2.0710866451263428, + "learning_rate": 2.9229552186042896e-06, + "loss": 0.9289, + "step": 278920 + }, + { + "epoch": 1.7820042676616026, + "grad_norm": 1.7744084596633911, + "learning_rate": 2.921265004046486e-06, + "loss": 0.867, + "step": 278930 + }, + { + "epoch": 1.7820681548113413, + "grad_norm": 1.1938283443450928, + "learning_rate": 2.9195752636122677e-06, + "loss": 1.0544, + "step": 278940 + }, + { + "epoch": 1.78213204196108, + "grad_norm": 0.7993178367614746, + "learning_rate": 2.9178859973186703e-06, + "loss": 0.8555, + "step": 278950 + }, + { + "epoch": 1.7821959291108187, + "grad_norm": 0.5344235301017761, + "learning_rate": 2.91619720518268e-06, + "loss": 1.0545, + "step": 278960 + }, + { + "epoch": 1.7822598162605574, + "grad_norm": 0.8354693651199341, + "learning_rate": 2.9145088872213233e-06, + "loss": 0.7915, + "step": 278970 + }, + { + "epoch": 1.7823237034102961, + "grad_norm": 0.97170490026474, + "learning_rate": 2.912821043451591e-06, + "loss": 0.7246, + "step": 278980 + }, + { + "epoch": 1.7823875905600346, + "grad_norm": 0.8123016953468323, + "learning_rate": 2.911133673890498e-06, + "loss": 0.8792, + "step": 278990 + }, + { + "epoch": 1.7824514777097735, + "grad_norm": 1.0553772449493408, + "learning_rate": 2.9094467785550193e-06, + "loss": 0.8939, + "step": 279000 + }, + { + "epoch": 1.782515364859512, + "grad_norm": 0.7460412383079529, + "learning_rate": 2.9077603574621526e-06, + "loss": 0.8926, + "step": 279010 + }, + { + "epoch": 1.782579252009251, + "grad_norm": 0.870805025100708, + "learning_rate": 2.906074410628873e-06, + "loss": 0.8963, + "step": 279020 + }, + { + "epoch": 1.7826431391589894, + "grad_norm": 1.2756316661834717, + "learning_rate": 2.9043889380721778e-06, + "loss": 0.9378, + "step": 279030 + }, + { + "epoch": 1.7827070263087283, + "grad_norm": 1.367229700088501, + "learning_rate": 2.9027039398090204e-06, + "loss": 0.8562, + "step": 
279040 + }, + { + "epoch": 1.7827709134584668, + "grad_norm": 1.108358383178711, + "learning_rate": 2.9010194158563876e-06, + "loss": 0.7689, + "step": 279050 + }, + { + "epoch": 1.7828348006082058, + "grad_norm": 1.223518967628479, + "learning_rate": 2.8993353662312316e-06, + "loss": 0.9621, + "step": 279060 + }, + { + "epoch": 1.7828986877579442, + "grad_norm": 1.6448414325714111, + "learning_rate": 2.897651790950512e-06, + "loss": 1.0777, + "step": 279070 + }, + { + "epoch": 1.7829625749076832, + "grad_norm": 1.6710528135299683, + "learning_rate": 2.895968690031198e-06, + "loss": 0.781, + "step": 279080 + }, + { + "epoch": 1.7830264620574217, + "grad_norm": 1.0129505395889282, + "learning_rate": 2.8942860634902202e-06, + "loss": 0.8742, + "step": 279090 + }, + { + "epoch": 1.7830903492071606, + "grad_norm": 1.2868965864181519, + "learning_rate": 2.892603911344538e-06, + "loss": 0.8708, + "step": 279100 + }, + { + "epoch": 1.783154236356899, + "grad_norm": 0.8371222019195557, + "learning_rate": 2.890922233611082e-06, + "loss": 0.804, + "step": 279110 + }, + { + "epoch": 1.783218123506638, + "grad_norm": 1.1141200065612793, + "learning_rate": 2.8892410303068053e-06, + "loss": 0.8911, + "step": 279120 + }, + { + "epoch": 1.7832820106563765, + "grad_norm": 1.4201537370681763, + "learning_rate": 2.887560301448622e-06, + "loss": 0.6672, + "step": 279130 + }, + { + "epoch": 1.7833458978061154, + "grad_norm": 0.7168004512786865, + "learning_rate": 2.8858800470534684e-06, + "loss": 0.8023, + "step": 279140 + }, + { + "epoch": 1.783409784955854, + "grad_norm": 1.1202197074890137, + "learning_rate": 2.884200267138254e-06, + "loss": 0.8567, + "step": 279150 + }, + { + "epoch": 1.7834736721055928, + "grad_norm": 0.7139795422554016, + "learning_rate": 2.882520961719909e-06, + "loss": 0.9578, + "step": 279160 + }, + { + "epoch": 1.7835375592553313, + "grad_norm": 0.8012449145317078, + "learning_rate": 2.8808421308153367e-06, + "loss": 0.7102, + "step": 279170 + }, + { + 
"epoch": 1.7836014464050702, + "grad_norm": 1.4097734689712524, + "learning_rate": 2.8791637744414566e-06, + "loss": 1.0442, + "step": 279180 + }, + { + "epoch": 1.7836653335548087, + "grad_norm": 0.6851403117179871, + "learning_rate": 2.877485892615156e-06, + "loss": 0.8663, + "step": 279190 + }, + { + "epoch": 1.7837292207045476, + "grad_norm": 1.378808617591858, + "learning_rate": 2.875808485353343e-06, + "loss": 1.0306, + "step": 279200 + }, + { + "epoch": 1.7837931078542861, + "grad_norm": 1.5122627019882202, + "learning_rate": 2.8741315526728985e-06, + "loss": 0.8163, + "step": 279210 + }, + { + "epoch": 1.7838569950040248, + "grad_norm": 1.231703519821167, + "learning_rate": 2.8724550945907313e-06, + "loss": 0.8521, + "step": 279220 + }, + { + "epoch": 1.7839208821537635, + "grad_norm": 0.7119479179382324, + "learning_rate": 2.8707791111237005e-06, + "loss": 0.9475, + "step": 279230 + }, + { + "epoch": 1.7839847693035022, + "grad_norm": 1.721937656402588, + "learning_rate": 2.8691036022887086e-06, + "loss": 0.9926, + "step": 279240 + }, + { + "epoch": 1.784048656453241, + "grad_norm": 1.1561145782470703, + "learning_rate": 2.867428568102604e-06, + "loss": 0.8603, + "step": 279250 + }, + { + "epoch": 1.7841125436029797, + "grad_norm": 0.8052932620048523, + "learning_rate": 2.8657540085822722e-06, + "loss": 0.9676, + "step": 279260 + }, + { + "epoch": 1.7841764307527184, + "grad_norm": 0.6737070083618164, + "learning_rate": 2.864079923744584e-06, + "loss": 0.7396, + "step": 279270 + }, + { + "epoch": 1.784240317902457, + "grad_norm": 0.632921040058136, + "learning_rate": 2.8624063136063805e-06, + "loss": 0.7189, + "step": 279280 + }, + { + "epoch": 1.7843042050521958, + "grad_norm": 0.8166862726211548, + "learning_rate": 2.8607331781845328e-06, + "loss": 0.7994, + "step": 279290 + }, + { + "epoch": 1.7843680922019345, + "grad_norm": 3.0488243103027344, + "learning_rate": 2.8590605174958706e-06, + "loss": 0.7269, + "step": 279300 + }, + { + "epoch": 
1.7844319793516732, + "grad_norm": 1.1048704385757446, + "learning_rate": 2.8573883315572647e-06, + "loss": 1.0499, + "step": 279310 + }, + { + "epoch": 1.784495866501412, + "grad_norm": 1.0506843328475952, + "learning_rate": 2.8557166203855344e-06, + "loss": 1.1301, + "step": 279320 + }, + { + "epoch": 1.7845597536511506, + "grad_norm": 1.0728496313095093, + "learning_rate": 2.8540453839975334e-06, + "loss": 0.8735, + "step": 279330 + }, + { + "epoch": 1.7846236408008893, + "grad_norm": 0.7642419338226318, + "learning_rate": 2.852541677202314e-06, + "loss": 1.0205, + "step": 279340 + }, + { + "epoch": 1.784687527950628, + "grad_norm": 0.9445726275444031, + "learning_rate": 2.85087134294974e-06, + "loss": 0.6283, + "step": 279350 + }, + { + "epoch": 1.7847514151003667, + "grad_norm": 1.404316782951355, + "learning_rate": 2.8492014835296787e-06, + "loss": 0.9088, + "step": 279360 + }, + { + "epoch": 1.7848153022501054, + "grad_norm": 0.8889473080635071, + "learning_rate": 2.847532098958955e-06, + "loss": 0.6324, + "step": 279370 + }, + { + "epoch": 1.7848791893998441, + "grad_norm": 1.1736335754394531, + "learning_rate": 2.8458631892543665e-06, + "loss": 0.7437, + "step": 279380 + }, + { + "epoch": 1.7849430765495828, + "grad_norm": 1.3474094867706299, + "learning_rate": 2.8441947544327276e-06, + "loss": 0.8552, + "step": 279390 + }, + { + "epoch": 1.7850069636993215, + "grad_norm": 0.7636286616325378, + "learning_rate": 2.842526794510858e-06, + "loss": 0.7877, + "step": 279400 + }, + { + "epoch": 1.7850708508490603, + "grad_norm": 1.1588830947875977, + "learning_rate": 2.840859309505528e-06, + "loss": 0.8884, + "step": 279410 + }, + { + "epoch": 1.785134737998799, + "grad_norm": 0.8181111812591553, + "learning_rate": 2.8391922994335517e-06, + "loss": 0.9266, + "step": 279420 + }, + { + "epoch": 1.7851986251485377, + "grad_norm": 2.1087288856506348, + "learning_rate": 2.8375257643116995e-06, + "loss": 0.7319, + "step": 279430 + }, + { + "epoch": 1.7852625122982764, 
+ "grad_norm": 0.7751528024673462, + "learning_rate": 2.8358597041567737e-06, + "loss": 0.5595, + "step": 279440 + }, + { + "epoch": 1.785326399448015, + "grad_norm": 0.7834802269935608, + "learning_rate": 2.834194118985534e-06, + "loss": 0.654, + "step": 279450 + }, + { + "epoch": 1.7853902865977536, + "grad_norm": 1.1989582777023315, + "learning_rate": 2.8325290088147718e-06, + "loss": 1.0158, + "step": 279460 + }, + { + "epoch": 1.7854541737474925, + "grad_norm": 0.7740297913551331, + "learning_rate": 2.830864373661246e-06, + "loss": 1.0383, + "step": 279470 + }, + { + "epoch": 1.785518060897231, + "grad_norm": 0.8387667536735535, + "learning_rate": 2.829200213541722e-06, + "loss": 0.9114, + "step": 279480 + }, + { + "epoch": 1.78558194804697, + "grad_norm": 0.8275271058082581, + "learning_rate": 2.8275365284729573e-06, + "loss": 0.9103, + "step": 279490 + }, + { + "epoch": 1.7856458351967084, + "grad_norm": 0.7638615965843201, + "learning_rate": 2.8258733184717168e-06, + "loss": 1.0611, + "step": 279500 + }, + { + "epoch": 1.7857097223464473, + "grad_norm": 0.6845916509628296, + "learning_rate": 2.824210583554737e-06, + "loss": 0.6659, + "step": 279510 + }, + { + "epoch": 1.7857736094961858, + "grad_norm": 0.6819515824317932, + "learning_rate": 2.822548323738777e-06, + "loss": 0.8027, + "step": 279520 + }, + { + "epoch": 1.7858374966459247, + "grad_norm": 0.7219198942184448, + "learning_rate": 2.8208865390405627e-06, + "loss": 0.7542, + "step": 279530 + }, + { + "epoch": 1.7859013837956632, + "grad_norm": 1.013564944267273, + "learning_rate": 2.8192252294768406e-06, + "loss": 1.0698, + "step": 279540 + }, + { + "epoch": 1.7859652709454021, + "grad_norm": 1.280639886856079, + "learning_rate": 2.8175643950643314e-06, + "loss": 0.7745, + "step": 279550 + }, + { + "epoch": 1.7860291580951406, + "grad_norm": 3.118612289428711, + "learning_rate": 2.8159040358197775e-06, + "loss": 0.8694, + "step": 279560 + }, + { + "epoch": 1.7860930452448796, + "grad_norm": 
1.6748183965682983, + "learning_rate": 2.8142441517598816e-06, + "loss": 0.9532, + "step": 279570 + }, + { + "epoch": 1.786156932394618, + "grad_norm": 1.1805542707443237, + "learning_rate": 2.812584742901364e-06, + "loss": 0.7383, + "step": 279580 + }, + { + "epoch": 1.786220819544357, + "grad_norm": 1.7524734735488892, + "learning_rate": 2.810925809260956e-06, + "loss": 1.2987, + "step": 279590 + }, + { + "epoch": 1.7862847066940954, + "grad_norm": 0.9927526116371155, + "learning_rate": 2.809267350855338e-06, + "loss": 1.0647, + "step": 279600 + }, + { + "epoch": 1.7863485938438344, + "grad_norm": 0.8447758555412292, + "learning_rate": 2.8076093677012304e-06, + "loss": 0.9666, + "step": 279610 + }, + { + "epoch": 1.7864124809935729, + "grad_norm": 0.811472475528717, + "learning_rate": 2.8059518598153144e-06, + "loss": 0.8278, + "step": 279620 + }, + { + "epoch": 1.7864763681433118, + "grad_norm": 0.9613576531410217, + "learning_rate": 2.8042948272143044e-06, + "loss": 0.8291, + "step": 279630 + }, + { + "epoch": 1.7865402552930503, + "grad_norm": 1.0779619216918945, + "learning_rate": 2.8026382699148645e-06, + "loss": 0.8296, + "step": 279640 + }, + { + "epoch": 1.7866041424427892, + "grad_norm": 1.7146506309509277, + "learning_rate": 2.800982187933698e-06, + "loss": 0.9163, + "step": 279650 + }, + { + "epoch": 1.7866680295925277, + "grad_norm": 1.0744938850402832, + "learning_rate": 2.7993265812874646e-06, + "loss": 0.7879, + "step": 279660 + }, + { + "epoch": 1.7867319167422666, + "grad_norm": 1.2424300909042358, + "learning_rate": 2.7976714499928556e-06, + "loss": 0.8617, + "step": 279670 + }, + { + "epoch": 1.786795803892005, + "grad_norm": 0.732014536857605, + "learning_rate": 2.796016794066525e-06, + "loss": 0.9629, + "step": 279680 + }, + { + "epoch": 1.786859691041744, + "grad_norm": 1.452577829360962, + "learning_rate": 2.794362613525148e-06, + "loss": 1.0294, + "step": 279690 + }, + { + "epoch": 1.7869235781914825, + "grad_norm": 0.8016159534454346, + 
"learning_rate": 2.7927089083853776e-06, + "loss": 0.7991, + "step": 279700 + }, + { + "epoch": 1.7869874653412212, + "grad_norm": 0.7308264970779419, + "learning_rate": 2.791055678663862e-06, + "loss": 0.8295, + "step": 279710 + }, + { + "epoch": 1.78705135249096, + "grad_norm": 0.6703952550888062, + "learning_rate": 2.789402924377266e-06, + "loss": 0.8897, + "step": 279720 + }, + { + "epoch": 1.7871152396406986, + "grad_norm": 0.7939437627792358, + "learning_rate": 2.7877506455422144e-06, + "loss": 0.8806, + "step": 279730 + }, + { + "epoch": 1.7871791267904373, + "grad_norm": 0.9304759502410889, + "learning_rate": 2.786098842175372e-06, + "loss": 0.8723, + "step": 279740 + }, + { + "epoch": 1.787243013940176, + "grad_norm": 1.3307108879089355, + "learning_rate": 2.784447514293348e-06, + "loss": 0.8925, + "step": 279750 + }, + { + "epoch": 1.7873069010899147, + "grad_norm": 2.4299263954162598, + "learning_rate": 2.7827966619127897e-06, + "loss": 0.8418, + "step": 279760 + }, + { + "epoch": 1.7873707882396535, + "grad_norm": 0.7929475903511047, + "learning_rate": 2.781146285050318e-06, + "loss": 0.9435, + "step": 279770 + }, + { + "epoch": 1.7874346753893922, + "grad_norm": 1.3617475032806396, + "learning_rate": 2.7794963837225576e-06, + "loss": 0.6976, + "step": 279780 + }, + { + "epoch": 1.7874985625391309, + "grad_norm": 1.198539137840271, + "learning_rate": 2.7778469579461065e-06, + "loss": 0.9228, + "step": 279790 + }, + { + "epoch": 1.7875624496888696, + "grad_norm": 0.852506697177887, + "learning_rate": 2.7761980077376017e-06, + "loss": 0.7974, + "step": 279800 + }, + { + "epoch": 1.7876263368386083, + "grad_norm": 1.110378384590149, + "learning_rate": 2.7745495331136353e-06, + "loss": 0.7613, + "step": 279810 + }, + { + "epoch": 1.787690223988347, + "grad_norm": 0.4857124090194702, + "learning_rate": 2.7729015340908104e-06, + "loss": 0.633, + "step": 279820 + }, + { + "epoch": 1.7877541111380857, + "grad_norm": 1.0407607555389404, + "learning_rate": 
2.77125401068572e-06, + "loss": 0.8661, + "step": 279830 + }, + { + "epoch": 1.7878179982878244, + "grad_norm": 0.7220622897148132, + "learning_rate": 2.769606962914967e-06, + "loss": 0.6934, + "step": 279840 + }, + { + "epoch": 1.787881885437563, + "grad_norm": 0.8035644888877869, + "learning_rate": 2.767960390795127e-06, + "loss": 1.0495, + "step": 279850 + }, + { + "epoch": 1.7879457725873018, + "grad_norm": 0.9362843632698059, + "learning_rate": 2.766314294342792e-06, + "loss": 0.9978, + "step": 279860 + }, + { + "epoch": 1.7880096597370405, + "grad_norm": 3.5854671001434326, + "learning_rate": 2.7646686735745274e-06, + "loss": 0.938, + "step": 279870 + }, + { + "epoch": 1.7880735468867792, + "grad_norm": 0.7580956220626831, + "learning_rate": 2.763023528506925e-06, + "loss": 0.933, + "step": 279880 + }, + { + "epoch": 1.788137434036518, + "grad_norm": 1.3497252464294434, + "learning_rate": 2.761378859156527e-06, + "loss": 0.99, + "step": 279890 + }, + { + "epoch": 1.7882013211862566, + "grad_norm": 1.0116132497787476, + "learning_rate": 2.7597346655399258e-06, + "loss": 0.6312, + "step": 279900 + }, + { + "epoch": 1.7882652083359953, + "grad_norm": 0.81215500831604, + "learning_rate": 2.7580909476736527e-06, + "loss": 0.8248, + "step": 279910 + }, + { + "epoch": 1.788329095485734, + "grad_norm": 1.3211039304733276, + "learning_rate": 2.7564477055742775e-06, + "loss": 0.8918, + "step": 279920 + }, + { + "epoch": 1.7883929826354727, + "grad_norm": 0.8589543104171753, + "learning_rate": 2.7548049392583485e-06, + "loss": 0.7415, + "step": 279930 + }, + { + "epoch": 1.7884568697852115, + "grad_norm": 0.7645835876464844, + "learning_rate": 2.753162648742402e-06, + "loss": 0.7322, + "step": 279940 + }, + { + "epoch": 1.78852075693495, + "grad_norm": 0.8146962523460388, + "learning_rate": 2.7515208340429922e-06, + "loss": 0.8259, + "step": 279950 + }, + { + "epoch": 1.7885846440846889, + "grad_norm": 0.9893962144851685, + "learning_rate": 2.7498794951766326e-06, + 
"loss": 0.7888, + "step": 279960 + }, + { + "epoch": 1.7886485312344274, + "grad_norm": 1.0243322849273682, + "learning_rate": 2.7482386321598717e-06, + "loss": 1.0943, + "step": 279970 + }, + { + "epoch": 1.7887124183841663, + "grad_norm": 1.2842010259628296, + "learning_rate": 2.7465982450092187e-06, + "loss": 1.1016, + "step": 279980 + }, + { + "epoch": 1.7887763055339048, + "grad_norm": 0.7452923655509949, + "learning_rate": 2.74495833374121e-06, + "loss": 0.7953, + "step": 279990 + }, + { + "epoch": 1.7888401926836437, + "grad_norm": 0.8941696286201477, + "learning_rate": 2.7433188983723436e-06, + "loss": 0.7354, + "step": 280000 + }, + { + "epoch": 1.7889040798333822, + "grad_norm": 0.608316957950592, + "learning_rate": 2.741679938919145e-06, + "loss": 0.7913, + "step": 280010 + }, + { + "epoch": 1.788967966983121, + "grad_norm": 0.7802823185920715, + "learning_rate": 2.740041455398107e-06, + "loss": 0.8159, + "step": 280020 + }, + { + "epoch": 1.7890318541328596, + "grad_norm": 1.0596716403961182, + "learning_rate": 2.738403447825744e-06, + "loss": 1.2073, + "step": 280030 + }, + { + "epoch": 1.7890957412825985, + "grad_norm": 1.100813388824463, + "learning_rate": 2.7367659162185424e-06, + "loss": 0.7146, + "step": 280040 + }, + { + "epoch": 1.789159628432337, + "grad_norm": 0.778558075428009, + "learning_rate": 2.7351288605930005e-06, + "loss": 0.9001, + "step": 280050 + }, + { + "epoch": 1.789223515582076, + "grad_norm": 0.989571750164032, + "learning_rate": 2.733492280965594e-06, + "loss": 0.7467, + "step": 280060 + }, + { + "epoch": 1.7892874027318144, + "grad_norm": 0.946502685546875, + "learning_rate": 2.7318561773528153e-06, + "loss": 1.2593, + "step": 280070 + }, + { + "epoch": 1.7893512898815533, + "grad_norm": 0.8070095181465149, + "learning_rate": 2.730220549771134e-06, + "loss": 0.8182, + "step": 280080 + }, + { + "epoch": 1.7894151770312918, + "grad_norm": 1.299543023109436, + "learning_rate": 2.728585398237032e-06, + "loss": 0.7447, + "step": 
280090 + }, + { + "epoch": 1.7894790641810308, + "grad_norm": 1.6871373653411865, + "learning_rate": 2.726950722766969e-06, + "loss": 0.921, + "step": 280100 + }, + { + "epoch": 1.7895429513307692, + "grad_norm": 0.7189459204673767, + "learning_rate": 2.7253165233774026e-06, + "loss": 0.8707, + "step": 280110 + }, + { + "epoch": 1.7896068384805082, + "grad_norm": 1.1018993854522705, + "learning_rate": 2.7236828000848034e-06, + "loss": 0.8479, + "step": 280120 + }, + { + "epoch": 1.7896707256302467, + "grad_norm": 1.2004035711288452, + "learning_rate": 2.7220495529056143e-06, + "loss": 0.848, + "step": 280130 + }, + { + "epoch": 1.7897346127799856, + "grad_norm": 1.2776861190795898, + "learning_rate": 2.7204167818562944e-06, + "loss": 0.9096, + "step": 280140 + }, + { + "epoch": 1.789798499929724, + "grad_norm": 0.7575962543487549, + "learning_rate": 2.718784486953274e-06, + "loss": 0.8194, + "step": 280150 + }, + { + "epoch": 1.789862387079463, + "grad_norm": 0.7639335989952087, + "learning_rate": 2.717152668213008e-06, + "loss": 0.8457, + "step": 280160 + }, + { + "epoch": 1.7899262742292015, + "grad_norm": 1.0216253995895386, + "learning_rate": 2.715521325651904e-06, + "loss": 0.7565, + "step": 280170 + }, + { + "epoch": 1.7899901613789404, + "grad_norm": 1.1266084909439087, + "learning_rate": 2.7138904592864278e-06, + "loss": 1.0067, + "step": 280180 + }, + { + "epoch": 1.7900540485286789, + "grad_norm": 1.3970292806625366, + "learning_rate": 2.7122600691329657e-06, + "loss": 0.8546, + "step": 280190 + }, + { + "epoch": 1.7901179356784176, + "grad_norm": 0.7054768204689026, + "learning_rate": 2.7106301552079662e-06, + "loss": 0.7755, + "step": 280200 + }, + { + "epoch": 1.7901818228281563, + "grad_norm": 0.8589091897010803, + "learning_rate": 2.7090007175278207e-06, + "loss": 0.85, + "step": 280210 + }, + { + "epoch": 1.790245709977895, + "grad_norm": 0.6626318693161011, + "learning_rate": 2.7073717561089616e-06, + "loss": 0.69, + "step": 280220 + }, + { + 
"epoch": 1.7903095971276337, + "grad_norm": 0.8320964574813843, + "learning_rate": 2.70574327096777e-06, + "loss": 0.9992, + "step": 280230 + }, + { + "epoch": 1.7903734842773724, + "grad_norm": 0.9648492932319641, + "learning_rate": 2.7041152621206655e-06, + "loss": 1.0147, + "step": 280240 + }, + { + "epoch": 1.7904373714271111, + "grad_norm": 1.173795223236084, + "learning_rate": 2.7024877295840413e-06, + "loss": 0.8556, + "step": 280250 + }, + { + "epoch": 1.7905012585768498, + "grad_norm": 0.774719774723053, + "learning_rate": 2.7008606733742723e-06, + "loss": 0.7414, + "step": 280260 + }, + { + "epoch": 1.7905651457265885, + "grad_norm": 1.7096000909805298, + "learning_rate": 2.6992340935077685e-06, + "loss": 0.7764, + "step": 280270 + }, + { + "epoch": 1.7906290328763272, + "grad_norm": 1.836374282836914, + "learning_rate": 2.6976079900008887e-06, + "loss": 0.8475, + "step": 280280 + }, + { + "epoch": 1.790692920026066, + "grad_norm": 0.7114518284797668, + "learning_rate": 2.6959823628700255e-06, + "loss": 0.7563, + "step": 280290 + }, + { + "epoch": 1.7907568071758047, + "grad_norm": 0.8420717716217041, + "learning_rate": 2.694357212131543e-06, + "loss": 0.815, + "step": 280300 + }, + { + "epoch": 1.7908206943255434, + "grad_norm": 0.8970429301261902, + "learning_rate": 2.6927325378018063e-06, + "loss": 0.7784, + "step": 280310 + }, + { + "epoch": 1.790884581475282, + "grad_norm": 1.0795053243637085, + "learning_rate": 2.6911083398971805e-06, + "loss": 1.127, + "step": 280320 + }, + { + "epoch": 1.7909484686250208, + "grad_norm": 1.1815121173858643, + "learning_rate": 2.689484618434024e-06, + "loss": 1.085, + "step": 280330 + }, + { + "epoch": 1.7910123557747595, + "grad_norm": 0.815788745880127, + "learning_rate": 2.6878613734286797e-06, + "loss": 0.8834, + "step": 280340 + }, + { + "epoch": 1.7910762429244982, + "grad_norm": 0.9623815417289734, + "learning_rate": 2.686238604897512e-06, + "loss": 1.0261, + "step": 280350 + }, + { + "epoch": 
1.791140130074237, + "grad_norm": 0.9388995170593262, + "learning_rate": 2.684616312856841e-06, + "loss": 0.7752, + "step": 280360 + }, + { + "epoch": 1.7912040172239756, + "grad_norm": 1.041695475578308, + "learning_rate": 2.6829944973230323e-06, + "loss": 0.8244, + "step": 280370 + }, + { + "epoch": 1.7912679043737143, + "grad_norm": 1.519972324371338, + "learning_rate": 2.6813731583123948e-06, + "loss": 1.1163, + "step": 280380 + }, + { + "epoch": 1.791331791523453, + "grad_norm": 1.1777002811431885, + "learning_rate": 2.67975229584127e-06, + "loss": 1.0122, + "step": 280390 + }, + { + "epoch": 1.7913956786731917, + "grad_norm": 1.1681914329528809, + "learning_rate": 2.6781319099259737e-06, + "loss": 0.9996, + "step": 280400 + }, + { + "epoch": 1.7914595658229304, + "grad_norm": 0.7550179362297058, + "learning_rate": 2.6765120005828315e-06, + "loss": 0.8414, + "step": 280410 + }, + { + "epoch": 1.7915234529726691, + "grad_norm": 1.111717939376831, + "learning_rate": 2.6748925678281465e-06, + "loss": 0.6463, + "step": 280420 + }, + { + "epoch": 1.7915873401224078, + "grad_norm": 1.5783500671386719, + "learning_rate": 2.6732736116782396e-06, + "loss": 0.712, + "step": 280430 + }, + { + "epoch": 1.7916512272721463, + "grad_norm": 1.1098181009292603, + "learning_rate": 2.67165513214942e-06, + "loss": 1.0321, + "step": 280440 + }, + { + "epoch": 1.7917151144218852, + "grad_norm": 0.6833020448684692, + "learning_rate": 2.6700371292579628e-06, + "loss": 0.7637, + "step": 280450 + }, + { + "epoch": 1.7917790015716237, + "grad_norm": 0.8996888995170593, + "learning_rate": 2.6684196030201892e-06, + "loss": 0.9861, + "step": 280460 + }, + { + "epoch": 1.7918428887213627, + "grad_norm": 1.858432412147522, + "learning_rate": 2.6668025534523743e-06, + "loss": 0.967, + "step": 280470 + }, + { + "epoch": 1.7919067758711011, + "grad_norm": 0.7341635227203369, + "learning_rate": 2.665185980570811e-06, + "loss": 0.8536, + "step": 280480 + }, + { + "epoch": 1.79197066302084, + 
"grad_norm": 1.4962414503097534, + "learning_rate": 2.6635698843917644e-06, + "loss": 0.8061, + "step": 280490 + }, + { + "epoch": 1.7920345501705786, + "grad_norm": 1.0739679336547852, + "learning_rate": 2.6619542649315323e-06, + "loss": 0.7118, + "step": 280500 + }, + { + "epoch": 1.7920984373203175, + "grad_norm": 0.8998137712478638, + "learning_rate": 2.6603391222063677e-06, + "loss": 0.8174, + "step": 280510 + }, + { + "epoch": 1.792162324470056, + "grad_norm": 0.8208849430084229, + "learning_rate": 2.6587244562325476e-06, + "loss": 1.0456, + "step": 280520 + }, + { + "epoch": 1.792226211619795, + "grad_norm": 0.8737877011299133, + "learning_rate": 2.657110267026325e-06, + "loss": 0.8667, + "step": 280530 + }, + { + "epoch": 1.7922900987695334, + "grad_norm": 2.8004097938537598, + "learning_rate": 2.655496554603959e-06, + "loss": 1.0523, + "step": 280540 + }, + { + "epoch": 1.7923539859192723, + "grad_norm": 1.1986647844314575, + "learning_rate": 2.6538833189817035e-06, + "loss": 1.1346, + "step": 280550 + }, + { + "epoch": 1.7924178730690108, + "grad_norm": 0.9740709662437439, + "learning_rate": 2.652270560175801e-06, + "loss": 0.8708, + "step": 280560 + }, + { + "epoch": 1.7924817602187497, + "grad_norm": 0.835901141166687, + "learning_rate": 2.6506582782024946e-06, + "loss": 0.7152, + "step": 280570 + }, + { + "epoch": 1.7925456473684882, + "grad_norm": 0.7605105638504028, + "learning_rate": 2.6490464730780264e-06, + "loss": 0.9103, + "step": 280580 + }, + { + "epoch": 1.7926095345182271, + "grad_norm": 0.8529285192489624, + "learning_rate": 2.647435144818622e-06, + "loss": 0.9013, + "step": 280590 + }, + { + "epoch": 1.7926734216679656, + "grad_norm": 0.9947511553764343, + "learning_rate": 2.645824293440513e-06, + "loss": 0.866, + "step": 280600 + }, + { + "epoch": 1.7927373088177045, + "grad_norm": 0.9029473066329956, + "learning_rate": 2.6442139189599203e-06, + "loss": 0.9818, + "step": 280610 + }, + { + "epoch": 1.792801195967443, + "grad_norm": 
0.9523748755455017, + "learning_rate": 2.6426040213930635e-06, + "loss": 0.9322, + "step": 280620 + }, + { + "epoch": 1.792865083117182, + "grad_norm": 0.8351441621780396, + "learning_rate": 2.640994600756147e-06, + "loss": 1.008, + "step": 280630 + }, + { + "epoch": 1.7929289702669204, + "grad_norm": 1.6545028686523438, + "learning_rate": 2.6393856570653906e-06, + "loss": 1.0136, + "step": 280640 + }, + { + "epoch": 1.7929928574166594, + "grad_norm": 0.8904891014099121, + "learning_rate": 2.637777190336993e-06, + "loss": 0.8758, + "step": 280650 + }, + { + "epoch": 1.7930567445663979, + "grad_norm": 1.1041239500045776, + "learning_rate": 2.6361692005871517e-06, + "loss": 0.7892, + "step": 280660 + }, + { + "epoch": 1.7931206317161368, + "grad_norm": 1.135854721069336, + "learning_rate": 2.6345616878320767e-06, + "loss": 1.1031, + "step": 280670 + }, + { + "epoch": 1.7931845188658753, + "grad_norm": 0.6659204959869385, + "learning_rate": 2.6329546520879265e-06, + "loss": 1.0759, + "step": 280680 + }, + { + "epoch": 1.793248406015614, + "grad_norm": 1.7435749769210815, + "learning_rate": 2.631348093370911e-06, + "loss": 0.9217, + "step": 280690 + }, + { + "epoch": 1.7933122931653527, + "grad_norm": 2.084075689315796, + "learning_rate": 2.6297420116971895e-06, + "loss": 1.0404, + "step": 280700 + }, + { + "epoch": 1.7933761803150914, + "grad_norm": 1.4105054140090942, + "learning_rate": 2.628136407082954e-06, + "loss": 0.8211, + "step": 280710 + }, + { + "epoch": 1.79344006746483, + "grad_norm": 1.0215950012207031, + "learning_rate": 2.626531279544364e-06, + "loss": 1.0964, + "step": 280720 + }, + { + "epoch": 1.7935039546145688, + "grad_norm": 0.8240851759910583, + "learning_rate": 2.6249266290975905e-06, + "loss": 0.8405, + "step": 280730 + }, + { + "epoch": 1.7935678417643075, + "grad_norm": 0.5688413381576538, + "learning_rate": 2.6233224557587867e-06, + "loss": 1.1805, + "step": 280740 + }, + { + "epoch": 1.7936317289140462, + "grad_norm": 5.479889392852783, + 
"learning_rate": 2.6217187595441172e-06, + "loss": 1.1006, + "step": 280750 + }, + { + "epoch": 1.793695616063785, + "grad_norm": 0.7729905247688293, + "learning_rate": 2.6201155404697253e-06, + "loss": 0.7942, + "step": 280760 + }, + { + "epoch": 1.7937595032135236, + "grad_norm": 1.061532735824585, + "learning_rate": 2.618512798551753e-06, + "loss": 0.9488, + "step": 280770 + }, + { + "epoch": 1.7938233903632623, + "grad_norm": 0.8962979316711426, + "learning_rate": 2.6169105338063604e-06, + "loss": 0.8684, + "step": 280780 + }, + { + "epoch": 1.793887277513001, + "grad_norm": 1.4239200353622437, + "learning_rate": 2.6153087462496616e-06, + "loss": 0.732, + "step": 280790 + }, + { + "epoch": 1.7939511646627397, + "grad_norm": 1.5325372219085693, + "learning_rate": 2.6137074358978054e-06, + "loss": 0.8815, + "step": 280800 + }, + { + "epoch": 1.7940150518124784, + "grad_norm": 1.595062494277954, + "learning_rate": 2.6121066027669006e-06, + "loss": 0.9914, + "step": 280810 + }, + { + "epoch": 1.7940789389622172, + "grad_norm": 0.9620861411094666, + "learning_rate": 2.61050624687309e-06, + "loss": 1.1952, + "step": 280820 + }, + { + "epoch": 1.7941428261119559, + "grad_norm": 1.8029391765594482, + "learning_rate": 2.6089063682324664e-06, + "loss": 1.0367, + "step": 280830 + }, + { + "epoch": 1.7942067132616946, + "grad_norm": 0.8456335663795471, + "learning_rate": 2.607306966861167e-06, + "loss": 1.0241, + "step": 280840 + }, + { + "epoch": 1.7942706004114333, + "grad_norm": 0.8139656782150269, + "learning_rate": 2.6057080427752787e-06, + "loss": 0.954, + "step": 280850 + }, + { + "epoch": 1.794334487561172, + "grad_norm": 0.5983510613441467, + "learning_rate": 2.6041095959909167e-06, + "loss": 0.9332, + "step": 280860 + }, + { + "epoch": 1.7943983747109107, + "grad_norm": 1.8724719285964966, + "learning_rate": 2.602511626524168e-06, + "loss": 0.7195, + "step": 280870 + }, + { + "epoch": 1.7944622618606494, + "grad_norm": 1.7823671102523804, + "learning_rate": 
2.6009141343911414e-06, + "loss": 1.058, + "step": 280880 + }, + { + "epoch": 1.794526149010388, + "grad_norm": 1.1335912942886353, + "learning_rate": 2.599317119607908e-06, + "loss": 0.7466, + "step": 280890 + }, + { + "epoch": 1.7945900361601268, + "grad_norm": 0.8973339200019836, + "learning_rate": 2.597720582190566e-06, + "loss": 0.825, + "step": 280900 + }, + { + "epoch": 1.7946539233098655, + "grad_norm": 0.9591314792633057, + "learning_rate": 2.5961245221551746e-06, + "loss": 0.9578, + "step": 280910 + }, + { + "epoch": 1.7947178104596042, + "grad_norm": 1.0243194103240967, + "learning_rate": 2.594528939517832e-06, + "loss": 0.6736, + "step": 280920 + }, + { + "epoch": 1.7947816976093427, + "grad_norm": 0.5387315154075623, + "learning_rate": 2.5929338342945865e-06, + "loss": 0.9586, + "step": 280930 + }, + { + "epoch": 1.7948455847590816, + "grad_norm": 0.982440173625946, + "learning_rate": 2.591339206501514e-06, + "loss": 0.7208, + "step": 280940 + }, + { + "epoch": 1.79490947190882, + "grad_norm": 1.2296168804168701, + "learning_rate": 2.589745056154669e-06, + "loss": 0.8387, + "step": 280950 + }, + { + "epoch": 1.794973359058559, + "grad_norm": 1.1976443529129028, + "learning_rate": 2.5881513832701044e-06, + "loss": 0.9695, + "step": 280960 + }, + { + "epoch": 1.7950372462082975, + "grad_norm": 1.1437692642211914, + "learning_rate": 2.5865581878638745e-06, + "loss": 0.6054, + "step": 280970 + }, + { + "epoch": 1.7951011333580364, + "grad_norm": 0.8556347489356995, + "learning_rate": 2.584965469952022e-06, + "loss": 0.9218, + "step": 280980 + }, + { + "epoch": 1.795165020507775, + "grad_norm": 0.7509362101554871, + "learning_rate": 2.5833732295505895e-06, + "loss": 0.9018, + "step": 280990 + }, + { + "epoch": 1.7952289076575139, + "grad_norm": 0.9571146368980408, + "learning_rate": 2.5817814666756035e-06, + "loss": 0.7376, + "step": 281000 + }, + { + "epoch": 1.7952927948072523, + "grad_norm": 1.0304933786392212, + "learning_rate": 2.5801901813431116e-06, 
+ "loss": 1.0329, + "step": 281010 + }, + { + "epoch": 1.7953566819569913, + "grad_norm": 0.7072136402130127, + "learning_rate": 2.5785993735691184e-06, + "loss": 0.644, + "step": 281020 + }, + { + "epoch": 1.7954205691067298, + "grad_norm": 1.249049186706543, + "learning_rate": 2.5770090433696604e-06, + "loss": 1.0775, + "step": 281030 + }, + { + "epoch": 1.7954844562564687, + "grad_norm": 1.0728756189346313, + "learning_rate": 2.5754191907607473e-06, + "loss": 1.0548, + "step": 281040 + }, + { + "epoch": 1.7955483434062072, + "grad_norm": 0.8451454043388367, + "learning_rate": 2.5738298157583885e-06, + "loss": 0.9135, + "step": 281050 + }, + { + "epoch": 1.795612230555946, + "grad_norm": 2.4911673069000244, + "learning_rate": 2.5722409183785933e-06, + "loss": 0.9373, + "step": 281060 + }, + { + "epoch": 1.7956761177056846, + "grad_norm": 0.8939559459686279, + "learning_rate": 2.570652498637366e-06, + "loss": 0.8767, + "step": 281070 + }, + { + "epoch": 1.7957400048554235, + "grad_norm": 5.280501365661621, + "learning_rate": 2.5690645565506987e-06, + "loss": 0.8534, + "step": 281080 + }, + { + "epoch": 1.795803892005162, + "grad_norm": 0.7299918532371521, + "learning_rate": 2.567477092134585e-06, + "loss": 0.7935, + "step": 281090 + }, + { + "epoch": 1.795867779154901, + "grad_norm": 0.9297654032707214, + "learning_rate": 2.565890105405011e-06, + "loss": 0.7796, + "step": 281100 + }, + { + "epoch": 1.7959316663046394, + "grad_norm": 1.0455909967422485, + "learning_rate": 2.564303596377965e-06, + "loss": 0.9042, + "step": 281110 + }, + { + "epoch": 1.7959955534543783, + "grad_norm": 1.2347888946533203, + "learning_rate": 2.562717565069411e-06, + "loss": 1.0895, + "step": 281120 + }, + { + "epoch": 1.7960594406041168, + "grad_norm": 0.769524335861206, + "learning_rate": 2.5611320114953374e-06, + "loss": 0.6139, + "step": 281130 + }, + { + "epoch": 1.7961233277538557, + "grad_norm": 0.8894920945167542, + "learning_rate": 2.559546935671697e-06, + "loss": 1.0127, + 
"step": 281140 + }, + { + "epoch": 1.7961872149035942, + "grad_norm": 0.9349243640899658, + "learning_rate": 2.557962337614467e-06, + "loss": 0.8493, + "step": 281150 + }, + { + "epoch": 1.796251102053333, + "grad_norm": 1.5105633735656738, + "learning_rate": 2.5563782173396054e-06, + "loss": 1.0115, + "step": 281160 + }, + { + "epoch": 1.7963149892030716, + "grad_norm": 1.3091589212417603, + "learning_rate": 2.5547945748630454e-06, + "loss": 0.8647, + "step": 281170 + }, + { + "epoch": 1.7963788763528103, + "grad_norm": 0.8863422274589539, + "learning_rate": 2.553211410200762e-06, + "loss": 0.8885, + "step": 281180 + }, + { + "epoch": 1.796442763502549, + "grad_norm": 1.0648189783096313, + "learning_rate": 2.5516287233686766e-06, + "loss": 0.7835, + "step": 281190 + }, + { + "epoch": 1.7965066506522878, + "grad_norm": 1.1669793128967285, + "learning_rate": 2.550046514382748e-06, + "loss": 1.1771, + "step": 281200 + }, + { + "epoch": 1.7965705378020265, + "grad_norm": 1.1036698818206787, + "learning_rate": 2.548464783258897e-06, + "loss": 1.0883, + "step": 281210 + }, + { + "epoch": 1.7966344249517652, + "grad_norm": 1.1956664323806763, + "learning_rate": 2.546883530013061e-06, + "loss": 0.7737, + "step": 281220 + }, + { + "epoch": 1.7966983121015039, + "grad_norm": 0.824192225933075, + "learning_rate": 2.5453027546611495e-06, + "loss": 0.8914, + "step": 281230 + }, + { + "epoch": 1.7967621992512426, + "grad_norm": 0.9419105052947998, + "learning_rate": 2.5437224572191053e-06, + "loss": 0.7246, + "step": 281240 + }, + { + "epoch": 1.7968260864009813, + "grad_norm": 1.3376890420913696, + "learning_rate": 2.542142637702821e-06, + "loss": 0.8412, + "step": 281250 + }, + { + "epoch": 1.79688997355072, + "grad_norm": 0.7286721467971802, + "learning_rate": 2.540563296128229e-06, + "loss": 0.7594, + "step": 281260 + }, + { + "epoch": 1.7969538607004587, + "grad_norm": 1.2010440826416016, + "learning_rate": 2.5389844325112157e-06, + "loss": 0.8162, + "step": 281270 + }, + 
{ + "epoch": 1.7970177478501974, + "grad_norm": 1.0000466108322144, + "learning_rate": 2.537406046867685e-06, + "loss": 0.72, + "step": 281280 + }, + { + "epoch": 1.7970816349999361, + "grad_norm": 0.7603448033332825, + "learning_rate": 2.5358281392135417e-06, + "loss": 0.9938, + "step": 281290 + }, + { + "epoch": 1.7971455221496748, + "grad_norm": 0.7650514841079712, + "learning_rate": 2.5342507095646727e-06, + "loss": 0.7677, + "step": 281300 + }, + { + "epoch": 1.7972094092994135, + "grad_norm": 0.8645716309547424, + "learning_rate": 2.53267375793696e-06, + "loss": 0.8062, + "step": 281310 + }, + { + "epoch": 1.7972732964491522, + "grad_norm": 1.3306423425674438, + "learning_rate": 2.53109728434629e-06, + "loss": 0.7866, + "step": 281320 + }, + { + "epoch": 1.797337183598891, + "grad_norm": 0.8305380940437317, + "learning_rate": 2.5295212888085398e-06, + "loss": 0.8128, + "step": 281330 + }, + { + "epoch": 1.7974010707486296, + "grad_norm": 1.4031856060028076, + "learning_rate": 2.5279457713395684e-06, + "loss": 0.7964, + "step": 281340 + }, + { + "epoch": 1.7974649578983684, + "grad_norm": 0.9651740193367004, + "learning_rate": 2.526370731955258e-06, + "loss": 0.7984, + "step": 281350 + }, + { + "epoch": 1.797528845048107, + "grad_norm": 1.2332689762115479, + "learning_rate": 2.524796170671462e-06, + "loss": 0.9097, + "step": 281360 + }, + { + "epoch": 1.7975927321978458, + "grad_norm": 0.8523402214050293, + "learning_rate": 2.5232220875040456e-06, + "loss": 0.7414, + "step": 281370 + }, + { + "epoch": 1.7976566193475845, + "grad_norm": 1.4965907335281372, + "learning_rate": 2.5216484824688522e-06, + "loss": 0.8115, + "step": 281380 + }, + { + "epoch": 1.7977205064973232, + "grad_norm": 1.2132138013839722, + "learning_rate": 2.5200753555817358e-06, + "loss": 0.9746, + "step": 281390 + }, + { + "epoch": 1.7977843936470619, + "grad_norm": 1.3495724201202393, + "learning_rate": 2.518502706858533e-06, + "loss": 1.032, + "step": 281400 + }, + { + "epoch": 
1.7978482807968006, + "grad_norm": 1.1350055932998657, + "learning_rate": 2.516930536315093e-06, + "loss": 0.9833, + "step": 281410 + }, + { + "epoch": 1.797912167946539, + "grad_norm": 1.4620221853256226, + "learning_rate": 2.5153588439672303e-06, + "loss": 0.7098, + "step": 281420 + }, + { + "epoch": 1.797976055096278, + "grad_norm": 0.6531504988670349, + "learning_rate": 2.513787629830794e-06, + "loss": 0.784, + "step": 281430 + }, + { + "epoch": 1.7980399422460165, + "grad_norm": 0.8889174461364746, + "learning_rate": 2.512216893921587e-06, + "loss": 1.0544, + "step": 281440 + }, + { + "epoch": 1.7981038293957554, + "grad_norm": 2.057274341583252, + "learning_rate": 2.510646636255448e-06, + "loss": 0.6042, + "step": 281450 + }, + { + "epoch": 1.798167716545494, + "grad_norm": 0.6107593774795532, + "learning_rate": 2.5090768568481804e-06, + "loss": 0.8022, + "step": 281460 + }, + { + "epoch": 1.7982316036952328, + "grad_norm": 0.7373892664909363, + "learning_rate": 2.5075075557155935e-06, + "loss": 0.6763, + "step": 281470 + }, + { + "epoch": 1.7982954908449713, + "grad_norm": 1.1133463382720947, + "learning_rate": 2.505938732873486e-06, + "loss": 0.9057, + "step": 281480 + }, + { + "epoch": 1.7983593779947102, + "grad_norm": 2.7442684173583984, + "learning_rate": 2.504370388337668e-06, + "loss": 0.9499, + "step": 281490 + }, + { + "epoch": 1.7984232651444487, + "grad_norm": 1.0574442148208618, + "learning_rate": 2.5028025221239315e-06, + "loss": 1.0303, + "step": 281500 + }, + { + "epoch": 1.7984871522941877, + "grad_norm": 0.6583574414253235, + "learning_rate": 2.5012351342480587e-06, + "loss": 0.8544, + "step": 281510 + }, + { + "epoch": 1.7985510394439261, + "grad_norm": 3.4296627044677734, + "learning_rate": 2.499668224725854e-06, + "loss": 0.7747, + "step": 281520 + }, + { + "epoch": 1.798614926593665, + "grad_norm": 0.8521742224693298, + "learning_rate": 2.498101793573071e-06, + "loss": 0.9831, + "step": 281530 + }, + { + "epoch": 1.7986788137434035, + 
"grad_norm": 1.4164676666259766, + "learning_rate": 2.4965358408055026e-06, + "loss": 0.8453, + "step": 281540 + }, + { + "epoch": 1.7987427008931425, + "grad_norm": 1.0506097078323364, + "learning_rate": 2.4949703664389144e-06, + "loss": 0.8994, + "step": 281550 + }, + { + "epoch": 1.798806588042881, + "grad_norm": 1.057515025138855, + "learning_rate": 2.4934053704890712e-06, + "loss": 0.7166, + "step": 281560 + }, + { + "epoch": 1.7988704751926199, + "grad_norm": 1.1061023473739624, + "learning_rate": 2.491840852971733e-06, + "loss": 0.6907, + "step": 281570 + }, + { + "epoch": 1.7989343623423584, + "grad_norm": 1.2089459896087646, + "learning_rate": 2.4902768139026646e-06, + "loss": 0.8302, + "step": 281580 + }, + { + "epoch": 1.7989982494920973, + "grad_norm": 1.5082370042800903, + "learning_rate": 2.488713253297603e-06, + "loss": 0.8124, + "step": 281590 + }, + { + "epoch": 1.7990621366418358, + "grad_norm": 0.8607321977615356, + "learning_rate": 2.487150171172309e-06, + "loss": 1.0578, + "step": 281600 + }, + { + "epoch": 1.7991260237915747, + "grad_norm": 0.9606289863586426, + "learning_rate": 2.4855875675425135e-06, + "loss": 1.0664, + "step": 281610 + }, + { + "epoch": 1.7991899109413132, + "grad_norm": 1.1536765098571777, + "learning_rate": 2.4840254424239595e-06, + "loss": 0.6769, + "step": 281620 + }, + { + "epoch": 1.7992537980910521, + "grad_norm": 1.187015175819397, + "learning_rate": 2.4824637958323683e-06, + "loss": 0.632, + "step": 281630 + }, + { + "epoch": 1.7993176852407906, + "grad_norm": 0.8003060221672058, + "learning_rate": 2.480902627783488e-06, + "loss": 0.9771, + "step": 281640 + }, + { + "epoch": 1.7993815723905293, + "grad_norm": 0.8572350740432739, + "learning_rate": 2.4793419382930226e-06, + "loss": 0.8384, + "step": 281650 + }, + { + "epoch": 1.799445459540268, + "grad_norm": 1.1636265516281128, + "learning_rate": 2.4777817273766935e-06, + "loss": 0.7707, + "step": 281660 + }, + { + "epoch": 1.7995093466900067, + "grad_norm": 
0.8209835886955261, + "learning_rate": 2.476221995050215e-06, + "loss": 0.7719, + "step": 281670 + }, + { + "epoch": 1.7995732338397454, + "grad_norm": 1.7101112604141235, + "learning_rate": 2.474662741329292e-06, + "loss": 0.8495, + "step": 281680 + }, + { + "epoch": 1.7996371209894841, + "grad_norm": 0.7957033514976501, + "learning_rate": 2.4731039662296394e-06, + "loss": 0.8171, + "step": 281690 + }, + { + "epoch": 1.7997010081392228, + "grad_norm": 1.2597066164016724, + "learning_rate": 2.4715456697669336e-06, + "loss": 0.8401, + "step": 281700 + }, + { + "epoch": 1.7997648952889616, + "grad_norm": 0.9615569710731506, + "learning_rate": 2.4699878519568954e-06, + "loss": 0.856, + "step": 281710 + }, + { + "epoch": 1.7998287824387003, + "grad_norm": 0.6585493683815002, + "learning_rate": 2.468430512815184e-06, + "loss": 0.9701, + "step": 281720 + }, + { + "epoch": 1.799892669588439, + "grad_norm": 1.0327953100204468, + "learning_rate": 2.46687365235751e-06, + "loss": 0.7134, + "step": 281730 + }, + { + "epoch": 1.7999565567381777, + "grad_norm": 1.1007215976715088, + "learning_rate": 2.465317270599532e-06, + "loss": 0.819, + "step": 281740 + }, + { + "epoch": 1.8000204438879164, + "grad_norm": 1.172269582748413, + "learning_rate": 2.463761367556944e-06, + "loss": 0.8989, + "step": 281750 + }, + { + "epoch": 1.800084331037655, + "grad_norm": 0.9221561551094055, + "learning_rate": 2.462205943245388e-06, + "loss": 0.9553, + "step": 281760 + }, + { + "epoch": 1.8001482181873938, + "grad_norm": 1.5058354139328003, + "learning_rate": 2.460650997680558e-06, + "loss": 0.7927, + "step": 281770 + }, + { + "epoch": 1.8002121053371325, + "grad_norm": 0.97669917345047, + "learning_rate": 2.4590965308780913e-06, + "loss": 0.8084, + "step": 281780 + }, + { + "epoch": 1.8002759924868712, + "grad_norm": 1.6297342777252197, + "learning_rate": 2.457542542853658e-06, + "loss": 0.9909, + "step": 281790 + }, + { + "epoch": 1.80033987963661, + "grad_norm": 0.9887993335723877, + 
"learning_rate": 2.455989033622891e-06, + "loss": 1.1471, + "step": 281800 + }, + { + "epoch": 1.8004037667863486, + "grad_norm": 1.278480887413025, + "learning_rate": 2.454436003201449e-06, + "loss": 0.74, + "step": 281810 + }, + { + "epoch": 1.8004676539360873, + "grad_norm": 0.8170709609985352, + "learning_rate": 2.452883451604976e-06, + "loss": 0.7428, + "step": 281820 + }, + { + "epoch": 1.800531541085826, + "grad_norm": 0.9986400604248047, + "learning_rate": 2.4513313788490923e-06, + "loss": 1.0936, + "step": 281830 + }, + { + "epoch": 1.8005954282355647, + "grad_norm": 2.614264726638794, + "learning_rate": 2.449779784949446e-06, + "loss": 0.7667, + "step": 281840 + }, + { + "epoch": 1.8006593153853034, + "grad_norm": 1.312461018562317, + "learning_rate": 2.4482286699216483e-06, + "loss": 0.9062, + "step": 281850 + }, + { + "epoch": 1.8007232025350421, + "grad_norm": 1.056638240814209, + "learning_rate": 2.44667803378133e-06, + "loss": 0.9314, + "step": 281860 + }, + { + "epoch": 1.8007870896847809, + "grad_norm": 0.556037425994873, + "learning_rate": 2.4451278765440954e-06, + "loss": 0.9544, + "step": 281870 + }, + { + "epoch": 1.8008509768345196, + "grad_norm": 0.976234495639801, + "learning_rate": 2.443578198225577e-06, + "loss": 0.8445, + "step": 281880 + }, + { + "epoch": 1.800914863984258, + "grad_norm": 0.8293027877807617, + "learning_rate": 2.4420289988413557e-06, + "loss": 0.9645, + "step": 281890 + }, + { + "epoch": 1.800978751133997, + "grad_norm": 1.5045146942138672, + "learning_rate": 2.4404802784070535e-06, + "loss": 0.8362, + "step": 281900 + }, + { + "epoch": 1.8010426382837355, + "grad_norm": 0.8356955051422119, + "learning_rate": 2.4389320369382574e-06, + "loss": 0.7479, + "step": 281910 + }, + { + "epoch": 1.8011065254334744, + "grad_norm": 0.9490901231765747, + "learning_rate": 2.437384274450566e-06, + "loss": 0.8967, + "step": 281920 + }, + { + "epoch": 1.8011704125832129, + "grad_norm": 1.8158320188522339, + "learning_rate": 
2.43583699095955e-06, + "loss": 0.7877, + "step": 281930 + }, + { + "epoch": 1.8012342997329518, + "grad_norm": 1.2870683670043945, + "learning_rate": 2.43429018648082e-06, + "loss": 0.7343, + "step": 281940 + }, + { + "epoch": 1.8012981868826903, + "grad_norm": 0.7817414999008179, + "learning_rate": 2.4327438610299238e-06, + "loss": 0.7824, + "step": 281950 + }, + { + "epoch": 1.8013620740324292, + "grad_norm": 0.9672678112983704, + "learning_rate": 2.43119801462246e-06, + "loss": 1.0592, + "step": 281960 + }, + { + "epoch": 1.8014259611821677, + "grad_norm": 1.268545150756836, + "learning_rate": 2.429652647273978e-06, + "loss": 0.943, + "step": 281970 + }, + { + "epoch": 1.8014898483319066, + "grad_norm": 0.6564430594444275, + "learning_rate": 2.4281077590000533e-06, + "loss": 0.8437, + "step": 281980 + }, + { + "epoch": 1.801553735481645, + "grad_norm": 0.684155285358429, + "learning_rate": 2.42656334981623e-06, + "loss": 0.7213, + "step": 281990 + }, + { + "epoch": 1.801617622631384, + "grad_norm": 1.8212864398956299, + "learning_rate": 2.4250194197380837e-06, + "loss": 0.7796, + "step": 282000 + }, + { + "epoch": 1.8016815097811225, + "grad_norm": 1.3423765897750854, + "learning_rate": 2.423630292315937e-06, + "loss": 1.1215, + "step": 282010 + }, + { + "epoch": 1.8017453969308614, + "grad_norm": 1.205952763557434, + "learning_rate": 2.4220872725813747e-06, + "loss": 0.8938, + "step": 282020 + }, + { + "epoch": 1.8018092840806, + "grad_norm": 1.2857036590576172, + "learning_rate": 2.4205447319975593e-06, + "loss": 0.977, + "step": 282030 + }, + { + "epoch": 1.8018731712303389, + "grad_norm": 0.7614555358886719, + "learning_rate": 2.4190026705800175e-06, + "loss": 0.8993, + "step": 282040 + }, + { + "epoch": 1.8019370583800773, + "grad_norm": 2.7580156326293945, + "learning_rate": 2.417461088344286e-06, + "loss": 1.2222, + "step": 282050 + }, + { + "epoch": 1.8020009455298163, + "grad_norm": 1.0370123386383057, + "learning_rate": 2.415919985305881e-06, + 
"loss": 0.7246, + "step": 282060 + }, + { + "epoch": 1.8020648326795548, + "grad_norm": 0.966238260269165, + "learning_rate": 2.4143793614803347e-06, + "loss": 1.3524, + "step": 282070 + }, + { + "epoch": 1.8021287198292937, + "grad_norm": 0.941066324710846, + "learning_rate": 2.4128392168831504e-06, + "loss": 1.0444, + "step": 282080 + }, + { + "epoch": 1.8021926069790322, + "grad_norm": 0.6195220947265625, + "learning_rate": 2.4112995515298444e-06, + "loss": 0.7292, + "step": 282090 + }, + { + "epoch": 1.802256494128771, + "grad_norm": 0.6646325588226318, + "learning_rate": 2.4097603654359203e-06, + "loss": 0.7402, + "step": 282100 + }, + { + "epoch": 1.8023203812785096, + "grad_norm": 0.9299492835998535, + "learning_rate": 2.4082216586168883e-06, + "loss": 0.7345, + "step": 282110 + }, + { + "epoch": 1.8023842684282485, + "grad_norm": 0.9561014175415039, + "learning_rate": 2.4066834310882247e-06, + "loss": 0.8976, + "step": 282120 + }, + { + "epoch": 1.802448155577987, + "grad_norm": 1.0798419713974, + "learning_rate": 2.4051456828654394e-06, + "loss": 0.8831, + "step": 282130 + }, + { + "epoch": 1.8025120427277257, + "grad_norm": 0.5972111821174622, + "learning_rate": 2.4036084139640146e-06, + "loss": 0.7122, + "step": 282140 + }, + { + "epoch": 1.8025759298774644, + "grad_norm": 1.6115037202835083, + "learning_rate": 2.4020716243994267e-06, + "loss": 0.8927, + "step": 282150 + }, + { + "epoch": 1.802639817027203, + "grad_norm": 1.084858775138855, + "learning_rate": 2.400535314187158e-06, + "loss": 0.7647, + "step": 282160 + }, + { + "epoch": 1.8027037041769418, + "grad_norm": 1.1421221494674683, + "learning_rate": 2.3989994833426788e-06, + "loss": 0.8956, + "step": 282170 + }, + { + "epoch": 1.8027675913266805, + "grad_norm": 0.9091895818710327, + "learning_rate": 2.397464131881455e-06, + "loss": 1.0873, + "step": 282180 + }, + { + "epoch": 1.8028314784764192, + "grad_norm": 1.038530707359314, + "learning_rate": 2.3959292598189463e-06, + "loss": 1.2032, + 
"step": 282190 + }, + { + "epoch": 1.802895365626158, + "grad_norm": 0.9789948463439941, + "learning_rate": 2.3943948671706183e-06, + "loss": 0.6278, + "step": 282200 + }, + { + "epoch": 1.8029592527758966, + "grad_norm": 1.1401680707931519, + "learning_rate": 2.392860953951909e-06, + "loss": 0.9235, + "step": 282210 + }, + { + "epoch": 1.8030231399256353, + "grad_norm": 0.698215901851654, + "learning_rate": 2.391327520178288e-06, + "loss": 1.067, + "step": 282220 + }, + { + "epoch": 1.803087027075374, + "grad_norm": 1.2067962884902954, + "learning_rate": 2.3897945658651777e-06, + "loss": 0.987, + "step": 282230 + }, + { + "epoch": 1.8031509142251128, + "grad_norm": 1.8061639070510864, + "learning_rate": 2.3882620910280316e-06, + "loss": 1.1182, + "step": 282240 + }, + { + "epoch": 1.8032148013748515, + "grad_norm": 0.9816470146179199, + "learning_rate": 2.3867300956822714e-06, + "loss": 1.0446, + "step": 282250 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.7584142088890076, + "learning_rate": 2.385198579843334e-06, + "loss": 0.9838, + "step": 282260 + }, + { + "epoch": 1.8033425756743289, + "grad_norm": 1.0630838871002197, + "learning_rate": 2.383667543526641e-06, + "loss": 0.9216, + "step": 282270 + }, + { + "epoch": 1.8034064628240676, + "grad_norm": 0.9282609224319458, + "learning_rate": 2.3821369867476016e-06, + "loss": 1.1867, + "step": 282280 + }, + { + "epoch": 1.8034703499738063, + "grad_norm": 0.8381339311599731, + "learning_rate": 2.3806069095216487e-06, + "loss": 0.7, + "step": 282290 + }, + { + "epoch": 1.803534237123545, + "grad_norm": 0.9528864026069641, + "learning_rate": 2.3790773118641695e-06, + "loss": 0.9546, + "step": 282300 + }, + { + "epoch": 1.8035981242732837, + "grad_norm": 1.0386065244674683, + "learning_rate": 2.377548193790591e-06, + "loss": 1.104, + "step": 282310 + }, + { + "epoch": 1.8036620114230224, + "grad_norm": 0.791536271572113, + "learning_rate": 2.376019555316289e-06, + "loss": 0.7687, + "step": 282320 + }, + { + 
"epoch": 1.803725898572761, + "grad_norm": 1.0364983081817627, + "learning_rate": 2.3744913964566795e-06, + "loss": 1.0096, + "step": 282330 + }, + { + "epoch": 1.8037897857224998, + "grad_norm": 1.0348879098892212, + "learning_rate": 2.372963717227139e-06, + "loss": 0.8892, + "step": 282340 + }, + { + "epoch": 1.8038536728722385, + "grad_norm": 0.7205535769462585, + "learning_rate": 2.3714365176430662e-06, + "loss": 0.8306, + "step": 282350 + }, + { + "epoch": 1.8039175600219772, + "grad_norm": 1.5916186571121216, + "learning_rate": 2.3699097977198215e-06, + "loss": 0.7633, + "step": 282360 + }, + { + "epoch": 1.803981447171716, + "grad_norm": 0.7715607285499573, + "learning_rate": 2.3683835574727973e-06, + "loss": 0.6269, + "step": 282370 + }, + { + "epoch": 1.8040453343214544, + "grad_norm": 0.8104017972946167, + "learning_rate": 2.3668577969173544e-06, + "loss": 1.0736, + "step": 282380 + }, + { + "epoch": 1.8041092214711933, + "grad_norm": 1.4469940662384033, + "learning_rate": 2.365332516068863e-06, + "loss": 0.8361, + "step": 282390 + }, + { + "epoch": 1.8041731086209318, + "grad_norm": 1.8273128271102905, + "learning_rate": 2.363807714942684e-06, + "loss": 0.7805, + "step": 282400 + }, + { + "epoch": 1.8042369957706708, + "grad_norm": 0.9197923541069031, + "learning_rate": 2.362283393554171e-06, + "loss": 0.745, + "step": 282410 + }, + { + "epoch": 1.8043008829204092, + "grad_norm": 0.9238811731338501, + "learning_rate": 2.360759551918673e-06, + "loss": 0.6943, + "step": 282420 + }, + { + "epoch": 1.8043647700701482, + "grad_norm": 0.8211013674736023, + "learning_rate": 2.3592361900515504e-06, + "loss": 0.7315, + "step": 282430 + }, + { + "epoch": 1.8044286572198867, + "grad_norm": 1.5779584646224976, + "learning_rate": 2.3577133079681235e-06, + "loss": 0.8395, + "step": 282440 + }, + { + "epoch": 1.8044925443696256, + "grad_norm": 0.649736762046814, + "learning_rate": 2.356190905683742e-06, + "loss": 0.7725, + "step": 282450 + }, + { + "epoch": 
1.804556431519364, + "grad_norm": 1.5219225883483887, + "learning_rate": 2.3546689832137423e-06, + "loss": 1.0372, + "step": 282460 + }, + { + "epoch": 1.804620318669103, + "grad_norm": 0.9140293598175049, + "learning_rate": 2.3531475405734414e-06, + "loss": 0.6017, + "step": 282470 + }, + { + "epoch": 1.8046842058188415, + "grad_norm": 1.8277541399002075, + "learning_rate": 2.3516265777781656e-06, + "loss": 0.8381, + "step": 282480 + }, + { + "epoch": 1.8047480929685804, + "grad_norm": 0.8005611896514893, + "learning_rate": 2.35010609484323e-06, + "loss": 0.8137, + "step": 282490 + }, + { + "epoch": 1.804811980118319, + "grad_norm": 1.1830252408981323, + "learning_rate": 2.348586091783955e-06, + "loss": 0.9156, + "step": 282500 + }, + { + "epoch": 1.8048758672680578, + "grad_norm": 0.9271206855773926, + "learning_rate": 2.3470665686156356e-06, + "loss": 1.1015, + "step": 282510 + }, + { + "epoch": 1.8049397544177963, + "grad_norm": 1.3627598285675049, + "learning_rate": 2.3455475253535864e-06, + "loss": 1.0528, + "step": 282520 + }, + { + "epoch": 1.8050036415675352, + "grad_norm": 0.9792562127113342, + "learning_rate": 2.344028962013095e-06, + "loss": 0.9871, + "step": 282530 + }, + { + "epoch": 1.8050675287172737, + "grad_norm": 0.9024627804756165, + "learning_rate": 2.3425108786094663e-06, + "loss": 0.9655, + "step": 282540 + }, + { + "epoch": 1.8051314158670126, + "grad_norm": 0.9467481374740601, + "learning_rate": 2.3409932751579767e-06, + "loss": 0.8784, + "step": 282550 + }, + { + "epoch": 1.8051953030167511, + "grad_norm": 0.7127422094345093, + "learning_rate": 2.339476151673925e-06, + "loss": 0.7942, + "step": 282560 + }, + { + "epoch": 1.80525919016649, + "grad_norm": 0.7716130614280701, + "learning_rate": 2.3379595081725715e-06, + "loss": 0.8521, + "step": 282570 + }, + { + "epoch": 1.8053230773162285, + "grad_norm": 0.6751583218574524, + "learning_rate": 2.3364433446692038e-06, + "loss": 0.8518, + "step": 282580 + }, + { + "epoch": 1.8053869644659675, 
+ "grad_norm": 1.3516067266464233, + "learning_rate": 2.3349276611790816e-06, + "loss": 0.7958, + "step": 282590 + }, + { + "epoch": 1.805450851615706, + "grad_norm": 0.9990543127059937, + "learning_rate": 2.333412457717482e-06, + "loss": 0.9145, + "step": 282600 + }, + { + "epoch": 1.8055147387654449, + "grad_norm": 0.9455886483192444, + "learning_rate": 2.3318977342996486e-06, + "loss": 0.7915, + "step": 282610 + }, + { + "epoch": 1.8055786259151834, + "grad_norm": 0.8298186659812927, + "learning_rate": 2.330383490940852e-06, + "loss": 0.7342, + "step": 282620 + }, + { + "epoch": 1.805642513064922, + "grad_norm": 1.161590337753296, + "learning_rate": 2.328869727656324e-06, + "loss": 0.7807, + "step": 282630 + }, + { + "epoch": 1.8057064002146608, + "grad_norm": 0.8286991119384766, + "learning_rate": 2.3273564444613262e-06, + "loss": 0.8986, + "step": 282640 + }, + { + "epoch": 1.8057702873643995, + "grad_norm": 0.6541506052017212, + "learning_rate": 2.325843641371084e-06, + "loss": 0.9294, + "step": 282650 + }, + { + "epoch": 1.8058341745141382, + "grad_norm": 0.7337067723274231, + "learning_rate": 2.3243313184008463e-06, + "loss": 0.8336, + "step": 282660 + }, + { + "epoch": 1.805898061663877, + "grad_norm": 1.4487308263778687, + "learning_rate": 2.3228194755658405e-06, + "loss": 0.7292, + "step": 282670 + }, + { + "epoch": 1.8059619488136156, + "grad_norm": 1.053105354309082, + "learning_rate": 2.3213081128812818e-06, + "loss": 0.7138, + "step": 282680 + }, + { + "epoch": 1.8060258359633543, + "grad_norm": 0.898478090763092, + "learning_rate": 2.3197972303624027e-06, + "loss": 0.8208, + "step": 282690 + }, + { + "epoch": 1.806089723113093, + "grad_norm": 1.4228217601776123, + "learning_rate": 2.318286828024413e-06, + "loss": 0.7939, + "step": 282700 + }, + { + "epoch": 1.8061536102628317, + "grad_norm": 1.2164430618286133, + "learning_rate": 2.3167769058825283e-06, + "loss": 0.988, + "step": 282710 + }, + { + "epoch": 1.8062174974125704, + "grad_norm": 
1.0811247825622559, + "learning_rate": 2.3152674639519476e-06, + "loss": 0.855, + "step": 282720 + }, + { + "epoch": 1.8062813845623091, + "grad_norm": 0.7520591616630554, + "learning_rate": 2.313758502247887e-06, + "loss": 1.1404, + "step": 282730 + }, + { + "epoch": 1.8063452717120478, + "grad_norm": 0.7815333604812622, + "learning_rate": 2.3122500207855225e-06, + "loss": 0.7324, + "step": 282740 + }, + { + "epoch": 1.8064091588617865, + "grad_norm": 0.845707356929779, + "learning_rate": 2.310742019580059e-06, + "loss": 0.8677, + "step": 282750 + }, + { + "epoch": 1.8064730460115253, + "grad_norm": 1.769837498664856, + "learning_rate": 2.309234498646684e-06, + "loss": 1.0049, + "step": 282760 + }, + { + "epoch": 1.806536933161264, + "grad_norm": 1.8039084672927856, + "learning_rate": 2.307727458000575e-06, + "loss": 0.9322, + "step": 282770 + }, + { + "epoch": 1.8066008203110027, + "grad_norm": 0.7408353090286255, + "learning_rate": 2.306220897656908e-06, + "loss": 1.0801, + "step": 282780 + }, + { + "epoch": 1.8066647074607414, + "grad_norm": 1.0164926052093506, + "learning_rate": 2.304714817630854e-06, + "loss": 0.9326, + "step": 282790 + }, + { + "epoch": 1.80672859461048, + "grad_norm": 0.8177975416183472, + "learning_rate": 2.3032092179375906e-06, + "loss": 0.851, + "step": 282800 + }, + { + "epoch": 1.8067924817602188, + "grad_norm": 1.1540977954864502, + "learning_rate": 2.3017040985922668e-06, + "loss": 0.8106, + "step": 282810 + }, + { + "epoch": 1.8068563689099575, + "grad_norm": 0.9932825565338135, + "learning_rate": 2.300199459610053e-06, + "loss": 0.8606, + "step": 282820 + }, + { + "epoch": 1.8069202560596962, + "grad_norm": 1.113124966621399, + "learning_rate": 2.298695301006093e-06, + "loss": 0.9581, + "step": 282830 + }, + { + "epoch": 1.806984143209435, + "grad_norm": 1.4343130588531494, + "learning_rate": 2.297191622795547e-06, + "loss": 0.9953, + "step": 282840 + }, + { + "epoch": 1.8070480303591736, + "grad_norm": 1.3366652727127075, + 
"learning_rate": 2.295688424993536e-06, + "loss": 0.861, + "step": 282850 + }, + { + "epoch": 1.8071119175089123, + "grad_norm": 0.9450911283493042, + "learning_rate": 2.294185707615226e-06, + "loss": 0.9268, + "step": 282860 + }, + { + "epoch": 1.8071758046586508, + "grad_norm": 0.7475114464759827, + "learning_rate": 2.292683470675733e-06, + "loss": 0.5739, + "step": 282870 + }, + { + "epoch": 1.8072396918083897, + "grad_norm": 1.023122787475586, + "learning_rate": 2.2911817141901883e-06, + "loss": 0.7994, + "step": 282880 + }, + { + "epoch": 1.8073035789581282, + "grad_norm": 1.0646544694900513, + "learning_rate": 2.289680438173719e-06, + "loss": 1.0825, + "step": 282890 + }, + { + "epoch": 1.8073674661078671, + "grad_norm": 1.0106379985809326, + "learning_rate": 2.288179642641447e-06, + "loss": 0.936, + "step": 282900 + }, + { + "epoch": 1.8074313532576056, + "grad_norm": 0.8868458271026611, + "learning_rate": 2.286679327608471e-06, + "loss": 0.7621, + "step": 282910 + }, + { + "epoch": 1.8074952404073445, + "grad_norm": 0.7061320543289185, + "learning_rate": 2.285179493089923e-06, + "loss": 0.9102, + "step": 282920 + }, + { + "epoch": 1.807559127557083, + "grad_norm": 1.1011165380477905, + "learning_rate": 2.2836801391008913e-06, + "loss": 1.0421, + "step": 282930 + }, + { + "epoch": 1.807623014706822, + "grad_norm": 0.8100254535675049, + "learning_rate": 2.2821812656564857e-06, + "loss": 0.7593, + "step": 282940 + }, + { + "epoch": 1.8076869018565604, + "grad_norm": 0.8423644304275513, + "learning_rate": 2.2806828727717946e-06, + "loss": 0.7817, + "step": 282950 + }, + { + "epoch": 1.8077507890062994, + "grad_norm": 1.0324517488479614, + "learning_rate": 2.279184960461911e-06, + "loss": 1.0454, + "step": 282960 + }, + { + "epoch": 1.8078146761560379, + "grad_norm": 1.0457466840744019, + "learning_rate": 2.277687528741912e-06, + "loss": 0.7965, + "step": 282970 + }, + { + "epoch": 1.8078785633057768, + "grad_norm": 0.8995627164840698, + "learning_rate": 
2.276190577626891e-06, + "loss": 0.7965, + "step": 282980 + }, + { + "epoch": 1.8079424504555153, + "grad_norm": 0.7673644423484802, + "learning_rate": 2.2746941071319194e-06, + "loss": 0.7311, + "step": 282990 + }, + { + "epoch": 1.8080063376052542, + "grad_norm": 1.0009177923202515, + "learning_rate": 2.2731981172720627e-06, + "loss": 1.0254, + "step": 283000 + }, + { + "epoch": 1.8080702247549927, + "grad_norm": 1.051266074180603, + "learning_rate": 2.271702608062393e-06, + "loss": 0.7851, + "step": 283010 + }, + { + "epoch": 1.8081341119047316, + "grad_norm": 1.0838903188705444, + "learning_rate": 2.270207579517969e-06, + "loss": 0.9183, + "step": 283020 + }, + { + "epoch": 1.80819799905447, + "grad_norm": 0.7509300708770752, + "learning_rate": 2.2687130316538463e-06, + "loss": 0.7546, + "step": 283030 + }, + { + "epoch": 1.808261886204209, + "grad_norm": 0.8449200987815857, + "learning_rate": 2.267218964485074e-06, + "loss": 0.8132, + "step": 283040 + }, + { + "epoch": 1.8083257733539475, + "grad_norm": 0.5527771711349487, + "learning_rate": 2.2657253780267062e-06, + "loss": 0.9708, + "step": 283050 + }, + { + "epoch": 1.8083896605036864, + "grad_norm": 0.9516294002532959, + "learning_rate": 2.264232272293776e-06, + "loss": 0.9998, + "step": 283060 + }, + { + "epoch": 1.808453547653425, + "grad_norm": 0.8535572290420532, + "learning_rate": 2.262739647301332e-06, + "loss": 0.8959, + "step": 283070 + }, + { + "epoch": 1.8085174348031638, + "grad_norm": 1.2427952289581299, + "learning_rate": 2.2612475030643843e-06, + "loss": 0.8699, + "step": 283080 + }, + { + "epoch": 1.8085813219529023, + "grad_norm": 1.534712791442871, + "learning_rate": 2.259755839597988e-06, + "loss": 1.1143, + "step": 283090 + }, + { + "epoch": 1.8086452091026413, + "grad_norm": 0.5216849446296692, + "learning_rate": 2.2582646569171416e-06, + "loss": 0.8651, + "step": 283100 + }, + { + "epoch": 1.8087090962523797, + "grad_norm": 1.19882071018219, + "learning_rate": 2.2567739550368783e-06, + 
"loss": 0.92, + "step": 283110 + }, + { + "epoch": 1.8087729834021185, + "grad_norm": 0.9051129817962646, + "learning_rate": 2.255283733972202e-06, + "loss": 0.8427, + "step": 283120 + }, + { + "epoch": 1.8088368705518572, + "grad_norm": 1.0221871137619019, + "learning_rate": 2.253793993738129e-06, + "loss": 1.0342, + "step": 283130 + }, + { + "epoch": 1.8089007577015959, + "grad_norm": 0.7139307260513306, + "learning_rate": 2.252304734349647e-06, + "loss": 0.7471, + "step": 283140 + }, + { + "epoch": 1.8089646448513346, + "grad_norm": 1.5615328550338745, + "learning_rate": 2.250815955821778e-06, + "loss": 0.5934, + "step": 283150 + }, + { + "epoch": 1.8090285320010733, + "grad_norm": 0.7617145776748657, + "learning_rate": 2.2493276581694866e-06, + "loss": 0.8384, + "step": 283160 + }, + { + "epoch": 1.809092419150812, + "grad_norm": 3.028179407119751, + "learning_rate": 2.2478398414077895e-06, + "loss": 0.9085, + "step": 283170 + }, + { + "epoch": 1.8091563063005507, + "grad_norm": 0.448412150144577, + "learning_rate": 2.2463525055516465e-06, + "loss": 0.8219, + "step": 283180 + }, + { + "epoch": 1.8092201934502894, + "grad_norm": 1.0630991458892822, + "learning_rate": 2.244865650616046e-06, + "loss": 0.8723, + "step": 283190 + }, + { + "epoch": 1.809284080600028, + "grad_norm": 1.0292770862579346, + "learning_rate": 2.24337927661597e-06, + "loss": 0.9843, + "step": 283200 + }, + { + "epoch": 1.8093479677497668, + "grad_norm": 0.7813323736190796, + "learning_rate": 2.241893383566368e-06, + "loss": 0.9085, + "step": 283210 + }, + { + "epoch": 1.8094118548995055, + "grad_norm": 0.8884089589118958, + "learning_rate": 2.240407971482228e-06, + "loss": 0.9564, + "step": 283220 + }, + { + "epoch": 1.8094757420492442, + "grad_norm": 1.4575201272964478, + "learning_rate": 2.2389230403784933e-06, + "loss": 0.8366, + "step": 283230 + }, + { + "epoch": 1.809539629198983, + "grad_norm": 0.9241499900817871, + "learning_rate": 2.2374385902701357e-06, + "loss": 0.8214, + "step": 
283240 + }, + { + "epoch": 1.8096035163487216, + "grad_norm": 0.8342140316963196, + "learning_rate": 2.2361029964359847e-06, + "loss": 0.9822, + "step": 283250 + }, + { + "epoch": 1.8096674034984603, + "grad_norm": 0.6626742482185364, + "learning_rate": 2.2346194602599955e-06, + "loss": 0.7827, + "step": 283260 + }, + { + "epoch": 1.809731290648199, + "grad_norm": 1.3511712551116943, + "learning_rate": 2.2331364051226987e-06, + "loss": 0.8839, + "step": 283270 + }, + { + "epoch": 1.8097951777979377, + "grad_norm": 1.3152260780334473, + "learning_rate": 2.2316538310390547e-06, + "loss": 0.9771, + "step": 283280 + }, + { + "epoch": 1.8098590649476765, + "grad_norm": 0.9161074161529541, + "learning_rate": 2.2301717380239727e-06, + "loss": 0.6344, + "step": 283290 + }, + { + "epoch": 1.8099229520974152, + "grad_norm": 0.6363855004310608, + "learning_rate": 2.2286901260923866e-06, + "loss": 0.8171, + "step": 283300 + }, + { + "epoch": 1.8099868392471539, + "grad_norm": 0.9332031607627869, + "learning_rate": 2.227208995259228e-06, + "loss": 1.1902, + "step": 283310 + }, + { + "epoch": 1.8100507263968926, + "grad_norm": 0.9157412648200989, + "learning_rate": 2.2257283455393964e-06, + "loss": 0.5489, + "step": 283320 + }, + { + "epoch": 1.8101146135466313, + "grad_norm": 1.413439154624939, + "learning_rate": 2.2242481769478185e-06, + "loss": 1.0107, + "step": 283330 + }, + { + "epoch": 1.81017850069637, + "grad_norm": 1.1347328424453735, + "learning_rate": 2.2227684894993882e-06, + "loss": 0.9759, + "step": 283340 + }, + { + "epoch": 1.8102423878461087, + "grad_norm": 0.6124105453491211, + "learning_rate": 2.221289283209016e-06, + "loss": 0.7626, + "step": 283350 + }, + { + "epoch": 1.8103062749958472, + "grad_norm": 1.255640983581543, + "learning_rate": 2.2198105580915895e-06, + "loss": 0.7451, + "step": 283360 + }, + { + "epoch": 1.810370162145586, + "grad_norm": 1.1673067808151245, + "learning_rate": 2.218332314162014e-06, + "loss": 0.8239, + "step": 283370 + }, + { + 
"epoch": 1.8104340492953246, + "grad_norm": 0.4938371181488037, + "learning_rate": 2.2168545514351656e-06, + "loss": 0.6179, + "step": 283380 + }, + { + "epoch": 1.8104979364450635, + "grad_norm": 1.0467393398284912, + "learning_rate": 2.215377269925939e-06, + "loss": 0.8178, + "step": 283390 + }, + { + "epoch": 1.810561823594802, + "grad_norm": 1.0822023153305054, + "learning_rate": 2.2139004696491885e-06, + "loss": 0.7513, + "step": 283400 + }, + { + "epoch": 1.810625710744541, + "grad_norm": 0.784705400466919, + "learning_rate": 2.2124241506198074e-06, + "loss": 0.8451, + "step": 283410 + }, + { + "epoch": 1.8106895978942794, + "grad_norm": 1.294501781463623, + "learning_rate": 2.2109483128526566e-06, + "loss": 0.8586, + "step": 283420 + }, + { + "epoch": 1.8107534850440183, + "grad_norm": 0.9332607388496399, + "learning_rate": 2.209472956362596e-06, + "loss": 0.8953, + "step": 283430 + }, + { + "epoch": 1.8108173721937568, + "grad_norm": 1.299521803855896, + "learning_rate": 2.2079980811644917e-06, + "loss": 0.9633, + "step": 283440 + }, + { + "epoch": 1.8108812593434958, + "grad_norm": 0.6523328423500061, + "learning_rate": 2.2065236872731876e-06, + "loss": 0.8005, + "step": 283450 + }, + { + "epoch": 1.8109451464932342, + "grad_norm": 1.0625931024551392, + "learning_rate": 2.205049774703544e-06, + "loss": 0.7675, + "step": 283460 + }, + { + "epoch": 1.8110090336429732, + "grad_norm": 0.5946071147918701, + "learning_rate": 2.2035763434703928e-06, + "loss": 0.7786, + "step": 283470 + }, + { + "epoch": 1.8110729207927116, + "grad_norm": 1.482882022857666, + "learning_rate": 2.2021033935885837e-06, + "loss": 0.7202, + "step": 283480 + }, + { + "epoch": 1.8111368079424506, + "grad_norm": 1.312387228012085, + "learning_rate": 2.2006309250729384e-06, + "loss": 1.0036, + "step": 283490 + }, + { + "epoch": 1.811200695092189, + "grad_norm": 1.288197636604309, + "learning_rate": 2.199158937938295e-06, + "loss": 1.034, + "step": 283500 + }, + { + "epoch": 
1.811264582241928, + "grad_norm": 0.9824373126029968, + "learning_rate": 2.1976874321994744e-06, + "loss": 0.9011, + "step": 283510 + }, + { + "epoch": 1.8113284693916665, + "grad_norm": 0.9257283210754395, + "learning_rate": 2.196216407871299e-06, + "loss": 0.8062, + "step": 283520 + }, + { + "epoch": 1.8113923565414054, + "grad_norm": 0.9991766214370728, + "learning_rate": 2.1947458649685727e-06, + "loss": 0.7861, + "step": 283530 + }, + { + "epoch": 1.8114562436911439, + "grad_norm": 0.8082060813903809, + "learning_rate": 2.1932758035061228e-06, + "loss": 0.8023, + "step": 283540 + }, + { + "epoch": 1.8115201308408828, + "grad_norm": 0.8874626159667969, + "learning_rate": 2.1918062234987323e-06, + "loss": 0.9567, + "step": 283550 + }, + { + "epoch": 1.8115840179906213, + "grad_norm": 1.4034126996994019, + "learning_rate": 2.1903371249612227e-06, + "loss": 0.7498, + "step": 283560 + }, + { + "epoch": 1.8116479051403602, + "grad_norm": 1.0518414974212646, + "learning_rate": 2.188868507908376e-06, + "loss": 0.8519, + "step": 283570 + }, + { + "epoch": 1.8117117922900987, + "grad_norm": 1.1383647918701172, + "learning_rate": 2.187400372354986e-06, + "loss": 1.0501, + "step": 283580 + }, + { + "epoch": 1.8117756794398374, + "grad_norm": 1.1427676677703857, + "learning_rate": 2.185932718315836e-06, + "loss": 0.7585, + "step": 283590 + }, + { + "epoch": 1.8118395665895761, + "grad_norm": 1.0937600135803223, + "learning_rate": 2.184465545805714e-06, + "loss": 0.8869, + "step": 283600 + }, + { + "epoch": 1.8119034537393148, + "grad_norm": 1.1126290559768677, + "learning_rate": 2.1829988548393855e-06, + "loss": 0.9393, + "step": 283610 + }, + { + "epoch": 1.8119673408890535, + "grad_norm": 0.9303832054138184, + "learning_rate": 2.181532645431633e-06, + "loss": 0.8961, + "step": 283620 + }, + { + "epoch": 1.8120312280387922, + "grad_norm": 0.836120069026947, + "learning_rate": 2.1800669175972068e-06, + "loss": 0.9489, + "step": 283630 + }, + { + "epoch": 1.812095115188531, 
+ "grad_norm": 2.5739543437957764, + "learning_rate": 2.178601671350877e-06, + "loss": 0.8026, + "step": 283640 + }, + { + "epoch": 1.8121590023382697, + "grad_norm": 1.465082049369812, + "learning_rate": 2.1771369067074056e-06, + "loss": 0.8689, + "step": 283650 + }, + { + "epoch": 1.8122228894880084, + "grad_norm": 1.140472173690796, + "learning_rate": 2.175672623681535e-06, + "loss": 0.8422, + "step": 283660 + }, + { + "epoch": 1.812286776637747, + "grad_norm": 1.1891902685165405, + "learning_rate": 2.1742088222880207e-06, + "loss": 0.8625, + "step": 283670 + }, + { + "epoch": 1.8123506637874858, + "grad_norm": 0.7614114284515381, + "learning_rate": 2.17274550254159e-06, + "loss": 0.8563, + "step": 283680 + }, + { + "epoch": 1.8124145509372245, + "grad_norm": 1.4389500617980957, + "learning_rate": 2.1712826644569972e-06, + "loss": 0.7662, + "step": 283690 + }, + { + "epoch": 1.8124784380869632, + "grad_norm": 1.6076115369796753, + "learning_rate": 2.169820308048959e-06, + "loss": 0.9157, + "step": 283700 + }, + { + "epoch": 1.812542325236702, + "grad_norm": 1.6042400598526, + "learning_rate": 2.168358433332213e-06, + "loss": 0.8883, + "step": 283710 + }, + { + "epoch": 1.8126062123864406, + "grad_norm": 1.0674022436141968, + "learning_rate": 2.166897040321475e-06, + "loss": 0.8062, + "step": 283720 + }, + { + "epoch": 1.8126700995361793, + "grad_norm": 0.7992262840270996, + "learning_rate": 2.1654361290314674e-06, + "loss": 0.9028, + "step": 283730 + }, + { + "epoch": 1.812733986685918, + "grad_norm": 1.30583918094635, + "learning_rate": 2.1639756994768944e-06, + "loss": 0.8859, + "step": 283740 + }, + { + "epoch": 1.8127978738356567, + "grad_norm": 0.9546254873275757, + "learning_rate": 2.1625157516724837e-06, + "loss": 0.8229, + "step": 283750 + }, + { + "epoch": 1.8128617609853954, + "grad_norm": 1.1246157884597778, + "learning_rate": 2.161056285632912e-06, + "loss": 0.7242, + "step": 283760 + }, + { + "epoch": 1.8129256481351341, + "grad_norm": 
0.8130073547363281, + "learning_rate": 2.1595973013728956e-06, + "loss": 0.8151, + "step": 283770 + }, + { + "epoch": 1.8129895352848728, + "grad_norm": 0.9397286772727966, + "learning_rate": 2.1581387989071224e-06, + "loss": 0.8207, + "step": 283780 + }, + { + "epoch": 1.8130534224346115, + "grad_norm": 1.3959110975265503, + "learning_rate": 2.156680778250281e-06, + "loss": 1.1043, + "step": 283790 + }, + { + "epoch": 1.8131173095843502, + "grad_norm": 1.0556739568710327, + "learning_rate": 2.155223239417048e-06, + "loss": 0.8429, + "step": 283800 + }, + { + "epoch": 1.813181196734089, + "grad_norm": 1.2097365856170654, + "learning_rate": 2.153766182422118e-06, + "loss": 0.7923, + "step": 283810 + }, + { + "epoch": 1.8132450838838277, + "grad_norm": 2.9758658409118652, + "learning_rate": 2.1523096072801506e-06, + "loss": 0.9167, + "step": 283820 + }, + { + "epoch": 1.8133089710335664, + "grad_norm": 0.846839964389801, + "learning_rate": 2.1508535140058184e-06, + "loss": 1.0546, + "step": 283830 + }, + { + "epoch": 1.813372858183305, + "grad_norm": 0.9610458016395569, + "learning_rate": 2.1493979026137867e-06, + "loss": 0.7903, + "step": 283840 + }, + { + "epoch": 1.8134367453330436, + "grad_norm": 0.992201030254364, + "learning_rate": 2.147942773118716e-06, + "loss": 1.0594, + "step": 283850 + }, + { + "epoch": 1.8135006324827825, + "grad_norm": 0.9246998429298401, + "learning_rate": 2.146488125535262e-06, + "loss": 0.9725, + "step": 283860 + }, + { + "epoch": 1.813564519632521, + "grad_norm": 0.8264791369438171, + "learning_rate": 2.1450339598780677e-06, + "loss": 0.8062, + "step": 283870 + }, + { + "epoch": 1.81362840678226, + "grad_norm": 1.0013701915740967, + "learning_rate": 2.143580276161794e-06, + "loss": 0.8676, + "step": 283880 + }, + { + "epoch": 1.8136922939319984, + "grad_norm": 0.7805340886116028, + "learning_rate": 2.142127074401057e-06, + "loss": 0.7871, + "step": 283890 + }, + { + "epoch": 1.8137561810817373, + "grad_norm": 0.8282870650291443, + 
"learning_rate": 2.140674354610506e-06, + "loss": 0.6807, + "step": 283900 + }, + { + "epoch": 1.8138200682314758, + "grad_norm": 1.1535768508911133, + "learning_rate": 2.1392221168047677e-06, + "loss": 0.7917, + "step": 283910 + }, + { + "epoch": 1.8138839553812147, + "grad_norm": 1.1144474744796753, + "learning_rate": 2.13777036099847e-06, + "loss": 0.8424, + "step": 283920 + }, + { + "epoch": 1.8139478425309532, + "grad_norm": 1.0237375497817993, + "learning_rate": 2.1363190872062234e-06, + "loss": 0.7886, + "step": 283930 + }, + { + "epoch": 1.8140117296806921, + "grad_norm": 1.0485482215881348, + "learning_rate": 2.1348682954426602e-06, + "loss": 0.8336, + "step": 283940 + }, + { + "epoch": 1.8140756168304306, + "grad_norm": 0.586771547794342, + "learning_rate": 2.1334179857223803e-06, + "loss": 1.0313, + "step": 283950 + }, + { + "epoch": 1.8141395039801695, + "grad_norm": 0.8109305500984192, + "learning_rate": 2.131968158059988e-06, + "loss": 1.0908, + "step": 283960 + }, + { + "epoch": 1.814203391129908, + "grad_norm": 1.0652990341186523, + "learning_rate": 2.1305188124700946e-06, + "loss": 0.8719, + "step": 283970 + }, + { + "epoch": 1.814267278279647, + "grad_norm": 1.1867952346801758, + "learning_rate": 2.1290699489672827e-06, + "loss": 0.8067, + "step": 283980 + }, + { + "epoch": 1.8143311654293854, + "grad_norm": 0.8866208791732788, + "learning_rate": 2.127621567566157e-06, + "loss": 1.0105, + "step": 283990 + }, + { + "epoch": 1.8143950525791244, + "grad_norm": 1.064661979675293, + "learning_rate": 2.126173668281295e-06, + "loss": 0.8171, + "step": 284000 + }, + { + "epoch": 1.8144589397288629, + "grad_norm": 0.8402761816978455, + "learning_rate": 2.124726251127279e-06, + "loss": 0.7702, + "step": 284010 + }, + { + "epoch": 1.8145228268786018, + "grad_norm": 1.0374879837036133, + "learning_rate": 2.123279316118687e-06, + "loss": 0.8017, + "step": 284020 + }, + { + "epoch": 1.8145867140283403, + "grad_norm": 0.9119057655334473, + "learning_rate": 
2.121832863270101e-06, + "loss": 0.9122, + "step": 284030 + }, + { + "epoch": 1.8146506011780792, + "grad_norm": 1.0535051822662354, + "learning_rate": 2.1203868925960655e-06, + "loss": 0.7125, + "step": 284040 + }, + { + "epoch": 1.8147144883278177, + "grad_norm": 0.5721514821052551, + "learning_rate": 2.118941404111169e-06, + "loss": 0.7486, + "step": 284050 + }, + { + "epoch": 1.8147783754775566, + "grad_norm": 1.2209059000015259, + "learning_rate": 2.1174963978299432e-06, + "loss": 0.7943, + "step": 284060 + }, + { + "epoch": 1.814842262627295, + "grad_norm": 0.9881746172904968, + "learning_rate": 2.116051873766961e-06, + "loss": 0.832, + "step": 284070 + }, + { + "epoch": 1.8149061497770338, + "grad_norm": 0.7639790773391724, + "learning_rate": 2.1146078319367547e-06, + "loss": 0.8015, + "step": 284080 + }, + { + "epoch": 1.8149700369267725, + "grad_norm": 0.627166211605072, + "learning_rate": 2.113164272353885e-06, + "loss": 0.6067, + "step": 284090 + }, + { + "epoch": 1.8150339240765112, + "grad_norm": 0.7728124856948853, + "learning_rate": 2.1117211950328674e-06, + "loss": 0.8219, + "step": 284100 + }, + { + "epoch": 1.81509781122625, + "grad_norm": 0.9364749193191528, + "learning_rate": 2.110278599988258e-06, + "loss": 1.1201, + "step": 284110 + }, + { + "epoch": 1.8151616983759886, + "grad_norm": 1.0755594968795776, + "learning_rate": 2.1088364872345667e-06, + "loss": 1.0939, + "step": 284120 + }, + { + "epoch": 1.8152255855257273, + "grad_norm": 2.0930886268615723, + "learning_rate": 2.107394856786327e-06, + "loss": 0.9332, + "step": 284130 + }, + { + "epoch": 1.815289472675466, + "grad_norm": 1.3515701293945312, + "learning_rate": 2.1059537086580485e-06, + "loss": 1.0684, + "step": 284140 + }, + { + "epoch": 1.8153533598252047, + "grad_norm": 1.5052813291549683, + "learning_rate": 2.104513042864248e-06, + "loss": 0.8437, + "step": 284150 + }, + { + "epoch": 1.8154172469749434, + "grad_norm": 1.600309133529663, + "learning_rate": 2.103072859419447e-06, + 
"loss": 0.8595, + "step": 284160 + }, + { + "epoch": 1.8154811341246821, + "grad_norm": 1.3267834186553955, + "learning_rate": 2.1016331583381344e-06, + "loss": 0.8628, + "step": 284170 + }, + { + "epoch": 1.8155450212744209, + "grad_norm": 0.9317293763160706, + "learning_rate": 2.1001939396348147e-06, + "loss": 0.7491, + "step": 284180 + }, + { + "epoch": 1.8156089084241596, + "grad_norm": 2.272695541381836, + "learning_rate": 2.0987552033239765e-06, + "loss": 0.8343, + "step": 284190 + }, + { + "epoch": 1.8156727955738983, + "grad_norm": 0.9980998039245605, + "learning_rate": 2.097316949420125e-06, + "loss": 0.9049, + "step": 284200 + }, + { + "epoch": 1.815736682723637, + "grad_norm": 0.7396201491355896, + "learning_rate": 2.095879177937721e-06, + "loss": 0.8789, + "step": 284210 + }, + { + "epoch": 1.8158005698733757, + "grad_norm": 0.8641042113304138, + "learning_rate": 2.094441888891263e-06, + "loss": 1.0926, + "step": 284220 + }, + { + "epoch": 1.8158644570231144, + "grad_norm": 1.1559175252914429, + "learning_rate": 2.0930050822952186e-06, + "loss": 1.0042, + "step": 284230 + }, + { + "epoch": 1.815928344172853, + "grad_norm": 0.7350589036941528, + "learning_rate": 2.091568758164059e-06, + "loss": 0.8153, + "step": 284240 + }, + { + "epoch": 1.8159922313225918, + "grad_norm": 0.7597360610961914, + "learning_rate": 2.0901329165122496e-06, + "loss": 0.8937, + "step": 284250 + }, + { + "epoch": 1.8160561184723305, + "grad_norm": 1.366105556488037, + "learning_rate": 2.0886975573542465e-06, + "loss": 0.763, + "step": 284260 + }, + { + "epoch": 1.8161200056220692, + "grad_norm": 1.1255083084106445, + "learning_rate": 2.0872626807045104e-06, + "loss": 0.6321, + "step": 284270 + }, + { + "epoch": 1.816183892771808, + "grad_norm": 0.6423645615577698, + "learning_rate": 2.0858282865774905e-06, + "loss": 1.0003, + "step": 284280 + }, + { + "epoch": 1.8162477799215466, + "grad_norm": 0.853395938873291, + "learning_rate": 2.0843943749876248e-06, + "loss": 0.9148, + 
"step": 284290 + }, + { + "epoch": 1.8163116670712853, + "grad_norm": 1.3879306316375732, + "learning_rate": 2.0829609459493692e-06, + "loss": 0.9273, + "step": 284300 + }, + { + "epoch": 1.816375554221024, + "grad_norm": 2.7183949947357178, + "learning_rate": 2.0815279994771454e-06, + "loss": 0.9227, + "step": 284310 + }, + { + "epoch": 1.8164394413707625, + "grad_norm": 0.8922224044799805, + "learning_rate": 2.080095535585397e-06, + "loss": 0.935, + "step": 284320 + }, + { + "epoch": 1.8165033285205014, + "grad_norm": 1.177586555480957, + "learning_rate": 2.078663554288535e-06, + "loss": 0.7243, + "step": 284330 + }, + { + "epoch": 1.81656721567024, + "grad_norm": 1.1185641288757324, + "learning_rate": 2.077232055600997e-06, + "loss": 0.7291, + "step": 284340 + }, + { + "epoch": 1.8166311028199789, + "grad_norm": 0.8241533041000366, + "learning_rate": 2.0758010395371842e-06, + "loss": 0.7383, + "step": 284350 + }, + { + "epoch": 1.8166949899697173, + "grad_norm": 1.203342318534851, + "learning_rate": 2.074370506111517e-06, + "loss": 0.7972, + "step": 284360 + }, + { + "epoch": 1.8167588771194563, + "grad_norm": 0.8048965334892273, + "learning_rate": 2.072940455338407e-06, + "loss": 1.1044, + "step": 284370 + }, + { + "epoch": 1.8168227642691948, + "grad_norm": 0.7029027938842773, + "learning_rate": 2.0715108872322363e-06, + "loss": 1.0691, + "step": 284380 + }, + { + "epoch": 1.8168866514189337, + "grad_norm": 0.8525398373603821, + "learning_rate": 2.0700818018074276e-06, + "loss": 0.8284, + "step": 284390 + }, + { + "epoch": 1.8169505385686722, + "grad_norm": 1.0670232772827148, + "learning_rate": 2.068653199078352e-06, + "loss": 0.9253, + "step": 284400 + }, + { + "epoch": 1.817014425718411, + "grad_norm": 2.1545190811157227, + "learning_rate": 2.0672250790594095e-06, + "loss": 0.7876, + "step": 284410 + }, + { + "epoch": 1.8170783128681496, + "grad_norm": 0.5782620310783386, + "learning_rate": 2.065797441764977e-06, + "loss": 0.7137, + "step": 284420 + }, + { 
+ "epoch": 1.8171422000178885, + "grad_norm": 0.8786792755126953, + "learning_rate": 2.0643702872094327e-06, + "loss": 0.7858, + "step": 284430 + }, + { + "epoch": 1.817206087167627, + "grad_norm": 1.353689432144165, + "learning_rate": 2.062943615407148e-06, + "loss": 1.114, + "step": 284440 + }, + { + "epoch": 1.817269974317366, + "grad_norm": 1.3771668672561646, + "learning_rate": 2.0615174263725e-06, + "loss": 0.7619, + "step": 284450 + }, + { + "epoch": 1.8173338614671044, + "grad_norm": 1.0626250505447388, + "learning_rate": 2.060091720119833e-06, + "loss": 0.6769, + "step": 284460 + }, + { + "epoch": 1.8173977486168433, + "grad_norm": 0.7767465710639954, + "learning_rate": 2.058666496663525e-06, + "loss": 0.8848, + "step": 284470 + }, + { + "epoch": 1.8174616357665818, + "grad_norm": 0.9560141563415527, + "learning_rate": 2.0572417560179192e-06, + "loss": 0.9424, + "step": 284480 + }, + { + "epoch": 1.8175255229163207, + "grad_norm": 0.9091188311576843, + "learning_rate": 2.05581749819736e-06, + "loss": 0.8649, + "step": 284490 + }, + { + "epoch": 1.8175894100660592, + "grad_norm": 0.8125816583633423, + "learning_rate": 2.0543937232162026e-06, + "loss": 0.7962, + "step": 284500 + }, + { + "epoch": 1.8176532972157982, + "grad_norm": 0.9906743168830872, + "learning_rate": 2.052970431088774e-06, + "loss": 0.8921, + "step": 284510 + }, + { + "epoch": 1.8177171843655366, + "grad_norm": 1.846880316734314, + "learning_rate": 2.0515476218294193e-06, + "loss": 0.571, + "step": 284520 + }, + { + "epoch": 1.8177810715152756, + "grad_norm": 0.8297739624977112, + "learning_rate": 2.050125295452454e-06, + "loss": 0.9286, + "step": 284530 + }, + { + "epoch": 1.817844958665014, + "grad_norm": 0.6898651123046875, + "learning_rate": 2.048703451972217e-06, + "loss": 0.6276, + "step": 284540 + }, + { + "epoch": 1.817908845814753, + "grad_norm": 0.741755485534668, + "learning_rate": 2.0472820914030134e-06, + "loss": 0.9608, + "step": 284550 + }, + { + "epoch": 1.8179727329644915, 
+ "grad_norm": 1.0275651216506958, + "learning_rate": 2.04586121375917e-06, + "loss": 1.0575, + "step": 284560 + }, + { + "epoch": 1.8180366201142302, + "grad_norm": 1.8861565589904785, + "learning_rate": 2.0444408190549824e-06, + "loss": 0.8141, + "step": 284570 + }, + { + "epoch": 1.8181005072639689, + "grad_norm": 1.1138237714767456, + "learning_rate": 2.0430209073047767e-06, + "loss": 1.0783, + "step": 284580 + }, + { + "epoch": 1.8181643944137076, + "grad_norm": 0.9914634823799133, + "learning_rate": 2.0416014785228253e-06, + "loss": 0.7674, + "step": 284590 + }, + { + "epoch": 1.8182282815634463, + "grad_norm": 0.7898954153060913, + "learning_rate": 2.0401825327234446e-06, + "loss": 0.851, + "step": 284600 + }, + { + "epoch": 1.818292168713185, + "grad_norm": 1.3129220008850098, + "learning_rate": 2.038764069920912e-06, + "loss": 1.0338, + "step": 284610 + }, + { + "epoch": 1.8183560558629237, + "grad_norm": 0.6844892501831055, + "learning_rate": 2.037346090129527e-06, + "loss": 0.787, + "step": 284620 + }, + { + "epoch": 1.8184199430126624, + "grad_norm": 0.5973532795906067, + "learning_rate": 2.035928593363551e-06, + "loss": 0.639, + "step": 284630 + }, + { + "epoch": 1.8184838301624011, + "grad_norm": 0.8840516209602356, + "learning_rate": 2.0345115796372715e-06, + "loss": 0.8543, + "step": 284640 + }, + { + "epoch": 1.8185477173121398, + "grad_norm": 2.3488833904266357, + "learning_rate": 2.033095048964956e-06, + "loss": 0.8962, + "step": 284650 + }, + { + "epoch": 1.8186116044618785, + "grad_norm": 1.3273308277130127, + "learning_rate": 2.0316790013608753e-06, + "loss": 0.9147, + "step": 284660 + }, + { + "epoch": 1.8186754916116172, + "grad_norm": 0.9628444910049438, + "learning_rate": 2.0302634368392803e-06, + "loss": 1.0421, + "step": 284670 + }, + { + "epoch": 1.818739378761356, + "grad_norm": 0.9748392105102539, + "learning_rate": 2.0288483554144254e-06, + "loss": 0.7174, + "step": 284680 + }, + { + "epoch": 1.8188032659110946, + "grad_norm": 
1.3948633670806885, + "learning_rate": 2.027433757100583e-06, + "loss": 0.7698, + "step": 284690 + }, + { + "epoch": 1.8188671530608334, + "grad_norm": 1.6660586595535278, + "learning_rate": 2.0260196419119747e-06, + "loss": 0.8549, + "step": 284700 + }, + { + "epoch": 1.818931040210572, + "grad_norm": 0.9206308722496033, + "learning_rate": 2.0246060098628616e-06, + "loss": 0.698, + "step": 284710 + }, + { + "epoch": 1.8189949273603108, + "grad_norm": 1.0280396938323975, + "learning_rate": 2.02319286096746e-06, + "loss": 1.1205, + "step": 284720 + }, + { + "epoch": 1.8190588145100495, + "grad_norm": 0.9921185374259949, + "learning_rate": 2.0217801952400196e-06, + "loss": 0.618, + "step": 284730 + }, + { + "epoch": 1.8191227016597882, + "grad_norm": 0.6777898073196411, + "learning_rate": 2.0203680126947565e-06, + "loss": 1.006, + "step": 284740 + }, + { + "epoch": 1.8191865888095269, + "grad_norm": 0.919839084148407, + "learning_rate": 2.018956313345899e-06, + "loss": 0.895, + "step": 284750 + }, + { + "epoch": 1.8192504759592656, + "grad_norm": 1.8676611185073853, + "learning_rate": 2.0175450972076516e-06, + "loss": 1.0452, + "step": 284760 + }, + { + "epoch": 1.8193143631090043, + "grad_norm": 0.9780756235122681, + "learning_rate": 2.0161343642942476e-06, + "loss": 0.8915, + "step": 284770 + }, + { + "epoch": 1.819378250258743, + "grad_norm": 3.6714227199554443, + "learning_rate": 2.014724114619876e-06, + "loss": 1.0235, + "step": 284780 + }, + { + "epoch": 1.8194421374084817, + "grad_norm": 2.075040340423584, + "learning_rate": 2.013314348198747e-06, + "loss": 0.8235, + "step": 284790 + }, + { + "epoch": 1.8195060245582204, + "grad_norm": 0.809281587600708, + "learning_rate": 2.0119050650450556e-06, + "loss": 0.7084, + "step": 284800 + }, + { + "epoch": 1.819569911707959, + "grad_norm": 1.4547041654586792, + "learning_rate": 2.010496265173001e-06, + "loss": 1.0448, + "step": 284810 + }, + { + "epoch": 1.8196337988576978, + "grad_norm": 1.625180959701538, + 
"learning_rate": 2.009087948596761e-06, + "loss": 0.9319, + "step": 284820 + }, + { + "epoch": 1.8196976860074363, + "grad_norm": 1.166869044303894, + "learning_rate": 2.007680115330529e-06, + "loss": 1.0157, + "step": 284830 + }, + { + "epoch": 1.8197615731571752, + "grad_norm": 0.996919572353363, + "learning_rate": 2.006272765388467e-06, + "loss": 1.0949, + "step": 284840 + }, + { + "epoch": 1.8198254603069137, + "grad_norm": 1.169065237045288, + "learning_rate": 2.004865898784769e-06, + "loss": 0.8891, + "step": 284850 + }, + { + "epoch": 1.8198893474566527, + "grad_norm": 0.8748899102210999, + "learning_rate": 2.00345951553359e-06, + "loss": 0.9441, + "step": 284860 + }, + { + "epoch": 1.8199532346063911, + "grad_norm": 1.0787811279296875, + "learning_rate": 2.0020536156490964e-06, + "loss": 0.7037, + "step": 284870 + }, + { + "epoch": 1.82001712175613, + "grad_norm": 0.8389084935188293, + "learning_rate": 2.0006481991454495e-06, + "loss": 0.8076, + "step": 284880 + }, + { + "epoch": 1.8200810089058685, + "grad_norm": 1.2037585973739624, + "learning_rate": 1.9992432660367933e-06, + "loss": 0.7315, + "step": 284890 + }, + { + "epoch": 1.8201448960556075, + "grad_norm": 0.9362344741821289, + "learning_rate": 1.9978388163372885e-06, + "loss": 0.9803, + "step": 284900 + }, + { + "epoch": 1.820208783205346, + "grad_norm": 0.7869431376457214, + "learning_rate": 1.996434850061074e-06, + "loss": 0.9087, + "step": 284910 + }, + { + "epoch": 1.8202726703550849, + "grad_norm": 1.0346866846084595, + "learning_rate": 1.9950313672222944e-06, + "loss": 0.6123, + "step": 284920 + }, + { + "epoch": 1.8203365575048234, + "grad_norm": 1.109389305114746, + "learning_rate": 1.993628367835071e-06, + "loss": 0.7859, + "step": 284930 + }, + { + "epoch": 1.8204004446545623, + "grad_norm": 1.341280221939087, + "learning_rate": 1.992225851913543e-06, + "loss": 0.7019, + "step": 284940 + }, + { + "epoch": 1.8204643318043008, + "grad_norm": 0.7633939385414124, + "learning_rate": 
1.9908238194718323e-06, + "loss": 0.8567, + "step": 284950 + }, + { + "epoch": 1.8205282189540397, + "grad_norm": 0.699362576007843, + "learning_rate": 1.989422270524066e-06, + "loss": 0.8734, + "step": 284960 + }, + { + "epoch": 1.8205921061037782, + "grad_norm": 0.8467990756034851, + "learning_rate": 1.988021205084345e-06, + "loss": 0.9091, + "step": 284970 + }, + { + "epoch": 1.8206559932535171, + "grad_norm": 1.1297391653060913, + "learning_rate": 1.986620623166796e-06, + "loss": 0.989, + "step": 284980 + }, + { + "epoch": 1.8207198804032556, + "grad_norm": 0.8497728705406189, + "learning_rate": 1.9852205247855083e-06, + "loss": 0.5939, + "step": 284990 + }, + { + "epoch": 1.8207837675529945, + "grad_norm": 1.0683796405792236, + "learning_rate": 1.983820909954587e-06, + "loss": 0.7312, + "step": 285000 + }, + { + "epoch": 1.820847654702733, + "grad_norm": 1.2456072568893433, + "learning_rate": 1.9824217786881316e-06, + "loss": 0.7684, + "step": 285010 + }, + { + "epoch": 1.820911541852472, + "grad_norm": 1.282422661781311, + "learning_rate": 1.9810231310002256e-06, + "loss": 0.7891, + "step": 285020 + }, + { + "epoch": 1.8209754290022104, + "grad_norm": 1.13451087474823, + "learning_rate": 1.9796249669049694e-06, + "loss": 1.3058, + "step": 285030 + }, + { + "epoch": 1.8210393161519494, + "grad_norm": 0.85088711977005, + "learning_rate": 1.978227286416423e-06, + "loss": 0.7471, + "step": 285040 + }, + { + "epoch": 1.8211032033016878, + "grad_norm": 0.8464387655258179, + "learning_rate": 1.9768300895486813e-06, + "loss": 0.9672, + "step": 285050 + }, + { + "epoch": 1.8211670904514266, + "grad_norm": 0.8177611827850342, + "learning_rate": 1.9754333763157994e-06, + "loss": 0.9349, + "step": 285060 + }, + { + "epoch": 1.8212309776011653, + "grad_norm": 1.3880621194839478, + "learning_rate": 1.9740371467318555e-06, + "loss": 0.9024, + "step": 285070 + }, + { + "epoch": 1.821294864750904, + "grad_norm": 1.3378520011901855, + "learning_rate": 1.972641400810904e-06, + 
"loss": 0.7301, + "step": 285080 + }, + { + "epoch": 1.8213587519006427, + "grad_norm": 0.9654492735862732, + "learning_rate": 1.9712461385670065e-06, + "loss": 0.9758, + "step": 285090 + }, + { + "epoch": 1.8214226390503814, + "grad_norm": 0.9789209961891174, + "learning_rate": 1.969851360014202e-06, + "loss": 0.8543, + "step": 285100 + }, + { + "epoch": 1.82148652620012, + "grad_norm": 0.5984829068183899, + "learning_rate": 1.9684570651665566e-06, + "loss": 0.7107, + "step": 285110 + }, + { + "epoch": 1.8215504133498588, + "grad_norm": 0.8956750631332397, + "learning_rate": 1.967063254038093e-06, + "loss": 0.8092, + "step": 285120 + }, + { + "epoch": 1.8216143004995975, + "grad_norm": 1.5672224760055542, + "learning_rate": 1.965669926642866e-06, + "loss": 1.1351, + "step": 285130 + }, + { + "epoch": 1.8216781876493362, + "grad_norm": 1.1448312997817993, + "learning_rate": 1.9642770829948864e-06, + "loss": 0.9314, + "step": 285140 + }, + { + "epoch": 1.821742074799075, + "grad_norm": 1.4996016025543213, + "learning_rate": 1.9628847231082047e-06, + "loss": 0.9699, + "step": 285150 + }, + { + "epoch": 1.8218059619488136, + "grad_norm": 0.7268596291542053, + "learning_rate": 1.9614928469968264e-06, + "loss": 0.899, + "step": 285160 + }, + { + "epoch": 1.8218698490985523, + "grad_norm": 0.6328108906745911, + "learning_rate": 1.9601014546747787e-06, + "loss": 1.0637, + "step": 285170 + }, + { + "epoch": 1.821933736248291, + "grad_norm": 2.3796627521514893, + "learning_rate": 1.958710546156062e-06, + "loss": 0.787, + "step": 285180 + }, + { + "epoch": 1.8219976233980297, + "grad_norm": 1.022774338722229, + "learning_rate": 1.957320121454698e-06, + "loss": 0.9771, + "step": 285190 + }, + { + "epoch": 1.8220615105477684, + "grad_norm": 0.7801091074943542, + "learning_rate": 1.955930180584681e-06, + "loss": 0.7787, + "step": 285200 + }, + { + "epoch": 1.8221253976975071, + "grad_norm": 1.1597459316253662, + "learning_rate": 1.9545407235600054e-06, + "loss": 0.7169, + 
"step": 285210 + }, + { + "epoch": 1.8221892848472458, + "grad_norm": 0.8490064144134521, + "learning_rate": 1.953151750394683e-06, + "loss": 0.8965, + "step": 285220 + }, + { + "epoch": 1.8222531719969846, + "grad_norm": 0.5219688415527344, + "learning_rate": 1.951763261102679e-06, + "loss": 0.7575, + "step": 285230 + }, + { + "epoch": 1.8223170591467233, + "grad_norm": 1.8479723930358887, + "learning_rate": 1.950375255697989e-06, + "loss": 0.7659, + "step": 285240 + }, + { + "epoch": 1.822380946296462, + "grad_norm": 1.2113089561462402, + "learning_rate": 1.9489877341945906e-06, + "loss": 0.6914, + "step": 285250 + }, + { + "epoch": 1.8224448334462007, + "grad_norm": 0.5127272605895996, + "learning_rate": 1.9476006966064555e-06, + "loss": 0.7166, + "step": 285260 + }, + { + "epoch": 1.8225087205959394, + "grad_norm": 0.689415454864502, + "learning_rate": 1.94621414294755e-06, + "loss": 0.7155, + "step": 285270 + }, + { + "epoch": 1.822572607745678, + "grad_norm": 1.3685857057571411, + "learning_rate": 1.944828073231847e-06, + "loss": 1.0151, + "step": 285280 + }, + { + "epoch": 1.8226364948954168, + "grad_norm": 1.7720893621444702, + "learning_rate": 1.94344248747329e-06, + "loss": 0.9299, + "step": 285290 + }, + { + "epoch": 1.8227003820451553, + "grad_norm": 0.789768397808075, + "learning_rate": 1.9420573856858526e-06, + "loss": 0.8446, + "step": 285300 + }, + { + "epoch": 1.8227642691948942, + "grad_norm": 0.7223408818244934, + "learning_rate": 1.940672767883461e-06, + "loss": 0.6014, + "step": 285310 + }, + { + "epoch": 1.8228281563446327, + "grad_norm": 1.5340150594711304, + "learning_rate": 1.939288634080083e-06, + "loss": 0.6325, + "step": 285320 + }, + { + "epoch": 1.8228920434943716, + "grad_norm": 0.9605322480201721, + "learning_rate": 1.937904984289646e-06, + "loss": 0.9184, + "step": 285330 + }, + { + "epoch": 1.82295593064411, + "grad_norm": 0.9917205572128296, + "learning_rate": 1.936521818526077e-06, + "loss": 0.9888, + "step": 285340 + }, + { + 
"epoch": 1.823019817793849, + "grad_norm": 0.8223958611488342, + "learning_rate": 1.935139136803321e-06, + "loss": 0.7055, + "step": 285350 + }, + { + "epoch": 1.8230837049435875, + "grad_norm": 0.7280120849609375, + "learning_rate": 1.9337569391352896e-06, + "loss": 1.1515, + "step": 285360 + }, + { + "epoch": 1.8231475920933264, + "grad_norm": 0.5626384615898132, + "learning_rate": 1.932375225535915e-06, + "loss": 0.7377, + "step": 285370 + }, + { + "epoch": 1.823211479243065, + "grad_norm": 0.7775994539260864, + "learning_rate": 1.9309939960191036e-06, + "loss": 0.9574, + "step": 285380 + }, + { + "epoch": 1.8232753663928039, + "grad_norm": 0.4441358149051666, + "learning_rate": 1.929613250598772e-06, + "loss": 0.6747, + "step": 285390 + }, + { + "epoch": 1.8233392535425423, + "grad_norm": 1.1189658641815186, + "learning_rate": 1.928232989288814e-06, + "loss": 0.885, + "step": 285400 + }, + { + "epoch": 1.8234031406922813, + "grad_norm": 2.306811809539795, + "learning_rate": 1.926853212103147e-06, + "loss": 0.9191, + "step": 285410 + }, + { + "epoch": 1.8234670278420197, + "grad_norm": 0.9467042088508606, + "learning_rate": 1.925473919055648e-06, + "loss": 1.0129, + "step": 285420 + }, + { + "epoch": 1.8235309149917587, + "grad_norm": 1.0102781057357788, + "learning_rate": 1.9240951101602287e-06, + "loss": 0.918, + "step": 285430 + }, + { + "epoch": 1.8235948021414972, + "grad_norm": 0.7928588390350342, + "learning_rate": 1.9227167854307506e-06, + "loss": 0.7995, + "step": 285440 + }, + { + "epoch": 1.823658689291236, + "grad_norm": 2.1351563930511475, + "learning_rate": 1.921338944881118e-06, + "loss": 1.0241, + "step": 285450 + }, + { + "epoch": 1.8237225764409746, + "grad_norm": 0.6350265741348267, + "learning_rate": 1.9199615885251878e-06, + "loss": 0.7882, + "step": 285460 + }, + { + "epoch": 1.8237864635907135, + "grad_norm": 1.3224437236785889, + "learning_rate": 1.918584716376842e-06, + "loss": 0.9932, + "step": 285470 + }, + { + "epoch": 
1.823850350740452, + "grad_norm": 0.47573602199554443, + "learning_rate": 1.9172083284499432e-06, + "loss": 0.8303, + "step": 285480 + }, + { + "epoch": 1.823914237890191, + "grad_norm": 1.2311193943023682, + "learning_rate": 1.9158324247583624e-06, + "loss": 0.9476, + "step": 285490 + }, + { + "epoch": 1.8239781250399294, + "grad_norm": 1.1468632221221924, + "learning_rate": 1.9144570053159393e-06, + "loss": 0.8672, + "step": 285500 + }, + { + "epoch": 1.8240420121896683, + "grad_norm": 0.8073374629020691, + "learning_rate": 1.9130820701365404e-06, + "loss": 1.0607, + "step": 285510 + }, + { + "epoch": 1.8241058993394068, + "grad_norm": 1.221170425415039, + "learning_rate": 1.911707619233999e-06, + "loss": 0.7194, + "step": 285520 + }, + { + "epoch": 1.8241697864891457, + "grad_norm": 0.8472967743873596, + "learning_rate": 1.9103336526221595e-06, + "loss": 0.7551, + "step": 285530 + }, + { + "epoch": 1.8242336736388842, + "grad_norm": 0.6870689988136292, + "learning_rate": 1.9089601703148773e-06, + "loss": 0.7329, + "step": 285540 + }, + { + "epoch": 1.824297560788623, + "grad_norm": 0.8167701363563538, + "learning_rate": 1.907587172325959e-06, + "loss": 0.7073, + "step": 285550 + }, + { + "epoch": 1.8243614479383616, + "grad_norm": 0.9098202586174011, + "learning_rate": 1.906214658669253e-06, + "loss": 0.8432, + "step": 285560 + }, + { + "epoch": 1.8244253350881003, + "grad_norm": 5.167019367218018, + "learning_rate": 1.9048426293585663e-06, + "loss": 0.7477, + "step": 285570 + }, + { + "epoch": 1.824489222237839, + "grad_norm": 0.9570047855377197, + "learning_rate": 1.903471084407732e-06, + "loss": 0.9087, + "step": 285580 + }, + { + "epoch": 1.8245531093875778, + "grad_norm": 1.1526376008987427, + "learning_rate": 1.9021000238305441e-06, + "loss": 0.8563, + "step": 285590 + }, + { + "epoch": 1.8246169965373165, + "grad_norm": 0.6891779899597168, + "learning_rate": 1.9007294476408256e-06, + "loss": 0.6378, + "step": 285600 + }, + { + "epoch": 1.8246808836870552, 
+ "grad_norm": 0.8805965781211853, + "learning_rate": 1.8993593558523648e-06, + "loss": 0.6564, + "step": 285610 + }, + { + "epoch": 1.8247447708367939, + "grad_norm": 1.0953179597854614, + "learning_rate": 1.8979897484789789e-06, + "loss": 0.9682, + "step": 285620 + }, + { + "epoch": 1.8248086579865326, + "grad_norm": 0.8742592930793762, + "learning_rate": 1.8966206255344398e-06, + "loss": 1.0923, + "step": 285630 + }, + { + "epoch": 1.8248725451362713, + "grad_norm": 1.3812202215194702, + "learning_rate": 1.8952519870325535e-06, + "loss": 0.9457, + "step": 285640 + }, + { + "epoch": 1.82493643228601, + "grad_norm": 1.8078464269638062, + "learning_rate": 1.8938838329870922e-06, + "loss": 1.048, + "step": 285650 + }, + { + "epoch": 1.8250003194357487, + "grad_norm": 1.5746678113937378, + "learning_rate": 1.892516163411845e-06, + "loss": 1.0377, + "step": 285660 + }, + { + "epoch": 1.8250642065854874, + "grad_norm": 1.2783678770065308, + "learning_rate": 1.8911489783205672e-06, + "loss": 1.2767, + "step": 285670 + }, + { + "epoch": 1.825128093735226, + "grad_norm": 1.2706398963928223, + "learning_rate": 1.8897822777270536e-06, + "loss": 0.8448, + "step": 285680 + }, + { + "epoch": 1.8251919808849648, + "grad_norm": 1.1190025806427002, + "learning_rate": 1.888416061645043e-06, + "loss": 0.9448, + "step": 285690 + }, + { + "epoch": 1.8252558680347035, + "grad_norm": 1.6733462810516357, + "learning_rate": 1.887050330088308e-06, + "loss": 1.0124, + "step": 285700 + }, + { + "epoch": 1.8253197551844422, + "grad_norm": 1.2430444955825806, + "learning_rate": 1.8856850830705985e-06, + "loss": 0.9413, + "step": 285710 + }, + { + "epoch": 1.825383642334181, + "grad_norm": 1.307869553565979, + "learning_rate": 1.8843203206056704e-06, + "loss": 0.9672, + "step": 285720 + }, + { + "epoch": 1.8254475294839196, + "grad_norm": 1.8900365829467773, + "learning_rate": 1.8829560427072569e-06, + "loss": 1.2764, + "step": 285730 + }, + { + "epoch": 1.8255114166336583, + "grad_norm": 
0.8162425756454468, + "learning_rate": 1.8815922493891024e-06, + "loss": 0.7358, + "step": 285740 + }, + { + "epoch": 1.825575303783397, + "grad_norm": 1.0894966125488281, + "learning_rate": 1.8802289406649464e-06, + "loss": 0.9248, + "step": 285750 + }, + { + "epoch": 1.8256391909331358, + "grad_norm": 0.7507548332214355, + "learning_rate": 1.878866116548511e-06, + "loss": 0.9988, + "step": 285760 + }, + { + "epoch": 1.8257030780828745, + "grad_norm": 1.6045938730239868, + "learning_rate": 1.8775037770535298e-06, + "loss": 0.7603, + "step": 285770 + }, + { + "epoch": 1.8257669652326132, + "grad_norm": 0.7645987272262573, + "learning_rate": 1.8761419221937138e-06, + "loss": 1.0192, + "step": 285780 + }, + { + "epoch": 1.8258308523823517, + "grad_norm": 1.4978324174880981, + "learning_rate": 1.8747805519827855e-06, + "loss": 0.7535, + "step": 285790 + }, + { + "epoch": 1.8258947395320906, + "grad_norm": 0.6685463786125183, + "learning_rate": 1.873419666434445e-06, + "loss": 0.8815, + "step": 285800 + }, + { + "epoch": 1.825958626681829, + "grad_norm": 1.0744636058807373, + "learning_rate": 1.872059265562409e-06, + "loss": 0.7381, + "step": 285810 + }, + { + "epoch": 1.826022513831568, + "grad_norm": 0.9824548959732056, + "learning_rate": 1.8706993493803726e-06, + "loss": 1.048, + "step": 285820 + }, + { + "epoch": 1.8260864009813065, + "grad_norm": 1.2852534055709839, + "learning_rate": 1.8693399179020353e-06, + "loss": 0.6329, + "step": 285830 + }, + { + "epoch": 1.8261502881310454, + "grad_norm": 0.7904815077781677, + "learning_rate": 1.8679809711410757e-06, + "loss": 0.7611, + "step": 285840 + }, + { + "epoch": 1.826214175280784, + "grad_norm": 1.1488205194473267, + "learning_rate": 1.8666225091111878e-06, + "loss": 0.802, + "step": 285850 + }, + { + "epoch": 1.8262780624305228, + "grad_norm": 0.7961608171463013, + "learning_rate": 1.8652645318260608e-06, + "loss": 0.9429, + "step": 285860 + }, + { + "epoch": 1.8263419495802613, + "grad_norm": 1.1986531019210815, 
+ "learning_rate": 1.8639070392993506e-06, + "loss": 0.8178, + "step": 285870 + }, + { + "epoch": 1.8264058367300002, + "grad_norm": 1.0353028774261475, + "learning_rate": 1.862550031544752e-06, + "loss": 0.7877, + "step": 285880 + }, + { + "epoch": 1.8264697238797387, + "grad_norm": 0.8865573406219482, + "learning_rate": 1.8611935085759092e-06, + "loss": 0.7353, + "step": 285890 + }, + { + "epoch": 1.8265336110294776, + "grad_norm": 0.6933355331420898, + "learning_rate": 1.8598374704065002e-06, + "loss": 0.9469, + "step": 285900 + }, + { + "epoch": 1.8265974981792161, + "grad_norm": 0.6500905752182007, + "learning_rate": 1.8584819170501755e-06, + "loss": 0.8656, + "step": 285910 + }, + { + "epoch": 1.826661385328955, + "grad_norm": 0.8234435319900513, + "learning_rate": 1.8571268485205851e-06, + "loss": 0.937, + "step": 285920 + }, + { + "epoch": 1.8267252724786935, + "grad_norm": 2.764732837677002, + "learning_rate": 1.8557722648313735e-06, + "loss": 0.5897, + "step": 285930 + }, + { + "epoch": 1.8267891596284325, + "grad_norm": 1.0919827222824097, + "learning_rate": 1.8544181659961911e-06, + "loss": 0.6466, + "step": 285940 + }, + { + "epoch": 1.826853046778171, + "grad_norm": 1.1406519412994385, + "learning_rate": 1.853064552028666e-06, + "loss": 0.9709, + "step": 285950 + }, + { + "epoch": 1.8269169339279099, + "grad_norm": 2.4636154174804688, + "learning_rate": 1.851711422942437e-06, + "loss": 1.1361, + "step": 285960 + }, + { + "epoch": 1.8269808210776484, + "grad_norm": 0.872745931148529, + "learning_rate": 1.8503587787511212e-06, + "loss": 0.8822, + "step": 285970 + }, + { + "epoch": 1.8270447082273873, + "grad_norm": 1.3167917728424072, + "learning_rate": 1.8490066194683575e-06, + "loss": 0.8776, + "step": 285980 + }, + { + "epoch": 1.8271085953771258, + "grad_norm": 0.6231662631034851, + "learning_rate": 1.8476549451077462e-06, + "loss": 0.9667, + "step": 285990 + }, + { + "epoch": 1.8271724825268647, + "grad_norm": 1.095565915107727, + "learning_rate": 
1.84630375568291e-06, + "loss": 0.9367, + "step": 286000 + }, + { + "epoch": 1.8272363696766032, + "grad_norm": 0.926649808883667, + "learning_rate": 1.8449530512074542e-06, + "loss": 1.0359, + "step": 286010 + }, + { + "epoch": 1.827300256826342, + "grad_norm": 5.4127068519592285, + "learning_rate": 1.8436028316949793e-06, + "loss": 0.9705, + "step": 286020 + }, + { + "epoch": 1.8273641439760806, + "grad_norm": 1.2995518445968628, + "learning_rate": 1.8422530971590856e-06, + "loss": 0.8106, + "step": 286030 + }, + { + "epoch": 1.8274280311258193, + "grad_norm": 1.374437928199768, + "learning_rate": 1.8409038476133678e-06, + "loss": 0.8928, + "step": 286040 + }, + { + "epoch": 1.827491918275558, + "grad_norm": 1.131495714187622, + "learning_rate": 1.8395550830714093e-06, + "loss": 0.8288, + "step": 286050 + }, + { + "epoch": 1.8275558054252967, + "grad_norm": 1.104823112487793, + "learning_rate": 1.8382068035467937e-06, + "loss": 1.0923, + "step": 286060 + }, + { + "epoch": 1.8276196925750354, + "grad_norm": 1.117811679840088, + "learning_rate": 1.8368590090531102e-06, + "loss": 0.8615, + "step": 286070 + }, + { + "epoch": 1.8276835797247741, + "grad_norm": 0.7175796031951904, + "learning_rate": 1.8355116996039146e-06, + "loss": 0.7234, + "step": 286080 + }, + { + "epoch": 1.8277474668745128, + "grad_norm": 1.426501750946045, + "learning_rate": 1.8341648752127905e-06, + "loss": 0.8822, + "step": 286090 + }, + { + "epoch": 1.8278113540242515, + "grad_norm": 1.320160150527954, + "learning_rate": 1.8328185358932882e-06, + "loss": 0.8113, + "step": 286100 + }, + { + "epoch": 1.8278752411739903, + "grad_norm": 1.6251180171966553, + "learning_rate": 1.83147268165898e-06, + "loss": 0.8898, + "step": 286110 + }, + { + "epoch": 1.827939128323729, + "grad_norm": 0.7918174862861633, + "learning_rate": 1.830127312523411e-06, + "loss": 0.8613, + "step": 286120 + }, + { + "epoch": 1.8280030154734677, + "grad_norm": 0.9489846229553223, + "learning_rate": 1.8287824285001365e-06, + 
"loss": 0.9402, + "step": 286130 + }, + { + "epoch": 1.8280669026232064, + "grad_norm": 1.3466228246688843, + "learning_rate": 1.8274380296026905e-06, + "loss": 0.8081, + "step": 286140 + }, + { + "epoch": 1.828130789772945, + "grad_norm": 0.9148550033569336, + "learning_rate": 1.8260941158446288e-06, + "loss": 0.7575, + "step": 286150 + }, + { + "epoch": 1.8281946769226838, + "grad_norm": 0.531804621219635, + "learning_rate": 1.8247506872394681e-06, + "loss": 0.8309, + "step": 286160 + }, + { + "epoch": 1.8282585640724225, + "grad_norm": 0.610683798789978, + "learning_rate": 1.823407743800748e-06, + "loss": 0.6663, + "step": 286170 + }, + { + "epoch": 1.8283224512221612, + "grad_norm": 0.8566081523895264, + "learning_rate": 1.822065285541985e-06, + "loss": 1.0289, + "step": 286180 + }, + { + "epoch": 1.8283863383719, + "grad_norm": 0.7995787858963013, + "learning_rate": 1.8207233124767132e-06, + "loss": 0.8781, + "step": 286190 + }, + { + "epoch": 1.8284502255216386, + "grad_norm": 1.0090097188949585, + "learning_rate": 1.8193818246184323e-06, + "loss": 0.7488, + "step": 286200 + }, + { + "epoch": 1.8285141126713773, + "grad_norm": 1.12493896484375, + "learning_rate": 1.8180408219806655e-06, + "loss": 0.9024, + "step": 286210 + }, + { + "epoch": 1.828577999821116, + "grad_norm": 0.7762433290481567, + "learning_rate": 1.8167003045769016e-06, + "loss": 0.9644, + "step": 286220 + }, + { + "epoch": 1.8286418869708547, + "grad_norm": 1.022645115852356, + "learning_rate": 1.8153602724206576e-06, + "loss": 0.9341, + "step": 286230 + }, + { + "epoch": 1.8287057741205934, + "grad_norm": 1.4305933713912964, + "learning_rate": 1.814020725525417e-06, + "loss": 1.0233, + "step": 286240 + }, + { + "epoch": 1.8287696612703321, + "grad_norm": 0.9852200746536255, + "learning_rate": 1.8126816639046751e-06, + "loss": 0.8914, + "step": 286250 + }, + { + "epoch": 1.8288335484200706, + "grad_norm": 1.5261846780776978, + "learning_rate": 1.811343087571915e-06, + "loss": 0.9038, + 
"step": 286260 + }, + { + "epoch": 1.8288974355698095, + "grad_norm": 1.0979644060134888, + "learning_rate": 1.8100049965406206e-06, + "loss": 0.8995, + "step": 286270 + }, + { + "epoch": 1.828961322719548, + "grad_norm": 1.460039496421814, + "learning_rate": 1.8086673908242702e-06, + "loss": 0.8506, + "step": 286280 + }, + { + "epoch": 1.829025209869287, + "grad_norm": 1.119490623474121, + "learning_rate": 1.8073302704363248e-06, + "loss": 0.8478, + "step": 286290 + }, + { + "epoch": 1.8290890970190254, + "grad_norm": 0.6196275353431702, + "learning_rate": 1.8059936353902684e-06, + "loss": 0.7895, + "step": 286300 + }, + { + "epoch": 1.8291529841687644, + "grad_norm": 0.7568504810333252, + "learning_rate": 1.8046574856995345e-06, + "loss": 0.7273, + "step": 286310 + }, + { + "epoch": 1.8292168713185029, + "grad_norm": 1.0650426149368286, + "learning_rate": 1.803321821377607e-06, + "loss": 0.9648, + "step": 286320 + }, + { + "epoch": 1.8292807584682418, + "grad_norm": 1.2894755601882935, + "learning_rate": 1.801986642437914e-06, + "loss": 0.8205, + "step": 286330 + }, + { + "epoch": 1.8293446456179803, + "grad_norm": 0.8295124769210815, + "learning_rate": 1.800651948893922e-06, + "loss": 0.8621, + "step": 286340 + }, + { + "epoch": 1.8294085327677192, + "grad_norm": 1.0976676940917969, + "learning_rate": 1.7993177407590544e-06, + "loss": 0.8081, + "step": 286350 + }, + { + "epoch": 1.8294724199174577, + "grad_norm": 1.242518424987793, + "learning_rate": 1.7979840180467666e-06, + "loss": 0.9352, + "step": 286360 + }, + { + "epoch": 1.8295363070671966, + "grad_norm": 0.7539724707603455, + "learning_rate": 1.7966507807704701e-06, + "loss": 0.8317, + "step": 286370 + }, + { + "epoch": 1.829600194216935, + "grad_norm": 0.8778108954429626, + "learning_rate": 1.7953180289436044e-06, + "loss": 0.7374, + "step": 286380 + }, + { + "epoch": 1.829664081366674, + "grad_norm": 1.0061482191085815, + "learning_rate": 1.793985762579592e-06, + "loss": 0.9777, + "step": 286390 + }, + 
{ + "epoch": 1.8297279685164125, + "grad_norm": 0.8564932346343994, + "learning_rate": 1.7926539816918443e-06, + "loss": 0.9104, + "step": 286400 + }, + { + "epoch": 1.8297918556661514, + "grad_norm": 0.9976624250411987, + "learning_rate": 1.7913226862937837e-06, + "loss": 0.9519, + "step": 286410 + }, + { + "epoch": 1.82985574281589, + "grad_norm": 0.8627698421478271, + "learning_rate": 1.7899918763988e-06, + "loss": 0.6869, + "step": 286420 + }, + { + "epoch": 1.8299196299656288, + "grad_norm": 1.1749376058578491, + "learning_rate": 1.7886615520203097e-06, + "loss": 0.9258, + "step": 286430 + }, + { + "epoch": 1.8299835171153673, + "grad_norm": 0.8614131212234497, + "learning_rate": 1.7873317131717082e-06, + "loss": 0.8995, + "step": 286440 + }, + { + "epoch": 1.8300474042651063, + "grad_norm": 1.5317989587783813, + "learning_rate": 1.7860023598663844e-06, + "loss": 1.0053, + "step": 286450 + }, + { + "epoch": 1.8301112914148447, + "grad_norm": 1.088987112045288, + "learning_rate": 1.7846734921177276e-06, + "loss": 0.8924, + "step": 286460 + }, + { + "epoch": 1.8301751785645837, + "grad_norm": 1.2498538494110107, + "learning_rate": 1.7833451099391274e-06, + "loss": 0.879, + "step": 286470 + }, + { + "epoch": 1.8302390657143222, + "grad_norm": 0.6987992525100708, + "learning_rate": 1.782017213343945e-06, + "loss": 0.8498, + "step": 286480 + }, + { + "epoch": 1.830302952864061, + "grad_norm": 2.299447774887085, + "learning_rate": 1.7806898023455698e-06, + "loss": 0.8671, + "step": 286490 + }, + { + "epoch": 1.8303668400137996, + "grad_norm": 0.996013879776001, + "learning_rate": 1.7793628769573633e-06, + "loss": 0.9277, + "step": 286500 + }, + { + "epoch": 1.8304307271635383, + "grad_norm": 0.5067451596260071, + "learning_rate": 1.7780364371926927e-06, + "loss": 0.9661, + "step": 286510 + }, + { + "epoch": 1.830494614313277, + "grad_norm": 0.6430745124816895, + "learning_rate": 1.776710483064903e-06, + "loss": 1.0379, + "step": 286520 + }, + { + "epoch": 
1.8305585014630157, + "grad_norm": 0.7153158783912659, + "learning_rate": 1.7753850145873664e-06, + "loss": 1.0634, + "step": 286530 + }, + { + "epoch": 1.8306223886127544, + "grad_norm": 0.8899746537208557, + "learning_rate": 1.774060031773417e-06, + "loss": 0.898, + "step": 286540 + }, + { + "epoch": 1.830686275762493, + "grad_norm": 1.5335935354232788, + "learning_rate": 1.7727355346364105e-06, + "loss": 0.9678, + "step": 286550 + }, + { + "epoch": 1.8307501629122318, + "grad_norm": 0.967056393623352, + "learning_rate": 1.7714115231896755e-06, + "loss": 0.9838, + "step": 286560 + }, + { + "epoch": 1.8308140500619705, + "grad_norm": 2.244809150695801, + "learning_rate": 1.7700879974465568e-06, + "loss": 0.9862, + "step": 286570 + }, + { + "epoch": 1.8308779372117092, + "grad_norm": 1.108665943145752, + "learning_rate": 1.7687649574203714e-06, + "loss": 1.0522, + "step": 286580 + }, + { + "epoch": 1.830941824361448, + "grad_norm": 1.2201392650604248, + "learning_rate": 1.7674424031244418e-06, + "loss": 0.6931, + "step": 286590 + }, + { + "epoch": 1.8310057115111866, + "grad_norm": 0.8409663438796997, + "learning_rate": 1.7661203345721078e-06, + "loss": 0.9458, + "step": 286600 + }, + { + "epoch": 1.8310695986609253, + "grad_norm": 1.0214024782180786, + "learning_rate": 1.7647987517766585e-06, + "loss": 0.6715, + "step": 286610 + }, + { + "epoch": 1.831133485810664, + "grad_norm": 1.5210096836090088, + "learning_rate": 1.7634776547514275e-06, + "loss": 0.8119, + "step": 286620 + }, + { + "epoch": 1.8311973729604027, + "grad_norm": 1.4261603355407715, + "learning_rate": 1.7621570435096934e-06, + "loss": 0.7577, + "step": 286630 + }, + { + "epoch": 1.8312612601101415, + "grad_norm": 1.194935917854309, + "learning_rate": 1.7608369180647788e-06, + "loss": 0.8755, + "step": 286640 + }, + { + "epoch": 1.8313251472598802, + "grad_norm": 2.938659191131592, + "learning_rate": 1.7595172784299673e-06, + "loss": 0.6624, + "step": 286650 + }, + { + "epoch": 1.8313890344096189, 
+ "grad_norm": 0.9520533680915833, + "learning_rate": 1.7581981246185542e-06, + "loss": 0.7869, + "step": 286660 + }, + { + "epoch": 1.8314529215593576, + "grad_norm": 2.5831644535064697, + "learning_rate": 1.7568794566438118e-06, + "loss": 0.7728, + "step": 286670 + }, + { + "epoch": 1.8315168087090963, + "grad_norm": 0.9544631242752075, + "learning_rate": 1.755561274519041e-06, + "loss": 0.8714, + "step": 286680 + }, + { + "epoch": 1.831580695858835, + "grad_norm": 0.8048824071884155, + "learning_rate": 1.7542435782574974e-06, + "loss": 0.9109, + "step": 286690 + }, + { + "epoch": 1.8316445830085737, + "grad_norm": 0.8290845155715942, + "learning_rate": 1.752926367872465e-06, + "loss": 0.7139, + "step": 286700 + }, + { + "epoch": 1.8317084701583124, + "grad_norm": 1.0401479005813599, + "learning_rate": 1.7516096433771944e-06, + "loss": 0.8462, + "step": 286710 + }, + { + "epoch": 1.831772357308051, + "grad_norm": 0.9665109515190125, + "learning_rate": 1.7502934047849639e-06, + "loss": 1.1502, + "step": 286720 + }, + { + "epoch": 1.8318362444577898, + "grad_norm": 1.5829625129699707, + "learning_rate": 1.7489776521090184e-06, + "loss": 0.7414, + "step": 286730 + }, + { + "epoch": 1.8319001316075285, + "grad_norm": 0.9078747034072876, + "learning_rate": 1.747662385362614e-06, + "loss": 0.8966, + "step": 286740 + }, + { + "epoch": 1.831964018757267, + "grad_norm": 1.2583354711532593, + "learning_rate": 1.7463476045589844e-06, + "loss": 1.0177, + "step": 286750 + }, + { + "epoch": 1.832027905907006, + "grad_norm": 0.9693160653114319, + "learning_rate": 1.7450333097113913e-06, + "loss": 0.7749, + "step": 286760 + }, + { + "epoch": 1.8320917930567444, + "grad_norm": 1.3210150003433228, + "learning_rate": 1.743719500833052e-06, + "loss": 1.3723, + "step": 286770 + }, + { + "epoch": 1.8321556802064833, + "grad_norm": 0.4251079559326172, + "learning_rate": 1.7424061779372114e-06, + "loss": 0.7269, + "step": 286780 + }, + { + "epoch": 1.8322195673562218, + "grad_norm": 
1.2777466773986816, + "learning_rate": 1.7410933410370867e-06, + "loss": 0.8654, + "step": 286790 + }, + { + "epoch": 1.8322834545059608, + "grad_norm": 1.4226998090744019, + "learning_rate": 1.739780990145895e-06, + "loss": 0.9033, + "step": 286800 + }, + { + "epoch": 1.8323473416556992, + "grad_norm": 0.9256820678710938, + "learning_rate": 1.7384691252768704e-06, + "loss": 1.1099, + "step": 286810 + }, + { + "epoch": 1.8324112288054382, + "grad_norm": 1.0882539749145508, + "learning_rate": 1.737157746443202e-06, + "loss": 0.7031, + "step": 286820 + }, + { + "epoch": 1.8324751159551766, + "grad_norm": 0.7053834795951843, + "learning_rate": 1.7358468536581185e-06, + "loss": 0.9929, + "step": 286830 + }, + { + "epoch": 1.8325390031049156, + "grad_norm": 2.0581183433532715, + "learning_rate": 1.7345364469348035e-06, + "loss": 1.1567, + "step": 286840 + }, + { + "epoch": 1.832602890254654, + "grad_norm": 0.852189302444458, + "learning_rate": 1.7332265262864744e-06, + "loss": 0.828, + "step": 286850 + }, + { + "epoch": 1.832666777404393, + "grad_norm": 1.145141363143921, + "learning_rate": 1.7319170917262983e-06, + "loss": 0.8113, + "step": 286860 + }, + { + "epoch": 1.8327306645541315, + "grad_norm": 1.4547284841537476, + "learning_rate": 1.7306081432674814e-06, + "loss": 0.8355, + "step": 286870 + }, + { + "epoch": 1.8327945517038704, + "grad_norm": 0.9284454584121704, + "learning_rate": 1.7292996809231965e-06, + "loss": 0.8272, + "step": 286880 + }, + { + "epoch": 1.8328584388536089, + "grad_norm": 1.0113308429718018, + "learning_rate": 1.7279917047066275e-06, + "loss": 0.9879, + "step": 286890 + }, + { + "epoch": 1.8329223260033478, + "grad_norm": 2.9800875186920166, + "learning_rate": 1.7266842146309358e-06, + "loss": 0.5337, + "step": 286900 + }, + { + "epoch": 1.8329862131530863, + "grad_norm": 0.674667239189148, + "learning_rate": 1.7253772107093003e-06, + "loss": 0.8118, + "step": 286910 + }, + { + "epoch": 1.8330501003028252, + "grad_norm": 
2.0343778133392334, + "learning_rate": 1.7240706929548822e-06, + "loss": 0.8474, + "step": 286920 + }, + { + "epoch": 1.8331139874525637, + "grad_norm": 0.5409013628959656, + "learning_rate": 1.7227646613808324e-06, + "loss": 0.802, + "step": 286930 + }, + { + "epoch": 1.8331778746023026, + "grad_norm": 1.0042235851287842, + "learning_rate": 1.7214591160003124e-06, + "loss": 0.8092, + "step": 286940 + }, + { + "epoch": 1.8332417617520411, + "grad_norm": 0.6316293478012085, + "learning_rate": 1.7201540568264673e-06, + "loss": 0.7423, + "step": 286950 + }, + { + "epoch": 1.83330564890178, + "grad_norm": 3.186025857925415, + "learning_rate": 1.7188494838724367e-06, + "loss": 0.8688, + "step": 286960 + }, + { + "epoch": 1.8333695360515185, + "grad_norm": 1.0892722606658936, + "learning_rate": 1.71754539715136e-06, + "loss": 0.7347, + "step": 286970 + }, + { + "epoch": 1.8334334232012575, + "grad_norm": 0.6171730160713196, + "learning_rate": 1.716241796676371e-06, + "loss": 0.9285, + "step": 286980 + }, + { + "epoch": 1.833497310350996, + "grad_norm": 1.5199726819992065, + "learning_rate": 1.7149386824605983e-06, + "loss": 0.9471, + "step": 286990 + }, + { + "epoch": 1.8335611975007347, + "grad_norm": 0.735580325126648, + "learning_rate": 1.7136360545171703e-06, + "loss": 0.8594, + "step": 287000 + }, + { + "epoch": 1.8336250846504734, + "grad_norm": 1.267344355583191, + "learning_rate": 1.712333912859193e-06, + "loss": 0.981, + "step": 287010 + }, + { + "epoch": 1.833688971800212, + "grad_norm": 2.0481607913970947, + "learning_rate": 1.711032257499795e-06, + "loss": 1.0415, + "step": 287020 + }, + { + "epoch": 1.8337528589499508, + "grad_norm": 1.4587774276733398, + "learning_rate": 1.7097310884520768e-06, + "loss": 0.9411, + "step": 287030 + }, + { + "epoch": 1.8338167460996895, + "grad_norm": 0.8664649724960327, + "learning_rate": 1.7084304057291445e-06, + "loss": 1.0081, + "step": 287040 + }, + { + "epoch": 1.8338806332494282, + "grad_norm": 1.8076395988464355, + 
"learning_rate": 1.7071302093440878e-06, + "loss": 0.9189, + "step": 287050 + }, + { + "epoch": 1.8339445203991669, + "grad_norm": 0.7583857178688049, + "learning_rate": 1.7058304993100183e-06, + "loss": 0.9083, + "step": 287060 + }, + { + "epoch": 1.8340084075489056, + "grad_norm": 1.2719414234161377, + "learning_rate": 1.7045312756400145e-06, + "loss": 0.8046, + "step": 287070 + }, + { + "epoch": 1.8340722946986443, + "grad_norm": 1.1557854413986206, + "learning_rate": 1.7032325383471604e-06, + "loss": 0.888, + "step": 287080 + }, + { + "epoch": 1.834136181848383, + "grad_norm": 1.6682637929916382, + "learning_rate": 1.7019342874445343e-06, + "loss": 0.8401, + "step": 287090 + }, + { + "epoch": 1.8342000689981217, + "grad_norm": 1.3640217781066895, + "learning_rate": 1.7006365229452204e-06, + "loss": 0.9644, + "step": 287100 + }, + { + "epoch": 1.8342639561478604, + "grad_norm": 1.4971295595169067, + "learning_rate": 1.6993392448622747e-06, + "loss": 1.241, + "step": 287110 + }, + { + "epoch": 1.8343278432975991, + "grad_norm": 0.9940297603607178, + "learning_rate": 1.6980424532087702e-06, + "loss": 0.9292, + "step": 287120 + }, + { + "epoch": 1.8343917304473378, + "grad_norm": 1.4869452714920044, + "learning_rate": 1.696746147997774e-06, + "loss": 0.9535, + "step": 287130 + }, + { + "epoch": 1.8344556175970765, + "grad_norm": 1.4654951095581055, + "learning_rate": 1.6954503292423207e-06, + "loss": 0.7897, + "step": 287140 + }, + { + "epoch": 1.8345195047468152, + "grad_norm": 0.8898797631263733, + "learning_rate": 1.6941549969554826e-06, + "loss": 0.8698, + "step": 287150 + }, + { + "epoch": 1.834583391896554, + "grad_norm": 1.1146529912948608, + "learning_rate": 1.6928601511502828e-06, + "loss": 0.8981, + "step": 287160 + }, + { + "epoch": 1.8346472790462927, + "grad_norm": 1.2310070991516113, + "learning_rate": 1.6915657918397831e-06, + "loss": 1.1368, + "step": 287170 + }, + { + "epoch": 1.8347111661960314, + "grad_norm": 1.4128965139389038, + 
"learning_rate": 1.690271919037001e-06, + "loss": 1.0198, + "step": 287180 + }, + { + "epoch": 1.83477505334577, + "grad_norm": 1.618044376373291, + "learning_rate": 1.6889785327549811e-06, + "loss": 1.0275, + "step": 287190 + }, + { + "epoch": 1.8348389404955088, + "grad_norm": 1.2252626419067383, + "learning_rate": 1.6876856330067359e-06, + "loss": 1.1003, + "step": 287200 + }, + { + "epoch": 1.8349028276452475, + "grad_norm": 0.8357836008071899, + "learning_rate": 1.686393219805299e-06, + "loss": 0.9564, + "step": 287210 + }, + { + "epoch": 1.8349667147949862, + "grad_norm": 1.0785801410675049, + "learning_rate": 1.6851012931636767e-06, + "loss": 0.8547, + "step": 287220 + }, + { + "epoch": 1.835030601944725, + "grad_norm": 0.7203280329704285, + "learning_rate": 1.683809853094881e-06, + "loss": 0.9503, + "step": 287230 + }, + { + "epoch": 1.8350944890944634, + "grad_norm": 0.9999285936355591, + "learning_rate": 1.6825188996119178e-06, + "loss": 0.7627, + "step": 287240 + }, + { + "epoch": 1.8351583762442023, + "grad_norm": 0.6865089535713196, + "learning_rate": 1.6812284327277994e-06, + "loss": 0.8916, + "step": 287250 + }, + { + "epoch": 1.8352222633939408, + "grad_norm": 1.0471559762954712, + "learning_rate": 1.679938452455504e-06, + "loss": 0.8243, + "step": 287260 + }, + { + "epoch": 1.8352861505436797, + "grad_norm": 1.1230990886688232, + "learning_rate": 1.6786489588080322e-06, + "loss": 0.884, + "step": 287270 + }, + { + "epoch": 1.8353500376934182, + "grad_norm": 1.3938952684402466, + "learning_rate": 1.677359951798374e-06, + "loss": 0.8603, + "step": 287280 + }, + { + "epoch": 1.8354139248431571, + "grad_norm": 1.2799034118652344, + "learning_rate": 1.676071431439502e-06, + "loss": 1.0646, + "step": 287290 + }, + { + "epoch": 1.8354778119928956, + "grad_norm": 1.3219281435012817, + "learning_rate": 1.6747833977444006e-06, + "loss": 0.9428, + "step": 287300 + }, + { + "epoch": 1.8355416991426345, + "grad_norm": 0.9605557322502136, + "learning_rate": 
1.6734958507260313e-06, + "loss": 0.6734, + "step": 287310 + }, + { + "epoch": 1.835605586292373, + "grad_norm": 0.8685451745986938, + "learning_rate": 1.6722087903973726e-06, + "loss": 0.8308, + "step": 287320 + }, + { + "epoch": 1.835669473442112, + "grad_norm": 0.6971920132637024, + "learning_rate": 1.6709222167713756e-06, + "loss": 0.7354, + "step": 287330 + }, + { + "epoch": 1.8357333605918504, + "grad_norm": 0.8624512553215027, + "learning_rate": 1.6696361298610075e-06, + "loss": 0.8008, + "step": 287340 + }, + { + "epoch": 1.8357972477415894, + "grad_norm": 1.2721669673919678, + "learning_rate": 1.668350529679208e-06, + "loss": 0.7032, + "step": 287350 + }, + { + "epoch": 1.8358611348913279, + "grad_norm": 1.4571473598480225, + "learning_rate": 1.6670654162389387e-06, + "loss": 0.9812, + "step": 287360 + }, + { + "epoch": 1.8359250220410668, + "grad_norm": 0.6332854628562927, + "learning_rate": 1.6657807895531342e-06, + "loss": 0.6715, + "step": 287370 + }, + { + "epoch": 1.8359889091908053, + "grad_norm": 0.9317585825920105, + "learning_rate": 1.6644966496347336e-06, + "loss": 0.643, + "step": 287380 + }, + { + "epoch": 1.8360527963405442, + "grad_norm": 0.5826150178909302, + "learning_rate": 1.6632129964966602e-06, + "loss": 0.9093, + "step": 287390 + }, + { + "epoch": 1.8361166834902827, + "grad_norm": 1.3244376182556152, + "learning_rate": 1.6619298301518594e-06, + "loss": 0.8643, + "step": 287400 + }, + { + "epoch": 1.8361805706400216, + "grad_norm": 1.741381049156189, + "learning_rate": 1.6606471506132315e-06, + "loss": 0.828, + "step": 287410 + }, + { + "epoch": 1.83624445778976, + "grad_norm": 1.6920572519302368, + "learning_rate": 1.6593649578937165e-06, + "loss": 0.941, + "step": 287420 + }, + { + "epoch": 1.836308344939499, + "grad_norm": 1.039543867111206, + "learning_rate": 1.6580832520062095e-06, + "loss": 1.0667, + "step": 287430 + }, + { + "epoch": 1.8363722320892375, + "grad_norm": 0.8765792846679688, + "learning_rate": 
1.656802032963628e-06, + "loss": 0.7044, + "step": 287440 + }, + { + "epoch": 1.8364361192389764, + "grad_norm": 0.897909939289093, + "learning_rate": 1.6555213007788784e-06, + "loss": 0.9957, + "step": 287450 + }, + { + "epoch": 1.836500006388715, + "grad_norm": 1.0654629468917847, + "learning_rate": 1.6542410554648446e-06, + "loss": 0.8229, + "step": 287460 + }, + { + "epoch": 1.8365638935384538, + "grad_norm": 0.8408827781677246, + "learning_rate": 1.6529612970344389e-06, + "loss": 0.8093, + "step": 287470 + }, + { + "epoch": 1.8366277806881923, + "grad_norm": 0.887602686882019, + "learning_rate": 1.6516820255005284e-06, + "loss": 0.8046, + "step": 287480 + }, + { + "epoch": 1.836691667837931, + "grad_norm": 0.9669412970542908, + "learning_rate": 1.650403240876014e-06, + "loss": 0.8687, + "step": 287490 + }, + { + "epoch": 1.8367555549876697, + "grad_norm": 1.320043921470642, + "learning_rate": 1.6491249431737633e-06, + "loss": 0.9349, + "step": 287500 + }, + { + "epoch": 1.8368194421374084, + "grad_norm": 1.1708213090896606, + "learning_rate": 1.6478471324066603e-06, + "loss": 1.0136, + "step": 287510 + }, + { + "epoch": 1.8368833292871471, + "grad_norm": 0.8634885549545288, + "learning_rate": 1.6465698085875558e-06, + "loss": 0.9404, + "step": 287520 + }, + { + "epoch": 1.8369472164368859, + "grad_norm": 0.9964984059333801, + "learning_rate": 1.6452929717293398e-06, + "loss": 0.7876, + "step": 287530 + }, + { + "epoch": 1.8370111035866246, + "grad_norm": 0.9559676647186279, + "learning_rate": 1.644016621844846e-06, + "loss": 1.0267, + "step": 287540 + }, + { + "epoch": 1.8370749907363633, + "grad_norm": 0.8222513198852539, + "learning_rate": 1.6427407589469424e-06, + "loss": 0.8964, + "step": 287550 + }, + { + "epoch": 1.837138877886102, + "grad_norm": 0.8146112561225891, + "learning_rate": 1.6415928987229767e-06, + "loss": 0.8593, + "step": 287560 + }, + { + "epoch": 1.8372027650358407, + "grad_norm": 0.6883629560470581, + "learning_rate": 
1.6403179611349794e-06, + "loss": 0.9835, + "step": 287570 + }, + { + "epoch": 1.8372666521855794, + "grad_norm": 0.7211798429489136, + "learning_rate": 1.6390435105708256e-06, + "loss": 0.9449, + "step": 287580 + }, + { + "epoch": 1.837330539335318, + "grad_norm": 1.4879876375198364, + "learning_rate": 1.637769547043333e-06, + "loss": 0.8503, + "step": 287590 + }, + { + "epoch": 1.8373944264850568, + "grad_norm": 0.8715039491653442, + "learning_rate": 1.636496070565341e-06, + "loss": 1.0713, + "step": 287600 + }, + { + "epoch": 1.8374583136347955, + "grad_norm": 0.9951567053794861, + "learning_rate": 1.6352230811496726e-06, + "loss": 0.992, + "step": 287610 + }, + { + "epoch": 1.8375222007845342, + "grad_norm": 1.3647722005844116, + "learning_rate": 1.6339505788091568e-06, + "loss": 0.6799, + "step": 287620 + }, + { + "epoch": 1.837586087934273, + "grad_norm": 1.3705017566680908, + "learning_rate": 1.6326785635565944e-06, + "loss": 0.8357, + "step": 287630 + }, + { + "epoch": 1.8376499750840116, + "grad_norm": 0.980297327041626, + "learning_rate": 1.6314070354048082e-06, + "loss": 0.9319, + "step": 287640 + }, + { + "epoch": 1.8377138622337503, + "grad_norm": 1.1601104736328125, + "learning_rate": 1.630135994366594e-06, + "loss": 0.8532, + "step": 287650 + }, + { + "epoch": 1.837777749383489, + "grad_norm": 2.0433003902435303, + "learning_rate": 1.6288654404547576e-06, + "loss": 0.5762, + "step": 287660 + }, + { + "epoch": 1.8378416365332277, + "grad_norm": 1.367573857307434, + "learning_rate": 1.6275953736820893e-06, + "loss": 0.692, + "step": 287670 + }, + { + "epoch": 1.8379055236829664, + "grad_norm": 0.7693257927894592, + "learning_rate": 1.6263257940613895e-06, + "loss": 0.7481, + "step": 287680 + }, + { + "epoch": 1.8379694108327052, + "grad_norm": 0.8016722202301025, + "learning_rate": 1.6250567016054374e-06, + "loss": 0.8896, + "step": 287690 + }, + { + "epoch": 1.8380332979824439, + "grad_norm": 0.7425395846366882, + "learning_rate": 
1.6237880963270113e-06, + "loss": 0.6763, + "step": 287700 + }, + { + "epoch": 1.8380971851321826, + "grad_norm": 1.4685392379760742, + "learning_rate": 1.62251997823889e-06, + "loss": 0.9643, + "step": 287710 + }, + { + "epoch": 1.8381610722819213, + "grad_norm": 0.5433425903320312, + "learning_rate": 1.6212523473538521e-06, + "loss": 1.035, + "step": 287720 + }, + { + "epoch": 1.8382249594316598, + "grad_norm": 0.8438354134559631, + "learning_rate": 1.6199852036846486e-06, + "loss": 0.7417, + "step": 287730 + }, + { + "epoch": 1.8382888465813987, + "grad_norm": 0.7638826966285706, + "learning_rate": 1.6187185472440525e-06, + "loss": 0.9106, + "step": 287740 + }, + { + "epoch": 1.8383527337311372, + "grad_norm": 0.8857637643814087, + "learning_rate": 1.6174523780448148e-06, + "loss": 0.8946, + "step": 287750 + }, + { + "epoch": 1.838416620880876, + "grad_norm": 0.9542201161384583, + "learning_rate": 1.6161866960996864e-06, + "loss": 0.6759, + "step": 287760 + }, + { + "epoch": 1.8384805080306146, + "grad_norm": 2.320218324661255, + "learning_rate": 1.6149215014214236e-06, + "loss": 0.9944, + "step": 287770 + }, + { + "epoch": 1.8385443951803535, + "grad_norm": 4.098390579223633, + "learning_rate": 1.61365679402275e-06, + "loss": 0.9212, + "step": 287780 + }, + { + "epoch": 1.838608282330092, + "grad_norm": 2.041461944580078, + "learning_rate": 1.6123925739164213e-06, + "loss": 0.9272, + "step": 287790 + }, + { + "epoch": 1.838672169479831, + "grad_norm": 1.1152058839797974, + "learning_rate": 1.6111288411151559e-06, + "loss": 0.9968, + "step": 287800 + }, + { + "epoch": 1.8387360566295694, + "grad_norm": 0.7218173146247864, + "learning_rate": 1.6098655956316878e-06, + "loss": 1.0449, + "step": 287810 + }, + { + "epoch": 1.8387999437793083, + "grad_norm": 1.862316370010376, + "learning_rate": 1.6086028374787343e-06, + "loss": 0.8643, + "step": 287820 + }, + { + "epoch": 1.8388638309290468, + "grad_norm": 1.164618730545044, + "learning_rate": 1.6073405666690188e-06, 
+ "loss": 0.9774, + "step": 287830 + }, + { + "epoch": 1.8389277180787857, + "grad_norm": 1.2470999956130981, + "learning_rate": 1.6060787832152425e-06, + "loss": 0.9728, + "step": 287840 + }, + { + "epoch": 1.8389916052285242, + "grad_norm": 0.8947863578796387, + "learning_rate": 1.604817487130128e-06, + "loss": 1.2326, + "step": 287850 + }, + { + "epoch": 1.8390554923782632, + "grad_norm": 1.0724859237670898, + "learning_rate": 1.6035566784263656e-06, + "loss": 0.9335, + "step": 287860 + }, + { + "epoch": 1.8391193795280016, + "grad_norm": 0.7424639463424683, + "learning_rate": 1.602296357116656e-06, + "loss": 1.0477, + "step": 287870 + }, + { + "epoch": 1.8391832666777406, + "grad_norm": 1.7759449481964111, + "learning_rate": 1.6010365232136893e-06, + "loss": 0.6222, + "step": 287880 + }, + { + "epoch": 1.839247153827479, + "grad_norm": 1.2649390697479248, + "learning_rate": 1.5997771767301605e-06, + "loss": 0.7512, + "step": 287890 + }, + { + "epoch": 1.839311040977218, + "grad_norm": 0.6217682957649231, + "learning_rate": 1.5985183176787433e-06, + "loss": 0.8029, + "step": 287900 + }, + { + "epoch": 1.8393749281269565, + "grad_norm": 0.551947832107544, + "learning_rate": 1.597259946072127e-06, + "loss": 0.9035, + "step": 287910 + }, + { + "epoch": 1.8394388152766954, + "grad_norm": 0.8400312066078186, + "learning_rate": 1.5960020619229743e-06, + "loss": 0.6912, + "step": 287920 + }, + { + "epoch": 1.8395027024264339, + "grad_norm": 0.7524799108505249, + "learning_rate": 1.5947446652439524e-06, + "loss": 0.7257, + "step": 287930 + }, + { + "epoch": 1.8395665895761728, + "grad_norm": 0.602583646774292, + "learning_rate": 1.5934877560477347e-06, + "loss": 0.6298, + "step": 287940 + }, + { + "epoch": 1.8396304767259113, + "grad_norm": 1.2925169467926025, + "learning_rate": 1.5922313343469607e-06, + "loss": 0.8135, + "step": 287950 + }, + { + "epoch": 1.8396943638756502, + "grad_norm": 0.9474080801010132, + "learning_rate": 1.5909754001543097e-06, + "loss": 0.8911, 
+ "step": 287960 + }, + { + "epoch": 1.8397582510253887, + "grad_norm": 1.1987119913101196, + "learning_rate": 1.5897199534824048e-06, + "loss": 1.0228, + "step": 287970 + }, + { + "epoch": 1.8398221381751274, + "grad_norm": 0.922034502029419, + "learning_rate": 1.5884649943439134e-06, + "loss": 0.7103, + "step": 287980 + }, + { + "epoch": 1.8398860253248661, + "grad_norm": 1.1555603742599487, + "learning_rate": 1.5872105227514477e-06, + "loss": 1.0575, + "step": 287990 + }, + { + "epoch": 1.8399499124746048, + "grad_norm": 1.153542160987854, + "learning_rate": 1.5859565387176644e-06, + "loss": 0.8817, + "step": 288000 + }, + { + "epoch": 1.8400137996243435, + "grad_norm": 0.9662725925445557, + "learning_rate": 1.5847030422551812e-06, + "loss": 0.8067, + "step": 288010 + }, + { + "epoch": 1.8400776867740822, + "grad_norm": 1.2538738250732422, + "learning_rate": 1.5834500333766212e-06, + "loss": 1.019, + "step": 288020 + }, + { + "epoch": 1.840141573923821, + "grad_norm": 0.8852131962776184, + "learning_rate": 1.5821975120946076e-06, + "loss": 0.7435, + "step": 288030 + }, + { + "epoch": 1.8402054610735596, + "grad_norm": 1.0147486925125122, + "learning_rate": 1.5809454784217525e-06, + "loss": 0.8279, + "step": 288040 + }, + { + "epoch": 1.8402693482232984, + "grad_norm": 1.0600953102111816, + "learning_rate": 1.5796939323706628e-06, + "loss": 0.9367, + "step": 288050 + }, + { + "epoch": 1.840333235373037, + "grad_norm": 0.8614944219589233, + "learning_rate": 1.5784428739539503e-06, + "loss": 0.7678, + "step": 288060 + }, + { + "epoch": 1.8403971225227758, + "grad_norm": 0.8316081762313843, + "learning_rate": 1.5771923031841994e-06, + "loss": 0.9097, + "step": 288070 + }, + { + "epoch": 1.8404610096725145, + "grad_norm": 0.9914312362670898, + "learning_rate": 1.575942220074017e-06, + "loss": 0.7828, + "step": 288080 + }, + { + "epoch": 1.8405248968222532, + "grad_norm": 0.7772271037101746, + "learning_rate": 1.5746926246359929e-06, + "loss": 0.82, + "step": 288090 + 
}, + { + "epoch": 1.8405887839719919, + "grad_norm": 0.935269832611084, + "learning_rate": 1.5734435168827055e-06, + "loss": 0.8284, + "step": 288100 + }, + { + "epoch": 1.8406526711217306, + "grad_norm": 1.687303900718689, + "learning_rate": 1.5721948968267398e-06, + "loss": 0.9816, + "step": 288110 + }, + { + "epoch": 1.8407165582714693, + "grad_norm": 1.192875862121582, + "learning_rate": 1.5709467644806631e-06, + "loss": 1.1521, + "step": 288120 + }, + { + "epoch": 1.840780445421208, + "grad_norm": 0.7837314009666443, + "learning_rate": 1.5696991198570488e-06, + "loss": 0.8343, + "step": 288130 + }, + { + "epoch": 1.8408443325709467, + "grad_norm": 1.3365057706832886, + "learning_rate": 1.5684519629684646e-06, + "loss": 0.5837, + "step": 288140 + }, + { + "epoch": 1.8409082197206854, + "grad_norm": 1.0137981176376343, + "learning_rate": 1.5672052938274729e-06, + "loss": 0.8087, + "step": 288150 + }, + { + "epoch": 1.8409721068704241, + "grad_norm": 0.8681532740592957, + "learning_rate": 1.565959112446619e-06, + "loss": 0.9406, + "step": 288160 + }, + { + "epoch": 1.8410359940201628, + "grad_norm": 0.8951544165611267, + "learning_rate": 1.5647134188384593e-06, + "loss": 1.009, + "step": 288170 + }, + { + "epoch": 1.8410998811699015, + "grad_norm": 0.882915198802948, + "learning_rate": 1.5634682130155343e-06, + "loss": 0.7041, + "step": 288180 + }, + { + "epoch": 1.8411637683196402, + "grad_norm": 0.8915712833404541, + "learning_rate": 1.5622234949903947e-06, + "loss": 0.7062, + "step": 288190 + }, + { + "epoch": 1.841227655469379, + "grad_norm": 1.0253684520721436, + "learning_rate": 1.5609792647755638e-06, + "loss": 0.8977, + "step": 288200 + }, + { + "epoch": 1.8412915426191176, + "grad_norm": 0.9191253185272217, + "learning_rate": 1.5597355223835818e-06, + "loss": 0.7529, + "step": 288210 + }, + { + "epoch": 1.8413554297688561, + "grad_norm": 1.1533896923065186, + "learning_rate": 1.5584922678269665e-06, + "loss": 0.8378, + "step": 288220 + }, + { + "epoch": 
1.841419316918595, + "grad_norm": 1.0386509895324707, + "learning_rate": 1.5572495011182464e-06, + "loss": 1.2127, + "step": 288230 + }, + { + "epoch": 1.8414832040683335, + "grad_norm": 0.6427866220474243, + "learning_rate": 1.5560072222699284e-06, + "loss": 0.8755, + "step": 288240 + }, + { + "epoch": 1.8415470912180725, + "grad_norm": 1.0928500890731812, + "learning_rate": 1.5547654312945303e-06, + "loss": 0.7724, + "step": 288250 + }, + { + "epoch": 1.841610978367811, + "grad_norm": 0.9827240705490112, + "learning_rate": 1.5535241282045533e-06, + "loss": 0.8836, + "step": 288260 + }, + { + "epoch": 1.8416748655175499, + "grad_norm": 0.6995918154716492, + "learning_rate": 1.5522833130125036e-06, + "loss": 0.7469, + "step": 288270 + }, + { + "epoch": 1.8417387526672884, + "grad_norm": 1.1925073862075806, + "learning_rate": 1.5510429857308717e-06, + "loss": 0.9589, + "step": 288280 + }, + { + "epoch": 1.8418026398170273, + "grad_norm": 1.1228277683258057, + "learning_rate": 1.5498031463721475e-06, + "loss": 0.8682, + "step": 288290 + }, + { + "epoch": 1.8418665269667658, + "grad_norm": 0.6336076259613037, + "learning_rate": 1.5485637949488262e-06, + "loss": 1.0068, + "step": 288300 + }, + { + "epoch": 1.8419304141165047, + "grad_norm": 0.8404168486595154, + "learning_rate": 1.5473249314733818e-06, + "loss": 0.9751, + "step": 288310 + }, + { + "epoch": 1.8419943012662432, + "grad_norm": 0.8288664221763611, + "learning_rate": 1.5460865559582983e-06, + "loss": 0.8593, + "step": 288320 + }, + { + "epoch": 1.8420581884159821, + "grad_norm": 0.7925769090652466, + "learning_rate": 1.5448486684160325e-06, + "loss": 0.9489, + "step": 288330 + }, + { + "epoch": 1.8421220755657206, + "grad_norm": 1.0064537525177002, + "learning_rate": 1.5436112688590688e-06, + "loss": 0.9611, + "step": 288340 + }, + { + "epoch": 1.8421859627154595, + "grad_norm": 0.691359281539917, + "learning_rate": 1.5423743572998527e-06, + "loss": 0.722, + "step": 288350 + }, + { + "epoch": 
1.842249849865198, + "grad_norm": 1.405712366104126, + "learning_rate": 1.5411379337508524e-06, + "loss": 0.9765, + "step": 288360 + }, + { + "epoch": 1.842313737014937, + "grad_norm": 0.6220401525497437, + "learning_rate": 1.539901998224519e-06, + "loss": 0.5838, + "step": 288370 + }, + { + "epoch": 1.8423776241646754, + "grad_norm": 0.6910040378570557, + "learning_rate": 1.538666550733292e-06, + "loss": 0.8711, + "step": 288380 + }, + { + "epoch": 1.8424415113144144, + "grad_norm": 0.8294619917869568, + "learning_rate": 1.537431591289623e-06, + "loss": 0.9425, + "step": 288390 + }, + { + "epoch": 1.8425053984641528, + "grad_norm": 0.6665295958518982, + "learning_rate": 1.5361971199059412e-06, + "loss": 0.8017, + "step": 288400 + }, + { + "epoch": 1.8425692856138918, + "grad_norm": 1.0412260293960571, + "learning_rate": 1.5349631365946805e-06, + "loss": 0.9082, + "step": 288410 + }, + { + "epoch": 1.8426331727636303, + "grad_norm": 3.229555368423462, + "learning_rate": 1.5337296413682644e-06, + "loss": 0.7304, + "step": 288420 + }, + { + "epoch": 1.8426970599133692, + "grad_norm": 2.063892364501953, + "learning_rate": 1.5324966342391333e-06, + "loss": 0.8002, + "step": 288430 + }, + { + "epoch": 1.8427609470631077, + "grad_norm": 1.6300920248031616, + "learning_rate": 1.5312641152196772e-06, + "loss": 1.0348, + "step": 288440 + }, + { + "epoch": 1.8428248342128464, + "grad_norm": 0.600158154964447, + "learning_rate": 1.5300320843223304e-06, + "loss": 0.8067, + "step": 288450 + }, + { + "epoch": 1.842888721362585, + "grad_norm": 0.8928928375244141, + "learning_rate": 1.5288005415594886e-06, + "loss": 1.1524, + "step": 288460 + }, + { + "epoch": 1.8429526085123238, + "grad_norm": 0.9556632041931152, + "learning_rate": 1.5275694869435698e-06, + "loss": 0.9155, + "step": 288470 + }, + { + "epoch": 1.8430164956620625, + "grad_norm": 0.8034154176712036, + "learning_rate": 1.5263389204869472e-06, + "loss": 0.8602, + "step": 288480 + }, + { + "epoch": 1.8430803828118012, 
+ "grad_norm": 0.9617866277694702, + "learning_rate": 1.5251088422020389e-06, + "loss": 0.7674, + "step": 288490 + }, + { + "epoch": 1.84314426996154, + "grad_norm": 0.8912802934646606, + "learning_rate": 1.5238792521012124e-06, + "loss": 0.8824, + "step": 288500 + }, + { + "epoch": 1.8432081571112786, + "grad_norm": 0.8857015371322632, + "learning_rate": 1.5226501501968636e-06, + "loss": 0.8232, + "step": 288510 + }, + { + "epoch": 1.8432720442610173, + "grad_norm": 0.6874731779098511, + "learning_rate": 1.5214215365013661e-06, + "loss": 0.7386, + "step": 288520 + }, + { + "epoch": 1.843335931410756, + "grad_norm": 1.0015004873275757, + "learning_rate": 1.5201934110270932e-06, + "loss": 0.9648, + "step": 288530 + }, + { + "epoch": 1.8433998185604947, + "grad_norm": 1.2330206632614136, + "learning_rate": 1.5189657737864127e-06, + "loss": 0.8617, + "step": 288540 + }, + { + "epoch": 1.8434637057102334, + "grad_norm": 0.8535049557685852, + "learning_rate": 1.5177386247916925e-06, + "loss": 0.8291, + "step": 288550 + }, + { + "epoch": 1.8435275928599721, + "grad_norm": 1.0177775621414185, + "learning_rate": 1.516511964055284e-06, + "loss": 0.857, + "step": 288560 + }, + { + "epoch": 1.8435914800097108, + "grad_norm": 1.0535317659378052, + "learning_rate": 1.5152857915895436e-06, + "loss": 0.9683, + "step": 288570 + }, + { + "epoch": 1.8436553671594496, + "grad_norm": 0.6651054620742798, + "learning_rate": 1.5140601074068228e-06, + "loss": 0.9026, + "step": 288580 + }, + { + "epoch": 1.8437192543091883, + "grad_norm": 0.7523433566093445, + "learning_rate": 1.5128349115194619e-06, + "loss": 0.6263, + "step": 288590 + }, + { + "epoch": 1.843783141458927, + "grad_norm": 0.7467684149742126, + "learning_rate": 1.5116102039398005e-06, + "loss": 0.8923, + "step": 288600 + }, + { + "epoch": 1.8438470286086657, + "grad_norm": 1.0226775407791138, + "learning_rate": 1.5103859846801738e-06, + "loss": 0.999, + "step": 288610 + }, + { + "epoch": 1.8439109157584044, + "grad_norm": 
1.6206825971603394, + "learning_rate": 1.5091622537529105e-06, + "loss": 0.9215, + "step": 288620 + }, + { + "epoch": 1.843974802908143, + "grad_norm": 1.0661346912384033, + "learning_rate": 1.5079390111703285e-06, + "loss": 0.6762, + "step": 288630 + }, + { + "epoch": 1.8440386900578818, + "grad_norm": 0.7579408288002014, + "learning_rate": 1.5067162569447623e-06, + "loss": 0.7026, + "step": 288640 + }, + { + "epoch": 1.8441025772076205, + "grad_norm": 0.6882103085517883, + "learning_rate": 1.5054939910885079e-06, + "loss": 0.8362, + "step": 288650 + }, + { + "epoch": 1.8441664643573592, + "grad_norm": 0.9294477701187134, + "learning_rate": 1.5042722136138887e-06, + "loss": 0.8447, + "step": 288660 + }, + { + "epoch": 1.844230351507098, + "grad_norm": 0.9065951108932495, + "learning_rate": 1.5030509245331947e-06, + "loss": 0.9052, + "step": 288670 + }, + { + "epoch": 1.8442942386568366, + "grad_norm": 1.4120420217514038, + "learning_rate": 1.501830123858744e-06, + "loss": 0.8919, + "step": 288680 + }, + { + "epoch": 1.844358125806575, + "grad_norm": 0.5979344844818115, + "learning_rate": 1.50060981160281e-06, + "loss": 0.905, + "step": 288690 + }, + { + "epoch": 1.844422012956314, + "grad_norm": 1.209514856338501, + "learning_rate": 1.4993899877776995e-06, + "loss": 0.9565, + "step": 288700 + }, + { + "epoch": 1.8444859001060525, + "grad_norm": 0.7128211259841919, + "learning_rate": 1.4981706523956918e-06, + "loss": 0.714, + "step": 288710 + }, + { + "epoch": 1.8445497872557914, + "grad_norm": 0.9613263607025146, + "learning_rate": 1.4969518054690657e-06, + "loss": 0.8484, + "step": 288720 + }, + { + "epoch": 1.84461367440553, + "grad_norm": 0.8404272794723511, + "learning_rate": 1.4957334470100892e-06, + "loss": 0.6542, + "step": 288730 + }, + { + "epoch": 1.8446775615552689, + "grad_norm": 1.007049560546875, + "learning_rate": 1.494515577031047e-06, + "loss": 0.8816, + "step": 288740 + }, + { + "epoch": 1.8447414487050073, + "grad_norm": 1.3427143096923828, + 
"learning_rate": 1.4932981955441906e-06, + "loss": 1.0498, + "step": 288750 + }, + { + "epoch": 1.8448053358547463, + "grad_norm": 1.0202215909957886, + "learning_rate": 1.492081302561793e-06, + "loss": 0.9685, + "step": 288760 + }, + { + "epoch": 1.8448692230044847, + "grad_norm": 1.1484458446502686, + "learning_rate": 1.490864898096095e-06, + "loss": 1.004, + "step": 288770 + }, + { + "epoch": 1.8449331101542237, + "grad_norm": 0.7522885203361511, + "learning_rate": 1.4896489821593584e-06, + "loss": 0.7973, + "step": 288780 + }, + { + "epoch": 1.8449969973039622, + "grad_norm": 1.5658546686172485, + "learning_rate": 1.4884335547638185e-06, + "loss": 1.1108, + "step": 288790 + }, + { + "epoch": 1.845060884453701, + "grad_norm": 1.085937738418579, + "learning_rate": 1.487218615921726e-06, + "loss": 0.9513, + "step": 288800 + }, + { + "epoch": 1.8451247716034396, + "grad_norm": 1.7706001996994019, + "learning_rate": 1.4860041656453106e-06, + "loss": 0.9771, + "step": 288810 + }, + { + "epoch": 1.8451886587531785, + "grad_norm": 0.6471644043922424, + "learning_rate": 1.4847902039467953e-06, + "loss": 0.9614, + "step": 288820 + }, + { + "epoch": 1.845252545902917, + "grad_norm": 0.7767409682273865, + "learning_rate": 1.4835767308384264e-06, + "loss": 0.8322, + "step": 288830 + }, + { + "epoch": 1.845316433052656, + "grad_norm": 1.066179633140564, + "learning_rate": 1.4823637463324047e-06, + "loss": 0.7936, + "step": 288840 + }, + { + "epoch": 1.8453803202023944, + "grad_norm": 0.8228965997695923, + "learning_rate": 1.4811512504409597e-06, + "loss": 0.8987, + "step": 288850 + }, + { + "epoch": 1.8454442073521333, + "grad_norm": 1.1014748811721802, + "learning_rate": 1.4799392431762927e-06, + "loss": 0.7715, + "step": 288860 + }, + { + "epoch": 1.8455080945018718, + "grad_norm": 0.8230012059211731, + "learning_rate": 1.478727724550616e-06, + "loss": 0.7198, + "step": 288870 + }, + { + "epoch": 1.8455719816516107, + "grad_norm": 0.8320961594581604, + "learning_rate": 
1.4775166945761309e-06, + "loss": 0.9715, + "step": 288880 + }, + { + "epoch": 1.8456358688013492, + "grad_norm": 0.9189849495887756, + "learning_rate": 1.476306153265028e-06, + "loss": 1.0131, + "step": 288890 + }, + { + "epoch": 1.8456997559510882, + "grad_norm": 1.1729072332382202, + "learning_rate": 1.4750961006294972e-06, + "loss": 0.9605, + "step": 288900 + }, + { + "epoch": 1.8457636431008266, + "grad_norm": 0.8219770193099976, + "learning_rate": 1.4738865366817345e-06, + "loss": 0.7105, + "step": 288910 + }, + { + "epoch": 1.8458275302505656, + "grad_norm": 1.2123422622680664, + "learning_rate": 1.4726774614339079e-06, + "loss": 1.0589, + "step": 288920 + }, + { + "epoch": 1.845891417400304, + "grad_norm": 0.8864356875419617, + "learning_rate": 1.4714688748982075e-06, + "loss": 1.0387, + "step": 288930 + }, + { + "epoch": 1.8459553045500428, + "grad_norm": 1.4217449426651, + "learning_rate": 1.470260777086796e-06, + "loss": 0.9907, + "step": 288940 + }, + { + "epoch": 1.8460191916997815, + "grad_norm": 0.843344509601593, + "learning_rate": 1.4690531680118413e-06, + "loss": 1.0661, + "step": 288950 + }, + { + "epoch": 1.8460830788495202, + "grad_norm": 0.7461937069892883, + "learning_rate": 1.4678460476855116e-06, + "loss": 0.9238, + "step": 288960 + }, + { + "epoch": 1.8461469659992589, + "grad_norm": 1.053181529045105, + "learning_rate": 1.4666394161199527e-06, + "loss": 0.8147, + "step": 288970 + }, + { + "epoch": 1.8462108531489976, + "grad_norm": 1.020796537399292, + "learning_rate": 1.4654332733273269e-06, + "loss": 0.8422, + "step": 288980 + }, + { + "epoch": 1.8462747402987363, + "grad_norm": 1.2979925870895386, + "learning_rate": 1.4642276193197747e-06, + "loss": 0.9054, + "step": 288990 + }, + { + "epoch": 1.846338627448475, + "grad_norm": 0.9862684011459351, + "learning_rate": 1.4630224541094417e-06, + "loss": 0.8083, + "step": 289000 + }, + { + "epoch": 1.8464025145982137, + "grad_norm": 0.7166469097137451, + "learning_rate": 
1.4618177777084574e-06, + "loss": 0.6495, + "step": 289010 + }, + { + "epoch": 1.8464664017479524, + "grad_norm": 0.8175358772277832, + "learning_rate": 1.4606135901289618e-06, + "loss": 0.881, + "step": 289020 + }, + { + "epoch": 1.846530288897691, + "grad_norm": 1.0863618850708008, + "learning_rate": 1.4594098913830788e-06, + "loss": 0.7885, + "step": 289030 + }, + { + "epoch": 1.8465941760474298, + "grad_norm": 0.9460238814353943, + "learning_rate": 1.4582066814829376e-06, + "loss": 0.9725, + "step": 289040 + }, + { + "epoch": 1.8466580631971685, + "grad_norm": 0.8985845446586609, + "learning_rate": 1.457003960440645e-06, + "loss": 0.8003, + "step": 289050 + }, + { + "epoch": 1.8467219503469072, + "grad_norm": 0.9575114250183105, + "learning_rate": 1.4558017282683189e-06, + "loss": 0.8808, + "step": 289060 + }, + { + "epoch": 1.846785837496646, + "grad_norm": 0.9031855463981628, + "learning_rate": 1.4545999849780668e-06, + "loss": 1.0921, + "step": 289070 + }, + { + "epoch": 1.8468497246463846, + "grad_norm": 0.9414730072021484, + "learning_rate": 1.4533987305819953e-06, + "loss": 0.8704, + "step": 289080 + }, + { + "epoch": 1.8469136117961233, + "grad_norm": 1.9726576805114746, + "learning_rate": 1.4521979650921891e-06, + "loss": 0.7585, + "step": 289090 + }, + { + "epoch": 1.846977498945862, + "grad_norm": 0.775337815284729, + "learning_rate": 1.4509976885207555e-06, + "loss": 0.9518, + "step": 289100 + }, + { + "epoch": 1.8470413860956008, + "grad_norm": 1.590173602104187, + "learning_rate": 1.4497979008797679e-06, + "loss": 0.856, + "step": 289110 + }, + { + "epoch": 1.8471052732453395, + "grad_norm": 0.996599555015564, + "learning_rate": 1.448598602181328e-06, + "loss": 0.828, + "step": 289120 + }, + { + "epoch": 1.8471691603950782, + "grad_norm": 0.8098533749580383, + "learning_rate": 1.4473997924374927e-06, + "loss": 0.8946, + "step": 289130 + }, + { + "epoch": 1.8472330475448169, + "grad_norm": 0.5876967906951904, + "learning_rate": 
1.4462014716603466e-06, + "loss": 0.7622, + "step": 289140 + }, + { + "epoch": 1.8472969346945556, + "grad_norm": 1.886121392250061, + "learning_rate": 1.4450036398619637e-06, + "loss": 0.7134, + "step": 289150 + }, + { + "epoch": 1.8473608218442943, + "grad_norm": 1.2650415897369385, + "learning_rate": 1.4438062970543953e-06, + "loss": 0.8986, + "step": 289160 + }, + { + "epoch": 1.847424708994033, + "grad_norm": 0.9794378280639648, + "learning_rate": 1.4426094432497096e-06, + "loss": 1.0858, + "step": 289170 + }, + { + "epoch": 1.8474885961437715, + "grad_norm": 1.2420276403427124, + "learning_rate": 1.4414130784599466e-06, + "loss": 0.9369, + "step": 289180 + }, + { + "epoch": 1.8475524832935104, + "grad_norm": 1.1234242916107178, + "learning_rate": 1.4402172026971694e-06, + "loss": 0.6944, + "step": 289190 + }, + { + "epoch": 1.847616370443249, + "grad_norm": 1.2630137205123901, + "learning_rate": 1.4390218159734125e-06, + "loss": 0.9539, + "step": 289200 + }, + { + "epoch": 1.8476802575929878, + "grad_norm": 1.18568754196167, + "learning_rate": 1.437826918300722e-06, + "loss": 0.8495, + "step": 289210 + }, + { + "epoch": 1.8477441447427263, + "grad_norm": 1.0551838874816895, + "learning_rate": 1.4366325096911215e-06, + "loss": 0.8087, + "step": 289220 + }, + { + "epoch": 1.8478080318924652, + "grad_norm": 0.7045217156410217, + "learning_rate": 1.435438590156646e-06, + "loss": 0.9491, + "step": 289230 + }, + { + "epoch": 1.8478719190422037, + "grad_norm": 0.5378367304801941, + "learning_rate": 1.4342451597093187e-06, + "loss": 0.6279, + "step": 289240 + }, + { + "epoch": 1.8479358061919426, + "grad_norm": 1.6226013898849487, + "learning_rate": 1.4330522183611583e-06, + "loss": 0.7129, + "step": 289250 + }, + { + "epoch": 1.8479996933416811, + "grad_norm": 0.9038100242614746, + "learning_rate": 1.4318597661241773e-06, + "loss": 0.7994, + "step": 289260 + }, + { + "epoch": 1.84806358049142, + "grad_norm": 0.9648569822311401, + "learning_rate": 
1.4306678030103881e-06, + "loss": 0.9585, + "step": 289270 + }, + { + "epoch": 1.8481274676411585, + "grad_norm": 0.7719781398773193, + "learning_rate": 1.4294763290317926e-06, + "loss": 0.8269, + "step": 289280 + }, + { + "epoch": 1.8481913547908975, + "grad_norm": 0.5771094560623169, + "learning_rate": 1.4282853442003918e-06, + "loss": 0.6463, + "step": 289290 + }, + { + "epoch": 1.848255241940636, + "grad_norm": 0.7412580847740173, + "learning_rate": 1.427094848528171e-06, + "loss": 0.9148, + "step": 289300 + }, + { + "epoch": 1.8483191290903749, + "grad_norm": 1.0679619312286377, + "learning_rate": 1.4259048420271315e-06, + "loss": 0.7854, + "step": 289310 + }, + { + "epoch": 1.8483830162401134, + "grad_norm": 0.9758639931678772, + "learning_rate": 1.4247153247092527e-06, + "loss": 0.9488, + "step": 289320 + }, + { + "epoch": 1.8484469033898523, + "grad_norm": 1.6436443328857422, + "learning_rate": 1.4235262965865137e-06, + "loss": 1.0376, + "step": 289330 + }, + { + "epoch": 1.8485107905395908, + "grad_norm": 0.9758222103118896, + "learning_rate": 1.4223377576708884e-06, + "loss": 0.6294, + "step": 289340 + }, + { + "epoch": 1.8485746776893297, + "grad_norm": 1.415855050086975, + "learning_rate": 1.4211497079743452e-06, + "loss": 1.0529, + "step": 289350 + }, + { + "epoch": 1.8486385648390682, + "grad_norm": 1.0808721780776978, + "learning_rate": 1.4199621475088576e-06, + "loss": 0.918, + "step": 289360 + }, + { + "epoch": 1.8487024519888071, + "grad_norm": 1.55045747756958, + "learning_rate": 1.4187750762863773e-06, + "loss": 0.8256, + "step": 289370 + }, + { + "epoch": 1.8487663391385456, + "grad_norm": 1.0587116479873657, + "learning_rate": 1.417588494318861e-06, + "loss": 0.8804, + "step": 289380 + }, + { + "epoch": 1.8488302262882845, + "grad_norm": 0.9089915156364441, + "learning_rate": 1.4164024016182553e-06, + "loss": 0.9252, + "step": 289390 + }, + { + "epoch": 1.848894113438023, + "grad_norm": 0.6744896173477173, + "learning_rate": 
1.4152167981965114e-06, + "loss": 0.7266, + "step": 289400 + }, + { + "epoch": 1.848958000587762, + "grad_norm": 1.4949560165405273, + "learning_rate": 1.4140316840655587e-06, + "loss": 0.6444, + "step": 289410 + }, + { + "epoch": 1.8490218877375004, + "grad_norm": 0.991797149181366, + "learning_rate": 1.4128470592373488e-06, + "loss": 0.7479, + "step": 289420 + }, + { + "epoch": 1.8490857748872391, + "grad_norm": 0.8915597200393677, + "learning_rate": 1.4116629237237944e-06, + "loss": 1.0406, + "step": 289430 + }, + { + "epoch": 1.8491496620369778, + "grad_norm": 0.934266209602356, + "learning_rate": 1.410479277536836e-06, + "loss": 1.1014, + "step": 289440 + }, + { + "epoch": 1.8492135491867165, + "grad_norm": 0.8267480731010437, + "learning_rate": 1.409296120688386e-06, + "loss": 0.7946, + "step": 289450 + }, + { + "epoch": 1.8492774363364552, + "grad_norm": 1.9359806776046753, + "learning_rate": 1.408231697919049e-06, + "loss": 1.0949, + "step": 289460 + }, + { + "epoch": 1.849341323486194, + "grad_norm": 1.0157192945480347, + "learning_rate": 1.4070494708465886e-06, + "loss": 0.7052, + "step": 289470 + }, + { + "epoch": 1.8494052106359327, + "grad_norm": 2.140254020690918, + "learning_rate": 1.4058677331471814e-06, + "loss": 0.9596, + "step": 289480 + }, + { + "epoch": 1.8494690977856714, + "grad_norm": 1.4555343389511108, + "learning_rate": 1.4046864848327236e-06, + "loss": 0.774, + "step": 289490 + }, + { + "epoch": 1.84953298493541, + "grad_norm": 1.036714792251587, + "learning_rate": 1.4035057259151108e-06, + "loss": 0.7356, + "step": 289500 + }, + { + "epoch": 1.8495968720851488, + "grad_norm": 0.5356025695800781, + "learning_rate": 1.4023254564062505e-06, + "loss": 0.6487, + "step": 289510 + }, + { + "epoch": 1.8496607592348875, + "grad_norm": 1.2562586069107056, + "learning_rate": 1.4011456763180053e-06, + "loss": 0.9194, + "step": 289520 + }, + { + "epoch": 1.8497246463846262, + "grad_norm": 1.0846186876296997, + "learning_rate": 
1.3999663856622714e-06, + "loss": 0.7698, + "step": 289530 + }, + { + "epoch": 1.849788533534365, + "grad_norm": 1.2222474813461304, + "learning_rate": 1.3987875844509113e-06, + "loss": 0.6781, + "step": 289540 + }, + { + "epoch": 1.8498524206841036, + "grad_norm": 0.9994776844978333, + "learning_rate": 1.3976092726958157e-06, + "loss": 0.8111, + "step": 289550 + }, + { + "epoch": 1.8499163078338423, + "grad_norm": 1.1925878524780273, + "learning_rate": 1.396431450408836e-06, + "loss": 0.8134, + "step": 289560 + }, + { + "epoch": 1.849980194983581, + "grad_norm": 0.8951390385627747, + "learning_rate": 1.3952541176018407e-06, + "loss": 0.8301, + "step": 289570 + }, + { + "epoch": 1.8500440821333197, + "grad_norm": 1.0971461534500122, + "learning_rate": 1.3940772742866926e-06, + "loss": 1.1852, + "step": 289580 + }, + { + "epoch": 1.8501079692830584, + "grad_norm": 2.3525285720825195, + "learning_rate": 1.3929009204752263e-06, + "loss": 0.95, + "step": 289590 + }, + { + "epoch": 1.8501718564327971, + "grad_norm": 1.19795823097229, + "learning_rate": 1.391725056179305e-06, + "loss": 1.1132, + "step": 289600 + }, + { + "epoch": 1.8502357435825358, + "grad_norm": 1.1813701391220093, + "learning_rate": 1.3905496814107633e-06, + "loss": 0.8383, + "step": 289610 + }, + { + "epoch": 1.8502996307322745, + "grad_norm": 0.970208466053009, + "learning_rate": 1.389374796181442e-06, + "loss": 0.9159, + "step": 289620 + }, + { + "epoch": 1.8503635178820133, + "grad_norm": 1.0656824111938477, + "learning_rate": 1.3882004005031645e-06, + "loss": 1.0625, + "step": 289630 + }, + { + "epoch": 1.850427405031752, + "grad_norm": 0.7011227011680603, + "learning_rate": 1.387026494387772e-06, + "loss": 0.8958, + "step": 289640 + }, + { + "epoch": 1.8504912921814907, + "grad_norm": 0.8881285786628723, + "learning_rate": 1.3858530778470714e-06, + "loss": 0.9315, + "step": 289650 + }, + { + "epoch": 1.8505551793312294, + "grad_norm": 0.6982859969139099, + "learning_rate": 1.384680150892892e-06, 
+ "loss": 0.8761, + "step": 289660 + }, + { + "epoch": 1.8506190664809679, + "grad_norm": 1.166447639465332, + "learning_rate": 1.3835077135370355e-06, + "loss": 1.014, + "step": 289670 + }, + { + "epoch": 1.8506829536307068, + "grad_norm": 0.8452163338661194, + "learning_rate": 1.382335765791326e-06, + "loss": 0.9305, + "step": 289680 + }, + { + "epoch": 1.8507468407804453, + "grad_norm": 3.4163830280303955, + "learning_rate": 1.3811643076675484e-06, + "loss": 0.8241, + "step": 289690 + }, + { + "epoch": 1.8508107279301842, + "grad_norm": 1.1839015483856201, + "learning_rate": 1.37999333917751e-06, + "loss": 0.8769, + "step": 289700 + }, + { + "epoch": 1.8508746150799227, + "grad_norm": 0.5793390274047852, + "learning_rate": 1.3788228603329955e-06, + "loss": 0.8559, + "step": 289710 + }, + { + "epoch": 1.8509385022296616, + "grad_norm": 1.1785459518432617, + "learning_rate": 1.377652871145807e-06, + "loss": 0.9978, + "step": 289720 + }, + { + "epoch": 1.8510023893794, + "grad_norm": 0.8161349892616272, + "learning_rate": 1.3764833716277126e-06, + "loss": 0.7114, + "step": 289730 + }, + { + "epoch": 1.851066276529139, + "grad_norm": 0.8620417714118958, + "learning_rate": 1.3753143617904974e-06, + "loss": 0.7987, + "step": 289740 + }, + { + "epoch": 1.8511301636788775, + "grad_norm": 0.9723739624023438, + "learning_rate": 1.374145841645935e-06, + "loss": 0.9672, + "step": 289750 + }, + { + "epoch": 1.8511940508286164, + "grad_norm": 0.8605661988258362, + "learning_rate": 1.3729778112057889e-06, + "loss": 1.0071, + "step": 289760 + }, + { + "epoch": 1.851257937978355, + "grad_norm": 0.6105464696884155, + "learning_rate": 1.3718102704818215e-06, + "loss": 1.0402, + "step": 289770 + }, + { + "epoch": 1.8513218251280938, + "grad_norm": 0.5702158212661743, + "learning_rate": 1.3706432194857954e-06, + "loss": 0.7694, + "step": 289780 + }, + { + "epoch": 1.8513857122778323, + "grad_norm": 0.7528900504112244, + "learning_rate": 1.3694766582294682e-06, + "loss": 0.9634, + 
"step": 289790 + }, + { + "epoch": 1.8514495994275713, + "grad_norm": 0.6970102787017822, + "learning_rate": 1.3683105867245748e-06, + "loss": 0.8428, + "step": 289800 + }, + { + "epoch": 1.8515134865773097, + "grad_norm": 0.9566949605941772, + "learning_rate": 1.3671450049828782e-06, + "loss": 0.9473, + "step": 289810 + }, + { + "epoch": 1.8515773737270487, + "grad_norm": 0.6424906253814697, + "learning_rate": 1.365979913016091e-06, + "loss": 0.8256, + "step": 289820 + }, + { + "epoch": 1.8516412608767872, + "grad_norm": 1.8221441507339478, + "learning_rate": 1.3648153108359708e-06, + "loss": 0.7576, + "step": 289830 + }, + { + "epoch": 1.851705148026526, + "grad_norm": 1.3779975175857544, + "learning_rate": 1.3636511984542299e-06, + "loss": 0.8611, + "step": 289840 + }, + { + "epoch": 1.8517690351762646, + "grad_norm": 0.9683537483215332, + "learning_rate": 1.3624875758825984e-06, + "loss": 0.8873, + "step": 289850 + }, + { + "epoch": 1.8518329223260035, + "grad_norm": 0.961801290512085, + "learning_rate": 1.3613244431327943e-06, + "loss": 0.7379, + "step": 289860 + }, + { + "epoch": 1.851896809475742, + "grad_norm": 1.2155733108520508, + "learning_rate": 1.3601618002165362e-06, + "loss": 0.9428, + "step": 289870 + }, + { + "epoch": 1.851960696625481, + "grad_norm": 0.9074127674102783, + "learning_rate": 1.3589996471455202e-06, + "loss": 0.868, + "step": 289880 + }, + { + "epoch": 1.8520245837752194, + "grad_norm": 1.2908611297607422, + "learning_rate": 1.3578379839314647e-06, + "loss": 0.669, + "step": 289890 + }, + { + "epoch": 1.8520884709249583, + "grad_norm": 2.684487819671631, + "learning_rate": 1.3566768105860606e-06, + "loss": 0.8911, + "step": 289900 + }, + { + "epoch": 1.8521523580746968, + "grad_norm": 1.0935617685317993, + "learning_rate": 1.3555161271210037e-06, + "loss": 0.7533, + "step": 289910 + }, + { + "epoch": 1.8522162452244355, + "grad_norm": 0.655015766620636, + "learning_rate": 1.3543559335479793e-06, + "loss": 0.8738, + "step": 289920 + }, 
+ { + "epoch": 1.8522801323741742, + "grad_norm": 1.0216906070709229, + "learning_rate": 1.3531962298786838e-06, + "loss": 0.9277, + "step": 289930 + }, + { + "epoch": 1.852344019523913, + "grad_norm": 0.6868817210197449, + "learning_rate": 1.3520370161247798e-06, + "loss": 0.7341, + "step": 289940 + }, + { + "epoch": 1.8524079066736516, + "grad_norm": 0.8469321131706238, + "learning_rate": 1.3508782922979524e-06, + "loss": 0.9673, + "step": 289950 + }, + { + "epoch": 1.8524717938233903, + "grad_norm": 0.8194224238395691, + "learning_rate": 1.3497200584098645e-06, + "loss": 0.7645, + "step": 289960 + }, + { + "epoch": 1.852535680973129, + "grad_norm": 0.7928370237350464, + "learning_rate": 1.3485623144721904e-06, + "loss": 1.3606, + "step": 289970 + }, + { + "epoch": 1.8525995681228677, + "grad_norm": 1.1735436916351318, + "learning_rate": 1.347405060496576e-06, + "loss": 0.9206, + "step": 289980 + }, + { + "epoch": 1.8526634552726065, + "grad_norm": 0.6793167591094971, + "learning_rate": 1.3462482964946844e-06, + "loss": 0.6639, + "step": 289990 + }, + { + "epoch": 1.8527273424223452, + "grad_norm": 0.9222388863563538, + "learning_rate": 1.3450920224781727e-06, + "loss": 0.7729, + "step": 290000 + }, + { + "epoch": 1.8527912295720839, + "grad_norm": 1.0564346313476562, + "learning_rate": 1.3439362384586706e-06, + "loss": 1.0737, + "step": 290010 + }, + { + "epoch": 1.8528551167218226, + "grad_norm": 1.6825110912322998, + "learning_rate": 1.3427809444478246e-06, + "loss": 1.3275, + "step": 290020 + }, + { + "epoch": 1.8529190038715613, + "grad_norm": 0.9517335295677185, + "learning_rate": 1.3416261404572694e-06, + "loss": 1.2495, + "step": 290030 + }, + { + "epoch": 1.8529828910213, + "grad_norm": 0.9224988222122192, + "learning_rate": 1.340471826498635e-06, + "loss": 1.0336, + "step": 290040 + }, + { + "epoch": 1.8530467781710387, + "grad_norm": 0.8164480328559875, + "learning_rate": 1.3393180025835395e-06, + "loss": 0.7448, + "step": 290050 + }, + { + "epoch": 
1.8531106653207774, + "grad_norm": 0.8956406116485596, + "learning_rate": 1.3381646687236182e-06, + "loss": 0.7648, + "step": 290060 + }, + { + "epoch": 1.853174552470516, + "grad_norm": 1.2125521898269653, + "learning_rate": 1.337011824930473e-06, + "loss": 0.81, + "step": 290070 + }, + { + "epoch": 1.8532384396202548, + "grad_norm": 1.1064581871032715, + "learning_rate": 1.3358594712157169e-06, + "loss": 0.8545, + "step": 290080 + }, + { + "epoch": 1.8533023267699935, + "grad_norm": 0.8873088955879211, + "learning_rate": 1.334707607590957e-06, + "loss": 1.0518, + "step": 290090 + }, + { + "epoch": 1.8533662139197322, + "grad_norm": 2.381409168243408, + "learning_rate": 1.3335562340677898e-06, + "loss": 1.0322, + "step": 290100 + }, + { + "epoch": 1.853430101069471, + "grad_norm": 0.8438361883163452, + "learning_rate": 1.3324053506578226e-06, + "loss": 0.7972, + "step": 290110 + }, + { + "epoch": 1.8534939882192096, + "grad_norm": 0.9788957238197327, + "learning_rate": 1.3312549573726295e-06, + "loss": 0.8298, + "step": 290120 + }, + { + "epoch": 1.8535578753689483, + "grad_norm": 0.5684459209442139, + "learning_rate": 1.330105054223807e-06, + "loss": 0.8578, + "step": 290130 + }, + { + "epoch": 1.853621762518687, + "grad_norm": 1.5629345178604126, + "learning_rate": 1.3289556412229287e-06, + "loss": 0.9286, + "step": 290140 + }, + { + "epoch": 1.8536856496684258, + "grad_norm": 0.9215679168701172, + "learning_rate": 1.3278067183815801e-06, + "loss": 0.997, + "step": 290150 + }, + { + "epoch": 1.8537495368181642, + "grad_norm": 0.9726619124412537, + "learning_rate": 1.3266582857113185e-06, + "loss": 0.8652, + "step": 290160 + }, + { + "epoch": 1.8538134239679032, + "grad_norm": 0.964089572429657, + "learning_rate": 1.3255103432237181e-06, + "loss": 0.7237, + "step": 290170 + }, + { + "epoch": 1.8538773111176416, + "grad_norm": 0.9652983546257019, + "learning_rate": 1.3243628909303363e-06, + "loss": 0.837, + "step": 290180 + }, + { + "epoch": 1.8539411982673806, + 
"grad_norm": 0.981907308101654, + "learning_rate": 1.323215928842736e-06, + "loss": 0.9947, + "step": 290190 + }, + { + "epoch": 1.854005085417119, + "grad_norm": 0.6423254609107971, + "learning_rate": 1.322069456972458e-06, + "loss": 0.8177, + "step": 290200 + }, + { + "epoch": 1.854068972566858, + "grad_norm": 1.2880287170410156, + "learning_rate": 1.3209234753310595e-06, + "loss": 0.6577, + "step": 290210 + }, + { + "epoch": 1.8541328597165965, + "grad_norm": 1.2770487070083618, + "learning_rate": 1.319777983930065e-06, + "loss": 0.8499, + "step": 290220 + }, + { + "epoch": 1.8541967468663354, + "grad_norm": 1.0457178354263306, + "learning_rate": 1.3186329827810317e-06, + "loss": 0.9701, + "step": 290230 + }, + { + "epoch": 1.8542606340160739, + "grad_norm": 1.2421879768371582, + "learning_rate": 1.3174884718954727e-06, + "loss": 0.7487, + "step": 290240 + }, + { + "epoch": 1.8543245211658128, + "grad_norm": 0.7995020151138306, + "learning_rate": 1.3163444512849232e-06, + "loss": 1.0551, + "step": 290250 + }, + { + "epoch": 1.8543884083155513, + "grad_norm": 0.7688214182853699, + "learning_rate": 1.3152009209608963e-06, + "loss": 0.8622, + "step": 290260 + }, + { + "epoch": 1.8544522954652902, + "grad_norm": 0.9706707000732422, + "learning_rate": 1.3140578809349212e-06, + "loss": 1.0842, + "step": 290270 + }, + { + "epoch": 1.8545161826150287, + "grad_norm": 1.2777706384658813, + "learning_rate": 1.3129153312185006e-06, + "loss": 0.8934, + "step": 290280 + }, + { + "epoch": 1.8545800697647676, + "grad_norm": 1.1299149990081787, + "learning_rate": 1.3117732718231468e-06, + "loss": 0.9253, + "step": 290290 + }, + { + "epoch": 1.8546439569145061, + "grad_norm": 0.8032364845275879, + "learning_rate": 1.310631702760351e-06, + "loss": 1.0305, + "step": 290300 + }, + { + "epoch": 1.854707844064245, + "grad_norm": 1.5992950201034546, + "learning_rate": 1.309490624041615e-06, + "loss": 0.7963, + "step": 290310 + }, + { + "epoch": 1.8547717312139835, + "grad_norm": 
0.9917843341827393, + "learning_rate": 1.3083500356784405e-06, + "loss": 0.7164, + "step": 290320 + }, + { + "epoch": 1.8548356183637225, + "grad_norm": 0.9065639972686768, + "learning_rate": 1.3072099376822966e-06, + "loss": 0.9515, + "step": 290330 + }, + { + "epoch": 1.854899505513461, + "grad_norm": 0.980902373790741, + "learning_rate": 1.3060703300646848e-06, + "loss": 0.7473, + "step": 290340 + }, + { + "epoch": 1.8549633926631999, + "grad_norm": 1.5900095701217651, + "learning_rate": 1.3049312128370629e-06, + "loss": 0.8436, + "step": 290350 + }, + { + "epoch": 1.8550272798129384, + "grad_norm": 1.0598942041397095, + "learning_rate": 1.3037925860109101e-06, + "loss": 0.9816, + "step": 290360 + }, + { + "epoch": 1.8550911669626773, + "grad_norm": 1.210523247718811, + "learning_rate": 1.3026544495976955e-06, + "loss": 0.7237, + "step": 290370 + }, + { + "epoch": 1.8551550541124158, + "grad_norm": 1.4614925384521484, + "learning_rate": 1.301516803608882e-06, + "loss": 1.0234, + "step": 290380 + }, + { + "epoch": 1.8552189412621545, + "grad_norm": 0.6827429533004761, + "learning_rate": 1.3003796480559217e-06, + "loss": 0.7856, + "step": 290390 + }, + { + "epoch": 1.8552828284118932, + "grad_norm": 0.9089744687080383, + "learning_rate": 1.2992429829502772e-06, + "loss": 0.9732, + "step": 290400 + }, + { + "epoch": 1.8553467155616319, + "grad_norm": 1.2129607200622559, + "learning_rate": 1.2981068083033787e-06, + "loss": 0.7271, + "step": 290410 + }, + { + "epoch": 1.8554106027113706, + "grad_norm": 1.3265371322631836, + "learning_rate": 1.2969711241266836e-06, + "loss": 0.9427, + "step": 290420 + }, + { + "epoch": 1.8554744898611093, + "grad_norm": 1.1904284954071045, + "learning_rate": 1.2958359304316159e-06, + "loss": 0.9471, + "step": 290430 + }, + { + "epoch": 1.855538377010848, + "grad_norm": 1.0264545679092407, + "learning_rate": 1.2947012272296221e-06, + "loss": 1.1399, + "step": 290440 + }, + { + "epoch": 1.8556022641605867, + "grad_norm": 
1.146247386932373, + "learning_rate": 1.2935670145321211e-06, + "loss": 0.952, + "step": 290450 + }, + { + "epoch": 1.8556661513103254, + "grad_norm": 1.2765698432922363, + "learning_rate": 1.2924332923505367e-06, + "loss": 0.6927, + "step": 290460 + }, + { + "epoch": 1.8557300384600641, + "grad_norm": 2.164412498474121, + "learning_rate": 1.2913000606962878e-06, + "loss": 1.0325, + "step": 290470 + }, + { + "epoch": 1.8557939256098028, + "grad_norm": 1.5119715929031372, + "learning_rate": 1.2901673195807873e-06, + "loss": 0.8881, + "step": 290480 + }, + { + "epoch": 1.8558578127595415, + "grad_norm": 1.2955068349838257, + "learning_rate": 1.289035069015443e-06, + "loss": 0.8746, + "step": 290490 + }, + { + "epoch": 1.8559216999092802, + "grad_norm": 1.5162767171859741, + "learning_rate": 1.2879033090116565e-06, + "loss": 0.6947, + "step": 290500 + }, + { + "epoch": 1.855985587059019, + "grad_norm": 0.8264356255531311, + "learning_rate": 1.2867720395808247e-06, + "loss": 0.8881, + "step": 290510 + }, + { + "epoch": 1.8560494742087577, + "grad_norm": 0.6280115842819214, + "learning_rate": 1.2856412607343382e-06, + "loss": 0.9813, + "step": 290520 + }, + { + "epoch": 1.8561133613584964, + "grad_norm": 1.1387577056884766, + "learning_rate": 1.2845109724835935e-06, + "loss": 0.9298, + "step": 290530 + }, + { + "epoch": 1.856177248508235, + "grad_norm": 0.8114959597587585, + "learning_rate": 1.2833811748399593e-06, + "loss": 0.766, + "step": 290540 + }, + { + "epoch": 1.8562411356579738, + "grad_norm": 1.303884506225586, + "learning_rate": 1.2822518678148321e-06, + "loss": 0.8965, + "step": 290550 + }, + { + "epoch": 1.8563050228077125, + "grad_norm": 0.871311604976654, + "learning_rate": 1.2811230514195693e-06, + "loss": 0.6701, + "step": 290560 + }, + { + "epoch": 1.8563689099574512, + "grad_norm": 1.7185081243515015, + "learning_rate": 1.279994725665551e-06, + "loss": 0.8616, + "step": 290570 + }, + { + "epoch": 1.85643279710719, + "grad_norm": 1.096208930015564, + 
"learning_rate": 1.2788668905641287e-06, + "loss": 0.7098, + "step": 290580 + }, + { + "epoch": 1.8564966842569286, + "grad_norm": 0.938231348991394, + "learning_rate": 1.2777395461266716e-06, + "loss": 1.1072, + "step": 290590 + }, + { + "epoch": 1.8565605714066673, + "grad_norm": 1.491259217262268, + "learning_rate": 1.2766126923645205e-06, + "loss": 0.6843, + "step": 290600 + }, + { + "epoch": 1.856624458556406, + "grad_norm": 1.6792606115341187, + "learning_rate": 1.2754863292890385e-06, + "loss": 0.7934, + "step": 290610 + }, + { + "epoch": 1.8566883457061447, + "grad_norm": 1.2424652576446533, + "learning_rate": 1.2743604569115607e-06, + "loss": 1.0605, + "step": 290620 + }, + { + "epoch": 1.8567522328558834, + "grad_norm": 1.131174921989441, + "learning_rate": 1.273235075243423e-06, + "loss": 0.8315, + "step": 290630 + }, + { + "epoch": 1.8568161200056221, + "grad_norm": 1.2208259105682373, + "learning_rate": 1.272110184295966e-06, + "loss": 0.8992, + "step": 290640 + }, + { + "epoch": 1.8568800071553606, + "grad_norm": 1.2590363025665283, + "learning_rate": 1.270985784080514e-06, + "loss": 1.0244, + "step": 290650 + }, + { + "epoch": 1.8569438943050995, + "grad_norm": 0.8811933398246765, + "learning_rate": 1.2698618746083912e-06, + "loss": 0.8166, + "step": 290660 + }, + { + "epoch": 1.857007781454838, + "grad_norm": 0.8141939640045166, + "learning_rate": 1.2687384558909165e-06, + "loss": 0.9293, + "step": 290670 + }, + { + "epoch": 1.857071668604577, + "grad_norm": 1.0431616306304932, + "learning_rate": 1.2676155279394087e-06, + "loss": 0.918, + "step": 290680 + }, + { + "epoch": 1.8571355557543154, + "grad_norm": 0.7412940263748169, + "learning_rate": 1.2664930907651695e-06, + "loss": 0.8712, + "step": 290690 + }, + { + "epoch": 1.8571994429040544, + "grad_norm": 0.980758011341095, + "learning_rate": 1.2653711443795069e-06, + "loss": 0.8441, + "step": 290700 + }, + { + "epoch": 1.8572633300537928, + "grad_norm": 0.8978270888328552, + "learning_rate": 
1.2642496887937117e-06, + "loss": 0.761, + "step": 290710 + }, + { + "epoch": 1.8573272172035318, + "grad_norm": 3.439507246017456, + "learning_rate": 1.2631287240190915e-06, + "loss": 0.9548, + "step": 290720 + }, + { + "epoch": 1.8573911043532703, + "grad_norm": 0.7687893509864807, + "learning_rate": 1.2620082500669205e-06, + "loss": 0.7786, + "step": 290730 + }, + { + "epoch": 1.8574549915030092, + "grad_norm": 0.965783417224884, + "learning_rate": 1.2608882669485012e-06, + "loss": 0.7568, + "step": 290740 + }, + { + "epoch": 1.8575188786527477, + "grad_norm": 1.017303705215454, + "learning_rate": 1.2597687746750963e-06, + "loss": 1.0047, + "step": 290750 + }, + { + "epoch": 1.8575827658024866, + "grad_norm": 0.7670354247093201, + "learning_rate": 1.2586497732579916e-06, + "loss": 0.7342, + "step": 290760 + }, + { + "epoch": 1.857646652952225, + "grad_norm": 0.6807330250740051, + "learning_rate": 1.257531262708439e-06, + "loss": 0.948, + "step": 290770 + }, + { + "epoch": 1.857710540101964, + "grad_norm": 1.206899881362915, + "learning_rate": 1.2564132430377296e-06, + "loss": 0.6925, + "step": 290780 + }, + { + "epoch": 1.8577744272517025, + "grad_norm": 1.0641649961471558, + "learning_rate": 1.2552957142570986e-06, + "loss": 0.9098, + "step": 290790 + }, + { + "epoch": 1.8578383144014414, + "grad_norm": 0.9408820271492004, + "learning_rate": 1.2541786763778152e-06, + "loss": 0.7734, + "step": 290800 + }, + { + "epoch": 1.85790220155118, + "grad_norm": 1.2925841808319092, + "learning_rate": 1.2530621294111145e-06, + "loss": 0.8562, + "step": 290810 + }, + { + "epoch": 1.8579660887009188, + "grad_norm": 1.395602822303772, + "learning_rate": 1.2519460733682598e-06, + "loss": 0.7759, + "step": 290820 + }, + { + "epoch": 1.8580299758506573, + "grad_norm": 0.9474008083343506, + "learning_rate": 1.2508305082604754e-06, + "loss": 0.8571, + "step": 290830 + }, + { + "epoch": 1.8580938630003963, + "grad_norm": 0.8584325313568115, + "learning_rate": 
1.2497154340990024e-06, + "loss": 0.7673, + "step": 290840 + }, + { + "epoch": 1.8581577501501347, + "grad_norm": 1.0238312482833862, + "learning_rate": 1.2486008508950763e-06, + "loss": 0.7351, + "step": 290850 + }, + { + "epoch": 1.8582216372998737, + "grad_norm": 1.515025019645691, + "learning_rate": 1.247486758659905e-06, + "loss": 0.8174, + "step": 290860 + }, + { + "epoch": 1.8582855244496121, + "grad_norm": 1.1440699100494385, + "learning_rate": 1.2463731574047288e-06, + "loss": 0.9326, + "step": 290870 + }, + { + "epoch": 1.8583494115993509, + "grad_norm": 0.7021576762199402, + "learning_rate": 1.2452600471407449e-06, + "loss": 1.0455, + "step": 290880 + }, + { + "epoch": 1.8584132987490896, + "grad_norm": 1.4968088865280151, + "learning_rate": 1.2441474278791775e-06, + "loss": 0.7411, + "step": 290890 + }, + { + "epoch": 1.8584771858988283, + "grad_norm": 2.256848096847534, + "learning_rate": 1.2430352996312234e-06, + "loss": 0.9038, + "step": 290900 + }, + { + "epoch": 1.858541073048567, + "grad_norm": 0.9790036678314209, + "learning_rate": 1.2419236624080844e-06, + "loss": 0.9509, + "step": 290910 + }, + { + "epoch": 1.8586049601983057, + "grad_norm": 1.1517078876495361, + "learning_rate": 1.2408125162209571e-06, + "loss": 0.8685, + "step": 290920 + }, + { + "epoch": 1.8586688473480444, + "grad_norm": 1.1216986179351807, + "learning_rate": 1.239701861081033e-06, + "loss": 0.8746, + "step": 290930 + }, + { + "epoch": 1.858732734497783, + "grad_norm": 1.2162419557571411, + "learning_rate": 1.238591696999486e-06, + "loss": 0.8214, + "step": 290940 + }, + { + "epoch": 1.8587966216475218, + "grad_norm": 0.6631404161453247, + "learning_rate": 1.2374820239875129e-06, + "loss": 0.7694, + "step": 290950 + }, + { + "epoch": 1.8588605087972605, + "grad_norm": 0.6616999506950378, + "learning_rate": 1.236372842056277e-06, + "loss": 0.8894, + "step": 290960 + }, + { + "epoch": 1.8589243959469992, + "grad_norm": 0.9456769824028015, + "learning_rate": 
1.2352641512169583e-06, + "loss": 0.8276, + "step": 290970 + }, + { + "epoch": 1.858988283096738, + "grad_norm": 0.9639400839805603, + "learning_rate": 1.2341559514807144e-06, + "loss": 0.8073, + "step": 290980 + }, + { + "epoch": 1.8590521702464766, + "grad_norm": 0.9093899726867676, + "learning_rate": 1.2330482428587031e-06, + "loss": 0.9685, + "step": 290990 + }, + { + "epoch": 1.8591160573962153, + "grad_norm": 2.9453232288360596, + "learning_rate": 1.2319410253620933e-06, + "loss": 0.8626, + "step": 291000 + }, + { + "epoch": 1.859179944545954, + "grad_norm": 1.9953830242156982, + "learning_rate": 1.230834299002026e-06, + "loss": 0.9299, + "step": 291010 + }, + { + "epoch": 1.8592438316956927, + "grad_norm": 0.8659750819206238, + "learning_rate": 1.229728063789648e-06, + "loss": 0.9576, + "step": 291020 + }, + { + "epoch": 1.8593077188454314, + "grad_norm": 1.3473628759384155, + "learning_rate": 1.2286223197360947e-06, + "loss": 0.8491, + "step": 291030 + }, + { + "epoch": 1.8593716059951702, + "grad_norm": 1.3287653923034668, + "learning_rate": 1.2275170668525183e-06, + "loss": 0.9237, + "step": 291040 + }, + { + "epoch": 1.8594354931449089, + "grad_norm": 0.8199862837791443, + "learning_rate": 1.226412305150032e-06, + "loss": 1.0027, + "step": 291050 + }, + { + "epoch": 1.8594993802946476, + "grad_norm": 1.043298363685608, + "learning_rate": 1.2253080346397717e-06, + "loss": 0.9655, + "step": 291060 + }, + { + "epoch": 1.8595632674443863, + "grad_norm": 1.0660456418991089, + "learning_rate": 1.224204255332856e-06, + "loss": 0.8745, + "step": 291070 + }, + { + "epoch": 1.859627154594125, + "grad_norm": 1.1216233968734741, + "learning_rate": 1.2231009672403981e-06, + "loss": 0.9918, + "step": 291080 + }, + { + "epoch": 1.8596910417438637, + "grad_norm": 0.8022388219833374, + "learning_rate": 1.2219981703735117e-06, + "loss": 0.6397, + "step": 291090 + }, + { + "epoch": 1.8597549288936024, + "grad_norm": 0.7386797666549683, + "learning_rate": 
1.2208958647433045e-06, + "loss": 0.9875, + "step": 291100 + }, + { + "epoch": 1.859818816043341, + "grad_norm": 1.1579437255859375, + "learning_rate": 1.2197940503608728e-06, + "loss": 0.8379, + "step": 291110 + }, + { + "epoch": 1.8598827031930796, + "grad_norm": 3.6156814098358154, + "learning_rate": 1.218692727237325e-06, + "loss": 0.8989, + "step": 291120 + }, + { + "epoch": 1.8599465903428185, + "grad_norm": 0.734523355960846, + "learning_rate": 1.2175918953837296e-06, + "loss": 0.6623, + "step": 291130 + }, + { + "epoch": 1.860010477492557, + "grad_norm": 0.9780398607254028, + "learning_rate": 1.2164915548111998e-06, + "loss": 0.78, + "step": 291140 + }, + { + "epoch": 1.860074364642296, + "grad_norm": 0.7017815113067627, + "learning_rate": 1.215391705530794e-06, + "loss": 0.7613, + "step": 291150 + }, + { + "epoch": 1.8601382517920344, + "grad_norm": 0.9394316077232361, + "learning_rate": 1.2142923475535973e-06, + "loss": 0.8459, + "step": 291160 + }, + { + "epoch": 1.8602021389417733, + "grad_norm": 1.0311943292617798, + "learning_rate": 1.2131934808906898e-06, + "loss": 1.0536, + "step": 291170 + }, + { + "epoch": 1.8602660260915118, + "grad_norm": 0.7530616521835327, + "learning_rate": 1.2120951055531294e-06, + "loss": 0.987, + "step": 291180 + }, + { + "epoch": 1.8603299132412507, + "grad_norm": 0.7854243516921997, + "learning_rate": 1.2109972215519793e-06, + "loss": 0.753, + "step": 291190 + }, + { + "epoch": 1.8603938003909892, + "grad_norm": 1.3844327926635742, + "learning_rate": 1.2098998288982866e-06, + "loss": 0.9866, + "step": 291200 + }, + { + "epoch": 1.8604576875407282, + "grad_norm": 0.8715339303016663, + "learning_rate": 1.2088029276031255e-06, + "loss": 0.8878, + "step": 291210 + }, + { + "epoch": 1.8605215746904666, + "grad_norm": 0.9319707751274109, + "learning_rate": 1.2077065176775204e-06, + "loss": 0.8751, + "step": 291220 + }, + { + "epoch": 1.8605854618402056, + "grad_norm": 3.1977455615997314, + "learning_rate": 
1.2066105991325238e-06, + "loss": 1.1112, + "step": 291230 + }, + { + "epoch": 1.860649348989944, + "grad_norm": 0.8579459190368652, + "learning_rate": 1.2055151719791714e-06, + "loss": 0.8311, + "step": 291240 + }, + { + "epoch": 1.860713236139683, + "grad_norm": 0.9029688239097595, + "learning_rate": 1.2044202362284984e-06, + "loss": 0.9702, + "step": 291250 + }, + { + "epoch": 1.8607771232894215, + "grad_norm": 1.4221550226211548, + "learning_rate": 1.2033257918915185e-06, + "loss": 0.651, + "step": 291260 + }, + { + "epoch": 1.8608410104391604, + "grad_norm": 1.015944004058838, + "learning_rate": 1.202231838979273e-06, + "loss": 0.8291, + "step": 291270 + }, + { + "epoch": 1.8609048975888989, + "grad_norm": 0.8289602398872375, + "learning_rate": 1.2011383775027585e-06, + "loss": 1.0116, + "step": 291280 + }, + { + "epoch": 1.8609687847386378, + "grad_norm": 1.497787356376648, + "learning_rate": 1.200045407473005e-06, + "loss": 1.0533, + "step": 291290 + }, + { + "epoch": 1.8610326718883763, + "grad_norm": 2.19814133644104, + "learning_rate": 1.1989529289010093e-06, + "loss": 1.2857, + "step": 291300 + }, + { + "epoch": 1.8610965590381152, + "grad_norm": 0.7582222819328308, + "learning_rate": 1.1978609417977793e-06, + "loss": 0.8011, + "step": 291310 + }, + { + "epoch": 1.8611604461878537, + "grad_norm": 1.9151540994644165, + "learning_rate": 1.1967694461743063e-06, + "loss": 0.7417, + "step": 291320 + }, + { + "epoch": 1.8612243333375926, + "grad_norm": 0.6472824811935425, + "learning_rate": 1.1956784420415923e-06, + "loss": 0.8088, + "step": 291330 + }, + { + "epoch": 1.8612882204873311, + "grad_norm": 1.377766728401184, + "learning_rate": 1.1945879294106123e-06, + "loss": 0.782, + "step": 291340 + }, + { + "epoch": 1.86135210763707, + "grad_norm": 1.153731346130371, + "learning_rate": 1.193497908292357e-06, + "loss": 1.0337, + "step": 291350 + }, + { + "epoch": 1.8614159947868085, + "grad_norm": 1.072165846824646, + "learning_rate": 1.1924083786977958e-06, + 
"loss": 0.6777, + "step": 291360 + }, + { + "epoch": 1.8614798819365472, + "grad_norm": 0.8234143853187561, + "learning_rate": 1.1913193406379086e-06, + "loss": 1.0006, + "step": 291370 + }, + { + "epoch": 1.861543769086286, + "grad_norm": 1.5603644847869873, + "learning_rate": 1.1902307941236646e-06, + "loss": 1.0589, + "step": 291380 + }, + { + "epoch": 1.8616076562360246, + "grad_norm": 0.6928756237030029, + "learning_rate": 1.1891427391660215e-06, + "loss": 0.7702, + "step": 291390 + }, + { + "epoch": 1.8616715433857633, + "grad_norm": 1.0293365716934204, + "learning_rate": 1.1880551757759428e-06, + "loss": 1.0253, + "step": 291400 + }, + { + "epoch": 1.861735430535502, + "grad_norm": 0.9429540634155273, + "learning_rate": 1.18696810396437e-06, + "loss": 0.7044, + "step": 291410 + }, + { + "epoch": 1.8617993176852408, + "grad_norm": 1.1565495729446411, + "learning_rate": 1.1858815237422604e-06, + "loss": 0.8934, + "step": 291420 + }, + { + "epoch": 1.8618632048349795, + "grad_norm": 1.087480902671814, + "learning_rate": 1.1847954351205503e-06, + "loss": 0.8431, + "step": 291430 + }, + { + "epoch": 1.8619270919847182, + "grad_norm": 0.8481289744377136, + "learning_rate": 1.1837098381101919e-06, + "loss": 0.8539, + "step": 291440 + }, + { + "epoch": 1.8619909791344569, + "grad_norm": 0.8765703439712524, + "learning_rate": 1.1826247327220986e-06, + "loss": 0.8162, + "step": 291450 + }, + { + "epoch": 1.8620548662841956, + "grad_norm": 0.9616901874542236, + "learning_rate": 1.181540118967217e-06, + "loss": 0.9633, + "step": 291460 + }, + { + "epoch": 1.8621187534339343, + "grad_norm": 0.8359056711196899, + "learning_rate": 1.1804559968564498e-06, + "loss": 0.7899, + "step": 291470 + }, + { + "epoch": 1.862182640583673, + "grad_norm": 1.9986016750335693, + "learning_rate": 1.1793723664007218e-06, + "loss": 0.7305, + "step": 291480 + }, + { + "epoch": 1.8622465277334117, + "grad_norm": 1.48747980594635, + "learning_rate": 1.1782892276109625e-06, + "loss": 0.8578, + 
"step": 291490 + }, + { + "epoch": 1.8623104148831504, + "grad_norm": 1.4891974925994873, + "learning_rate": 1.177206580498058e-06, + "loss": 1.1286, + "step": 291500 + }, + { + "epoch": 1.8623743020328891, + "grad_norm": 0.9178832769393921, + "learning_rate": 1.1761244250729275e-06, + "loss": 0.6851, + "step": 291510 + }, + { + "epoch": 1.8624381891826278, + "grad_norm": 0.8637306094169617, + "learning_rate": 1.1750427613464566e-06, + "loss": 0.8119, + "step": 291520 + }, + { + "epoch": 1.8625020763323665, + "grad_norm": 0.7352100014686584, + "learning_rate": 1.173961589329553e-06, + "loss": 0.7242, + "step": 291530 + }, + { + "epoch": 1.8625659634821052, + "grad_norm": 1.4195234775543213, + "learning_rate": 1.172880909033086e-06, + "loss": 0.7602, + "step": 291540 + }, + { + "epoch": 1.862629850631844, + "grad_norm": 0.9579132199287415, + "learning_rate": 1.1718007204679582e-06, + "loss": 0.9725, + "step": 291550 + }, + { + "epoch": 1.8626937377815826, + "grad_norm": 0.652401328086853, + "learning_rate": 1.1707210236450382e-06, + "loss": 0.8777, + "step": 291560 + }, + { + "epoch": 1.8627576249313214, + "grad_norm": 1.022739052772522, + "learning_rate": 1.169641818575201e-06, + "loss": 0.8411, + "step": 291570 + }, + { + "epoch": 1.86282151208106, + "grad_norm": 1.5225722789764404, + "learning_rate": 1.1685631052693103e-06, + "loss": 0.7824, + "step": 291580 + }, + { + "epoch": 1.8628853992307988, + "grad_norm": 0.7213976383209229, + "learning_rate": 1.1674848837382402e-06, + "loss": 0.6937, + "step": 291590 + }, + { + "epoch": 1.8629492863805375, + "grad_norm": 0.5849156975746155, + "learning_rate": 1.1664071539928378e-06, + "loss": 0.85, + "step": 291600 + }, + { + "epoch": 1.863013173530276, + "grad_norm": 0.8378762602806091, + "learning_rate": 1.1653299160439667e-06, + "loss": 1.06, + "step": 291610 + }, + { + "epoch": 1.8630770606800149, + "grad_norm": 0.7279397249221802, + "learning_rate": 1.1642531699024684e-06, + "loss": 0.6376, + "step": 291620 + }, + { 
+ "epoch": 1.8631409478297534, + "grad_norm": 0.9828930497169495, + "learning_rate": 1.163176915579195e-06, + "loss": 0.9272, + "step": 291630 + }, + { + "epoch": 1.8632048349794923, + "grad_norm": 1.0489158630371094, + "learning_rate": 1.1621011530849713e-06, + "loss": 1.0979, + "step": 291640 + }, + { + "epoch": 1.8632687221292308, + "grad_norm": 0.6363334059715271, + "learning_rate": 1.1610258824306496e-06, + "loss": 0.7668, + "step": 291650 + }, + { + "epoch": 1.8633326092789697, + "grad_norm": 0.6473334431648254, + "learning_rate": 1.1599511036270383e-06, + "loss": 0.8058, + "step": 291660 + }, + { + "epoch": 1.8633964964287082, + "grad_norm": 0.8759834170341492, + "learning_rate": 1.158876816684984e-06, + "loss": 1.0041, + "step": 291670 + }, + { + "epoch": 1.8634603835784471, + "grad_norm": 0.7590951323509216, + "learning_rate": 1.1578030216152835e-06, + "loss": 0.7303, + "step": 291680 + }, + { + "epoch": 1.8635242707281856, + "grad_norm": 0.8698746562004089, + "learning_rate": 1.1567297184287618e-06, + "loss": 0.6939, + "step": 291690 + }, + { + "epoch": 1.8635881578779245, + "grad_norm": 0.7157119512557983, + "learning_rate": 1.1556569071362322e-06, + "loss": 1.029, + "step": 291700 + }, + { + "epoch": 1.863652045027663, + "grad_norm": 1.00758695602417, + "learning_rate": 1.1545845877484917e-06, + "loss": 0.8832, + "step": 291710 + }, + { + "epoch": 1.863715932177402, + "grad_norm": 4.0602288246154785, + "learning_rate": 1.153512760276343e-06, + "loss": 0.8208, + "step": 291720 + }, + { + "epoch": 1.8637798193271404, + "grad_norm": 0.9860572218894958, + "learning_rate": 1.152441424730577e-06, + "loss": 0.7314, + "step": 291730 + }, + { + "epoch": 1.8638437064768794, + "grad_norm": 1.118744134902954, + "learning_rate": 1.151370581121991e-06, + "loss": 1.145, + "step": 291740 + }, + { + "epoch": 1.8639075936266178, + "grad_norm": 1.2803094387054443, + "learning_rate": 1.150300229461354e-06, + "loss": 0.7106, + "step": 291750 + }, + { + "epoch": 
1.8639714807763568, + "grad_norm": 0.9607510566711426, + "learning_rate": 1.1492303697594632e-06, + "loss": 0.9641, + "step": 291760 + }, + { + "epoch": 1.8640353679260953, + "grad_norm": 1.4401522874832153, + "learning_rate": 1.1481610020270761e-06, + "loss": 1.2303, + "step": 291770 + }, + { + "epoch": 1.8640992550758342, + "grad_norm": 1.5572141408920288, + "learning_rate": 1.1470921262749789e-06, + "loss": 0.7103, + "step": 291780 + }, + { + "epoch": 1.8641631422255727, + "grad_norm": 0.9234890937805176, + "learning_rate": 1.1460237425139242e-06, + "loss": 0.6631, + "step": 291790 + }, + { + "epoch": 1.8642270293753116, + "grad_norm": 0.7566113471984863, + "learning_rate": 1.1449558507546754e-06, + "loss": 1.0075, + "step": 291800 + }, + { + "epoch": 1.86429091652505, + "grad_norm": 0.6989375352859497, + "learning_rate": 1.143888451007985e-06, + "loss": 0.8058, + "step": 291810 + }, + { + "epoch": 1.864354803674789, + "grad_norm": 1.4395372867584229, + "learning_rate": 1.1428215432846056e-06, + "loss": 0.8574, + "step": 291820 + }, + { + "epoch": 1.8644186908245275, + "grad_norm": 1.168811321258545, + "learning_rate": 1.1417551275952786e-06, + "loss": 0.9201, + "step": 291830 + }, + { + "epoch": 1.8644825779742664, + "grad_norm": 0.9244377017021179, + "learning_rate": 1.1406892039507511e-06, + "loss": 0.992, + "step": 291840 + }, + { + "epoch": 1.864546465124005, + "grad_norm": 0.9813092350959778, + "learning_rate": 1.1396237723617476e-06, + "loss": 0.891, + "step": 291850 + }, + { + "epoch": 1.8646103522737436, + "grad_norm": 0.8621505498886108, + "learning_rate": 1.1385588328390096e-06, + "loss": 0.7602, + "step": 291860 + }, + { + "epoch": 1.8646742394234823, + "grad_norm": 1.225386142730713, + "learning_rate": 1.1374943853932452e-06, + "loss": 0.8276, + "step": 291870 + }, + { + "epoch": 1.864738126573221, + "grad_norm": 1.4588954448699951, + "learning_rate": 1.1364304300351958e-06, + "loss": 0.9811, + "step": 291880 + }, + { + "epoch": 1.8648020137229597, 
+ "grad_norm": 0.9999317526817322, + "learning_rate": 1.1353669667755529e-06, + "loss": 0.6851, + "step": 291890 + }, + { + "epoch": 1.8648659008726984, + "grad_norm": 1.1840487718582153, + "learning_rate": 1.1343039956250467e-06, + "loss": 1.0295, + "step": 291900 + }, + { + "epoch": 1.8649297880224371, + "grad_norm": 0.9932942986488342, + "learning_rate": 1.1332415165943743e-06, + "loss": 1.1241, + "step": 291910 + }, + { + "epoch": 1.8649936751721758, + "grad_norm": 1.07947838306427, + "learning_rate": 1.1321795296942273e-06, + "loss": 0.9967, + "step": 291920 + }, + { + "epoch": 1.8650575623219146, + "grad_norm": 0.7847141623497009, + "learning_rate": 1.131118034935319e-06, + "loss": 0.797, + "step": 291930 + }, + { + "epoch": 1.8651214494716533, + "grad_norm": 0.7838201522827148, + "learning_rate": 1.1301631104418842e-06, + "loss": 0.7981, + "step": 291940 + }, + { + "epoch": 1.865185336621392, + "grad_norm": 0.748388409614563, + "learning_rate": 1.1291025507807584e-06, + "loss": 1.0643, + "step": 291950 + }, + { + "epoch": 1.8652492237711307, + "grad_norm": 0.8686913847923279, + "learning_rate": 1.1280424832918413e-06, + "loss": 0.9263, + "step": 291960 + }, + { + "epoch": 1.8653131109208694, + "grad_norm": 0.9542390704154968, + "learning_rate": 1.126982907985824e-06, + "loss": 0.7934, + "step": 291970 + }, + { + "epoch": 1.865376998070608, + "grad_norm": 0.9301961660385132, + "learning_rate": 1.125923824873365e-06, + "loss": 0.9645, + "step": 291980 + }, + { + "epoch": 1.8654408852203468, + "grad_norm": 1.346179723739624, + "learning_rate": 1.1248652339651388e-06, + "loss": 0.7489, + "step": 291990 + }, + { + "epoch": 1.8655047723700855, + "grad_norm": 0.9386990070343018, + "learning_rate": 1.1238071352717984e-06, + "loss": 0.7801, + "step": 292000 + }, + { + "epoch": 1.8655686595198242, + "grad_norm": 1.1930619478225708, + "learning_rate": 1.1227495288040013e-06, + "loss": 0.9082, + "step": 292010 + }, + { + "epoch": 1.865632546669563, + "grad_norm": 
1.2605247497558594, + "learning_rate": 1.1216924145724117e-06, + "loss": 0.701, + "step": 292020 + }, + { + "epoch": 1.8656964338193016, + "grad_norm": 0.5679419636726379, + "learning_rate": 1.1206357925876542e-06, + "loss": 0.9249, + "step": 292030 + }, + { + "epoch": 1.8657603209690403, + "grad_norm": 0.7029687166213989, + "learning_rate": 1.1195796628603927e-06, + "loss": 0.6778, + "step": 292040 + }, + { + "epoch": 1.865824208118779, + "grad_norm": 0.5767757296562195, + "learning_rate": 1.1185240254012408e-06, + "loss": 0.8903, + "step": 292050 + }, + { + "epoch": 1.8658880952685177, + "grad_norm": 0.8139119744300842, + "learning_rate": 1.1174688802208455e-06, + "loss": 1.1197, + "step": 292060 + }, + { + "epoch": 1.8659519824182564, + "grad_norm": 0.6697121262550354, + "learning_rate": 1.1164142273298262e-06, + "loss": 1.1021, + "step": 292070 + }, + { + "epoch": 1.8660158695679951, + "grad_norm": 0.991718053817749, + "learning_rate": 1.1153600667388132e-06, + "loss": 0.8899, + "step": 292080 + }, + { + "epoch": 1.8660797567177339, + "grad_norm": 1.0249316692352295, + "learning_rate": 1.114306398458409e-06, + "loss": 0.7283, + "step": 292090 + }, + { + "epoch": 1.8661436438674723, + "grad_norm": 1.8425500392913818, + "learning_rate": 1.113253222499233e-06, + "loss": 0.7682, + "step": 292100 + }, + { + "epoch": 1.8662075310172113, + "grad_norm": 0.7939163446426392, + "learning_rate": 1.112200538871888e-06, + "loss": 0.9963, + "step": 292110 + }, + { + "epoch": 1.8662714181669497, + "grad_norm": 1.0355535745620728, + "learning_rate": 1.1111483475869767e-06, + "loss": 0.6348, + "step": 292120 + }, + { + "epoch": 1.8663353053166887, + "grad_norm": 2.9539527893066406, + "learning_rate": 1.1100966486551013e-06, + "loss": 0.8386, + "step": 292130 + }, + { + "epoch": 1.8663991924664272, + "grad_norm": 0.8861859440803528, + "learning_rate": 1.1090454420868425e-06, + "loss": 0.8075, + "step": 292140 + }, + { + "epoch": 1.866463079616166, + "grad_norm": 
0.9066374897956848, + "learning_rate": 1.1079947278927971e-06, + "loss": 0.7879, + "step": 292150 + }, + { + "epoch": 1.8665269667659046, + "grad_norm": 0.9951545596122742, + "learning_rate": 1.1069445060835403e-06, + "loss": 1.1387, + "step": 292160 + }, + { + "epoch": 1.8665908539156435, + "grad_norm": 1.09367036819458, + "learning_rate": 1.1058947766696526e-06, + "loss": 0.7966, + "step": 292170 + }, + { + "epoch": 1.866654741065382, + "grad_norm": 1.531260371208191, + "learning_rate": 1.104845539661703e-06, + "loss": 0.727, + "step": 292180 + }, + { + "epoch": 1.866718628215121, + "grad_norm": 2.0904078483581543, + "learning_rate": 1.103796795070261e-06, + "loss": 0.8937, + "step": 292190 + }, + { + "epoch": 1.8667825153648594, + "grad_norm": 1.1105701923370361, + "learning_rate": 1.1027485429058847e-06, + "loss": 0.838, + "step": 292200 + }, + { + "epoch": 1.8668464025145983, + "grad_norm": 1.3237465620040894, + "learning_rate": 1.1017007831791326e-06, + "loss": 0.6458, + "step": 292210 + }, + { + "epoch": 1.8669102896643368, + "grad_norm": 0.9129582643508911, + "learning_rate": 1.1006535159005571e-06, + "loss": 0.9384, + "step": 292220 + }, + { + "epoch": 1.8669741768140757, + "grad_norm": 1.4068692922592163, + "learning_rate": 1.0996067410807053e-06, + "loss": 1.1028, + "step": 292230 + }, + { + "epoch": 1.8670380639638142, + "grad_norm": 0.8618285059928894, + "learning_rate": 1.0985604587301135e-06, + "loss": 0.9018, + "step": 292240 + }, + { + "epoch": 1.8671019511135531, + "grad_norm": 1.1460120677947998, + "learning_rate": 1.0975146688593341e-06, + "loss": 0.9585, + "step": 292250 + }, + { + "epoch": 1.8671658382632916, + "grad_norm": 0.9741784930229187, + "learning_rate": 1.0964693714788753e-06, + "loss": 0.8074, + "step": 292260 + }, + { + "epoch": 1.8672297254130306, + "grad_norm": 0.9012332558631897, + "learning_rate": 1.09542456659929e-06, + "loss": 1.1284, + "step": 292270 + }, + { + "epoch": 1.867293612562769, + "grad_norm": 1.258500337600708, + 
"learning_rate": 1.094380254231081e-06, + "loss": 0.7771, + "step": 292280 + }, + { + "epoch": 1.867357499712508, + "grad_norm": 0.8321709036827087, + "learning_rate": 1.093336434384773e-06, + "loss": 0.7485, + "step": 292290 + }, + { + "epoch": 1.8674213868622465, + "grad_norm": 0.9962365627288818, + "learning_rate": 1.0922931070708742e-06, + "loss": 0.8935, + "step": 292300 + }, + { + "epoch": 1.8674852740119854, + "grad_norm": 0.6500903964042664, + "learning_rate": 1.0912502722999042e-06, + "loss": 0.7122, + "step": 292310 + }, + { + "epoch": 1.8675491611617239, + "grad_norm": 1.1838070154190063, + "learning_rate": 1.0902079300823487e-06, + "loss": 0.8033, + "step": 292320 + }, + { + "epoch": 1.8676130483114628, + "grad_norm": 0.6644697189331055, + "learning_rate": 1.0891660804287108e-06, + "loss": 0.7257, + "step": 292330 + }, + { + "epoch": 1.8676769354612013, + "grad_norm": 0.8090490102767944, + "learning_rate": 1.0881247233494928e-06, + "loss": 0.9717, + "step": 292340 + }, + { + "epoch": 1.86774082261094, + "grad_norm": 1.0317925214767456, + "learning_rate": 1.0870838588551647e-06, + "loss": 0.8417, + "step": 292350 + }, + { + "epoch": 1.8678047097606787, + "grad_norm": 1.298757791519165, + "learning_rate": 1.086043486956223e-06, + "loss": 0.9991, + "step": 292360 + }, + { + "epoch": 1.8678685969104174, + "grad_norm": 0.9334030151367188, + "learning_rate": 1.0850036076631375e-06, + "loss": 1.0006, + "step": 292370 + }, + { + "epoch": 1.867932484060156, + "grad_norm": 1.458702564239502, + "learning_rate": 1.0839642209863831e-06, + "loss": 0.6428, + "step": 292380 + }, + { + "epoch": 1.8679963712098948, + "grad_norm": 1.0316441059112549, + "learning_rate": 1.0829253269364292e-06, + "loss": 0.7904, + "step": 292390 + }, + { + "epoch": 1.8680602583596335, + "grad_norm": 1.0853855609893799, + "learning_rate": 1.0818869255237396e-06, + "loss": 0.7114, + "step": 292400 + }, + { + "epoch": 1.8681241455093722, + "grad_norm": 1.5720373392105103, + "learning_rate": 
1.0808490167587616e-06, + "loss": 1.0899, + "step": 292410 + }, + { + "epoch": 1.868188032659111, + "grad_norm": 0.6928960084915161, + "learning_rate": 1.0798116006519587e-06, + "loss": 0.6736, + "step": 292420 + }, + { + "epoch": 1.8682519198088496, + "grad_norm": 0.898154079914093, + "learning_rate": 1.078774677213773e-06, + "loss": 0.9786, + "step": 292430 + }, + { + "epoch": 1.8683158069585883, + "grad_norm": 1.0389325618743896, + "learning_rate": 1.0777382464546571e-06, + "loss": 0.7767, + "step": 292440 + }, + { + "epoch": 1.868379694108327, + "grad_norm": 0.9077551364898682, + "learning_rate": 1.0767023083850304e-06, + "loss": 0.6231, + "step": 292450 + }, + { + "epoch": 1.8684435812580658, + "grad_norm": 0.8413649797439575, + "learning_rate": 1.0756668630153454e-06, + "loss": 0.9749, + "step": 292460 + }, + { + "epoch": 1.8685074684078045, + "grad_norm": 0.8598177433013916, + "learning_rate": 1.0746319103560109e-06, + "loss": 0.8766, + "step": 292470 + }, + { + "epoch": 1.8685713555575432, + "grad_norm": 0.6066474318504333, + "learning_rate": 1.0735974504174685e-06, + "loss": 0.7777, + "step": 292480 + }, + { + "epoch": 1.8686352427072819, + "grad_norm": 1.1965216398239136, + "learning_rate": 1.0725634832101206e-06, + "loss": 0.8804, + "step": 292490 + }, + { + "epoch": 1.8686991298570206, + "grad_norm": 0.983538031578064, + "learning_rate": 1.0715300087443925e-06, + "loss": 0.8417, + "step": 292500 + }, + { + "epoch": 1.8687630170067593, + "grad_norm": 0.6179125905036926, + "learning_rate": 1.0704970270306813e-06, + "loss": 0.9033, + "step": 292510 + }, + { + "epoch": 1.868826904156498, + "grad_norm": 1.353305459022522, + "learning_rate": 1.0694645380793956e-06, + "loss": 1.0423, + "step": 292520 + }, + { + "epoch": 1.8688907913062367, + "grad_norm": 0.8663296103477478, + "learning_rate": 1.0684325419009322e-06, + "loss": 0.688, + "step": 292530 + }, + { + "epoch": 1.8689546784559754, + "grad_norm": 1.348539113998413, + "learning_rate": 
1.0674010385056887e-06, + "loss": 0.9348, + "step": 292540 + }, + { + "epoch": 1.869018565605714, + "grad_norm": 0.8060293793678284, + "learning_rate": 1.0663700279040455e-06, + "loss": 0.6422, + "step": 292550 + }, + { + "epoch": 1.8690824527554528, + "grad_norm": 0.7460553050041199, + "learning_rate": 1.0653395101063945e-06, + "loss": 0.7276, + "step": 292560 + }, + { + "epoch": 1.8691463399051915, + "grad_norm": 0.8735922574996948, + "learning_rate": 1.0643094851231106e-06, + "loss": 0.7634, + "step": 292570 + }, + { + "epoch": 1.8692102270549302, + "grad_norm": 0.6985118389129639, + "learning_rate": 1.0632799529645577e-06, + "loss": 1.0026, + "step": 292580 + }, + { + "epoch": 1.8692741142046687, + "grad_norm": 0.7997674942016602, + "learning_rate": 1.0622509136411219e-06, + "loss": 0.8164, + "step": 292590 + }, + { + "epoch": 1.8693380013544076, + "grad_norm": 0.8839585185050964, + "learning_rate": 1.061222367163145e-06, + "loss": 1.2042, + "step": 292600 + }, + { + "epoch": 1.8694018885041461, + "grad_norm": 0.857516348361969, + "learning_rate": 1.0601943135410076e-06, + "loss": 0.9505, + "step": 292610 + }, + { + "epoch": 1.869465775653885, + "grad_norm": 0.7346429228782654, + "learning_rate": 1.0591667527850456e-06, + "loss": 1.3544, + "step": 292620 + }, + { + "epoch": 1.8695296628036235, + "grad_norm": 1.585056185722351, + "learning_rate": 1.058139684905618e-06, + "loss": 0.7689, + "step": 292630 + }, + { + "epoch": 1.8695935499533625, + "grad_norm": 1.9108575582504272, + "learning_rate": 1.0571131099130605e-06, + "loss": 1.1067, + "step": 292640 + }, + { + "epoch": 1.869657437103101, + "grad_norm": 0.894305408000946, + "learning_rate": 1.0560870278177148e-06, + "loss": 1.0915, + "step": 292650 + }, + { + "epoch": 1.8697213242528399, + "grad_norm": 0.8008806705474854, + "learning_rate": 1.0550614386299174e-06, + "loss": 0.8791, + "step": 292660 + }, + { + "epoch": 1.8697852114025784, + "grad_norm": 0.8988972306251526, + "learning_rate": 
1.0540363423599931e-06, + "loss": 0.9824, + "step": 292670 + }, + { + "epoch": 1.8698490985523173, + "grad_norm": 1.3784595727920532, + "learning_rate": 1.0530117390182724e-06, + "loss": 0.9264, + "step": 292680 + }, + { + "epoch": 1.8699129857020558, + "grad_norm": 0.7012110352516174, + "learning_rate": 1.0519876286150644e-06, + "loss": 0.7323, + "step": 292690 + }, + { + "epoch": 1.8699768728517947, + "grad_norm": 0.7183138132095337, + "learning_rate": 1.0509640111606878e-06, + "loss": 0.8072, + "step": 292700 + }, + { + "epoch": 1.8700407600015332, + "grad_norm": 1.9337592124938965, + "learning_rate": 1.0499408866654515e-06, + "loss": 1.141, + "step": 292710 + }, + { + "epoch": 1.8701046471512721, + "grad_norm": 0.8221973180770874, + "learning_rate": 1.0489182551396582e-06, + "loss": 0.663, + "step": 292720 + }, + { + "epoch": 1.8701685343010106, + "grad_norm": 0.6989620923995972, + "learning_rate": 1.0478961165936052e-06, + "loss": 0.8676, + "step": 292730 + }, + { + "epoch": 1.8702324214507495, + "grad_norm": 1.8266626596450806, + "learning_rate": 1.04687447103759e-06, + "loss": 0.7127, + "step": 292740 + }, + { + "epoch": 1.870296308600488, + "grad_norm": 0.7952519655227661, + "learning_rate": 1.0458533184818985e-06, + "loss": 0.7495, + "step": 292750 + }, + { + "epoch": 1.870360195750227, + "grad_norm": 1.0890142917633057, + "learning_rate": 1.0448326589368174e-06, + "loss": 0.959, + "step": 292760 + }, + { + "epoch": 1.8704240828999654, + "grad_norm": 0.8280913829803467, + "learning_rate": 1.0438124924126214e-06, + "loss": 0.6805, + "step": 292770 + }, + { + "epoch": 1.8704879700497044, + "grad_norm": 1.139344334602356, + "learning_rate": 1.0427928189195858e-06, + "loss": 1.0305, + "step": 292780 + }, + { + "epoch": 1.8705518571994428, + "grad_norm": 1.243733286857605, + "learning_rate": 1.04177363846798e-06, + "loss": 0.6495, + "step": 292790 + }, + { + "epoch": 1.8706157443491818, + "grad_norm": 0.7558228373527527, + "learning_rate": 
1.0407549510680737e-06, + "loss": 0.7663, + "step": 292800 + }, + { + "epoch": 1.8706796314989202, + "grad_norm": 1.4247775077819824, + "learning_rate": 1.0397367567301141e-06, + "loss": 0.9372, + "step": 292810 + }, + { + "epoch": 1.870743518648659, + "grad_norm": 2.343604803085327, + "learning_rate": 1.0387190554643656e-06, + "loss": 0.8101, + "step": 292820 + }, + { + "epoch": 1.8708074057983977, + "grad_norm": 1.535949945449829, + "learning_rate": 1.037701847281075e-06, + "loss": 0.7573, + "step": 292830 + }, + { + "epoch": 1.8708712929481364, + "grad_norm": 1.2534575462341309, + "learning_rate": 1.0366851321904846e-06, + "loss": 0.8837, + "step": 292840 + }, + { + "epoch": 1.870935180097875, + "grad_norm": 0.8451334834098816, + "learning_rate": 1.0356689102028305e-06, + "loss": 0.868, + "step": 292850 + }, + { + "epoch": 1.8709990672476138, + "grad_norm": 1.0970386266708374, + "learning_rate": 1.0346531813283488e-06, + "loss": 0.9886, + "step": 292860 + }, + { + "epoch": 1.8710629543973525, + "grad_norm": 1.1927027702331543, + "learning_rate": 1.0336379455772816e-06, + "loss": 0.8813, + "step": 292870 + }, + { + "epoch": 1.8711268415470912, + "grad_norm": 0.9931333661079407, + "learning_rate": 1.0326232029598314e-06, + "loss": 0.7571, + "step": 292880 + }, + { + "epoch": 1.87119072869683, + "grad_norm": 1.6181813478469849, + "learning_rate": 1.031608953486235e-06, + "loss": 0.7072, + "step": 292890 + }, + { + "epoch": 1.8712546158465686, + "grad_norm": 0.9007042050361633, + "learning_rate": 1.030595197166695e-06, + "loss": 0.7609, + "step": 292900 + }, + { + "epoch": 1.8713185029963073, + "grad_norm": 1.2203731536865234, + "learning_rate": 1.029581934011431e-06, + "loss": 0.9663, + "step": 292910 + }, + { + "epoch": 1.871382390146046, + "grad_norm": 1.218652606010437, + "learning_rate": 1.0285691640306405e-06, + "loss": 1.052, + "step": 292920 + }, + { + "epoch": 1.8714462772957847, + "grad_norm": 1.5266693830490112, + "learning_rate": 1.0275568872345264e-06, 
+ "loss": 0.8739, + "step": 292930 + }, + { + "epoch": 1.8715101644455234, + "grad_norm": 1.157273292541504, + "learning_rate": 1.0265451036332751e-06, + "loss": 0.7755, + "step": 292940 + }, + { + "epoch": 1.8715740515952621, + "grad_norm": 2.2461156845092773, + "learning_rate": 1.0255338132370895e-06, + "loss": 1.2531, + "step": 292950 + }, + { + "epoch": 1.8716379387450008, + "grad_norm": 1.6639330387115479, + "learning_rate": 1.0245230160561447e-06, + "loss": 0.9475, + "step": 292960 + }, + { + "epoch": 1.8717018258947395, + "grad_norm": 0.8339784145355225, + "learning_rate": 1.023512712100627e-06, + "loss": 0.6774, + "step": 292970 + }, + { + "epoch": 1.8717657130444783, + "grad_norm": 0.7059604525566101, + "learning_rate": 1.0225029013807009e-06, + "loss": 0.8867, + "step": 292980 + }, + { + "epoch": 1.871829600194217, + "grad_norm": 1.0560535192489624, + "learning_rate": 1.0214935839065465e-06, + "loss": 0.6824, + "step": 292990 + }, + { + "epoch": 1.8718934873439557, + "grad_norm": 1.1694117784500122, + "learning_rate": 1.0204847596883228e-06, + "loss": 0.825, + "step": 293000 + }, + { + "epoch": 1.8719573744936944, + "grad_norm": 3.1129536628723145, + "learning_rate": 1.019476428736188e-06, + "loss": 0.9892, + "step": 293010 + }, + { + "epoch": 1.872021261643433, + "grad_norm": 1.1729931831359863, + "learning_rate": 1.0184685910603009e-06, + "loss": 0.7504, + "step": 293020 + }, + { + "epoch": 1.8720851487931718, + "grad_norm": 1.0270169973373413, + "learning_rate": 1.0174612466708143e-06, + "loss": 0.8028, + "step": 293030 + }, + { + "epoch": 1.8721490359429105, + "grad_norm": 0.8455641269683838, + "learning_rate": 1.016454395577865e-06, + "loss": 0.7136, + "step": 293040 + }, + { + "epoch": 1.8722129230926492, + "grad_norm": 1.1827267408370972, + "learning_rate": 1.0154480377915999e-06, + "loss": 1.1107, + "step": 293050 + }, + { + "epoch": 1.872276810242388, + "grad_norm": 0.7174341082572937, + "learning_rate": 1.0144421733221499e-06, + "loss": 0.7128, 
+ "step": 293060 + }, + { + "epoch": 1.8723406973921266, + "grad_norm": 0.9067844152450562, + "learning_rate": 1.0134368021796402e-06, + "loss": 0.9257, + "step": 293070 + }, + { + "epoch": 1.872404584541865, + "grad_norm": 0.9582309126853943, + "learning_rate": 1.0124319243742075e-06, + "loss": 0.8328, + "step": 293080 + }, + { + "epoch": 1.872468471691604, + "grad_norm": 0.9209734797477722, + "learning_rate": 1.0114275399159656e-06, + "loss": 0.6514, + "step": 293090 + }, + { + "epoch": 1.8725323588413425, + "grad_norm": 2.1290807723999023, + "learning_rate": 1.0104236488150288e-06, + "loss": 0.9401, + "step": 293100 + }, + { + "epoch": 1.8725962459910814, + "grad_norm": 0.7640361189842224, + "learning_rate": 1.0094202510815054e-06, + "loss": 0.8225, + "step": 293110 + }, + { + "epoch": 1.87266013314082, + "grad_norm": 1.0006041526794434, + "learning_rate": 1.0084173467255042e-06, + "loss": 0.7502, + "step": 293120 + }, + { + "epoch": 1.8727240202905588, + "grad_norm": 0.6950806379318237, + "learning_rate": 1.0074149357571227e-06, + "loss": 0.7883, + "step": 293130 + }, + { + "epoch": 1.8727879074402973, + "grad_norm": 1.5263314247131348, + "learning_rate": 1.0064130181864584e-06, + "loss": 0.8787, + "step": 293140 + }, + { + "epoch": 1.8728517945900363, + "grad_norm": 1.1775399446487427, + "learning_rate": 1.005411594023603e-06, + "loss": 1.0736, + "step": 293150 + }, + { + "epoch": 1.8729156817397747, + "grad_norm": 1.1319595575332642, + "learning_rate": 1.0044106632786377e-06, + "loss": 0.8147, + "step": 293160 + }, + { + "epoch": 1.8729795688895137, + "grad_norm": 1.4746376276016235, + "learning_rate": 1.003410225961643e-06, + "loss": 0.8774, + "step": 293170 + }, + { + "epoch": 1.8730434560392522, + "grad_norm": 1.2940460443496704, + "learning_rate": 1.002410282082694e-06, + "loss": 0.7233, + "step": 293180 + }, + { + "epoch": 1.873107343188991, + "grad_norm": 0.9180367588996887, + "learning_rate": 1.0014108316518667e-06, + "loss": 0.7021, + "step": 293190 + 
}, + { + "epoch": 1.8731712303387296, + "grad_norm": 1.2637653350830078, + "learning_rate": 1.0004118746792136e-06, + "loss": 1.0029, + "step": 293200 + }, + { + "epoch": 1.8732351174884685, + "grad_norm": 0.6989993453025818, + "learning_rate": 9.994134111748155e-07, + "loss": 0.7655, + "step": 293210 + }, + { + "epoch": 1.873299004638207, + "grad_norm": 0.7211950421333313, + "learning_rate": 9.984154411487035e-07, + "loss": 0.9773, + "step": 293220 + }, + { + "epoch": 1.873362891787946, + "grad_norm": 0.7398399710655212, + "learning_rate": 9.974179646109527e-07, + "loss": 0.8612, + "step": 293230 + }, + { + "epoch": 1.8734267789376844, + "grad_norm": 0.6061854362487793, + "learning_rate": 9.964209815715885e-07, + "loss": 0.8147, + "step": 293240 + }, + { + "epoch": 1.8734906660874233, + "grad_norm": 2.689434289932251, + "learning_rate": 9.95424492040664e-07, + "loss": 0.7556, + "step": 293250 + }, + { + "epoch": 1.8735545532371618, + "grad_norm": 0.9904452562332153, + "learning_rate": 9.944284960282047e-07, + "loss": 0.8019, + "step": 293260 + }, + { + "epoch": 1.8736184403869007, + "grad_norm": 0.8292899131774902, + "learning_rate": 9.934329935442522e-07, + "loss": 0.8431, + "step": 293270 + }, + { + "epoch": 1.8736823275366392, + "grad_norm": 0.9797796607017517, + "learning_rate": 9.924379845988207e-07, + "loss": 0.7326, + "step": 293280 + }, + { + "epoch": 1.8737462146863781, + "grad_norm": 1.3742291927337646, + "learning_rate": 9.914434692019358e-07, + "loss": 0.8503, + "step": 293290 + }, + { + "epoch": 1.8738101018361166, + "grad_norm": 0.8166207671165466, + "learning_rate": 9.904494473636173e-07, + "loss": 1.1437, + "step": 293300 + }, + { + "epoch": 1.8738739889858553, + "grad_norm": 1.2097806930541992, + "learning_rate": 9.894559190938736e-07, + "loss": 0.9368, + "step": 293310 + }, + { + "epoch": 1.873937876135594, + "grad_norm": 0.9409804940223694, + "learning_rate": 9.88462884402702e-07, + "loss": 0.7663, + "step": 293320 + }, + { + "epoch": 
1.8740017632853327, + "grad_norm": 0.996408998966217, + "learning_rate": 9.874703433001175e-07, + "loss": 0.9313, + "step": 293330 + }, + { + "epoch": 1.8740656504350715, + "grad_norm": 2.0611023902893066, + "learning_rate": 9.86478295796106e-07, + "loss": 0.7475, + "step": 293340 + }, + { + "epoch": 1.8741295375848102, + "grad_norm": 1.0358458757400513, + "learning_rate": 9.854867419006597e-07, + "loss": 0.9663, + "step": 293350 + }, + { + "epoch": 1.8741934247345489, + "grad_norm": 0.6037928462028503, + "learning_rate": 9.84495681623765e-07, + "loss": 0.911, + "step": 293360 + }, + { + "epoch": 1.8742573118842876, + "grad_norm": 0.8664078712463379, + "learning_rate": 9.83505114975408e-07, + "loss": 0.9303, + "step": 293370 + }, + { + "epoch": 1.8743211990340263, + "grad_norm": 1.4451388120651245, + "learning_rate": 9.825150419655538e-07, + "loss": 0.8275, + "step": 293380 + }, + { + "epoch": 1.874385086183765, + "grad_norm": 1.456311583518982, + "learning_rate": 9.81525462604177e-07, + "loss": 0.8143, + "step": 293390 + }, + { + "epoch": 1.8744489733335037, + "grad_norm": 0.6796972155570984, + "learning_rate": 9.805363769012532e-07, + "loss": 0.8208, + "step": 293400 + }, + { + "epoch": 1.8745128604832424, + "grad_norm": 0.8689972162246704, + "learning_rate": 9.7954778486673e-07, + "loss": 0.8274, + "step": 293410 + }, + { + "epoch": 1.874576747632981, + "grad_norm": 1.17522132396698, + "learning_rate": 9.785596865105772e-07, + "loss": 0.8881, + "step": 293420 + }, + { + "epoch": 1.8746406347827198, + "grad_norm": 0.7953550219535828, + "learning_rate": 9.775720818427315e-07, + "loss": 0.8244, + "step": 293430 + }, + { + "epoch": 1.8747045219324585, + "grad_norm": 0.6701993942260742, + "learning_rate": 9.765849708731455e-07, + "loss": 0.7774, + "step": 293440 + }, + { + "epoch": 1.8747684090821972, + "grad_norm": 1.067297101020813, + "learning_rate": 9.755983536117618e-07, + "loss": 0.8687, + "step": 293450 + }, + { + "epoch": 1.874832296231936, + "grad_norm": 
1.085796594619751, + "learning_rate": 9.746122300685168e-07, + "loss": 0.8056, + "step": 293460 + }, + { + "epoch": 1.8748961833816746, + "grad_norm": 0.8632622957229614, + "learning_rate": 9.736266002533357e-07, + "loss": 0.9159, + "step": 293470 + }, + { + "epoch": 1.8749600705314133, + "grad_norm": 0.7865797877311707, + "learning_rate": 9.72641464176155e-07, + "loss": 0.7732, + "step": 293480 + }, + { + "epoch": 1.875023957681152, + "grad_norm": 1.2301065921783447, + "learning_rate": 9.71656821846878e-07, + "loss": 1.1233, + "step": 293490 + }, + { + "epoch": 1.8750878448308907, + "grad_norm": 0.8430875539779663, + "learning_rate": 9.706726732754413e-07, + "loss": 0.8162, + "step": 293500 + }, + { + "epoch": 1.8751517319806295, + "grad_norm": 1.114546775817871, + "learning_rate": 9.696890184717478e-07, + "loss": 1.1276, + "step": 293510 + }, + { + "epoch": 1.8752156191303682, + "grad_norm": 0.7824980616569519, + "learning_rate": 9.687058574457008e-07, + "loss": 0.7735, + "step": 293520 + }, + { + "epoch": 1.8752795062801069, + "grad_norm": 0.917863667011261, + "learning_rate": 9.677231902072037e-07, + "loss": 0.918, + "step": 293530 + }, + { + "epoch": 1.8753433934298456, + "grad_norm": 0.9983965754508972, + "learning_rate": 9.66741016766154e-07, + "loss": 0.6978, + "step": 293540 + }, + { + "epoch": 1.875407280579584, + "grad_norm": 1.4215760231018066, + "learning_rate": 9.657593371324437e-07, + "loss": 0.9128, + "step": 293550 + }, + { + "epoch": 1.875471167729323, + "grad_norm": 1.064919352531433, + "learning_rate": 9.647781513159538e-07, + "loss": 0.7519, + "step": 293560 + }, + { + "epoch": 1.8755350548790615, + "grad_norm": 1.0606855154037476, + "learning_rate": 9.637974593265708e-07, + "loss": 0.9517, + "step": 293570 + }, + { + "epoch": 1.8755989420288004, + "grad_norm": 1.0682787895202637, + "learning_rate": 9.628172611741647e-07, + "loss": 0.8493, + "step": 293580 + }, + { + "epoch": 1.8756628291785389, + "grad_norm": 1.3362362384796143, + 
"learning_rate": 9.618375568686222e-07, + "loss": 0.7255, + "step": 293590 + }, + { + "epoch": 1.8757267163282778, + "grad_norm": 1.1866968870162964, + "learning_rate": 9.608583464197907e-07, + "loss": 0.9001, + "step": 293600 + }, + { + "epoch": 1.8757906034780163, + "grad_norm": 0.8186989426612854, + "learning_rate": 9.598796298375456e-07, + "loss": 0.8377, + "step": 293610 + }, + { + "epoch": 1.8758544906277552, + "grad_norm": 1.4240480661392212, + "learning_rate": 9.589014071317348e-07, + "loss": 0.6917, + "step": 293620 + }, + { + "epoch": 1.8759183777774937, + "grad_norm": 0.9501568675041199, + "learning_rate": 9.579236783122169e-07, + "loss": 0.754, + "step": 293630 + }, + { + "epoch": 1.8759822649272326, + "grad_norm": 1.0874764919281006, + "learning_rate": 9.569464433888342e-07, + "loss": 0.9862, + "step": 293640 + }, + { + "epoch": 1.8760461520769711, + "grad_norm": 1.2083163261413574, + "learning_rate": 9.559697023714286e-07, + "loss": 0.7187, + "step": 293650 + }, + { + "epoch": 1.87611003922671, + "grad_norm": 0.9831110239028931, + "learning_rate": 9.54993455269837e-07, + "loss": 0.9622, + "step": 293660 + }, + { + "epoch": 1.8761739263764485, + "grad_norm": 1.3009384870529175, + "learning_rate": 9.540177020938902e-07, + "loss": 0.8913, + "step": 293670 + }, + { + "epoch": 1.8762378135261875, + "grad_norm": 1.0569088459014893, + "learning_rate": 9.530424428534135e-07, + "loss": 0.7587, + "step": 293680 + }, + { + "epoch": 1.876301700675926, + "grad_norm": 1.2314594984054565, + "learning_rate": 9.520676775582382e-07, + "loss": 0.8637, + "step": 293690 + }, + { + "epoch": 1.8763655878256649, + "grad_norm": 0.889288604259491, + "learning_rate": 9.510934062181675e-07, + "loss": 0.7726, + "step": 293700 + }, + { + "epoch": 1.8764294749754034, + "grad_norm": 0.7331340312957764, + "learning_rate": 9.501196288430215e-07, + "loss": 0.6647, + "step": 293710 + }, + { + "epoch": 1.8764933621251423, + "grad_norm": 2.036623239517212, + "learning_rate": 
9.491463454426086e-07, + "loss": 0.8645, + "step": 293720 + }, + { + "epoch": 1.8765572492748808, + "grad_norm": 1.0206176042556763, + "learning_rate": 9.481735560267213e-07, + "loss": 0.8772, + "step": 293730 + }, + { + "epoch": 1.8766211364246197, + "grad_norm": 2.132143259048462, + "learning_rate": 9.472012606051683e-07, + "loss": 0.936, + "step": 293740 + }, + { + "epoch": 1.8766850235743582, + "grad_norm": 1.135907769203186, + "learning_rate": 9.462294591877307e-07, + "loss": 0.8672, + "step": 293750 + }, + { + "epoch": 1.876748910724097, + "grad_norm": 0.9503557682037354, + "learning_rate": 9.452581517842008e-07, + "loss": 0.9258, + "step": 293760 + }, + { + "epoch": 1.8768127978738356, + "grad_norm": 1.0329943895339966, + "learning_rate": 9.442873384043594e-07, + "loss": 0.6602, + "step": 293770 + }, + { + "epoch": 1.8768766850235745, + "grad_norm": 1.0028157234191895, + "learning_rate": 9.433170190579876e-07, + "loss": 0.9158, + "step": 293780 + }, + { + "epoch": 1.876940572173313, + "grad_norm": 0.954515814781189, + "learning_rate": 9.4234719375485e-07, + "loss": 0.8213, + "step": 293790 + }, + { + "epoch": 1.8770044593230517, + "grad_norm": 0.5199512839317322, + "learning_rate": 9.413778625047165e-07, + "loss": 0.871, + "step": 293800 + }, + { + "epoch": 1.8770683464727904, + "grad_norm": 1.4453988075256348, + "learning_rate": 9.404090253173514e-07, + "loss": 0.8009, + "step": 293810 + }, + { + "epoch": 1.8771322336225291, + "grad_norm": 0.8859098553657532, + "learning_rate": 9.394406822025081e-07, + "loss": 0.8949, + "step": 293820 + }, + { + "epoch": 1.8771961207722678, + "grad_norm": 4.4526047706604, + "learning_rate": 9.384728331699399e-07, + "loss": 0.869, + "step": 293830 + }, + { + "epoch": 1.8772600079220065, + "grad_norm": 0.8694409728050232, + "learning_rate": 9.375054782294001e-07, + "loss": 0.8192, + "step": 293840 + }, + { + "epoch": 1.8773238950717452, + "grad_norm": 1.8180179595947266, + "learning_rate": 9.365386173906199e-07, + "loss": 
0.9313, + "step": 293850 + }, + { + "epoch": 1.877387782221484, + "grad_norm": 0.6987054347991943, + "learning_rate": 9.355722506633469e-07, + "loss": 0.9417, + "step": 293860 + }, + { + "epoch": 1.8774516693712227, + "grad_norm": 1.0461370944976807, + "learning_rate": 9.346063780573011e-07, + "loss": 0.7896, + "step": 293870 + }, + { + "epoch": 1.8775155565209614, + "grad_norm": 1.137550711631775, + "learning_rate": 9.336409995822193e-07, + "loss": 0.9223, + "step": 293880 + }, + { + "epoch": 1.8775794436707, + "grad_norm": 1.02553129196167, + "learning_rate": 9.326761152478214e-07, + "loss": 0.7892, + "step": 293890 + }, + { + "epoch": 1.8776433308204388, + "grad_norm": 1.0381640195846558, + "learning_rate": 9.317117250638274e-07, + "loss": 1.0834, + "step": 293900 + }, + { + "epoch": 1.8777072179701775, + "grad_norm": 0.9272412061691284, + "learning_rate": 9.307478290399408e-07, + "loss": 0.9732, + "step": 293910 + }, + { + "epoch": 1.8777711051199162, + "grad_norm": 1.2473331689834595, + "learning_rate": 9.297844271858758e-07, + "loss": 0.962, + "step": 293920 + }, + { + "epoch": 1.877834992269655, + "grad_norm": 1.2069578170776367, + "learning_rate": 9.288215195113359e-07, + "loss": 1.2623, + "step": 293930 + }, + { + "epoch": 1.8778988794193936, + "grad_norm": 1.2229667901992798, + "learning_rate": 9.278591060260134e-07, + "loss": 0.6933, + "step": 293940 + }, + { + "epoch": 1.8779627665691323, + "grad_norm": 0.9840774536132812, + "learning_rate": 9.268971867396114e-07, + "loss": 0.6754, + "step": 293950 + }, + { + "epoch": 1.878026653718871, + "grad_norm": 1.087142825126648, + "learning_rate": 9.259357616618003e-07, + "loss": 0.8483, + "step": 293960 + }, + { + "epoch": 1.8780905408686097, + "grad_norm": 1.1667194366455078, + "learning_rate": 9.249748308022721e-07, + "loss": 0.8202, + "step": 293970 + }, + { + "epoch": 1.8781544280183484, + "grad_norm": 0.888049304485321, + "learning_rate": 9.240143941707024e-07, + "loss": 1.299, + "step": 293980 + }, + { + 
"epoch": 1.8782183151680871, + "grad_norm": 1.1439584493637085, + "learning_rate": 9.230544517767726e-07, + "loss": 0.5954, + "step": 293990 + }, + { + "epoch": 1.8782822023178258, + "grad_norm": 0.981224000453949, + "learning_rate": 9.220950036301302e-07, + "loss": 0.8226, + "step": 294000 + }, + { + "epoch": 1.8783460894675645, + "grad_norm": 1.2684990167617798, + "learning_rate": 9.21136049740462e-07, + "loss": 0.9423, + "step": 294010 + }, + { + "epoch": 1.8784099766173032, + "grad_norm": 1.2848501205444336, + "learning_rate": 9.201775901174048e-07, + "loss": 0.7289, + "step": 294020 + }, + { + "epoch": 1.878473863767042, + "grad_norm": 0.8658862709999084, + "learning_rate": 9.192196247706231e-07, + "loss": 0.8491, + "step": 294030 + }, + { + "epoch": 1.8785377509167804, + "grad_norm": 1.240254282951355, + "learning_rate": 9.182621537097591e-07, + "loss": 0.9544, + "step": 294040 + }, + { + "epoch": 1.8786016380665194, + "grad_norm": 1.1382461786270142, + "learning_rate": 9.173051769444552e-07, + "loss": 0.8901, + "step": 294050 + }, + { + "epoch": 1.8786655252162578, + "grad_norm": 1.0991541147232056, + "learning_rate": 9.163486944843536e-07, + "loss": 0.7114, + "step": 294060 + }, + { + "epoch": 1.8787294123659968, + "grad_norm": 1.1713433265686035, + "learning_rate": 9.1539270633908e-07, + "loss": 0.7156, + "step": 294070 + }, + { + "epoch": 1.8787932995157353, + "grad_norm": 1.8868058919906616, + "learning_rate": 9.14437212518271e-07, + "loss": 0.9405, + "step": 294080 + }, + { + "epoch": 1.8788571866654742, + "grad_norm": 1.1627702713012695, + "learning_rate": 9.134822130315413e-07, + "loss": 1.0747, + "step": 294090 + }, + { + "epoch": 1.8789210738152127, + "grad_norm": 1.2614167928695679, + "learning_rate": 9.125277078885164e-07, + "loss": 0.8957, + "step": 294100 + }, + { + "epoch": 1.8789849609649516, + "grad_norm": 1.0662238597869873, + "learning_rate": 9.115736970987943e-07, + "loss": 0.6811, + "step": 294110 + }, + { + "epoch": 1.87904884811469, + 
"grad_norm": 1.1519864797592163, + "learning_rate": 9.106201806720005e-07, + "loss": 1.1223, + "step": 294120 + }, + { + "epoch": 1.879112735264429, + "grad_norm": 1.1450780630111694, + "learning_rate": 9.096671586177274e-07, + "loss": 0.8483, + "step": 294130 + }, + { + "epoch": 1.8791766224141675, + "grad_norm": 0.6603822708129883, + "learning_rate": 9.087146309455786e-07, + "loss": 0.6462, + "step": 294140 + }, + { + "epoch": 1.8792405095639064, + "grad_norm": 0.8664895296096802, + "learning_rate": 9.077625976651349e-07, + "loss": 0.7966, + "step": 294150 + }, + { + "epoch": 1.879304396713645, + "grad_norm": 1.2432823181152344, + "learning_rate": 9.068110587860001e-07, + "loss": 0.9826, + "step": 294160 + }, + { + "epoch": 1.8793682838633838, + "grad_norm": 0.8814300894737244, + "learning_rate": 9.058600143177498e-07, + "loss": 0.9677, + "step": 294170 + }, + { + "epoch": 1.8794321710131223, + "grad_norm": 1.6960625648498535, + "learning_rate": 9.050044970255411e-07, + "loss": 0.8404, + "step": 294180 + }, + { + "epoch": 1.8794960581628612, + "grad_norm": 2.7894229888916016, + "learning_rate": 9.04054391964354e-07, + "loss": 0.937, + "step": 294190 + }, + { + "epoch": 1.8795599453125997, + "grad_norm": 1.0094927549362183, + "learning_rate": 9.031047813418125e-07, + "loss": 0.7125, + "step": 294200 + }, + { + "epoch": 1.8796238324623387, + "grad_norm": 1.201365351676941, + "learning_rate": 9.021556651674812e-07, + "loss": 0.8694, + "step": 294210 + }, + { + "epoch": 1.8796877196120771, + "grad_norm": 3.078171730041504, + "learning_rate": 9.012070434509134e-07, + "loss": 0.7851, + "step": 294220 + }, + { + "epoch": 1.879751606761816, + "grad_norm": 1.129648208618164, + "learning_rate": 9.002589162016684e-07, + "loss": 0.9309, + "step": 294230 + }, + { + "epoch": 1.8798154939115546, + "grad_norm": 1.0142879486083984, + "learning_rate": 8.993112834292938e-07, + "loss": 0.7422, + "step": 294240 + }, + { + "epoch": 1.8798793810612935, + "grad_norm": 
0.6870994567871094, + "learning_rate": 8.983641451433378e-07, + "loss": 0.77, + "step": 294250 + }, + { + "epoch": 1.879943268211032, + "grad_norm": 0.812430202960968, + "learning_rate": 8.97417501353326e-07, + "loss": 1.0045, + "step": 294260 + }, + { + "epoch": 1.880007155360771, + "grad_norm": 0.6833242774009705, + "learning_rate": 8.964713520688061e-07, + "loss": 0.9386, + "step": 294270 + }, + { + "epoch": 1.8800710425105094, + "grad_norm": 1.2980430126190186, + "learning_rate": 8.955256972992931e-07, + "loss": 0.9414, + "step": 294280 + }, + { + "epoch": 1.880134929660248, + "grad_norm": 1.0034370422363281, + "learning_rate": 8.945805370543292e-07, + "loss": 0.8657, + "step": 294290 + }, + { + "epoch": 1.8801988168099868, + "grad_norm": 1.363470435142517, + "learning_rate": 8.936358713434124e-07, + "loss": 0.8611, + "step": 294300 + }, + { + "epoch": 1.8802627039597255, + "grad_norm": 0.93649822473526, + "learning_rate": 8.926917001760682e-07, + "loss": 0.9878, + "step": 294310 + }, + { + "epoch": 1.8803265911094642, + "grad_norm": 1.797559142112732, + "learning_rate": 8.917480235618003e-07, + "loss": 0.8329, + "step": 294320 + }, + { + "epoch": 1.880390478259203, + "grad_norm": 1.6403647661209106, + "learning_rate": 8.908048415101178e-07, + "loss": 0.7079, + "step": 294330 + }, + { + "epoch": 1.8804543654089416, + "grad_norm": 0.9935147166252136, + "learning_rate": 8.89862154030513e-07, + "loss": 0.8007, + "step": 294340 + }, + { + "epoch": 1.8805182525586803, + "grad_norm": 1.2044225931167603, + "learning_rate": 8.889199611324783e-07, + "loss": 1.0715, + "step": 294350 + }, + { + "epoch": 1.880582139708419, + "grad_norm": 0.8857986330986023, + "learning_rate": 8.879782628255173e-07, + "loss": 0.695, + "step": 294360 + }, + { + "epoch": 1.8806460268581577, + "grad_norm": 0.9867647290229797, + "learning_rate": 8.870370591190946e-07, + "loss": 0.8391, + "step": 294370 + }, + { + "epoch": 1.8807099140078964, + "grad_norm": 0.795604407787323, + "learning_rate": 
8.860963500227027e-07, + "loss": 0.7813, + "step": 294380 + }, + { + "epoch": 1.8807738011576352, + "grad_norm": 1.4664380550384521, + "learning_rate": 8.85156135545806e-07, + "loss": 1.0997, + "step": 294390 + }, + { + "epoch": 1.8808376883073739, + "grad_norm": 0.8081570863723755, + "learning_rate": 8.842164156978861e-07, + "loss": 0.8407, + "step": 294400 + }, + { + "epoch": 1.8809015754571126, + "grad_norm": 0.9665111303329468, + "learning_rate": 8.832771904883851e-07, + "loss": 0.6138, + "step": 294410 + }, + { + "epoch": 1.8809654626068513, + "grad_norm": 0.6109456419944763, + "learning_rate": 8.823384599267848e-07, + "loss": 0.7707, + "step": 294420 + }, + { + "epoch": 1.88102934975659, + "grad_norm": 1.4124689102172852, + "learning_rate": 8.814002240225272e-07, + "loss": 1.1025, + "step": 294430 + }, + { + "epoch": 1.8810932369063287, + "grad_norm": 0.8159995675086975, + "learning_rate": 8.804624827850605e-07, + "loss": 1.0161, + "step": 294440 + }, + { + "epoch": 1.8811571240560674, + "grad_norm": 1.37120521068573, + "learning_rate": 8.795252362238327e-07, + "loss": 0.9202, + "step": 294450 + }, + { + "epoch": 1.881221011205806, + "grad_norm": 0.7169210910797119, + "learning_rate": 8.785884843482806e-07, + "loss": 0.9267, + "step": 294460 + }, + { + "epoch": 1.8812848983555448, + "grad_norm": 2.554936408996582, + "learning_rate": 8.776522271678356e-07, + "loss": 1.1622, + "step": 294470 + }, + { + "epoch": 1.8813487855052835, + "grad_norm": 1.1686550378799438, + "learning_rate": 8.767164646919346e-07, + "loss": 0.7131, + "step": 294480 + }, + { + "epoch": 1.8814126726550222, + "grad_norm": 0.7276962399482727, + "learning_rate": 8.757811969299923e-07, + "loss": 0.8749, + "step": 294490 + }, + { + "epoch": 1.881476559804761, + "grad_norm": 1.421937108039856, + "learning_rate": 8.748464238914344e-07, + "loss": 1.047, + "step": 294500 + }, + { + "epoch": 1.8815404469544996, + "grad_norm": 0.8149546980857849, + "learning_rate": 8.739121455856703e-07, + "loss": 
0.7673, + "step": 294510 + }, + { + "epoch": 1.8816043341042383, + "grad_norm": 1.083941102027893, + "learning_rate": 8.729783620221143e-07, + "loss": 0.5612, + "step": 294520 + }, + { + "epoch": 1.8816682212539768, + "grad_norm": 1.2710938453674316, + "learning_rate": 8.720450732101649e-07, + "loss": 0.8329, + "step": 294530 + }, + { + "epoch": 1.8817321084037157, + "grad_norm": 0.7315378189086914, + "learning_rate": 8.711122791592252e-07, + "loss": 0.6684, + "step": 294540 + }, + { + "epoch": 1.8817959955534542, + "grad_norm": 1.0665138959884644, + "learning_rate": 8.701799798786825e-07, + "loss": 0.9872, + "step": 294550 + }, + { + "epoch": 1.8818598827031932, + "grad_norm": 0.810028076171875, + "learning_rate": 8.692481753779347e-07, + "loss": 0.8548, + "step": 294560 + }, + { + "epoch": 1.8819237698529316, + "grad_norm": 0.8060693740844727, + "learning_rate": 8.683168656663631e-07, + "loss": 0.8849, + "step": 294570 + }, + { + "epoch": 1.8819876570026706, + "grad_norm": 1.5116766691207886, + "learning_rate": 8.673860507533437e-07, + "loss": 0.9512, + "step": 294580 + }, + { + "epoch": 1.882051544152409, + "grad_norm": 1.0401395559310913, + "learning_rate": 8.664557306482523e-07, + "loss": 0.7837, + "step": 294590 + }, + { + "epoch": 1.882115431302148, + "grad_norm": 0.8519911170005798, + "learning_rate": 8.655259053604592e-07, + "loss": 0.8609, + "step": 294600 + }, + { + "epoch": 1.8821793184518865, + "grad_norm": 1.1604868173599243, + "learning_rate": 8.64596574899329e-07, + "loss": 0.717, + "step": 294610 + }, + { + "epoch": 1.8822432056016254, + "grad_norm": 1.0016506910324097, + "learning_rate": 8.636677392742154e-07, + "loss": 1.0582, + "step": 294620 + }, + { + "epoch": 1.8823070927513639, + "grad_norm": 0.9217401742935181, + "learning_rate": 8.627393984944776e-07, + "loss": 0.6711, + "step": 294630 + }, + { + "epoch": 1.8823709799011028, + "grad_norm": 0.6310465931892395, + "learning_rate": 8.618115525694637e-07, + "loss": 1.0225, + "step": 294640 + }, 
+ { + "epoch": 1.8824348670508413, + "grad_norm": 1.1562144756317139, + "learning_rate": 8.608842015085217e-07, + "loss": 0.8545, + "step": 294650 + }, + { + "epoch": 1.8824987542005802, + "grad_norm": 2.5376696586608887, + "learning_rate": 8.599573453209886e-07, + "loss": 1.0066, + "step": 294660 + }, + { + "epoch": 1.8825626413503187, + "grad_norm": 0.8767745494842529, + "learning_rate": 8.590309840161903e-07, + "loss": 1.0221, + "step": 294670 + }, + { + "epoch": 1.8826265285000576, + "grad_norm": 0.9438298940658569, + "learning_rate": 8.581051176034694e-07, + "loss": 0.7347, + "step": 294680 + }, + { + "epoch": 1.8826904156497961, + "grad_norm": 0.8803073763847351, + "learning_rate": 8.571797460921349e-07, + "loss": 1.154, + "step": 294690 + }, + { + "epoch": 1.882754302799535, + "grad_norm": 1.1712591648101807, + "learning_rate": 8.56254869491524e-07, + "loss": 1.035, + "step": 294700 + }, + { + "epoch": 1.8828181899492735, + "grad_norm": 0.6817288994789124, + "learning_rate": 8.553304878109347e-07, + "loss": 0.8175, + "step": 294710 + }, + { + "epoch": 1.8828820770990125, + "grad_norm": 0.8393722176551819, + "learning_rate": 8.54406601059693e-07, + "loss": 0.8887, + "step": 294720 + }, + { + "epoch": 1.882945964248751, + "grad_norm": 0.9503065347671509, + "learning_rate": 8.534832092470857e-07, + "loss": 0.9214, + "step": 294730 + }, + { + "epoch": 1.8830098513984899, + "grad_norm": 0.6335180997848511, + "learning_rate": 8.525603123824222e-07, + "loss": 0.7857, + "step": 294740 + }, + { + "epoch": 1.8830737385482283, + "grad_norm": 0.7557439804077148, + "learning_rate": 8.51637910474995e-07, + "loss": 0.8171, + "step": 294750 + }, + { + "epoch": 1.8831376256979673, + "grad_norm": 0.79386967420578, + "learning_rate": 8.507160035340966e-07, + "loss": 0.7434, + "step": 294760 + }, + { + "epoch": 1.8832015128477058, + "grad_norm": 0.8553667664527893, + "learning_rate": 8.497945915690031e-07, + "loss": 0.9031, + "step": 294770 + }, + { + "epoch": 
1.8832653999974445, + "grad_norm": 1.2695742845535278, + "learning_rate": 8.488736745890013e-07, + "loss": 1.095, + "step": 294780 + }, + { + "epoch": 1.8833292871471832, + "grad_norm": 1.114892601966858, + "learning_rate": 8.479532526033618e-07, + "loss": 0.8603, + "step": 294790 + }, + { + "epoch": 1.8833931742969219, + "grad_norm": 1.5279147624969482, + "learning_rate": 8.470333256213603e-07, + "loss": 0.808, + "step": 294800 + }, + { + "epoch": 1.8834570614466606, + "grad_norm": 0.6107496023178101, + "learning_rate": 8.461138936522506e-07, + "loss": 0.8576, + "step": 294810 + }, + { + "epoch": 1.8835209485963993, + "grad_norm": 0.7857877016067505, + "learning_rate": 8.45194956705303e-07, + "loss": 0.9546, + "step": 294820 + }, + { + "epoch": 1.883584835746138, + "grad_norm": 1.1471401453018188, + "learning_rate": 8.442765147897657e-07, + "loss": 0.7312, + "step": 294830 + }, + { + "epoch": 1.8836487228958767, + "grad_norm": 0.9037813544273376, + "learning_rate": 8.433585679148926e-07, + "loss": 0.8613, + "step": 294840 + }, + { + "epoch": 1.8837126100456154, + "grad_norm": 0.8556420207023621, + "learning_rate": 8.424411160899204e-07, + "loss": 0.8962, + "step": 294850 + }, + { + "epoch": 1.8837764971953541, + "grad_norm": 0.6740580201148987, + "learning_rate": 8.415241593240974e-07, + "loss": 1.1488, + "step": 294860 + }, + { + "epoch": 1.8838403843450928, + "grad_norm": 0.9400308728218079, + "learning_rate": 8.406076976266497e-07, + "loss": 0.6987, + "step": 294870 + }, + { + "epoch": 1.8839042714948315, + "grad_norm": 1.5534549951553345, + "learning_rate": 8.396917310068086e-07, + "loss": 0.8772, + "step": 294880 + }, + { + "epoch": 1.8839681586445702, + "grad_norm": 1.0593148469924927, + "learning_rate": 8.387762594738114e-07, + "loss": 0.6944, + "step": 294890 + }, + { + "epoch": 1.884032045794309, + "grad_norm": 0.9874648451805115, + "learning_rate": 8.378612830368615e-07, + "loss": 0.7625, + "step": 294900 + }, + { + "epoch": 1.8840959329440476, + 
"grad_norm": 0.998270571231842, + "learning_rate": 8.369468017051796e-07, + "loss": 0.8599, + "step": 294910 + }, + { + "epoch": 1.8841598200937864, + "grad_norm": 1.26828134059906, + "learning_rate": 8.360328154879749e-07, + "loss": 0.9925, + "step": 294920 + }, + { + "epoch": 1.884223707243525, + "grad_norm": 0.8565728068351746, + "learning_rate": 8.351193243944566e-07, + "loss": 0.8852, + "step": 294930 + }, + { + "epoch": 1.8842875943932638, + "grad_norm": 0.6440544724464417, + "learning_rate": 8.342063284338175e-07, + "loss": 0.7831, + "step": 294940 + }, + { + "epoch": 1.8843514815430025, + "grad_norm": 0.646582841873169, + "learning_rate": 8.332938276152613e-07, + "loss": 0.9184, + "step": 294950 + }, + { + "epoch": 1.8844153686927412, + "grad_norm": 0.6677904725074768, + "learning_rate": 8.32381821947964e-07, + "loss": 0.73, + "step": 294960 + }, + { + "epoch": 1.8844792558424799, + "grad_norm": 0.7851777076721191, + "learning_rate": 8.314703114411182e-07, + "loss": 1.1174, + "step": 294970 + }, + { + "epoch": 1.8845431429922186, + "grad_norm": 1.0177836418151855, + "learning_rate": 8.305592961039055e-07, + "loss": 0.7214, + "step": 294980 + }, + { + "epoch": 1.8846070301419573, + "grad_norm": 1.17615807056427, + "learning_rate": 8.296487759455019e-07, + "loss": 0.8174, + "step": 294990 + }, + { + "epoch": 1.884670917291696, + "grad_norm": 2.1096794605255127, + "learning_rate": 8.287387509750666e-07, + "loss": 1.018, + "step": 295000 + }, + { + "epoch": 1.8847348044414347, + "grad_norm": 1.9853860139846802, + "learning_rate": 8.278292212017758e-07, + "loss": 0.6983, + "step": 295010 + }, + { + "epoch": 1.8847986915911732, + "grad_norm": 0.6836848855018616, + "learning_rate": 8.269201866347831e-07, + "loss": 0.8722, + "step": 295020 + }, + { + "epoch": 1.8848625787409121, + "grad_norm": 0.900377631187439, + "learning_rate": 8.260116472832479e-07, + "loss": 0.7465, + "step": 295030 + }, + { + "epoch": 1.8849264658906506, + "grad_norm": 0.8947538733482361, + 
"learning_rate": 8.251036031563075e-07, + "loss": 1.0633, + "step": 295040 + }, + { + "epoch": 1.8849903530403895, + "grad_norm": 0.8567830324172974, + "learning_rate": 8.241960542631266e-07, + "loss": 0.966, + "step": 295050 + }, + { + "epoch": 1.885054240190128, + "grad_norm": 0.9220252633094788, + "learning_rate": 8.232890006128313e-07, + "loss": 0.9313, + "step": 295060 + }, + { + "epoch": 1.885118127339867, + "grad_norm": 3.402864456176758, + "learning_rate": 8.223824422145587e-07, + "loss": 0.8671, + "step": 295070 + }, + { + "epoch": 1.8851820144896054, + "grad_norm": 0.9252503514289856, + "learning_rate": 8.214763790774405e-07, + "loss": 0.7996, + "step": 295080 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 1.2209160327911377, + "learning_rate": 8.20570811210597e-07, + "loss": 0.6249, + "step": 295090 + }, + { + "epoch": 1.8853097887890828, + "grad_norm": 1.0366135835647583, + "learning_rate": 8.196657386231543e-07, + "loss": 0.8923, + "step": 295100 + }, + { + "epoch": 1.8853736759388218, + "grad_norm": 1.104676365852356, + "learning_rate": 8.187611613242274e-07, + "loss": 0.748, + "step": 295110 + }, + { + "epoch": 1.8854375630885603, + "grad_norm": 1.3151907920837402, + "learning_rate": 8.178570793229201e-07, + "loss": 0.7379, + "step": 295120 + }, + { + "epoch": 1.8855014502382992, + "grad_norm": 0.8398836851119995, + "learning_rate": 8.169534926283418e-07, + "loss": 1.088, + "step": 295130 + }, + { + "epoch": 1.8855653373880377, + "grad_norm": 3.185319423675537, + "learning_rate": 8.160504012495906e-07, + "loss": 0.9349, + "step": 295140 + }, + { + "epoch": 1.8856292245377766, + "grad_norm": 0.7827329039573669, + "learning_rate": 8.15147805195765e-07, + "loss": 0.7621, + "step": 295150 + }, + { + "epoch": 1.885693111687515, + "grad_norm": 0.9202792644500732, + "learning_rate": 8.142457044759522e-07, + "loss": 0.7232, + "step": 295160 + }, + { + "epoch": 1.885756998837254, + "grad_norm": 1.3906402587890625, + "learning_rate": 
8.133440990992336e-07, + "loss": 0.9122, + "step": 295170 + }, + { + "epoch": 1.8858208859869925, + "grad_norm": 1.051938533782959, + "learning_rate": 8.124429890746965e-07, + "loss": 0.8986, + "step": 295180 + }, + { + "epoch": 1.8858847731367314, + "grad_norm": 1.882930040359497, + "learning_rate": 8.115423744114059e-07, + "loss": 1.1432, + "step": 295190 + }, + { + "epoch": 1.88594866028647, + "grad_norm": 1.0550377368927002, + "learning_rate": 8.106422551184378e-07, + "loss": 0.9013, + "step": 295200 + }, + { + "epoch": 1.8860125474362088, + "grad_norm": 0.7018422484397888, + "learning_rate": 8.097426312048573e-07, + "loss": 0.8337, + "step": 295210 + }, + { + "epoch": 1.8860764345859473, + "grad_norm": 1.2057894468307495, + "learning_rate": 8.088435026797292e-07, + "loss": 1.0478, + "step": 295220 + }, + { + "epoch": 1.8861403217356862, + "grad_norm": 0.8332082629203796, + "learning_rate": 8.07944869552102e-07, + "loss": 0.7315, + "step": 295230 + }, + { + "epoch": 1.8862042088854247, + "grad_norm": 0.5676252245903015, + "learning_rate": 8.070467318310238e-07, + "loss": 0.7383, + "step": 295240 + }, + { + "epoch": 1.8862680960351634, + "grad_norm": 1.2264729738235474, + "learning_rate": 8.061490895255431e-07, + "loss": 1.0706, + "step": 295250 + }, + { + "epoch": 1.8863319831849021, + "grad_norm": 0.9976449608802795, + "learning_rate": 8.052519426447025e-07, + "loss": 0.8886, + "step": 295260 + }, + { + "epoch": 1.8863958703346408, + "grad_norm": 0.8264930248260498, + "learning_rate": 8.043552911975338e-07, + "loss": 0.7411, + "step": 295270 + }, + { + "epoch": 1.8864597574843796, + "grad_norm": 1.149370789527893, + "learning_rate": 8.034591351930632e-07, + "loss": 0.7386, + "step": 295280 + }, + { + "epoch": 1.8865236446341183, + "grad_norm": 1.0822536945343018, + "learning_rate": 8.025634746403277e-07, + "loss": 0.7569, + "step": 295290 + }, + { + "epoch": 1.886587531783857, + "grad_norm": 0.8716220855712891, + "learning_rate": 8.016683095483368e-07, + 
"loss": 0.7104, + "step": 295300 + }, + { + "epoch": 1.8866514189335957, + "grad_norm": 0.6852596402168274, + "learning_rate": 8.007736399261057e-07, + "loss": 0.8472, + "step": 295310 + }, + { + "epoch": 1.8867153060833344, + "grad_norm": 0.975752592086792, + "learning_rate": 7.998794657826491e-07, + "loss": 0.7972, + "step": 295320 + }, + { + "epoch": 1.886779193233073, + "grad_norm": 1.0218257904052734, + "learning_rate": 7.989857871269768e-07, + "loss": 0.8167, + "step": 295330 + }, + { + "epoch": 1.8868430803828118, + "grad_norm": 1.0164343118667603, + "learning_rate": 7.980926039680702e-07, + "loss": 0.9823, + "step": 295340 + }, + { + "epoch": 1.8869069675325505, + "grad_norm": 0.8673834204673767, + "learning_rate": 7.971999163149501e-07, + "loss": 1.0353, + "step": 295350 + }, + { + "epoch": 1.8869708546822892, + "grad_norm": 1.0239756107330322, + "learning_rate": 7.963077241765815e-07, + "loss": 0.81, + "step": 295360 + }, + { + "epoch": 1.887034741832028, + "grad_norm": 0.7221771478652954, + "learning_rate": 7.954160275619682e-07, + "loss": 0.6973, + "step": 295370 + }, + { + "epoch": 1.8870986289817666, + "grad_norm": 0.7318668365478516, + "learning_rate": 7.945248264800808e-07, + "loss": 0.9796, + "step": 295380 + }, + { + "epoch": 1.8871625161315053, + "grad_norm": 0.6194974184036255, + "learning_rate": 7.936341209399012e-07, + "loss": 0.6397, + "step": 295390 + }, + { + "epoch": 1.887226403281244, + "grad_norm": 0.9081866145133972, + "learning_rate": 7.927439109503887e-07, + "loss": 0.804, + "step": 295400 + }, + { + "epoch": 1.8872902904309827, + "grad_norm": 1.4008513689041138, + "learning_rate": 7.918541965205195e-07, + "loss": 0.9405, + "step": 295410 + }, + { + "epoch": 1.8873541775807214, + "grad_norm": 1.3504021167755127, + "learning_rate": 7.909649776592532e-07, + "loss": 0.8039, + "step": 295420 + }, + { + "epoch": 1.8874180647304601, + "grad_norm": 0.5890411138534546, + "learning_rate": 7.900762543755325e-07, + "loss": 0.8612, + "step": 
295430 + }, + { + "epoch": 1.8874819518801988, + "grad_norm": 0.855315089225769, + "learning_rate": 7.891880266783225e-07, + "loss": 0.6389, + "step": 295440 + }, + { + "epoch": 1.8875458390299376, + "grad_norm": 0.9654882550239563, + "learning_rate": 7.883002945765605e-07, + "loss": 0.9406, + "step": 295450 + }, + { + "epoch": 1.8876097261796763, + "grad_norm": 1.4022914171218872, + "learning_rate": 7.874130580791949e-07, + "loss": 1.3269, + "step": 295460 + }, + { + "epoch": 1.887673613329415, + "grad_norm": 0.707993745803833, + "learning_rate": 7.865263171951465e-07, + "loss": 0.9352, + "step": 295470 + }, + { + "epoch": 1.8877375004791537, + "grad_norm": 2.06457257270813, + "learning_rate": 7.856400719333579e-07, + "loss": 0.8777, + "step": 295480 + }, + { + "epoch": 1.8878013876288924, + "grad_norm": 2.3128252029418945, + "learning_rate": 7.847543223027498e-07, + "loss": 0.8605, + "step": 295490 + }, + { + "epoch": 1.887865274778631, + "grad_norm": 1.0920239686965942, + "learning_rate": 7.83869068312243e-07, + "loss": 0.9512, + "step": 295500 + }, + { + "epoch": 1.8879291619283696, + "grad_norm": 1.3171707391738892, + "learning_rate": 7.829843099707524e-07, + "loss": 0.8446, + "step": 295510 + }, + { + "epoch": 1.8879930490781085, + "grad_norm": 0.6091967821121216, + "learning_rate": 7.821000472871875e-07, + "loss": 0.8654, + "step": 295520 + }, + { + "epoch": 1.888056936227847, + "grad_norm": 0.8139270544052124, + "learning_rate": 7.812162802704582e-07, + "loss": 0.698, + "step": 295530 + }, + { + "epoch": 1.888120823377586, + "grad_norm": 1.0595269203186035, + "learning_rate": 7.803330089294569e-07, + "loss": 1.1004, + "step": 295540 + }, + { + "epoch": 1.8881847105273244, + "grad_norm": 0.7380056381225586, + "learning_rate": 7.794502332730824e-07, + "loss": 1.1522, + "step": 295550 + }, + { + "epoch": 1.8882485976770633, + "grad_norm": 0.7570953965187073, + "learning_rate": 7.785679533102331e-07, + "loss": 0.9224, + "step": 295560 + }, + { + "epoch": 
1.8883124848268018, + "grad_norm": 0.9701615571975708, + "learning_rate": 7.776861690497794e-07, + "loss": 0.5713, + "step": 295570 + }, + { + "epoch": 1.8883763719765407, + "grad_norm": 1.3707141876220703, + "learning_rate": 7.768048805006145e-07, + "loss": 1.0388, + "step": 295580 + }, + { + "epoch": 1.8884402591262792, + "grad_norm": 1.3032087087631226, + "learning_rate": 7.759240876716034e-07, + "loss": 0.8332, + "step": 295590 + }, + { + "epoch": 1.8885041462760181, + "grad_norm": 1.7075655460357666, + "learning_rate": 7.750437905716279e-07, + "loss": 0.9384, + "step": 295600 + }, + { + "epoch": 1.8885680334257566, + "grad_norm": 0.8511855602264404, + "learning_rate": 7.74163989209542e-07, + "loss": 0.9488, + "step": 295610 + }, + { + "epoch": 1.8886319205754956, + "grad_norm": 1.4510815143585205, + "learning_rate": 7.732846835942109e-07, + "loss": 0.7513, + "step": 295620 + }, + { + "epoch": 1.888695807725234, + "grad_norm": 0.7361019849777222, + "learning_rate": 7.724058737344942e-07, + "loss": 0.8163, + "step": 295630 + }, + { + "epoch": 1.888759694874973, + "grad_norm": 1.0821374654769897, + "learning_rate": 7.715275596392402e-07, + "loss": 0.8498, + "step": 295640 + }, + { + "epoch": 1.8888235820247115, + "grad_norm": 0.9581074118614197, + "learning_rate": 7.70649741317292e-07, + "loss": 0.6997, + "step": 295650 + }, + { + "epoch": 1.8888874691744504, + "grad_norm": 1.1700917482376099, + "learning_rate": 7.697724187774868e-07, + "loss": 0.7939, + "step": 295660 + }, + { + "epoch": 1.8889513563241889, + "grad_norm": 0.6985589265823364, + "learning_rate": 7.688955920286623e-07, + "loss": 0.9785, + "step": 295670 + }, + { + "epoch": 1.8890152434739278, + "grad_norm": 1.1156351566314697, + "learning_rate": 7.6801926107965e-07, + "loss": 0.8195, + "step": 295680 + }, + { + "epoch": 1.8890791306236663, + "grad_norm": 0.6667960286140442, + "learning_rate": 7.671434259392818e-07, + "loss": 0.9264, + "step": 295690 + }, + { + "epoch": 1.8891430177734052, + 
"grad_norm": 3.8637044429779053, + "learning_rate": 7.662680866163619e-07, + "loss": 1.2968, + "step": 295700 + }, + { + "epoch": 1.8892069049231437, + "grad_norm": 0.7520202398300171, + "learning_rate": 7.653932431197219e-07, + "loss": 0.8864, + "step": 295710 + }, + { + "epoch": 1.8892707920728826, + "grad_norm": 0.8065469264984131, + "learning_rate": 7.645188954581661e-07, + "loss": 0.865, + "step": 295720 + }, + { + "epoch": 1.889334679222621, + "grad_norm": 0.6768770217895508, + "learning_rate": 7.636450436404985e-07, + "loss": 0.8843, + "step": 295730 + }, + { + "epoch": 1.8893985663723598, + "grad_norm": 1.8908792734146118, + "learning_rate": 7.627716876755176e-07, + "loss": 0.9908, + "step": 295740 + }, + { + "epoch": 1.8894624535220985, + "grad_norm": 1.037677526473999, + "learning_rate": 7.618988275720273e-07, + "loss": 1.0596, + "step": 295750 + }, + { + "epoch": 1.8895263406718372, + "grad_norm": 0.7945002317428589, + "learning_rate": 7.610264633388098e-07, + "loss": 0.9728, + "step": 295760 + }, + { + "epoch": 1.889590227821576, + "grad_norm": 1.0304378271102905, + "learning_rate": 7.601545949846523e-07, + "loss": 0.9641, + "step": 295770 + }, + { + "epoch": 1.8896541149713146, + "grad_norm": 0.7987378239631653, + "learning_rate": 7.592832225183421e-07, + "loss": 0.8351, + "step": 295780 + }, + { + "epoch": 1.8897180021210533, + "grad_norm": 1.3686131238937378, + "learning_rate": 7.584123459486447e-07, + "loss": 0.7295, + "step": 295790 + }, + { + "epoch": 1.889781889270792, + "grad_norm": 1.25900137424469, + "learning_rate": 7.575419652843363e-07, + "loss": 0.6627, + "step": 295800 + }, + { + "epoch": 1.8898457764205308, + "grad_norm": 0.8558017015457153, + "learning_rate": 7.56672080534182e-07, + "loss": 0.9229, + "step": 295810 + }, + { + "epoch": 1.8899096635702695, + "grad_norm": 0.9014974236488342, + "learning_rate": 7.558026917069416e-07, + "loss": 0.944, + "step": 295820 + }, + { + "epoch": 1.8899735507200082, + "grad_norm": 0.7495697140693665, 
+ "learning_rate": 7.549337988113691e-07, + "loss": 0.9312, + "step": 295830 + }, + { + "epoch": 1.8900374378697469, + "grad_norm": 2.5676043033599854, + "learning_rate": 7.540654018562188e-07, + "loss": 1.0228, + "step": 295840 + }, + { + "epoch": 1.8901013250194856, + "grad_norm": 1.3730984926223755, + "learning_rate": 7.531975008502279e-07, + "loss": 0.7373, + "step": 295850 + }, + { + "epoch": 1.8901652121692243, + "grad_norm": 0.9089881181716919, + "learning_rate": 7.523300958021451e-07, + "loss": 0.812, + "step": 295860 + }, + { + "epoch": 1.890229099318963, + "grad_norm": 1.2313027381896973, + "learning_rate": 7.514631867207078e-07, + "loss": 0.6691, + "step": 295870 + }, + { + "epoch": 1.8902929864687017, + "grad_norm": 0.9636072516441345, + "learning_rate": 7.505967736146369e-07, + "loss": 1.005, + "step": 295880 + }, + { + "epoch": 1.8903568736184404, + "grad_norm": 0.7294650077819824, + "learning_rate": 7.497308564926641e-07, + "loss": 0.8859, + "step": 295890 + }, + { + "epoch": 1.890420760768179, + "grad_norm": 0.9986069798469543, + "learning_rate": 7.488654353635105e-07, + "loss": 0.5529, + "step": 295900 + }, + { + "epoch": 1.8904846479179178, + "grad_norm": 0.9179570078849792, + "learning_rate": 7.480005102358911e-07, + "loss": 0.7446, + "step": 295910 + }, + { + "epoch": 1.8905485350676565, + "grad_norm": 0.981963038444519, + "learning_rate": 7.471360811185157e-07, + "loss": 0.8282, + "step": 295920 + }, + { + "epoch": 1.8906124222173952, + "grad_norm": 0.8185356259346008, + "learning_rate": 7.462721480200885e-07, + "loss": 1.0243, + "step": 295930 + }, + { + "epoch": 1.890676309367134, + "grad_norm": 0.9393899440765381, + "learning_rate": 7.45408710949308e-07, + "loss": 0.9815, + "step": 295940 + }, + { + "epoch": 1.8907401965168726, + "grad_norm": 0.8965507745742798, + "learning_rate": 7.445457699148783e-07, + "loss": 0.7835, + "step": 295950 + }, + { + "epoch": 1.8908040836666113, + "grad_norm": 0.8095226287841797, + "learning_rate": 
7.436833249254816e-07, + "loss": 0.8778, + "step": 295960 + }, + { + "epoch": 1.89086797081635, + "grad_norm": 1.0888183116912842, + "learning_rate": 7.428213759898106e-07, + "loss": 0.9133, + "step": 295970 + }, + { + "epoch": 1.8909318579660885, + "grad_norm": 1.2279393672943115, + "learning_rate": 7.419599231165364e-07, + "loss": 0.7502, + "step": 295980 + }, + { + "epoch": 1.8909957451158275, + "grad_norm": 0.6059055924415588, + "learning_rate": 7.410989663143464e-07, + "loss": 1.0614, + "step": 295990 + }, + { + "epoch": 1.891059632265566, + "grad_norm": 0.8721667528152466, + "learning_rate": 7.402385055919003e-07, + "loss": 0.4989, + "step": 296000 + }, + { + "epoch": 1.8911235194153049, + "grad_norm": 0.5935079455375671, + "learning_rate": 7.393785409578691e-07, + "loss": 0.9195, + "step": 296010 + }, + { + "epoch": 1.8911874065650434, + "grad_norm": 0.7822202444076538, + "learning_rate": 7.385190724209123e-07, + "loss": 0.9554, + "step": 296020 + }, + { + "epoch": 1.8912512937147823, + "grad_norm": 2.311864137649536, + "learning_rate": 7.376600999896899e-07, + "loss": 0.8155, + "step": 296030 + }, + { + "epoch": 1.8913151808645208, + "grad_norm": 0.9179143309593201, + "learning_rate": 7.368016236728392e-07, + "loss": 0.8417, + "step": 296040 + }, + { + "epoch": 1.8913790680142597, + "grad_norm": 1.1837308406829834, + "learning_rate": 7.359436434790257e-07, + "loss": 0.8029, + "step": 296050 + }, + { + "epoch": 1.8914429551639982, + "grad_norm": 1.0275431871414185, + "learning_rate": 7.350861594168701e-07, + "loss": 0.7866, + "step": 296060 + }, + { + "epoch": 1.8915068423137371, + "grad_norm": 0.9733939170837402, + "learning_rate": 7.342291714950211e-07, + "loss": 0.9155, + "step": 296070 + }, + { + "epoch": 1.8915707294634756, + "grad_norm": 0.6989685893058777, + "learning_rate": 7.333726797221053e-07, + "loss": 0.7643, + "step": 296080 + }, + { + "epoch": 1.8916346166132145, + "grad_norm": 1.013899564743042, + "learning_rate": 7.325166841067487e-07, + 
"loss": 0.9916, + "step": 296090 + }, + { + "epoch": 1.891698503762953, + "grad_norm": 0.8235689401626587, + "learning_rate": 7.316611846575672e-07, + "loss": 0.9801, + "step": 296100 + }, + { + "epoch": 1.891762390912692, + "grad_norm": 1.1039879322052002, + "learning_rate": 7.308061813831868e-07, + "loss": 0.8541, + "step": 296110 + }, + { + "epoch": 1.8918262780624304, + "grad_norm": 0.9536022543907166, + "learning_rate": 7.299516742922119e-07, + "loss": 0.6973, + "step": 296120 + }, + { + "epoch": 1.8918901652121694, + "grad_norm": 0.7637679576873779, + "learning_rate": 7.290976633932411e-07, + "loss": 0.8889, + "step": 296130 + }, + { + "epoch": 1.8919540523619078, + "grad_norm": 0.8953757286071777, + "learning_rate": 7.282441486948899e-07, + "loss": 1.065, + "step": 296140 + }, + { + "epoch": 1.8920179395116468, + "grad_norm": 1.9607559442520142, + "learning_rate": 7.273911302057457e-07, + "loss": 0.8208, + "step": 296150 + }, + { + "epoch": 1.8920818266613852, + "grad_norm": 0.8156129717826843, + "learning_rate": 7.265386079343961e-07, + "loss": 1.0972, + "step": 296160 + }, + { + "epoch": 1.8921457138111242, + "grad_norm": 1.9369901418685913, + "learning_rate": 7.256865818894288e-07, + "loss": 0.8655, + "step": 296170 + }, + { + "epoch": 1.8922096009608627, + "grad_norm": 0.618569552898407, + "learning_rate": 7.248350520794312e-07, + "loss": 0.7582, + "step": 296180 + }, + { + "epoch": 1.8922734881106016, + "grad_norm": 0.8822636604309082, + "learning_rate": 7.239840185129687e-07, + "loss": 0.8655, + "step": 296190 + }, + { + "epoch": 1.89233737526034, + "grad_norm": 1.2168229818344116, + "learning_rate": 7.231334811986234e-07, + "loss": 0.7847, + "step": 296200 + }, + { + "epoch": 1.892401262410079, + "grad_norm": 0.5056108236312866, + "learning_rate": 7.222834401449496e-07, + "loss": 0.8557, + "step": 296210 + }, + { + "epoch": 1.8924651495598175, + "grad_norm": 0.9576126337051392, + "learning_rate": 7.214338953605127e-07, + "loss": 0.7553, + "step": 
296220 + }, + { + "epoch": 1.8925290367095562, + "grad_norm": 0.8779643177986145, + "learning_rate": 7.205848468538723e-07, + "loss": 0.9253, + "step": 296230 + }, + { + "epoch": 1.892592923859295, + "grad_norm": 1.0522737503051758, + "learning_rate": 7.197362946335718e-07, + "loss": 0.7269, + "step": 296240 + }, + { + "epoch": 1.8926568110090336, + "grad_norm": 1.0928726196289062, + "learning_rate": 7.188882387081597e-07, + "loss": 0.8349, + "step": 296250 + }, + { + "epoch": 1.8927206981587723, + "grad_norm": 0.9813924431800842, + "learning_rate": 7.180406790861794e-07, + "loss": 1.1308, + "step": 296260 + }, + { + "epoch": 1.892784585308511, + "grad_norm": 0.6593855023384094, + "learning_rate": 7.171936157761628e-07, + "loss": 0.9651, + "step": 296270 + }, + { + "epoch": 1.8928484724582497, + "grad_norm": 0.9737311601638794, + "learning_rate": 7.16347048786642e-07, + "loss": 0.9277, + "step": 296280 + }, + { + "epoch": 1.8929123596079884, + "grad_norm": 1.488356351852417, + "learning_rate": 7.155009781261435e-07, + "loss": 1.0415, + "step": 296290 + }, + { + "epoch": 1.8929762467577271, + "grad_norm": 0.7735053300857544, + "learning_rate": 7.146554038031883e-07, + "loss": 0.8097, + "step": 296300 + }, + { + "epoch": 1.8930401339074658, + "grad_norm": 1.3010677099227905, + "learning_rate": 7.138948112881683e-07, + "loss": 0.9612, + "step": 296310 + }, + { + "epoch": 1.8931040210572045, + "grad_norm": 0.6956140995025635, + "learning_rate": 7.130501800300004e-07, + "loss": 1.1993, + "step": 296320 + }, + { + "epoch": 1.8931679082069433, + "grad_norm": 0.8052523732185364, + "learning_rate": 7.122060451340562e-07, + "loss": 0.726, + "step": 296330 + }, + { + "epoch": 1.893231795356682, + "grad_norm": 0.8581894636154175, + "learning_rate": 7.113624066088342e-07, + "loss": 0.8463, + "step": 296340 + }, + { + "epoch": 1.8932956825064207, + "grad_norm": 1.2147918939590454, + "learning_rate": 7.105192644628389e-07, + "loss": 0.9161, + "step": 296350 + }, + { + "epoch": 
1.8933595696561594, + "grad_norm": 1.007676362991333, + "learning_rate": 7.096766187045467e-07, + "loss": 1.032, + "step": 296360 + }, + { + "epoch": 1.893423456805898, + "grad_norm": 1.0709525346755981, + "learning_rate": 7.08834469342462e-07, + "loss": 0.9742, + "step": 296370 + }, + { + "epoch": 1.8934873439556368, + "grad_norm": 0.7088238596916199, + "learning_rate": 7.079928163850558e-07, + "loss": 1.0639, + "step": 296380 + }, + { + "epoch": 1.8935512311053755, + "grad_norm": 1.4133071899414062, + "learning_rate": 7.07151659840799e-07, + "loss": 0.6822, + "step": 296390 + }, + { + "epoch": 1.8936151182551142, + "grad_norm": 1.2081222534179688, + "learning_rate": 7.063109997181793e-07, + "loss": 0.9155, + "step": 296400 + }, + { + "epoch": 1.893679005404853, + "grad_norm": 1.5343451499938965, + "learning_rate": 7.054708360256457e-07, + "loss": 0.9854, + "step": 296410 + }, + { + "epoch": 1.8937428925545916, + "grad_norm": 0.7808114886283875, + "learning_rate": 7.046311687716689e-07, + "loss": 0.9595, + "step": 296420 + }, + { + "epoch": 1.8938067797043303, + "grad_norm": 0.7206653356552124, + "learning_rate": 7.037919979647034e-07, + "loss": 0.5652, + "step": 296430 + }, + { + "epoch": 1.893870666854069, + "grad_norm": 0.6024944186210632, + "learning_rate": 7.029533236132035e-07, + "loss": 0.8725, + "step": 296440 + }, + { + "epoch": 1.8939345540038077, + "grad_norm": 3.378950357437134, + "learning_rate": 7.02115145725607e-07, + "loss": 0.8943, + "step": 296450 + }, + { + "epoch": 1.8939984411535464, + "grad_norm": 0.7265892624855042, + "learning_rate": 7.012774643103571e-07, + "loss": 0.9218, + "step": 296460 + }, + { + "epoch": 1.894062328303285, + "grad_norm": 1.143747091293335, + "learning_rate": 7.004402793758968e-07, + "loss": 1.0715, + "step": 296470 + }, + { + "epoch": 1.8941262154530238, + "grad_norm": 0.8175809979438782, + "learning_rate": 6.99603590930653e-07, + "loss": 0.7293, + "step": 296480 + }, + { + "epoch": 1.8941901026027623, + "grad_norm": 
0.9248138666152954, + "learning_rate": 6.987673989830523e-07, + "loss": 0.8777, + "step": 296490 + }, + { + "epoch": 1.8942539897525013, + "grad_norm": 0.9757458567619324, + "learning_rate": 6.979317035415156e-07, + "loss": 0.7873, + "step": 296500 + }, + { + "epoch": 1.8943178769022397, + "grad_norm": 2.3840856552124023, + "learning_rate": 6.970965046144528e-07, + "loss": 0.7192, + "step": 296510 + }, + { + "epoch": 1.8943817640519787, + "grad_norm": 1.2808120250701904, + "learning_rate": 6.962618022102907e-07, + "loss": 0.8615, + "step": 296520 + }, + { + "epoch": 1.8944456512017172, + "grad_norm": 0.7400955557823181, + "learning_rate": 6.954275963374168e-07, + "loss": 0.9027, + "step": 296530 + }, + { + "epoch": 1.894509538351456, + "grad_norm": 0.9748068451881409, + "learning_rate": 6.945938870042524e-07, + "loss": 0.6593, + "step": 296540 + }, + { + "epoch": 1.8945734255011946, + "grad_norm": 0.9538581371307373, + "learning_rate": 6.937606742191738e-07, + "loss": 0.8944, + "step": 296550 + }, + { + "epoch": 1.8946373126509335, + "grad_norm": 0.9795793890953064, + "learning_rate": 6.929279579905856e-07, + "loss": 1.0333, + "step": 296560 + }, + { + "epoch": 1.894701199800672, + "grad_norm": 0.6251556277275085, + "learning_rate": 6.920957383268645e-07, + "loss": 0.8936, + "step": 296570 + }, + { + "epoch": 1.894765086950411, + "grad_norm": 0.9500271677970886, + "learning_rate": 6.912640152363981e-07, + "loss": 1.1232, + "step": 296580 + }, + { + "epoch": 1.8948289741001494, + "grad_norm": 0.9892424941062927, + "learning_rate": 6.904327887275686e-07, + "loss": 0.8446, + "step": 296590 + }, + { + "epoch": 1.8948928612498883, + "grad_norm": 1.1366146802902222, + "learning_rate": 6.896020588087304e-07, + "loss": 0.8872, + "step": 296600 + }, + { + "epoch": 1.8949567483996268, + "grad_norm": 0.8495091199874878, + "learning_rate": 6.887718254882658e-07, + "loss": 0.9404, + "step": 296610 + }, + { + "epoch": 1.8950206355493657, + "grad_norm": 1.9020293951034546, + 
"learning_rate": 6.879420887745235e-07, + "loss": 1.0483, + "step": 296620 + }, + { + "epoch": 1.8950845226991042, + "grad_norm": 2.187201738357544, + "learning_rate": 6.871128486758694e-07, + "loss": 0.8387, + "step": 296630 + }, + { + "epoch": 1.8951484098488431, + "grad_norm": 1.4816763401031494, + "learning_rate": 6.862841052006519e-07, + "loss": 0.7488, + "step": 296640 + }, + { + "epoch": 1.8952122969985816, + "grad_norm": 1.045585036277771, + "learning_rate": 6.854558583572146e-07, + "loss": 0.6764, + "step": 296650 + }, + { + "epoch": 1.8952761841483206, + "grad_norm": 1.1862424612045288, + "learning_rate": 6.846281081538952e-07, + "loss": 0.7465, + "step": 296660 + }, + { + "epoch": 1.895340071298059, + "grad_norm": 0.6618155837059021, + "learning_rate": 6.838008545990426e-07, + "loss": 1.0474, + "step": 296670 + }, + { + "epoch": 1.895403958447798, + "grad_norm": 0.9749765396118164, + "learning_rate": 6.829740977009724e-07, + "loss": 0.9448, + "step": 296680 + }, + { + "epoch": 1.8954678455975364, + "grad_norm": 1.0106642246246338, + "learning_rate": 6.821478374680223e-07, + "loss": 0.771, + "step": 296690 + }, + { + "epoch": 1.8955317327472754, + "grad_norm": 0.9189677834510803, + "learning_rate": 6.81322073908508e-07, + "loss": 0.9741, + "step": 296700 + }, + { + "epoch": 1.8955956198970139, + "grad_norm": 1.4712313413619995, + "learning_rate": 6.804968070307505e-07, + "loss": 0.8101, + "step": 296710 + }, + { + "epoch": 1.8956595070467526, + "grad_norm": 1.327321171760559, + "learning_rate": 6.796720368430542e-07, + "loss": 0.7703, + "step": 296720 + }, + { + "epoch": 1.8957233941964913, + "grad_norm": 0.8706001043319702, + "learning_rate": 6.788477633537293e-07, + "loss": 0.9648, + "step": 296730 + }, + { + "epoch": 1.89578728134623, + "grad_norm": 0.8188433051109314, + "learning_rate": 6.780239865710747e-07, + "loss": 0.8005, + "step": 296740 + }, + { + "epoch": 1.8958511684959687, + "grad_norm": 1.1353031396865845, + "learning_rate": 
6.772007065033947e-07, + "loss": 0.8646, + "step": 296750 + }, + { + "epoch": 1.8959150556457074, + "grad_norm": 1.0272502899169922, + "learning_rate": 6.763779231589717e-07, + "loss": 1.0035, + "step": 296760 + }, + { + "epoch": 1.895978942795446, + "grad_norm": 0.8530550599098206, + "learning_rate": 6.75555636546088e-07, + "loss": 0.7951, + "step": 296770 + }, + { + "epoch": 1.8960428299451848, + "grad_norm": 0.9556049108505249, + "learning_rate": 6.747338466730369e-07, + "loss": 0.9092, + "step": 296780 + }, + { + "epoch": 1.8961067170949235, + "grad_norm": 1.1247327327728271, + "learning_rate": 6.739125535480839e-07, + "loss": 0.9113, + "step": 296790 + }, + { + "epoch": 1.8961706042446622, + "grad_norm": 1.8721996545791626, + "learning_rate": 6.730917571795059e-07, + "loss": 1.0448, + "step": 296800 + }, + { + "epoch": 1.896234491394401, + "grad_norm": 0.783287763595581, + "learning_rate": 6.722714575755684e-07, + "loss": 0.7878, + "step": 296810 + }, + { + "epoch": 1.8962983785441396, + "grad_norm": 0.8555763959884644, + "learning_rate": 6.714516547445315e-07, + "loss": 0.7399, + "step": 296820 + }, + { + "epoch": 1.8963622656938783, + "grad_norm": 0.8863575458526611, + "learning_rate": 6.706323486946553e-07, + "loss": 0.743, + "step": 296830 + }, + { + "epoch": 1.896426152843617, + "grad_norm": 0.8079519867897034, + "learning_rate": 6.698135394341832e-07, + "loss": 0.7099, + "step": 296840 + }, + { + "epoch": 1.8964900399933557, + "grad_norm": 1.180772304534912, + "learning_rate": 6.68995226971364e-07, + "loss": 0.7238, + "step": 296850 + }, + { + "epoch": 1.8965539271430945, + "grad_norm": 0.9137830138206482, + "learning_rate": 6.681774113144468e-07, + "loss": 0.8114, + "step": 296860 + }, + { + "epoch": 1.8966178142928332, + "grad_norm": 1.2317211627960205, + "learning_rate": 6.673600924716528e-07, + "loss": 1.0328, + "step": 296870 + }, + { + "epoch": 1.8966817014425719, + "grad_norm": 1.8075276613235474, + "learning_rate": 6.665432704512309e-07, + 
"loss": 0.7587, + "step": 296880 + }, + { + "epoch": 1.8967455885923106, + "grad_norm": 0.7599256634712219, + "learning_rate": 6.657269452613856e-07, + "loss": 1.0978, + "step": 296890 + }, + { + "epoch": 1.8968094757420493, + "grad_norm": 1.4767416715621948, + "learning_rate": 6.649111169103606e-07, + "loss": 0.6445, + "step": 296900 + }, + { + "epoch": 1.896873362891788, + "grad_norm": 0.8817050457000732, + "learning_rate": 6.640957854063601e-07, + "loss": 0.6194, + "step": 296910 + }, + { + "epoch": 1.8969372500415267, + "grad_norm": 0.8108386993408203, + "learning_rate": 6.632809507575888e-07, + "loss": 0.8546, + "step": 296920 + }, + { + "epoch": 1.8970011371912654, + "grad_norm": 0.8253944516181946, + "learning_rate": 6.624666129722678e-07, + "loss": 0.7681, + "step": 296930 + }, + { + "epoch": 1.897065024341004, + "grad_norm": 0.8359985947608948, + "learning_rate": 6.616527720585908e-07, + "loss": 0.8461, + "step": 296940 + }, + { + "epoch": 1.8971289114907428, + "grad_norm": 0.5892947316169739, + "learning_rate": 6.608394280247565e-07, + "loss": 0.6485, + "step": 296950 + }, + { + "epoch": 1.8971927986404813, + "grad_norm": 0.7966788411140442, + "learning_rate": 6.600265808789475e-07, + "loss": 0.8482, + "step": 296960 + }, + { + "epoch": 1.8972566857902202, + "grad_norm": 1.2839607000350952, + "learning_rate": 6.592142306293569e-07, + "loss": 0.7302, + "step": 296970 + }, + { + "epoch": 1.8973205729399587, + "grad_norm": 0.5650898814201355, + "learning_rate": 6.584023772841674e-07, + "loss": 0.7182, + "step": 296980 + }, + { + "epoch": 1.8973844600896976, + "grad_norm": 3.5387778282165527, + "learning_rate": 6.575910208515557e-07, + "loss": 0.8163, + "step": 296990 + }, + { + "epoch": 1.8974483472394361, + "grad_norm": 1.5479587316513062, + "learning_rate": 6.567801613396817e-07, + "loss": 0.9965, + "step": 297000 + }, + { + "epoch": 1.897512234389175, + "grad_norm": 0.8099126815795898, + "learning_rate": 6.559697987567226e-07, + "loss": 0.9669, + "step": 
297010 + }, + { + "epoch": 1.8975761215389135, + "grad_norm": 1.0780574083328247, + "learning_rate": 6.551599331108382e-07, + "loss": 0.9857, + "step": 297020 + }, + { + "epoch": 1.8976400086886525, + "grad_norm": 0.7173043489456177, + "learning_rate": 6.543505644101833e-07, + "loss": 0.8413, + "step": 297030 + }, + { + "epoch": 1.897703895838391, + "grad_norm": 0.9796221852302551, + "learning_rate": 6.535416926629067e-07, + "loss": 0.6674, + "step": 297040 + }, + { + "epoch": 1.8977677829881299, + "grad_norm": 1.4020726680755615, + "learning_rate": 6.527333178771577e-07, + "loss": 0.8466, + "step": 297050 + }, + { + "epoch": 1.8978316701378684, + "grad_norm": 1.3557363748550415, + "learning_rate": 6.519254400610686e-07, + "loss": 1.1703, + "step": 297060 + }, + { + "epoch": 1.8978955572876073, + "grad_norm": 1.3132637739181519, + "learning_rate": 6.511180592227939e-07, + "loss": 0.8213, + "step": 297070 + }, + { + "epoch": 1.8979594444373458, + "grad_norm": 1.5618770122528076, + "learning_rate": 6.503111753704439e-07, + "loss": 0.7792, + "step": 297080 + }, + { + "epoch": 1.8980233315870847, + "grad_norm": 0.9405543804168701, + "learning_rate": 6.495047885121563e-07, + "loss": 0.8413, + "step": 297090 + }, + { + "epoch": 1.8980872187368232, + "grad_norm": 1.0824949741363525, + "learning_rate": 6.486988986560527e-07, + "loss": 0.8309, + "step": 297100 + }, + { + "epoch": 1.898151105886562, + "grad_norm": 1.8957220315933228, + "learning_rate": 6.478935058102375e-07, + "loss": 0.8497, + "step": 297110 + }, + { + "epoch": 1.8982149930363006, + "grad_norm": 3.8873727321624756, + "learning_rate": 6.470886099828432e-07, + "loss": 0.8598, + "step": 297120 + }, + { + "epoch": 1.8982788801860395, + "grad_norm": 1.3968037366867065, + "learning_rate": 6.462842111819523e-07, + "loss": 0.9953, + "step": 297130 + }, + { + "epoch": 1.898342767335778, + "grad_norm": 0.8467806577682495, + "learning_rate": 6.454803094156803e-07, + "loss": 0.8903, + "step": 297140 + }, + { + "epoch": 
1.898406654485517, + "grad_norm": 0.9526151418685913, + "learning_rate": 6.446769046921208e-07, + "loss": 0.7577, + "step": 297150 + }, + { + "epoch": 1.8984705416352554, + "grad_norm": 0.5876900553703308, + "learning_rate": 6.438739970193619e-07, + "loss": 0.7978, + "step": 297160 + }, + { + "epoch": 1.8985344287849943, + "grad_norm": 0.9410789608955383, + "learning_rate": 6.430715864054915e-07, + "loss": 0.8442, + "step": 297170 + }, + { + "epoch": 1.8985983159347328, + "grad_norm": 0.5718557834625244, + "learning_rate": 6.42269672858592e-07, + "loss": 0.7736, + "step": 297180 + }, + { + "epoch": 1.8986622030844718, + "grad_norm": 1.0744683742523193, + "learning_rate": 6.414682563867347e-07, + "loss": 0.8258, + "step": 297190 + }, + { + "epoch": 1.8987260902342102, + "grad_norm": 0.9959836602210999, + "learning_rate": 6.40667336997991e-07, + "loss": 0.8789, + "step": 297200 + }, + { + "epoch": 1.898789977383949, + "grad_norm": 0.6707397103309631, + "learning_rate": 6.398669147004321e-07, + "loss": 0.8412, + "step": 297210 + }, + { + "epoch": 1.8988538645336877, + "grad_norm": 1.135922908782959, + "learning_rate": 6.390669895021184e-07, + "loss": 0.7758, + "step": 297220 + }, + { + "epoch": 1.8989177516834264, + "grad_norm": 1.4174561500549316, + "learning_rate": 6.382675614110989e-07, + "loss": 0.8635, + "step": 297230 + }, + { + "epoch": 1.898981638833165, + "grad_norm": 2.0364394187927246, + "learning_rate": 6.374686304354338e-07, + "loss": 0.644, + "step": 297240 + }, + { + "epoch": 1.8990455259829038, + "grad_norm": 1.104007363319397, + "learning_rate": 6.366701965831612e-07, + "loss": 0.8382, + "step": 297250 + }, + { + "epoch": 1.8991094131326425, + "grad_norm": 1.1201772689819336, + "learning_rate": 6.358722598623246e-07, + "loss": 0.9077, + "step": 297260 + }, + { + "epoch": 1.8991733002823812, + "grad_norm": 0.5292360782623291, + "learning_rate": 6.35074820280962e-07, + "loss": 0.7113, + "step": 297270 + }, + { + "epoch": 1.8992371874321199, + 
"grad_norm": 1.2881067991256714, + "learning_rate": 6.342778778471004e-07, + "loss": 0.8994, + "step": 297280 + }, + { + "epoch": 1.8993010745818586, + "grad_norm": 0.8692072033882141, + "learning_rate": 6.334814325687721e-07, + "loss": 0.9001, + "step": 297290 + }, + { + "epoch": 1.8993649617315973, + "grad_norm": 0.9872758984565735, + "learning_rate": 6.326854844539876e-07, + "loss": 0.8542, + "step": 297300 + }, + { + "epoch": 1.899428848881336, + "grad_norm": 1.0740875005722046, + "learning_rate": 6.318900335107736e-07, + "loss": 0.8472, + "step": 297310 + }, + { + "epoch": 1.8994927360310747, + "grad_norm": 0.7466246485710144, + "learning_rate": 6.310950797471349e-07, + "loss": 0.7936, + "step": 297320 + }, + { + "epoch": 1.8995566231808134, + "grad_norm": 0.9600816965103149, + "learning_rate": 6.303006231710818e-07, + "loss": 0.7065, + "step": 297330 + }, + { + "epoch": 1.8996205103305521, + "grad_norm": 0.7528518438339233, + "learning_rate": 6.295066637906077e-07, + "loss": 0.788, + "step": 297340 + }, + { + "epoch": 1.8996843974802908, + "grad_norm": 0.9469917416572571, + "learning_rate": 6.287132016137177e-07, + "loss": 0.83, + "step": 297350 + }, + { + "epoch": 1.8997482846300295, + "grad_norm": 0.7395723462104797, + "learning_rate": 6.279202366483939e-07, + "loss": 0.8339, + "step": 297360 + }, + { + "epoch": 1.8998121717797682, + "grad_norm": 0.881351113319397, + "learning_rate": 6.2712776890263e-07, + "loss": 0.7541, + "step": 297370 + }, + { + "epoch": 1.899876058929507, + "grad_norm": 0.8257976770401001, + "learning_rate": 6.263357983843976e-07, + "loss": 1.041, + "step": 297380 + }, + { + "epoch": 1.8999399460792457, + "grad_norm": 0.9281383156776428, + "learning_rate": 6.255443251016846e-07, + "loss": 0.7552, + "step": 297390 + }, + { + "epoch": 1.9000038332289844, + "grad_norm": 1.1832506656646729, + "learning_rate": 6.247533490624513e-07, + "loss": 0.679, + "step": 297400 + }, + { + "epoch": 1.900067720378723, + "grad_norm": 0.9818503856658936, + 
"learning_rate": 6.239628702746691e-07, + "loss": 0.6673, + "step": 297410 + }, + { + "epoch": 1.9001316075284618, + "grad_norm": 0.8966130018234253, + "learning_rate": 6.231728887462929e-07, + "loss": 0.7171, + "step": 297420 + }, + { + "epoch": 1.9001954946782005, + "grad_norm": 0.9654017090797424, + "learning_rate": 6.223834044852883e-07, + "loss": 0.7999, + "step": 297430 + }, + { + "epoch": 1.9002593818279392, + "grad_norm": 1.1915733814239502, + "learning_rate": 6.215944174995992e-07, + "loss": 0.7432, + "step": 297440 + }, + { + "epoch": 1.9003232689776777, + "grad_norm": 0.6636626124382019, + "learning_rate": 6.208059277971689e-07, + "loss": 0.9614, + "step": 297450 + }, + { + "epoch": 1.9003871561274166, + "grad_norm": 0.7218161821365356, + "learning_rate": 6.20017935385947e-07, + "loss": 0.6887, + "step": 297460 + }, + { + "epoch": 1.900451043277155, + "grad_norm": 2.2991743087768555, + "learning_rate": 6.192304402738603e-07, + "loss": 0.97, + "step": 297470 + }, + { + "epoch": 1.900514930426894, + "grad_norm": 0.8845507502555847, + "learning_rate": 6.184434424688467e-07, + "loss": 0.8686, + "step": 297480 + }, + { + "epoch": 1.9005788175766325, + "grad_norm": 0.7170679569244385, + "learning_rate": 6.176569419788281e-07, + "loss": 0.9057, + "step": 297490 + }, + { + "epoch": 1.9006427047263714, + "grad_norm": 0.8707745671272278, + "learning_rate": 6.168709388117255e-07, + "loss": 0.8525, + "step": 297500 + }, + { + "epoch": 1.90070659187611, + "grad_norm": 1.2940828800201416, + "learning_rate": 6.160854329754551e-07, + "loss": 0.6402, + "step": 297510 + }, + { + "epoch": 1.9007704790258488, + "grad_norm": 1.2797635793685913, + "learning_rate": 6.15300424477927e-07, + "loss": 0.8512, + "step": 297520 + }, + { + "epoch": 1.9008343661755873, + "grad_norm": 0.953954815864563, + "learning_rate": 6.145159133270461e-07, + "loss": 0.9747, + "step": 297530 + }, + { + "epoch": 1.9008982533253262, + "grad_norm": 1.0358965396881104, + "learning_rate": 
6.137318995307173e-07, + "loss": 0.8922, + "step": 297540 + }, + { + "epoch": 1.9009621404750647, + "grad_norm": 1.3977952003479004, + "learning_rate": 6.129483830968285e-07, + "loss": 0.8189, + "step": 297550 + }, + { + "epoch": 1.9010260276248037, + "grad_norm": 0.7680107951164246, + "learning_rate": 6.121653640332848e-07, + "loss": 0.9159, + "step": 297560 + }, + { + "epoch": 1.9010899147745421, + "grad_norm": 1.2032699584960938, + "learning_rate": 6.113828423479517e-07, + "loss": 1.0925, + "step": 297570 + }, + { + "epoch": 1.901153801924281, + "grad_norm": 1.2679545879364014, + "learning_rate": 6.106008180487288e-07, + "loss": 0.9924, + "step": 297580 + }, + { + "epoch": 1.9012176890740196, + "grad_norm": 0.7097789645195007, + "learning_rate": 6.098192911434763e-07, + "loss": 1.1285, + "step": 297590 + }, + { + "epoch": 1.9012815762237585, + "grad_norm": 1.2766450643539429, + "learning_rate": 6.090382616400825e-07, + "loss": 1.1143, + "step": 297600 + }, + { + "epoch": 1.901345463373497, + "grad_norm": 0.8973786234855652, + "learning_rate": 6.082577295463909e-07, + "loss": 0.549, + "step": 297610 + }, + { + "epoch": 1.901409350523236, + "grad_norm": 1.1079696416854858, + "learning_rate": 6.074776948702843e-07, + "loss": 0.7188, + "step": 297620 + }, + { + "epoch": 1.9014732376729744, + "grad_norm": 1.3107043504714966, + "learning_rate": 6.066981576196007e-07, + "loss": 0.9424, + "step": 297630 + }, + { + "epoch": 1.9015371248227133, + "grad_norm": 1.010285496711731, + "learning_rate": 6.059191178022005e-07, + "loss": 1.2047, + "step": 297640 + }, + { + "epoch": 1.9016010119724518, + "grad_norm": 1.1612452268600464, + "learning_rate": 6.051405754259276e-07, + "loss": 1.06, + "step": 297650 + }, + { + "epoch": 1.9016648991221907, + "grad_norm": 1.245816946029663, + "learning_rate": 6.0436253049862e-07, + "loss": 0.7431, + "step": 297660 + }, + { + "epoch": 1.9017287862719292, + "grad_norm": 0.9382659792900085, + "learning_rate": 6.03584983028116e-07, + "loss": 
0.7355, + "step": 297670 + }, + { + "epoch": 1.901792673421668, + "grad_norm": 1.9679533243179321, + "learning_rate": 6.028079330222425e-07, + "loss": 0.7824, + "step": 297680 + }, + { + "epoch": 1.9018565605714066, + "grad_norm": 1.0482052564620972, + "learning_rate": 6.020313804888323e-07, + "loss": 0.8567, + "step": 297690 + }, + { + "epoch": 1.9019204477211453, + "grad_norm": 0.8708074688911438, + "learning_rate": 6.012553254356957e-07, + "loss": 0.7518, + "step": 297700 + }, + { + "epoch": 1.901984334870884, + "grad_norm": 0.7268311381340027, + "learning_rate": 6.004797678706598e-07, + "loss": 0.7066, + "step": 297710 + }, + { + "epoch": 1.9020482220206227, + "grad_norm": 0.9656032919883728, + "learning_rate": 5.997047078015295e-07, + "loss": 0.7627, + "step": 297720 + }, + { + "epoch": 1.9021121091703614, + "grad_norm": 0.854823648929596, + "learning_rate": 5.989301452361096e-07, + "loss": 0.9002, + "step": 297730 + }, + { + "epoch": 1.9021759963201001, + "grad_norm": 1.0799676179885864, + "learning_rate": 5.981560801821995e-07, + "loss": 1.0072, + "step": 297740 + }, + { + "epoch": 1.9022398834698389, + "grad_norm": 4.408614158630371, + "learning_rate": 5.97382512647593e-07, + "loss": 0.9735, + "step": 297750 + }, + { + "epoch": 1.9023037706195776, + "grad_norm": 0.6965596675872803, + "learning_rate": 5.966094426400892e-07, + "loss": 0.935, + "step": 297760 + }, + { + "epoch": 1.9023676577693163, + "grad_norm": 1.042432188987732, + "learning_rate": 5.958368701674655e-07, + "loss": 0.7964, + "step": 297770 + }, + { + "epoch": 1.902431544919055, + "grad_norm": 0.9587807655334473, + "learning_rate": 5.950647952375043e-07, + "loss": 1.0627, + "step": 297780 + }, + { + "epoch": 1.9024954320687937, + "grad_norm": 1.1214818954467773, + "learning_rate": 5.94293217857983e-07, + "loss": 0.9112, + "step": 297790 + }, + { + "epoch": 1.9025593192185324, + "grad_norm": 0.829887866973877, + "learning_rate": 5.935221380366729e-07, + "loss": 0.8339, + "step": 297800 + }, + { 
+ "epoch": 1.902623206368271, + "grad_norm": 0.9119672179222107, + "learning_rate": 5.927515557813345e-07, + "loss": 1.0159, + "step": 297810 + }, + { + "epoch": 1.9026870935180098, + "grad_norm": 0.8453476428985596, + "learning_rate": 5.91981471099734e-07, + "loss": 0.8833, + "step": 297820 + }, + { + "epoch": 1.9027509806677485, + "grad_norm": 1.4447675943374634, + "learning_rate": 5.912118839996261e-07, + "loss": 0.6944, + "step": 297830 + }, + { + "epoch": 1.9028148678174872, + "grad_norm": 0.6993200182914734, + "learning_rate": 5.904427944887547e-07, + "loss": 0.8901, + "step": 297840 + }, + { + "epoch": 1.902878754967226, + "grad_norm": 0.7036521434783936, + "learning_rate": 5.896742025748691e-07, + "loss": 0.6993, + "step": 297850 + }, + { + "epoch": 1.9029426421169646, + "grad_norm": 1.3977524042129517, + "learning_rate": 5.889061082657188e-07, + "loss": 0.7307, + "step": 297860 + }, + { + "epoch": 1.9030065292667033, + "grad_norm": 0.8624377846717834, + "learning_rate": 5.881385115690197e-07, + "loss": 0.9632, + "step": 297870 + }, + { + "epoch": 1.903070416416442, + "grad_norm": 0.7405509352684021, + "learning_rate": 5.873714124925211e-07, + "loss": 0.8716, + "step": 297880 + }, + { + "epoch": 1.9031343035661807, + "grad_norm": 0.808953046798706, + "learning_rate": 5.866048110439337e-07, + "loss": 0.7768, + "step": 297890 + }, + { + "epoch": 1.9031981907159194, + "grad_norm": 1.1939847469329834, + "learning_rate": 5.858387072309901e-07, + "loss": 0.6911, + "step": 297900 + }, + { + "epoch": 1.9032620778656582, + "grad_norm": 1.3884003162384033, + "learning_rate": 5.850731010613952e-07, + "loss": 0.7971, + "step": 297910 + }, + { + "epoch": 1.9033259650153966, + "grad_norm": 1.2230150699615479, + "learning_rate": 5.843079925428708e-07, + "loss": 0.8024, + "step": 297920 + }, + { + "epoch": 1.9033898521651356, + "grad_norm": 1.0419037342071533, + "learning_rate": 5.835433816831104e-07, + "loss": 0.9005, + "step": 297930 + }, + { + "epoch": 
1.903453739314874, + "grad_norm": 0.9676340818405151, + "learning_rate": 5.827792684898193e-07, + "loss": 0.9439, + "step": 297940 + }, + { + "epoch": 1.903517626464613, + "grad_norm": 0.9829468727111816, + "learning_rate": 5.820156529706911e-07, + "loss": 0.6664, + "step": 297950 + }, + { + "epoch": 1.9035815136143515, + "grad_norm": 1.0047937631607056, + "learning_rate": 5.812525351334197e-07, + "loss": 1.0327, + "step": 297960 + }, + { + "epoch": 1.9036454007640904, + "grad_norm": 0.8629552721977234, + "learning_rate": 5.804899149856934e-07, + "loss": 0.9404, + "step": 297970 + }, + { + "epoch": 1.9037092879138289, + "grad_norm": 1.8847241401672363, + "learning_rate": 5.797277925351841e-07, + "loss": 0.8702, + "step": 297980 + }, + { + "epoch": 1.9037731750635678, + "grad_norm": 0.6932811141014099, + "learning_rate": 5.789661677895741e-07, + "loss": 0.6871, + "step": 297990 + }, + { + "epoch": 1.9038370622133063, + "grad_norm": 1.0819748640060425, + "learning_rate": 5.782050407565243e-07, + "loss": 0.7768, + "step": 298000 + }, + { + "epoch": 1.9039009493630452, + "grad_norm": 0.9804828763008118, + "learning_rate": 5.774444114437061e-07, + "loss": 0.8023, + "step": 298010 + }, + { + "epoch": 1.9039648365127837, + "grad_norm": 1.4732115268707275, + "learning_rate": 5.766842798587802e-07, + "loss": 0.9734, + "step": 298020 + }, + { + "epoch": 1.9040287236625226, + "grad_norm": 1.7599880695343018, + "learning_rate": 5.759246460094069e-07, + "loss": 0.7371, + "step": 298030 + }, + { + "epoch": 1.904092610812261, + "grad_norm": 1.3185185194015503, + "learning_rate": 5.751655099032193e-07, + "loss": 0.7012, + "step": 298040 + }, + { + "epoch": 1.904156497962, + "grad_norm": 0.753343939781189, + "learning_rate": 5.744068715478835e-07, + "loss": 0.9058, + "step": 298050 + }, + { + "epoch": 1.9042203851117385, + "grad_norm": 2.9530723094940186, + "learning_rate": 5.736487309510263e-07, + "loss": 0.8327, + "step": 298060 + }, + { + "epoch": 1.9042842722614775, + 
"grad_norm": 0.975061297416687, + "learning_rate": 5.728910881202864e-07, + "loss": 0.9793, + "step": 298070 + }, + { + "epoch": 1.904348159411216, + "grad_norm": 0.7971157431602478, + "learning_rate": 5.72133943063291e-07, + "loss": 0.6126, + "step": 298080 + }, + { + "epoch": 1.9044120465609549, + "grad_norm": 0.778538167476654, + "learning_rate": 5.713772957876728e-07, + "loss": 0.784, + "step": 298090 + }, + { + "epoch": 1.9044759337106933, + "grad_norm": 1.185715913772583, + "learning_rate": 5.706211463010424e-07, + "loss": 0.7173, + "step": 298100 + }, + { + "epoch": 1.9045398208604323, + "grad_norm": 1.3961964845657349, + "learning_rate": 5.698654946110215e-07, + "loss": 1.0598, + "step": 298110 + }, + { + "epoch": 1.9046037080101708, + "grad_norm": 0.8438785076141357, + "learning_rate": 5.691103407252152e-07, + "loss": 0.9722, + "step": 298120 + }, + { + "epoch": 1.9046675951599097, + "grad_norm": 1.0225751399993896, + "learning_rate": 5.683556846512395e-07, + "loss": 0.9682, + "step": 298130 + }, + { + "epoch": 1.9047314823096482, + "grad_norm": 1.0111029148101807, + "learning_rate": 5.676015263966772e-07, + "loss": 1.0396, + "step": 298140 + }, + { + "epoch": 1.904795369459387, + "grad_norm": 1.1241531372070312, + "learning_rate": 5.66847865969139e-07, + "loss": 0.8262, + "step": 298150 + }, + { + "epoch": 1.9048592566091256, + "grad_norm": 1.169219732284546, + "learning_rate": 5.660947033762076e-07, + "loss": 0.7759, + "step": 298160 + }, + { + "epoch": 1.9049231437588643, + "grad_norm": 1.2458572387695312, + "learning_rate": 5.65342038625466e-07, + "loss": 0.8906, + "step": 298170 + }, + { + "epoch": 1.904987030908603, + "grad_norm": 0.9076129198074341, + "learning_rate": 5.645898717244969e-07, + "loss": 0.9836, + "step": 298180 + }, + { + "epoch": 1.9050509180583417, + "grad_norm": 0.7402836084365845, + "learning_rate": 5.638382026808775e-07, + "loss": 0.6931, + "step": 298190 + }, + { + "epoch": 1.9051148052080804, + "grad_norm": 2.3426685333251953, + 
"learning_rate": 5.630870315021797e-07, + "loss": 0.6219, + "step": 298200 + }, + { + "epoch": 1.9051786923578191, + "grad_norm": 1.006109356880188, + "learning_rate": 5.62336358195964e-07, + "loss": 0.7118, + "step": 298210 + }, + { + "epoch": 1.9052425795075578, + "grad_norm": 1.1263142824172974, + "learning_rate": 5.615861827697855e-07, + "loss": 1.0447, + "step": 298220 + }, + { + "epoch": 1.9053064666572965, + "grad_norm": 0.8541145324707031, + "learning_rate": 5.608365052312048e-07, + "loss": 0.9683, + "step": 298230 + }, + { + "epoch": 1.9053703538070352, + "grad_norm": 0.7063738703727722, + "learning_rate": 5.60087325587777e-07, + "loss": 0.7833, + "step": 298240 + }, + { + "epoch": 1.905434240956774, + "grad_norm": 0.9849272966384888, + "learning_rate": 5.593386438470349e-07, + "loss": 1.0038, + "step": 298250 + }, + { + "epoch": 1.9054981281065126, + "grad_norm": 0.8187015652656555, + "learning_rate": 5.585904600165281e-07, + "loss": 0.843, + "step": 298260 + }, + { + "epoch": 1.9055620152562514, + "grad_norm": 1.0451970100402832, + "learning_rate": 5.578427741037895e-07, + "loss": 0.958, + "step": 298270 + }, + { + "epoch": 1.90562590240599, + "grad_norm": 1.0295426845550537, + "learning_rate": 5.570955861163407e-07, + "loss": 0.9706, + "step": 298280 + }, + { + "epoch": 1.9056897895557288, + "grad_norm": 0.785459041595459, + "learning_rate": 5.563488960617202e-07, + "loss": 0.8274, + "step": 298290 + }, + { + "epoch": 1.9057536767054675, + "grad_norm": 0.8145094513893127, + "learning_rate": 5.556027039474387e-07, + "loss": 1.0412, + "step": 298300 + }, + { + "epoch": 1.9058175638552062, + "grad_norm": 0.7018794417381287, + "learning_rate": 5.548570097810179e-07, + "loss": 0.9255, + "step": 298310 + }, + { + "epoch": 1.9058814510049449, + "grad_norm": 1.8968229293823242, + "learning_rate": 5.541118135699574e-07, + "loss": 1.2584, + "step": 298320 + }, + { + "epoch": 1.9059453381546836, + "grad_norm": 1.589530348777771, + "learning_rate": 
5.533671153217734e-07, + "loss": 1.0387, + "step": 298330 + }, + { + "epoch": 1.9060092253044223, + "grad_norm": 1.1622456312179565, + "learning_rate": 5.526229150439544e-07, + "loss": 0.894, + "step": 298340 + }, + { + "epoch": 1.906073112454161, + "grad_norm": 1.0762825012207031, + "learning_rate": 5.518792127440053e-07, + "loss": 0.8478, + "step": 298350 + }, + { + "epoch": 1.9061369996038997, + "grad_norm": 1.124621033668518, + "learning_rate": 5.511360084294093e-07, + "loss": 1.104, + "step": 298360 + }, + { + "epoch": 1.9062008867536384, + "grad_norm": 1.6566370725631714, + "learning_rate": 5.503933021076546e-07, + "loss": 0.7928, + "step": 298370 + }, + { + "epoch": 1.9062647739033771, + "grad_norm": 0.7469080090522766, + "learning_rate": 5.496510937862132e-07, + "loss": 1.1642, + "step": 298380 + }, + { + "epoch": 1.9063286610531158, + "grad_norm": 0.85306715965271, + "learning_rate": 5.489093834725733e-07, + "loss": 0.5735, + "step": 298390 + }, + { + "epoch": 1.9063925482028545, + "grad_norm": 0.8713909387588501, + "learning_rate": 5.481681711741904e-07, + "loss": 0.903, + "step": 298400 + }, + { + "epoch": 1.906456435352593, + "grad_norm": 0.6891903877258301, + "learning_rate": 5.474274568985416e-07, + "loss": 0.7379, + "step": 298410 + }, + { + "epoch": 1.906520322502332, + "grad_norm": 0.9910528063774109, + "learning_rate": 5.466872406530766e-07, + "loss": 0.611, + "step": 298420 + }, + { + "epoch": 1.9065842096520704, + "grad_norm": 0.9393962621688843, + "learning_rate": 5.459475224452614e-07, + "loss": 0.7234, + "step": 298430 + }, + { + "epoch": 1.9066480968018094, + "grad_norm": 0.7950835824012756, + "learning_rate": 5.452083022825294e-07, + "loss": 0.9627, + "step": 298440 + }, + { + "epoch": 1.9067119839515478, + "grad_norm": 1.1390228271484375, + "learning_rate": 5.444695801723409e-07, + "loss": 0.6353, + "step": 298450 + }, + { + "epoch": 1.9067758711012868, + "grad_norm": 0.8612496256828308, + "learning_rate": 5.437313561221291e-07, + "loss": 
0.8579, + "step": 298460 + }, + { + "epoch": 1.9068397582510253, + "grad_norm": 1.0620536804199219, + "learning_rate": 5.429936301393268e-07, + "loss": 0.9749, + "step": 298470 + }, + { + "epoch": 1.9069036454007642, + "grad_norm": 0.9960135221481323, + "learning_rate": 5.422564022313614e-07, + "loss": 0.837, + "step": 298480 + }, + { + "epoch": 1.9069675325505027, + "grad_norm": 0.6055924892425537, + "learning_rate": 5.415196724056604e-07, + "loss": 0.7759, + "step": 298490 + }, + { + "epoch": 1.9070314197002416, + "grad_norm": 1.1424163579940796, + "learning_rate": 5.407834406696511e-07, + "loss": 0.8854, + "step": 298500 + }, + { + "epoch": 1.90709530684998, + "grad_norm": 0.9661005735397339, + "learning_rate": 5.400477070307331e-07, + "loss": 1.0237, + "step": 298510 + }, + { + "epoch": 1.907159193999719, + "grad_norm": 0.5954277515411377, + "learning_rate": 5.393124714963283e-07, + "loss": 0.9063, + "step": 298520 + }, + { + "epoch": 1.9072230811494575, + "grad_norm": 0.6582477688789368, + "learning_rate": 5.385777340738363e-07, + "loss": 1.1954, + "step": 298530 + }, + { + "epoch": 1.9072869682991964, + "grad_norm": 0.908450186252594, + "learning_rate": 5.378434947706568e-07, + "loss": 0.8158, + "step": 298540 + }, + { + "epoch": 1.907350855448935, + "grad_norm": 1.31619393825531, + "learning_rate": 5.371097535941838e-07, + "loss": 0.7553, + "step": 298550 + }, + { + "epoch": 1.9074147425986738, + "grad_norm": 0.9759384989738464, + "learning_rate": 5.364498124398043e-07, + "loss": 0.9226, + "step": 298560 + }, + { + "epoch": 1.9074786297484123, + "grad_norm": 1.0978734493255615, + "learning_rate": 5.357170177244242e-07, + "loss": 0.7859, + "step": 298570 + }, + { + "epoch": 1.9075425168981512, + "grad_norm": 1.4422558546066284, + "learning_rate": 5.349847211571724e-07, + "loss": 0.9151, + "step": 298580 + }, + { + "epoch": 1.9076064040478897, + "grad_norm": 1.002185344696045, + "learning_rate": 5.342529227454152e-07, + "loss": 0.8842, + "step": 298590 + }, + 
{ + "epoch": 1.9076702911976287, + "grad_norm": 1.7048262357711792, + "learning_rate": 5.335216224965189e-07, + "loss": 0.6968, + "step": 298600 + }, + { + "epoch": 1.9077341783473671, + "grad_norm": 0.5697035193443298, + "learning_rate": 5.327908204178666e-07, + "loss": 0.7287, + "step": 298610 + }, + { + "epoch": 1.907798065497106, + "grad_norm": 0.9011799097061157, + "learning_rate": 5.320605165167969e-07, + "loss": 1.1011, + "step": 298620 + }, + { + "epoch": 1.9078619526468445, + "grad_norm": 0.9827040433883667, + "learning_rate": 5.31330710800676e-07, + "loss": 0.6246, + "step": 298630 + }, + { + "epoch": 1.9079258397965835, + "grad_norm": 0.7323964834213257, + "learning_rate": 5.306014032768536e-07, + "loss": 0.8952, + "step": 298640 + }, + { + "epoch": 1.907989726946322, + "grad_norm": 0.8654759526252747, + "learning_rate": 5.298725939526738e-07, + "loss": 0.8405, + "step": 298650 + }, + { + "epoch": 1.9080536140960607, + "grad_norm": 0.9938244223594666, + "learning_rate": 5.291442828354698e-07, + "loss": 0.9068, + "step": 298660 + }, + { + "epoch": 1.9081175012457994, + "grad_norm": 0.8327669501304626, + "learning_rate": 5.284164699325855e-07, + "loss": 0.9399, + "step": 298670 + }, + { + "epoch": 1.908181388395538, + "grad_norm": 0.8316702842712402, + "learning_rate": 5.27689155251343e-07, + "loss": 0.842, + "step": 298680 + }, + { + "epoch": 1.9082452755452768, + "grad_norm": 0.7041569948196411, + "learning_rate": 5.269623387990697e-07, + "loss": 0.9606, + "step": 298690 + }, + { + "epoch": 1.9083091626950155, + "grad_norm": 1.169980764389038, + "learning_rate": 5.262360205830874e-07, + "loss": 0.8678, + "step": 298700 + }, + { + "epoch": 1.9083730498447542, + "grad_norm": 1.0179364681243896, + "learning_rate": 5.255102006107127e-07, + "loss": 0.8915, + "step": 298710 + }, + { + "epoch": 1.908436936994493, + "grad_norm": 1.13489830493927, + "learning_rate": 5.247848788892451e-07, + "loss": 0.9829, + "step": 298720 + }, + { + "epoch": 1.9085008241442316, 
+ "grad_norm": 0.8179365396499634, + "learning_rate": 5.240600554260011e-07, + "loss": 0.7415, + "step": 298730 + }, + { + "epoch": 1.9085647112939703, + "grad_norm": 0.911197304725647, + "learning_rate": 5.233357302282749e-07, + "loss": 0.6342, + "step": 298740 + }, + { + "epoch": 1.908628598443709, + "grad_norm": 0.8303636312484741, + "learning_rate": 5.226119033033605e-07, + "loss": 0.8932, + "step": 298750 + }, + { + "epoch": 1.9086924855934477, + "grad_norm": 1.0183674097061157, + "learning_rate": 5.218885746585467e-07, + "loss": 0.9725, + "step": 298760 + }, + { + "epoch": 1.9087563727431864, + "grad_norm": 0.7333863973617554, + "learning_rate": 5.211657443011219e-07, + "loss": 0.9961, + "step": 298770 + }, + { + "epoch": 1.9088202598929251, + "grad_norm": 0.9927520155906677, + "learning_rate": 5.204434122383583e-07, + "loss": 0.6939, + "step": 298780 + }, + { + "epoch": 1.9088841470426638, + "grad_norm": 0.9672486186027527, + "learning_rate": 5.197215784775389e-07, + "loss": 0.675, + "step": 298790 + }, + { + "epoch": 1.9089480341924026, + "grad_norm": 0.8010286092758179, + "learning_rate": 5.1900024302593e-07, + "loss": 1.0252, + "step": 298800 + }, + { + "epoch": 1.9090119213421413, + "grad_norm": 0.6621715426445007, + "learning_rate": 5.182794058907925e-07, + "loss": 0.8564, + "step": 298810 + }, + { + "epoch": 1.90907580849188, + "grad_norm": 1.5702663660049438, + "learning_rate": 5.175590670793984e-07, + "loss": 0.751, + "step": 298820 + }, + { + "epoch": 1.9091396956416187, + "grad_norm": 1.3269661664962769, + "learning_rate": 5.168392265989808e-07, + "loss": 0.9925, + "step": 298830 + }, + { + "epoch": 1.9092035827913574, + "grad_norm": 0.9862305521965027, + "learning_rate": 5.161198844568171e-07, + "loss": 0.8104, + "step": 298840 + }, + { + "epoch": 1.909267469941096, + "grad_norm": 1.0738799571990967, + "learning_rate": 5.154010406601239e-07, + "loss": 0.9463, + "step": 298850 + }, + { + "epoch": 1.9093313570908348, + "grad_norm": 
1.043198585510254, + "learning_rate": 5.146826952161565e-07, + "loss": 0.9851, + "step": 298860 + }, + { + "epoch": 1.9093952442405735, + "grad_norm": 1.2452592849731445, + "learning_rate": 5.139648481321424e-07, + "loss": 0.8567, + "step": 298870 + }, + { + "epoch": 1.9094591313903122, + "grad_norm": 0.4939374029636383, + "learning_rate": 5.132474994153147e-07, + "loss": 0.8316, + "step": 298880 + }, + { + "epoch": 1.909523018540051, + "grad_norm": 0.9886005520820618, + "learning_rate": 5.125306490728954e-07, + "loss": 0.7403, + "step": 298890 + }, + { + "epoch": 1.9095869056897894, + "grad_norm": 1.311178207397461, + "learning_rate": 5.118142971121065e-07, + "loss": 0.8995, + "step": 298900 + }, + { + "epoch": 1.9096507928395283, + "grad_norm": 1.4442360401153564, + "learning_rate": 5.110984435401589e-07, + "loss": 0.682, + "step": 298910 + }, + { + "epoch": 1.9097146799892668, + "grad_norm": 1.067458987236023, + "learning_rate": 5.10383088364269e-07, + "loss": 0.8009, + "step": 298920 + }, + { + "epoch": 1.9097785671390057, + "grad_norm": 0.740051805973053, + "learning_rate": 5.096682315916313e-07, + "loss": 1.2432, + "step": 298930 + }, + { + "epoch": 1.9098424542887442, + "grad_norm": 0.8193008899688721, + "learning_rate": 5.089538732294507e-07, + "loss": 1.0796, + "step": 298940 + }, + { + "epoch": 1.9099063414384831, + "grad_norm": 0.8132315874099731, + "learning_rate": 5.082400132849219e-07, + "loss": 0.7697, + "step": 298950 + }, + { + "epoch": 1.9099702285882216, + "grad_norm": 0.9826362729072571, + "learning_rate": 5.075266517652333e-07, + "loss": 0.8249, + "step": 298960 + }, + { + "epoch": 1.9100341157379606, + "grad_norm": 1.4442051649093628, + "learning_rate": 5.068137886775681e-07, + "loss": 0.8598, + "step": 298970 + }, + { + "epoch": 1.910098002887699, + "grad_norm": 1.0820285081863403, + "learning_rate": 5.061014240291039e-07, + "loss": 0.7014, + "step": 298980 + }, + { + "epoch": 1.910161890037438, + "grad_norm": 0.83199542760849, + 
"learning_rate": 5.053895578270185e-07, + "loss": 0.7628, + "step": 298990 + }, + { + "epoch": 1.9102257771871765, + "grad_norm": 0.9732728004455566, + "learning_rate": 5.04678190078478e-07, + "loss": 0.9333, + "step": 299000 + }, + { + "epoch": 1.9102896643369154, + "grad_norm": 0.919940173625946, + "learning_rate": 5.039673207906492e-07, + "loss": 0.9274, + "step": 299010 + }, + { + "epoch": 1.9103535514866539, + "grad_norm": 0.6937656998634338, + "learning_rate": 5.032569499706874e-07, + "loss": 0.693, + "step": 299020 + }, + { + "epoch": 1.9104174386363928, + "grad_norm": 1.305191159248352, + "learning_rate": 5.025470776257535e-07, + "loss": 0.731, + "step": 299030 + }, + { + "epoch": 1.9104813257861313, + "grad_norm": 1.3394445180892944, + "learning_rate": 5.018377037629862e-07, + "loss": 0.8673, + "step": 299040 + }, + { + "epoch": 1.9105452129358702, + "grad_norm": 0.7385538220405579, + "learning_rate": 5.011288283895354e-07, + "loss": 0.8233, + "step": 299050 + }, + { + "epoch": 1.9106091000856087, + "grad_norm": 1.0281059741973877, + "learning_rate": 5.004204515125454e-07, + "loss": 0.7892, + "step": 299060 + }, + { + "epoch": 1.9106729872353476, + "grad_norm": 1.4805169105529785, + "learning_rate": 4.997125731391383e-07, + "loss": 0.9471, + "step": 299070 + }, + { + "epoch": 1.910736874385086, + "grad_norm": 2.7536349296569824, + "learning_rate": 4.990051932764528e-07, + "loss": 1.0098, + "step": 299080 + }, + { + "epoch": 1.910800761534825, + "grad_norm": 0.9810320138931274, + "learning_rate": 4.98298311931611e-07, + "loss": 0.9173, + "step": 299090 + }, + { + "epoch": 1.9108646486845635, + "grad_norm": 0.8710843920707703, + "learning_rate": 4.975919291117292e-07, + "loss": 0.878, + "step": 299100 + }, + { + "epoch": 1.9109285358343024, + "grad_norm": 1.1887816190719604, + "learning_rate": 4.968860448239187e-07, + "loss": 0.95, + "step": 299110 + }, + { + "epoch": 1.910992422984041, + "grad_norm": 0.7549880146980286, + "learning_rate": 
4.96180659075296e-07, + "loss": 1.0875, + "step": 299120 + }, + { + "epoch": 1.9110563101337799, + "grad_norm": 0.7689534425735474, + "learning_rate": 4.954757718729553e-07, + "loss": 0.6713, + "step": 299130 + }, + { + "epoch": 1.9111201972835183, + "grad_norm": 0.8089300990104675, + "learning_rate": 4.947713832240075e-07, + "loss": 0.6852, + "step": 299140 + }, + { + "epoch": 1.911184084433257, + "grad_norm": 0.9446051120758057, + "learning_rate": 4.940674931355361e-07, + "loss": 0.8864, + "step": 299150 + }, + { + "epoch": 1.9112479715829958, + "grad_norm": 1.5336053371429443, + "learning_rate": 4.933641016146407e-07, + "loss": 0.8913, + "step": 299160 + }, + { + "epoch": 1.9113118587327345, + "grad_norm": 0.8503767848014832, + "learning_rate": 4.926612086683879e-07, + "loss": 0.7915, + "step": 299170 + }, + { + "epoch": 1.9113757458824732, + "grad_norm": 1.0193933248519897, + "learning_rate": 4.919588143038778e-07, + "loss": 0.7958, + "step": 299180 + }, + { + "epoch": 1.9114396330322119, + "grad_norm": 2.003002166748047, + "learning_rate": 4.912569185281657e-07, + "loss": 0.9709, + "step": 299190 + }, + { + "epoch": 1.9115035201819506, + "grad_norm": 0.6801881790161133, + "learning_rate": 4.905555213483293e-07, + "loss": 0.8338, + "step": 299200 + }, + { + "epoch": 1.9115674073316893, + "grad_norm": 2.296818733215332, + "learning_rate": 4.898546227714295e-07, + "loss": 0.9052, + "step": 299210 + }, + { + "epoch": 1.911631294481428, + "grad_norm": 1.7277367115020752, + "learning_rate": 4.891542228045276e-07, + "loss": 0.8738, + "step": 299220 + }, + { + "epoch": 1.9116951816311667, + "grad_norm": 0.8613941669464111, + "learning_rate": 4.884543214546733e-07, + "loss": 0.94, + "step": 299230 + }, + { + "epoch": 1.9117590687809054, + "grad_norm": 1.1765998601913452, + "learning_rate": 4.877549187289221e-07, + "loss": 0.9561, + "step": 299240 + }, + { + "epoch": 1.911822955930644, + "grad_norm": 0.697990357875824, + "learning_rate": 4.870560146343073e-07, + "loss": 
0.8642, + "step": 299250 + }, + { + "epoch": 1.9118868430803828, + "grad_norm": 0.831953763961792, + "learning_rate": 4.863576091778788e-07, + "loss": 0.836, + "step": 299260 + }, + { + "epoch": 1.9119507302301215, + "grad_norm": 0.548539400100708, + "learning_rate": 4.856597023666586e-07, + "loss": 0.6535, + "step": 299270 + }, + { + "epoch": 1.9120146173798602, + "grad_norm": 0.8374563455581665, + "learning_rate": 4.849622942076859e-07, + "loss": 0.8024, + "step": 299280 + }, + { + "epoch": 1.912078504529599, + "grad_norm": 1.9350553750991821, + "learning_rate": 4.84265384707977e-07, + "loss": 0.8086, + "step": 299290 + }, + { + "epoch": 1.9121423916793376, + "grad_norm": 1.3241461515426636, + "learning_rate": 4.835689738745541e-07, + "loss": 1.0187, + "step": 299300 + }, + { + "epoch": 1.9122062788290763, + "grad_norm": 0.6663087606430054, + "learning_rate": 4.828730617144283e-07, + "loss": 0.9202, + "step": 299310 + }, + { + "epoch": 1.912270165978815, + "grad_norm": 0.7999659180641174, + "learning_rate": 4.821776482346108e-07, + "loss": 0.8876, + "step": 299320 + }, + { + "epoch": 1.9123340531285538, + "grad_norm": 1.4280097484588623, + "learning_rate": 4.814827334421068e-07, + "loss": 1.0624, + "step": 299330 + }, + { + "epoch": 1.9123979402782925, + "grad_norm": 4.343797206878662, + "learning_rate": 4.807883173439054e-07, + "loss": 1.1265, + "step": 299340 + }, + { + "epoch": 1.9124618274280312, + "grad_norm": 0.9057490229606628, + "learning_rate": 4.800943999470064e-07, + "loss": 0.8631, + "step": 299350 + }, + { + "epoch": 1.9125257145777699, + "grad_norm": 0.8484719395637512, + "learning_rate": 4.794009812583988e-07, + "loss": 0.9666, + "step": 299360 + }, + { + "epoch": 1.9125896017275086, + "grad_norm": 1.200207233428955, + "learning_rate": 4.787080612850659e-07, + "loss": 0.8848, + "step": 299370 + }, + { + "epoch": 1.9126534888772473, + "grad_norm": 1.1934592723846436, + "learning_rate": 4.780156400339853e-07, + "loss": 0.6613, + "step": 299380 + }, + 
{ + "epoch": 1.9127173760269858, + "grad_norm": 1.0981838703155518, + "learning_rate": 4.773237175121293e-07, + "loss": 1.0469, + "step": 299390 + }, + { + "epoch": 1.9127812631767247, + "grad_norm": 1.2460277080535889, + "learning_rate": 4.7663229372646443e-07, + "loss": 0.9713, + "step": 299400 + }, + { + "epoch": 1.9128451503264632, + "grad_norm": 1.1386065483093262, + "learning_rate": 4.7594136868395756e-07, + "loss": 1.0489, + "step": 299410 + }, + { + "epoch": 1.9129090374762021, + "grad_norm": 0.6835050582885742, + "learning_rate": 4.752509423915641e-07, + "loss": 0.9037, + "step": 299420 + }, + { + "epoch": 1.9129729246259406, + "grad_norm": 0.7506453394889832, + "learning_rate": 4.745610148562396e-07, + "loss": 0.7473, + "step": 299430 + }, + { + "epoch": 1.9130368117756795, + "grad_norm": 1.6977883577346802, + "learning_rate": 4.7387158608492853e-07, + "loss": 0.9052, + "step": 299440 + }, + { + "epoch": 1.913100698925418, + "grad_norm": 1.0963678359985352, + "learning_rate": 4.731826560845809e-07, + "loss": 1.1013, + "step": 299450 + }, + { + "epoch": 1.913164586075157, + "grad_norm": 0.818276584148407, + "learning_rate": 4.7249422486213003e-07, + "loss": 0.7577, + "step": 299460 + }, + { + "epoch": 1.9132284732248954, + "grad_norm": 1.0423306226730347, + "learning_rate": 4.7180629242450923e-07, + "loss": 0.9088, + "step": 299470 + }, + { + "epoch": 1.9132923603746343, + "grad_norm": 0.8312962055206299, + "learning_rate": 4.7111885877864635e-07, + "loss": 0.8614, + "step": 299480 + }, + { + "epoch": 1.9133562475243728, + "grad_norm": 1.4227992296218872, + "learning_rate": 4.704319239314636e-07, + "loss": 0.8673, + "step": 299490 + }, + { + "epoch": 1.9134201346741118, + "grad_norm": 1.1332935094833374, + "learning_rate": 4.697454878898888e-07, + "loss": 1.0211, + "step": 299500 + }, + { + "epoch": 1.9134840218238502, + "grad_norm": 0.896660566329956, + "learning_rate": 4.690595506608164e-07, + "loss": 0.7276, + "step": 299510 + }, + { + "epoch": 
1.9135479089735892, + "grad_norm": 0.8751005530357361, + "learning_rate": 4.683741122511742e-07, + "loss": 0.7033, + "step": 299520 + }, + { + "epoch": 1.9136117961233277, + "grad_norm": 0.6766482591629028, + "learning_rate": 4.676891726678456e-07, + "loss": 1.0435, + "step": 299530 + }, + { + "epoch": 1.9136756832730666, + "grad_norm": 1.7327042818069458, + "learning_rate": 4.670047319177473e-07, + "loss": 0.6154, + "step": 299540 + }, + { + "epoch": 1.913739570422805, + "grad_norm": 1.3828660249710083, + "learning_rate": 4.663207900077571e-07, + "loss": 0.7493, + "step": 299550 + }, + { + "epoch": 1.913803457572544, + "grad_norm": 1.1648141145706177, + "learning_rate": 4.6563734694477503e-07, + "loss": 1.0343, + "step": 299560 + }, + { + "epoch": 1.9138673447222825, + "grad_norm": 0.800993800163269, + "learning_rate": 4.649544027356734e-07, + "loss": 0.8081, + "step": 299570 + }, + { + "epoch": 1.9139312318720214, + "grad_norm": 0.8474355936050415, + "learning_rate": 4.6427195738733553e-07, + "loss": 0.943, + "step": 299580 + }, + { + "epoch": 1.91399511902176, + "grad_norm": 0.9152101874351501, + "learning_rate": 4.635900109066338e-07, + "loss": 1.0417, + "step": 299590 + }, + { + "epoch": 1.9140590061714988, + "grad_norm": 1.0631303787231445, + "learning_rate": 4.629085633004404e-07, + "loss": 0.9787, + "step": 299600 + }, + { + "epoch": 1.9141228933212373, + "grad_norm": 1.1691399812698364, + "learning_rate": 4.622276145756055e-07, + "loss": 0.9034, + "step": 299610 + }, + { + "epoch": 1.914186780470976, + "grad_norm": 0.645896315574646, + "learning_rate": 4.615471647390013e-07, + "loss": 0.8945, + "step": 299620 + }, + { + "epoch": 1.9142506676207147, + "grad_norm": 1.9830036163330078, + "learning_rate": 4.608672137974668e-07, + "loss": 0.6084, + "step": 299630 + }, + { + "epoch": 1.9143145547704534, + "grad_norm": 0.9347323775291443, + "learning_rate": 4.6018776175786317e-07, + "loss": 0.8824, + "step": 299640 + }, + { + "epoch": 1.9143784419201921, + 
"grad_norm": 0.5798866748809814, + "learning_rate": 4.5950880862702385e-07, + "loss": 0.9729, + "step": 299650 + }, + { + "epoch": 1.9144423290699308, + "grad_norm": 2.9061269760131836, + "learning_rate": 4.588303544117933e-07, + "loss": 0.8328, + "step": 299660 + }, + { + "epoch": 1.9145062162196695, + "grad_norm": 0.9483212232589722, + "learning_rate": 4.5815239911899953e-07, + "loss": 0.9326, + "step": 299670 + }, + { + "epoch": 1.9145701033694082, + "grad_norm": 1.4846980571746826, + "learning_rate": 4.574749427554648e-07, + "loss": 0.8387, + "step": 299680 + }, + { + "epoch": 1.914633990519147, + "grad_norm": 0.7667824625968933, + "learning_rate": 4.567979853280224e-07, + "loss": 0.8384, + "step": 299690 + }, + { + "epoch": 1.9146978776688857, + "grad_norm": 0.8439723253250122, + "learning_rate": 4.5612152684348373e-07, + "loss": 0.8051, + "step": 299700 + }, + { + "epoch": 1.9147617648186244, + "grad_norm": 0.8428217172622681, + "learning_rate": 4.554455673086655e-07, + "loss": 0.9557, + "step": 299710 + }, + { + "epoch": 1.914825651968363, + "grad_norm": 0.8997921943664551, + "learning_rate": 4.547701067303733e-07, + "loss": 1.1864, + "step": 299720 + }, + { + "epoch": 1.9148895391181018, + "grad_norm": 0.9388466477394104, + "learning_rate": 4.540951451154074e-07, + "loss": 0.8625, + "step": 299730 + }, + { + "epoch": 1.9149534262678405, + "grad_norm": 0.8117624521255493, + "learning_rate": 4.5342068247056225e-07, + "loss": 0.9298, + "step": 299740 + }, + { + "epoch": 1.9150173134175792, + "grad_norm": 1.0533971786499023, + "learning_rate": 4.5274671880264353e-07, + "loss": 0.7363, + "step": 299750 + }, + { + "epoch": 1.915081200567318, + "grad_norm": 1.016930341720581, + "learning_rate": 4.520732541184236e-07, + "loss": 0.8498, + "step": 299760 + }, + { + "epoch": 1.9151450877170566, + "grad_norm": 0.7840309739112854, + "learning_rate": 4.514002884246915e-07, + "loss": 0.564, + "step": 299770 + }, + { + "epoch": 1.9152089748667953, + "grad_norm": 
0.9618764519691467, + "learning_rate": 4.5072782172822514e-07, + "loss": 0.8378, + "step": 299780 + }, + { + "epoch": 1.915272862016534, + "grad_norm": 0.8250643610954285, + "learning_rate": 4.5005585403579687e-07, + "loss": 0.8388, + "step": 299790 + }, + { + "epoch": 1.9153367491662727, + "grad_norm": 0.9536775350570679, + "learning_rate": 4.493843853541679e-07, + "loss": 0.9822, + "step": 299800 + }, + { + "epoch": 1.9154006363160114, + "grad_norm": 0.9676626920700073, + "learning_rate": 4.487134156901107e-07, + "loss": 0.941, + "step": 299810 + }, + { + "epoch": 1.9154645234657501, + "grad_norm": 1.5679287910461426, + "learning_rate": 4.480429450503809e-07, + "loss": 0.993, + "step": 299820 + }, + { + "epoch": 1.9155284106154888, + "grad_norm": 1.0796259641647339, + "learning_rate": 4.4737297344171757e-07, + "loss": 0.7983, + "step": 299830 + }, + { + "epoch": 1.9155922977652275, + "grad_norm": 1.0324289798736572, + "learning_rate": 4.467035008708875e-07, + "loss": 0.7589, + "step": 299840 + }, + { + "epoch": 1.9156561849149663, + "grad_norm": 1.006419062614441, + "learning_rate": 4.4603452734461317e-07, + "loss": 1.0923, + "step": 299850 + }, + { + "epoch": 1.915720072064705, + "grad_norm": 0.981799840927124, + "learning_rate": 4.4536605286965015e-07, + "loss": 0.8727, + "step": 299860 + }, + { + "epoch": 1.9157839592144437, + "grad_norm": 1.746458888053894, + "learning_rate": 4.446980774527154e-07, + "loss": 1.0021, + "step": 299870 + }, + { + "epoch": 1.9158478463641821, + "grad_norm": 0.8302098512649536, + "learning_rate": 4.440306011005424e-07, + "loss": 0.7477, + "step": 299880 + }, + { + "epoch": 1.915911733513921, + "grad_norm": 1.0629510879516602, + "learning_rate": 4.4336362381985905e-07, + "loss": 0.8635, + "step": 299890 + }, + { + "epoch": 1.9159756206636596, + "grad_norm": 1.2623989582061768, + "learning_rate": 4.4269714561737117e-07, + "loss": 0.9379, + "step": 299900 + }, + { + "epoch": 1.9160395078133985, + "grad_norm": 1.230000376701355, + 
"learning_rate": 4.420311664997956e-07, + "loss": 0.8526, + "step": 299910 + }, + { + "epoch": 1.916103394963137, + "grad_norm": 1.3445703983306885, + "learning_rate": 4.413656864738436e-07, + "loss": 0.8425, + "step": 299920 + }, + { + "epoch": 1.916167282112876, + "grad_norm": 0.8907622694969177, + "learning_rate": 4.407007055462153e-07, + "loss": 0.7709, + "step": 299930 + }, + { + "epoch": 1.9162311692626144, + "grad_norm": 2.3243703842163086, + "learning_rate": 4.400362237236e-07, + "loss": 0.8596, + "step": 299940 + }, + { + "epoch": 1.9162950564123533, + "grad_norm": 3.5086824893951416, + "learning_rate": 4.393722410126977e-07, + "loss": 0.8725, + "step": 299950 + }, + { + "epoch": 1.9163589435620918, + "grad_norm": 0.6275745630264282, + "learning_rate": 4.3870875742019757e-07, + "loss": 0.8443, + "step": 299960 + }, + { + "epoch": 1.9164228307118307, + "grad_norm": 0.9403732419013977, + "learning_rate": 4.3804577295277204e-07, + "loss": 0.8953, + "step": 299970 + }, + { + "epoch": 1.9164867178615692, + "grad_norm": 0.9695448875427246, + "learning_rate": 4.373832876170991e-07, + "loss": 0.5875, + "step": 299980 + }, + { + "epoch": 1.9165506050113081, + "grad_norm": 0.9237552881240845, + "learning_rate": 4.3672130141986234e-07, + "loss": 0.9285, + "step": 299990 + }, + { + "epoch": 1.9166144921610466, + "grad_norm": 1.1848057508468628, + "learning_rate": 4.3605981436771195e-07, + "loss": 1.0084, + "step": 300000 + }, + { + "epoch": 1.9166783793107856, + "grad_norm": 0.8543856739997864, + "learning_rate": 4.353988264673259e-07, + "loss": 0.8232, + "step": 300010 + }, + { + "epoch": 1.916742266460524, + "grad_norm": 1.3468594551086426, + "learning_rate": 4.3473833772534887e-07, + "loss": 1.0356, + "step": 300020 + }, + { + "epoch": 1.916806153610263, + "grad_norm": 1.0246607065200806, + "learning_rate": 4.340783481484367e-07, + "loss": 0.945, + "step": 300030 + }, + { + "epoch": 1.9168700407600014, + "grad_norm": 0.7356663346290588, + "learning_rate": 
4.3341885774323966e-07, + "loss": 0.8515, + "step": 300040 + }, + { + "epoch": 1.9169339279097404, + "grad_norm": 0.770076334476471, + "learning_rate": 4.3275986651639677e-07, + "loss": 0.9127, + "step": 300050 + }, + { + "epoch": 1.9169978150594789, + "grad_norm": 1.0495332479476929, + "learning_rate": 4.3210137447453615e-07, + "loss": 0.664, + "step": 300060 + }, + { + "epoch": 1.9170617022092178, + "grad_norm": 0.8794049620628357, + "learning_rate": 4.3144338162430244e-07, + "loss": 0.6817, + "step": 300070 + }, + { + "epoch": 1.9171255893589563, + "grad_norm": 0.8359986543655396, + "learning_rate": 4.3078588797231814e-07, + "loss": 0.937, + "step": 300080 + }, + { + "epoch": 1.9171894765086952, + "grad_norm": 1.266853928565979, + "learning_rate": 4.3012889352520014e-07, + "loss": 0.797, + "step": 300090 + }, + { + "epoch": 1.9172533636584337, + "grad_norm": 2.2502448558807373, + "learning_rate": 4.29472398289571e-07, + "loss": 1.0825, + "step": 300100 + }, + { + "epoch": 1.9173172508081724, + "grad_norm": 3.827913761138916, + "learning_rate": 4.2881640227203646e-07, + "loss": 0.9192, + "step": 300110 + }, + { + "epoch": 1.917381137957911, + "grad_norm": 0.9653197526931763, + "learning_rate": 4.2816090547920793e-07, + "loss": 0.7286, + "step": 300120 + }, + { + "epoch": 1.9174450251076498, + "grad_norm": 1.0422720909118652, + "learning_rate": 4.2750590791768574e-07, + "loss": 0.8416, + "step": 300130 + }, + { + "epoch": 1.9175089122573885, + "grad_norm": 1.3192023038864136, + "learning_rate": 4.26851409594059e-07, + "loss": 0.9013, + "step": 300140 + }, + { + "epoch": 1.9175727994071272, + "grad_norm": 0.9541807770729065, + "learning_rate": 4.261974105149336e-07, + "loss": 1.204, + "step": 300150 + }, + { + "epoch": 1.917636686556866, + "grad_norm": 0.9419488906860352, + "learning_rate": 4.255439106868819e-07, + "loss": 0.6745, + "step": 300160 + }, + { + "epoch": 1.9177005737066046, + "grad_norm": 1.0936365127563477, + "learning_rate": 4.248909101164933e-07, + 
"loss": 0.972, + "step": 300170 + }, + { + "epoch": 1.9177644608563433, + "grad_norm": 1.177007794380188, + "learning_rate": 4.242384088103457e-07, + "loss": 0.9293, + "step": 300180 + }, + { + "epoch": 1.917828348006082, + "grad_norm": 0.6732769012451172, + "learning_rate": 4.235864067750006e-07, + "loss": 1.2595, + "step": 300190 + }, + { + "epoch": 1.9178922351558207, + "grad_norm": 0.7928746938705444, + "learning_rate": 4.229349040170305e-07, + "loss": 0.8386, + "step": 300200 + }, + { + "epoch": 1.9179561223055595, + "grad_norm": 1.158821702003479, + "learning_rate": 4.2228390054299683e-07, + "loss": 0.7762, + "step": 300210 + }, + { + "epoch": 1.9180200094552982, + "grad_norm": 0.8231037855148315, + "learning_rate": 4.2163339635946097e-07, + "loss": 0.8962, + "step": 300220 + }, + { + "epoch": 1.9180838966050369, + "grad_norm": 0.989878237247467, + "learning_rate": 4.2098339147296215e-07, + "loss": 0.9074, + "step": 300230 + }, + { + "epoch": 1.9181477837547756, + "grad_norm": 0.7574949264526367, + "learning_rate": 4.2033388589005075e-07, + "loss": 0.8568, + "step": 300240 + }, + { + "epoch": 1.9182116709045143, + "grad_norm": 1.9735184907913208, + "learning_rate": 4.196848796172714e-07, + "loss": 0.8463, + "step": 300250 + }, + { + "epoch": 1.918275558054253, + "grad_norm": 1.1065763235092163, + "learning_rate": 4.1903637266116347e-07, + "loss": 0.989, + "step": 300260 + }, + { + "epoch": 1.9183394452039917, + "grad_norm": 0.918474555015564, + "learning_rate": 4.183883650282494e-07, + "loss": 0.6855, + "step": 300270 + }, + { + "epoch": 1.9184033323537304, + "grad_norm": 0.5903297662734985, + "learning_rate": 4.1774085672505736e-07, + "loss": 0.9428, + "step": 300280 + }, + { + "epoch": 1.918467219503469, + "grad_norm": 0.7054703831672668, + "learning_rate": 4.170938477581099e-07, + "loss": 0.9902, + "step": 300290 + }, + { + "epoch": 1.9185311066532078, + "grad_norm": 0.6120290160179138, + "learning_rate": 4.16447338133924e-07, + "loss": 0.9751, + "step": 
300300 + }, + { + "epoch": 1.9185949938029465, + "grad_norm": 0.9264044165611267, + "learning_rate": 4.1580132785901116e-07, + "loss": 1.0271, + "step": 300310 + }, + { + "epoch": 1.9186588809526852, + "grad_norm": 0.8623595237731934, + "learning_rate": 4.151558169398717e-07, + "loss": 1.0952, + "step": 300320 + }, + { + "epoch": 1.918722768102424, + "grad_norm": 3.1767020225524902, + "learning_rate": 4.1451080538301155e-07, + "loss": 0.9888, + "step": 300330 + }, + { + "epoch": 1.9187866552521626, + "grad_norm": 1.0149223804473877, + "learning_rate": 4.1386629319492556e-07, + "loss": 0.9297, + "step": 300340 + }, + { + "epoch": 1.9188505424019011, + "grad_norm": 1.0157650709152222, + "learning_rate": 4.1322228038210286e-07, + "loss": 0.9221, + "step": 300350 + }, + { + "epoch": 1.91891442955164, + "grad_norm": 2.1870782375335693, + "learning_rate": 4.125787669510328e-07, + "loss": 0.8463, + "step": 300360 + }, + { + "epoch": 1.9189783167013785, + "grad_norm": 1.3804250955581665, + "learning_rate": 4.119357529081935e-07, + "loss": 0.8814, + "step": 300370 + }, + { + "epoch": 1.9190422038511175, + "grad_norm": 0.8124695420265198, + "learning_rate": 4.112932382600576e-07, + "loss": 0.8788, + "step": 300380 + }, + { + "epoch": 1.919106091000856, + "grad_norm": 0.8044643402099609, + "learning_rate": 4.106512230131032e-07, + "loss": 0.7449, + "step": 300390 + }, + { + "epoch": 1.9191699781505949, + "grad_norm": 0.8989754915237427, + "learning_rate": 4.100097071737863e-07, + "loss": 0.6831, + "step": 300400 + }, + { + "epoch": 1.9192338653003334, + "grad_norm": 0.6823838949203491, + "learning_rate": 4.0936869074857943e-07, + "loss": 0.7529, + "step": 300410 + }, + { + "epoch": 1.9192977524500723, + "grad_norm": 1.3462483882904053, + "learning_rate": 4.0872817374392746e-07, + "loss": 1.0186, + "step": 300420 + }, + { + "epoch": 1.9193616395998108, + "grad_norm": 0.971286952495575, + "learning_rate": 4.0808815616628636e-07, + "loss": 0.8376, + "step": 300430 + }, + { + 
"epoch": 1.9194255267495497, + "grad_norm": 1.3372255563735962, + "learning_rate": 4.07448638022101e-07, + "loss": 1.1777, + "step": 300440 + }, + { + "epoch": 1.9194894138992882, + "grad_norm": 1.064820647239685, + "learning_rate": 4.0680961931781615e-07, + "loss": 0.8509, + "step": 300450 + }, + { + "epoch": 1.919553301049027, + "grad_norm": 1.0935826301574707, + "learning_rate": 4.061711000598545e-07, + "loss": 0.8222, + "step": 300460 + }, + { + "epoch": 1.9196171881987656, + "grad_norm": 1.4883924722671509, + "learning_rate": 4.0553308025466083e-07, + "loss": 0.979, + "step": 300470 + }, + { + "epoch": 1.9196810753485045, + "grad_norm": 0.8018064498901367, + "learning_rate": 4.0489555990865233e-07, + "loss": 0.7354, + "step": 300480 + }, + { + "epoch": 1.919744962498243, + "grad_norm": 1.1229437589645386, + "learning_rate": 4.042585390282516e-07, + "loss": 0.9783, + "step": 300490 + }, + { + "epoch": 1.919808849647982, + "grad_norm": 0.7482228875160217, + "learning_rate": 4.0362201761987017e-07, + "loss": 0.8791, + "step": 300500 + }, + { + "epoch": 1.9198727367977204, + "grad_norm": 0.5550474524497986, + "learning_rate": 4.0298599568992513e-07, + "loss": 0.7997, + "step": 300510 + }, + { + "epoch": 1.9199366239474593, + "grad_norm": 0.8357752561569214, + "learning_rate": 4.023504732448169e-07, + "loss": 1.0888, + "step": 300520 + }, + { + "epoch": 1.9200005110971978, + "grad_norm": 1.0730481147766113, + "learning_rate": 4.0171545029095146e-07, + "loss": 0.994, + "step": 300530 + }, + { + "epoch": 1.9200643982469368, + "grad_norm": 0.8998647928237915, + "learning_rate": 4.010809268347182e-07, + "loss": 1.0256, + "step": 300540 + }, + { + "epoch": 1.9201282853966752, + "grad_norm": 0.7829686999320984, + "learning_rate": 4.004469028825064e-07, + "loss": 0.8715, + "step": 300550 + }, + { + "epoch": 1.9201921725464142, + "grad_norm": 0.7973158955574036, + "learning_rate": 3.99813378440711e-07, + "loss": 0.7151, + "step": 300560 + }, + { + "epoch": 
1.9202560596961527, + "grad_norm": 0.7357059121131897, + "learning_rate": 3.9918035351569903e-07, + "loss": 0.8495, + "step": 300570 + }, + { + "epoch": 1.9203199468458916, + "grad_norm": 0.7509437799453735, + "learning_rate": 3.985478281138544e-07, + "loss": 0.8773, + "step": 300580 + }, + { + "epoch": 1.92038383399563, + "grad_norm": 0.8090037107467651, + "learning_rate": 3.9791580224153856e-07, + "loss": 0.9297, + "step": 300590 + }, + { + "epoch": 1.9204477211453688, + "grad_norm": 0.6789357662200928, + "learning_rate": 3.9728427590512984e-07, + "loss": 0.9613, + "step": 300600 + }, + { + "epoch": 1.9205116082951075, + "grad_norm": 0.990573525428772, + "learning_rate": 3.9665324911097866e-07, + "loss": 0.7093, + "step": 300610 + }, + { + "epoch": 1.9205754954448462, + "grad_norm": 0.97950679063797, + "learning_rate": 3.960227218654411e-07, + "loss": 0.6383, + "step": 300620 + }, + { + "epoch": 1.9206393825945849, + "grad_norm": 0.8172192573547363, + "learning_rate": 3.953926941748676e-07, + "loss": 0.8158, + "step": 300630 + }, + { + "epoch": 1.9207032697443236, + "grad_norm": 1.0370420217514038, + "learning_rate": 3.947631660456086e-07, + "loss": 0.9763, + "step": 300640 + }, + { + "epoch": 1.9207671568940623, + "grad_norm": 0.6944184899330139, + "learning_rate": 3.941341374839924e-07, + "loss": 0.7228, + "step": 300650 + }, + { + "epoch": 1.920831044043801, + "grad_norm": 1.1591705083847046, + "learning_rate": 3.93505608496364e-07, + "loss": 0.8656, + "step": 300660 + }, + { + "epoch": 1.9208949311935397, + "grad_norm": 1.8749659061431885, + "learning_rate": 3.9287757908905155e-07, + "loss": 1.0307, + "step": 300670 + }, + { + "epoch": 1.9209588183432784, + "grad_norm": 1.6521705389022827, + "learning_rate": 3.9225004926837784e-07, + "loss": 0.8042, + "step": 300680 + }, + { + "epoch": 1.9210227054930171, + "grad_norm": 1.2027360200881958, + "learning_rate": 3.9162301904066e-07, + "loss": 0.7991, + "step": 300690 + }, + { + "epoch": 1.9210865926427558, + 
"grad_norm": 0.7796891927719116, + "learning_rate": 3.9099648841221527e-07, + "loss": 0.8677, + "step": 300700 + }, + { + "epoch": 1.9211504797924945, + "grad_norm": 1.0626076459884644, + "learning_rate": 3.903704573893552e-07, + "loss": 1.1278, + "step": 300710 + }, + { + "epoch": 1.9212143669422332, + "grad_norm": 0.7469705939292908, + "learning_rate": 3.8974492597838586e-07, + "loss": 0.7027, + "step": 300720 + }, + { + "epoch": 1.921278254091972, + "grad_norm": 1.140273928642273, + "learning_rate": 3.8911989418560225e-07, + "loss": 0.9368, + "step": 300730 + }, + { + "epoch": 1.9213421412417107, + "grad_norm": 1.0759254693984985, + "learning_rate": 3.884953620172993e-07, + "loss": 1.0817, + "step": 300740 + }, + { + "epoch": 1.9214060283914494, + "grad_norm": 0.7694743871688843, + "learning_rate": 3.87871329479772e-07, + "loss": 0.8649, + "step": 300750 + }, + { + "epoch": 1.921469915541188, + "grad_norm": 0.9597426652908325, + "learning_rate": 3.872477965792931e-07, + "loss": 0.9085, + "step": 300760 + }, + { + "epoch": 1.9215338026909268, + "grad_norm": 1.4340089559555054, + "learning_rate": 3.8662476332215757e-07, + "loss": 0.9228, + "step": 300770 + }, + { + "epoch": 1.9215976898406655, + "grad_norm": 0.9579765200614929, + "learning_rate": 3.8600222971462706e-07, + "loss": 0.6958, + "step": 300780 + }, + { + "epoch": 1.9216615769904042, + "grad_norm": 1.5325475931167603, + "learning_rate": 3.8538019576298546e-07, + "loss": 0.8199, + "step": 300790 + }, + { + "epoch": 1.921725464140143, + "grad_norm": 1.028903603553772, + "learning_rate": 3.847586614734833e-07, + "loss": 0.8857, + "step": 300800 + }, + { + "epoch": 1.9217893512898816, + "grad_norm": 0.8717125058174133, + "learning_rate": 3.841376268523822e-07, + "loss": 0.6886, + "step": 300810 + }, + { + "epoch": 1.9218532384396203, + "grad_norm": 0.8490310311317444, + "learning_rate": 3.8351709190593834e-07, + "loss": 0.8223, + "step": 300820 + }, + { + "epoch": 1.921917125589359, + "grad_norm": 
1.1729625463485718, + "learning_rate": 3.828970566404022e-07, + "loss": 0.7418, + "step": 300830 + }, + { + "epoch": 1.9219810127390975, + "grad_norm": 0.8890892863273621, + "learning_rate": 3.8227752106201887e-07, + "loss": 1.0118, + "step": 300840 + }, + { + "epoch": 1.9220448998888364, + "grad_norm": 2.766529083251953, + "learning_rate": 3.816584851770277e-07, + "loss": 0.9139, + "step": 300850 + }, + { + "epoch": 1.922108787038575, + "grad_norm": 0.8185132741928101, + "learning_rate": 3.810399489916627e-07, + "loss": 0.7931, + "step": 300860 + }, + { + "epoch": 1.9221726741883138, + "grad_norm": 0.8807799220085144, + "learning_rate": 3.8042191251214663e-07, + "loss": 0.8877, + "step": 300870 + }, + { + "epoch": 1.9222365613380523, + "grad_norm": 1.0990222692489624, + "learning_rate": 3.7980437574471337e-07, + "loss": 0.841, + "step": 300880 + }, + { + "epoch": 1.9223004484877912, + "grad_norm": 0.7515807747840881, + "learning_rate": 3.7918733869557464e-07, + "loss": 0.8471, + "step": 300890 + }, + { + "epoch": 1.9223643356375297, + "grad_norm": 1.0475271940231323, + "learning_rate": 3.785708013709477e-07, + "loss": 0.8562, + "step": 300900 + }, + { + "epoch": 1.9224282227872687, + "grad_norm": 0.9003559947013855, + "learning_rate": 3.779547637770442e-07, + "loss": 0.9027, + "step": 300910 + }, + { + "epoch": 1.9224921099370071, + "grad_norm": 0.8411440253257751, + "learning_rate": 3.773392259200648e-07, + "loss": 0.8666, + "step": 300920 + }, + { + "epoch": 1.922555997086746, + "grad_norm": 1.0167624950408936, + "learning_rate": 3.767241878062044e-07, + "loss": 0.6216, + "step": 300930 + }, + { + "epoch": 1.9226198842364846, + "grad_norm": 0.7058795094490051, + "learning_rate": 3.761096494416694e-07, + "loss": 0.9369, + "step": 300940 + }, + { + "epoch": 1.9226837713862235, + "grad_norm": 0.8156857490539551, + "learning_rate": 3.754956108326324e-07, + "loss": 1.1769, + "step": 300950 + }, + { + "epoch": 1.922747658535962, + "grad_norm": 1.0297104120254517, + 
"learning_rate": 3.748820719852941e-07, + "loss": 0.9375, + "step": 300960 + }, + { + "epoch": 1.922811545685701, + "grad_norm": 1.000810146331787, + "learning_rate": 3.742690329058218e-07, + "loss": 0.9047, + "step": 300970 + }, + { + "epoch": 1.9228754328354394, + "grad_norm": 0.8792943954467773, + "learning_rate": 3.736564936003939e-07, + "loss": 0.7601, + "step": 300980 + }, + { + "epoch": 1.9229393199851783, + "grad_norm": 0.8752835988998413, + "learning_rate": 3.730444540751721e-07, + "loss": 0.9696, + "step": 300990 + }, + { + "epoch": 1.9230032071349168, + "grad_norm": 0.7234314680099487, + "learning_rate": 3.7243291433633475e-07, + "loss": 0.7312, + "step": 301000 + }, + { + "epoch": 1.9230670942846557, + "grad_norm": 0.7834398150444031, + "learning_rate": 3.7182187439002704e-07, + "loss": 0.9055, + "step": 301010 + }, + { + "epoch": 1.9231309814343942, + "grad_norm": 0.5239536762237549, + "learning_rate": 3.712113342424051e-07, + "loss": 0.7016, + "step": 301020 + }, + { + "epoch": 1.9231948685841331, + "grad_norm": 1.0821951627731323, + "learning_rate": 3.7060129389962504e-07, + "loss": 0.8404, + "step": 301030 + }, + { + "epoch": 1.9232587557338716, + "grad_norm": 0.8091372847557068, + "learning_rate": 3.69991753367821e-07, + "loss": 1.1122, + "step": 301040 + }, + { + "epoch": 1.9233226428836105, + "grad_norm": 0.5722392797470093, + "learning_rate": 3.693827126531435e-07, + "loss": 1.0596, + "step": 301050 + }, + { + "epoch": 1.923386530033349, + "grad_norm": 1.005053997039795, + "learning_rate": 3.687741717617099e-07, + "loss": 0.6983, + "step": 301060 + }, + { + "epoch": 1.923450417183088, + "grad_norm": 1.0301357507705688, + "learning_rate": 3.6816613069966535e-07, + "loss": 1.074, + "step": 301070 + }, + { + "epoch": 1.9235143043328264, + "grad_norm": 0.8463405966758728, + "learning_rate": 3.675585894731159e-07, + "loss": 0.7987, + "step": 301080 + }, + { + "epoch": 1.9235781914825651, + "grad_norm": 1.8486971855163574, + "learning_rate": 
3.669515480882013e-07, + "loss": 1.0181, + "step": 301090 + }, + { + "epoch": 1.9236420786323039, + "grad_norm": 0.6684698462486267, + "learning_rate": 3.66345006551011e-07, + "loss": 0.7149, + "step": 301100 + }, + { + "epoch": 1.9237059657820426, + "grad_norm": 0.8282666802406311, + "learning_rate": 3.6573896486767344e-07, + "loss": 0.7379, + "step": 301110 + }, + { + "epoch": 1.9237698529317813, + "grad_norm": 0.6827027797698975, + "learning_rate": 3.651334230442838e-07, + "loss": 0.7027, + "step": 301120 + }, + { + "epoch": 1.92383374008152, + "grad_norm": 1.1820034980773926, + "learning_rate": 3.6452838108694264e-07, + "loss": 0.9152, + "step": 301130 + }, + { + "epoch": 1.9238976272312587, + "grad_norm": 0.8771212697029114, + "learning_rate": 3.639238390017341e-07, + "loss": 1.0107, + "step": 301140 + }, + { + "epoch": 1.9239615143809974, + "grad_norm": 1.2758761644363403, + "learning_rate": 3.6331979679476435e-07, + "loss": 0.8095, + "step": 301150 + }, + { + "epoch": 1.924025401530736, + "grad_norm": 1.9822547435760498, + "learning_rate": 3.627162544720952e-07, + "loss": 1.2276, + "step": 301160 + }, + { + "epoch": 1.9240892886804748, + "grad_norm": 0.8313500285148621, + "learning_rate": 3.6211321203982183e-07, + "loss": 0.879, + "step": 301170 + }, + { + "epoch": 1.9241531758302135, + "grad_norm": 1.3029310703277588, + "learning_rate": 3.6151066950401155e-07, + "loss": 0.7567, + "step": 301180 + }, + { + "epoch": 1.9242170629799522, + "grad_norm": 1.1256672143936157, + "learning_rate": 3.609086268707318e-07, + "loss": 0.8885, + "step": 301190 + }, + { + "epoch": 1.924280950129691, + "grad_norm": 0.8242726922035217, + "learning_rate": 3.603070841460443e-07, + "loss": 0.8574, + "step": 301200 + }, + { + "epoch": 1.9243448372794296, + "grad_norm": 0.9224770665168762, + "learning_rate": 3.5970604133601095e-07, + "loss": 0.7343, + "step": 301210 + }, + { + "epoch": 1.9244087244291683, + "grad_norm": 1.151964783668518, + "learning_rate": 3.591054984466824e-07, + 
"loss": 1.0734, + "step": 301220 + }, + { + "epoch": 1.924472611578907, + "grad_norm": 1.1502186059951782, + "learning_rate": 3.5850545548410387e-07, + "loss": 1.0551, + "step": 301230 + }, + { + "epoch": 1.9245364987286457, + "grad_norm": 0.8960140347480774, + "learning_rate": 3.5790591245432603e-07, + "loss": 0.8229, + "step": 301240 + }, + { + "epoch": 1.9246003858783844, + "grad_norm": 0.9007967114448547, + "learning_rate": 3.5730686936337744e-07, + "loss": 0.7307, + "step": 301250 + }, + { + "epoch": 1.9246642730281232, + "grad_norm": 1.0921900272369385, + "learning_rate": 3.5670832621729766e-07, + "loss": 0.8592, + "step": 301260 + }, + { + "epoch": 1.9247281601778619, + "grad_norm": 1.049098014831543, + "learning_rate": 3.5611028302211523e-07, + "loss": 0.802, + "step": 301270 + }, + { + "epoch": 1.9247920473276006, + "grad_norm": 1.1155762672424316, + "learning_rate": 3.555127397838476e-07, + "loss": 0.7605, + "step": 301280 + }, + { + "epoch": 1.9248559344773393, + "grad_norm": 0.8949849605560303, + "learning_rate": 3.549156965085176e-07, + "loss": 0.7584, + "step": 301290 + }, + { + "epoch": 1.924919821627078, + "grad_norm": 1.0263363122940063, + "learning_rate": 3.543191532021317e-07, + "loss": 0.8152, + "step": 301300 + }, + { + "epoch": 1.9249837087768167, + "grad_norm": 1.013625979423523, + "learning_rate": 3.537231098707072e-07, + "loss": 0.8231, + "step": 301310 + }, + { + "epoch": 1.9250475959265554, + "grad_norm": 1.2236038446426392, + "learning_rate": 3.531275665202338e-07, + "loss": 1.0038, + "step": 301320 + }, + { + "epoch": 1.9251114830762939, + "grad_norm": 1.1396390199661255, + "learning_rate": 3.5253252315672337e-07, + "loss": 1.084, + "step": 301330 + }, + { + "epoch": 1.9251753702260328, + "grad_norm": 0.8289651870727539, + "learning_rate": 3.5193797978615996e-07, + "loss": 0.8872, + "step": 301340 + }, + { + "epoch": 1.9252392573757713, + "grad_norm": 1.6858197450637817, + "learning_rate": 3.5134393641452766e-07, + "loss": 0.8278, + 
"step": 301350 + }, + { + "epoch": 1.9253031445255102, + "grad_norm": 1.236825942993164, + "learning_rate": 3.507503930478162e-07, + "loss": 0.8035, + "step": 301360 + }, + { + "epoch": 1.9253670316752487, + "grad_norm": 1.3187953233718872, + "learning_rate": 3.501573496920041e-07, + "loss": 0.759, + "step": 301370 + }, + { + "epoch": 1.9254309188249876, + "grad_norm": 1.0832971334457397, + "learning_rate": 3.4956480635305877e-07, + "loss": 0.6859, + "step": 301380 + }, + { + "epoch": 1.925494805974726, + "grad_norm": 0.6571474671363831, + "learning_rate": 3.4897276303695324e-07, + "loss": 0.9639, + "step": 301390 + }, + { + "epoch": 1.925558693124465, + "grad_norm": 0.7257376313209534, + "learning_rate": 3.483812197496383e-07, + "loss": 0.7832, + "step": 301400 + }, + { + "epoch": 1.9256225802742035, + "grad_norm": 1.053246259689331, + "learning_rate": 3.47790176497087e-07, + "loss": 0.6743, + "step": 301410 + }, + { + "epoch": 1.9256864674239424, + "grad_norm": 0.8816425204277039, + "learning_rate": 3.4719963328523896e-07, + "loss": 1.0447, + "step": 301420 + }, + { + "epoch": 1.925750354573681, + "grad_norm": 1.0703054666519165, + "learning_rate": 3.4660959012005056e-07, + "loss": 0.8536, + "step": 301430 + }, + { + "epoch": 1.9258142417234199, + "grad_norm": 1.726242184638977, + "learning_rate": 3.4602004700745594e-07, + "loss": 1.0394, + "step": 301440 + }, + { + "epoch": 1.9258781288731583, + "grad_norm": 1.054643988609314, + "learning_rate": 3.4543100395340036e-07, + "loss": 0.7895, + "step": 301450 + }, + { + "epoch": 1.9259420160228973, + "grad_norm": 1.1033471822738647, + "learning_rate": 3.448424609638068e-07, + "loss": 0.7948, + "step": 301460 + }, + { + "epoch": 1.9260059031726358, + "grad_norm": 0.784725546836853, + "learning_rate": 3.44254418044615e-07, + "loss": 0.8194, + "step": 301470 + }, + { + "epoch": 1.9260697903223747, + "grad_norm": 0.9209342002868652, + "learning_rate": 3.436668752017314e-07, + "loss": 0.7762, + "step": 301480 + }, + { + 
"epoch": 1.9261336774721132, + "grad_norm": 0.9801011085510254, + "learning_rate": 3.430798324410844e-07, + "loss": 0.903, + "step": 301490 + }, + { + "epoch": 1.926197564621852, + "grad_norm": 1.1155471801757812, + "learning_rate": 3.4249328976858066e-07, + "loss": 1.0466, + "step": 301500 + }, + { + "epoch": 1.9262614517715906, + "grad_norm": 0.9278727769851685, + "learning_rate": 3.4190724719013744e-07, + "loss": 0.9425, + "step": 301510 + }, + { + "epoch": 1.9263253389213295, + "grad_norm": 1.0809649229049683, + "learning_rate": 3.4132170471163905e-07, + "loss": 0.6635, + "step": 301520 + }, + { + "epoch": 1.926389226071068, + "grad_norm": 2.041011095046997, + "learning_rate": 3.407366623389974e-07, + "loss": 1.1511, + "step": 301530 + }, + { + "epoch": 1.926453113220807, + "grad_norm": 1.0981637239456177, + "learning_rate": 3.401521200780966e-07, + "loss": 0.8078, + "step": 301540 + }, + { + "epoch": 1.9265170003705454, + "grad_norm": 1.3349926471710205, + "learning_rate": 3.3956807793482646e-07, + "loss": 1.0045, + "step": 301550 + }, + { + "epoch": 1.9265808875202843, + "grad_norm": 0.5713462233543396, + "learning_rate": 3.3898453591506565e-07, + "loss": 0.7818, + "step": 301560 + }, + { + "epoch": 1.9266447746700228, + "grad_norm": 1.0720760822296143, + "learning_rate": 3.3840149402469824e-07, + "loss": 0.7939, + "step": 301570 + }, + { + "epoch": 1.9267086618197615, + "grad_norm": 1.5037866830825806, + "learning_rate": 3.378189522695863e-07, + "loss": 0.6691, + "step": 301580 + }, + { + "epoch": 1.9267725489695002, + "grad_norm": 1.3440207242965698, + "learning_rate": 3.372369106556028e-07, + "loss": 0.7193, + "step": 301590 + }, + { + "epoch": 1.926836436119239, + "grad_norm": 0.6172181367874146, + "learning_rate": 3.366553691886154e-07, + "loss": 0.9351, + "step": 301600 + }, + { + "epoch": 1.9269003232689776, + "grad_norm": 0.747612476348877, + "learning_rate": 3.360743278744638e-07, + "loss": 0.6419, + "step": 301610 + }, + { + "epoch": 
1.9269642104187164, + "grad_norm": 1.2818886041641235, + "learning_rate": 3.3549378671901553e-07, + "loss": 0.9875, + "step": 301620 + }, + { + "epoch": 1.927028097568455, + "grad_norm": 0.7893770337104797, + "learning_rate": 3.3491374572810487e-07, + "loss": 1.0888, + "step": 301630 + }, + { + "epoch": 1.9270919847181938, + "grad_norm": 1.241309404373169, + "learning_rate": 3.3433420490758263e-07, + "loss": 0.7368, + "step": 301640 + }, + { + "epoch": 1.9271558718679325, + "grad_norm": 0.5212295055389404, + "learning_rate": 3.33755164263283e-07, + "loss": 0.6227, + "step": 301650 + }, + { + "epoch": 1.9272197590176712, + "grad_norm": 1.048384428024292, + "learning_rate": 3.3317662380103477e-07, + "loss": 0.8716, + "step": 301660 + }, + { + "epoch": 1.9272836461674099, + "grad_norm": 1.3901996612548828, + "learning_rate": 3.3259858352666094e-07, + "loss": 0.6373, + "step": 301670 + }, + { + "epoch": 1.9273475333171486, + "grad_norm": 0.9306744933128357, + "learning_rate": 3.3202104344599583e-07, + "loss": 1.1866, + "step": 301680 + }, + { + "epoch": 1.9274114204668873, + "grad_norm": 1.039258360862732, + "learning_rate": 3.3144400356484585e-07, + "loss": 0.8511, + "step": 301690 + }, + { + "epoch": 1.927475307616626, + "grad_norm": 1.1582444906234741, + "learning_rate": 3.30867463889023e-07, + "loss": 0.9237, + "step": 301700 + }, + { + "epoch": 1.9275391947663647, + "grad_norm": 0.918725311756134, + "learning_rate": 3.302914244243338e-07, + "loss": 0.9144, + "step": 301710 + }, + { + "epoch": 1.9276030819161034, + "grad_norm": 1.1346672773361206, + "learning_rate": 3.297158851765791e-07, + "loss": 1.0005, + "step": 301720 + }, + { + "epoch": 1.9276669690658421, + "grad_norm": 1.0042999982833862, + "learning_rate": 3.291408461515599e-07, + "loss": 0.6809, + "step": 301730 + }, + { + "epoch": 1.9277308562155808, + "grad_norm": 1.322226881980896, + "learning_rate": 3.2856630735506047e-07, + "loss": 1.0392, + "step": 301740 + }, + { + "epoch": 1.9277947433653195, + 
"grad_norm": 0.7337998747825623, + "learning_rate": 3.2799226879287047e-07, + "loss": 0.7134, + "step": 301750 + }, + { + "epoch": 1.9278586305150582, + "grad_norm": 0.9696584343910217, + "learning_rate": 3.274187304707743e-07, + "loss": 0.9432, + "step": 301760 + }, + { + "epoch": 1.927922517664797, + "grad_norm": 1.2278753519058228, + "learning_rate": 3.268456923945451e-07, + "loss": 0.9087, + "step": 301770 + }, + { + "epoch": 1.9279864048145356, + "grad_norm": 1.1169925928115845, + "learning_rate": 3.2627315456995045e-07, + "loss": 0.8745, + "step": 301780 + }, + { + "epoch": 1.9280502919642744, + "grad_norm": 0.8843469023704529, + "learning_rate": 3.25701117002758e-07, + "loss": 0.7129, + "step": 301790 + }, + { + "epoch": 1.928114179114013, + "grad_norm": 1.417089581489563, + "learning_rate": 3.251295796987297e-07, + "loss": 0.8296, + "step": 301800 + }, + { + "epoch": 1.9281780662637518, + "grad_norm": 1.1206992864608765, + "learning_rate": 3.245585426636222e-07, + "loss": 0.7617, + "step": 301810 + }, + { + "epoch": 1.9282419534134903, + "grad_norm": 0.918014645576477, + "learning_rate": 3.2398800590318636e-07, + "loss": 0.76, + "step": 301820 + }, + { + "epoch": 1.9283058405632292, + "grad_norm": 0.888495147228241, + "learning_rate": 3.234179694231676e-07, + "loss": 0.8968, + "step": 301830 + }, + { + "epoch": 1.9283697277129677, + "grad_norm": 0.9238024950027466, + "learning_rate": 3.228484332293058e-07, + "loss": 0.7197, + "step": 301840 + }, + { + "epoch": 1.9284336148627066, + "grad_norm": 0.8333662152290344, + "learning_rate": 3.2227939732733523e-07, + "loss": 1.0649, + "step": 301850 + }, + { + "epoch": 1.928497502012445, + "grad_norm": 0.6979062557220459, + "learning_rate": 3.2171086172299025e-07, + "loss": 0.7092, + "step": 301860 + }, + { + "epoch": 1.928561389162184, + "grad_norm": 1.1566264629364014, + "learning_rate": 3.21142826421994e-07, + "loss": 0.9163, + "step": 301870 + }, + { + "epoch": 1.9286252763119225, + "grad_norm": 
1.1237833499908447, + "learning_rate": 3.205752914300697e-07, + "loss": 0.9294, + "step": 301880 + }, + { + "epoch": 1.9286891634616614, + "grad_norm": 1.5225160121917725, + "learning_rate": 3.2000825675292387e-07, + "loss": 0.8519, + "step": 301890 + }, + { + "epoch": 1.9287530506114, + "grad_norm": 0.8152609467506409, + "learning_rate": 3.194417223962853e-07, + "loss": 0.8797, + "step": 301900 + }, + { + "epoch": 1.9288169377611388, + "grad_norm": 1.680662751197815, + "learning_rate": 3.188756883658384e-07, + "loss": 0.9777, + "step": 301910 + }, + { + "epoch": 1.9288808249108773, + "grad_norm": 0.8997313380241394, + "learning_rate": 3.1831015466730063e-07, + "loss": 0.7692, + "step": 301920 + }, + { + "epoch": 1.9289447120606162, + "grad_norm": 0.9319784641265869, + "learning_rate": 3.177451213063565e-07, + "loss": 0.79, + "step": 301930 + }, + { + "epoch": 1.9290085992103547, + "grad_norm": 1.0063719749450684, + "learning_rate": 3.1718058828870133e-07, + "loss": 1.0246, + "step": 301940 + }, + { + "epoch": 1.9290724863600937, + "grad_norm": 1.1086889505386353, + "learning_rate": 3.166165556200196e-07, + "loss": 0.9792, + "step": 301950 + }, + { + "epoch": 1.9291363735098321, + "grad_norm": 0.9177554845809937, + "learning_rate": 3.160530233059955e-07, + "loss": 0.6821, + "step": 301960 + }, + { + "epoch": 1.929200260659571, + "grad_norm": 0.5704813003540039, + "learning_rate": 3.1548999135229127e-07, + "loss": 0.8721, + "step": 301970 + }, + { + "epoch": 1.9292641478093095, + "grad_norm": 0.9116873741149902, + "learning_rate": 3.149836904067338e-07, + "loss": 1.1118, + "step": 301980 + }, + { + "epoch": 1.9293280349590485, + "grad_norm": 1.0153589248657227, + "learning_rate": 3.144216091532759e-07, + "loss": 1.012, + "step": 301990 + }, + { + "epoch": 1.929391922108787, + "grad_norm": 1.10478937625885, + "learning_rate": 3.1386002827657466e-07, + "loss": 0.933, + "step": 302000 + }, + { + "epoch": 1.929455809258526, + "grad_norm": 0.9765149354934692, + 
"learning_rate": 3.132989477822923e-07, + "loss": 0.7768, + "step": 302010 + }, + { + "epoch": 1.9295196964082644, + "grad_norm": 1.0319316387176514, + "learning_rate": 3.127383676760687e-07, + "loss": 0.7433, + "step": 302020 + }, + { + "epoch": 1.9295835835580033, + "grad_norm": 0.9615095257759094, + "learning_rate": 3.121782879635604e-07, + "loss": 0.927, + "step": 302030 + }, + { + "epoch": 1.9296474707077418, + "grad_norm": 0.7776680588722229, + "learning_rate": 3.116187086504019e-07, + "loss": 0.9391, + "step": 302040 + }, + { + "epoch": 1.9297113578574805, + "grad_norm": 0.8813909888267517, + "learning_rate": 3.1105962974222745e-07, + "loss": 0.9085, + "step": 302050 + }, + { + "epoch": 1.9297752450072192, + "grad_norm": 0.9444760084152222, + "learning_rate": 3.1050105124467154e-07, + "loss": 1.0147, + "step": 302060 + }, + { + "epoch": 1.929839132156958, + "grad_norm": 1.1206434965133667, + "learning_rate": 3.099429731633574e-07, + "loss": 0.7016, + "step": 302070 + }, + { + "epoch": 1.9299030193066966, + "grad_norm": 1.0431171655654907, + "learning_rate": 3.093853955039028e-07, + "loss": 0.8739, + "step": 302080 + }, + { + "epoch": 1.9299669064564353, + "grad_norm": 1.2197887897491455, + "learning_rate": 3.088283182719309e-07, + "loss": 0.7848, + "step": 302090 + }, + { + "epoch": 1.930030793606174, + "grad_norm": 1.1709030866622925, + "learning_rate": 3.082717414730429e-07, + "loss": 0.6757, + "step": 302100 + }, + { + "epoch": 1.9300946807559127, + "grad_norm": 1.1273075342178345, + "learning_rate": 3.07715665112851e-07, + "loss": 1.007, + "step": 302110 + }, + { + "epoch": 1.9301585679056514, + "grad_norm": 1.6367871761322021, + "learning_rate": 3.0716008919695063e-07, + "loss": 0.7174, + "step": 302120 + }, + { + "epoch": 1.9302224550553901, + "grad_norm": 2.8941264152526855, + "learning_rate": 3.066050137309373e-07, + "loss": 0.8933, + "step": 302130 + }, + { + "epoch": 1.9302863422051288, + "grad_norm": 0.96756911277771, + "learning_rate": 
3.060504387204066e-07, + "loss": 0.8054, + "step": 302140 + }, + { + "epoch": 1.9303502293548676, + "grad_norm": 0.7502398490905762, + "learning_rate": 3.054963641709374e-07, + "loss": 0.9063, + "step": 302150 + }, + { + "epoch": 1.9304141165046063, + "grad_norm": 1.5427123308181763, + "learning_rate": 3.0494279008810856e-07, + "loss": 1.2005, + "step": 302160 + }, + { + "epoch": 1.930478003654345, + "grad_norm": 0.8241047859191895, + "learning_rate": 3.043897164774989e-07, + "loss": 0.8046, + "step": 302170 + }, + { + "epoch": 1.9305418908040837, + "grad_norm": 1.026366114616394, + "learning_rate": 3.038371433446763e-07, + "loss": 0.7106, + "step": 302180 + }, + { + "epoch": 1.9306057779538224, + "grad_norm": 0.9416015148162842, + "learning_rate": 3.0328507069521396e-07, + "loss": 0.9412, + "step": 302190 + }, + { + "epoch": 1.930669665103561, + "grad_norm": 1.4748917818069458, + "learning_rate": 3.027334985346575e-07, + "loss": 0.7928, + "step": 302200 + }, + { + "epoch": 1.9307335522532998, + "grad_norm": 0.7493869066238403, + "learning_rate": 3.021824268685691e-07, + "loss": 1.0479, + "step": 302210 + }, + { + "epoch": 1.9307974394030385, + "grad_norm": 1.2239211797714233, + "learning_rate": 3.016318557025055e-07, + "loss": 0.6985, + "step": 302220 + }, + { + "epoch": 1.9308613265527772, + "grad_norm": 1.395402193069458, + "learning_rate": 3.010817850419956e-07, + "loss": 0.8747, + "step": 302230 + }, + { + "epoch": 1.930925213702516, + "grad_norm": 1.5344443321228027, + "learning_rate": 3.0053221489259595e-07, + "loss": 0.8547, + "step": 302240 + }, + { + "epoch": 1.9309891008522546, + "grad_norm": 0.7854205965995789, + "learning_rate": 2.9998314525983005e-07, + "loss": 0.6006, + "step": 302250 + }, + { + "epoch": 1.9310529880019933, + "grad_norm": 0.9757431149482727, + "learning_rate": 2.994345761492268e-07, + "loss": 0.8716, + "step": 302260 + }, + { + "epoch": 1.931116875151732, + "grad_norm": 0.7694618701934814, + "learning_rate": 2.9888650756632054e-07, + 
"loss": 0.8134, + "step": 302270 + }, + { + "epoch": 1.9311807623014707, + "grad_norm": 1.0986371040344238, + "learning_rate": 2.9833893951661807e-07, + "loss": 0.8574, + "step": 302280 + }, + { + "epoch": 1.9312446494512094, + "grad_norm": 1.9931775331497192, + "learning_rate": 2.9779187200564276e-07, + "loss": 0.8072, + "step": 302290 + }, + { + "epoch": 1.9313085366009481, + "grad_norm": 1.187432885169983, + "learning_rate": 2.9724530503890677e-07, + "loss": 0.9356, + "step": 302300 + }, + { + "epoch": 1.9313724237506866, + "grad_norm": 0.7285720705986023, + "learning_rate": 2.966992386219059e-07, + "loss": 0.716, + "step": 302310 + }, + { + "epoch": 1.9314363109004256, + "grad_norm": 0.7605849504470825, + "learning_rate": 2.9615367276014107e-07, + "loss": 0.7563, + "step": 302320 + }, + { + "epoch": 1.931500198050164, + "grad_norm": 1.7555122375488281, + "learning_rate": 2.956086074591136e-07, + "loss": 0.8992, + "step": 302330 + }, + { + "epoch": 1.931564085199903, + "grad_norm": 1.09848952293396, + "learning_rate": 2.950640427243023e-07, + "loss": 0.9077, + "step": 302340 + }, + { + "epoch": 1.9316279723496415, + "grad_norm": 0.9788870811462402, + "learning_rate": 2.9451997856120294e-07, + "loss": 0.9107, + "step": 302350 + }, + { + "epoch": 1.9316918594993804, + "grad_norm": 1.018257737159729, + "learning_rate": 2.9397641497528327e-07, + "loss": 0.9396, + "step": 302360 + }, + { + "epoch": 1.9317557466491189, + "grad_norm": 0.9232105612754822, + "learning_rate": 2.9343335197202783e-07, + "loss": 0.759, + "step": 302370 + }, + { + "epoch": 1.9318196337988578, + "grad_norm": 0.5679325461387634, + "learning_rate": 2.928907895568989e-07, + "loss": 0.8857, + "step": 302380 + }, + { + "epoch": 1.9318835209485963, + "grad_norm": 1.78494393825531, + "learning_rate": 2.9234872773535873e-07, + "loss": 0.7721, + "step": 302390 + }, + { + "epoch": 1.9319474080983352, + "grad_norm": 0.668113112449646, + "learning_rate": 2.9180716651287523e-07, + "loss": 0.8469, + "step": 
302400 + }, + { + "epoch": 1.9320112952480737, + "grad_norm": 1.0490716695785522, + "learning_rate": 2.91266105894894e-07, + "loss": 0.9017, + "step": 302410 + }, + { + "epoch": 1.9320751823978126, + "grad_norm": 0.9475468397140503, + "learning_rate": 2.907255458868663e-07, + "loss": 0.8416, + "step": 302420 + }, + { + "epoch": 1.932139069547551, + "grad_norm": 1.1354217529296875, + "learning_rate": 2.9018548649424326e-07, + "loss": 0.9322, + "step": 302430 + }, + { + "epoch": 1.93220295669729, + "grad_norm": 2.2949318885803223, + "learning_rate": 2.8964592772245393e-07, + "loss": 0.8608, + "step": 302440 + }, + { + "epoch": 1.9322668438470285, + "grad_norm": 0.9771470427513123, + "learning_rate": 2.8910686957693277e-07, + "loss": 0.7114, + "step": 302450 + }, + { + "epoch": 1.9323307309967674, + "grad_norm": 0.8451032638549805, + "learning_rate": 2.885683120631144e-07, + "loss": 0.7742, + "step": 302460 + }, + { + "epoch": 1.932394618146506, + "grad_norm": 0.6596810221672058, + "learning_rate": 2.8803025518642225e-07, + "loss": 0.7939, + "step": 302470 + }, + { + "epoch": 1.9324585052962449, + "grad_norm": 0.5329988598823547, + "learning_rate": 2.874926989522686e-07, + "loss": 1.0137, + "step": 302480 + }, + { + "epoch": 1.9325223924459833, + "grad_norm": 0.6956501007080078, + "learning_rate": 2.869556433660714e-07, + "loss": 1.1259, + "step": 302490 + }, + { + "epoch": 1.9325862795957223, + "grad_norm": 0.7437456846237183, + "learning_rate": 2.864190884332374e-07, + "loss": 0.9319, + "step": 302500 + }, + { + "epoch": 1.9326501667454608, + "grad_norm": 0.9625310301780701, + "learning_rate": 2.858830341591734e-07, + "loss": 0.9455, + "step": 302510 + }, + { + "epoch": 1.9327140538951997, + "grad_norm": 0.8072330951690674, + "learning_rate": 2.8534748054927505e-07, + "loss": 0.8667, + "step": 302520 + }, + { + "epoch": 1.9327779410449382, + "grad_norm": 1.026102066040039, + "learning_rate": 2.848124276089381e-07, + "loss": 0.8568, + "step": 302530 + }, + { + 
"epoch": 1.9328418281946769, + "grad_norm": 0.8609016537666321, + "learning_rate": 2.842778753435471e-07, + "loss": 1.0871, + "step": 302540 + }, + { + "epoch": 1.9329057153444156, + "grad_norm": 0.9094386696815491, + "learning_rate": 2.837438237584922e-07, + "loss": 0.9309, + "step": 302550 + }, + { + "epoch": 1.9329696024941543, + "grad_norm": 1.1046336889266968, + "learning_rate": 2.8321027285914683e-07, + "loss": 0.6946, + "step": 302560 + }, + { + "epoch": 1.933033489643893, + "grad_norm": 1.0469043254852295, + "learning_rate": 2.826772226508845e-07, + "loss": 0.9, + "step": 302570 + }, + { + "epoch": 1.9330973767936317, + "grad_norm": 0.8629121780395508, + "learning_rate": 2.821446731390731e-07, + "loss": 0.7624, + "step": 302580 + }, + { + "epoch": 1.9331612639433704, + "grad_norm": 0.920183002948761, + "learning_rate": 2.816126243290751e-07, + "loss": 0.8727, + "step": 302590 + }, + { + "epoch": 1.933225151093109, + "grad_norm": 1.0150858163833618, + "learning_rate": 2.810810762262528e-07, + "loss": 0.9346, + "step": 302600 + }, + { + "epoch": 1.9332890382428478, + "grad_norm": 0.9224951267242432, + "learning_rate": 2.8055002883595747e-07, + "loss": 1.0581, + "step": 302610 + }, + { + "epoch": 1.9333529253925865, + "grad_norm": 0.8653743863105774, + "learning_rate": 2.800194821635405e-07, + "loss": 0.8454, + "step": 302620 + }, + { + "epoch": 1.9334168125423252, + "grad_norm": 0.8796379566192627, + "learning_rate": 2.7948943621433633e-07, + "loss": 0.7263, + "step": 302630 + }, + { + "epoch": 1.933480699692064, + "grad_norm": 0.786419153213501, + "learning_rate": 2.7895989099368524e-07, + "loss": 0.8787, + "step": 302640 + }, + { + "epoch": 1.9335445868418026, + "grad_norm": 0.9109292030334473, + "learning_rate": 2.784308465069274e-07, + "loss": 1.0634, + "step": 302650 + }, + { + "epoch": 1.9336084739915413, + "grad_norm": 1.6854918003082275, + "learning_rate": 2.779023027593863e-07, + "loss": 1.245, + "step": 302660 + }, + { + "epoch": 1.93367236114128, + 
"grad_norm": 3.914057970046997, + "learning_rate": 2.7737425975638554e-07, + "loss": 0.9142, + "step": 302670 + }, + { + "epoch": 1.9337362482910188, + "grad_norm": 0.8332629799842834, + "learning_rate": 2.768467175032374e-07, + "loss": 0.9323, + "step": 302680 + }, + { + "epoch": 1.9338001354407575, + "grad_norm": 1.001237392425537, + "learning_rate": 2.7631967600526555e-07, + "loss": 0.9254, + "step": 302690 + }, + { + "epoch": 1.9338640225904962, + "grad_norm": 0.7326177358627319, + "learning_rate": 2.7579313526776564e-07, + "loss": 0.9705, + "step": 302700 + }, + { + "epoch": 1.9339279097402349, + "grad_norm": 0.7289140820503235, + "learning_rate": 2.752670952960501e-07, + "loss": 0.9911, + "step": 302710 + }, + { + "epoch": 1.9339917968899736, + "grad_norm": 1.3712490797042847, + "learning_rate": 2.7474155609540917e-07, + "loss": 0.7515, + "step": 302720 + }, + { + "epoch": 1.9340556840397123, + "grad_norm": 1.1518125534057617, + "learning_rate": 2.742165176711442e-07, + "loss": 0.9325, + "step": 302730 + }, + { + "epoch": 1.934119571189451, + "grad_norm": 0.7685813307762146, + "learning_rate": 2.7369198002853426e-07, + "loss": 0.9072, + "step": 302740 + }, + { + "epoch": 1.9341834583391897, + "grad_norm": 0.7780716419219971, + "learning_rate": 2.7316794317286953e-07, + "loss": 0.7143, + "step": 302750 + }, + { + "epoch": 1.9342473454889284, + "grad_norm": 0.9062319397926331, + "learning_rate": 2.7264440710941806e-07, + "loss": 0.6679, + "step": 302760 + }, + { + "epoch": 1.934311232638667, + "grad_norm": 1.183876395225525, + "learning_rate": 2.7212137184346453e-07, + "loss": 0.8766, + "step": 302770 + }, + { + "epoch": 1.9343751197884056, + "grad_norm": 1.1930270195007324, + "learning_rate": 2.715988373802658e-07, + "loss": 0.6851, + "step": 302780 + }, + { + "epoch": 1.9344390069381445, + "grad_norm": 1.5688190460205078, + "learning_rate": 2.7107680372508436e-07, + "loss": 1.268, + "step": 302790 + }, + { + "epoch": 1.934502894087883, + "grad_norm": 
0.7877197861671448, + "learning_rate": 2.705552708831827e-07, + "loss": 0.7911, + "step": 302800 + }, + { + "epoch": 1.934566781237622, + "grad_norm": 1.4768949747085571, + "learning_rate": 2.700342388598176e-07, + "loss": 0.8737, + "step": 302810 + }, + { + "epoch": 1.9346306683873604, + "grad_norm": 0.8407143950462341, + "learning_rate": 2.695137076602239e-07, + "loss": 0.7297, + "step": 302820 + }, + { + "epoch": 1.9346945555370993, + "grad_norm": 0.7811848521232605, + "learning_rate": 2.6899367728965284e-07, + "loss": 0.8857, + "step": 302830 + }, + { + "epoch": 1.9347584426868378, + "grad_norm": 1.9201631546020508, + "learning_rate": 2.684741477533392e-07, + "loss": 1.0058, + "step": 302840 + }, + { + "epoch": 1.9348223298365768, + "grad_norm": 0.9804142713546753, + "learning_rate": 2.679551190565122e-07, + "loss": 0.9072, + "step": 302850 + }, + { + "epoch": 1.9348862169863152, + "grad_norm": 0.8214455246925354, + "learning_rate": 2.6743659120440635e-07, + "loss": 0.7655, + "step": 302860 + }, + { + "epoch": 1.9349501041360542, + "grad_norm": 0.6904785633087158, + "learning_rate": 2.6691856420223424e-07, + "loss": 0.9752, + "step": 302870 + }, + { + "epoch": 1.9350139912857927, + "grad_norm": 0.8468638062477112, + "learning_rate": 2.664010380552195e-07, + "loss": 0.8078, + "step": 302880 + }, + { + "epoch": 1.9350778784355316, + "grad_norm": 0.8787335157394409, + "learning_rate": 2.658840127685691e-07, + "loss": 1.0969, + "step": 302890 + }, + { + "epoch": 1.93514176558527, + "grad_norm": 0.9245291352272034, + "learning_rate": 2.6536748834750103e-07, + "loss": 0.8035, + "step": 302900 + }, + { + "epoch": 1.935205652735009, + "grad_norm": 1.0244308710098267, + "learning_rate": 2.648514647972e-07, + "loss": 0.7529, + "step": 302910 + }, + { + "epoch": 1.9352695398847475, + "grad_norm": 0.8991485238075256, + "learning_rate": 2.6433594212287303e-07, + "loss": 0.8502, + "step": 302920 + }, + { + "epoch": 1.9353334270344864, + "grad_norm": 0.8746139407157898, + 
"learning_rate": 2.6382092032971593e-07, + "loss": 0.8885, + "step": 302930 + }, + { + "epoch": 1.935397314184225, + "grad_norm": 0.6522064805030823, + "learning_rate": 2.633063994229079e-07, + "loss": 0.9982, + "step": 302940 + }, + { + "epoch": 1.9354612013339638, + "grad_norm": 0.745275616645813, + "learning_rate": 2.6279237940762813e-07, + "loss": 1.1341, + "step": 302950 + }, + { + "epoch": 1.9355250884837023, + "grad_norm": 0.8822062015533447, + "learning_rate": 2.6227886028906136e-07, + "loss": 1.1846, + "step": 302960 + }, + { + "epoch": 1.9355889756334412, + "grad_norm": 1.1190886497497559, + "learning_rate": 2.6176584207237563e-07, + "loss": 0.9702, + "step": 302970 + }, + { + "epoch": 1.9356528627831797, + "grad_norm": 0.9726095199584961, + "learning_rate": 2.612533247627391e-07, + "loss": 0.8938, + "step": 302980 + }, + { + "epoch": 1.9357167499329186, + "grad_norm": 1.3287533521652222, + "learning_rate": 2.6074130836530876e-07, + "loss": 1.112, + "step": 302990 + }, + { + "epoch": 1.9357806370826571, + "grad_norm": 1.12720787525177, + "learning_rate": 2.602297928852471e-07, + "loss": 0.7258, + "step": 303000 + }, + { + "epoch": 1.935844524232396, + "grad_norm": 1.0907084941864014, + "learning_rate": 2.5971877832769996e-07, + "loss": 1.0114, + "step": 303010 + }, + { + "epoch": 1.9359084113821345, + "grad_norm": 1.0238360166549683, + "learning_rate": 2.592082646978189e-07, + "loss": 0.7974, + "step": 303020 + }, + { + "epoch": 1.9359722985318732, + "grad_norm": 1.0993527173995972, + "learning_rate": 2.5869825200073863e-07, + "loss": 0.8538, + "step": 303030 + }, + { + "epoch": 1.936036185681612, + "grad_norm": 1.4540520906448364, + "learning_rate": 2.5818874024159947e-07, + "loss": 0.8249, + "step": 303040 + }, + { + "epoch": 1.9361000728313507, + "grad_norm": 0.9106485843658447, + "learning_rate": 2.576797294255362e-07, + "loss": 1.0292, + "step": 303050 + }, + { + "epoch": 1.9361639599810894, + "grad_norm": 0.9970775246620178, + "learning_rate": 
2.57171219557667e-07, + "loss": 1.1452, + "step": 303060 + }, + { + "epoch": 1.936227847130828, + "grad_norm": 0.9550371170043945, + "learning_rate": 2.5666321064312106e-07, + "loss": 0.7977, + "step": 303070 + }, + { + "epoch": 1.9362917342805668, + "grad_norm": 0.8058514595031738, + "learning_rate": 2.561557026870054e-07, + "loss": 0.7658, + "step": 303080 + }, + { + "epoch": 1.9363556214303055, + "grad_norm": 0.6585938334465027, + "learning_rate": 2.5564869569444374e-07, + "loss": 0.8798, + "step": 303090 + }, + { + "epoch": 1.9364195085800442, + "grad_norm": 1.362809181213379, + "learning_rate": 2.551421896705319e-07, + "loss": 0.8086, + "step": 303100 + }, + { + "epoch": 1.936483395729783, + "grad_norm": 0.8295511603355408, + "learning_rate": 2.546361846203715e-07, + "loss": 0.9155, + "step": 303110 + }, + { + "epoch": 1.9365472828795216, + "grad_norm": 0.8398029208183289, + "learning_rate": 2.5413068054906395e-07, + "loss": 1.1177, + "step": 303120 + }, + { + "epoch": 1.9366111700292603, + "grad_norm": 1.4834344387054443, + "learning_rate": 2.5362567746169407e-07, + "loss": 0.9247, + "step": 303130 + }, + { + "epoch": 1.936675057178999, + "grad_norm": 1.2295825481414795, + "learning_rate": 2.5312117536334665e-07, + "loss": 0.6938, + "step": 303140 + }, + { + "epoch": 1.9367389443287377, + "grad_norm": 4.947011947631836, + "learning_rate": 2.5261717425911216e-07, + "loss": 0.77, + "step": 303150 + }, + { + "epoch": 1.9368028314784764, + "grad_norm": 1.1070027351379395, + "learning_rate": 2.521136741540586e-07, + "loss": 0.8208, + "step": 303160 + }, + { + "epoch": 1.9368667186282151, + "grad_norm": 0.5779455900192261, + "learning_rate": 2.5161067505325987e-07, + "loss": 1.015, + "step": 303170 + }, + { + "epoch": 1.9369306057779538, + "grad_norm": 2.129777669906616, + "learning_rate": 2.5110817696177847e-07, + "loss": 0.7702, + "step": 303180 + }, + { + "epoch": 1.9369944929276925, + "grad_norm": 1.1007846593856812, + "learning_rate": 2.5060617988467705e-07, + 
"loss": 0.8984, + "step": 303190 + }, + { + "epoch": 1.9370583800774313, + "grad_norm": 2.3898720741271973, + "learning_rate": 2.501046838270127e-07, + "loss": 0.9611, + "step": 303200 + }, + { + "epoch": 1.93712226722717, + "grad_norm": 0.7541255354881287, + "learning_rate": 2.4960368879383133e-07, + "loss": 0.881, + "step": 303210 + }, + { + "epoch": 1.9371861543769087, + "grad_norm": 0.8008748888969421, + "learning_rate": 2.4910319479017895e-07, + "loss": 0.9516, + "step": 303220 + }, + { + "epoch": 1.9372500415266474, + "grad_norm": 1.0474908351898193, + "learning_rate": 2.486032018211015e-07, + "loss": 0.8205, + "step": 303230 + }, + { + "epoch": 1.937313928676386, + "grad_norm": 0.9243830442428589, + "learning_rate": 2.481037098916339e-07, + "loss": 1.1442, + "step": 303240 + }, + { + "epoch": 1.9373778158261248, + "grad_norm": 1.007006049156189, + "learning_rate": 2.4760471900679425e-07, + "loss": 0.9138, + "step": 303250 + }, + { + "epoch": 1.9374417029758635, + "grad_norm": 1.0164581537246704, + "learning_rate": 2.471062291716231e-07, + "loss": 0.6679, + "step": 303260 + }, + { + "epoch": 1.937505590125602, + "grad_norm": 0.7017830014228821, + "learning_rate": 2.4660824039113295e-07, + "loss": 0.6308, + "step": 303270 + }, + { + "epoch": 1.937569477275341, + "grad_norm": 0.7931995987892151, + "learning_rate": 2.4611075267034764e-07, + "loss": 0.8849, + "step": 303280 + }, + { + "epoch": 1.9376333644250794, + "grad_norm": 0.8192692995071411, + "learning_rate": 2.4561376601426323e-07, + "loss": 0.9391, + "step": 303290 + }, + { + "epoch": 1.9376972515748183, + "grad_norm": 0.5092934966087341, + "learning_rate": 2.451172804278923e-07, + "loss": 0.6376, + "step": 303300 + }, + { + "epoch": 1.9377611387245568, + "grad_norm": 0.9124179482460022, + "learning_rate": 2.446212959162364e-07, + "loss": 1.1539, + "step": 303310 + }, + { + "epoch": 1.9378250258742957, + "grad_norm": 0.9786889553070068, + "learning_rate": 2.44125812484286e-07, + "loss": 0.9836, + "step": 
303320 + }, + { + "epoch": 1.9378889130240342, + "grad_norm": 1.0967962741851807, + "learning_rate": 2.4363083013703157e-07, + "loss": 0.9354, + "step": 303330 + }, + { + "epoch": 1.9379528001737731, + "grad_norm": 0.8982738852500916, + "learning_rate": 2.431363488794691e-07, + "loss": 0.9411, + "step": 303340 + }, + { + "epoch": 1.9380166873235116, + "grad_norm": 1.1276098489761353, + "learning_rate": 2.426423687165613e-07, + "loss": 0.8338, + "step": 303350 + }, + { + "epoch": 1.9380805744732506, + "grad_norm": 0.9050993323326111, + "learning_rate": 2.421488896532931e-07, + "loss": 1.0627, + "step": 303360 + }, + { + "epoch": 1.938144461622989, + "grad_norm": 1.407092809677124, + "learning_rate": 2.4165591169463266e-07, + "loss": 1.0689, + "step": 303370 + }, + { + "epoch": 1.938208348772728, + "grad_norm": 0.9812001585960388, + "learning_rate": 2.4116343484554274e-07, + "loss": 0.7703, + "step": 303380 + }, + { + "epoch": 1.9382722359224664, + "grad_norm": 0.9568415880203247, + "learning_rate": 2.4067145911099154e-07, + "loss": 0.9541, + "step": 303390 + }, + { + "epoch": 1.9383361230722054, + "grad_norm": 0.8810369372367859, + "learning_rate": 2.401799844959196e-07, + "loss": 0.9046, + "step": 303400 + }, + { + "epoch": 1.9384000102219439, + "grad_norm": 0.6494470834732056, + "learning_rate": 2.3968901100528407e-07, + "loss": 0.7342, + "step": 303410 + }, + { + "epoch": 1.9384638973716828, + "grad_norm": 1.4283164739608765, + "learning_rate": 2.39198538644031e-07, + "loss": 1.0041, + "step": 303420 + }, + { + "epoch": 1.9385277845214213, + "grad_norm": 0.9517856240272522, + "learning_rate": 2.3870856741709526e-07, + "loss": 0.6312, + "step": 303430 + }, + { + "epoch": 1.9385916716711602, + "grad_norm": 0.7215419411659241, + "learning_rate": 2.382190973294174e-07, + "loss": 0.85, + "step": 303440 + }, + { + "epoch": 1.9386555588208987, + "grad_norm": 1.9820024967193604, + "learning_rate": 2.3773012838592125e-07, + "loss": 0.7948, + "step": 303450 + }, + { + 
"epoch": 1.9387194459706376, + "grad_norm": 0.7442840337753296, + "learning_rate": 2.3724166059153063e-07, + "loss": 0.8042, + "step": 303460 + }, + { + "epoch": 1.938783333120376, + "grad_norm": 1.2596721649169922, + "learning_rate": 2.3675369395117496e-07, + "loss": 0.548, + "step": 303470 + }, + { + "epoch": 1.938847220270115, + "grad_norm": 1.1596099138259888, + "learning_rate": 2.3626622846975588e-07, + "loss": 0.7801, + "step": 303480 + }, + { + "epoch": 1.9389111074198535, + "grad_norm": 0.6521729826927185, + "learning_rate": 2.357792641521861e-07, + "loss": 0.7191, + "step": 303490 + }, + { + "epoch": 1.9389749945695924, + "grad_norm": 1.0326049327850342, + "learning_rate": 2.352928010033728e-07, + "loss": 0.8572, + "step": 303500 + }, + { + "epoch": 1.939038881719331, + "grad_norm": 0.9028391242027283, + "learning_rate": 2.3480683902821765e-07, + "loss": 0.8388, + "step": 303510 + }, + { + "epoch": 1.9391027688690696, + "grad_norm": 1.4557136297225952, + "learning_rate": 2.3432137823160561e-07, + "loss": 1.0357, + "step": 303520 + }, + { + "epoch": 1.9391666560188083, + "grad_norm": 0.7721455693244934, + "learning_rate": 2.3383641861843274e-07, + "loss": 0.8991, + "step": 303530 + }, + { + "epoch": 1.939230543168547, + "grad_norm": 1.0463354587554932, + "learning_rate": 2.3335196019357853e-07, + "loss": 0.8772, + "step": 303540 + }, + { + "epoch": 1.9392944303182857, + "grad_norm": 0.8078722357749939, + "learning_rate": 2.328680029619279e-07, + "loss": 0.9063, + "step": 303550 + }, + { + "epoch": 1.9393583174680245, + "grad_norm": 0.9805110096931458, + "learning_rate": 2.3238454692834922e-07, + "loss": 0.8548, + "step": 303560 + }, + { + "epoch": 1.9394222046177632, + "grad_norm": 1.0593864917755127, + "learning_rate": 2.319015920977108e-07, + "loss": 0.9081, + "step": 303570 + }, + { + "epoch": 1.9394860917675019, + "grad_norm": 0.895837664604187, + "learning_rate": 2.3141913847488094e-07, + "loss": 0.6311, + "step": 303580 + }, + { + "epoch": 
1.9395499789172406, + "grad_norm": 0.6257737278938293, + "learning_rate": 2.3093718606471693e-07, + "loss": 0.8553, + "step": 303590 + }, + { + "epoch": 1.9396138660669793, + "grad_norm": 1.059502363204956, + "learning_rate": 2.3045573487207595e-07, + "loss": 1.2677, + "step": 303600 + }, + { + "epoch": 1.939677753216718, + "grad_norm": 0.9159273505210876, + "learning_rate": 2.2997478490179302e-07, + "loss": 0.7429, + "step": 303610 + }, + { + "epoch": 1.9397416403664567, + "grad_norm": 0.8961524367332458, + "learning_rate": 2.2949433615873096e-07, + "loss": 0.7627, + "step": 303620 + }, + { + "epoch": 1.9398055275161954, + "grad_norm": 0.7950904965400696, + "learning_rate": 2.2901438864771362e-07, + "loss": 0.8802, + "step": 303630 + }, + { + "epoch": 1.939869414665934, + "grad_norm": 1.0109800100326538, + "learning_rate": 2.2853494237358163e-07, + "loss": 0.9184, + "step": 303640 + }, + { + "epoch": 1.9399333018156728, + "grad_norm": 0.8042694926261902, + "learning_rate": 2.280559973411589e-07, + "loss": 0.8039, + "step": 303650 + }, + { + "epoch": 1.9399971889654115, + "grad_norm": 1.3180112838745117, + "learning_rate": 2.2757755355526932e-07, + "loss": 0.8882, + "step": 303660 + }, + { + "epoch": 1.9400610761151502, + "grad_norm": 1.3387608528137207, + "learning_rate": 2.2709961102073686e-07, + "loss": 0.9035, + "step": 303670 + }, + { + "epoch": 1.940124963264889, + "grad_norm": 0.9645832180976868, + "learning_rate": 2.2662216974236872e-07, + "loss": 0.737, + "step": 303680 + }, + { + "epoch": 1.9401888504146276, + "grad_norm": 1.5381606817245483, + "learning_rate": 2.2614522972497775e-07, + "loss": 0.9132, + "step": 303690 + }, + { + "epoch": 1.9402527375643663, + "grad_norm": 1.7082456350326538, + "learning_rate": 2.2566879097336567e-07, + "loss": 1.0725, + "step": 303700 + }, + { + "epoch": 1.940316624714105, + "grad_norm": 1.0522699356079102, + "learning_rate": 2.251928534923231e-07, + "loss": 1.2124, + "step": 303710 + }, + { + "epoch": 
1.9403805118638437, + "grad_norm": 2.351830244064331, + "learning_rate": 2.247174172866573e-07, + "loss": 0.8393, + "step": 303720 + }, + { + "epoch": 1.9404443990135825, + "grad_norm": 0.865699291229248, + "learning_rate": 2.242424823611422e-07, + "loss": 0.8724, + "step": 303730 + }, + { + "epoch": 1.9405082861633212, + "grad_norm": 0.6963575482368469, + "learning_rate": 2.23768048720574e-07, + "loss": 0.6325, + "step": 303740 + }, + { + "epoch": 1.9405721733130599, + "grad_norm": 0.6768919825553894, + "learning_rate": 2.2329411636972108e-07, + "loss": 0.8137, + "step": 303750 + }, + { + "epoch": 1.9406360604627984, + "grad_norm": 1.1644431352615356, + "learning_rate": 2.2282068531335743e-07, + "loss": 0.8637, + "step": 303760 + }, + { + "epoch": 1.9406999476125373, + "grad_norm": 0.8119156360626221, + "learning_rate": 2.2234775555625698e-07, + "loss": 0.8328, + "step": 303770 + }, + { + "epoch": 1.9407638347622758, + "grad_norm": 1.0110023021697998, + "learning_rate": 2.2187532710317705e-07, + "loss": 0.9217, + "step": 303780 + }, + { + "epoch": 1.9408277219120147, + "grad_norm": 0.6955659985542297, + "learning_rate": 2.2140339995887494e-07, + "loss": 0.8405, + "step": 303790 + }, + { + "epoch": 1.9408916090617532, + "grad_norm": 0.9101753830909729, + "learning_rate": 2.2093197412810796e-07, + "loss": 0.9248, + "step": 303800 + }, + { + "epoch": 1.940955496211492, + "grad_norm": 0.7637187242507935, + "learning_rate": 2.2046104961561677e-07, + "loss": 0.9431, + "step": 303810 + }, + { + "epoch": 1.9410193833612306, + "grad_norm": 1.151958703994751, + "learning_rate": 2.199906264261531e-07, + "loss": 1.0865, + "step": 303820 + }, + { + "epoch": 1.9410832705109695, + "grad_norm": 0.9793457984924316, + "learning_rate": 2.195207045644465e-07, + "loss": 1.0678, + "step": 303830 + }, + { + "epoch": 1.941147157660708, + "grad_norm": 0.9088485836982727, + "learning_rate": 2.1905128403523212e-07, + "loss": 0.953, + "step": 303840 + }, + { + "epoch": 1.941211044810447, + 
"grad_norm": 1.3817073106765747, + "learning_rate": 2.18582364843245e-07, + "loss": 0.8613, + "step": 303850 + }, + { + "epoch": 1.9412749319601854, + "grad_norm": 0.7287924289703369, + "learning_rate": 2.1811394699319254e-07, + "loss": 0.951, + "step": 303860 + }, + { + "epoch": 1.9413388191099243, + "grad_norm": 2.202221393585205, + "learning_rate": 2.1764603048980426e-07, + "loss": 0.6123, + "step": 303870 + }, + { + "epoch": 1.9414027062596628, + "grad_norm": 0.8882316946983337, + "learning_rate": 2.171786153377875e-07, + "loss": 0.9085, + "step": 303880 + }, + { + "epoch": 1.9414665934094018, + "grad_norm": 1.4518402814865112, + "learning_rate": 2.167117015418496e-07, + "loss": 0.7472, + "step": 303890 + }, + { + "epoch": 1.9415304805591402, + "grad_norm": 1.1848163604736328, + "learning_rate": 2.162452891066924e-07, + "loss": 1.2847, + "step": 303900 + }, + { + "epoch": 1.9415943677088792, + "grad_norm": 1.130953073501587, + "learning_rate": 2.157793780370121e-07, + "loss": 0.8192, + "step": 303910 + }, + { + "epoch": 1.9416582548586176, + "grad_norm": 0.7800238132476807, + "learning_rate": 2.153139683375105e-07, + "loss": 0.517, + "step": 303920 + }, + { + "epoch": 1.9417221420083566, + "grad_norm": 1.3722091913223267, + "learning_rate": 2.1484906001286164e-07, + "loss": 0.8937, + "step": 303930 + }, + { + "epoch": 1.941786029158095, + "grad_norm": 0.8849961161613464, + "learning_rate": 2.1438465306775624e-07, + "loss": 0.6877, + "step": 303940 + }, + { + "epoch": 1.941849916307834, + "grad_norm": 1.1050699949264526, + "learning_rate": 2.1392074750686275e-07, + "loss": 0.7989, + "step": 303950 + }, + { + "epoch": 1.9419138034575725, + "grad_norm": 1.2571942806243896, + "learning_rate": 2.134573433348608e-07, + "loss": 0.7086, + "step": 303960 + }, + { + "epoch": 1.9419776906073114, + "grad_norm": 0.7900700569152832, + "learning_rate": 2.1299444055641882e-07, + "loss": 0.681, + "step": 303970 + }, + { + "epoch": 1.9420415777570499, + "grad_norm": 
0.8653547763824463, + "learning_rate": 2.125320391761887e-07, + "loss": 0.874, + "step": 303980 + }, + { + "epoch": 1.9421054649067888, + "grad_norm": 1.1003780364990234, + "learning_rate": 2.120701391988389e-07, + "loss": 0.8899, + "step": 303990 + }, + { + "epoch": 1.9421693520565273, + "grad_norm": 0.8403931260108948, + "learning_rate": 2.1160874062901014e-07, + "loss": 0.9496, + "step": 304000 + }, + { + "epoch": 1.942233239206266, + "grad_norm": 1.0029411315917969, + "learning_rate": 2.1114784347135985e-07, + "loss": 0.6715, + "step": 304010 + }, + { + "epoch": 1.9422971263560047, + "grad_norm": 0.5501324534416199, + "learning_rate": 2.106874477305232e-07, + "loss": 0.6133, + "step": 304020 + }, + { + "epoch": 1.9423610135057434, + "grad_norm": 1.3969063758850098, + "learning_rate": 2.1022755341112977e-07, + "loss": 0.7371, + "step": 304030 + }, + { + "epoch": 1.9424249006554821, + "grad_norm": 1.2871463298797607, + "learning_rate": 2.0976816051783142e-07, + "loss": 0.7898, + "step": 304040 + }, + { + "epoch": 1.9424887878052208, + "grad_norm": 0.9517881274223328, + "learning_rate": 2.0930926905523564e-07, + "loss": 0.8429, + "step": 304050 + }, + { + "epoch": 1.9425526749549595, + "grad_norm": 0.9025202989578247, + "learning_rate": 2.0885087902797195e-07, + "loss": 1.0197, + "step": 304060 + }, + { + "epoch": 1.9426165621046982, + "grad_norm": 1.7701680660247803, + "learning_rate": 2.083929904406534e-07, + "loss": 0.8149, + "step": 304070 + }, + { + "epoch": 1.942680449254437, + "grad_norm": 1.1089845895767212, + "learning_rate": 2.0793560329789297e-07, + "loss": 0.6883, + "step": 304080 + }, + { + "epoch": 1.9427443364041757, + "grad_norm": 1.1055128574371338, + "learning_rate": 2.0747871760429803e-07, + "loss": 0.7522, + "step": 304090 + }, + { + "epoch": 1.9428082235539144, + "grad_norm": 1.0907093286514282, + "learning_rate": 2.0702233336447052e-07, + "loss": 0.9392, + "step": 304100 + }, + { + "epoch": 1.942872110703653, + "grad_norm": 
0.8361429572105408, + "learning_rate": 2.0656645058300673e-07, + "loss": 0.9274, + "step": 304110 + }, + { + "epoch": 1.9429359978533918, + "grad_norm": 1.088329553604126, + "learning_rate": 2.0611106926449186e-07, + "loss": 0.8, + "step": 304120 + }, + { + "epoch": 1.9429998850031305, + "grad_norm": 3.630836009979248, + "learning_rate": 2.0565618941352228e-07, + "loss": 1.1398, + "step": 304130 + }, + { + "epoch": 1.9430637721528692, + "grad_norm": 0.8681971430778503, + "learning_rate": 2.052018110346665e-07, + "loss": 0.7294, + "step": 304140 + }, + { + "epoch": 1.943127659302608, + "grad_norm": 0.7005565762519836, + "learning_rate": 2.047479341325098e-07, + "loss": 1.0169, + "step": 304150 + }, + { + "epoch": 1.9431915464523466, + "grad_norm": 1.0016130208969116, + "learning_rate": 2.0429455871162073e-07, + "loss": 0.8592, + "step": 304160 + }, + { + "epoch": 1.9432554336020853, + "grad_norm": 1.0213990211486816, + "learning_rate": 2.0384168477656785e-07, + "loss": 0.6729, + "step": 304170 + }, + { + "epoch": 1.943319320751824, + "grad_norm": 1.0723354816436768, + "learning_rate": 2.0338931233190305e-07, + "loss": 1.1405, + "step": 304180 + }, + { + "epoch": 1.9433832079015627, + "grad_norm": 0.6627349853515625, + "learning_rate": 2.0293744138219495e-07, + "loss": 0.7205, + "step": 304190 + }, + { + "epoch": 1.9434470950513014, + "grad_norm": 0.9345703125, + "learning_rate": 2.0248607193197878e-07, + "loss": 0.7871, + "step": 304200 + }, + { + "epoch": 1.9435109822010401, + "grad_norm": 0.9300385117530823, + "learning_rate": 2.0203520398581754e-07, + "loss": 0.9794, + "step": 304210 + }, + { + "epoch": 1.9435748693507788, + "grad_norm": 1.2261062860488892, + "learning_rate": 2.0158483754824097e-07, + "loss": 0.9665, + "step": 304220 + }, + { + "epoch": 1.9436387565005175, + "grad_norm": 0.9771540760993958, + "learning_rate": 2.0113497262378432e-07, + "loss": 1.0119, + "step": 304230 + }, + { + "epoch": 1.9437026436502562, + "grad_norm": 1.694318413734436, + 
"learning_rate": 2.0068560921697732e-07, + "loss": 0.8907, + "step": 304240 + }, + { + "epoch": 1.9437665307999947, + "grad_norm": 1.1793835163116455, + "learning_rate": 2.002367473323552e-07, + "loss": 0.6662, + "step": 304250 + }, + { + "epoch": 1.9438304179497337, + "grad_norm": 1.1031192541122437, + "learning_rate": 1.9978838697443103e-07, + "loss": 0.8654, + "step": 304260 + }, + { + "epoch": 1.9438943050994721, + "grad_norm": 1.7324854135513306, + "learning_rate": 1.9934052814771785e-07, + "loss": 1.0354, + "step": 304270 + }, + { + "epoch": 1.943958192249211, + "grad_norm": 1.046745777130127, + "learning_rate": 1.9889317085673432e-07, + "loss": 0.7834, + "step": 304280 + }, + { + "epoch": 1.9440220793989496, + "grad_norm": 1.2141883373260498, + "learning_rate": 1.9844631510597677e-07, + "loss": 0.916, + "step": 304290 + }, + { + "epoch": 1.9440859665486885, + "grad_norm": 1.0867642164230347, + "learning_rate": 1.9799996089994721e-07, + "loss": 0.9096, + "step": 304300 + }, + { + "epoch": 1.944149853698427, + "grad_norm": 0.9960625171661377, + "learning_rate": 1.9755410824314758e-07, + "loss": 0.7334, + "step": 304310 + }, + { + "epoch": 1.944213740848166, + "grad_norm": 0.8736566305160522, + "learning_rate": 1.9710875714006316e-07, + "loss": 0.9111, + "step": 304320 + }, + { + "epoch": 1.9442776279979044, + "grad_norm": 1.8254917860031128, + "learning_rate": 1.9666390759517927e-07, + "loss": 0.9085, + "step": 304330 + }, + { + "epoch": 1.9443415151476433, + "grad_norm": 0.595846951007843, + "learning_rate": 1.9621955961297568e-07, + "loss": 0.8852, + "step": 304340 + }, + { + "epoch": 1.9444054022973818, + "grad_norm": 1.2150890827178955, + "learning_rate": 1.9577571319792098e-07, + "loss": 1.0281, + "step": 304350 + }, + { + "epoch": 1.9444692894471207, + "grad_norm": 1.0095974206924438, + "learning_rate": 1.9533236835450052e-07, + "loss": 0.8541, + "step": 304360 + }, + { + "epoch": 1.9445331765968592, + "grad_norm": 1.1392488479614258, + "learning_rate": 
1.948895250871663e-07, + "loss": 0.7374, + "step": 304370 + }, + { + "epoch": 1.9445970637465981, + "grad_norm": 1.0379918813705444, + "learning_rate": 1.9444718340038138e-07, + "loss": 0.8847, + "step": 304380 + }, + { + "epoch": 1.9446609508963366, + "grad_norm": 1.0987335443496704, + "learning_rate": 1.940053432986033e-07, + "loss": 0.7846, + "step": 304390 + }, + { + "epoch": 1.9447248380460755, + "grad_norm": 1.033288598060608, + "learning_rate": 1.9356400478627857e-07, + "loss": 0.8308, + "step": 304400 + }, + { + "epoch": 1.944788725195814, + "grad_norm": 1.2702211141586304, + "learning_rate": 1.93123167867848e-07, + "loss": 0.8131, + "step": 304410 + }, + { + "epoch": 1.944852612345553, + "grad_norm": 0.7985353469848633, + "learning_rate": 1.9268283254776364e-07, + "loss": 0.7777, + "step": 304420 + }, + { + "epoch": 1.9449164994952914, + "grad_norm": 1.0405573844909668, + "learning_rate": 1.922429988304497e-07, + "loss": 0.8938, + "step": 304430 + }, + { + "epoch": 1.9449803866450304, + "grad_norm": 0.81767737865448, + "learning_rate": 1.91803666720336e-07, + "loss": 0.6554, + "step": 304440 + }, + { + "epoch": 1.9450442737947689, + "grad_norm": 0.8658244609832764, + "learning_rate": 1.9136483622185787e-07, + "loss": 0.7811, + "step": 304450 + }, + { + "epoch": 1.9451081609445078, + "grad_norm": 0.8948516249656677, + "learning_rate": 1.909265073394173e-07, + "loss": 0.8768, + "step": 304460 + }, + { + "epoch": 1.9451720480942463, + "grad_norm": 0.7505782246589661, + "learning_rate": 1.9048868007744413e-07, + "loss": 0.8596, + "step": 304470 + }, + { + "epoch": 1.945235935243985, + "grad_norm": 0.6966861486434937, + "learning_rate": 1.900513544403404e-07, + "loss": 0.7792, + "step": 304480 + }, + { + "epoch": 1.9452998223937237, + "grad_norm": 0.9721227288246155, + "learning_rate": 1.896145304325081e-07, + "loss": 0.709, + "step": 304490 + }, + { + "epoch": 1.9453637095434624, + "grad_norm": 0.7984893918037415, + "learning_rate": 1.8917820805834928e-07, + 
"loss": 1.1086, + "step": 304500 + }, + { + "epoch": 1.945427596693201, + "grad_norm": 1.087112307548523, + "learning_rate": 1.8874238732226047e-07, + "loss": 1.0401, + "step": 304510 + }, + { + "epoch": 1.9454914838429398, + "grad_norm": 0.9860308766365051, + "learning_rate": 1.8830706822863254e-07, + "loss": 1.1688, + "step": 304520 + }, + { + "epoch": 1.9455553709926785, + "grad_norm": 2.163724899291992, + "learning_rate": 1.878722507818398e-07, + "loss": 0.8776, + "step": 304530 + }, + { + "epoch": 1.9456192581424172, + "grad_norm": 1.2752691507339478, + "learning_rate": 1.874379349862676e-07, + "loss": 0.9181, + "step": 304540 + }, + { + "epoch": 1.945683145292156, + "grad_norm": 1.5012123584747314, + "learning_rate": 1.8700412084629027e-07, + "loss": 0.9558, + "step": 304550 + }, + { + "epoch": 1.9457470324418946, + "grad_norm": 0.8935086727142334, + "learning_rate": 1.865708083662765e-07, + "loss": 1.0123, + "step": 304560 + }, + { + "epoch": 1.9458109195916333, + "grad_norm": 0.936139702796936, + "learning_rate": 1.8613799755058948e-07, + "loss": 1.0713, + "step": 304570 + }, + { + "epoch": 1.945874806741372, + "grad_norm": 1.1493687629699707, + "learning_rate": 1.857056884035868e-07, + "loss": 0.9304, + "step": 304580 + }, + { + "epoch": 1.9459386938911107, + "grad_norm": 1.3518346548080444, + "learning_rate": 1.8527388092962616e-07, + "loss": 0.764, + "step": 304590 + }, + { + "epoch": 1.9460025810408494, + "grad_norm": 1.3774468898773193, + "learning_rate": 1.8484257513305403e-07, + "loss": 0.9028, + "step": 304600 + }, + { + "epoch": 1.9460664681905882, + "grad_norm": 0.7998141050338745, + "learning_rate": 1.8441177101821138e-07, + "loss": 0.813, + "step": 304610 + }, + { + "epoch": 1.9461303553403269, + "grad_norm": 0.8194617033004761, + "learning_rate": 1.839814685894392e-07, + "loss": 0.7935, + "step": 304620 + }, + { + "epoch": 1.9461942424900656, + "grad_norm": 0.8891813158988953, + "learning_rate": 1.8355166785106738e-07, + "loss": 0.742, + 
"step": 304630 + }, + { + "epoch": 1.9462581296398043, + "grad_norm": 1.0683954954147339, + "learning_rate": 1.8312236880743127e-07, + "loss": 0.7811, + "step": 304640 + }, + { + "epoch": 1.946322016789543, + "grad_norm": 0.6153748631477356, + "learning_rate": 1.826935714628497e-07, + "loss": 0.9762, + "step": 304650 + }, + { + "epoch": 1.9463859039392817, + "grad_norm": 0.7627272605895996, + "learning_rate": 1.8226527582164143e-07, + "loss": 0.8678, + "step": 304660 + }, + { + "epoch": 1.9464497910890204, + "grad_norm": 1.9040825366973877, + "learning_rate": 1.8183748188811967e-07, + "loss": 1.0886, + "step": 304670 + }, + { + "epoch": 1.946513678238759, + "grad_norm": 0.7414036393165588, + "learning_rate": 1.8141018966659206e-07, + "loss": 0.8223, + "step": 304680 + }, + { + "epoch": 1.9465775653884978, + "grad_norm": 0.9439243674278259, + "learning_rate": 1.809833991613663e-07, + "loss": 0.7223, + "step": 304690 + }, + { + "epoch": 1.9466414525382365, + "grad_norm": 1.765831470489502, + "learning_rate": 1.8055711037673894e-07, + "loss": 0.9139, + "step": 304700 + }, + { + "epoch": 1.9467053396879752, + "grad_norm": 0.9545259475708008, + "learning_rate": 1.8013132331699546e-07, + "loss": 0.8998, + "step": 304710 + }, + { + "epoch": 1.946769226837714, + "grad_norm": 0.9046018123626709, + "learning_rate": 1.7970603798643237e-07, + "loss": 0.6941, + "step": 304720 + }, + { + "epoch": 1.9468331139874526, + "grad_norm": 0.7828860282897949, + "learning_rate": 1.792812543893352e-07, + "loss": 0.9326, + "step": 304730 + }, + { + "epoch": 1.946897001137191, + "grad_norm": 1.1395444869995117, + "learning_rate": 1.788569725299727e-07, + "loss": 0.8888, + "step": 304740 + }, + { + "epoch": 1.94696088828693, + "grad_norm": 1.9494872093200684, + "learning_rate": 1.7843319241261924e-07, + "loss": 1.1222, + "step": 304750 + }, + { + "epoch": 1.9470247754366685, + "grad_norm": 1.2636685371398926, + "learning_rate": 1.780099140415492e-07, + "loss": 0.8966, + "step": 304760 + }, + 
{ + "epoch": 1.9470886625864074, + "grad_norm": 0.903266191482544, + "learning_rate": 1.775871374210203e-07, + "loss": 0.8379, + "step": 304770 + }, + { + "epoch": 1.947152549736146, + "grad_norm": 1.4494318962097168, + "learning_rate": 1.771648625552902e-07, + "loss": 0.8868, + "step": 304780 + }, + { + "epoch": 1.9472164368858849, + "grad_norm": 0.7543337941169739, + "learning_rate": 1.7674308944861106e-07, + "loss": 0.6507, + "step": 304790 + }, + { + "epoch": 1.9472803240356233, + "grad_norm": 0.6446106433868408, + "learning_rate": 1.7632181810524062e-07, + "loss": 0.9346, + "step": 304800 + }, + { + "epoch": 1.9473442111853623, + "grad_norm": 0.9636729955673218, + "learning_rate": 1.7590104852940326e-07, + "loss": 0.8048, + "step": 304810 + }, + { + "epoch": 1.9474080983351008, + "grad_norm": 0.928029477596283, + "learning_rate": 1.7548078072535114e-07, + "loss": 0.9762, + "step": 304820 + }, + { + "epoch": 1.9474719854848397, + "grad_norm": 2.0930633544921875, + "learning_rate": 1.7506101469731416e-07, + "loss": 0.6671, + "step": 304830 + }, + { + "epoch": 1.9475358726345782, + "grad_norm": 1.1717947721481323, + "learning_rate": 1.7464175044951126e-07, + "loss": 0.9609, + "step": 304840 + }, + { + "epoch": 1.947599759784317, + "grad_norm": 0.8623785376548767, + "learning_rate": 1.7422298798617787e-07, + "loss": 0.8877, + "step": 304850 + }, + { + "epoch": 1.9476636469340556, + "grad_norm": 0.6183810830116272, + "learning_rate": 1.7380472731152175e-07, + "loss": 0.7253, + "step": 304860 + }, + { + "epoch": 1.9477275340837945, + "grad_norm": 0.8609592318534851, + "learning_rate": 1.733869684297562e-07, + "loss": 0.9726, + "step": 304870 + }, + { + "epoch": 1.947791421233533, + "grad_norm": 0.8909668326377869, + "learning_rate": 1.7296971134508898e-07, + "loss": 1.0667, + "step": 304880 + }, + { + "epoch": 1.947855308383272, + "grad_norm": 1.0848865509033203, + "learning_rate": 1.7255295606172784e-07, + "loss": 0.8502, + "step": 304890 + }, + { + "epoch": 
1.9479191955330104, + "grad_norm": 0.9160458445549011, + "learning_rate": 1.7213670258386384e-07, + "loss": 0.8162, + "step": 304900 + }, + { + "epoch": 1.9479830826827493, + "grad_norm": 1.1018562316894531, + "learning_rate": 1.7172095091568807e-07, + "loss": 0.8825, + "step": 304910 + }, + { + "epoch": 1.9480469698324878, + "grad_norm": 2.1245996952056885, + "learning_rate": 1.7130570106139166e-07, + "loss": 0.8319, + "step": 304920 + }, + { + "epoch": 1.9481108569822267, + "grad_norm": 3.173818588256836, + "learning_rate": 1.7089095302515456e-07, + "loss": 0.8999, + "step": 304930 + }, + { + "epoch": 1.9481747441319652, + "grad_norm": 0.758355975151062, + "learning_rate": 1.7047670681115125e-07, + "loss": 0.8667, + "step": 304940 + }, + { + "epoch": 1.9482386312817042, + "grad_norm": 0.6632780432701111, + "learning_rate": 1.7006296242355613e-07, + "loss": 0.6416, + "step": 304950 + }, + { + "epoch": 1.9483025184314426, + "grad_norm": 0.8830016851425171, + "learning_rate": 1.6964971986654366e-07, + "loss": 0.8193, + "step": 304960 + }, + { + "epoch": 1.9483664055811813, + "grad_norm": 1.0977959632873535, + "learning_rate": 1.692369791442605e-07, + "loss": 0.867, + "step": 304970 + }, + { + "epoch": 1.94843029273092, + "grad_norm": 1.4547436237335205, + "learning_rate": 1.6882474026087557e-07, + "loss": 0.7399, + "step": 304980 + }, + { + "epoch": 1.9484941798806588, + "grad_norm": 0.8041918277740479, + "learning_rate": 1.6841300322053e-07, + "loss": 0.9664, + "step": 304990 + }, + { + "epoch": 1.9485580670303975, + "grad_norm": 0.667141318321228, + "learning_rate": 1.6800176802738153e-07, + "loss": 0.8188, + "step": 305000 + }, + { + "epoch": 1.9486219541801362, + "grad_norm": 0.9592339396476746, + "learning_rate": 1.6759103468556025e-07, + "loss": 0.8442, + "step": 305010 + }, + { + "epoch": 1.9486858413298749, + "grad_norm": 0.9630610942840576, + "learning_rate": 1.671808031992128e-07, + "loss": 0.8916, + "step": 305020 + }, + { + "epoch": 1.9487497284796136, + 
"grad_norm": 5.386287689208984, + "learning_rate": 1.6677107357246368e-07, + "loss": 0.7866, + "step": 305030 + }, + { + "epoch": 1.9488136156293523, + "grad_norm": 1.0215415954589844, + "learning_rate": 1.663618458094429e-07, + "loss": 0.8229, + "step": 305040 + }, + { + "epoch": 1.948877502779091, + "grad_norm": 5.606192588806152, + "learning_rate": 1.6595311991426943e-07, + "loss": 1.0394, + "step": 305050 + }, + { + "epoch": 1.9489413899288297, + "grad_norm": 1.9907209873199463, + "learning_rate": 1.655448958910677e-07, + "loss": 1.229, + "step": 305060 + }, + { + "epoch": 1.9490052770785684, + "grad_norm": 0.9894927740097046, + "learning_rate": 1.6513717374393445e-07, + "loss": 0.9943, + "step": 305070 + }, + { + "epoch": 1.9490691642283071, + "grad_norm": 0.8075218796730042, + "learning_rate": 1.6472995347698305e-07, + "loss": 0.7627, + "step": 305080 + }, + { + "epoch": 1.9491330513780458, + "grad_norm": 1.380251169204712, + "learning_rate": 1.643232350943158e-07, + "loss": 0.8527, + "step": 305090 + }, + { + "epoch": 1.9491969385277845, + "grad_norm": 0.6336302161216736, + "learning_rate": 1.639170186000294e-07, + "loss": 0.5664, + "step": 305100 + }, + { + "epoch": 1.9492608256775232, + "grad_norm": 1.226217269897461, + "learning_rate": 1.6351130399820946e-07, + "loss": 0.8283, + "step": 305110 + }, + { + "epoch": 1.949324712827262, + "grad_norm": 1.1361498832702637, + "learning_rate": 1.6310609129294718e-07, + "loss": 1.1582, + "step": 305120 + }, + { + "epoch": 1.9493885999770006, + "grad_norm": 0.9070385694503784, + "learning_rate": 1.6270138048832262e-07, + "loss": 0.921, + "step": 305130 + }, + { + "epoch": 1.9494524871267394, + "grad_norm": 2.407209634780884, + "learning_rate": 1.6229717158841028e-07, + "loss": 0.8725, + "step": 305140 + }, + { + "epoch": 1.949516374276478, + "grad_norm": 0.6049879193305969, + "learning_rate": 1.6189346459727916e-07, + "loss": 0.817, + "step": 305150 + }, + { + "epoch": 1.9495802614262168, + "grad_norm": 
0.8451147079467773, + "learning_rate": 1.6149025951899822e-07, + "loss": 0.8979, + "step": 305160 + }, + { + "epoch": 1.9496441485759555, + "grad_norm": 0.7237834930419922, + "learning_rate": 1.6108755635763083e-07, + "loss": 0.638, + "step": 305170 + }, + { + "epoch": 1.9497080357256942, + "grad_norm": 0.8538950085639954, + "learning_rate": 1.6068535511722383e-07, + "loss": 0.7442, + "step": 305180 + }, + { + "epoch": 1.9497719228754329, + "grad_norm": 1.0325095653533936, + "learning_rate": 1.6028365580183503e-07, + "loss": 0.751, + "step": 305190 + }, + { + "epoch": 1.9498358100251716, + "grad_norm": 1.1748923063278198, + "learning_rate": 1.5988245841550566e-07, + "loss": 1.0115, + "step": 305200 + }, + { + "epoch": 1.94989969717491, + "grad_norm": 0.9139209389686584, + "learning_rate": 1.594817629622769e-07, + "loss": 0.8674, + "step": 305210 + }, + { + "epoch": 1.949963584324649, + "grad_norm": 0.9411085844039917, + "learning_rate": 1.5908156944618447e-07, + "loss": 0.8427, + "step": 305220 + }, + { + "epoch": 1.9500274714743875, + "grad_norm": 1.0001469850540161, + "learning_rate": 1.5872182444128803e-07, + "loss": 0.8739, + "step": 305230 + }, + { + "epoch": 1.9500913586241264, + "grad_norm": 0.7743364572525024, + "learning_rate": 1.5832258461685456e-07, + "loss": 0.6349, + "step": 305240 + }, + { + "epoch": 1.950155245773865, + "grad_norm": 1.0801235437393188, + "learning_rate": 1.5792384674123695e-07, + "loss": 0.9892, + "step": 305250 + }, + { + "epoch": 1.9502191329236038, + "grad_norm": 1.2719773054122925, + "learning_rate": 1.575256108184431e-07, + "loss": 0.8488, + "step": 305260 + }, + { + "epoch": 1.9502830200733423, + "grad_norm": 0.8136735558509827, + "learning_rate": 1.5712787685248088e-07, + "loss": 1.0387, + "step": 305270 + }, + { + "epoch": 1.9503469072230812, + "grad_norm": 1.0452874898910522, + "learning_rate": 1.5673064484736933e-07, + "loss": 0.8763, + "step": 305280 + }, + { + "epoch": 1.9504107943728197, + "grad_norm": 
0.9515146613121033, + "learning_rate": 1.5633391480709413e-07, + "loss": 0.673, + "step": 305290 + }, + { + "epoch": 1.9504746815225587, + "grad_norm": 0.5954400897026062, + "learning_rate": 1.5593768673566323e-07, + "loss": 0.7687, + "step": 305300 + }, + { + "epoch": 1.9505385686722971, + "grad_norm": 0.7953042387962341, + "learning_rate": 1.5554196063705673e-07, + "loss": 0.7698, + "step": 305310 + }, + { + "epoch": 1.950602455822036, + "grad_norm": 0.829258143901825, + "learning_rate": 1.551467365152659e-07, + "loss": 0.8873, + "step": 305320 + }, + { + "epoch": 1.9506663429717745, + "grad_norm": 0.837475597858429, + "learning_rate": 1.5475201437427088e-07, + "loss": 0.7546, + "step": 305330 + }, + { + "epoch": 1.9507302301215135, + "grad_norm": 1.1831344366073608, + "learning_rate": 1.543577942180463e-07, + "loss": 0.9384, + "step": 305340 + }, + { + "epoch": 1.950794117271252, + "grad_norm": 0.9250369668006897, + "learning_rate": 1.5396407605055564e-07, + "loss": 0.8128, + "step": 305350 + }, + { + "epoch": 1.9508580044209909, + "grad_norm": 1.4125860929489136, + "learning_rate": 1.5357085987577347e-07, + "loss": 0.7333, + "step": 305360 + }, + { + "epoch": 1.9509218915707294, + "grad_norm": 1.054139494895935, + "learning_rate": 1.5317814569765775e-07, + "loss": 0.8228, + "step": 305370 + }, + { + "epoch": 1.9509857787204683, + "grad_norm": 0.9782888889312744, + "learning_rate": 1.5278593352015536e-07, + "loss": 0.9152, + "step": 305380 + }, + { + "epoch": 1.9510496658702068, + "grad_norm": 1.1666638851165771, + "learning_rate": 1.5239422334722974e-07, + "loss": 0.6384, + "step": 305390 + }, + { + "epoch": 1.9511135530199457, + "grad_norm": 1.5675245523452759, + "learning_rate": 1.5200301518281667e-07, + "loss": 1.0474, + "step": 305400 + }, + { + "epoch": 1.9511774401696842, + "grad_norm": 0.962260365486145, + "learning_rate": 1.5161230903085188e-07, + "loss": 0.8123, + "step": 305410 + }, + { + "epoch": 1.9512413273194231, + "grad_norm": 1.3214889764785767, 
+ "learning_rate": 1.512221048952822e-07, + "loss": 0.9154, + "step": 305420 + }, + { + "epoch": 1.9513052144691616, + "grad_norm": 1.0269955396652222, + "learning_rate": 1.508324027800323e-07, + "loss": 1.0752, + "step": 305430 + }, + { + "epoch": 1.9513691016189005, + "grad_norm": 1.121466875076294, + "learning_rate": 1.5044320268902124e-07, + "loss": 0.9382, + "step": 305440 + }, + { + "epoch": 1.951432988768639, + "grad_norm": 1.2383556365966797, + "learning_rate": 1.500545046261792e-07, + "loss": 0.9561, + "step": 305450 + }, + { + "epoch": 1.9514968759183777, + "grad_norm": 0.8118842840194702, + "learning_rate": 1.4966630859540865e-07, + "loss": 0.7875, + "step": 305460 + }, + { + "epoch": 1.9515607630681164, + "grad_norm": 2.2885642051696777, + "learning_rate": 1.4927861460062865e-07, + "loss": 1.2945, + "step": 305470 + }, + { + "epoch": 1.9516246502178551, + "grad_norm": 1.0705455541610718, + "learning_rate": 1.4889142264573607e-07, + "loss": 1.124, + "step": 305480 + }, + { + "epoch": 1.9516885373675938, + "grad_norm": 0.8999394178390503, + "learning_rate": 1.4850473273463895e-07, + "loss": 0.8277, + "step": 305490 + }, + { + "epoch": 1.9517524245173326, + "grad_norm": 0.9403774738311768, + "learning_rate": 1.48118544871223e-07, + "loss": 1.1041, + "step": 305500 + }, + { + "epoch": 1.9518163116670713, + "grad_norm": 1.0024014711380005, + "learning_rate": 1.4773285905938517e-07, + "loss": 0.8263, + "step": 305510 + }, + { + "epoch": 1.95188019881681, + "grad_norm": 0.8567982316017151, + "learning_rate": 1.4734767530300564e-07, + "loss": 0.9281, + "step": 305520 + }, + { + "epoch": 1.9519440859665487, + "grad_norm": 1.4160600900650024, + "learning_rate": 1.469629936059591e-07, + "loss": 1.1148, + "step": 305530 + }, + { + "epoch": 1.9520079731162874, + "grad_norm": 1.0826430320739746, + "learning_rate": 1.465788139721258e-07, + "loss": 0.878, + "step": 305540 + }, + { + "epoch": 1.952071860266026, + "grad_norm": 0.7373409271240234, + "learning_rate": 
1.4619513640537486e-07, + "loss": 1.0469, + "step": 305550 + }, + { + "epoch": 1.9521357474157648, + "grad_norm": 1.3082698583602905, + "learning_rate": 1.4581196090956984e-07, + "loss": 0.8581, + "step": 305560 + }, + { + "epoch": 1.9521996345655035, + "grad_norm": 0.9631960988044739, + "learning_rate": 1.4542928748856877e-07, + "loss": 0.8367, + "step": 305570 + }, + { + "epoch": 1.9522635217152422, + "grad_norm": 1.2323166131973267, + "learning_rate": 1.4504711614621857e-07, + "loss": 1.0647, + "step": 305580 + }, + { + "epoch": 1.952327408864981, + "grad_norm": 1.3021869659423828, + "learning_rate": 1.446654468863773e-07, + "loss": 0.8232, + "step": 305590 + }, + { + "epoch": 1.9523912960147196, + "grad_norm": 1.011557936668396, + "learning_rate": 1.4428427971289184e-07, + "loss": 0.749, + "step": 305600 + }, + { + "epoch": 1.9524551831644583, + "grad_norm": 0.98341965675354, + "learning_rate": 1.439036146295869e-07, + "loss": 0.7362, + "step": 305610 + }, + { + "epoch": 1.952519070314197, + "grad_norm": 1.0501110553741455, + "learning_rate": 1.4352345164030945e-07, + "loss": 0.78, + "step": 305620 + }, + { + "epoch": 1.9525829574639357, + "grad_norm": 0.6365110278129578, + "learning_rate": 1.4314379074888418e-07, + "loss": 0.831, + "step": 305630 + }, + { + "epoch": 1.9526468446136744, + "grad_norm": 1.1915727853775024, + "learning_rate": 1.4276463195913025e-07, + "loss": 0.8709, + "step": 305640 + }, + { + "epoch": 1.9527107317634131, + "grad_norm": 0.9435083866119385, + "learning_rate": 1.4238597527486685e-07, + "loss": 0.8048, + "step": 305650 + }, + { + "epoch": 1.9527746189131518, + "grad_norm": 0.7186278700828552, + "learning_rate": 1.4200782069991314e-07, + "loss": 0.6761, + "step": 305660 + }, + { + "epoch": 1.9528385060628906, + "grad_norm": 0.835472822189331, + "learning_rate": 1.4163016823807164e-07, + "loss": 1.257, + "step": 305670 + }, + { + "epoch": 1.9529023932126293, + "grad_norm": 0.987421452999115, + "learning_rate": 1.4125301789315038e-07, 
+ "loss": 0.6752, + "step": 305680 + }, + { + "epoch": 1.952966280362368, + "grad_norm": 0.7175649404525757, + "learning_rate": 1.4087636966894635e-07, + "loss": 1.1305, + "step": 305690 + }, + { + "epoch": 1.9530301675121065, + "grad_norm": 1.1987440586090088, + "learning_rate": 1.4050022356925096e-07, + "loss": 0.7934, + "step": 305700 + }, + { + "epoch": 1.9530940546618454, + "grad_norm": 0.8681856393814087, + "learning_rate": 1.4012457959785007e-07, + "loss": 0.6293, + "step": 305710 + }, + { + "epoch": 1.9531579418115839, + "grad_norm": 1.4752001762390137, + "learning_rate": 1.3974943775852956e-07, + "loss": 0.9793, + "step": 305720 + }, + { + "epoch": 1.9532218289613228, + "grad_norm": 0.9839785695075989, + "learning_rate": 1.393747980550697e-07, + "loss": 0.8663, + "step": 305730 + }, + { + "epoch": 1.9532857161110613, + "grad_norm": 1.206047773361206, + "learning_rate": 1.3900066049123972e-07, + "loss": 1.0417, + "step": 305740 + }, + { + "epoch": 1.9533496032608002, + "grad_norm": 0.652935266494751, + "learning_rate": 1.3862702507080883e-07, + "loss": 0.7282, + "step": 305750 + }, + { + "epoch": 1.9534134904105387, + "grad_norm": 0.7819461226463318, + "learning_rate": 1.3825389179754067e-07, + "loss": 0.9542, + "step": 305760 + }, + { + "epoch": 1.9534773775602776, + "grad_norm": 1.0411031246185303, + "learning_rate": 1.3788126067519337e-07, + "loss": 0.7367, + "step": 305770 + }, + { + "epoch": 1.953541264710016, + "grad_norm": 0.45564350485801697, + "learning_rate": 1.3750913170751944e-07, + "loss": 0.8215, + "step": 305780 + }, + { + "epoch": 1.953605151859755, + "grad_norm": 0.9776589870452881, + "learning_rate": 1.3713750489826038e-07, + "loss": 0.5945, + "step": 305790 + }, + { + "epoch": 1.9536690390094935, + "grad_norm": 1.087541103363037, + "learning_rate": 1.3676638025116872e-07, + "loss": 1.0174, + "step": 305800 + }, + { + "epoch": 1.9537329261592324, + "grad_norm": 0.7229280471801758, + "learning_rate": 1.3639575776997483e-07, + "loss": 
0.5723, + "step": 305810 + }, + { + "epoch": 1.953796813308971, + "grad_norm": 1.1533738374710083, + "learning_rate": 1.3602563745842012e-07, + "loss": 1.0214, + "step": 305820 + }, + { + "epoch": 1.9538607004587099, + "grad_norm": 0.9949087500572205, + "learning_rate": 1.3565601932021831e-07, + "loss": 1.2322, + "step": 305830 + }, + { + "epoch": 1.9539245876084483, + "grad_norm": 1.1862506866455078, + "learning_rate": 1.3528690335909978e-07, + "loss": 0.7501, + "step": 305840 + }, + { + "epoch": 1.9539884747581873, + "grad_norm": 1.5264919996261597, + "learning_rate": 1.3491828957878927e-07, + "loss": 0.8117, + "step": 305850 + }, + { + "epoch": 1.9540523619079258, + "grad_norm": 0.72819983959198, + "learning_rate": 1.3455017798298386e-07, + "loss": 1.1966, + "step": 305860 + }, + { + "epoch": 1.9541162490576647, + "grad_norm": 1.30023193359375, + "learning_rate": 1.3418256857539724e-07, + "loss": 1.1788, + "step": 305870 + }, + { + "epoch": 1.9541801362074032, + "grad_norm": 1.353724479675293, + "learning_rate": 1.33815461359732e-07, + "loss": 0.7519, + "step": 305880 + }, + { + "epoch": 1.954244023357142, + "grad_norm": 0.8705479502677917, + "learning_rate": 1.3344885633969073e-07, + "loss": 1.2264, + "step": 305890 + }, + { + "epoch": 1.9543079105068806, + "grad_norm": 0.7442498803138733, + "learning_rate": 1.3308275351895938e-07, + "loss": 0.8841, + "step": 305900 + }, + { + "epoch": 1.9543717976566195, + "grad_norm": 0.5314042568206787, + "learning_rate": 1.327171529012239e-07, + "loss": 0.662, + "step": 305910 + }, + { + "epoch": 1.954435684806358, + "grad_norm": 1.0172762870788574, + "learning_rate": 1.3235205449016463e-07, + "loss": 0.8995, + "step": 305920 + }, + { + "epoch": 1.954499571956097, + "grad_norm": 1.180970549583435, + "learning_rate": 1.3198745828946758e-07, + "loss": 1.0673, + "step": 305930 + }, + { + "epoch": 1.9545634591058354, + "grad_norm": 1.0113781690597534, + "learning_rate": 1.3162336430279642e-07, + "loss": 1.3776, + "step": 305940 
+ }, + { + "epoch": 1.954627346255574, + "grad_norm": 1.205683708190918, + "learning_rate": 1.3125977253382048e-07, + "loss": 0.9683, + "step": 305950 + }, + { + "epoch": 1.9546912334053128, + "grad_norm": 0.731574296951294, + "learning_rate": 1.3089668298619794e-07, + "loss": 1.0741, + "step": 305960 + }, + { + "epoch": 1.9547551205550515, + "grad_norm": 0.8031790852546692, + "learning_rate": 1.3053409566359253e-07, + "loss": 0.8409, + "step": 305970 + }, + { + "epoch": 1.9548190077047902, + "grad_norm": 0.8294908404350281, + "learning_rate": 1.3017201056965133e-07, + "loss": 0.7015, + "step": 305980 + }, + { + "epoch": 1.954882894854529, + "grad_norm": 0.8706844449043274, + "learning_rate": 1.2981042770802143e-07, + "loss": 1.1126, + "step": 305990 + }, + { + "epoch": 1.9549467820042676, + "grad_norm": 0.9077652096748352, + "learning_rate": 1.2944934708234436e-07, + "loss": 0.7772, + "step": 306000 + }, + { + "epoch": 1.9550106691540063, + "grad_norm": 0.8920031189918518, + "learning_rate": 1.2908876869625608e-07, + "loss": 0.5676, + "step": 306010 + }, + { + "epoch": 1.955074556303745, + "grad_norm": 0.7555814981460571, + "learning_rate": 1.2872869255338704e-07, + "loss": 0.9694, + "step": 306020 + }, + { + "epoch": 1.9551384434534838, + "grad_norm": 1.2185505628585815, + "learning_rate": 1.2836911865736767e-07, + "loss": 0.7554, + "step": 306030 + }, + { + "epoch": 1.9552023306032225, + "grad_norm": 3.282710552215576, + "learning_rate": 1.2801004701181175e-07, + "loss": 0.808, + "step": 306040 + }, + { + "epoch": 1.9552662177529612, + "grad_norm": 0.7775924801826477, + "learning_rate": 1.2765147762034413e-07, + "loss": 0.7981, + "step": 306050 + }, + { + "epoch": 1.9553301049026999, + "grad_norm": 0.8961272239685059, + "learning_rate": 1.2729341048657305e-07, + "loss": 0.9899, + "step": 306060 + }, + { + "epoch": 1.9553939920524386, + "grad_norm": 1.0614861249923706, + "learning_rate": 1.269358456141012e-07, + "loss": 0.8512, + "step": 306070 + }, + { + 
"epoch": 1.9554578792021773, + "grad_norm": 0.9941128492355347, + "learning_rate": 1.2657878300653125e-07, + "loss": 0.9489, + "step": 306080 + }, + { + "epoch": 1.955521766351916, + "grad_norm": 1.3333861827850342, + "learning_rate": 1.262222226674603e-07, + "loss": 0.6982, + "step": 306090 + }, + { + "epoch": 1.9555856535016547, + "grad_norm": 1.0364742279052734, + "learning_rate": 1.2586616460047996e-07, + "loss": 0.8277, + "step": 306100 + }, + { + "epoch": 1.9556495406513934, + "grad_norm": 0.9991453289985657, + "learning_rate": 1.2551060880917065e-07, + "loss": 0.8136, + "step": 306110 + }, + { + "epoch": 1.955713427801132, + "grad_norm": 0.8465530276298523, + "learning_rate": 1.2515555529711842e-07, + "loss": 0.797, + "step": 306120 + }, + { + "epoch": 1.9557773149508708, + "grad_norm": 0.8986508846282959, + "learning_rate": 1.2480100406790374e-07, + "loss": 0.9838, + "step": 306130 + }, + { + "epoch": 1.9558412021006095, + "grad_norm": 1.2777119874954224, + "learning_rate": 1.2444695512508487e-07, + "loss": 1.0812, + "step": 306140 + }, + { + "epoch": 1.9559050892503482, + "grad_norm": 0.688077986240387, + "learning_rate": 1.2409340847223672e-07, + "loss": 0.8734, + "step": 306150 + }, + { + "epoch": 1.955968976400087, + "grad_norm": 3.1722869873046875, + "learning_rate": 1.23740364112912e-07, + "loss": 0.863, + "step": 306160 + }, + { + "epoch": 1.9560328635498256, + "grad_norm": 1.2481809854507446, + "learning_rate": 1.2338782205067455e-07, + "loss": 0.8628, + "step": 306170 + }, + { + "epoch": 1.9560967506995643, + "grad_norm": 0.9949286580085754, + "learning_rate": 1.2303578228907153e-07, + "loss": 0.857, + "step": 306180 + }, + { + "epoch": 1.9561606378493028, + "grad_norm": 0.8365099430084229, + "learning_rate": 1.2268424483164453e-07, + "loss": 1.0734, + "step": 306190 + }, + { + "epoch": 1.9562245249990418, + "grad_norm": 1.7561908960342407, + "learning_rate": 1.2233320968194072e-07, + "loss": 0.7738, + "step": 306200 + }, + { + "epoch": 
1.9562884121487802, + "grad_norm": 0.6921905875205994, + "learning_rate": 1.2198267684349063e-07, + "loss": 0.829, + "step": 306210 + }, + { + "epoch": 1.9563522992985192, + "grad_norm": 1.623157024383545, + "learning_rate": 1.2163264631982474e-07, + "loss": 0.8584, + "step": 306220 + }, + { + "epoch": 1.9564161864482577, + "grad_norm": 0.611640989780426, + "learning_rate": 1.2128311811447356e-07, + "loss": 0.7492, + "step": 306230 + }, + { + "epoch": 1.9564800735979966, + "grad_norm": 1.3138545751571655, + "learning_rate": 1.2093409223094542e-07, + "loss": 1.0977, + "step": 306240 + }, + { + "epoch": 1.956543960747735, + "grad_norm": 0.7451996803283691, + "learning_rate": 1.2058556867276528e-07, + "loss": 0.868, + "step": 306250 + }, + { + "epoch": 1.956607847897474, + "grad_norm": 0.6381491422653198, + "learning_rate": 1.202375474434414e-07, + "loss": 1.0713, + "step": 306260 + }, + { + "epoch": 1.9566717350472125, + "grad_norm": 1.098995327949524, + "learning_rate": 1.1989002854647659e-07, + "loss": 1.1494, + "step": 306270 + }, + { + "epoch": 1.9567356221969514, + "grad_norm": 1.9603251218795776, + "learning_rate": 1.1954301198537353e-07, + "loss": 0.9146, + "step": 306280 + }, + { + "epoch": 1.95679950934669, + "grad_norm": 0.8454369306564331, + "learning_rate": 1.1919649776362397e-07, + "loss": 1.0223, + "step": 306290 + }, + { + "epoch": 1.9568633964964288, + "grad_norm": 1.5903698205947876, + "learning_rate": 1.1885048588471948e-07, + "loss": 0.8199, + "step": 306300 + }, + { + "epoch": 1.9569272836461673, + "grad_norm": 1.051414132118225, + "learning_rate": 1.1850497635214064e-07, + "loss": 0.8365, + "step": 306310 + }, + { + "epoch": 1.9569911707959062, + "grad_norm": 1.1712877750396729, + "learning_rate": 1.1815996916937356e-07, + "loss": 0.8556, + "step": 306320 + }, + { + "epoch": 1.9570550579456447, + "grad_norm": 1.2720564603805542, + "learning_rate": 1.1781546433988766e-07, + "loss": 1.0154, + "step": 306330 + }, + { + "epoch": 1.9571189450953836, + 
"grad_norm": 0.6502274870872498, + "learning_rate": 1.1747146186715796e-07, + "loss": 0.8979, + "step": 306340 + }, + { + "epoch": 1.9571828322451221, + "grad_norm": 0.8052017092704773, + "learning_rate": 1.171279617546428e-07, + "loss": 0.9568, + "step": 306350 + }, + { + "epoch": 1.957246719394861, + "grad_norm": 1.5580902099609375, + "learning_rate": 1.1678496400580052e-07, + "loss": 1.0563, + "step": 306360 + }, + { + "epoch": 1.9573106065445995, + "grad_norm": 1.3809198141098022, + "learning_rate": 1.1644246862409502e-07, + "loss": 0.8175, + "step": 306370 + }, + { + "epoch": 1.9573744936943385, + "grad_norm": 0.9385179281234741, + "learning_rate": 1.1610047561296245e-07, + "loss": 0.8123, + "step": 306380 + }, + { + "epoch": 1.957438380844077, + "grad_norm": 1.2661628723144531, + "learning_rate": 1.1575898497586113e-07, + "loss": 0.6835, + "step": 306390 + }, + { + "epoch": 1.9575022679938159, + "grad_norm": 0.7913726568222046, + "learning_rate": 1.1541799671621611e-07, + "loss": 0.7138, + "step": 306400 + }, + { + "epoch": 1.9575661551435544, + "grad_norm": 0.999924898147583, + "learning_rate": 1.1507751083747465e-07, + "loss": 0.7999, + "step": 306410 + }, + { + "epoch": 1.9576300422932933, + "grad_norm": 0.5822678804397583, + "learning_rate": 1.147375273430562e-07, + "loss": 0.8937, + "step": 306420 + }, + { + "epoch": 1.9576939294430318, + "grad_norm": 1.0195214748382568, + "learning_rate": 1.1439804623638584e-07, + "loss": 1.106, + "step": 306430 + }, + { + "epoch": 1.9577578165927705, + "grad_norm": 0.6929598450660706, + "learning_rate": 1.1405906752088302e-07, + "loss": 0.9227, + "step": 306440 + }, + { + "epoch": 1.9578217037425092, + "grad_norm": 0.7076100707054138, + "learning_rate": 1.1372059119996725e-07, + "loss": 0.9683, + "step": 306450 + }, + { + "epoch": 1.957885590892248, + "grad_norm": 0.9137086868286133, + "learning_rate": 1.1338261727704136e-07, + "loss": 1.0094, + "step": 306460 + }, + { + "epoch": 1.9579494780419866, + "grad_norm": 
2.7839596271514893, + "learning_rate": 1.1304514575551372e-07, + "loss": 0.7638, + "step": 306470 + }, + { + "epoch": 1.9580133651917253, + "grad_norm": 0.9103108048439026, + "learning_rate": 1.1270817663877609e-07, + "loss": 1.1677, + "step": 306480 + }, + { + "epoch": 1.958077252341464, + "grad_norm": 2.217245578765869, + "learning_rate": 1.1237170993022572e-07, + "loss": 0.921, + "step": 306490 + }, + { + "epoch": 1.9581411394912027, + "grad_norm": 0.9462472796440125, + "learning_rate": 1.120357456332488e-07, + "loss": 0.7343, + "step": 306500 + }, + { + "epoch": 1.9582050266409414, + "grad_norm": 1.196720004081726, + "learning_rate": 1.1170028375123709e-07, + "loss": 0.9115, + "step": 306510 + }, + { + "epoch": 1.9582689137906801, + "grad_norm": 2.692883253097534, + "learning_rate": 1.1136532428756008e-07, + "loss": 0.7908, + "step": 306520 + }, + { + "epoch": 1.9583328009404188, + "grad_norm": 0.828401505947113, + "learning_rate": 1.1103086724559287e-07, + "loss": 0.6602, + "step": 306530 + }, + { + "epoch": 1.9583966880901575, + "grad_norm": 1.0882549285888672, + "learning_rate": 1.1069691262870496e-07, + "loss": 0.9501, + "step": 306540 + }, + { + "epoch": 1.9584605752398963, + "grad_norm": 0.9018024802207947, + "learning_rate": 1.1036346044026591e-07, + "loss": 1.0834, + "step": 306550 + }, + { + "epoch": 1.958524462389635, + "grad_norm": 0.62995845079422, + "learning_rate": 1.1003051068361747e-07, + "loss": 0.7957, + "step": 306560 + }, + { + "epoch": 1.9585883495393737, + "grad_norm": 0.7326763272285461, + "learning_rate": 1.0969806336212917e-07, + "loss": 0.938, + "step": 306570 + }, + { + "epoch": 1.9586522366891124, + "grad_norm": 1.4249024391174316, + "learning_rate": 1.093661184791428e-07, + "loss": 0.7235, + "step": 306580 + }, + { + "epoch": 1.958716123838851, + "grad_norm": 0.9121171236038208, + "learning_rate": 1.0903467603800011e-07, + "loss": 0.8777, + "step": 306590 + }, + { + "epoch": 1.9587800109885898, + "grad_norm": 0.9764759540557861, + 
"learning_rate": 1.0870373604203732e-07, + "loss": 1.0221, + "step": 306600 + }, + { + "epoch": 1.9588438981383285, + "grad_norm": 0.9533472657203674, + "learning_rate": 1.0837329849459066e-07, + "loss": 0.7547, + "step": 306610 + }, + { + "epoch": 1.9589077852880672, + "grad_norm": 0.8116563558578491, + "learning_rate": 1.080433633989908e-07, + "loss": 0.8485, + "step": 306620 + }, + { + "epoch": 1.958971672437806, + "grad_norm": 1.36518394947052, + "learning_rate": 1.0771393075855729e-07, + "loss": 0.7912, + "step": 306630 + }, + { + "epoch": 1.9590355595875446, + "grad_norm": 0.9521485567092896, + "learning_rate": 1.0738500057660417e-07, + "loss": 0.8798, + "step": 306640 + }, + { + "epoch": 1.9590994467372833, + "grad_norm": 0.9593106508255005, + "learning_rate": 1.0705657285644544e-07, + "loss": 0.8244, + "step": 306650 + }, + { + "epoch": 1.959163333887022, + "grad_norm": 0.7078234553337097, + "learning_rate": 1.0672864760139512e-07, + "loss": 0.7332, + "step": 306660 + }, + { + "epoch": 1.9592272210367607, + "grad_norm": 0.908644437789917, + "learning_rate": 1.0640122481475057e-07, + "loss": 0.5235, + "step": 306670 + }, + { + "epoch": 1.9592911081864992, + "grad_norm": 0.8596978783607483, + "learning_rate": 1.0607430449980915e-07, + "loss": 0.6656, + "step": 306680 + }, + { + "epoch": 1.9593549953362381, + "grad_norm": 0.6808596849441528, + "learning_rate": 1.0574788665986269e-07, + "loss": 0.8061, + "step": 306690 + }, + { + "epoch": 1.9594188824859766, + "grad_norm": 0.6964841485023499, + "learning_rate": 1.0542197129819742e-07, + "loss": 0.6471, + "step": 306700 + }, + { + "epoch": 1.9594827696357155, + "grad_norm": 0.8564151525497437, + "learning_rate": 1.0509655841809962e-07, + "loss": 0.6819, + "step": 306710 + }, + { + "epoch": 1.959546656785454, + "grad_norm": 0.9149518609046936, + "learning_rate": 1.0477164802285e-07, + "loss": 1.091, + "step": 306720 + }, + { + "epoch": 1.959610543935193, + "grad_norm": 0.9932882189750671, + "learning_rate": 
1.0444724011570706e-07, + "loss": 1.0427, + "step": 306730 + }, + { + "epoch": 1.9596744310849314, + "grad_norm": 0.6701475381851196, + "learning_rate": 1.041233346999515e-07, + "loss": 0.8218, + "step": 306740 + }, + { + "epoch": 1.9597383182346704, + "grad_norm": 2.0861892700195312, + "learning_rate": 1.0379993177884184e-07, + "loss": 0.9127, + "step": 306750 + }, + { + "epoch": 1.9598022053844089, + "grad_norm": 1.76549232006073, + "learning_rate": 1.0347703135563103e-07, + "loss": 0.7404, + "step": 306760 + }, + { + "epoch": 1.9598660925341478, + "grad_norm": 0.678974449634552, + "learning_rate": 1.0315463343356646e-07, + "loss": 0.6949, + "step": 306770 + }, + { + "epoch": 1.9599299796838863, + "grad_norm": 0.9061679244041443, + "learning_rate": 1.028327380159122e-07, + "loss": 0.8652, + "step": 306780 + }, + { + "epoch": 1.9599938668336252, + "grad_norm": 1.2013291120529175, + "learning_rate": 1.0251134510589344e-07, + "loss": 0.6827, + "step": 306790 + }, + { + "epoch": 1.9600577539833637, + "grad_norm": 0.6794036030769348, + "learning_rate": 1.0219045470675203e-07, + "loss": 0.8597, + "step": 306800 + }, + { + "epoch": 1.9601216411331026, + "grad_norm": 0.913158118724823, + "learning_rate": 1.0187006682172429e-07, + "loss": 0.7935, + "step": 306810 + }, + { + "epoch": 1.960185528282841, + "grad_norm": 2.0173139572143555, + "learning_rate": 1.0155018145403539e-07, + "loss": 0.8751, + "step": 306820 + }, + { + "epoch": 1.96024941543258, + "grad_norm": 1.0600955486297607, + "learning_rate": 1.0123079860689943e-07, + "loss": 0.899, + "step": 306830 + }, + { + "epoch": 1.9603133025823185, + "grad_norm": 0.9187150597572327, + "learning_rate": 1.0091191828353608e-07, + "loss": 0.5977, + "step": 306840 + }, + { + "epoch": 1.9603771897320574, + "grad_norm": 0.7382291555404663, + "learning_rate": 1.0059354048716496e-07, + "loss": 0.7518, + "step": 306850 + }, + { + "epoch": 1.960441076881796, + "grad_norm": 0.6964963674545288, + "learning_rate": 
1.0027566522097797e-07, + "loss": 0.7174, + "step": 306860 + }, + { + "epoch": 1.9605049640315348, + "grad_norm": 1.0228418111801147, + "learning_rate": 9.995829248818921e-08, + "loss": 0.7848, + "step": 306870 + }, + { + "epoch": 1.9605688511812733, + "grad_norm": 1.48477041721344, + "learning_rate": 9.964142229199058e-08, + "loss": 0.8795, + "step": 306880 + }, + { + "epoch": 1.9606327383310123, + "grad_norm": 1.1190383434295654, + "learning_rate": 9.932505463557396e-08, + "loss": 0.7041, + "step": 306890 + }, + { + "epoch": 1.9606966254807507, + "grad_norm": 0.7254348397254944, + "learning_rate": 9.900918952212013e-08, + "loss": 0.8268, + "step": 306900 + }, + { + "epoch": 1.9607605126304894, + "grad_norm": 1.076117992401123, + "learning_rate": 9.869382695482099e-08, + "loss": 0.6709, + "step": 306910 + }, + { + "epoch": 1.9608243997802282, + "grad_norm": 1.279038906097412, + "learning_rate": 9.837896693684068e-08, + "loss": 0.7932, + "step": 306920 + }, + { + "epoch": 1.9608882869299669, + "grad_norm": 0.5272724032402039, + "learning_rate": 9.806460947135443e-08, + "loss": 0.8304, + "step": 306930 + }, + { + "epoch": 1.9609521740797056, + "grad_norm": 1.385393738746643, + "learning_rate": 9.775075456153194e-08, + "loss": 0.9646, + "step": 306940 + }, + { + "epoch": 1.9610160612294443, + "grad_norm": 1.049930214881897, + "learning_rate": 9.743740221053178e-08, + "loss": 0.8105, + "step": 306950 + }, + { + "epoch": 1.961079948379183, + "grad_norm": 0.965087354183197, + "learning_rate": 9.712455242150143e-08, + "loss": 0.7291, + "step": 306960 + }, + { + "epoch": 1.9611438355289217, + "grad_norm": 1.0274797677993774, + "learning_rate": 9.681220519760503e-08, + "loss": 1.0187, + "step": 306970 + }, + { + "epoch": 1.9612077226786604, + "grad_norm": 1.0843732357025146, + "learning_rate": 9.650036054198452e-08, + "loss": 1.0503, + "step": 306980 + }, + { + "epoch": 1.961271609828399, + "grad_norm": 0.9422346353530884, + "learning_rate": 9.618901845777073e-08, + 
"loss": 0.8191, + "step": 306990 + }, + { + "epoch": 1.9613354969781378, + "grad_norm": 1.1185977458953857, + "learning_rate": 9.587817894811113e-08, + "loss": 0.8307, + "step": 307000 + }, + { + "epoch": 1.9613993841278765, + "grad_norm": 1.0668240785598755, + "learning_rate": 9.556784201613101e-08, + "loss": 1.1259, + "step": 307010 + }, + { + "epoch": 1.9614632712776152, + "grad_norm": 1.0997084379196167, + "learning_rate": 9.525800766495562e-08, + "loss": 0.8358, + "step": 307020 + }, + { + "epoch": 1.961527158427354, + "grad_norm": 1.0606350898742676, + "learning_rate": 9.494867589770473e-08, + "loss": 0.6989, + "step": 307030 + }, + { + "epoch": 1.9615910455770926, + "grad_norm": 0.6498733758926392, + "learning_rate": 9.463984671749804e-08, + "loss": 0.7428, + "step": 307040 + }, + { + "epoch": 1.9616549327268313, + "grad_norm": 0.9367380738258362, + "learning_rate": 9.433152012743863e-08, + "loss": 0.8984, + "step": 307050 + }, + { + "epoch": 1.96171881987657, + "grad_norm": 0.8001465201377869, + "learning_rate": 9.402369613064066e-08, + "loss": 0.828, + "step": 307060 + }, + { + "epoch": 1.9617827070263087, + "grad_norm": 1.2087067365646362, + "learning_rate": 9.371637473019057e-08, + "loss": 0.9922, + "step": 307070 + }, + { + "epoch": 1.9618465941760475, + "grad_norm": 1.5438634157180786, + "learning_rate": 9.340955592919698e-08, + "loss": 0.6449, + "step": 307080 + }, + { + "epoch": 1.9619104813257862, + "grad_norm": 1.0287994146347046, + "learning_rate": 9.310323973074631e-08, + "loss": 0.9819, + "step": 307090 + }, + { + "epoch": 1.9619743684755249, + "grad_norm": 0.9086164236068726, + "learning_rate": 9.279742613791941e-08, + "loss": 0.8067, + "step": 307100 + }, + { + "epoch": 1.9620382556252636, + "grad_norm": 1.034316062927246, + "learning_rate": 9.249211515379719e-08, + "loss": 1.0819, + "step": 307110 + }, + { + "epoch": 1.9621021427750023, + "grad_norm": 0.9390623569488525, + "learning_rate": 9.218730678146048e-08, + "loss": 0.8188, + "step": 
307120 + }, + { + "epoch": 1.962166029924741, + "grad_norm": 2.8493175506591797, + "learning_rate": 9.188300102396797e-08, + "loss": 0.9277, + "step": 307130 + }, + { + "epoch": 1.9622299170744797, + "grad_norm": 1.424728512763977, + "learning_rate": 9.15791978843894e-08, + "loss": 1.1248, + "step": 307140 + }, + { + "epoch": 1.9622938042242182, + "grad_norm": 0.9395694136619568, + "learning_rate": 9.127589736578901e-08, + "loss": 1.4445, + "step": 307150 + }, + { + "epoch": 1.962357691373957, + "grad_norm": 1.2583932876586914, + "learning_rate": 9.09730994712199e-08, + "loss": 0.8988, + "step": 307160 + }, + { + "epoch": 1.9624215785236956, + "grad_norm": 1.0137208700180054, + "learning_rate": 9.067080420372409e-08, + "loss": 0.8206, + "step": 307170 + }, + { + "epoch": 1.9624854656734345, + "grad_norm": 1.4127328395843506, + "learning_rate": 9.036901156635469e-08, + "loss": 0.7615, + "step": 307180 + }, + { + "epoch": 1.962549352823173, + "grad_norm": 1.0815893411636353, + "learning_rate": 9.00677215621426e-08, + "loss": 0.7994, + "step": 307190 + }, + { + "epoch": 1.962613239972912, + "grad_norm": 0.8997918367385864, + "learning_rate": 8.976693419412985e-08, + "loss": 0.8337, + "step": 307200 + }, + { + "epoch": 1.9626771271226504, + "grad_norm": 2.5383217334747314, + "learning_rate": 8.946664946534178e-08, + "loss": 0.9905, + "step": 307210 + }, + { + "epoch": 1.9627410142723893, + "grad_norm": 1.107540249824524, + "learning_rate": 8.916686737880375e-08, + "loss": 0.9141, + "step": 307220 + }, + { + "epoch": 1.9628049014221278, + "grad_norm": 1.3632421493530273, + "learning_rate": 8.886758793753558e-08, + "loss": 0.9035, + "step": 307230 + }, + { + "epoch": 1.9628687885718668, + "grad_norm": 0.8134247064590454, + "learning_rate": 8.856881114455151e-08, + "loss": 0.8471, + "step": 307240 + }, + { + "epoch": 1.9629326757216052, + "grad_norm": 2.934614896774292, + "learning_rate": 8.82705370028547e-08, + "loss": 0.7951, + "step": 307250 + }, + { + "epoch": 
1.9629965628713442, + "grad_norm": 0.9333258271217346, + "learning_rate": 8.797276551545386e-08, + "loss": 0.7279, + "step": 307260 + }, + { + "epoch": 1.9630604500210826, + "grad_norm": 0.9117864370346069, + "learning_rate": 8.767549668535213e-08, + "loss": 0.9715, + "step": 307270 + }, + { + "epoch": 1.9631243371708216, + "grad_norm": 1.166901707649231, + "learning_rate": 8.737873051553047e-08, + "loss": 1.1847, + "step": 307280 + }, + { + "epoch": 1.96318822432056, + "grad_norm": 0.8172804117202759, + "learning_rate": 8.708246700899203e-08, + "loss": 0.9207, + "step": 307290 + }, + { + "epoch": 1.963252111470299, + "grad_norm": 0.5826395153999329, + "learning_rate": 8.678670616871221e-08, + "loss": 0.6237, + "step": 307300 + }, + { + "epoch": 1.9633159986200375, + "grad_norm": 1.1163614988327026, + "learning_rate": 8.649144799767194e-08, + "loss": 0.996, + "step": 307310 + }, + { + "epoch": 1.9633798857697764, + "grad_norm": 1.161007285118103, + "learning_rate": 8.61966924988411e-08, + "loss": 0.894, + "step": 307320 + }, + { + "epoch": 1.9634437729195149, + "grad_norm": 1.0262027978897095, + "learning_rate": 8.590243967519507e-08, + "loss": 1.1078, + "step": 307330 + }, + { + "epoch": 1.9635076600692538, + "grad_norm": 0.8189107775688171, + "learning_rate": 8.560868952969259e-08, + "loss": 1.0965, + "step": 307340 + }, + { + "epoch": 1.9635715472189923, + "grad_norm": 1.2024630308151245, + "learning_rate": 8.53154420652924e-08, + "loss": 1.1089, + "step": 307350 + }, + { + "epoch": 1.9636354343687312, + "grad_norm": 1.5332752466201782, + "learning_rate": 8.502269728494771e-08, + "loss": 0.7511, + "step": 307360 + }, + { + "epoch": 1.9636993215184697, + "grad_norm": 1.2168183326721191, + "learning_rate": 8.473045519160616e-08, + "loss": 0.9701, + "step": 307370 + }, + { + "epoch": 1.9637632086682086, + "grad_norm": 0.8302614092826843, + "learning_rate": 8.443871578820984e-08, + "loss": 0.675, + "step": 307380 + }, + { + "epoch": 1.9638270958179471, + 
"grad_norm": 0.7621539235115051, + "learning_rate": 8.414747907770082e-08, + "loss": 0.9875, + "step": 307390 + }, + { + "epoch": 1.9638909829676858, + "grad_norm": 0.7296609282493591, + "learning_rate": 8.385674506301566e-08, + "loss": 0.9113, + "step": 307400 + }, + { + "epoch": 1.9639548701174245, + "grad_norm": 1.1947790384292603, + "learning_rate": 8.35665137470687e-08, + "loss": 0.8201, + "step": 307410 + }, + { + "epoch": 1.9640187572671632, + "grad_norm": 1.920008659362793, + "learning_rate": 8.327678513279646e-08, + "loss": 0.7582, + "step": 307420 + }, + { + "epoch": 1.964082644416902, + "grad_norm": 0.603803277015686, + "learning_rate": 8.298755922310774e-08, + "loss": 0.8512, + "step": 307430 + }, + { + "epoch": 1.9641465315666407, + "grad_norm": 0.8965196013450623, + "learning_rate": 8.269883602091688e-08, + "loss": 0.9519, + "step": 307440 + }, + { + "epoch": 1.9642104187163794, + "grad_norm": 0.8932124376296997, + "learning_rate": 8.24106155291382e-08, + "loss": 0.6974, + "step": 307450 + }, + { + "epoch": 1.964274305866118, + "grad_norm": 0.9798451662063599, + "learning_rate": 8.212289775066384e-08, + "loss": 0.8882, + "step": 307460 + }, + { + "epoch": 1.9643381930158568, + "grad_norm": 0.876908540725708, + "learning_rate": 8.183568268840258e-08, + "loss": 0.8991, + "step": 307470 + }, + { + "epoch": 1.9644020801655955, + "grad_norm": 0.7665677070617676, + "learning_rate": 8.1548970345241e-08, + "loss": 0.9696, + "step": 307480 + }, + { + "epoch": 1.9644659673153342, + "grad_norm": 0.7917914986610413, + "learning_rate": 8.126276072406014e-08, + "loss": 0.7703, + "step": 307490 + }, + { + "epoch": 1.9645298544650729, + "grad_norm": 0.9933443665504456, + "learning_rate": 8.097705382775767e-08, + "loss": 0.8286, + "step": 307500 + }, + { + "epoch": 1.9645937416148116, + "grad_norm": 1.0009136199951172, + "learning_rate": 8.069184965919797e-08, + "loss": 0.7784, + "step": 307510 + }, + { + "epoch": 1.9646576287645503, + "grad_norm": 1.0411992073059082, 
+ "learning_rate": 8.04071482212565e-08, + "loss": 0.9634, + "step": 307520 + }, + { + "epoch": 1.964721515914289, + "grad_norm": 0.5806550979614258, + "learning_rate": 8.012294951680321e-08, + "loss": 0.991, + "step": 307530 + }, + { + "epoch": 1.9647854030640277, + "grad_norm": 0.9990647435188293, + "learning_rate": 7.983925354869693e-08, + "loss": 0.705, + "step": 307540 + }, + { + "epoch": 1.9648492902137664, + "grad_norm": 1.2305150032043457, + "learning_rate": 7.955606031980201e-08, + "loss": 0.9844, + "step": 307550 + }, + { + "epoch": 1.9649131773635051, + "grad_norm": 1.0738801956176758, + "learning_rate": 7.927336983296063e-08, + "loss": 0.8836, + "step": 307560 + }, + { + "epoch": 1.9649770645132438, + "grad_norm": 1.563961148262024, + "learning_rate": 7.899118209102607e-08, + "loss": 0.9084, + "step": 307570 + }, + { + "epoch": 1.9650409516629825, + "grad_norm": 0.9830324053764343, + "learning_rate": 7.870949709683495e-08, + "loss": 0.9476, + "step": 307580 + }, + { + "epoch": 1.9651048388127212, + "grad_norm": 0.7370650172233582, + "learning_rate": 7.842831485322944e-08, + "loss": 1.0117, + "step": 307590 + }, + { + "epoch": 1.96516872596246, + "grad_norm": 1.03929603099823, + "learning_rate": 7.814763536303504e-08, + "loss": 0.8531, + "step": 307600 + }, + { + "epoch": 1.9652326131121987, + "grad_norm": 1.147141695022583, + "learning_rate": 7.786745862908839e-08, + "loss": 0.884, + "step": 307610 + }, + { + "epoch": 1.9652965002619374, + "grad_norm": 1.5236049890518188, + "learning_rate": 7.758778465420392e-08, + "loss": 0.8023, + "step": 307620 + }, + { + "epoch": 1.965360387411676, + "grad_norm": 0.7750815153121948, + "learning_rate": 7.7308613441196e-08, + "loss": 0.7576, + "step": 307630 + }, + { + "epoch": 1.9654242745614146, + "grad_norm": 0.9773805737495422, + "learning_rate": 7.702994499288463e-08, + "loss": 0.8848, + "step": 307640 + }, + { + "epoch": 1.9654881617111535, + "grad_norm": 1.3267862796783447, + "learning_rate": 
7.675177931206756e-08, + "loss": 0.955, + "step": 307650 + }, + { + "epoch": 1.965552048860892, + "grad_norm": 1.22646164894104, + "learning_rate": 7.647411640155366e-08, + "loss": 0.673, + "step": 307660 + }, + { + "epoch": 1.965615936010631, + "grad_norm": 1.5440233945846558, + "learning_rate": 7.61969562641296e-08, + "loss": 0.8305, + "step": 307670 + }, + { + "epoch": 1.9656798231603694, + "grad_norm": 0.8694813847541809, + "learning_rate": 7.592029890259867e-08, + "loss": 0.7927, + "step": 307680 + }, + { + "epoch": 1.9657437103101083, + "grad_norm": 0.867393970489502, + "learning_rate": 7.564414431973643e-08, + "loss": 0.7397, + "step": 307690 + }, + { + "epoch": 1.9658075974598468, + "grad_norm": 0.6875196099281311, + "learning_rate": 7.536849251832956e-08, + "loss": 0.7244, + "step": 307700 + }, + { + "epoch": 1.9658714846095857, + "grad_norm": 1.2303906679153442, + "learning_rate": 7.50933435011536e-08, + "loss": 0.8776, + "step": 307710 + }, + { + "epoch": 1.9659353717593242, + "grad_norm": 2.2225420475006104, + "learning_rate": 7.481869727097856e-08, + "loss": 0.7413, + "step": 307720 + }, + { + "epoch": 1.9659992589090631, + "grad_norm": 0.717719554901123, + "learning_rate": 7.45445538305689e-08, + "loss": 0.7105, + "step": 307730 + }, + { + "epoch": 1.9660631460588016, + "grad_norm": 2.5248043537139893, + "learning_rate": 7.427091318268908e-08, + "loss": 1.0971, + "step": 307740 + }, + { + "epoch": 1.9661270332085405, + "grad_norm": 0.8963668346405029, + "learning_rate": 7.399777533009245e-08, + "loss": 0.8243, + "step": 307750 + }, + { + "epoch": 1.966190920358279, + "grad_norm": 0.9372978806495667, + "learning_rate": 7.372514027553235e-08, + "loss": 0.7271, + "step": 307760 + }, + { + "epoch": 1.966254807508018, + "grad_norm": 1.5391466617584229, + "learning_rate": 7.345300802175103e-08, + "loss": 0.7229, + "step": 307770 + }, + { + "epoch": 1.9663186946577564, + "grad_norm": 0.8805508017539978, + "learning_rate": 7.318137857149077e-08, + "loss": 
0.9438, + "step": 307780 + }, + { + "epoch": 1.9663825818074954, + "grad_norm": 1.4048082828521729, + "learning_rate": 7.291025192748269e-08, + "loss": 0.7908, + "step": 307790 + }, + { + "epoch": 1.9664464689572339, + "grad_norm": 1.3518507480621338, + "learning_rate": 7.263962809246905e-08, + "loss": 0.8418, + "step": 307800 + }, + { + "epoch": 1.9665103561069728, + "grad_norm": 2.4091033935546875, + "learning_rate": 7.236950706915879e-08, + "loss": 0.8969, + "step": 307810 + }, + { + "epoch": 1.9665742432567113, + "grad_norm": 0.7952820658683777, + "learning_rate": 7.209988886028862e-08, + "loss": 0.8785, + "step": 307820 + }, + { + "epoch": 1.9666381304064502, + "grad_norm": 0.6336418986320496, + "learning_rate": 7.183077346856192e-08, + "loss": 0.9454, + "step": 307830 + }, + { + "epoch": 1.9667020175561887, + "grad_norm": 1.4031201601028442, + "learning_rate": 7.156216089669876e-08, + "loss": 0.8487, + "step": 307840 + }, + { + "epoch": 1.9667659047059276, + "grad_norm": 0.887291669845581, + "learning_rate": 7.129405114739141e-08, + "loss": 0.9261, + "step": 307850 + }, + { + "epoch": 1.966829791855666, + "grad_norm": 0.8543388247489929, + "learning_rate": 7.102644422335436e-08, + "loss": 0.7339, + "step": 307860 + }, + { + "epoch": 1.966893679005405, + "grad_norm": 0.9197491407394409, + "learning_rate": 7.075934012726881e-08, + "loss": 0.8638, + "step": 307870 + }, + { + "epoch": 1.9669575661551435, + "grad_norm": 0.79677814245224, + "learning_rate": 7.049273886183815e-08, + "loss": 0.9238, + "step": 307880 + }, + { + "epoch": 1.9670214533048822, + "grad_norm": 0.9617187976837158, + "learning_rate": 7.022664042973804e-08, + "loss": 0.9432, + "step": 307890 + }, + { + "epoch": 1.967085340454621, + "grad_norm": 1.7885737419128418, + "learning_rate": 6.996104483364963e-08, + "loss": 0.6147, + "step": 307900 + }, + { + "epoch": 1.9671492276043596, + "grad_norm": 0.5837380886077881, + "learning_rate": 6.969595207625412e-08, + "loss": 0.699, + "step": 307910 + }, 
+ { + "epoch": 1.9672131147540983, + "grad_norm": 0.8746609687805176, + "learning_rate": 6.943136216021051e-08, + "loss": 0.8678, + "step": 307920 + }, + { + "epoch": 1.967277001903837, + "grad_norm": 0.8975337147712708, + "learning_rate": 6.916727508819443e-08, + "loss": 0.951, + "step": 307930 + }, + { + "epoch": 1.9673408890535757, + "grad_norm": 1.542599081993103, + "learning_rate": 6.890369086285376e-08, + "loss": 0.9868, + "step": 307940 + }, + { + "epoch": 1.9674047762033144, + "grad_norm": 0.840015172958374, + "learning_rate": 6.864060948685857e-08, + "loss": 0.8198, + "step": 307950 + }, + { + "epoch": 1.9674686633530531, + "grad_norm": 1.1040661334991455, + "learning_rate": 6.837803096284012e-08, + "loss": 0.9692, + "step": 307960 + }, + { + "epoch": 1.9675325505027919, + "grad_norm": 0.7686569690704346, + "learning_rate": 6.811595529345738e-08, + "loss": 0.8241, + "step": 307970 + }, + { + "epoch": 1.9675964376525306, + "grad_norm": 1.4878031015396118, + "learning_rate": 6.785438248134158e-08, + "loss": 0.6764, + "step": 307980 + }, + { + "epoch": 1.9676603248022693, + "grad_norm": 1.2950985431671143, + "learning_rate": 6.759331252912949e-08, + "loss": 1.2905, + "step": 307990 + }, + { + "epoch": 1.967724211952008, + "grad_norm": 1.3230798244476318, + "learning_rate": 6.733274543945234e-08, + "loss": 0.8295, + "step": 308000 + }, + { + "epoch": 1.9677880991017467, + "grad_norm": 1.8880711793899536, + "learning_rate": 6.707268121493027e-08, + "loss": 0.9665, + "step": 308010 + }, + { + "epoch": 1.9678519862514854, + "grad_norm": 0.7278730869293213, + "learning_rate": 6.68131198581834e-08, + "loss": 1.1621, + "step": 308020 + }, + { + "epoch": 1.967915873401224, + "grad_norm": 0.9226792454719543, + "learning_rate": 6.655406137183184e-08, + "loss": 0.7692, + "step": 308030 + }, + { + "epoch": 1.9679797605509628, + "grad_norm": 0.8120977878570557, + "learning_rate": 6.629550575847354e-08, + "loss": 0.8968, + "step": 308040 + }, + { + "epoch": 
1.9680436477007015, + "grad_norm": 0.8604071736335754, + "learning_rate": 6.603745302072306e-08, + "loss": 0.7378, + "step": 308050 + }, + { + "epoch": 1.9681075348504402, + "grad_norm": 0.7497774958610535, + "learning_rate": 6.577990316117277e-08, + "loss": 0.7505, + "step": 308060 + }, + { + "epoch": 1.968171422000179, + "grad_norm": 0.9616866111755371, + "learning_rate": 6.552285618241505e-08, + "loss": 0.7913, + "step": 308070 + }, + { + "epoch": 1.9682353091499176, + "grad_norm": 0.7489196062088013, + "learning_rate": 6.52663120870478e-08, + "loss": 1.0104, + "step": 308080 + }, + { + "epoch": 1.9682991962996563, + "grad_norm": 0.9955611824989319, + "learning_rate": 6.501027087764122e-08, + "loss": 0.7075, + "step": 308090 + }, + { + "epoch": 1.968363083449395, + "grad_norm": 0.8881136775016785, + "learning_rate": 6.475473255678765e-08, + "loss": 0.8903, + "step": 308100 + }, + { + "epoch": 1.9684269705991337, + "grad_norm": 0.9819602966308594, + "learning_rate": 6.449969712705173e-08, + "loss": 0.956, + "step": 308110 + }, + { + "epoch": 1.9684908577488724, + "grad_norm": 0.8601024150848389, + "learning_rate": 6.424516459100361e-08, + "loss": 0.8281, + "step": 308120 + }, + { + "epoch": 1.968554744898611, + "grad_norm": 0.9564414620399475, + "learning_rate": 6.39911349512079e-08, + "loss": 1.4346, + "step": 308130 + }, + { + "epoch": 1.9686186320483499, + "grad_norm": 0.954218327999115, + "learning_rate": 6.373760821022367e-08, + "loss": 0.8884, + "step": 308140 + }, + { + "epoch": 1.9686825191980883, + "grad_norm": 0.711357057094574, + "learning_rate": 6.34845843705989e-08, + "loss": 0.7504, + "step": 308150 + }, + { + "epoch": 1.9687464063478273, + "grad_norm": 0.8907003402709961, + "learning_rate": 6.323206343488708e-08, + "loss": 0.9079, + "step": 308160 + }, + { + "epoch": 1.9688102934975658, + "grad_norm": 0.7382941842079163, + "learning_rate": 6.298004540563062e-08, + "loss": 0.8033, + "step": 308170 + }, + { + "epoch": 1.9688741806473047, + 
"grad_norm": 1.1640852689743042, + "learning_rate": 6.27285302853664e-08, + "loss": 0.6376, + "step": 308180 + }, + { + "epoch": 1.9689380677970432, + "grad_norm": 2.0369021892547607, + "learning_rate": 6.247751807663127e-08, + "loss": 0.8086, + "step": 308190 + }, + { + "epoch": 1.969001954946782, + "grad_norm": 0.8535321950912476, + "learning_rate": 6.222700878194543e-08, + "loss": 0.863, + "step": 308200 + }, + { + "epoch": 1.9690658420965206, + "grad_norm": 1.113789677619934, + "learning_rate": 6.197700240383464e-08, + "loss": 0.7646, + "step": 308210 + }, + { + "epoch": 1.9691297292462595, + "grad_norm": 1.0696840286254883, + "learning_rate": 6.17274989448191e-08, + "loss": 0.7492, + "step": 308220 + }, + { + "epoch": 1.969193616395998, + "grad_norm": 0.6535469889640808, + "learning_rate": 6.147849840741349e-08, + "loss": 0.8728, + "step": 308230 + }, + { + "epoch": 1.969257503545737, + "grad_norm": 0.549797534942627, + "learning_rate": 6.123000079412134e-08, + "loss": 0.788, + "step": 308240 + }, + { + "epoch": 1.9693213906954754, + "grad_norm": 1.2363709211349487, + "learning_rate": 6.09820061074462e-08, + "loss": 0.9564, + "step": 308250 + }, + { + "epoch": 1.9693852778452143, + "grad_norm": 0.9653347730636597, + "learning_rate": 6.073451434988053e-08, + "loss": 0.8188, + "step": 308260 + }, + { + "epoch": 1.9694491649949528, + "grad_norm": 1.123185396194458, + "learning_rate": 6.048752552392789e-08, + "loss": 0.942, + "step": 308270 + }, + { + "epoch": 1.9695130521446917, + "grad_norm": 0.9950637221336365, + "learning_rate": 6.024103963206962e-08, + "loss": 0.9046, + "step": 308280 + }, + { + "epoch": 1.9695769392944302, + "grad_norm": 2.174548625946045, + "learning_rate": 5.999505667678706e-08, + "loss": 0.6862, + "step": 308290 + }, + { + "epoch": 1.9696408264441692, + "grad_norm": 0.6808480620384216, + "learning_rate": 5.974957666055602e-08, + "loss": 0.7023, + "step": 308300 + }, + { + "epoch": 1.9697047135939076, + "grad_norm": 1.1171269416809082, + 
"learning_rate": 5.950459958585231e-08, + "loss": 0.7116, + "step": 308310 + }, + { + "epoch": 1.9697686007436466, + "grad_norm": 1.3545777797698975, + "learning_rate": 5.9260125455140594e-08, + "loss": 0.8542, + "step": 308320 + }, + { + "epoch": 1.969832487893385, + "grad_norm": 0.9661934971809387, + "learning_rate": 5.9016154270891135e-08, + "loss": 0.9956, + "step": 308330 + }, + { + "epoch": 1.969896375043124, + "grad_norm": 1.025800108909607, + "learning_rate": 5.877268603554642e-08, + "loss": 0.9316, + "step": 308340 + }, + { + "epoch": 1.9699602621928625, + "grad_norm": 0.5792987942695618, + "learning_rate": 5.852972075157115e-08, + "loss": 0.7468, + "step": 308350 + }, + { + "epoch": 1.9700241493426014, + "grad_norm": 1.380750060081482, + "learning_rate": 5.828725842140226e-08, + "loss": 0.8148, + "step": 308360 + }, + { + "epoch": 1.9700880364923399, + "grad_norm": 0.6792870759963989, + "learning_rate": 5.804529904749334e-08, + "loss": 0.7083, + "step": 308370 + }, + { + "epoch": 1.9701519236420786, + "grad_norm": 1.1884015798568726, + "learning_rate": 5.7803842632270234e-08, + "loss": 0.7961, + "step": 308380 + }, + { + "epoch": 1.9702158107918173, + "grad_norm": 0.7494359016418457, + "learning_rate": 5.7562889178164326e-08, + "loss": 0.8931, + "step": 308390 + }, + { + "epoch": 1.970279697941556, + "grad_norm": 0.8433160781860352, + "learning_rate": 5.7322438687612555e-08, + "loss": 0.814, + "step": 308400 + }, + { + "epoch": 1.9703435850912947, + "grad_norm": 0.8369490504264832, + "learning_rate": 5.708249116302966e-08, + "loss": 0.9008, + "step": 308410 + }, + { + "epoch": 1.9704074722410334, + "grad_norm": 0.7244152426719666, + "learning_rate": 5.6843046606830374e-08, + "loss": 0.8711, + "step": 308420 + }, + { + "epoch": 1.9704713593907721, + "grad_norm": 1.4331932067871094, + "learning_rate": 5.660410502142943e-08, + "loss": 0.8069, + "step": 308430 + }, + { + "epoch": 1.9705352465405108, + "grad_norm": 1.290757656097412, + "learning_rate": 
5.636566640923602e-08, + "loss": 0.8775, + "step": 308440 + }, + { + "epoch": 1.9705991336902495, + "grad_norm": 1.4813116788864136, + "learning_rate": 5.612773077264266e-08, + "loss": 0.8085, + "step": 308450 + }, + { + "epoch": 1.9706630208399882, + "grad_norm": 0.7159252762794495, + "learning_rate": 5.5890298114052995e-08, + "loss": 0.6825, + "step": 308460 + }, + { + "epoch": 1.970726907989727, + "grad_norm": 0.8382778763771057, + "learning_rate": 5.5653368435854005e-08, + "loss": 1.1421, + "step": 308470 + }, + { + "epoch": 1.9707907951394656, + "grad_norm": 3.4221079349517822, + "learning_rate": 5.5416941740432657e-08, + "loss": 0.7611, + "step": 308480 + }, + { + "epoch": 1.9708546822892044, + "grad_norm": 1.379305124282837, + "learning_rate": 5.518101803017595e-08, + "loss": 0.8903, + "step": 308490 + }, + { + "epoch": 1.970918569438943, + "grad_norm": 1.2596087455749512, + "learning_rate": 5.4945597307448636e-08, + "loss": 1.3971, + "step": 308500 + }, + { + "epoch": 1.9709824565886818, + "grad_norm": 1.0103543996810913, + "learning_rate": 5.471067957463216e-08, + "loss": 0.7434, + "step": 308510 + }, + { + "epoch": 1.9710463437384205, + "grad_norm": 1.200963020324707, + "learning_rate": 5.4476264834085747e-08, + "loss": 0.7768, + "step": 308520 + }, + { + "epoch": 1.9711102308881592, + "grad_norm": 1.0093293190002441, + "learning_rate": 5.424235308817416e-08, + "loss": 0.8774, + "step": 308530 + }, + { + "epoch": 1.9711741180378979, + "grad_norm": 1.93284273147583, + "learning_rate": 5.4008944339251075e-08, + "loss": 0.8628, + "step": 308540 + }, + { + "epoch": 1.9712380051876366, + "grad_norm": 2.522632360458374, + "learning_rate": 5.3776038589670175e-08, + "loss": 0.9007, + "step": 308550 + }, + { + "epoch": 1.9713018923373753, + "grad_norm": 0.8160067200660706, + "learning_rate": 5.354363584176847e-08, + "loss": 0.9622, + "step": 308560 + }, + { + "epoch": 1.971365779487114, + "grad_norm": 1.1085783243179321, + "learning_rate": 5.331173609789408e-08, + 
"loss": 0.6308, + "step": 308570 + }, + { + "epoch": 1.9714296666368527, + "grad_norm": 0.7385463118553162, + "learning_rate": 5.308033936038403e-08, + "loss": 0.9497, + "step": 308580 + }, + { + "epoch": 1.9714935537865914, + "grad_norm": 1.51162850856781, + "learning_rate": 5.284944563155869e-08, + "loss": 1.1597, + "step": 308590 + }, + { + "epoch": 1.9715574409363301, + "grad_norm": 1.4442684650421143, + "learning_rate": 5.261905491375507e-08, + "loss": 0.8328, + "step": 308600 + }, + { + "epoch": 1.9716213280860688, + "grad_norm": 0.8158069849014282, + "learning_rate": 5.2389167209287995e-08, + "loss": 0.9388, + "step": 308610 + }, + { + "epoch": 1.9716852152358073, + "grad_norm": 1.0222749710083008, + "learning_rate": 5.215978252047227e-08, + "loss": 0.9329, + "step": 308620 + }, + { + "epoch": 1.9717491023855462, + "grad_norm": 1.143006443977356, + "learning_rate": 5.193090084961716e-08, + "loss": 1.398, + "step": 308630 + }, + { + "epoch": 1.9718129895352847, + "grad_norm": 0.857649028301239, + "learning_rate": 5.170252219902638e-08, + "loss": 0.6419, + "step": 308640 + }, + { + "epoch": 1.9718768766850236, + "grad_norm": 1.4051035642623901, + "learning_rate": 5.14746465710092e-08, + "loss": 0.9146, + "step": 308650 + }, + { + "epoch": 1.9719407638347621, + "grad_norm": 1.1671336889266968, + "learning_rate": 5.124727396784712e-08, + "loss": 1.0312, + "step": 308660 + }, + { + "epoch": 1.972004650984501, + "grad_norm": 0.5252185463905334, + "learning_rate": 5.102040439184386e-08, + "loss": 0.6761, + "step": 308670 + }, + { + "epoch": 1.9720685381342395, + "grad_norm": 1.0200812816619873, + "learning_rate": 5.079403784527537e-08, + "loss": 1.1923, + "step": 308680 + }, + { + "epoch": 1.9721324252839785, + "grad_norm": 1.1095391511917114, + "learning_rate": 5.056817433041761e-08, + "loss": 0.7504, + "step": 308690 + }, + { + "epoch": 1.972196312433717, + "grad_norm": 1.6178041696548462, + "learning_rate": 5.0342813849557634e-08, + "loss": 0.7886, + "step": 
308700 + }, + { + "epoch": 1.9722601995834559, + "grad_norm": 1.4610570669174194, + "learning_rate": 5.011795640495476e-08, + "loss": 0.6762, + "step": 308710 + }, + { + "epoch": 1.9723240867331944, + "grad_norm": 1.3095816373825073, + "learning_rate": 4.989360199887383e-08, + "loss": 1.0937, + "step": 308720 + }, + { + "epoch": 1.9723879738829333, + "grad_norm": 0.9102385640144348, + "learning_rate": 4.966975063358526e-08, + "loss": 0.7277, + "step": 308730 + }, + { + "epoch": 1.9724518610326718, + "grad_norm": 0.9060350656509399, + "learning_rate": 4.944640231132613e-08, + "loss": 0.8636, + "step": 308740 + }, + { + "epoch": 1.9725157481824107, + "grad_norm": 0.7537937164306641, + "learning_rate": 4.922355703436132e-08, + "loss": 0.6715, + "step": 308750 + }, + { + "epoch": 1.9725796353321492, + "grad_norm": 1.6920832395553589, + "learning_rate": 4.900121480492792e-08, + "loss": 0.8922, + "step": 308760 + }, + { + "epoch": 1.9726435224818881, + "grad_norm": 0.9837844967842102, + "learning_rate": 4.8779375625263026e-08, + "loss": 0.7857, + "step": 308770 + }, + { + "epoch": 1.9727074096316266, + "grad_norm": 0.9435822367668152, + "learning_rate": 4.8558039497609284e-08, + "loss": 1.0804, + "step": 308780 + }, + { + "epoch": 1.9727712967813655, + "grad_norm": 1.6462347507476807, + "learning_rate": 4.8337206424187156e-08, + "loss": 0.7593, + "step": 308790 + }, + { + "epoch": 1.972835183931104, + "grad_norm": 1.322011947631836, + "learning_rate": 4.8116876407222624e-08, + "loss": 0.9249, + "step": 308800 + }, + { + "epoch": 1.972899071080843, + "grad_norm": 0.7767255902290344, + "learning_rate": 4.789704944893614e-08, + "loss": 0.9743, + "step": 308810 + }, + { + "epoch": 1.9729629582305814, + "grad_norm": 1.2136768102645874, + "learning_rate": 4.76777255515426e-08, + "loss": 0.8889, + "step": 308820 + }, + { + "epoch": 1.9730268453803204, + "grad_norm": 1.8056683540344238, + "learning_rate": 4.745890471725134e-08, + "loss": 0.7624, + "step": 308830 + }, + { + 
"epoch": 1.9730907325300588, + "grad_norm": 0.8533572554588318, + "learning_rate": 4.724058694826061e-08, + "loss": 0.7261, + "step": 308840 + }, + { + "epoch": 1.9731546196797976, + "grad_norm": 1.088926076889038, + "learning_rate": 4.7022772246774205e-08, + "loss": 0.9164, + "step": 308850 + }, + { + "epoch": 1.9732185068295363, + "grad_norm": 1.2552237510681152, + "learning_rate": 4.680546061497926e-08, + "loss": 0.7689, + "step": 308860 + }, + { + "epoch": 1.973282393979275, + "grad_norm": 3.773592948913574, + "learning_rate": 4.658865205507401e-08, + "loss": 1.0516, + "step": 308870 + }, + { + "epoch": 1.9733462811290137, + "grad_norm": 1.4239734411239624, + "learning_rate": 4.63723465692345e-08, + "loss": 0.8565, + "step": 308880 + }, + { + "epoch": 1.9734101682787524, + "grad_norm": 1.5526864528656006, + "learning_rate": 4.6156544159642326e-08, + "loss": 0.8802, + "step": 308890 + }, + { + "epoch": 1.973474055428491, + "grad_norm": 2.3669862747192383, + "learning_rate": 4.594124482847351e-08, + "loss": 0.7934, + "step": 308900 + }, + { + "epoch": 1.9735379425782298, + "grad_norm": 0.9015244841575623, + "learning_rate": 4.572644857788744e-08, + "loss": 0.5639, + "step": 308910 + }, + { + "epoch": 1.9736018297279685, + "grad_norm": 1.3841748237609863, + "learning_rate": 4.55121554100546e-08, + "loss": 0.7559, + "step": 308920 + }, + { + "epoch": 1.9736657168777072, + "grad_norm": 1.267322063446045, + "learning_rate": 4.5298365327128836e-08, + "loss": 1.0862, + "step": 308930 + }, + { + "epoch": 1.973729604027446, + "grad_norm": 0.8511454463005066, + "learning_rate": 4.508507833126951e-08, + "loss": 0.6968, + "step": 308940 + }, + { + "epoch": 1.9737934911771846, + "grad_norm": 1.142594814300537, + "learning_rate": 4.487229442461938e-08, + "loss": 0.9511, + "step": 308950 + }, + { + "epoch": 1.9738573783269233, + "grad_norm": 1.2920407056808472, + "learning_rate": 4.466001360932115e-08, + "loss": 0.76, + "step": 308960 + }, + { + "epoch": 1.973921265476662, + 
"grad_norm": 0.6771072149276733, + "learning_rate": 4.444823588751756e-08, + "loss": 0.8519, + "step": 308970 + }, + { + "epoch": 1.9739851526264007, + "grad_norm": 1.40349280834198, + "learning_rate": 4.4236961261334697e-08, + "loss": 0.893, + "step": 308980 + }, + { + "epoch": 1.9740490397761394, + "grad_norm": 0.9430562257766724, + "learning_rate": 4.4026189732904175e-08, + "loss": 0.7753, + "step": 308990 + }, + { + "epoch": 1.9741129269258781, + "grad_norm": 0.8726404309272766, + "learning_rate": 4.381592130434653e-08, + "loss": 1.1479, + "step": 309000 + }, + { + "epoch": 1.9741768140756168, + "grad_norm": 0.7614126205444336, + "learning_rate": 4.360615597778228e-08, + "loss": 0.8382, + "step": 309010 + }, + { + "epoch": 1.9742407012253556, + "grad_norm": 1.5872389078140259, + "learning_rate": 4.3396893755320856e-08, + "loss": 0.8007, + "step": 309020 + }, + { + "epoch": 1.9743045883750943, + "grad_norm": 1.073142170906067, + "learning_rate": 4.3188134639071674e-08, + "loss": 0.9587, + "step": 309030 + }, + { + "epoch": 1.974368475524833, + "grad_norm": 0.9952945113182068, + "learning_rate": 4.2979878631138615e-08, + "loss": 0.9594, + "step": 309040 + }, + { + "epoch": 1.9744323626745717, + "grad_norm": 1.0852810144424438, + "learning_rate": 4.277212573361999e-08, + "loss": 0.9547, + "step": 309050 + }, + { + "epoch": 1.9744962498243104, + "grad_norm": 1.8056341409683228, + "learning_rate": 4.256487594859748e-08, + "loss": 0.8571, + "step": 309060 + }, + { + "epoch": 1.974560136974049, + "grad_norm": 0.8054976463317871, + "learning_rate": 4.235812927817495e-08, + "loss": 0.8943, + "step": 309070 + }, + { + "epoch": 1.9746240241237878, + "grad_norm": 0.6693013906478882, + "learning_rate": 4.2151885724417416e-08, + "loss": 0.7472, + "step": 309080 + }, + { + "epoch": 1.9746879112735265, + "grad_norm": 1.0728830099105835, + "learning_rate": 4.19461452894121e-08, + "loss": 1.0022, + "step": 309090 + }, + { + "epoch": 1.9747517984232652, + "grad_norm": 
1.1727595329284668, + "learning_rate": 4.174090797523511e-08, + "loss": 1.0071, + "step": 309100 + }, + { + "epoch": 1.9748156855730037, + "grad_norm": 0.8001273274421692, + "learning_rate": 4.153617378394037e-08, + "loss": 0.8537, + "step": 309110 + }, + { + "epoch": 1.9748795727227426, + "grad_norm": 2.7028579711914062, + "learning_rate": 4.1331942717598435e-08, + "loss": 0.8011, + "step": 309120 + }, + { + "epoch": 1.974943459872481, + "grad_norm": 1.1264238357543945, + "learning_rate": 4.112821477826323e-08, + "loss": 0.7103, + "step": 309130 + }, + { + "epoch": 1.97500734702222, + "grad_norm": 0.7403964996337891, + "learning_rate": 4.092498996798866e-08, + "loss": 0.7906, + "step": 309140 + }, + { + "epoch": 1.9750712341719585, + "grad_norm": 0.7814722061157227, + "learning_rate": 4.0722268288823086e-08, + "loss": 1.0551, + "step": 309150 + }, + { + "epoch": 1.9751351213216974, + "grad_norm": 1.4338569641113281, + "learning_rate": 4.052004974279822e-08, + "loss": 1.2511, + "step": 309160 + }, + { + "epoch": 1.975199008471436, + "grad_norm": 0.9130191206932068, + "learning_rate": 4.0318334331962414e-08, + "loss": 0.7111, + "step": 309170 + }, + { + "epoch": 1.9752628956211749, + "grad_norm": 1.1973497867584229, + "learning_rate": 4.0117122058341836e-08, + "loss": 0.8637, + "step": 309180 + }, + { + "epoch": 1.9753267827709133, + "grad_norm": 0.5802940130233765, + "learning_rate": 3.991641292396264e-08, + "loss": 0.9088, + "step": 309190 + }, + { + "epoch": 1.9753906699206523, + "grad_norm": 0.8695168495178223, + "learning_rate": 3.971620693085098e-08, + "loss": 0.8054, + "step": 309200 + }, + { + "epoch": 1.9754545570703907, + "grad_norm": 1.0912967920303345, + "learning_rate": 3.9516504081010817e-08, + "loss": 0.8578, + "step": 309210 + }, + { + "epoch": 1.9755184442201297, + "grad_norm": 0.8944864273071289, + "learning_rate": 3.93173043764683e-08, + "loss": 0.9516, + "step": 309220 + }, + { + "epoch": 1.9755823313698682, + "grad_norm": 1.0011634826660156, + 
"learning_rate": 3.911860781922183e-08, + "loss": 0.621, + "step": 309230 + }, + { + "epoch": 1.975646218519607, + "grad_norm": 1.4041532278060913, + "learning_rate": 3.892041441126981e-08, + "loss": 0.8882, + "step": 309240 + }, + { + "epoch": 1.9757101056693456, + "grad_norm": 1.770445704460144, + "learning_rate": 3.8722724154610644e-08, + "loss": 0.776, + "step": 309250 + }, + { + "epoch": 1.9757739928190845, + "grad_norm": 0.8279808759689331, + "learning_rate": 3.852553705124273e-08, + "loss": 0.7962, + "step": 309260 + }, + { + "epoch": 1.975837879968823, + "grad_norm": 1.0905953645706177, + "learning_rate": 3.832885310314227e-08, + "loss": 0.7295, + "step": 309270 + }, + { + "epoch": 1.975901767118562, + "grad_norm": 2.3927602767944336, + "learning_rate": 3.8132672312291005e-08, + "loss": 0.858, + "step": 309280 + }, + { + "epoch": 1.9759656542683004, + "grad_norm": 1.1994647979736328, + "learning_rate": 3.793699468067069e-08, + "loss": 1.0402, + "step": 309290 + }, + { + "epoch": 1.9760295414180393, + "grad_norm": 1.07845938205719, + "learning_rate": 3.774182021024641e-08, + "loss": 1.0522, + "step": 309300 + }, + { + "epoch": 1.9760934285677778, + "grad_norm": 1.2150828838348389, + "learning_rate": 3.754714890298328e-08, + "loss": 0.7252, + "step": 309310 + }, + { + "epoch": 1.9761573157175167, + "grad_norm": 0.6640936732292175, + "learning_rate": 3.735298076084637e-08, + "loss": 0.9416, + "step": 309320 + }, + { + "epoch": 1.9762212028672552, + "grad_norm": 0.9532160758972168, + "learning_rate": 3.7159315785789686e-08, + "loss": 0.8159, + "step": 309330 + }, + { + "epoch": 1.976285090016994, + "grad_norm": 1.0099438428878784, + "learning_rate": 3.696615397976166e-08, + "loss": 0.7487, + "step": 309340 + }, + { + "epoch": 1.9763489771667326, + "grad_norm": 0.9834662079811096, + "learning_rate": 3.677349534471075e-08, + "loss": 0.8413, + "step": 309350 + }, + { + "epoch": 1.9764128643164713, + "grad_norm": 1.1218534708023071, + "learning_rate": 
3.658133988256873e-08, + "loss": 0.663, + "step": 309360 + }, + { + "epoch": 1.97647675146621, + "grad_norm": 1.4277747869491577, + "learning_rate": 3.638968759528405e-08, + "loss": 1.1349, + "step": 309370 + }, + { + "epoch": 1.9765406386159488, + "grad_norm": 1.0635625123977661, + "learning_rate": 3.619853848477739e-08, + "loss": 0.6781, + "step": 309380 + }, + { + "epoch": 1.9766045257656875, + "grad_norm": 1.3290634155273438, + "learning_rate": 3.6007892552974983e-08, + "loss": 0.9776, + "step": 309390 + }, + { + "epoch": 1.9766684129154262, + "grad_norm": 0.9123384952545166, + "learning_rate": 3.581774980179753e-08, + "loss": 0.8103, + "step": 309400 + }, + { + "epoch": 1.9767323000651649, + "grad_norm": 0.7891640067100525, + "learning_rate": 3.562811023316015e-08, + "loss": 0.9219, + "step": 309410 + }, + { + "epoch": 1.9767961872149036, + "grad_norm": 0.7858989834785461, + "learning_rate": 3.5438973848977985e-08, + "loss": 0.7497, + "step": 309420 + }, + { + "epoch": 1.9768600743646423, + "grad_norm": 2.1860246658325195, + "learning_rate": 3.5250340651149515e-08, + "loss": 0.9047, + "step": 309430 + }, + { + "epoch": 1.976923961514381, + "grad_norm": 3.4610395431518555, + "learning_rate": 3.506221064157322e-08, + "loss": 0.7699, + "step": 309440 + }, + { + "epoch": 1.9769878486641197, + "grad_norm": 1.016252040863037, + "learning_rate": 3.487458382214759e-08, + "loss": 0.7325, + "step": 309450 + }, + { + "epoch": 1.9770517358138584, + "grad_norm": 1.5463958978652954, + "learning_rate": 3.468746019475999e-08, + "loss": 0.7865, + "step": 309460 + }, + { + "epoch": 1.977115622963597, + "grad_norm": 1.0177315473556519, + "learning_rate": 3.450083976129226e-08, + "loss": 0.734, + "step": 309470 + }, + { + "epoch": 1.9771795101133358, + "grad_norm": 1.2989296913146973, + "learning_rate": 3.431472252363177e-08, + "loss": 1.0005, + "step": 309480 + }, + { + "epoch": 1.9772433972630745, + "grad_norm": 1.1454684734344482, + "learning_rate": 3.412910848364925e-08, + 
"loss": 0.9839, + "step": 309490 + }, + { + "epoch": 1.9773072844128132, + "grad_norm": 0.9672458171844482, + "learning_rate": 3.394399764321543e-08, + "loss": 0.9025, + "step": 309500 + }, + { + "epoch": 1.977371171562552, + "grad_norm": 0.5477538704872131, + "learning_rate": 3.375939000418438e-08, + "loss": 0.7266, + "step": 309510 + }, + { + "epoch": 1.9774350587122906, + "grad_norm": 0.7952908873558044, + "learning_rate": 3.3575285568432365e-08, + "loss": 0.7958, + "step": 309520 + }, + { + "epoch": 1.9774989458620293, + "grad_norm": 1.088985800743103, + "learning_rate": 3.339168433779682e-08, + "loss": 0.7897, + "step": 309530 + }, + { + "epoch": 1.977562833011768, + "grad_norm": 0.9924382567405701, + "learning_rate": 3.3208586314137366e-08, + "loss": 1.0101, + "step": 309540 + }, + { + "epoch": 1.9776267201615068, + "grad_norm": 1.4753326177597046, + "learning_rate": 3.302599149929697e-08, + "loss": 0.7234, + "step": 309550 + }, + { + "epoch": 1.9776906073112455, + "grad_norm": 0.9083718061447144, + "learning_rate": 3.28438998951075e-08, + "loss": 0.7613, + "step": 309560 + }, + { + "epoch": 1.9777544944609842, + "grad_norm": 0.7028256058692932, + "learning_rate": 3.2662311503411925e-08, + "loss": 0.7795, + "step": 309570 + }, + { + "epoch": 1.9778183816107227, + "grad_norm": 0.60731440782547, + "learning_rate": 3.248122632603101e-08, + "loss": 0.8068, + "step": 309580 + }, + { + "epoch": 1.9778822687604616, + "grad_norm": 1.1641340255737305, + "learning_rate": 3.230064436479663e-08, + "loss": 0.9682, + "step": 309590 + }, + { + "epoch": 1.9779461559102, + "grad_norm": 0.8485056161880493, + "learning_rate": 3.2120565621518436e-08, + "loss": 0.9777, + "step": 309600 + }, + { + "epoch": 1.978010043059939, + "grad_norm": 1.0408892631530762, + "learning_rate": 3.194099009801721e-08, + "loss": 0.9244, + "step": 309610 + }, + { + "epoch": 1.9780739302096775, + "grad_norm": 1.219113826751709, + "learning_rate": 3.17619177961026e-08, + "loss": 0.7961, + "step": 
309620 + }, + { + "epoch": 1.9781378173594164, + "grad_norm": 1.190337896347046, + "learning_rate": 3.158334871756763e-08, + "loss": 1.0724, + "step": 309630 + }, + { + "epoch": 1.978201704509155, + "grad_norm": 0.6391940116882324, + "learning_rate": 3.140528286422195e-08, + "loss": 0.6665, + "step": 309640 + }, + { + "epoch": 1.9782655916588938, + "grad_norm": 1.1940670013427734, + "learning_rate": 3.122772023784748e-08, + "loss": 1.0091, + "step": 309650 + }, + { + "epoch": 1.9783294788086323, + "grad_norm": 0.9822312593460083, + "learning_rate": 3.1050660840242776e-08, + "loss": 1.0307, + "step": 309660 + }, + { + "epoch": 1.9783933659583712, + "grad_norm": 1.103455901145935, + "learning_rate": 3.087410467318974e-08, + "loss": 0.7808, + "step": 309670 + }, + { + "epoch": 1.9784572531081097, + "grad_norm": 0.7938708066940308, + "learning_rate": 3.0698051738459186e-08, + "loss": 0.9174, + "step": 309680 + }, + { + "epoch": 1.9785211402578486, + "grad_norm": 1.227244257926941, + "learning_rate": 3.0522502037833026e-08, + "loss": 0.8234, + "step": 309690 + }, + { + "epoch": 1.9785850274075871, + "grad_norm": 0.8157173991203308, + "learning_rate": 3.03474555730765e-08, + "loss": 0.8377, + "step": 309700 + }, + { + "epoch": 1.978648914557326, + "grad_norm": 1.1654152870178223, + "learning_rate": 3.0172912345943774e-08, + "loss": 0.9414, + "step": 309710 + }, + { + "epoch": 1.9787128017070645, + "grad_norm": 0.6457794308662415, + "learning_rate": 2.999887235820564e-08, + "loss": 0.8444, + "step": 309720 + }, + { + "epoch": 1.9787766888568035, + "grad_norm": 1.0486998558044434, + "learning_rate": 2.9825335611610716e-08, + "loss": 0.9999, + "step": 309730 + }, + { + "epoch": 1.978840576006542, + "grad_norm": 1.2199923992156982, + "learning_rate": 2.965230210789649e-08, + "loss": 0.8839, + "step": 309740 + }, + { + "epoch": 1.9789044631562809, + "grad_norm": 1.5385959148406982, + "learning_rate": 2.9479771848822668e-08, + "loss": 0.7668, + "step": 309750 + }, + { + 
"epoch": 1.9789683503060194, + "grad_norm": 0.6414878368377686, + "learning_rate": 2.9307744836115647e-08, + "loss": 0.825, + "step": 309760 + }, + { + "epoch": 1.9790322374557583, + "grad_norm": 1.0736933946609497, + "learning_rate": 2.9136221071507376e-08, + "loss": 0.8665, + "step": 309770 + }, + { + "epoch": 1.9790961246054968, + "grad_norm": 0.9618337154388428, + "learning_rate": 2.8965200556729798e-08, + "loss": 1.4679, + "step": 309780 + }, + { + "epoch": 1.9791600117552357, + "grad_norm": 0.9808238744735718, + "learning_rate": 2.8794683293503764e-08, + "loss": 0.7879, + "step": 309790 + }, + { + "epoch": 1.9792238989049742, + "grad_norm": 1.2491557598114014, + "learning_rate": 2.864164803810243e-08, + "loss": 1.1444, + "step": 309800 + }, + { + "epoch": 1.9792877860547131, + "grad_norm": 1.5579088926315308, + "learning_rate": 2.8472086957548326e-08, + "loss": 0.9873, + "step": 309810 + }, + { + "epoch": 1.9793516732044516, + "grad_norm": 1.5183621644973755, + "learning_rate": 2.8303029133514013e-08, + "loss": 0.9985, + "step": 309820 + }, + { + "epoch": 1.9794155603541903, + "grad_norm": 0.9913473725318909, + "learning_rate": 2.8134474567703684e-08, + "loss": 0.943, + "step": 309830 + }, + { + "epoch": 1.979479447503929, + "grad_norm": 1.0000174045562744, + "learning_rate": 2.7966423261810426e-08, + "loss": 0.752, + "step": 309840 + }, + { + "epoch": 1.9795433346536677, + "grad_norm": 1.1551103591918945, + "learning_rate": 2.779887521752178e-08, + "loss": 0.9543, + "step": 309850 + }, + { + "epoch": 1.9796072218034064, + "grad_norm": 0.5685717463493347, + "learning_rate": 2.7631830436536387e-08, + "loss": 0.766, + "step": 309860 + }, + { + "epoch": 1.9796711089531451, + "grad_norm": 1.0493838787078857, + "learning_rate": 2.7465288920530685e-08, + "loss": 0.9169, + "step": 309870 + }, + { + "epoch": 1.9797349961028838, + "grad_norm": 0.9339484572410583, + "learning_rate": 2.729925067117556e-08, + "loss": 0.8649, + "step": 309880 + }, + { + "epoch": 
1.9797988832526225, + "grad_norm": 0.6909673810005188, + "learning_rate": 2.7133715690152994e-08, + "loss": 0.9404, + "step": 309890 + }, + { + "epoch": 1.9798627704023612, + "grad_norm": 1.0615977048873901, + "learning_rate": 2.6968683979128327e-08, + "loss": 0.8283, + "step": 309900 + }, + { + "epoch": 1.9799266575521, + "grad_norm": 1.017637014389038, + "learning_rate": 2.6804155539761346e-08, + "loss": 0.8404, + "step": 309910 + }, + { + "epoch": 1.9799905447018387, + "grad_norm": 1.027228832244873, + "learning_rate": 2.6640130373711824e-08, + "loss": 0.9486, + "step": 309920 + }, + { + "epoch": 1.9800544318515774, + "grad_norm": 0.8696177005767822, + "learning_rate": 2.64766084826229e-08, + "loss": 0.9764, + "step": 309930 + }, + { + "epoch": 1.980118319001316, + "grad_norm": 1.3316969871520996, + "learning_rate": 2.6313589868154354e-08, + "loss": 0.8519, + "step": 309940 + }, + { + "epoch": 1.9801822061510548, + "grad_norm": 1.4391385316848755, + "learning_rate": 2.6151074531938214e-08, + "loss": 1.0183, + "step": 309950 + }, + { + "epoch": 1.9802460933007935, + "grad_norm": 0.9557727575302124, + "learning_rate": 2.598906247561206e-08, + "loss": 1.1704, + "step": 309960 + }, + { + "epoch": 1.9803099804505322, + "grad_norm": 1.243375539779663, + "learning_rate": 2.5827553700813466e-08, + "loss": 1.006, + "step": 309970 + }, + { + "epoch": 1.980373867600271, + "grad_norm": 0.8975056409835815, + "learning_rate": 2.5666548209163367e-08, + "loss": 0.8066, + "step": 309980 + }, + { + "epoch": 1.9804377547500096, + "grad_norm": 0.869646430015564, + "learning_rate": 2.550604600228823e-08, + "loss": 1.264, + "step": 309990 + }, + { + "epoch": 1.9805016418997483, + "grad_norm": 0.8230206370353699, + "learning_rate": 2.5346047081797885e-08, + "loss": 0.808, + "step": 310000 + }, + { + "epoch": 1.980565529049487, + "grad_norm": 0.7488855123519897, + "learning_rate": 2.5186551449307705e-08, + "loss": 0.903, + "step": 310010 + }, + { + "epoch": 1.9806294161992257, + 
"grad_norm": 1.3847332000732422, + "learning_rate": 2.502755910642196e-08, + "loss": 0.8955, + "step": 310020 + }, + { + "epoch": 1.9806933033489644, + "grad_norm": 0.9645035862922668, + "learning_rate": 2.4869070054744926e-08, + "loss": 0.9145, + "step": 310030 + }, + { + "epoch": 1.9807571904987031, + "grad_norm": 1.4131964445114136, + "learning_rate": 2.471108429586977e-08, + "loss": 0.9992, + "step": 310040 + }, + { + "epoch": 1.9808210776484418, + "grad_norm": 0.8667084574699402, + "learning_rate": 2.455360183138966e-08, + "loss": 0.8105, + "step": 310050 + }, + { + "epoch": 1.9808849647981805, + "grad_norm": 1.352062702178955, + "learning_rate": 2.439662266289222e-08, + "loss": 0.8148, + "step": 310060 + }, + { + "epoch": 1.980948851947919, + "grad_norm": 0.8915233612060547, + "learning_rate": 2.4240146791953966e-08, + "loss": 0.8997, + "step": 310070 + }, + { + "epoch": 1.981012739097658, + "grad_norm": 1.9420161247253418, + "learning_rate": 2.4084174220151412e-08, + "loss": 0.6849, + "step": 310080 + }, + { + "epoch": 1.9810766262473964, + "grad_norm": 0.611120343208313, + "learning_rate": 2.3928704949055525e-08, + "loss": 0.9966, + "step": 310090 + }, + { + "epoch": 1.9811405133971354, + "grad_norm": 2.8182990550994873, + "learning_rate": 2.377373898023727e-08, + "loss": 0.7746, + "step": 310100 + }, + { + "epoch": 1.9812044005468739, + "grad_norm": 0.9300549626350403, + "learning_rate": 2.361927631524541e-08, + "loss": 0.7707, + "step": 310110 + }, + { + "epoch": 1.9812682876966128, + "grad_norm": 0.9150687456130981, + "learning_rate": 2.3465316955650907e-08, + "loss": 0.8798, + "step": 310120 + }, + { + "epoch": 1.9813321748463513, + "grad_norm": 1.082905888557434, + "learning_rate": 2.3311860902991423e-08, + "loss": 0.9707, + "step": 310130 + }, + { + "epoch": 1.9813960619960902, + "grad_norm": 1.0547878742218018, + "learning_rate": 2.3158908158821268e-08, + "loss": 0.8752, + "step": 310140 + }, + { + "epoch": 1.9814599491458287, + "grad_norm": 
0.7612301111221313, + "learning_rate": 2.3006458724678103e-08, + "loss": 0.8479, + "step": 310150 + }, + { + "epoch": 1.9815238362955676, + "grad_norm": 0.8997673988342285, + "learning_rate": 2.2854512602094036e-08, + "loss": 0.9854, + "step": 310160 + }, + { + "epoch": 1.981587723445306, + "grad_norm": 1.7977722883224487, + "learning_rate": 2.270306979260117e-08, + "loss": 0.8531, + "step": 310170 + }, + { + "epoch": 1.981651610595045, + "grad_norm": 2.0584871768951416, + "learning_rate": 2.255213029772607e-08, + "loss": 1.1729, + "step": 310180 + }, + { + "epoch": 1.9817154977447835, + "grad_norm": 0.806151807308197, + "learning_rate": 2.2401694118984183e-08, + "loss": 0.8084, + "step": 310190 + }, + { + "epoch": 1.9817793848945224, + "grad_norm": 0.8514753580093384, + "learning_rate": 2.2251761257896518e-08, + "loss": 0.9633, + "step": 310200 + }, + { + "epoch": 1.981843272044261, + "grad_norm": 0.8717397451400757, + "learning_rate": 2.2102331715967428e-08, + "loss": 0.8358, + "step": 310210 + }, + { + "epoch": 1.9819071591939998, + "grad_norm": 0.7835900187492371, + "learning_rate": 2.1953405494712364e-08, + "loss": 0.8919, + "step": 310220 + }, + { + "epoch": 1.9819710463437383, + "grad_norm": 0.8039876222610474, + "learning_rate": 2.1804982595613478e-08, + "loss": 1.0103, + "step": 310230 + }, + { + "epoch": 1.9820349334934773, + "grad_norm": 1.0741451978683472, + "learning_rate": 2.1657063020186218e-08, + "loss": 0.9521, + "step": 310240 + }, + { + "epoch": 1.9820988206432157, + "grad_norm": 1.1201385259628296, + "learning_rate": 2.1509646769901636e-08, + "loss": 1.0668, + "step": 310250 + }, + { + "epoch": 1.9821627077929547, + "grad_norm": 0.6859369874000549, + "learning_rate": 2.1362733846258533e-08, + "loss": 0.8435, + "step": 310260 + }, + { + "epoch": 1.9822265949426932, + "grad_norm": 1.120652198791504, + "learning_rate": 2.1216324250727947e-08, + "loss": 0.7989, + "step": 310270 + }, + { + "epoch": 1.982290482092432, + "grad_norm": 
0.8452847003936768, + "learning_rate": 2.1070417984780933e-08, + "loss": 0.7964, + "step": 310280 + }, + { + "epoch": 1.9823543692421706, + "grad_norm": 1.0975067615509033, + "learning_rate": 2.0925015049899633e-08, + "loss": 1.0015, + "step": 310290 + }, + { + "epoch": 1.9824182563919095, + "grad_norm": 1.0435091257095337, + "learning_rate": 2.078011544753844e-08, + "loss": 0.9067, + "step": 310300 + }, + { + "epoch": 1.982482143541648, + "grad_norm": 1.3975369930267334, + "learning_rate": 2.063571917916285e-08, + "loss": 0.9096, + "step": 310310 + }, + { + "epoch": 1.9825460306913867, + "grad_norm": 0.8230214715003967, + "learning_rate": 2.04918262462217e-08, + "loss": 0.7911, + "step": 310320 + }, + { + "epoch": 1.9826099178411254, + "grad_norm": 1.2974622249603271, + "learning_rate": 2.034843665016939e-08, + "loss": 0.6434, + "step": 310330 + }, + { + "epoch": 1.982673804990864, + "grad_norm": 0.7529612183570862, + "learning_rate": 2.020555039244365e-08, + "loss": 0.8052, + "step": 310340 + }, + { + "epoch": 1.9827376921406028, + "grad_norm": 1.1118345260620117, + "learning_rate": 2.0063167474487776e-08, + "loss": 0.7984, + "step": 310350 + }, + { + "epoch": 1.9828015792903415, + "grad_norm": 0.8356834650039673, + "learning_rate": 1.9921287897733953e-08, + "loss": 0.8602, + "step": 310360 + }, + { + "epoch": 1.9828654664400802, + "grad_norm": 0.9013441801071167, + "learning_rate": 1.977991166360882e-08, + "loss": 1.1554, + "step": 310370 + }, + { + "epoch": 1.982929353589819, + "grad_norm": 1.1530930995941162, + "learning_rate": 1.963903877354456e-08, + "loss": 0.9188, + "step": 310380 + }, + { + "epoch": 1.9829932407395576, + "grad_norm": 1.799553632736206, + "learning_rate": 1.949866922895116e-08, + "loss": 0.9647, + "step": 310390 + }, + { + "epoch": 1.9830571278892963, + "grad_norm": 1.3467897176742554, + "learning_rate": 1.9358803031244155e-08, + "loss": 0.796, + "step": 310400 + }, + { + "epoch": 1.983121015039035, + "grad_norm": 2.4441401958465576, + 
"learning_rate": 1.9219440181839077e-08, + "loss": 0.758, + "step": 310410 + }, + { + "epoch": 1.9831849021887737, + "grad_norm": 1.3251237869262695, + "learning_rate": 1.9080580682129257e-08, + "loss": 0.9851, + "step": 310420 + }, + { + "epoch": 1.9832487893385125, + "grad_norm": 0.8397975564002991, + "learning_rate": 1.8942224533519128e-08, + "loss": 0.7603, + "step": 310430 + }, + { + "epoch": 1.9833126764882512, + "grad_norm": 1.116385579109192, + "learning_rate": 1.8804371737396463e-08, + "loss": 0.7254, + "step": 310440 + }, + { + "epoch": 1.9833765636379899, + "grad_norm": 0.6969117522239685, + "learning_rate": 1.8667022295160153e-08, + "loss": 0.9462, + "step": 310450 + }, + { + "epoch": 1.9834404507877286, + "grad_norm": 1.8197271823883057, + "learning_rate": 1.8530176208181317e-08, + "loss": 0.8398, + "step": 310460 + }, + { + "epoch": 1.9835043379374673, + "grad_norm": 0.9810863733291626, + "learning_rate": 1.8393833477847733e-08, + "loss": 1.0075, + "step": 310470 + }, + { + "epoch": 1.983568225087206, + "grad_norm": 1.3049362897872925, + "learning_rate": 1.825799410553053e-08, + "loss": 0.6365, + "step": 310480 + }, + { + "epoch": 1.9836321122369447, + "grad_norm": 1.0517561435699463, + "learning_rate": 1.8122658092589727e-08, + "loss": 0.7645, + "step": 310490 + }, + { + "epoch": 1.9836959993866834, + "grad_norm": 0.9201875925064087, + "learning_rate": 1.7987825440396456e-08, + "loss": 0.9245, + "step": 310500 + }, + { + "epoch": 1.983759886536422, + "grad_norm": 0.7460044026374817, + "learning_rate": 1.7853496150305183e-08, + "loss": 0.8921, + "step": 310510 + }, + { + "epoch": 1.9838237736861608, + "grad_norm": 0.7730250358581543, + "learning_rate": 1.7719670223675934e-08, + "loss": 0.8903, + "step": 310520 + }, + { + "epoch": 1.9838876608358995, + "grad_norm": 1.0625085830688477, + "learning_rate": 1.7586347661840973e-08, + "loss": 0.8217, + "step": 310530 + }, + { + "epoch": 1.9839515479856382, + "grad_norm": 0.8693110346794128, + 
"learning_rate": 1.7453528466160328e-08, + "loss": 0.8307, + "step": 310540 + }, + { + "epoch": 1.984015435135377, + "grad_norm": 2.2066643238067627, + "learning_rate": 1.7321212637960716e-08, + "loss": 1.1016, + "step": 310550 + }, + { + "epoch": 1.9840793222851154, + "grad_norm": 1.2037817239761353, + "learning_rate": 1.71894001785744e-08, + "loss": 0.7203, + "step": 310560 + }, + { + "epoch": 1.9841432094348543, + "grad_norm": 0.7629579901695251, + "learning_rate": 1.70580910893392e-08, + "loss": 0.759, + "step": 310570 + }, + { + "epoch": 1.9842070965845928, + "grad_norm": 0.9195466041564941, + "learning_rate": 1.6927285371565182e-08, + "loss": 0.9331, + "step": 310580 + }, + { + "epoch": 1.9842709837343318, + "grad_norm": 0.9034010171890259, + "learning_rate": 1.679698302657351e-08, + "loss": 0.7365, + "step": 310590 + }, + { + "epoch": 1.9843348708840702, + "grad_norm": 1.162416934967041, + "learning_rate": 1.6667184055685348e-08, + "loss": 0.9009, + "step": 310600 + }, + { + "epoch": 1.9843987580338092, + "grad_norm": 1.3139643669128418, + "learning_rate": 1.6537888460194105e-08, + "loss": 0.7501, + "step": 310610 + }, + { + "epoch": 1.9844626451835476, + "grad_norm": 2.04888916015625, + "learning_rate": 1.640909624140985e-08, + "loss": 1.0927, + "step": 310620 + }, + { + "epoch": 1.9845265323332866, + "grad_norm": 0.8494736552238464, + "learning_rate": 1.6280807400625987e-08, + "loss": 0.9263, + "step": 310630 + }, + { + "epoch": 1.984590419483025, + "grad_norm": 0.645581841468811, + "learning_rate": 1.6153021939141476e-08, + "loss": 0.8617, + "step": 310640 + }, + { + "epoch": 1.984654306632764, + "grad_norm": 0.8450150489807129, + "learning_rate": 1.602573985823308e-08, + "loss": 0.7702, + "step": 310650 + }, + { + "epoch": 1.9847181937825025, + "grad_norm": 0.7330366969108582, + "learning_rate": 1.5898961159188652e-08, + "loss": 0.8653, + "step": 310660 + }, + { + "epoch": 1.9847820809322414, + "grad_norm": 0.8303292989730835, + "learning_rate": 
1.5772685843284953e-08, + "loss": 0.8591, + "step": 310670 + }, + { + "epoch": 1.9848459680819799, + "grad_norm": 1.4144198894500732, + "learning_rate": 1.5646913911793182e-08, + "loss": 0.9637, + "step": 310680 + }, + { + "epoch": 1.9849098552317188, + "grad_norm": 1.0386841297149658, + "learning_rate": 1.5521645365979e-08, + "loss": 0.8452, + "step": 310690 + }, + { + "epoch": 1.9849737423814573, + "grad_norm": 1.1626369953155518, + "learning_rate": 1.5396880207108056e-08, + "loss": 0.9221, + "step": 310700 + }, + { + "epoch": 1.9850376295311962, + "grad_norm": 0.6234553456306458, + "learning_rate": 1.5272618436429353e-08, + "loss": 1.0333, + "step": 310710 + }, + { + "epoch": 1.9851015166809347, + "grad_norm": 1.2528012990951538, + "learning_rate": 1.5148860055197445e-08, + "loss": 0.9026, + "step": 310720 + }, + { + "epoch": 1.9851654038306736, + "grad_norm": 0.8561525344848633, + "learning_rate": 1.502560506466133e-08, + "loss": 0.8678, + "step": 310730 + }, + { + "epoch": 1.9852292909804121, + "grad_norm": 1.1541293859481812, + "learning_rate": 1.4902853466064458e-08, + "loss": 1.2388, + "step": 310740 + }, + { + "epoch": 1.985293178130151, + "grad_norm": 0.7439695596694946, + "learning_rate": 1.478060526063363e-08, + "loss": 0.792, + "step": 310750 + }, + { + "epoch": 1.9853570652798895, + "grad_norm": 0.8408505320549011, + "learning_rate": 1.4658860449606737e-08, + "loss": 0.8528, + "step": 310760 + }, + { + "epoch": 1.9854209524296285, + "grad_norm": 0.9411364197731018, + "learning_rate": 1.453761903421058e-08, + "loss": 0.7902, + "step": 310770 + }, + { + "epoch": 1.985484839579367, + "grad_norm": 1.618725061416626, + "learning_rate": 1.4416881015660854e-08, + "loss": 0.9176, + "step": 310780 + }, + { + "epoch": 1.9855487267291059, + "grad_norm": 1.5805702209472656, + "learning_rate": 1.4296646395178803e-08, + "loss": 0.8695, + "step": 310790 + }, + { + "epoch": 1.9856126138788444, + "grad_norm": 0.9400895833969116, + "learning_rate": 
1.417691517397457e-08, + "loss": 0.8942, + "step": 310800 + }, + { + "epoch": 1.985676501028583, + "grad_norm": 0.7929409742355347, + "learning_rate": 1.4057687353247195e-08, + "loss": 0.8796, + "step": 310810 + }, + { + "epoch": 1.9857403881783218, + "grad_norm": 2.9416072368621826, + "learning_rate": 1.3938962934212373e-08, + "loss": 0.9496, + "step": 310820 + }, + { + "epoch": 1.9858042753280605, + "grad_norm": 1.3560771942138672, + "learning_rate": 1.3820741918046942e-08, + "loss": 0.9342, + "step": 310830 + }, + { + "epoch": 1.9858681624777992, + "grad_norm": 1.4186067581176758, + "learning_rate": 1.3703024305955492e-08, + "loss": 0.9982, + "step": 310840 + }, + { + "epoch": 1.9859320496275379, + "grad_norm": 1.0630927085876465, + "learning_rate": 1.358581009912041e-08, + "loss": 0.8193, + "step": 310850 + }, + { + "epoch": 1.9859959367772766, + "grad_norm": 0.8476179838180542, + "learning_rate": 1.3469099298718535e-08, + "loss": 0.6791, + "step": 310860 + }, + { + "epoch": 1.9860598239270153, + "grad_norm": 0.7086460590362549, + "learning_rate": 1.33528919059267e-08, + "loss": 0.8539, + "step": 310870 + }, + { + "epoch": 1.986123711076754, + "grad_norm": 0.8723922967910767, + "learning_rate": 1.3237187921916195e-08, + "loss": 0.7082, + "step": 310880 + }, + { + "epoch": 1.9861875982264927, + "grad_norm": 1.034899115562439, + "learning_rate": 1.3121987347852748e-08, + "loss": 0.7722, + "step": 310890 + }, + { + "epoch": 1.9862514853762314, + "grad_norm": 0.9084420204162598, + "learning_rate": 1.3007290184890997e-08, + "loss": 0.791, + "step": 310900 + }, + { + "epoch": 1.9863153725259701, + "grad_norm": 1.9150114059448242, + "learning_rate": 1.2893096434196672e-08, + "loss": 0.8656, + "step": 310910 + }, + { + "epoch": 1.9863792596757088, + "grad_norm": 2.226154327392578, + "learning_rate": 1.2779406096913305e-08, + "loss": 0.8388, + "step": 310920 + }, + { + "epoch": 1.9864431468254475, + "grad_norm": 0.9489161968231201, + "learning_rate": 
1.2666219174184424e-08, + "loss": 1.0354, + "step": 310930 + }, + { + "epoch": 1.9865070339751862, + "grad_norm": 0.6309258937835693, + "learning_rate": 1.255353566715356e-08, + "loss": 0.8209, + "step": 310940 + }, + { + "epoch": 1.986570921124925, + "grad_norm": 0.710475742816925, + "learning_rate": 1.244135557695314e-08, + "loss": 0.7545, + "step": 310950 + }, + { + "epoch": 1.9866348082746637, + "grad_norm": 1.2402219772338867, + "learning_rate": 1.232967890471004e-08, + "loss": 0.7927, + "step": 310960 + }, + { + "epoch": 1.9866986954244024, + "grad_norm": 1.2162563800811768, + "learning_rate": 1.2218505651556688e-08, + "loss": 0.8432, + "step": 310970 + }, + { + "epoch": 1.986762582574141, + "grad_norm": 0.8971146941184998, + "learning_rate": 1.210783581860886e-08, + "loss": 1.0107, + "step": 310980 + }, + { + "epoch": 1.9868264697238798, + "grad_norm": 1.3195732831954956, + "learning_rate": 1.1997669406982326e-08, + "loss": 0.8304, + "step": 310990 + }, + { + "epoch": 1.9868903568736185, + "grad_norm": 0.9834543466567993, + "learning_rate": 1.188800641778176e-08, + "loss": 0.816, + "step": 311000 + }, + { + "epoch": 1.9869542440233572, + "grad_norm": 0.8268367052078247, + "learning_rate": 1.1778846852111836e-08, + "loss": 0.7153, + "step": 311010 + }, + { + "epoch": 1.987018131173096, + "grad_norm": 0.816234290599823, + "learning_rate": 1.1670190711082773e-08, + "loss": 1.0464, + "step": 311020 + }, + { + "epoch": 1.9870820183228346, + "grad_norm": 1.0838505029678345, + "learning_rate": 1.1562037995777041e-08, + "loss": 0.696, + "step": 311030 + }, + { + "epoch": 1.9871459054725733, + "grad_norm": 1.1296429634094238, + "learning_rate": 1.145438870728266e-08, + "loss": 0.945, + "step": 311040 + }, + { + "epoch": 1.9872097926223118, + "grad_norm": 1.261118769645691, + "learning_rate": 1.1347242846693196e-08, + "loss": 0.6953, + "step": 311050 + }, + { + "epoch": 1.9872736797720507, + "grad_norm": 0.8011787533760071, + "learning_rate": 1.1240600415085567e-08, + 
"loss": 0.8288, + "step": 311060 + }, + { + "epoch": 1.9873375669217892, + "grad_norm": 1.4028711318969727, + "learning_rate": 1.1134461413531138e-08, + "loss": 0.8766, + "step": 311070 + }, + { + "epoch": 1.9874014540715281, + "grad_norm": 1.1019203662872314, + "learning_rate": 1.1028825843095724e-08, + "loss": 0.6276, + "step": 311080 + }, + { + "epoch": 1.9874653412212666, + "grad_norm": 1.0110036134719849, + "learning_rate": 1.092369370485069e-08, + "loss": 0.6338, + "step": 311090 + }, + { + "epoch": 1.9875292283710055, + "grad_norm": 0.6997506022453308, + "learning_rate": 1.0819064999850747e-08, + "loss": 0.8742, + "step": 311100 + }, + { + "epoch": 1.987593115520744, + "grad_norm": 0.68264240026474, + "learning_rate": 1.0714939729145056e-08, + "loss": 0.6993, + "step": 311110 + }, + { + "epoch": 1.987657002670483, + "grad_norm": 1.01038658618927, + "learning_rate": 1.061131789378833e-08, + "loss": 1.0496, + "step": 311120 + }, + { + "epoch": 1.9877208898202214, + "grad_norm": 0.7514334321022034, + "learning_rate": 1.0508199494824178e-08, + "loss": 0.7158, + "step": 311130 + }, + { + "epoch": 1.9877847769699604, + "grad_norm": 0.6786255836486816, + "learning_rate": 1.0405584533290657e-08, + "loss": 0.932, + "step": 311140 + }, + { + "epoch": 1.9878486641196988, + "grad_norm": 0.9236333966255188, + "learning_rate": 1.0303473010214726e-08, + "loss": 0.8977, + "step": 311150 + }, + { + "epoch": 1.9879125512694378, + "grad_norm": 1.0109336376190186, + "learning_rate": 1.020186492663444e-08, + "loss": 0.9807, + "step": 311160 + }, + { + "epoch": 1.9879764384191763, + "grad_norm": 1.585142970085144, + "learning_rate": 1.0100760283571209e-08, + "loss": 0.8529, + "step": 311170 + }, + { + "epoch": 1.9880403255689152, + "grad_norm": 0.9748506546020508, + "learning_rate": 1.0000159082035332e-08, + "loss": 0.9685, + "step": 311180 + }, + { + "epoch": 1.9881042127186537, + "grad_norm": 0.7866042256355286, + "learning_rate": 9.900061323048215e-09, + "loss": 0.8834, + 
"step": 311190 + }, + { + "epoch": 1.9881680998683926, + "grad_norm": 1.2893184423446655, + "learning_rate": 9.800467007614611e-09, + "loss": 0.9315, + "step": 311200 + }, + { + "epoch": 1.988231987018131, + "grad_norm": 1.1545730829238892, + "learning_rate": 9.701376136739271e-09, + "loss": 0.8941, + "step": 311210 + }, + { + "epoch": 1.98829587416787, + "grad_norm": 0.687833845615387, + "learning_rate": 9.602788711415844e-09, + "loss": 0.844, + "step": 311220 + }, + { + "epoch": 1.9883597613176085, + "grad_norm": 0.8345115184783936, + "learning_rate": 9.504704732643533e-09, + "loss": 0.8394, + "step": 311230 + }, + { + "epoch": 1.9884236484673474, + "grad_norm": 0.6788913607597351, + "learning_rate": 9.407124201404883e-09, + "loss": 0.7653, + "step": 311240 + }, + { + "epoch": 1.988487535617086, + "grad_norm": 0.9235719442367554, + "learning_rate": 9.31004711868244e-09, + "loss": 0.8282, + "step": 311250 + }, + { + "epoch": 1.9885514227668248, + "grad_norm": 1.3782670497894287, + "learning_rate": 9.213473485458757e-09, + "loss": 0.7283, + "step": 311260 + }, + { + "epoch": 1.9886153099165633, + "grad_norm": 1.1919926404953003, + "learning_rate": 9.117403302705274e-09, + "loss": 0.7445, + "step": 311270 + }, + { + "epoch": 1.988679197066302, + "grad_norm": 1.1234567165374756, + "learning_rate": 9.021836571382336e-09, + "loss": 0.8275, + "step": 311280 + }, + { + "epoch": 1.9887430842160407, + "grad_norm": 1.243963599205017, + "learning_rate": 8.926773292461389e-09, + "loss": 1.1169, + "step": 311290 + }, + { + "epoch": 1.9888069713657794, + "grad_norm": 2.11574649810791, + "learning_rate": 8.83221346689722e-09, + "loss": 1.1771, + "step": 311300 + }, + { + "epoch": 1.9888708585155181, + "grad_norm": 1.3299797773361206, + "learning_rate": 8.738157095639076e-09, + "loss": 0.8361, + "step": 311310 + }, + { + "epoch": 1.9889347456652569, + "grad_norm": 0.8851807117462158, + "learning_rate": 8.644604179636195e-09, + "loss": 0.957, + "step": 311320 + }, + { + "epoch": 
1.9889986328149956, + "grad_norm": 0.9992744326591492, + "learning_rate": 8.551554719832267e-09, + "loss": 0.9448, + "step": 311330 + }, + { + "epoch": 1.9890625199647343, + "grad_norm": 0.8495112061500549, + "learning_rate": 8.459008717159878e-09, + "loss": 0.6763, + "step": 311340 + }, + { + "epoch": 1.989126407114473, + "grad_norm": 0.5434170365333557, + "learning_rate": 8.366966172557167e-09, + "loss": 0.6915, + "step": 311350 + }, + { + "epoch": 1.9891902942642117, + "grad_norm": 0.9140389561653137, + "learning_rate": 8.275427086951171e-09, + "loss": 0.9399, + "step": 311360 + }, + { + "epoch": 1.9892541814139504, + "grad_norm": 0.9066022038459778, + "learning_rate": 8.184391461252272e-09, + "loss": 0.8907, + "step": 311370 + }, + { + "epoch": 1.989318068563689, + "grad_norm": 2.6549975872039795, + "learning_rate": 8.093859296393058e-09, + "loss": 1.2238, + "step": 311380 + }, + { + "epoch": 1.9893819557134278, + "grad_norm": 1.2978613376617432, + "learning_rate": 8.00383059327281e-09, + "loss": 0.9597, + "step": 311390 + }, + { + "epoch": 1.9894458428631665, + "grad_norm": 1.0250873565673828, + "learning_rate": 7.91430535280746e-09, + "loss": 1.1152, + "step": 311400 + }, + { + "epoch": 1.9895097300129052, + "grad_norm": 0.81373131275177, + "learning_rate": 7.82528357589074e-09, + "loss": 0.9364, + "step": 311410 + }, + { + "epoch": 1.989573617162644, + "grad_norm": 0.51475989818573, + "learning_rate": 7.736765263427481e-09, + "loss": 0.9326, + "step": 311420 + }, + { + "epoch": 1.9896375043123826, + "grad_norm": 0.9355649948120117, + "learning_rate": 7.64875041630031e-09, + "loss": 0.9437, + "step": 311430 + }, + { + "epoch": 1.9897013914621213, + "grad_norm": 1.1884983777999878, + "learning_rate": 7.561239035397405e-09, + "loss": 0.8401, + "step": 311440 + }, + { + "epoch": 1.98976527861186, + "grad_norm": 1.307823657989502, + "learning_rate": 7.474231121606946e-09, + "loss": 1.0791, + "step": 311450 + }, + { + "epoch": 1.9898291657615987, + "grad_norm": 
1.1962790489196777, + "learning_rate": 7.387726675800455e-09, + "loss": 0.9416, + "step": 311460 + }, + { + "epoch": 1.9898930529113374, + "grad_norm": 0.8856204748153687, + "learning_rate": 7.30172569884946e-09, + "loss": 0.8437, + "step": 311470 + }, + { + "epoch": 1.9899569400610762, + "grad_norm": 1.0025514364242554, + "learning_rate": 7.216228191619934e-09, + "loss": 0.6585, + "step": 311480 + }, + { + "epoch": 1.9900208272108149, + "grad_norm": 2.3673927783966064, + "learning_rate": 7.1312341549723e-09, + "loss": 0.829, + "step": 311490 + }, + { + "epoch": 1.9900847143605536, + "grad_norm": 0.5192925333976746, + "learning_rate": 7.046743589761428e-09, + "loss": 0.7879, + "step": 311500 + }, + { + "epoch": 1.9901486015102923, + "grad_norm": 0.9873639345169067, + "learning_rate": 6.9627564968421935e-09, + "loss": 0.6812, + "step": 311510 + }, + { + "epoch": 1.990212488660031, + "grad_norm": 1.7059824466705322, + "learning_rate": 6.879272877052811e-09, + "loss": 1.0209, + "step": 311520 + }, + { + "epoch": 1.9902763758097697, + "grad_norm": 0.8215252757072449, + "learning_rate": 6.796292731248155e-09, + "loss": 0.7219, + "step": 311530 + }, + { + "epoch": 1.9903402629595082, + "grad_norm": 1.047405481338501, + "learning_rate": 6.713816060249789e-09, + "loss": 0.8509, + "step": 311540 + }, + { + "epoch": 1.990404150109247, + "grad_norm": 0.7972722053527832, + "learning_rate": 6.631842864890381e-09, + "loss": 0.912, + "step": 311550 + }, + { + "epoch": 1.9904680372589856, + "grad_norm": 0.8139147758483887, + "learning_rate": 6.5503731460081485e-09, + "loss": 0.6762, + "step": 311560 + }, + { + "epoch": 1.9905319244087245, + "grad_norm": 1.3652825355529785, + "learning_rate": 6.469406904408004e-09, + "loss": 0.8775, + "step": 311570 + }, + { + "epoch": 1.990595811558463, + "grad_norm": 1.0654441118240356, + "learning_rate": 6.388944140911512e-09, + "loss": 0.826, + "step": 311580 + }, + { + "epoch": 1.990659698708202, + "grad_norm": 1.323000192642212, + 
"learning_rate": 6.308984856329137e-09, + "loss": 1.079, + "step": 311590 + }, + { + "epoch": 1.9907235858579404, + "grad_norm": 1.7142542600631714, + "learning_rate": 6.229529051465788e-09, + "loss": 1.0488, + "step": 311600 + }, + { + "epoch": 1.9907874730076793, + "grad_norm": 2.2610385417938232, + "learning_rate": 6.150576727120827e-09, + "loss": 0.8548, + "step": 311610 + }, + { + "epoch": 1.9908513601574178, + "grad_norm": 0.9243570566177368, + "learning_rate": 6.0721278840936146e-09, + "loss": 0.8044, + "step": 311620 + }, + { + "epoch": 1.9909152473071567, + "grad_norm": 1.067752718925476, + "learning_rate": 5.9941825231724094e-09, + "loss": 0.6292, + "step": 311630 + }, + { + "epoch": 1.9909791344568952, + "grad_norm": 1.1682084798812866, + "learning_rate": 5.916740645134367e-09, + "loss": 0.9441, + "step": 311640 + }, + { + "epoch": 1.9910430216066342, + "grad_norm": 0.7192792296409607, + "learning_rate": 5.839802250773297e-09, + "loss": 0.5446, + "step": 311650 + }, + { + "epoch": 1.9911069087563726, + "grad_norm": 0.6210076808929443, + "learning_rate": 5.763367340849701e-09, + "loss": 0.9302, + "step": 311660 + }, + { + "epoch": 1.9911707959061116, + "grad_norm": 1.1094497442245483, + "learning_rate": 5.687435916146288e-09, + "loss": 0.9404, + "step": 311670 + }, + { + "epoch": 1.99123468305585, + "grad_norm": 0.5943570733070374, + "learning_rate": 5.61200797741801e-09, + "loss": 0.6468, + "step": 311680 + }, + { + "epoch": 1.991298570205589, + "grad_norm": 1.3287687301635742, + "learning_rate": 5.5370835254253685e-09, + "loss": 0.9242, + "step": 311690 + }, + { + "epoch": 1.9913624573553275, + "grad_norm": 1.7294316291809082, + "learning_rate": 5.462662560928866e-09, + "loss": 0.7075, + "step": 311700 + }, + { + "epoch": 1.9914263445050664, + "grad_norm": 1.2055655717849731, + "learning_rate": 5.3887450846779045e-09, + "loss": 0.822, + "step": 311710 + }, + { + "epoch": 1.9914902316548049, + "grad_norm": 1.283769965171814, + "learning_rate": 
5.315331097405229e-09, + "loss": 1.0232, + "step": 311720 + }, + { + "epoch": 1.9915541188045438, + "grad_norm": 1.0435206890106201, + "learning_rate": 5.242420599865794e-09, + "loss": 0.7253, + "step": 311730 + }, + { + "epoch": 1.9916180059542823, + "grad_norm": 1.496469497680664, + "learning_rate": 5.1700135927867935e-09, + "loss": 0.9393, + "step": 311740 + }, + { + "epoch": 1.9916818931040212, + "grad_norm": 1.004408836364746, + "learning_rate": 5.0981100768954235e-09, + "loss": 0.848, + "step": 311750 + }, + { + "epoch": 1.9917457802537597, + "grad_norm": 0.7888327836990356, + "learning_rate": 5.026710052918882e-09, + "loss": 0.7427, + "step": 311760 + }, + { + "epoch": 1.9918096674034984, + "grad_norm": 1.417661190032959, + "learning_rate": 4.955813521573261e-09, + "loss": 0.7668, + "step": 311770 + }, + { + "epoch": 1.9918735545532371, + "grad_norm": 1.0522990226745605, + "learning_rate": 4.885420483574654e-09, + "loss": 0.8339, + "step": 311780 + }, + { + "epoch": 1.9919374417029758, + "grad_norm": 1.6315908432006836, + "learning_rate": 4.8155309396336056e-09, + "loss": 0.8946, + "step": 311790 + }, + { + "epoch": 1.9920013288527145, + "grad_norm": 0.8391977548599243, + "learning_rate": 4.746144890449555e-09, + "loss": 0.846, + "step": 311800 + }, + { + "epoch": 1.9920652160024532, + "grad_norm": 0.7376478314399719, + "learning_rate": 4.6772623367274945e-09, + "loss": 0.9631, + "step": 311810 + }, + { + "epoch": 1.992129103152192, + "grad_norm": 0.8697778582572937, + "learning_rate": 4.6088832791557626e-09, + "loss": 0.9359, + "step": 311820 + }, + { + "epoch": 1.9921929903019306, + "grad_norm": 1.1047428846359253, + "learning_rate": 4.541007718422696e-09, + "loss": 0.7677, + "step": 311830 + }, + { + "epoch": 1.9922568774516694, + "grad_norm": 1.339664101600647, + "learning_rate": 4.473635655216635e-09, + "loss": 0.7771, + "step": 311840 + }, + { + "epoch": 1.992320764601408, + "grad_norm": 1.1949299573898315, + "learning_rate": 4.4067670902148136e-09, + 
"loss": 0.7404, + "step": 311850 + }, + { + "epoch": 1.9923846517511468, + "grad_norm": 1.6755359172821045, + "learning_rate": 4.340402024083368e-09, + "loss": 0.8248, + "step": 311860 + }, + { + "epoch": 1.9924485389008855, + "grad_norm": 1.1501131057739258, + "learning_rate": 4.274540457505083e-09, + "loss": 0.7366, + "step": 311870 + }, + { + "epoch": 1.9925124260506242, + "grad_norm": 1.342016339302063, + "learning_rate": 4.20918239112944e-09, + "loss": 0.8977, + "step": 311880 + }, + { + "epoch": 1.9925763132003629, + "grad_norm": 0.888166069984436, + "learning_rate": 4.1443278256170226e-09, + "loss": 0.9847, + "step": 311890 + }, + { + "epoch": 1.9926402003501016, + "grad_norm": 1.199773907661438, + "learning_rate": 4.079976761628412e-09, + "loss": 0.9824, + "step": 311900 + }, + { + "epoch": 1.9927040874998403, + "grad_norm": 0.8143197894096375, + "learning_rate": 4.016129199801988e-09, + "loss": 0.6692, + "step": 311910 + }, + { + "epoch": 1.992767974649579, + "grad_norm": 0.5461716651916504, + "learning_rate": 3.952785140792781e-09, + "loss": 0.9864, + "step": 311920 + }, + { + "epoch": 1.9928318617993177, + "grad_norm": 0.9898871779441833, + "learning_rate": 3.889944585228067e-09, + "loss": 0.8885, + "step": 311930 + }, + { + "epoch": 1.9928957489490564, + "grad_norm": 1.2186857461929321, + "learning_rate": 3.827607533746225e-09, + "loss": 0.8809, + "step": 311940 + }, + { + "epoch": 1.9929596360987951, + "grad_norm": 0.7133896946907043, + "learning_rate": 3.765773986968979e-09, + "loss": 0.7136, + "step": 311950 + }, + { + "epoch": 1.9930235232485338, + "grad_norm": 1.3756506443023682, + "learning_rate": 3.704443945523606e-09, + "loss": 0.7445, + "step": 311960 + }, + { + "epoch": 1.9930874103982725, + "grad_norm": 0.9272878766059875, + "learning_rate": 3.64361741003183e-09, + "loss": 0.905, + "step": 311970 + }, + { + "epoch": 1.9931512975480112, + "grad_norm": 0.8822316527366638, + "learning_rate": 3.583294381098723e-09, + "loss": 0.6295, + "step": 
311980 + }, + { + "epoch": 1.99321518469775, + "grad_norm": 0.501191258430481, + "learning_rate": 3.5234748593349077e-09, + "loss": 0.6983, + "step": 311990 + }, + { + "epoch": 1.9932790718474886, + "grad_norm": 2.474304676055908, + "learning_rate": 3.4641588453454553e-09, + "loss": 0.7978, + "step": 312000 + }, + { + "epoch": 1.9933429589972271, + "grad_norm": 0.9736420512199402, + "learning_rate": 3.4053463397243357e-09, + "loss": 0.905, + "step": 312010 + }, + { + "epoch": 1.993406846146966, + "grad_norm": 0.9467423558235168, + "learning_rate": 3.347037343065518e-09, + "loss": 0.8016, + "step": 312020 + }, + { + "epoch": 1.9934707332967045, + "grad_norm": 0.658368706703186, + "learning_rate": 3.2892318559518685e-09, + "loss": 0.7383, + "step": 312030 + }, + { + "epoch": 1.9935346204464435, + "grad_norm": 1.1242454051971436, + "learning_rate": 3.2319298789718065e-09, + "loss": 0.8804, + "step": 312040 + }, + { + "epoch": 1.993598507596182, + "grad_norm": 0.6961290836334229, + "learning_rate": 3.1751314126970967e-09, + "loss": 0.8945, + "step": 312050 + }, + { + "epoch": 1.9936623947459209, + "grad_norm": 0.6901680827140808, + "learning_rate": 3.1188364577050543e-09, + "loss": 0.7463, + "step": 312060 + }, + { + "epoch": 1.9937262818956594, + "grad_norm": 0.796148955821991, + "learning_rate": 3.063045014556343e-09, + "loss": 1.2063, + "step": 312070 + }, + { + "epoch": 1.9937901690453983, + "grad_norm": 1.2905848026275635, + "learning_rate": 3.0077570838171753e-09, + "loss": 0.8915, + "step": 312080 + }, + { + "epoch": 1.9938540561951368, + "grad_norm": 0.9283998012542725, + "learning_rate": 2.952972666042664e-09, + "loss": 1.0008, + "step": 312090 + }, + { + "epoch": 1.9939179433448757, + "grad_norm": 0.957834005355835, + "learning_rate": 2.8986917617879195e-09, + "loss": 0.9992, + "step": 312100 + }, + { + "epoch": 1.9939818304946142, + "grad_norm": 0.9345075488090515, + "learning_rate": 2.8449143715969517e-09, + "loss": 1.1347, + "step": 312110 + }, + { + 
"epoch": 1.9940457176443531, + "grad_norm": 0.9846530556678772, + "learning_rate": 2.7916404960137697e-09, + "loss": 0.6848, + "step": 312120 + }, + { + "epoch": 1.9941096047940916, + "grad_norm": 0.8634576797485352, + "learning_rate": 2.73887013556573e-09, + "loss": 1.1417, + "step": 312130 + }, + { + "epoch": 1.9941734919438305, + "grad_norm": 0.9867701530456543, + "learning_rate": 2.6866032907968406e-09, + "loss": 0.9245, + "step": 312140 + }, + { + "epoch": 1.994237379093569, + "grad_norm": 0.9844792485237122, + "learning_rate": 2.6348399622233566e-09, + "loss": 0.8678, + "step": 312150 + }, + { + "epoch": 1.994301266243308, + "grad_norm": NaN, + "learning_rate": 2.5886834732924416e-09, + "loss": 0.9213, + "step": 312160 + }, + { + "epoch": 1.9943651533930464, + "grad_norm": 2.1825661659240723, + "learning_rate": 2.5378768269301856e-09, + "loss": 0.9257, + "step": 312170 + }, + { + "epoch": 1.9944290405427854, + "grad_norm": 0.8919321894645691, + "learning_rate": 2.4875736982676868e-09, + "loss": 0.874, + "step": 312180 + }, + { + "epoch": 1.9944929276925238, + "grad_norm": 3.1658194065093994, + "learning_rate": 2.437774087810096e-09, + "loss": 0.9289, + "step": 312190 + }, + { + "epoch": 1.9945568148422628, + "grad_norm": 0.993950366973877, + "learning_rate": 2.3884779960570147e-09, + "loss": 0.9231, + "step": 312200 + }, + { + "epoch": 1.9946207019920013, + "grad_norm": 0.6404727697372437, + "learning_rate": 2.3396854235080425e-09, + "loss": 1.0877, + "step": 312210 + }, + { + "epoch": 1.9946845891417402, + "grad_norm": 1.1700347661972046, + "learning_rate": 2.2913963706516772e-09, + "loss": 0.972, + "step": 312220 + }, + { + "epoch": 1.9947484762914787, + "grad_norm": 2.679147481918335, + "learning_rate": 2.2436108379764175e-09, + "loss": 0.6191, + "step": 312230 + }, + { + "epoch": 1.9948123634412176, + "grad_norm": 1.6708064079284668, + "learning_rate": 2.1963288259596594e-09, + "loss": 1.2286, + "step": 312240 + }, + { + "epoch": 1.994876250590956, + 
"grad_norm": 0.9976462125778198, + "learning_rate": 2.1495503350787983e-09, + "loss": 0.686, + "step": 312250 + }, + { + "epoch": 1.9949401377406948, + "grad_norm": 0.7239921689033508, + "learning_rate": 2.1032753658112302e-09, + "loss": 0.9027, + "step": 312260 + }, + { + "epoch": 1.9950040248904335, + "grad_norm": 0.9449382424354553, + "learning_rate": 2.057503918612147e-09, + "loss": 0.8882, + "step": 312270 + }, + { + "epoch": 1.9950679120401722, + "grad_norm": 0.9939311146736145, + "learning_rate": 2.012235993953393e-09, + "loss": 0.9177, + "step": 312280 + }, + { + "epoch": 1.995131799189911, + "grad_norm": 1.0171376466751099, + "learning_rate": 1.9674715922846086e-09, + "loss": 0.7605, + "step": 312290 + }, + { + "epoch": 1.9951956863396496, + "grad_norm": 0.8827845454216003, + "learning_rate": 1.9232107140554346e-09, + "loss": 0.9845, + "step": 312300 + }, + { + "epoch": 1.9952595734893883, + "grad_norm": 0.8483384251594543, + "learning_rate": 1.8794533597155105e-09, + "loss": 0.8771, + "step": 312310 + }, + { + "epoch": 1.995323460639127, + "grad_norm": 1.1981017589569092, + "learning_rate": 1.8361995297033752e-09, + "loss": 0.9435, + "step": 312320 + }, + { + "epoch": 1.9953873477888657, + "grad_norm": 0.6356529593467712, + "learning_rate": 1.7934492244575662e-09, + "loss": 0.8276, + "step": 312330 + }, + { + "epoch": 1.9954512349386044, + "grad_norm": 1.1917228698730469, + "learning_rate": 1.7512024444055197e-09, + "loss": 0.9145, + "step": 312340 + }, + { + "epoch": 1.9955151220883431, + "grad_norm": 0.7620391845703125, + "learning_rate": 1.7094591899691204e-09, + "loss": 1.0847, + "step": 312350 + }, + { + "epoch": 1.9955790092380818, + "grad_norm": 2.1927714347839355, + "learning_rate": 1.668219461575804e-09, + "loss": 0.5758, + "step": 312360 + }, + { + "epoch": 1.9956428963878206, + "grad_norm": 0.8946861624717712, + "learning_rate": 1.6274832596419043e-09, + "loss": 0.8894, + "step": 312370 + }, + { + "epoch": 1.9957067835375593, + "grad_norm": 
0.6378492116928101, + "learning_rate": 1.5872505845726525e-09, + "loss": 0.8183, + "step": 312380 + }, + { + "epoch": 1.995770670687298, + "grad_norm": 0.891268253326416, + "learning_rate": 1.5475214367732804e-09, + "loss": 0.7768, + "step": 312390 + }, + { + "epoch": 1.9958345578370367, + "grad_norm": 1.2618356943130493, + "learning_rate": 1.508295816643468e-09, + "loss": 1.0496, + "step": 312400 + }, + { + "epoch": 1.9958984449867754, + "grad_norm": 0.9742398858070374, + "learning_rate": 1.4695737245828955e-09, + "loss": 1.0082, + "step": 312410 + }, + { + "epoch": 1.995962332136514, + "grad_norm": 0.8416606783866882, + "learning_rate": 1.43135516097459e-09, + "loss": 0.7395, + "step": 312420 + }, + { + "epoch": 1.9960262192862528, + "grad_norm": 2.4470884799957275, + "learning_rate": 1.3936401262126809e-09, + "loss": 0.7238, + "step": 312430 + }, + { + "epoch": 1.9960901064359915, + "grad_norm": 0.7618647813796997, + "learning_rate": 1.3564286206690924e-09, + "loss": 0.9034, + "step": 312440 + }, + { + "epoch": 1.9961539935857302, + "grad_norm": 1.1379642486572266, + "learning_rate": 1.3197206447213007e-09, + "loss": 0.9954, + "step": 312450 + }, + { + "epoch": 1.996217880735469, + "grad_norm": 1.0726209878921509, + "learning_rate": 1.2835161987356792e-09, + "loss": 1.0815, + "step": 312460 + }, + { + "epoch": 1.9962817678852076, + "grad_norm": 1.2896431684494019, + "learning_rate": 1.2478152830841528e-09, + "loss": 0.8023, + "step": 312470 + }, + { + "epoch": 1.9963456550349463, + "grad_norm": 1.954415202140808, + "learning_rate": 1.2126178981219928e-09, + "loss": 1.3741, + "step": 312480 + }, + { + "epoch": 1.996409542184685, + "grad_norm": 0.8789839744567871, + "learning_rate": 1.1779240442044703e-09, + "loss": 0.9468, + "step": 312490 + }, + { + "epoch": 1.9964734293344235, + "grad_norm": 1.0630998611450195, + "learning_rate": 1.143733721681306e-09, + "loss": 0.8048, + "step": 312500 + }, + { + "epoch": 1.9965373164841624, + "grad_norm": 1.151191234588623, + 
"learning_rate": 1.1100469308911176e-09, + "loss": 0.6205, + "step": 312510 + }, + { + "epoch": 1.996601203633901, + "grad_norm": 0.8841159343719482, + "learning_rate": 1.0768636721836257e-09, + "loss": 0.6618, + "step": 312520 + }, + { + "epoch": 1.9966650907836399, + "grad_norm": 1.1014671325683594, + "learning_rate": 1.0441839458807944e-09, + "loss": 0.8356, + "step": 312530 + }, + { + "epoch": 1.9967289779333783, + "grad_norm": 0.8289458155632019, + "learning_rate": 1.0120077523212423e-09, + "loss": 0.7793, + "step": 312540 + }, + { + "epoch": 1.9967928650831173, + "grad_norm": 2.142965078353882, + "learning_rate": 9.803350918269338e-10, + "loss": 0.7983, + "step": 312550 + }, + { + "epoch": 1.9968567522328557, + "grad_norm": 0.8722677230834961, + "learning_rate": 9.491659647198336e-10, + "loss": 0.8305, + "step": 312560 + }, + { + "epoch": 1.9969206393825947, + "grad_norm": 1.0402874946594238, + "learning_rate": 9.185003713052531e-10, + "loss": 1.0401, + "step": 312570 + }, + { + "epoch": 1.9969845265323332, + "grad_norm": 1.0141799449920654, + "learning_rate": 8.883383118940547e-10, + "loss": 0.9602, + "step": 312580 + }, + { + "epoch": 1.997048413682072, + "grad_norm": 1.4456356763839722, + "learning_rate": 8.586797867971008e-10, + "loss": 1.01, + "step": 312590 + }, + { + "epoch": 1.9971123008318106, + "grad_norm": 1.1017581224441528, + "learning_rate": 8.295247963086006e-10, + "loss": 0.9403, + "step": 312600 + }, + { + "epoch": 1.9971761879815495, + "grad_norm": 1.3453593254089355, + "learning_rate": 8.008733407227631e-10, + "loss": 0.7422, + "step": 312610 + }, + { + "epoch": 1.997240075131288, + "grad_norm": 1.323583722114563, + "learning_rate": 7.727254203226952e-10, + "loss": 1.041, + "step": 312620 + }, + { + "epoch": 1.997303962281027, + "grad_norm": 1.0199631452560425, + "learning_rate": 7.450810354026061e-10, + "loss": 0.9727, + "step": 312630 + }, + { + "epoch": 1.9973678494307654, + "grad_norm": 1.100943684577942, + "learning_rate": 
7.179401862289492e-10, + "loss": 1.0275, + "step": 312640 + }, + { + "epoch": 1.9974317365805043, + "grad_norm": 1.1497398614883423, + "learning_rate": 6.913028730848315e-10, + "loss": 0.8959, + "step": 312650 + }, + { + "epoch": 1.9974956237302428, + "grad_norm": 1.6988590955734253, + "learning_rate": 6.651690962367064e-10, + "loss": 0.8686, + "step": 312660 + }, + { + "epoch": 1.9975595108799817, + "grad_norm": 1.0665082931518555, + "learning_rate": 6.395388559454762e-10, + "loss": 0.8715, + "step": 312670 + }, + { + "epoch": 1.9976233980297202, + "grad_norm": 1.0004991292953491, + "learning_rate": 6.144121524664925e-10, + "loss": 0.6519, + "step": 312680 + }, + { + "epoch": 1.9976872851794591, + "grad_norm": 0.9688588976860046, + "learning_rate": 5.897889860606576e-10, + "loss": 0.8501, + "step": 312690 + }, + { + "epoch": 1.9977511723291976, + "grad_norm": 0.8588424324989319, + "learning_rate": 5.656693569666694e-10, + "loss": 0.7342, + "step": 312700 + }, + { + "epoch": 1.9978150594789366, + "grad_norm": 0.8054032325744629, + "learning_rate": 5.42053265434328e-10, + "loss": 0.7475, + "step": 312710 + }, + { + "epoch": 1.997878946628675, + "grad_norm": 0.6064567565917969, + "learning_rate": 5.189407116967804e-10, + "loss": 0.8118, + "step": 312720 + }, + { + "epoch": 1.997942833778414, + "grad_norm": 1.036947250366211, + "learning_rate": 4.963316959927245e-10, + "loss": 0.8701, + "step": 312730 + }, + { + "epoch": 1.9980067209281525, + "grad_norm": 1.018930196762085, + "learning_rate": 4.742262185442048e-10, + "loss": 0.7014, + "step": 312740 + }, + { + "epoch": 1.9980706080778912, + "grad_norm": 0.773653507232666, + "learning_rate": 4.5262427957881713e-10, + "loss": 0.9808, + "step": 312750 + }, + { + "epoch": 1.9981344952276299, + "grad_norm": 1.2292520999908447, + "learning_rate": 4.315258793075039e-10, + "loss": 0.7937, + "step": 312760 + }, + { + "epoch": 1.9981983823773686, + "grad_norm": 1.4209551811218262, + "learning_rate": 4.1093101794675847e-10, + 
"loss": 0.7514, + "step": 312770 + }, + { + "epoch": 1.9982622695271073, + "grad_norm": 0.9260154366493225, + "learning_rate": 3.908396957075233e-10, + "loss": 0.9435, + "step": 312780 + }, + { + "epoch": 1.998326156676846, + "grad_norm": 2.0101253986358643, + "learning_rate": 3.7125191278408745e-10, + "loss": 0.9585, + "step": 312790 + }, + { + "epoch": 1.9983900438265847, + "grad_norm": 0.8051590323448181, + "learning_rate": 3.52167669376291e-10, + "loss": 0.9492, + "step": 312800 + }, + { + "epoch": 1.9984539309763234, + "grad_norm": 0.7497386336326599, + "learning_rate": 3.335869656839741e-10, + "loss": 0.7522, + "step": 312810 + }, + { + "epoch": 1.998517818126062, + "grad_norm": 1.0292730331420898, + "learning_rate": 3.155098018847724e-10, + "loss": 0.866, + "step": 312820 + }, + { + "epoch": 1.9985817052758008, + "grad_norm": 1.0129666328430176, + "learning_rate": 2.979361781674239e-10, + "loss": 0.9058, + "step": 312830 + }, + { + "epoch": 1.9986455924255395, + "grad_norm": 1.1604506969451904, + "learning_rate": 2.808660947040131e-10, + "loss": 0.8215, + "step": 312840 + }, + { + "epoch": 1.9987094795752782, + "grad_norm": 1.2080724239349365, + "learning_rate": 2.6429955166662466e-10, + "loss": 0.8299, + "step": 312850 + }, + { + "epoch": 1.998773366725017, + "grad_norm": 1.4316158294677734, + "learning_rate": 2.4823654922179194e-10, + "loss": 0.6791, + "step": 312860 + }, + { + "epoch": 1.9988372538747556, + "grad_norm": 1.3033369779586792, + "learning_rate": 2.3267708753604845e-10, + "loss": 0.8213, + "step": 312870 + }, + { + "epoch": 1.9989011410244943, + "grad_norm": 0.672759473323822, + "learning_rate": 2.1762116676482537e-10, + "loss": 0.657, + "step": 312880 + }, + { + "epoch": 1.998965028174233, + "grad_norm": 0.6766613721847534, + "learning_rate": 2.0306878705800282e-10, + "loss": 0.7861, + "step": 312890 + }, + { + "epoch": 1.9990289153239718, + "grad_norm": 0.7596450448036194, + "learning_rate": 1.890199485599098e-10, + "loss": 0.8423, + "step": 
312900 + }, + { + "epoch": 1.9990928024737105, + "grad_norm": 0.7633396983146667, + "learning_rate": 1.754746514148753e-10, + "loss": 0.583, + "step": 312910 + }, + { + "epoch": 1.9991566896234492, + "grad_norm": 0.7127503156661987, + "learning_rate": 1.6243289576167719e-10, + "loss": 0.8471, + "step": 312920 + }, + { + "epoch": 1.9992205767731879, + "grad_norm": 0.8593207001686096, + "learning_rate": 1.4989468172244002e-10, + "loss": 0.7892, + "step": 312930 + }, + { + "epoch": 1.9992844639229266, + "grad_norm": 1.2852826118469238, + "learning_rate": 1.3786000943594168e-10, + "loss": 1.0036, + "step": 312940 + }, + { + "epoch": 1.9993483510726653, + "grad_norm": 0.5354711413383484, + "learning_rate": 1.2632887901320444e-10, + "loss": 0.8968, + "step": 312950 + }, + { + "epoch": 1.999412238222404, + "grad_norm": 1.204946756362915, + "learning_rate": 1.1530129057635286e-10, + "loss": 1.0277, + "step": 312960 + }, + { + "epoch": 1.9994761253721427, + "grad_norm": 0.9072065949440002, + "learning_rate": 1.0477724423640923e-10, + "loss": 1.0339, + "step": 312970 + }, + { + "epoch": 1.9995400125218814, + "grad_norm": 1.1976101398468018, + "learning_rate": 9.47567400932936e-11, + "loss": 0.7641, + "step": 312980 + }, + { + "epoch": 1.9996038996716199, + "grad_norm": 1.8328163623809814, + "learning_rate": 8.523977825247719e-11, + "loss": 0.8129, + "step": 312990 + }, + { + "epoch": 1.9996677868213588, + "grad_norm": 0.9194107055664062, + "learning_rate": 7.622635881388007e-11, + "loss": 0.7071, + "step": 313000 + }, + { + "epoch": 1.9997316739710973, + "grad_norm": 1.5391420125961304, + "learning_rate": 6.771648186076896e-11, + "loss": 0.6495, + "step": 313010 + }, + { + "epoch": 1.9997955611208362, + "grad_norm": 1.162663221359253, + "learning_rate": 5.971014748196169e-11, + "loss": 0.8881, + "step": 313020 + }, + { + "epoch": 1.9998594482705747, + "grad_norm": 2.731156587600708, + "learning_rate": 5.220735575517388e-11, + "loss": 0.7754, + "step": 313030 + }, + { + 
"epoch": 1.9999233354203136, + "grad_norm": 1.8060675859451294, + "learning_rate": 4.520810676367227e-11, + "loss": 0.9916, + "step": 313040 + }, + { + "epoch": 1.9999872225700521, + "grad_norm": 1.0147830247879028, + "learning_rate": 3.87124005685191e-11, + "loss": 0.8296, + "step": 313050 + } + ], + "logging_steps": 10, + "max_steps": 313052, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1583506541582582e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}